From e62f97620a9231052eb34d9150835e2fcc58c9a0 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 14:04:37 -0400 Subject: [PATCH 001/114] feat: add gff3 gz uri fields to db + models --- bento_reference_service/db.py | 32 ++++++++++++++++++++++---- bento_reference_service/models.py | 3 +++ bento_reference_service/sql/schema.sql | 8 +++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 8a7877a..efb12e3 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -19,10 +19,10 @@ def __init__(self, config: Config): super().__init__(config.database_uri, SCHEMA_PATH) @staticmethod - def deserialize_alias(rec: asyncpg.Record) -> Alias: + def deserialize_alias(rec: asyncpg.Record | dict) -> Alias: return Alias(alias=rec["alias"], naming_authority=rec["naming_authority"]) - def deserialize_contig(self, rec: asyncpg.Record) -> ContigWithRefgetURI: + def deserialize_contig(self, rec: asyncpg.Record | dict) -> ContigWithRefgetURI: service_base_url = self._config.service_url_base_path.rstrip("/") refget_uri_base = f"{service_base_url}/sequence" md5 = rec["md5_checksum"] @@ -57,6 +57,16 @@ def deserialize_genome(self, rec: asyncpg.Record, external_resource_uris: bool) ga4gh=rec["ga4gh_checksum"], fasta=f"{genome_uri}.fa" if external_resource_uris else rec["fasta_uri"], fai=f"{genome_uri}.fa.fai" if external_resource_uris else rec["fai_uri"], + gff3_gz=( + (f"{genome_uri}/features.gff3.gz" if external_resource_uris else rec["gff3_gz_uri"]) + if rec["gff3_gz_uri"] + else None + ), + gff3_gz_tbi=( + (f"{genome_uri}/features.gff3.gz.tbi" if external_resource_uris else rec["gff3_gz_tbi_uri"]) + if rec["gff3_gz_tbi_uri"] + else None + ), taxon=OntologyTerm(id=rec["taxon_id"], label=rec["taxon_label"]), ) @@ -126,13 +136,27 @@ async def create_genome(self, g: Genome, return_external_resource_uris: bool) -> async with conn.transaction(): # Create the genome record: 
await conn.execute( - "INSERT INTO genomes (id, md5_checksum, ga4gh_checksum, fasta_uri, fai_uri, taxon_id, taxon_label) " - "VALUES ($1, $2, $3, $4, $5, $6, $7)", + """ + INSERT INTO genomes ( + id, + md5_checksum, + ga4gh_checksum, + fasta_uri, + fai_uri, + gff3_gz_uri, + gff3_gz_tbi_uri, + taxon_id, + taxon_label + ) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) + """, g.id, g.md5, g.ga4gh, g.fasta, g.fai, + g.gff3_gz, + g.gff3_gz_tbi, g.taxon.id, g.taxon.label, ) diff --git a/bento_reference_service/models.py b/bento_reference_service/models.py index f802bb0..76e5019 100644 --- a/bento_reference_service/models.py +++ b/bento_reference_service/models.py @@ -64,6 +64,9 @@ class Genome(BaseModel): fasta: str # URI fai: str # URI + gff3_gz: str | None = None # URI + gff3_gz_tbi: str | None = None # URI + # biological information taxon: OntologyTerm # MUST be from NCBITaxon ontology - ingestion SHOULD validate this contigs: tuple[Contig, ...] diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 05842a5..97b4187 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -6,10 +6,18 @@ CREATE TABLE IF NOT EXISTS genomes ( ga4gh_checksum VARCHAR(63) NOT NULL UNIQUE, -- GA4GH/VRS/RefGet 2-formatted checksum: SQ.(truncated SHA12, B64) fasta_uri TEXT NOT NULL UNIQUE, -- Can be a local file URI, an S3 URI, a DRS URI, or an HTTPS resource. fai_uri TEXT NOT NULL UNIQUE, -- Corresponding .fa.fai for the FASTA. See fasta_uri for what this can be. + gff3_gz_uri TEXT UNIQUE, -- Optional GFF3 annotation URI for the genome. + gff3_gz_tbi_uri TEXT UNIQUE, -- Tabix index for the optional GFF3 annotation file for the genome. 
taxon_id VARCHAR(31) NOT NULL, -- e.g., NCBITaxon:9606 taxon_label TEXT NOT NULL -- e.g., Homo sapiens ); +-- Migration (v0.2.0): add genomes.gff3_uri and genomes.gff3_tbi_uri if they do not exist: +ALTER TABLE genomes + ADD COLUMN IF NOT EXISTS gff3_gz_uri TEXT UNIQUE, + ADD COLUMN IF NOT EXISTS gff3_gz_tbi_uri TEXT UNIQUE; +-- End migration (v0.2.0) + CREATE TABLE IF NOT EXISTS genome_aliases ( genome_id VARCHAR(31) NOT NULL REFERENCES genomes ON DELETE CASCADE, alias VARCHAR(31) NOT NULL, From 74244424afa9a9aa98cab1b6fade5ca1f60bd8ac Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 14:57:56 -0400 Subject: [PATCH 002/114] chore: add schema, models, db functions for genome features --- bento_reference_service/db.py | 151 ++++++++++++++++++++++++- bento_reference_service/models.py | 39 +++++-- bento_reference_service/sql/schema.sql | 37 ++++-- 3 files changed, 204 insertions(+), 23 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index efb12e3..9ac9232 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -7,7 +7,7 @@ from typing import Annotated, AsyncIterator from .config import Config, ConfigDependency -from .models import Alias, ContigWithRefgetURI, Genome, GenomeWithURIs, OntologyTerm +from .models import Alias, ContigWithRefgetURI, Genome, GenomeWithURIs, OntologyTerm, GenomeFeatureEntry, GenomeFeature SCHEMA_PATH = Path(__file__).parent / "sql" / "schema.sql" @@ -194,6 +194,155 @@ async def create_genome(self, g: Genome, return_external_resource_uris: bool) -> return await self.get_genome(g.id, external_resource_uris=return_external_resource_uris) + async def genome_feature_types_summary(self, g_id: str): + conn: asyncpg.Connection + async with self.connect() as conn: + res = await conn.fetch( + """ + SELECT feature_type, COUNT(feature_type) as ft_count + FROM genome_features + WHERE genome_id = $1 + GROUP BY feature_type + """, + g_id, + ) + + return {row["feature_type"]: 
row["ft_count"] for row in res} + + @staticmethod + def deserialize_genome_feature_entry(rec: asyncpg.Record | dict) -> GenomeFeatureEntry: + return GenomeFeatureEntry( + start_pos=rec["start_pos"], + end_pos=rec["end_pos"], + score=rec["score"], + phase=rec["phase"], + ) + + @staticmethod + def deserialize_genome_feature(rec: asyncpg.Record) -> GenomeFeature: + return GenomeFeature( + genome_id=rec["genome_id"], + contig_name=rec["contig_name"], + strand=rec["strand"], + feature_id=rec["feature_id"], + feature_name=rec["feature_name"], + feature_type=rec["feature_type"], + source=rec["source"], + entries=tuple(map(Database.deserialize_genome_feature_entry, json.loads(rec["entries"]))), + annotations=json.loads(rec["annotations"]),  # TODO + parents=tuple(rec["parents"]),  # tuple of parent IDs + ) + + @staticmethod + def _feature_inner_entries_query(where_expr: str | None = None) -> str: + where_clause = f"AND {where_expr}" if where_expr else "" + return f""" + WITH entries_tmp AS ( + SELECT start_pos, end_pos, score, phase FROM genome_feature_entries gfe + WHERE gfe.genome_id = gf.genome_id AND gfe.feature_id = gf.feature_id {where_clause} + ) + SELECT jsonb_agg(entries_tmp.*) FROM entries_tmp + """ + + async def get_genome_features_by_ids( + self, + g_id: str, + f_ids: list[str], + offset: int = 0, + limit: int = 10, + existing_conn: asyncpg.Connection | None = None, + ): + final_query = f""" + SELECT + genome_id, + contig_name, + strand, + feature_id, + feature_name, + feature_type, + source, + ({self._feature_inner_entries_query()}) entries, + ( + SELECT array_agg(gfp.parent_id) FROM genome_feature_parents gfp + WHERE gfp.genome_id = gf.genome_id AND gfp.feature_id = gf.feature_id + ) parents + FROM genome_features gf + WHERE gf.genome_id = $1 AND feature_id = ANY($2) + OFFSET $3 LIMIT $4 + """ + + conn: asyncpg.Connection + async with self.connect(existing_conn) as conn: + final_res = await conn.fetch(final_query, g_id, f_ids, offset, limit) + return 
[self.deserialize_genome_feature(r) for r in final_res] + + async def get_genome_feature_by_id(self, g_id: str, f_id: str) -> GenomeFeature | None: + res = await self.get_genome_features_by_ids(g_id, [f_id], 0, 1) + return res[0] if res else None + + async def query_genome_features( + self, + g_id: str, + q: str | None, + name: str | None, + position: str | None, + start: int | None, + end: int | None, + feature_types: list[str] | None, + offset: int = 0, + limit: int = 10, + ) -> tuple[list[GenomeFeature], dict]:  # list of genome features + pagination dict object + gf_where_items: list[str] = [] + gfe_where_items: list[str] = [] + q_params: list[str | int] = [] + + def _q_param(pv: str | int) -> str: + q_params.append(pv) + return f"${len(q_params) + 1}" + + if q: + param = _q_param(q) + gf_where_items.append(f"(gf.feature_id ~ {param} OR gf.feature_name ~ {param})") + + if name: + gf_where_items.append(f"gf.feature_name = {_q_param(name)}") + + if position: + gfe_where_items.append(f"gfe.position_text ~ {_q_param(position)}") + + if start is not None: + gfe_where_items.append(f"gfe.start_pos >= {_q_param(start)}") + + if end is not None: + gfe_where_items.append(f"gfe.start_pos <= {_q_param(end)}") + + if feature_types: + or_items = [] + for ft in feature_types: + or_items.append(f"gf.feature_type = {_q_param(ft)}") + gf_where_items.append(f"({' OR '.join(or_items)})") + + where_clause = " AND ".join(gf_where_items) if gf_where_items else "true" + gfe_where_clause = " AND ".join(gfe_where_items) if gfe_where_items else None + + id_query = f""" + SELECT feature_id, ({self._feature_inner_entries_query(gfe_where_clause)}) entries + FROM genome_features gf + WHERE + gf.genome_id = $1 + AND jsonb_array_length(({self._feature_inner_entries_query(gfe_where_clause)})) > 0 + AND {where_clause}; + """ + + conn: asyncpg.Connection + async with self.connect() as conn: + id_res = await conn.fetch(id_query, g_id, *q_params) + final_list = await self.get_genome_features_by_ids( + g_id, [r["feature_id"] for r 
in id_res], offset, limit, conn + ) + + return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} + @lru_cache() def get_db(config: ConfigDependency) -> Database: # pragma: no cover diff --git a/bento_reference_service/models.py b/bento_reference_service/models.py index 76e5019..59b569f 100644 --- a/bento_reference_service/models.py +++ b/bento_reference_service/models.py @@ -1,28 +1,20 @@ from pydantic import BaseModel -from typing import TypedDict +from typing import Literal __all__ = [ - "GTFFeature", "OntologyTerm", "Alias", "Contig", "ContigWithRefgetURI", "Genome", "GenomeWithURIs", + "GenomeFeatureEntry", + "GenomeFeature", ] # Pydantic/dict models, not database models -class GTFFeature(TypedDict): - id: str - name: str - position: str - type: str - genome: str - strand: str - - class OntologyTerm(BaseModel): id: str label: str @@ -75,3 +67,28 @@ class Genome(BaseModel): class GenomeWithURIs(Genome): uri: str contigs: tuple[ContigWithRefgetURI, ...] + + +class GenomeFeatureEntry(BaseModel): + start_pos: int # 1-based, inclusive + end_pos: int # 1-based, exclusive + score: float | None + phase: int | None + + +class GenomeFeature(BaseModel): + genome_id: str + contig_name: str + + strand: Literal["negative", "positive", "unknown", "not_stranded"] + + feature_id: str + feature_name: str + feature_type: str + + source: str + + entries: list[GenomeFeatureEntry] + annotations: dict[str, list[str]] + + parents: tuple[str, ...] 
diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 97b4187..5d6d569 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -33,11 +33,15 @@ CREATE TABLE IF NOT EXISTS genome_contigs ( -- Checksums: the two checksums given here are the ones recommended for RefGet v2; -- see http://samtools.github.io/hts-specs/refget.html#checksum-calculation -- The UNIQUE constraint on these two columns creates a B-tree index on each, so contigs can be queried by checksum. - md5_checksum VARCHAR(32) NOT NULL UNIQUE, -- Hexadecimal string representation of MD5 checksum bytes - ga4gh_checksum VARCHAR(63) NOT NULL UNIQUE, -- GA4GH/VRS/RefGet 2-formatted checksum: SQ.(truncated SHA12, B64) + md5_checksum VARCHAR(32) NOT NULL, -- Hexadecimal string representation of MD5 checksum bytes + ga4gh_checksum VARCHAR(63) NOT NULL, -- GA4GH/VRS/RefGet 2-formatted checksum: SQ.(truncated SHA12, B64) -- Contigs are unique only within the context of a particular reference genome: - PRIMARY KEY (genome_id, contig_name) + PRIMARY KEY (genome_id, contig_name), + UNIQUE (genome_id, md5_checksum), + UNIQUE (genome_id, ga4gh_checksum) ); +CREATE INDEX IF NOT EXISTS genome_contigs_md5_checksum_idx ON genome_contigs (md5_checksum); +CREATE INDEX IF NOT EXISTS genome_contigs_ga4gh_checksum_idx ON genome_contigs (ga4gh_checksum); CREATE TABLE IF NOT EXISTS genome_contig_aliases ( genome_id VARCHAR(31) NOT NULL REFERENCES genomes ON DELETE CASCADE, @@ -60,8 +64,7 @@ CREATE TABLE IF NOT EXISTS genome_feature_type_synonyms ( ); DO $$ BEGIN - -- corresponds with the GFF3 values: [-, +, ?, .] 
respectively - CREATE TYPE strand_type AS ENUM ('negative', 'positive', 'unknown', 'not_stranded'); + CREATE TYPE strand_type AS ENUM ('-', '+', '?', '.'); EXCEPTION WHEN duplicate_object THEN null; END $$; @@ -70,9 +73,6 @@ CREATE TABLE IF NOT EXISTS genome_features ( genome_id VARCHAR(31) NOT NULL REFERENCES genomes, -- Feature location information, on the genome: contig_name VARCHAR(63) NOT NULL, - start_pos INTEGER NOT NULL, -- 1-based, inclusive - end_pos INTEGER NOT NULL, -- 1-based, exclusive - if start_pos == end_pos then it's a 0-length feature - position_text TEXT NOT NULL, -- chr:start-end style searchable string - cached for indexing purposes strand strand_type NOT NULL, -- Feature characteristics / attributes: -- - technically, there can be multiple rows in a GFF3 file with the same ID, for discontinuous features. @@ -81,14 +81,29 @@ CREATE TABLE IF NOT EXISTS genome_features ( feature_name TEXT NOT NULL, feature_type VARCHAR(15) NOT NULL REFERENCES genome_feature_types, source TEXT NOT NULL, - score FLOAT, - phase SMALLINT, -- Keys: PRIMARY KEY (genome_id, feature_id), FOREIGN KEY (genome_id, contig_name) REFERENCES genome_contigs ); + +CREATE INDEX IF NOT EXISTS genome_features_feature_id_trgm_gin ON genome_features USING GIN (feature_id gin_trgm_ops); CREATE INDEX IF NOT EXISTS genome_features_feature_name_trgm_gin ON genome_features USING GIN (feature_name gin_trgm_ops); -CREATE INDEX IF NOT EXISTS genome_features_position_text_trgm_gin ON genome_features USING GIN (position_text gin_trgm_ops); + +CREATE TABLE IF NOT EXISTS genome_feature_entries ( + genome_id VARCHAR(31) NOT NULL REFERENCES genomes, + feature_id VARCHAR(63) NOT NULL, + start_pos INTEGER NOT NULL, -- 1-based, inclusive + end_pos INTEGER NOT NULL, -- 1-based, exclusive - if start_pos == end_pos then it's a 0-length feature + position_text TEXT NOT NULL, -- chr:start-end style searchable string - cached for indexing purposes + score FLOAT, + phase SMALLINT, + -- Keys: + FOREIGN KEY 
(genome_id, feature_id) REFERENCES genome_features +) + +CREATE INDEX IF NOT EXISTS genome_feature_entries_position_text_trgm_gin + ON genome_feature_entries + USING GIN (position_text gin_trgm_ops); -- in GFF3 files, features can have one or multiple parents within the same annotation file -- - facilitate this via a many-to-many table From abfe9558edcacf2204a2efa5e959f676f62cd5e9 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 15:18:22 -0400 Subject: [PATCH 003/114] feat(routes): define gff3 / feature API endpoints --- bento_reference_service/routers/genomes.py | 149 +++++++++++++++------ 1 file changed, 109 insertions(+), 40 deletions(-) diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index ed9e7fe..ee83c41 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -3,13 +3,14 @@ from bento_lib.auth.permissions import P_INGEST_REFERENCE_MATERIAL, P_DELETE_REFERENCE_MATERIAL from bento_lib.auth.resources import RESOURCE_EVERYTHING -from fastapi import APIRouter, HTTPException, Request, status +from fastapi import APIRouter, HTTPException, Query, Request, status from fastapi.responses import StreamingResponse +from typing import Annotated from .. 
import models as m from ..authz import authz_middleware from ..config import ConfigDependency -from ..db import DatabaseDependency +from ..db import Database, DatabaseDependency from ..logger import LoggerDependency from ..streaming import generate_uri_streaming_response @@ -20,6 +21,15 @@ genome_router = APIRouter(prefix="/genomes") +async def get_genome_or_raise_404( + db: Database, genome_id: str, external_resource_uris: bool = True +) -> m.GenomeWithURIs: + genome: m.GenomeWithURIs = await db.get_genome(genome_id, external_resource_uris=external_resource_uris) + if genome is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") + return genome + + @genome_router.get("", dependencies=[authz_middleware.dep_public_endpoint()]) async def genomes_list( db: DatabaseDependency, response_format: str | None = None @@ -68,10 +78,8 @@ async def genomes_create( async def genomes_detail_fasta( genome_id: str, config: ConfigDependency, db: DatabaseDependency, logger: LoggerDependency, request: Request ) -> StreamingResponse: - genome: m.Genome = await db.get_genome(genome_id, external_resource_uris=False) # need internal FASTA URI - - if genome is None: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") + # need internal FASTA URI: + genome: m.Genome = await get_genome_or_raise_404(db, genome_id, external_resource_uris=False) # Don't use FastAPI's auto-Header tool for the Range header # 'cause I don't want to shadow Python's range() function @@ -91,10 +99,8 @@ async def genomes_detail_fasta( async def genomes_detail_fasta_index( genome_id: str, config: ConfigDependency, db: DatabaseDependency, logger: LoggerDependency, request: Request ) -> StreamingResponse: - genome: m.Genome = await db.get_genome(genome_id, external_resource_uris=False) # need internal FAI URI - - if genome is None: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, 
detail=f"Genome with ID {genome_id} not found") + # need internal FAI URI: + genome: m.Genome = await get_genome_or_raise_404(db, genome_id, external_resource_uris=False) # Don't use FastAPI's auto-Header tool for the Range header 'cause I don't want to shadow Python's range() function: range_header: str | None = request.headers.get("Range", None) @@ -111,9 +117,7 @@ async def genomes_detail_fasta_index( @genome_router.get("/{genome_id}", dependencies=[authz_middleware.dep_public_endpoint()]) async def genomes_detail(genome_id: str, db: DatabaseDependency) -> m.GenomeWithURIs: - if g := await db.get_genome(genome_id, external_resource_uris=True): - return g - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") + return await get_genome_or_raise_404(db, genome_id) @genome_router.delete( @@ -128,30 +132,22 @@ async def genomes_detail(genome_id: str, db: DatabaseDependency) -> m.GenomeWith async def genomes_delete(genome_id: str, db: DatabaseDependency): # TODO: also delete DRS objects!! 
- if await db.get_genome(genome_id): - await db.delete_genome(genome_id) - return - - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") + await get_genome_or_raise_404(db, genome_id) + await db.delete_genome(genome_id) @genome_router.get("/{genome_id}/contigs", dependencies=[authz_middleware.dep_public_endpoint()]) async def genomes_detail_contigs(genome_id: str, db: DatabaseDependency) -> tuple[m.ContigWithRefgetURI, ...]: - if g := await db.get_genome(genome_id, external_resource_uris=True): - return g.contigs - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") + return (await get_genome_or_raise_404(db, genome_id)).contigs @genome_router.get("/{genome_id}/contigs/{contig_name}", dependencies=[authz_middleware.dep_public_endpoint()]) async def genomes_detail_contig_detail( genome_id: str, contig_name: str, db: DatabaseDependency ) -> m.ContigWithRefgetURI: - genome: m.GenomeWithURIs | None = await db.get_genome(genome_id, external_resource_uris=True) - if genome is None: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") + genome: m.GenomeWithURIs = await get_genome_or_raise_404(db, genome_id) contig: m.ContigWithRefgetURI | None = next((c for c in genome.contigs if c.name == contig_name), None) - if contig is None: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, @@ -161,18 +157,91 @@ async def genomes_detail_contig_detail( return contig -# @genome_router.get("/{genome_id}/gene_features.gtf.gz") -# async def genomes_detail_gene_features(genome_id: str): -# # TODO: how to return empty gtf.gz if nothing is here yet? -# raise NotImplementedError() -# # TODO: slices of GTF.gz -# -# -# @genome_router.get("/{genome_id}/gene_features.gtf.gz.tbi") -# async def genomes_detail_gene_features_index(genome_id: str): -# # TODO: how to return empty gtf.gz.tbi if nothing is here yet? 
-# raise NotImplementedError() # TODO: gene features GTF tabix file -# -# -# # TODO: more normal annotation PUT endpoint -# # - treat gene_features as a file that can be replaced basically +@genome_router.get("/{genome_id}/feature_types", dependencies=[authz_middleware.dep_public_endpoint()]) +async def genomes_detail_feature_types(db: DatabaseDependency, genome_id: str) -> dict[str, int]: + return await db.genome_feature_types_summary(genome_id) + + +@genome_router.get("/{genome_id}/features", dependencies=[authz_middleware.dep_public_endpoint()]) +async def genomes_detail_features( + db: DatabaseDependency, + genome_id: str, + q: str | None = None, + name: str | None = None, + position: str | None = None, + start: int | None = None, + end: int | None = None, + feature_type: Annotated[list[str] | None, Query()] = None, + offset: int = 0, + limit: int = 10, +): + results, pagination = await db.query_genome_features( + genome_id, q, name, position, start, end, feature_type, offset, limit + ) + + return { + "results": results, + "pagination": pagination, + } + + +@genome_router.get("/{genome_id}/features/{feature_id}", dependencies=[authz_middleware.dep_public_endpoint()]) +async def genomes_detail_features_detail(db: DatabaseDependency, genome_id: str, feature_id: str): + return await db.get_genome_feature_by_id(genome_id, feature_id) + + +@genome_router.get("/{genome_id}/features.gff3.gz", dependencies=[authz_middleware.dep_public_endpoint()]) +async def genomes_detail_features_gff3( + config: ConfigDependency, db: DatabaseDependency, logger: LoggerDependency, request: Request, genome_id: str +): + # need internal GFF3.gz URI: + genome: m.Genome = await get_genome_or_raise_404(db, genome_id=genome_id, external_resource_uris=False) + + if not genome.gff3_gz: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} has no GFF3 annotation file" + ) + + # Don't use FastAPI's auto-Header tool for the Range header + # 'cause I 
don't want to shadow Python's range() function + range_header: str | None = request.headers.get("Range", None) + return await generate_uri_streaming_response( + config, + logger, + genome.gff3_gz, + range_header, + "application/gzip", + impose_response_limit=False, + extra_response_headers={"Accept-Ranges": "bytes"}, + ) + + +@genome_router.get("/{genome_id}/features.gff3.gz.tbi", dependencies=[authz_middleware.dep_public_endpoint()]) +async def genomes_detail_gene_features_gff3_index( + config: ConfigDependency, db: DatabaseDependency, logger: LoggerDependency, request: Request, genome_id: str +): + # need internal GFF3.gz URI: + genome: m.Genome = await get_genome_or_raise_404(db, genome_id=genome_id, external_resource_uris=False) + + if not genome.gff3_gz_tbi: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Genome with ID {genome_id} has no GFF3 annotation TABIX index", + ) + + # Don't use FastAPI's auto-Header tool for the Range header + # 'cause I don't want to shadow Python's range() function + range_header: str | None = request.headers.get("Range", None) + return await generate_uri_streaming_response( + config, + logger, + genome.gff3_gz_tbi, + range_header, + "application/octet-stream", + impose_response_limit=False, + extra_response_headers={"Accept-Ranges": "bytes"}, + ) + + +# TODO: more normal annotation PUT endpoint +# - treat gene_features as a file that can be replaced basically From 2c1e01f0930b9447fa3351ded7a5bcc5b1617b21 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 16:42:21 -0400 Subject: [PATCH 004/114] chore: work on gff3 parsing code for ingestion --- bento_reference_service/genomes.py | 126 ++++++++++++++----------- bento_reference_service/models.py | 4 +- bento_reference_service/sql/schema.sql | 10 +- 3 files changed, 77 insertions(+), 63 deletions(-) diff --git a/bento_reference_service/genomes.py b/bento_reference_service/genomes.py index 394b651..61146fc 100644 --- 
a/bento_reference_service/genomes.py +++ b/bento_reference_service/genomes.py @@ -2,9 +2,9 @@ import pysam import re -from elasticsearch.helpers import async_streaming_bulk from pathlib import Path from typing import Generator +from urllib.parse import unquote as url_unquote from . import models as m from .db import Database @@ -21,56 +21,49 @@ class AnnotationIngestError(Exception): pass -def extract_feature_id_and_name(record) -> tuple[str | None, str | None]: +def parse_attributes(raw_attributes: dict[str, str]) -> dict[str, list[str]]: + # See "attributes" in http://gmod.org/wiki/GFF3 + return {k: [url_unquote(e) for e in v.split(",")] for k, v in raw_attributes.items()} + + +def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None: feature_type = record.feature - gene_id = record.gene_id - gene_name = record.attributes.get("gene_name", gene_id) + feature_name: str | None = attributes.get("Name", (None,))[0] + + if feature_name: + return feature_name - feature_id: str | None = None - feature_name: str | None = None + transcript_id = attributes.get("transcript_id", (None,))[0] + transcript_name = attributes.get("transcript_name", (transcript_id,))[0] match feature_type: case "gene": - feature_id = gene_id - feature_name = gene_name + return attributes.get("gene_name", attributes.get("gene_id", (None,)))[0] case "transcript": - feature_id = record.transcript_id - feature_name = feature_id # Explicitly re-use ID as name here + return attributes.get("transcript_name", attributes.get("transcript_id", (None,)))[0] case "5UTR" | "five_prime_utr": # 5' untranslated region (UTR) - feature_id = f"{gene_id}-5UTR" - feature_name = f"{gene_name} 5' UTR" + return f"{transcript_name} 5' UTR" case "3UTR" | "three_prime_utr": # 3' untranslated region (UTR) - feature_id = f"{gene_id}-3UTR" - feature_name = f"{gene_name} 3' UTR" - case "start_codon": # TODO: multiple start codons may exist? 
- feature_id = f"{gene_id}-start_codon" - feature_name = f"{gene_name} start codon" - case "stop_codon": # TODO: multiple stop codons may exist? - feature_id = f"{gene_id}-stop_codon" - feature_name = f"{gene_name} stop codon" + return f"{transcript_name} 3' UTR" + case "start_codon": + return f"{transcript_name} start codon" + case "stop_codon": + return f"{transcript_name} stop codon" case "exon": - feature_id = record.attributes["exon_id"] # TODO: fallback with gene ID + exon number? - if "exon_number" in record.attributes: - # TODO: Validate this, I think slightly wrong because it uses gene vs. transcript - feature_name = f"{gene_name} exon {record.attributes['exon_number']}" + if "exon_id" in attributes: + return attributes["exon_id"][0] else: - feature_name = feature_id # Explicitly re-use ID as name here + return attributes["ID"][0] case "CDS": # coding sequence - exon_id = record.attributes["exon_id"] - feature_id = f"{exon_id}-CDS" - if "exon_number" in record.attributes: - # TODO: Validate this, I think slightly wrong because it uses gene vs. transcript - feature_name = f"{gene_name} exon {record.attributes['exon_number']} CDS" - else: - feature_name = f"{exon_id} CDS" # Explicitly re-use ID as name here - - return feature_id, feature_name + return f"{transcript_name} CDS" + case _: + return None async def ingest_gene_feature_annotation( genome_id: str, - gtf_annotation_path: Path, - gtf_annotation_index_path: Path, + gff_path: Path, + gff_index_path: Path, db: Database, logger: logging.Logger, ) -> None: @@ -79,8 +72,8 @@ async def ingest_gene_feature_annotation( into the relevant .bentoGenome directory and ingests the annotations into an ElasticSearch index for fuzzy text querying of features. :param genome_id: The ID of the genome to attach the annotation to. - :param gtf_annotation_path: The path to an external GTF.gz-formatted annotation file to copy and read from. 
- :param gtf_annotation_index_path: The path to an external index file for the above .gtf.gz. + :param gff_path: The path to an external GTF.gz-formatted annotation file to copy and read from. + :param gff_index_path: The path to an external index file for the above .gtf.gz. :param db: Database connection/management object. :param logger: Python logger object. :return: None @@ -91,18 +84,28 @@ async def ingest_gene_feature_annotation( genome: m.GenomeWithURIs | None = await db.get_genome(genome_id) + if genome is None: + raise AnnotationIngestError(f"Genome with ID {genome_id} not found") + log_progress_interval = 1000 - def _iter_features() -> Generator[m.GTFFeature, None, None]: - gtf = pysam.TabixFile(str(gtf_annotation_path), index=str(gtf_annotation_index_path)) + def _iter_features() -> Generator[m.GenomeFeature, None, None]: + gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) total_processed: int = 0 try: + features_by_id: dict[str, m.GenomeFeature] = {} + for contig in genome.contigs: logger.info(f"Indexing features from contig {contig.name}") + features_by_id.clear() + + for record in gff.fetch(contig.name, parser=pysam.asGFF3()): + # for some reason, dict(...) 
returns the attributes dict: + record_attributes = parse_attributes(dict(record)) - for record in gtf.fetch(contig.name, parser=pysam.asGTF()): feature_type = record.feature - feature_id, feature_name = extract_feature_id_and_name(record) + feature_id = record_attributes.get("ID", (None,))[0] + feature_name = extract_feature_name(record, record_attributes) if feature_id is None: logger.warning(f"Skipping unsupported feature (type={feature_type}, no ID retrieval): {record}") @@ -112,23 +115,34 @@ def _iter_features() -> Generator[m.GTFFeature, None, None]: logger.warning(f"Using ID as name for feature: {record}") feature_name = feature_id - yield { - "id": feature_id, - "name": feature_name, - "position": f"{contig.name}:{record.start}-{record.end}", - "type": feature_type, - "genome": genome_id, - "strand": record.strand, - } + entry = m.GenomeFeatureEntry( + start_pos=record.start, + end_pos=record.end, + score=record.score, + phase=record.frame, # misnamed in PySAM's GFF3 parser + ) + + if feature_id in features_by_id: + features_by_id[feature_id].entries.append(entry) + else: + features_by_id[feature_id] = m.GenomeFeature( + genome_id=genome_id, + contig_name=contig.name, + strand=record.strand, + feature_id=feature_id, + feature_name=feature_name, + feature_type=feature_type, + source=record.source, + entries=[entry], + attributes=record_attributes, + parents=tuple(p for p in record_attributes.get("Parent", "").split(",") if p), + ) total_processed += 1 if total_processed % log_progress_interval == 0: logger.info(f"Processed {total_processed} features") - finally: - gtf.close() + yield from features_by_id.values() - async for ok, result in async_streaming_bulk(es, _iter_features()): - if not ok: - action, result = result.popitem() - raise AnnotationIngestError(f"failed to {action} document: {result}") + finally: + gff.close() diff --git a/bento_reference_service/models.py b/bento_reference_service/models.py index 59b569f..01fac9e 100644 --- 
a/bento_reference_service/models.py +++ b/bento_reference_service/models.py @@ -88,7 +88,7 @@ class GenomeFeature(BaseModel): source: str - entries: list[GenomeFeatureEntry] - annotations: dict[str, list[str]] + entries: list[GenomeFeatureEntry] # mutable to allow us to gradually build up entry list during ingestion + attributes: dict[str, list[str]] parents: tuple[str, ...] diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 5d6d569..418aae5 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -115,9 +115,9 @@ CREATE TABLE IF NOT EXISTS genome_feature_parents ( FOREIGN KEY (genome_id, parent_id) REFERENCES genome_features ); --- annotations can also have multiple values, so we don't enforce uniqueness on (genome_id, feature_id, attr_tag) --- these are 'non-parent' annotations -CREATE TABLE IF NOT EXISTS genome_feature_annotations ( +-- attributes can also have multiple values, so we don't enforce uniqueness on (genome_id, feature_id, attr_tag) +-- these are non-Parent, non-ID attributes +CREATE TABLE IF NOT EXISTS genome_feature_attributes ( annotation_id SERIAL PRIMARY KEY, genome_id VARCHAR(31) NOT NULL REFERENCES genomes, feature_id VARCHAR(63) NOT NULL, @@ -125,5 +125,5 @@ CREATE TABLE IF NOT EXISTS genome_feature_annotations ( attr_val VARCHAR(63) NOT NULL, FOREIGN KEY (genome_id, feature_id) REFERENCES genome_features ); -CREATE INDEX IF NOT EXISTS annotations_genome_feature_attr_idx - ON genome_feature_annotations (genome_id, feature_id, attr_tag); +CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_idx + ON genome_feature_attributes (genome_id, feature_id, attr_tag); From 2e1457f3730b43f4243e6dc473f9132c3c6aad81 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 16:49:33 -0400 Subject: [PATCH 005/114] chore: stub out db calls for genome feature ingest --- bento_reference_service/db.py | 6 +++++- bento_reference_service/genomes.py | 8 
++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 9ac9232..f8b8e0d 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -4,7 +4,7 @@ from fastapi import Depends from functools import lru_cache from pathlib import Path -from typing import Annotated, AsyncIterator +from typing import Annotated, AsyncIterator, Iterable from .config import Config, ConfigDependency from .models import Alias, ContigWithRefgetURI, Genome, GenomeWithURIs, OntologyTerm, GenomeFeatureEntry, GenomeFeature @@ -343,6 +343,10 @@ def _q_param(pv: str | int) -> str: return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} + async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): + # TODO + pass + @lru_cache() def get_db(config: ConfigDependency) -> Database: # pragma: no cover diff --git a/bento_reference_service/genomes.py b/bento_reference_service/genomes.py index 61146fc..ee453ef 100644 --- a/bento_reference_service/genomes.py +++ b/bento_reference_service/genomes.py @@ -1,3 +1,4 @@ +import itertools import logging import pysam import re @@ -61,9 +62,11 @@ def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None async def ingest_gene_feature_annotation( + # parameters: genome_id: str, gff_path: Path, gff_index_path: Path, + # dependencies: db: Database, logger: logging.Logger, ) -> None: @@ -146,3 +149,8 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: finally: gff.close() + + features_to_ingest = _iter_features() + + while data := tuple(itertools.islice(features_to_ingest, 1000)): # take features in batches + await db.bulk_ingest_genome_features(data) From 7a2a7f3b54be9972150a624a68897a10e2ee985f Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 16:51:57 -0400 Subject: [PATCH 006/114] fix(schema): missing semicolon --- bento_reference_service/sql/schema.sql | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 418aae5..33ae43d 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -99,7 +99,7 @@ CREATE TABLE IF NOT EXISTS genome_feature_entries ( phase SMALLINT, -- Keys: FOREIGN KEY (genome_id, feature_id) REFERENCES genome_features -) +); CREATE INDEX IF NOT EXISTS genome_feature_entries_position_text_trgm_gin ON genome_feature_entries From 687dfc7be6127c33ebda44090334bb8113b89925 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 16:58:48 -0400 Subject: [PATCH 007/114] fix: issues with db interface --- bento_reference_service/db.py | 20 ++++++++++++++------ tests/conftest.py | 8 +++++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index f8b8e0d..b209025 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -77,7 +77,15 @@ async def _select_genomes(self, g_id: str | None, external_resource_uris: bool) res = await conn.fetch( f""" SELECT - id, md5_checksum, ga4gh_checksum, fasta_uri, fai_uri, taxon_id, taxon_label, + id, + md5_checksum, + ga4gh_checksum, + fasta_uri, + fai_uri, + gff3_gz_uri, + gff3_gz_tbi_uri, + taxon_id, + taxon_label, array( SELECT json_agg(ga.*) FROM genome_aliases ga WHERE g.id = ga.genome_id ) aliases, @@ -143,20 +151,20 @@ async def create_genome(self, g: Genome, return_external_resource_uris: bool) -> ga4gh_checksum, fasta_uri, fai_uri, - gff3_uri, - gff3_tbi_uri, + gff3_gz_uri, + gff3_gz_tbi_uri, taxon_id, taxon_label ) - VALUES ($1, $2, $3, $4, $5, $6, $7) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) """, g.id, g.md5, g.ga4gh, g.fasta, g.fai, - g.gff3, - g.gff3_tbi, + g.gff3_gz, + g.gff3_gz_tbi, g.taxon.id, g.taxon.label, ) diff --git a/tests/conftest.py b/tests/conftest.py index 361d220..27ff130 100644 --- a/tests/conftest.py +++ 
b/tests/conftest.py @@ -34,11 +34,13 @@ async def db_cleanup(db: Database): async with db.connect() as conn: await conn.execute( """ + DROP INDEX genome_features_feature_id_trgm_gin; DROP INDEX genome_features_feature_name_trgm_gin; - DROP INDEX genome_features_position_text_trgm_gin; - DROP INDEX annotations_genome_feature_attr_idx; + DROP INDEX genome_feature_entries_position_text_trgm_gin; + DROP INDEX genome_feature_attributes_attr_idx; - DROP TABLE genome_feature_annotations; + DROP TABLE genome_feature_entries; + DROP TABLE genome_feature_attributes; DROP TABLE genome_feature_parents; DROP TABLE genome_features; From 3a136bbf143eb1ba37d2c836e2c565e6a5abad8f Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 17:01:19 -0400 Subject: [PATCH 008/114] lint --- tests/conftest.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 27ff130..c6cf460 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,27 +34,27 @@ async def db_cleanup(db: Database): async with db.connect() as conn: await conn.execute( """ - DROP INDEX genome_features_feature_id_trgm_gin; - DROP INDEX genome_features_feature_name_trgm_gin; - DROP INDEX genome_feature_entries_position_text_trgm_gin; - DROP INDEX genome_feature_attributes_attr_idx; - - DROP TABLE genome_feature_entries; - DROP TABLE genome_feature_attributes; - DROP TABLE genome_feature_parents; - DROP TABLE genome_features; - - DROP TABLE genome_feature_type_synonyms; - DROP TABLE genome_feature_types; - - DROP TABLE genome_contig_aliases; - DROP TABLE genome_contigs; - - DROP TABLE genome_aliases; - DROP TABLE genomes; - - DROP TYPE strand_type; - """ + DROP INDEX genome_features_feature_id_trgm_gin; + DROP INDEX genome_features_feature_name_trgm_gin; + DROP INDEX genome_feature_entries_position_text_trgm_gin; + DROP INDEX genome_feature_attributes_attr_idx; + + DROP TABLE genome_feature_entries; + DROP TABLE 
genome_feature_attributes; + DROP TABLE genome_feature_parents; + DROP TABLE genome_features; + + DROP TABLE genome_feature_type_synonyms; + DROP TABLE genome_feature_types; + + DROP TABLE genome_contig_aliases; + DROP TABLE genome_contigs; + + DROP TABLE genome_aliases; + DROP TABLE genomes; + + DROP TYPE strand_type; + """ ) await db.close() From 87695c5a3bffe09fa69b0c4b902dc1d38c9ad4fe Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 17:01:26 -0400 Subject: [PATCH 009/114] chore: work on workflow definitions --- bento_reference_service/workflows/metadata.py | 44 ++++++++++++++++++- .../workflows/wdls/fasta_ref.wdl | 1 + .../workflows/wdls/gff3_annot.wdl | 12 +++++ 3 files changed, 55 insertions(+), 2 deletions(-) create mode 100644 bento_reference_service/workflows/wdls/gff3_annot.wdl diff --git a/bento_reference_service/workflows/metadata.py b/bento_reference_service/workflows/metadata.py index 9c5f6c1..b0381d7 100644 --- a/bento_reference_service/workflows/metadata.py +++ b/bento_reference_service/workflows/metadata.py @@ -5,6 +5,7 @@ __all__ = ["workflow_set"] WORKFLOW_FASTA_REFERENCE = "fasta_ref" +WORKFLOW_GFF3_ANNOTATION = "gff3_annot" workflow_set = WorkflowSet(Path(__file__).parent / "wdls") @@ -14,8 +15,9 @@ name="Ingest FASTA-formatted reference genome", type="ingestion", description=( - "Given a FASTA or gzipped FASTA reference genome, this workflow indexes and ingests it into the Bento " - "Reference Service. All ingested FASTA files are COMPLETELY PUBLIC, so do not ingest any sensitive data!" + "Given a FASTA or gzipped FASTA reference genome, and optionally a GFF3-formatted annotation file, this " + "workflow indexes and ingests it into the Bento Reference Service. All ingested FASTA files are COMPLETELY " + "PUBLIC, so do not ingest any sensitive data!" 
), file="fasta_ref.wdl", tags=frozenset(("reference", "fasta")), @@ -46,6 +48,44 @@ pattern=r"^.*\.(fa|fa.gz|fna|fna.gz|fas|fas.gz|fasta|fasta.gz)$", help="FASTA file for the reference genome, either gzipped or uncompressed.", ), + wm.WorkflowFileInput( + id="genome_gff3", + pattern=r"^.*\.(gff|gff3)$", + required=False, + help="GFF3-formatted annotation file for the reference genome.", + ), + ], + ), +) + +workflow_set.add_workflow( + WORKFLOW_GFF3_ANNOTATION, + wm.WorkflowDefinition( + name="Add GFF3-formatted annotation data to a reference genome", + type="ingestion", + description=( + "Given a GFF3-formatted annotation file, extract the features to make them queryable and attach them to an " + "existing reference genome." + ), + file="gff3_annot.wdl", + tags=frozenset(("reference", "gff3")), + inputs=[ + # Injected + wm.WorkflowSecretInput(id="access_token", key="access_token"), + wm.WorkflowServiceUrlInput(id="drs_url", service_kind="drs"), + wm.WorkflowServiceUrlInput(id="reference_url", service_kind="reference"), + wm.WorkflowConfigInput(id="validate_ssl", key="validate_ssl"), + # User-specified + wm.WorkflowEnumInput( + id="genome_id", + values="{{ serviceUrls.reference }}/genomes?response_format=id_list", + help="The reference genome to annotate with the GFF3 file.", + ), + wm.WorkflowFileInput( + id="genome_gff3", + pattern=r"^.*\.(gff|gff3)$", + help="GFF3-formatted annotation file for the reference genome.", + ), ], ), ) diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index 23890cf..7579011 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -5,6 +5,7 @@ workflow fasta_ref { String genome_id String taxon_term_json File genome_fasta + File? 
genome_gff3 String access_token String drs_url String reference_url diff --git a/bento_reference_service/workflows/wdls/gff3_annot.wdl b/bento_reference_service/workflows/wdls/gff3_annot.wdl new file mode 100644 index 0000000..7c95b0b --- /dev/null +++ b/bento_reference_service/workflows/wdls/gff3_annot.wdl @@ -0,0 +1,12 @@ +version 1.0 + +workflow fasta_ref { + input { + String genome_id + File genome_gff3 + String access_token + String drs_url + String reference_url + Boolean validate_ssl + } +} From 388b439283b88fe08fd73fd2f14bbdb2e3376615 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 7 May 2024 17:11:30 -0400 Subject: [PATCH 010/114] chore: bump version to 0.2.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f6e8159..3884d58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "bento_reference_service" -version = "0.1.2" +version = "0.2.0" description = "Reference data (genomes & annotations) service for the Bento platform." 
authors = [ "David Lougheed ", From d971fc94dfd3a1f1465e3235915e9d20a6aab7aa Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 13:27:39 -0400 Subject: [PATCH 011/114] feat: allow putting GFF3 files to ingest annotations --- bento_reference_service/config.py | 3 +- .../{genomes.py => features.py} | 0 bento_reference_service/routers/genomes.py | 62 ++++++++++++++----- bento_reference_service/routers/refget.py | 8 ++- bento_reference_service/streaming.py | 10 ++- data/.gitkeep | 0 6 files changed, 66 insertions(+), 17 deletions(-) rename bento_reference_service/{genomes.py => features.py} (100%) delete mode 100644 data/.gitkeep diff --git a/bento_reference_service/config.py b/bento_reference_service/config.py index 307a061..55b65a0 100644 --- a/bento_reference_service/config.py +++ b/bento_reference_service/config.py @@ -20,7 +20,8 @@ class Config(BentoBaseConfig): service_url_base_path: str = "http://127.0.0.1:5000" # Base path to construct URIs from database_uri: str = "postgres://localhost:5432" - data_path: Path = Path(__file__).parent / "data" + file_ingest_tmp_dir: Path = Path(__file__).parent / "tmp" + file_ingest_chunk_size: int = 1024 * 256 # 256 KiB at a time file_response_chunk_size: int = 1024 * 16 # 16 KiB at a time response_substring_limit: int = 10000 # TODO: Refine default diff --git a/bento_reference_service/genomes.py b/bento_reference_service/features.py similarity index 100% rename from bento_reference_service/genomes.py rename to bento_reference_service/features.py diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index ee83c41..ac1c9af 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -1,16 +1,19 @@ +import aiofiles import asyncpg import traceback from bento_lib.auth.permissions import P_INGEST_REFERENCE_MATERIAL, P_DELETE_REFERENCE_MATERIAL from bento_lib.auth.resources import RESOURCE_EVERYTHING -from fastapi import 
APIRouter, HTTPException, Query, Request, status +from fastapi import APIRouter, HTTPException, Query, Request, UploadFile, status from fastapi.responses import StreamingResponse from typing import Annotated +from uuid import uuid4 from .. import models as m from ..authz import authz_middleware from ..config import ConfigDependency from ..db import Database, DatabaseDependency +from ..features import ingest_gene_feature_annotation from ..logger import LoggerDependency from ..streaming import generate_uri_streaming_response @@ -18,6 +21,10 @@ __all__ = ["genome_router"] +DEPENDENCY_INGEST_REFERENCE_MATERIAL = authz_middleware.dep_require_permissions_on_resource( + frozenset({P_INGEST_REFERENCE_MATERIAL}), RESOURCE_EVERYTHING +) + genome_router = APIRouter(prefix="/genomes") @@ -44,11 +51,7 @@ async def genomes_list( @genome_router.post( "", status_code=status.HTTP_201_CREATED, - dependencies=[ - authz_middleware.dep_require_permissions_on_resource( - frozenset({P_INGEST_REFERENCE_MATERIAL}), RESOURCE_EVERYTHING - ) - ], + dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL], ) async def genomes_create( db: DatabaseDependency, genome: m.Genome, logger: LoggerDependency, request: Request @@ -91,7 +94,7 @@ async def genomes_detail_fasta( range_header, "text/x-fasta", impose_response_limit=False, - extra_response_headers={"Accept-Ranges": "bytes"}, + support_byte_ranges=True, ) @@ -111,7 +114,7 @@ async def genomes_detail_fasta_index( range_header, "text/plain", impose_response_limit=False, - extra_response_headers={"Accept-Ranges": "bytes"}, + support_byte_ranges=True, ) @@ -212,10 +215,45 @@ async def genomes_detail_features_gff3( range_header, "application/gzip", impose_response_limit=False, - extra_response_headers={"Accept-Ranges": "bytes"}, + support_byte_ranges=True, ) +@genome_router.put( + "/{genome_id}/features.gff3.gz", + dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL], + status_code=status.HTTP_204_NO_CONTENT, +) +async def 
genomes_detail_features_ingest_gff3( + config: ConfigDependency, + db: DatabaseDependency, + logger: LoggerDependency, + genome_id: str, + gff3_gz: UploadFile, + gff3_gz_tbi: UploadFile, +): + fn = config.file_ingest_tmp_dir / f"{uuid4()}.gff3.gz" + fn_tbi = config.file_ingest_tmp_dir / f"{fn}.tbi" + + try: + # copy .gff3.gz to temporary directory for ingestion + async with aiofiles.open(fn, "wb") as fh: + while data := (await gff3_gz.read(config.file_response_chunk_size)): + await fh.write(data) + + # copy .gff3.gz.tbi to temporary directory for ingestion + async with aiofiles.open(fn_tbi, "wb") as fh: + while data := (await gff3_gz_tbi.read(config.file_response_chunk_size)): + await fh.write(data) + + # ingest gene features into the database + await ingest_gene_feature_annotation(genome_id, fn, fn_tbi, db, logger) + + finally: + fn.unlink(missing_ok=True) + fn_tbi.unlink(missing_ok=True) + + @genome_router.get("/{genome_id}/features.gff3.gz.tbi", dependencies=[authz_middleware.dep_public_endpoint()]) async def genomes_detail_gene_features_gff3_index( config: ConfigDependency, db: DatabaseDependency, logger: LoggerDependency, request: Request, genome_id: str @@ -239,9 +277,5 @@ async def genomes_detail_gene_features_gff3_index( range_header, "application/octet-stream", impose_response_limit=False, - extra_response_headers={"Accept-Ranges": "bytes"}, + support_byte_ranges=True, ) - - -# TODO: more normal annotation PUT endpoint -# - treat gene_features as a file that can be replaced basically diff --git a/bento_reference_service/routers/refget.py b/bento_reference_service/routers/refget.py index b1f50c3..75dcefd 100644 --- a/bento_reference_service/routers/refget.py +++ b/bento_reference_service/routers/refget.py @@ -145,7 +145,13 @@ async def refget_sequence( # TODO: correct range: accounting for offsets in file from FAI return generate_uri_streaming_response( - config, logger, genome.fasta, "TODO", "text/x-fasta", headers, impose_response_limit=True + config, + 
logger, + genome.fasta, + "TODO", + "text/x-fasta", + impose_response_limit=True, + extra_response_headers=headers, ) diff --git a/bento_reference_service/streaming.py b/bento_reference_service/streaming.py index a8bf518..ab8fb6e 100644 --- a/bento_reference_service/streaming.py +++ b/bento_reference_service/streaming.py @@ -21,6 +21,9 @@ ] +ACCEPT_BYTE_RANGES = {"Accept-Ranges": "bytes"} + + class StreamingRangeNotSatisfiable(Exception): def __init__(self, message: str, n_bytes: int | None): self._n_bytes: int | None = n_bytes @@ -223,13 +226,18 @@ async def generate_uri_streaming_response( range_header: str | None, media_type: str, impose_response_limit: bool, + support_byte_ranges: bool = False, extra_response_headers: dict[str, str] | None = None, ): try: content_length, stream = await stream_from_uri(config, logger, uri, range_header, impose_response_limit) return StreamingResponse( stream, - headers={**(extra_response_headers or {}), "Content-Length": str(content_length)}, + headers={ + **(extra_response_headers or {}), + **(ACCEPT_BYTE_RANGES if support_byte_ranges else {}), + "Content-Length": str(content_length), + }, media_type=media_type, status_code=status.HTTP_206_PARTIAL_CONTENT if range_header else status.HTTP_200_OK, ) diff --git a/data/.gitkeep b/data/.gitkeep deleted file mode 100644 index e69de29..0000000 From b30747d181de8fb1d9dfff49e16a35bae3ebf198 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 13:51:15 -0400 Subject: [PATCH 012/114] feat(workflows): optional gff3 ingestion in fasta ingest workflow --- .../workflows/wdls/fasta_ref.wdl | 101 +++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index 7579011..b724341 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -34,6 +34,30 @@ workflow fasta_ref { validate_ssl 
= validate_ssl } + if (genome_gff3) { + call normalize_and_compress_gff3_and_index as gi { + input: + genome_id = genome_id, + gff3 = genome_gff3 + } + + call ingest_into_drs as drs_gff3 { + input: + file = gi.sorted_gff3_gz, + drs_url = drs_url, + access_token = access_token, + validate_ssl = validate_ssl + } + + call ingest_into_drs as drs_gff3_tbi { + input: + file = gi.sorted_gff3_gz_tbi, + drs_url = drs_url, + access_token = access_token, + validate_ssl = validate_ssl + } + } + call ingest_metadata_into_ref { input: genome_id = genome_id, @@ -46,6 +70,18 @@ workflow fasta_ref { token = access_token, validate_ssl = validate_ssl } + + if (genome_gff3) { + call ingest_gff3_into_ref { + input: + genome_id = genome_id, + gff3_gz = gi.sorted_gff3_gz, + gff3_gz_tbi = gi.sorted_gff3_gz_tbi, + reference_url = reference_url, + token = access_token, + validate_ssl = validate_ssl + } + } } task uncompress_fasta_and_generate_fai_if_needed { @@ -70,6 +106,7 @@ task uncompress_fasta_and_generate_fai_if_needed { } } +# TODO: shared file with this task task ingest_into_drs { input { File file @@ -104,6 +141,34 @@ task ingest_into_drs { } } +# TODO: shared file with this task +task normalize_and_compress_gff3_and_index { + input { + String genome_id + File gff3 + } + + command <<< + if [[ '~{gff3}' == *.gz ]]; then + zcat '~{gff3}' > unsorted.gff3 + else + cp '~{gff3}' unsorted.gff3 + fi + + out_file='~{genome_id}_annotation.gff3.gz' + + # See http://www.htslib.org/doc/tabix.html#EXAMPLE + # - sort the GFF3 file + (grep ^"#" sorted.gff3; grep -v ^"#" sorted.gff3 | sort -k1,1 -k4,4n) | bgzip -@ 2 > "${out_file}" + tabix -@ 2 "${out_file}" + >>> + + output { + File sorted_gff3_gz = "${genome_id}_annotation.gff3.gz" + File sorted_gff3_gz_tbi = "${genome_id}_annotation.gff3.gz.tbi" + } +} + task ingest_metadata_into_ref { input { String genome_id @@ -112,6 +177,8 @@ task ingest_metadata_into_ref { File fai String fasta_drs_uri String fai_drs_uri + String? gff3_gz_drs_uri + String? 
gff3_gz_tbi_drs_uri String reference_url String token Boolean validate_ssl @@ -119,7 +186,13 @@ task ingest_metadata_into_ref { command <<< fasta-checksum-utils '~{fasta}' --fai '~{fai}' --genome-id '~{genome_id}' --out-format bento-json | \ - jq '.fasta = "~{fasta_drs_uri}" | .fai = "~{fai_drs_uri}" | .taxon = ~{taxon_term_json}' > metadata.json + jq '.fasta = "~{fasta_drs_uri}" | .fai = "~{fai_drs_uri}" | .taxon = ~{taxon_term_json}' > metadata.json + + if [[ '~{gff3_gz_drs_uri}' != '' ]]; then # assume if this is set then both gff3 variables are set. + cat metadata.json | \ + jq '.gff3_gz = "~{gff3_gz_drs_uri}" | .gff_gz_tbi = "~{gff3_gz_tbi_drs_uri}"' > metadata.json.tmp + mv metadata.json.tmp metadata.json + fi rm '~{fasta}' '~{fai}' @@ -137,3 +210,29 @@ task ingest_metadata_into_ref { File err = stderr() } } + +task ingest_gff3_into_ref { + input { + String genome_id + File gff3_gz + File gff3_gz_tbi + String reference_url + String token + Boolean validate_ssl + } + + command <<< + curl ~{true="" false="-k" validate_ssl} \ + -X PUT \ + -F "gff3_gz=@~{gff3_gz}" \ + -F "gff3_gz_tbi=@~{gff3_gz_tbi}" \ + -H "Authorization: Bearer ~{token}" \ + --fail-with-body \ + "~{reference_url}/genomes/~{genome_id}/features.gff3.gz" + >>> + + output { + File out = stdout() + File err = stderr() + } +} From 813008cb6e27f291f00dcb1dafc4f6174a740102 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 14:31:27 -0400 Subject: [PATCH 013/114] fix: workflow issues --- bento_reference_service/workflows/metadata.py | 2 +- bento_reference_service/workflows/wdls/fasta_ref.wdl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bento_reference_service/workflows/metadata.py b/bento_reference_service/workflows/metadata.py index b0381d7..2758ac6 100644 --- a/bento_reference_service/workflows/metadata.py +++ b/bento_reference_service/workflows/metadata.py @@ -83,7 +83,7 @@ ), wm.WorkflowFileInput( id="genome_gff3", - pattern=r"^.*\.(gff|gff3)$", + 
pattern=r"^.*\.(gff|gff3|gff.gz|gff3.gz)$", help="GFF3-formatted annotation file for the reference genome.", ), ], diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index b724341..bf82650 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -159,8 +159,8 @@ task normalize_and_compress_gff3_and_index { # See http://www.htslib.org/doc/tabix.html#EXAMPLE # - sort the GFF3 file - (grep ^"#" sorted.gff3; grep -v ^"#" sorted.gff3 | sort -k1,1 -k4,4n) | bgzip -@ 2 > "${out_file}" - tabix -@ 2 "${out_file}" + (grep ^"#" unsorted.gff3; grep -v ^"#" unsorted.gff3 | sort -k1,1 -k4,4n) | bgzip -@ 2 > "${out_file}" + tabix "${out_file}" >>> output { From 89f2dcb42797fbd7f7bc7acdc80aad484ffe3b30 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 14:31:56 -0400 Subject: [PATCH 014/114] chore(workflows): implement gff3_annot workflow --- .../workflows/wdls/gff3_annot.wdl | 74 ++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/workflows/wdls/gff3_annot.wdl b/bento_reference_service/workflows/wdls/gff3_annot.wdl index 7c95b0b..b7fc75d 100644 --- a/bento_reference_service/workflows/wdls/gff3_annot.wdl +++ b/bento_reference_service/workflows/wdls/gff3_annot.wdl @@ -1,6 +1,6 @@ version 1.0 -workflow fasta_ref { +workflow gff3_annot { input { String genome_id File genome_gff3 @@ -9,4 +9,76 @@ workflow fasta_ref { String reference_url Boolean validate_ssl } + + call normalize_and_compress_gff3_and_index as gi { + input: + genome_id = genome_id, + gff3 = genome_gff3 + } + + # TODO: DRS ingestion + updating reference metadata record + + call ingest_gff3_into_ref { + input: + genome_id = genome_id, + gff3_gz = gi.sorted_gff3_gz, + gff3_gz_tbi = gi.sorted_gff3_gz_tbi, + reference_url = reference_url, + token = access_token, + validate_ssl = validate_ssl + } +} + +# TODO: shared 
file with this task +task normalize_and_compress_gff3_and_index { + input { + String genome_id + File gff3 + } + + command <<< + if [[ '~{gff3}' == *.gz ]]; then + zcat '~{gff3}' > unsorted.gff3 + else + cp '~{gff3}' unsorted.gff3 + fi + + out_file='~{genome_id}_annotation.gff3.gz' + + # See http://www.htslib.org/doc/tabix.html#EXAMPLE + # - sort the GFF3 file + (grep ^"#" unsorted.gff3; grep -v ^"#" unsorted.gff3 | sort -k1,1 -k4,4n) | bgzip -@ 2 > "${out_file}" + tabix "${out_file}" + >>> + + output { + File sorted_gff3_gz = "${genome_id}_annotation.gff3.gz" + File sorted_gff3_gz_tbi = "${genome_id}_annotation.gff3.gz.tbi" + } +} + +task ingest_gff3_into_ref { + input { + String genome_id + File gff3_gz + File gff3_gz_tbi + String reference_url + String token + Boolean validate_ssl + } + + command <<< + curl ~{true="" false="-k" validate_ssl} \ + -X PUT \ + -F "gff3_gz=@~{gff3_gz}" \ + -F "gff3_gz_tbi=@~{gff3_gz_tbi}" \ + -H "Authorization: Bearer ~{token}" \ + --fail-with-body \ + "~{reference_url}/genomes/~{genome_id}/features.gff3.gz" + >>> + + output { + File out = stdout() + File err = stderr() + } } From 5a7c310924ff81367d21609d8a90c4bb950e4f15 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 14:32:05 -0400 Subject: [PATCH 015/114] chore(db): implement bulk feature ingestion --- bento_reference_service/db.py | 49 +++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index b209025..5c4e975 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -352,8 +352,53 @@ def _q_param(pv: str | int) -> str: return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): - # TODO - pass + entries: list[tuple[str, str, int, int, str, float | None, int | None]] = [] + attributes: list[tuple[str, str, str, str]] = [] + parents: 
list[tuple[str, str, str]] = [] + feature_tuples: list[tuple[str, str, str, str, str, str, str]] = [] + + for feature in features: + genome_id = feature.genome_id + contig_name = feature.contig_name + feature_id = feature.feature_id + + entries.extend( + ( + genome_id, + feature_id, + e.start_pos, + e.end_pos, + f"{contig_name}:{e.start_pos}-{e.end_pos}", + e.score, + e.phase, + ) + for e in feature.entries + ) + + for attr_tag, attr_vals in feature.attributes.items(): + attributes.extend((genome_id, feature_id, attr_tag, attr_val) for attr_val in attr_vals) + + parents.extend((genome_id, feature_id, p) for p in feature.parents) + + feature_tuples.append( + ( + genome_id, + contig_name, + feature.strand, + feature_id, + feature.feature_name, + feature.feature_type, + feature.source, + ) + ) + + conn: asyncpg.Connection + async with self.connect() as conn: + async with conn.transaction(): + await conn.copy_records_to_table("genome_features", records=feature_tuples) + await conn.copy_records_to_table("genome_feature_attributes", records=attributes) + await conn.copy_records_to_table("genome_feature_entries", records=entries) + await conn.copy_records_to_table("genome_feature_parents", records=parents) @lru_cache() From 3b0b9bde9b77483cd9aa851c71357f75f8d83a19 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 14:39:05 -0400 Subject: [PATCH 016/114] chore: add default tmp folder --- bento_reference_service/config.py | 2 +- tmp/.gitkeep | 0 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 tmp/.gitkeep diff --git a/bento_reference_service/config.py b/bento_reference_service/config.py index 55b65a0..c63e400 100644 --- a/bento_reference_service/config.py +++ b/bento_reference_service/config.py @@ -20,7 +20,7 @@ class Config(BentoBaseConfig): service_url_base_path: str = "http://127.0.0.1:5000" # Base path to construct URIs from database_uri: str = "postgres://localhost:5432" - file_ingest_tmp_dir: Path = Path(__file__).parent / "tmp" + 
file_ingest_tmp_dir: Path = Path(__file__).parent.parent / "tmp" # Default to repository `tmp` folder file_ingest_chunk_size: int = 1024 * 256 # 256 KiB at a time file_response_chunk_size: int = 1024 * 16 # 16 KiB at a time diff --git a/tmp/.gitkeep b/tmp/.gitkeep new file mode 100644 index 0000000..e69de29 From 94efe08dadc0111b1803ffc13faf60285fb703d4 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 14:40:58 -0400 Subject: [PATCH 017/114] chore: create ingest tmp dir if needed --- bento_reference_service/main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bento_reference_service/main.py b/bento_reference_service/main.py index 4797265..df56e95 100644 --- a/bento_reference_service/main.py +++ b/bento_reference_service/main.py @@ -48,6 +48,10 @@ app.exception_handler(RequestValidationError)(validation_exception_handler_factory(authz_middleware)) +# Create the required ingestion temporary directory if needed +config_for_setup.file_ingest_tmp_dir.mkdir(exist_ok=True) + + @app.get("/service-info", dependencies=[authz_middleware.dep_public_endpoint()]) async def service_info(config: ConfigDependency, logger: LoggerDependency): return await build_service_info_from_pydantic_config( From ccb5563579ce7c4b6fa1ce68fcdece8a60fab307 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 15:06:41 -0400 Subject: [PATCH 018/114] fix: mismatch between DB and Pydantic model for strand --- bento_reference_service/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/models.py b/bento_reference_service/models.py index 01fac9e..954bad6 100644 --- a/bento_reference_service/models.py +++ b/bento_reference_service/models.py @@ -80,7 +80,7 @@ class GenomeFeature(BaseModel): genome_id: str contig_name: str - strand: Literal["negative", "positive", "unknown", "not_stranded"] + strand: Literal["-", "+", "?", "."] feature_id: str feature_name: str From 60a5515e605adfb2631cd1955e263f5640d71863 Mon Sep 17 
00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 15:07:14 -0400 Subject: [PATCH 019/114] chore: properly ignore contents of tmp folder --- tmp/.gitignore | 2 ++ tmp/.gitkeep | 0 2 files changed, 2 insertions(+) create mode 100644 tmp/.gitignore delete mode 100644 tmp/.gitkeep diff --git a/tmp/.gitignore b/tmp/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/tmp/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/tmp/.gitkeep b/tmp/.gitkeep deleted file mode 100644 index e69de29..0000000 From 44ef1fa4193c82f6631248643c2bc302ba4468eb Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 15:24:52 -0400 Subject: [PATCH 020/114] fix: misc issues with GFF feature ingestion --- bento_reference_service/features.py | 134 ++++++++++++++++------------ 1 file changed, 78 insertions(+), 56 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index ee453ef..3090929 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -1,7 +1,7 @@ import itertools import logging import pysam -import re +import traceback from pathlib import Path from typing import Generator @@ -15,7 +15,10 @@ ] -VALID_GENOME_ID = re.compile(r"^[a-zA-Z0-9\-_.]{3,50}$") +GFF_CAPTURED_ATTRIBUTES = frozenset({"ID", "Parent"}) +GFF_SKIPPED_FEATURE_TYPES = frozenset({"stop_codon_redefined_as_selenocysteine"}) +GFF_BATCH_SIZE = 5000 +GFF_LOG_PROGRESS_INTERVAL = 1000 class AnnotationIngestError(Exception): @@ -24,27 +27,26 @@ class AnnotationIngestError(Exception): def parse_attributes(raw_attributes: dict[str, str]) -> dict[str, list[str]]: # See "attributes" in http://gmod.org/wiki/GFF3 - return {k: [url_unquote(e) for e in v.split(",")] for k, v in raw_attributes.items()} + return {k: [url_unquote(e) for e in str(v).split(",") if e] for k, v in raw_attributes.items()} def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None: - feature_type = record.feature + 
feature_type = record.feature.lower() feature_name: str | None = attributes.get("Name", (None,))[0] if feature_name: return feature_name - transcript_id = attributes.get("transcript_id", (None,))[0] - transcript_name = attributes.get("transcript_name", (transcript_id,))[0] + transcript_name = attributes.get("transcript_name", attributes.get("transcript_id", (None,)))[0] match feature_type: case "gene": return attributes.get("gene_name", attributes.get("gene_id", (None,)))[0] case "transcript": - return attributes.get("transcript_name", attributes.get("transcript_id", (None,)))[0] - case "5UTR" | "five_prime_utr": # 5' untranslated region (UTR) + return transcript_name + case "5utr" | "five_prime_utr": # 5' untranslated region (UTR) return f"{transcript_name} 5' UTR" - case "3UTR" | "three_prime_utr": # 3' untranslated region (UTR) + case "3utr" | "three_prime_utr": # 3' untranslated region (UTR) return f"{transcript_name} 3' UTR" case "start_codon": return f"{transcript_name} start codon" @@ -55,7 +57,7 @@ def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None return attributes["exon_id"][0] else: return attributes["ID"][0] - case "CDS": # coding sequence + case "cds": # coding sequence return f"{transcript_name} CDS" case _: return None @@ -82,75 +84,95 @@ async def ingest_gene_feature_annotation( :return: None """ - # TODO: make sure it's a gtf.gz - # TODO: copy it in place - genome: m.GenomeWithURIs | None = await db.get_genome(genome_id) if genome is None: raise AnnotationIngestError(f"Genome with ID {genome_id} not found") - log_progress_interval = 1000 - def _iter_features() -> Generator[m.GenomeFeature, None, None]: gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) total_processed: int = 0 + try: features_by_id: dict[str, m.GenomeFeature] = {} for contig in genome.contigs: logger.info(f"Indexing features from contig {contig.name}") - features_by_id.clear() - for record in gff.fetch(contig.name, parser=pysam.asGFF3()): 
- # for some reason, dict(...) returns the attributes dict: - record_attributes = parse_attributes(dict(record)) + try: + fetch_iter = gff.fetch(contig.name, parser=pysam.asGFF3()) + except ValueError: + logger.warning(f"Could not find contig with name {contig.name} in GFF3; skipping...") + continue + for i, record in fetch_iter: feature_type = record.feature - feature_id = record_attributes.get("ID", (None,))[0] - feature_name = extract_feature_name(record, record_attributes) - - if feature_id is None: - logger.warning(f"Skipping unsupported feature (type={feature_type}, no ID retrieval): {record}") - continue - - if feature_name is None: - logger.warning(f"Using ID as name for feature: {record}") - feature_name = feature_id - - entry = m.GenomeFeatureEntry( - start_pos=record.start, - end_pos=record.end, - score=record.score, - phase=record.frame, # misnamed in PySAM's GFF3 parser - ) - - if feature_id in features_by_id: - features_by_id[feature_id].entries.append(entry) - else: - features_by_id[feature_id] = m.GenomeFeature( - genome_id=genome_id, - contig_name=contig.name, - strand=record.strand, - feature_id=feature_id, - feature_name=feature_name, - feature_type=feature_type, - source=record.source, - entries=[entry], - attributes=record_attributes, - parents=tuple(p for p in record_attributes.get("Parent", "").split(",") if p), + + if feature_type in GFF_SKIPPED_FEATURE_TYPES: + continue # Don't ingest stop_codon_redefined_as_selenocysteine annotations + + # for some reason, dict(...) 
returns the attributes dict: + feature_raw_attributes = dict(record) + + try: + record_attributes = parse_attributes(feature_raw_attributes) + feature_id = record_attributes.get("ID", (None,))[0] + feature_name = extract_feature_name(record, record_attributes) + + if feature_id is None: + logger.warning( + f"Skipping unsupported feature {i}: type={feature_type}, no ID retrieval; {record}" + ) + continue + + if feature_name is None: + logger.warning(f"Using ID as name for feature {i}: {record}") + feature_name = feature_id + + entry = m.GenomeFeatureEntry( + start_pos=record.start, + end_pos=record.end, + score=record.score, + phase=record.frame, # misnamed in PySAM's GFF3 parser ) - total_processed += 1 - if total_processed % log_progress_interval == 0: - logger.info(f"Processed {total_processed} features") + if feature_id in features_by_id: + features_by_id[feature_id].entries.append(entry) + else: + features_by_id[feature_id] = m.GenomeFeature( + genome_id=genome_id, + contig_name=contig.name, + strand=record.strand, + feature_id=feature_id, + feature_name=feature_name, + feature_type=feature_type, + source=record.source, + entries=[entry], + attributes={ + # skip attributes which have been captured in the above information + k: v for k, v in record_attributes.items() if k not in GFF_CAPTURED_ATTRIBUTES + }, + parents=tuple(p for p in record_attributes.get("Parent", ()) if p), + ) + + except Exception as e: + logger.error( + f"Could not process feature {i}: {feature_type=}, {feature_raw_attributes=}; encountered " + f"exception: {e}" + ) + logger.error(traceback.format_exc()) - yield from features_by_id.values() + total_processed += 1 + if total_processed % GFF_LOG_PROGRESS_INTERVAL == 0: + logger.info(f"Processed {total_processed} features") + + yield from features_by_id.values() + features_by_id.clear() finally: gff.close() features_to_ingest = _iter_features() - while data := tuple(itertools.islice(features_to_ingest, 1000)): # take features in batches + while 
data := tuple(itertools.islice(features_to_ingest, GFF_BATCH_SIZE)): # take features in batches await db.bulk_ingest_genome_features(data) From 3aabd0c386698757297760be4ffedf6b2d5a3a54 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 15:26:15 -0400 Subject: [PATCH 021/114] fix: missing enumerate() in feature ingest --- bento_reference_service/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 3090929..1974d06 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -105,7 +105,7 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: logger.warning(f"Could not find contig with name {contig.name} in GFF3; skipping...") continue - for i, record in fetch_iter: + for i, record in enumerate(fetch_iter): feature_type = record.feature if feature_type in GFF_SKIPPED_FEATURE_TYPES: From 77991c6f0a18aef201369d82e7faa9c7bb619d0e Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 15:43:50 -0400 Subject: [PATCH 022/114] fix: clear genome features before (re)ingesting them --- bento_reference_service/db.py | 8 ++++++++ bento_reference_service/routers/genomes.py | 3 +++ 2 files changed, 11 insertions(+) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 5c4e975..5e86eff 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -351,6 +351,14 @@ def _q_param(pv: str | int) -> str: return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} + async def clear_genome_features(self, g_id: str): + conn: asyncpg.Connection + async with self.connect() as conn: + await conn.execute("DELETE FROM genome_feature_attributes WHERE genome_id = $1", g_id) + await conn.execute("DELETE FROM genome_feature_entries WHERE genome_id = $1", g_id) + await conn.execute("DELETE FROM genome_feature_parents WHERE genome_id = $1", g_id) + await 
conn.execute("DELETE FROM genome_features WHERE genome_id = $1", g_id) + async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): entries: list[tuple[str, str, int, int, str, float | None, int | None]] = [] attributes: list[tuple[str, str, str, str]] = [] diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index ac1c9af..480115a 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -246,6 +246,9 @@ async def genomes_detail_features_ingest_gff3( while data := (await gff3_gz_tbi.read(config.file_response_chunk_size)): await fh.write(data) + # clear existing gene features for this genome + await db.clear_genome_features(genome_id) + # ingest gene features into the database await ingest_gene_feature_annotation(genome_id, fn, fn_tbi, db, logger) From 57be8f570deb792d46962b1c935b63c2fe8c8b22 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 16:25:19 -0400 Subject: [PATCH 023/114] fix: issues with ingesting features to db + ingest feature types --- bento_reference_service/db.py | 55 ++++++++++++++++++++++++++--- bento_reference_service/features.py | 4 ++- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 5e86eff..8bce7e3 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -360,6 +360,7 @@ async def clear_genome_features(self, g_id: str): await conn.execute("DELETE FROM genome_features WHERE genome_id = $1", g_id) async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): + feature_types: list[tuple[str]] = [] entries: list[tuple[str, str, int, int, str, float | None, int | None]] = [] attributes: list[tuple[str, str, str, str]] = [] parents: list[tuple[str, str, str]] = [] @@ -370,6 +371,8 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): contig_name = 
feature.contig_name feature_id = feature.feature_id + feature_types.append((feature.feature_type,)) + entries.extend( ( genome_id, @@ -403,10 +406,54 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): conn: asyncpg.Connection async with self.connect() as conn: async with conn.transaction(): - await conn.copy_records_to_table("genome_features", records=feature_tuples) - await conn.copy_records_to_table("genome_feature_attributes", records=attributes) - await conn.copy_records_to_table("genome_feature_entries", records=entries) - await conn.copy_records_to_table("genome_feature_parents", records=parents) + await conn.executemany("INSERT INTO genome_feature_types(type_id) VALUES ($1) ON CONFLICT DO NOTHING", + feature_types) + + await conn.copy_records_to_table( + "genome_features", + columns=[ + "genome_id", + "contig_name", + "strand", + "feature_id", + "feature_name", + "feature_type", + "source", + ], + records=feature_tuples, + ) + await conn.copy_records_to_table( + "genome_feature_attributes", + columns=[ + "genome_id", + "feature_id", + "attr_tag", + "attr_val", + ], + records=attributes, + ) + await conn.copy_records_to_table( + "genome_feature_entries", + columns=[ + "genome_id", + "feature_id", + "start_pos", + "end_pos", + "position_text", + "score", + "phase", + ], + records=entries, + ) + await conn.copy_records_to_table( + "genome_feature_parents", + columns=[ + "genome_id", + "feature_id", + "parent_id", + ], + records=parents, + ) @lru_cache() diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 1974d06..9e87a0b 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -150,7 +150,9 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: entries=[entry], attributes={ # skip attributes which have been captured in the above information - k: v for k, v in record_attributes.items() if k not in GFF_CAPTURED_ATTRIBUTES + k: v + for k, v in 
record_attributes.items() + if k not in GFF_CAPTURED_ATTRIBUTES }, parents=tuple(p for p in record_attributes.get("Parent", ()) if p), ) From 8229bd93e6bf055a3f9678e85643ce1e43cfe9fe Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 16:25:40 -0400 Subject: [PATCH 024/114] chore(sql): don't create unneeded table, add v0.2 migration --- bento_reference_service/sql/migrate_v0_2.sql | 7 +++++++ bento_reference_service/sql/schema.sql | 5 ----- 2 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 bento_reference_service/sql/migrate_v0_2.sql diff --git a/bento_reference_service/sql/migrate_v0_2.sql b/bento_reference_service/sql/migrate_v0_2.sql new file mode 100644 index 0000000..ff61e11 --- /dev/null +++ b/bento_reference_service/sql/migrate_v0_2.sql @@ -0,0 +1,7 @@ +-- Run these commands before migrating to v0.2.x + +DROP TABLE genome_feature_annotations CASCADE; +DROP TABLE genome_feature_features CASCADE; +DROP TABLE IF EXISTS genome_feature_type_synonyms; -- from v0.1, now unused + +DROP TYPE strand_type; diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 33ae43d..d0716ec 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -57,11 +57,6 @@ CREATE TABLE IF NOT EXISTS genome_contig_aliases ( CREATE TABLE IF NOT EXISTS genome_feature_types ( type_id VARCHAR(63) NOT NULL PRIMARY KEY -- Term ID from the Sequence Ontology ); -CREATE TABLE IF NOT EXISTS genome_feature_type_synonyms ( - type_id VARCHAR(63) NOT NULL REFERENCES genome_feature_types, - synonym VARCHAR(63) NOT NULL UNIQUE, - PRIMARY KEY (type_id, synonym) -); DO $$ BEGIN CREATE TYPE strand_type AS ENUM ('-', '+', '?', '.'); From e764d76fbbcbc6b342fdf77a8b0a9c939a476f79 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 16:25:51 -0400 Subject: [PATCH 025/114] chore: don't log everything as debug --- bento_reference_service/logger.py | 2 -- 1 file changed, 2 deletions(-) 
diff --git a/bento_reference_service/logger.py b/bento_reference_service/logger.py index 841f969..acfb882 100644 --- a/bento_reference_service/logger.py +++ b/bento_reference_service/logger.py @@ -12,8 +12,6 @@ "LoggerDependency", ] -logging.basicConfig(level=logging.DEBUG) - @lru_cache def get_logger(config: ConfigDependency) -> logging.Logger: From a63dcfe76b61975c196060c17fffdcce6e4ef892 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 17:31:25 -0400 Subject: [PATCH 026/114] fix: issues with feature ingest --- bento_reference_service/features.py | 34 +++++++++++++++++--- bento_reference_service/routers/genomes.py | 4 +++ bento_reference_service/sql/migrate_v0_2.sql | 2 +- bento_reference_service/sql/schema.sql | 2 +- 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 9e87a0b..0be7ab3 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -30,6 +30,24 @@ def parse_attributes(raw_attributes: dict[str, str]) -> dict[str, list[str]]: return {k: [url_unquote(e) for e in str(v).split(",") if e] for k, v in raw_attributes.items()} +def extract_feature_id(record, attributes: dict[str, list[str]]) -> str | None: + feature_type = record.feature.lower() + feature_id = attributes.get("ID", (None,))[0] + + if feature_id: + return feature_id + + match feature_type: + case "gene": + return attributes.get("gene_id", (None,))[0] + case "transcript": + return attributes.get("transcript_id", (None,))[0] + case "exon": + return attributes.get("exon_id", (None,))[0] + case _: # no alternative ID attribute to use, so we couldn't figure anything out. 
+ return None + + def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None: feature_type = record.feature.lower() feature_name: str | None = attributes.get("Name", (None,))[0] @@ -101,8 +119,8 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: try: fetch_iter = gff.fetch(contig.name, parser=pysam.asGFF3()) - except ValueError: - logger.warning(f"Could not find contig with name {contig.name} in GFF3; skipping...") + except ValueError as e: + logger.warning(f"Could not find contig with name {contig.name} in GFF3; skipping... ({e})") continue for i, record in enumerate(fetch_iter): @@ -116,7 +134,7 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: try: record_attributes = parse_attributes(feature_raw_attributes) - feature_id = record_attributes.get("ID", (None,))[0] + feature_id = extract_feature_id(record, record_attributes) feature_name = extract_feature_name(record, record_attributes) if feature_id is None: @@ -142,7 +160,7 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: features_by_id[feature_id] = m.GenomeFeature( genome_id=genome_id, contig_name=contig.name, - strand=record.strand, + strand=record.strand or ".", # None/"." 
<=> unstranded feature_id=feature_id, feature_name=feature_name, feature_type=feature_type, @@ -176,5 +194,13 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: features_to_ingest = _iter_features() + n_ingested: int = 0 + while data := tuple(itertools.islice(features_to_ingest, GFF_BATCH_SIZE)): # take features in batches await db.bulk_ingest_genome_features(data) + n_ingested += len(data) + + if n_ingested == 0: + raise AnnotationIngestError("No gene features could be ingested - is this a valid GFF3 file?") + + logger.info(f"Ingested {n_ingested} gene features") diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 480115a..56ae132 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -241,11 +241,15 @@ async def genomes_detail_features_ingest_gff3( while data := (await gff3_gz.read(config.file_response_chunk_size)): await fh.write(data) + logger.debug(f"Wrote GFF.gz data to {fn}; size={fn.stat().st_size}") + # copy .gff3.gz.tbi to temporary directory for ingestion async with aiofiles.open(fn_tbi, "wb") as fh: while data := (await gff3_gz_tbi.read(config.file_response_chunk_size)): await fh.write(data) + logger.debug(f"Wrote GFF.gz.tbi data to {fn_tbi}; size={fn_tbi.stat().st_size}") + # clear existing gene features for this genome await db.clear_genome_features(genome_id) diff --git a/bento_reference_service/sql/migrate_v0_2.sql b/bento_reference_service/sql/migrate_v0_2.sql index ff61e11..354ce70 100644 --- a/bento_reference_service/sql/migrate_v0_2.sql +++ b/bento_reference_service/sql/migrate_v0_2.sql @@ -4,4 +4,4 @@ DROP TABLE genome_feature_annotations CASCADE; DROP TABLE genome_feature_features CASCADE; DROP TABLE IF EXISTS genome_feature_type_synonyms; -- from v0.1, now unused -DROP TYPE strand_type; +DROP TYPE strand_type CASCADE; diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 
d0716ec..102e28e 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -117,7 +117,7 @@ CREATE TABLE IF NOT EXISTS genome_feature_attributes ( genome_id VARCHAR(31) NOT NULL REFERENCES genomes, feature_id VARCHAR(63) NOT NULL, attr_tag VARCHAR(63) NOT NULL, - attr_val VARCHAR(63) NOT NULL, + attr_val TEXT NOT NULL, FOREIGN KEY (genome_id, feature_id) REFERENCES genome_features ); CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_idx From 203ca58c8b8bd010a2e420c132cc92fe3e58fc48 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 17:31:40 -0400 Subject: [PATCH 027/114] test: genome feature testing --- tests/conftest.py | 30 +- tests/data/sars_cov_2.fa | 929 ++++++++++++++++-------------- tests/data/sars_cov_2.fa.fai | 2 +- tests/data/sars_cov_2.gff3.gz | Bin 0 -> 1384 bytes tests/data/sars_cov_2.gff3.gz.tbi | Bin 0 -> 118 bytes tests/shared_data.py | 12 +- tests/test_genome_routes.py | 38 +- 7 files changed, 561 insertions(+), 450 deletions(-) create mode 100644 tests/data/sars_cov_2.gff3.gz create mode 100644 tests/data/sars_cov_2.gff3.gz.tbi diff --git a/tests/conftest.py b/tests/conftest.py index c6cf460..efaa42f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -34,26 +34,26 @@ async def db_cleanup(db: Database): async with db.connect() as conn: await conn.execute( """ - DROP INDEX genome_features_feature_id_trgm_gin; - DROP INDEX genome_features_feature_name_trgm_gin; - DROP INDEX genome_feature_entries_position_text_trgm_gin; - DROP INDEX genome_feature_attributes_attr_idx; + DROP INDEX IF EXISTS genome_features_feature_id_trgm_gin; + DROP INDEX IF EXISTS genome_features_feature_name_trgm_gin; + DROP INDEX IF EXISTS genome_feature_entries_position_text_trgm_gin; + DROP INDEX IF EXISTS genome_feature_attributes_attr_idx; - DROP TABLE genome_feature_entries; - DROP TABLE genome_feature_attributes; - DROP TABLE genome_feature_parents; - DROP TABLE genome_features; + DROP TABLE IF EXISTS 
genome_feature_entries; + DROP TABLE IF EXISTS genome_feature_attributes; + DROP TABLE IF EXISTS genome_feature_parents; + DROP TABLE IF EXISTS genome_features; - DROP TABLE genome_feature_type_synonyms; - DROP TABLE genome_feature_types; + DROP TABLE IF EXISTS genome_feature_type_synonyms; + DROP TABLE IF EXISTS genome_feature_types; - DROP TABLE genome_contig_aliases; - DROP TABLE genome_contigs; + DROP TABLE IF EXISTS genome_contig_aliases; + DROP TABLE IF EXISTS genome_contigs; - DROP TABLE genome_aliases; - DROP TABLE genomes; + DROP TABLE IF EXISTS genome_aliases; + DROP TABLE IF EXISTS genomes; - DROP TYPE strand_type; + DROP TYPE IF EXISTS strand_type; """ ) await db.close() diff --git a/tests/data/sars_cov_2.fa b/tests/data/sars_cov_2.fa index b15a9f0..4001a6c 100644 --- a/tests/data/sars_cov_2.fa +++ b/tests/data/sars_cov_2.fa @@ -1,429 +1,500 @@ ->NC_045512.2 Severe acute respiratory syndrome coronavirus 2 isolate Wuhan-Hu-1, complete genome -ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAA -CGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAAC -TAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTG -TTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTC -CCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTAC -GTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG -CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGAT -GCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTC -GTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCT -TCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTA -GGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTG -TTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG -CCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTG -TCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTG 
-CTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAA -ATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAA -CCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCAC -CAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCA -GACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACT -ACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAG -GACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCG -CACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCA -CGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACA -ACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA -GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGAT -TATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAG -GTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCG -TGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCC -GCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTG -ATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTG -GCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTT -AAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAA -TTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGT -AAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTA -GGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCC -TACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTT -AACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAA -GCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGT -ACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAA -GGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTT -GATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAA 
-ATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC -ACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAA -TTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAG -AAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATT -TGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAA -CAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTC -AACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT -AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACA -GTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTA -CTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCACTTAAAGTGGGTGGTAG -TTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGT -GAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTAT -TATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAA -TGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGT -GAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTA -AACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAAC -TCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCA -GATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTG -ATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAAT -GCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAAT -GGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTA -TTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGC -AGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAA -TATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAA -CAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTA -TGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTT -TCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAG 
-AACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACA -ACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCAC -CTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTA -AGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA -ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGT -AAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTG -ATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAA -TGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAA -ATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTA -ACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAAT -GAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGT -GGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAAT -TTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTC -ACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGT -GAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAG -ACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAG -TTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAG -TTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAAC -CATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAA -CCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGT -GATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAAC -CTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTG -TCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGA -ATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGA -AAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAA -TAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTT -ACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTG 
-CTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC -AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTA -TTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAG -CAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAA -TTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTAC -TCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAG -GCTATTTGAACTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCT -TAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAA -TGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCT -ATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTC -TTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATC -TTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTT -GTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAG -GTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGT -GATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAA -GACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCA -TCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGAC -AACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAAT -GTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACT -AGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTT -AATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTG -AACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGT -TGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTT -ACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTG -GTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT -ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAG -AATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAG 
-CACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTT -TGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAA -ATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTA -ACAAACATGCTGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCC -ATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGC -ACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACAC -CATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTT -TAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTAT -GAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACC -TTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATC -AGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCA -GGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTG -GTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTA -CTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTC -CTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTT -ACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTT -CACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGG -TTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTG -CGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTAC -GCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGC -TACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTC -TTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC -ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTT -GATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAG -ATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGG -ACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAG -TTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTT 
-ACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGG -TTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCAT -GCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTA -CGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTT -TCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTA -ACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTG -CTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGA -TGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACA -ATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTC -AATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTC -TGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCC -ACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATA -TGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACT -AATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTG -ACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCT -CTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTG -TGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTC -TTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTG -GTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA -GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTA -GCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAAC -TCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAA -AGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTA -GACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTA -GTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGA -TTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCA -GCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTG 
-AGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAA -TGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACA -ACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACAT -TTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAG -TGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCT -GCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTA -CACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACT -TGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATC -TATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTAT -ACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCT -ACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGAT -GCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGT -GTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGG -TGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTA -AAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAG -TCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA -GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCA -CAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAA -ATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTT -GTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTC -CAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCA -ACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGAC -ACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATG -ATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTT -AAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAA -GATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTG -TAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGT 
-TGACACTGACTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTA -AAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATG -ACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGG -ACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAG -CTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGT -ATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTC -AGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTAT -GACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTC -AGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAG -ACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCT -AACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGAC -TTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCC -TACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC -TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAG -GAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAG -TGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTT -AGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATA -GATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACC -AGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTC -ACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTAC -AACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGC -ATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTAT -GCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTA -TGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATAC -AATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCC -GGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTA -TAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATA 
-CATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGAT -AACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTG -TTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTT -ATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTAT -GTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATT -GTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAA -TACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGT -GATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTG -AGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCT -TTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT -AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACC -GAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATT -AAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATC -TCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGG -GACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGT -GTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGAT -AAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACAT -TAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGA -AATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTAC -ATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATT -TCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCC -TGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCA -GCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCAC -AAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTCACCTTA -TAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGC -TCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTA -ATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTT 
-GCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTC -TTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACA -CTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACT -CATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAA -GAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTG -TTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTA -TGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA -CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAA -GTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATC -TATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTT -TCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTA -TGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCA -TGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTT -AAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAA -AGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAA -CCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGT -GACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTG -TATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAG -AGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCAC -ACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTC -CATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTAT -AACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCT -TATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGA -ACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGG -ACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTA -GAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTA -AACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGA 
-CTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAA -CCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTAT -TTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCC -CAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG -AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTA -AACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATT -AGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTA -CTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTA -CAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTT -ATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTG -ACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAA -AATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCT -ATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTC -GCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTA -TACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTAC -GGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGAT -TGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAA -ATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCT -AGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATG -GGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTG -GATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTG -GAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTA -AGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAG -GTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAA -CAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCA -ATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCA -GTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATG 
-TCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC -TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCC -CTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCAT -TTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGC -GAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTC -AAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTA -TTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTAT -TAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCA -GGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATA -ATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTT -GAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATT -GTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTG -TTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATC -ATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTAT -GCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTG -ATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTC -TAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGA -GATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACT -TTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACT -TTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAAC -AAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTC -TGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGA -GATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAAC -CAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTA -CTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC -TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACT -CAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTG 
-GTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTAC -CACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCA -ACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAA -TAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACC -AATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCA -TTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATT -GCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACC -TTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGG -ACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTG -GAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAA -AATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCA -CAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATA -TCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAG -TTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCT -ACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTA -TGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAA -GAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTT -TCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACAAATCATTACTACAGACAACA -CATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACC -TGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTA -GGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTG -CCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC -ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGT -ATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACG -ACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGA -ATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTC -GCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCT 
-TGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGT -GTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTG -GCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAAT -AATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTT -CTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTA -CTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATG -GGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCA -ACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGC -CTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAAT -TTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTAC -TCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTAT -TCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGT -GAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGAT -CTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGA -TTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTC -CTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTA -AGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAAT -AAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTC -ATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC -TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGT -GATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAA -GAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTG -ACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAG -CAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTAC -TATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATA -AACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAAC -CAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGA 
-GCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACA -TACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAAT -TTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACT -GTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTT -ATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTG -CTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAA -GATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTG -TAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCC -GTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAA -TTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCT -GTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGA -AGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTG -ATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAG -TAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACT -GCGTCTTGGTTCACCGCTCTCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTC -CAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG -TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCT -GGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAA -AAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAAC -ATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGT -AGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCA -ATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGG -TAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGG -CAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCC -AAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACA -ATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACG -TGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGC 
-TGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGC -TGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGAT -TTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATG -CAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTT -GTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCT -TTAATCAGTGTGTAACATTAGGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTAC -GATCGAGTGTACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT -TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAAAAAAAAAAAA -AAAAAAAAAAAAA +>MN908947.3 dna:primary_assembly primary_assembly:ASM985889v3:MN908947.3:1:29903:1 REF +ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCT +GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACT +CACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATC +TTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT +CGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAAC +ACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGG +AGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG +CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAA +ACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACT +CGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGG +CGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGG +TGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGA +TCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGA +ACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGG +CCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTC +ATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCG +TGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCA +GACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAA +TTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAA +GCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATG +CAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCA 
+GACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGA +AGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGC +ATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGG +CTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTC +TTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGG +TTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGA +AATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGA +GATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAA +AGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTAC +AAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCC +TCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCT +TGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGG +AATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTAC +TAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTG +GCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGA +AGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTAT +CTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAA +GGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTC +TATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCA +CTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCC +TCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTT +AACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGA +AGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGA +AATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATAC +CTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGA +AGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGT +ACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGC +CTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACC +ACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGG +TGAGTTTAAATTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGA +AGAAGGTGATTGTGAAGAAGAAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGA +AGATGATTACCAAGGTAAACCTTTGGAATTTGGTGCCACTTCTGCTGCTCTTCAACCTGA 
+AGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAACAAACTGTTGGTCAACAAGA +CGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTCAACCTCAATT +AGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTT +AAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGT +AAAACCAACAGTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGC +AGGAGCCTTAAATAAGGCTACTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGC +TACTAATGGACCACTTAAAGTGGGTGGTAGTTGTGTTTTAAGCGGACACAATCTTGCTAA +ACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGTGAAGACATTCAACTTCTTAA +GAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTATTATCAGCTGG +TATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAA +TGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGA +AATGAAGAGTGAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAA +GCCATTTATAACTGAAAGTAAACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAAT +CAAAGCTTGTGTTGAAGAAGTTACAACAACTCTGGAAGAAACTAAGTTCCTCACAGAAAA +CTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCAGATTCTGCCACTCTTGTTAG +TGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTGATGTTGTTCA +AGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAAT +GCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCA +GGGTTTAAATGGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGC +CTTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTC +TTGGAATTTGCGAGAAATGCTTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTG +TGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAATATAAGGGTATTAAAATACA +AGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGC +GTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTA +TGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCC +AGCTACAGTTTCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTC +TTCTTCTAAAACACCTGAAGAACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAA +AGATTGGTCCTATTCTGGACAATCTACACAACTAGGTATAGAATTTCTTAAGAGAGGTGA +TAAAAGTGTATATTACACTAGTAATCCTACCACATTCCACCTAGATGGTGAAGTTATCAC +CTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTAAGGTGTTTAC +AACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACA +ACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTC 
+ACATGAAGGTAAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTT +TGAGTACTACCACACAACTGATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCA +CACTAAAAAGTGGAAATACCCACAAGTTAATGGTTTAACTTCTATTAAATGGGCAGATAA +CAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAAATAGAGTTGAAGTTTAATCC +ACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTAACTTTTGTGC +ACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAAT +GAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTG +TAAAACTTGTGGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGG +CACACTTTCTTATGAACAATTTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACA +AGCTACAAAATATCTAGTACAACAGGAGTCACCTTTTGTTATGATGTCAGCACCACCTGC +TCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGTGAGTACACTGGTAATTACCA +GTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAGACGGTGCTTT +ACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAG +TTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAAT +TGACCCTAAGTTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAAT +TGATCTTGTACCAAACCAACCATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATG +TGATAATATCAAATTTGCTGATGATTTAAACCAGTTAACTGGTTATAAGAAACCTGCTTC +AAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGTGATGTGGTGGCTATTGATTA +TAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAACCTATTGTTTG +GCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTG +TCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGA +CGCGCAGGGAATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGT +GGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGT +AGGAGACATTATACTTAAACCAGCAAATAATAGTTTAAAAATTACAGAAGAGGTTGGCCA +CACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTTACTATTAAGAAACCTAATGA +ATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTGCTGTTAATAG +TGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTAC +AACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTT +CTTTACTTTATTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGC +ATCTATGCCGACTACTATAGCAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGA +GGCTTCATTTAATTATTTGAAGTCACCTAATTTTTCTAAACTGATAAATATTATAATTTG +GTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTACTCAACCGCTGCTTTAGGTGT 
+TTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAGGCTATTTGAA +CTCTACTAATGTCACTATTGCAACCTACTGTACTGGTTCTATACCTTGTAGTGTTTGTCT +TAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTC +ATCTTTTAAATGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATAT +TCTTTTCACTAGGTTTTTCTATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAG +CTATTTTGCAGTACATTTTATTAGTAATTCTTGGCTTATGTGGTTAATAATTAATCTTGT +ACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATCTTCTTTGCATCATTTTATTA +TGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTTGTATGATGTG +TTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAG +GTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTG +TGTTAATTGTGATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGA +CTTGTCACTACAGTTTAAAAGACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGA +TAGTGTTACAGTGAAGAATGGTTCCATCCATCTTTACTTTGATAAAGCTGGTCAAAAGAC +TTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGACAACCTGAGAGCTAATAACAC +TAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAATGTGAAGAATC +ATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACT +AGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGA +TGCTTACGTTAATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACT +AGTTGCAACTGCAGAAGCTGAACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTAC +TTTTATTTCAGCAGCTCGGCAAGGGTTTGTTGATTCAGATGTAGAAACTAAAGATGTTGT +TGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTTACTGGCGATAGTTGTAATAA +CTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTGGTGCTTGTAT +TGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGAT +ATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGC +TGCTAAAAAGAATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAA +TGTTGTAACAACAAAGATAGCACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCA +GTTAATTAAAGTTACACTTGTGTTCCTTTTTGTTGCTGCTATTTTCTATTTAATAACACC +TGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAAATCATAGGATACAAGGCTAT +TGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTAACAAACATGC +TGATTTTGACACATGGTTTAGCCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCC +ATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCAC +GATATTACGCACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGT 
+TGGTAACATCTGTTACACACCATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGC +TTGTGTTTTGGCTGCTGAATGTACAATTTTTAAAGATGCTTCTGGTAAGCCAGTACCATA +TTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTATGAAAGTTTACGCCCTGACAC +ACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACCTTGAAGGTTC +TGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATC +AGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAG +ATCTTTACCAGGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTAC +ACCACTAATTCAACCTATTGGTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTAT +TGTAGCTATCGTAGTAACATGCCTTGCCTACTATTTTATGAGGTTTAGAAGAGCTTTTGG +TGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTCCTTATGTCATTCACTGTACT +CTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTTACTTGTACTT +GACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTT +CACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCA +TTTCTATTGGTTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTT +TAGTACTTTTGAAGAAGCTGCGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAA +GTTGCGTAGTGATGTGCTATTACCTCTTACGCAATATAATAGATACTTAGCTCTTTATAA +TAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGCTACAGAGAAGCTGCTTGTTG +TCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTCTTTACCAACC +ACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCC +ATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGG +TCTTTGGCTTGATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACAT +GCTTAACCCTAATTATGAAGATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACA +GGCTGGTAATGTTCAACTCAGGGTTATTGGACATTCTATGCAAAATTGTGTACTTAAGCT +TAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAGTTTGTTCGCATTCAACCAGG +ACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTTACCAATGTGC +TATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGG +TTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAAC +TGGAGTTCATGCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCA +AACAGCACAAGCAGCTGGTACGGACACAACTATTACAGTTAATGTTTTAGCTTGGTTGTA +CGCTGCTGTTATAAATGGAGACAGGTGGTTTCTCAATCGATTTACCACAACTCTTAATGA +CTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTAACACAAGACCATGTTGACAT +ACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTGCTTCATTAAA 
+AGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGA +TGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGT +GAAAAGAACAATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTT +AGTTTTAGTCCAGAGTACTCAATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTT +ACCTTTTGCTATGGGTATTATTGCTATGTCTGCTTTTGCAATGATGTTTGTCAAACATAA +GCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCCACTGTAGCTTATTTTAATAT +GGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATATGGTTGATAC +TAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACT +AATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTAT +GAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTC +CATGTGGGCTCTTATAATCTCTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCAT +GTTTTTGGCCAGAGGTATTGTTTTTATGTGTGTTGAGTATTGCCCTATTTTCTTCATAAC +TGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTCTTAGGCTATTTTTGTACTTG +TTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTGGTGTTTATGA +TTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAA +GAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTG +TATCAAAGTAGCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTT +ACTCTCAGTTTTGCAACAACTCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGT +CCAGTTACACAATGACATTCTCTTAGCTAAAGATACTACTGAAGCCTTTGAAAAAATGGT +TTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTAGACATAAACAAGCTTTGTGA +AGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTAGTTCCCTTCC +ATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGA +TTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGA +CCGTGATGCAGCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAAT +GTATAAACAGGCTAGATCTGAGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAAT +GCTTTTCACTATGCTTAGAAAGTTGGATAATGATGCACTCAACAACATTATCAACAATGC +AAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACAACAGCAGCCAAACTAATGGT +TGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACATTTACTTATGC +ATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAG +TGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAG +GGCCAATTCTGCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGAT +GTCTTGTGCTGCCGGTACTACACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTA 
+CAACACAACAAAGGGAGGTAGGTTTGTACTTGCACTGTTATCCGATTTACAGGATTTGAA +ATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATCTATACAGAACTGGAACCACC +TTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTATACTTTATTAA +AGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCT +ACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTT +TGCTGTAGATGCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCAC +TAATTGTGTTAAGATGTTGTGTACACACACTGGTACTGGTCAGGCAATAACAGTTACACC +GGAAGCCAATATGGATCAAGAATCCTTTGGTGGTGCATCGTGTTGTCTGTACTGCCGTTG +CCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTAAAAGGTAAGTATGTACAAAT +ACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAGTCTGTACCGT +CTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCA +GTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACA +CCGTGCGGCACAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGAT +AAAGTAGCTGGTTTTGCTAAATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGAC +GAAGATGACAATTTAATTGATTCTTACTTTGTAGTTAAGAGACACACTTTCTCTAACTAC +CAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTCCAGCTGTTGCTAAACATGAC +TTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCAACGTCTTACT +AAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGAC +ACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAG +GACTGGTATGATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAA +CGTGTACGCCAAGCTTTGTTAAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGT +ATTGTTGGTGTACTGACATTAGATAATCAAGATCTCAATGGTAACTGGTATGATTTCGGT +GATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTGTAGATTCTTATTATTCATTG +TTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGTTGACACTGAC +TTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTA +AAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAAC +TGTTTGGATGACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTG +TTCCCACCTACAAGTTTTGGACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTT +GTAGTTTCAACTGGATACCACTTCAGAGAGCTAGGTGTTGTACATAATCAGGATGTAAAC +TTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGTATGCTGCTGACCCTGCTATG +CACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTCAGTAGCTGCA +CTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTAT 
+GACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTC +TTCTTTGCTCAGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTA +CCAACAATGTGTGATATCAGACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTT +GATTGTTACGATGGTGGCTGTATTAATGCTAACCAAGTCATCGTCAACAACCTAGACAAA +TCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGACTTTATTATGATTCAATGAGT +TATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCCTACTATAACT +CAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTC +TCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCC +GCCACTAGAGGAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAAC +ATGTTAAAAACTGTTTATAGTGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCT +AAATGTGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCGC +AAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATAGATTAGCTAATGAGTGTGCT +CAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACCAGGTGGAACC +TCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTC +ACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTC +CGCAATTTACAACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGAC +TTTGTGAATGAGTTTTACGCATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGAC +GATGCTGTTGTGTGTTTCAATAGCACTTATGCATCTCAAGGTCTAGTGGCTAGCATAAAG +AACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTATGTCTGAAGCAAAATGTTGG +ACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATACAATGCTAGTT +AAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCC +GGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTG +TCTTTAGCTATAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTC +TTTCATTTGTACTTACAATACATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTA +GACATGTATTCTGTTATGCTTACTAATGATAACACTTCAAGGTATTGGGAACCTGAGTTT +TATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTGTTGGGGCTTGTGTTCTTTGC +AATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTTATGTTGTAAA +TGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTAT +GTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATG +AGCTATTATTGTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAA +GTTTTTGGTTTATATAAAAATACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCA +ATTGCAACATGTGACTGGACAAATGCTGGTGATTACATTTTAGCTAACACCTGTACTGAA 
+AGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTGAGGAGACATTTAAACTGTCT +TATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCTTTCATGGGAA +GTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACT +AAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCT +GTTGTTTACCGAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACA +TCACATACAGTAATGCCATTAAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGA +ATTACTGGCTTATACCCAACACTCAATATCTCAGATGAGTTTTCTAGCAATGTTGCAAAT +TATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGGGACCACCTGGTACTGGTAAG +AGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGTGTATACAGCT +TGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGAT +AAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTG +AATTCAACATTAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCA +GATATAGTTGTCTTTGATGAAATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAAT +GCCAGATTACGTGCTAAGCACTATGTGTACATTGGCGACCCTGCTCAATTACCTGCACCA +CGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTT +ATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTGCTGAAATT +GTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCA +GCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATT +AACAGGCCACAAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAA +GCTGTCTTTATTTCACCTTATAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTA +CCAACTCAAACTGTTGATTCATCACAGGGCTCAGAATATGACTATGTCATATTCACTCAA +ACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTAATGTTGCTATTACCAGAGCA +AAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATTTACA +AGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTC +TTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTC +AGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAG +GACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAAT +GGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATT +GGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTA +CAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTATGTTGATACA +CCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAA +CACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTA 
+CAAATGTTAAGTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCA +CATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGT +TGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGG +CATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGG +GGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCA +CATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTT +AAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCG +GCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCA +GTTCTTCACGACATTGGTAACCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAA +TGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTTATAAAATAGAAGAATTATTC +TATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGC +AATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCT +AACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCAC +ACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTAC +TCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCA +CTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCAT +GCTAATGAGTACAGATTGTATCTCGATGCTTATAACATGATGATCTCAGCTGGCTTTAGC +TTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGAACACTTTTACAAGACTTCAG +AGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGT +GAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTA +GAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAG +CGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCT +GCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGT +GTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACT +GTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGT +GTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCCCAAACAAGCT +AGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAG +AAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTA +CAAGAATTTAAACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAA +TTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTT +AGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAA +TCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATA 
+ACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGAT +GATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTG +ACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACA +TTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTT +TACAAAATGCAAAGAATGCTATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCA +ACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATACTCAACTGTGTCAATATTTA +AACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCT +GATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTG +CTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGAT +TGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCT +AAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGT +GGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACAT +TCTTGGAATGCTGATCTTTATAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTT +ACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTGGATGTAATTATCTTGGCAAA +CCACGCGAACAAATAGATGGTTATGTCATGCATGCAAATTACATATTTTGGAGGAATACA +AATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTA +AGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTT +CTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTT +CTTGTTAACAACTAAACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAG +TCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCAC +ACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGA +CTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGAC +CAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGC +TTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAA +GACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATT +TCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGAT +GGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCA +GCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGT +GTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGT +GCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTAT +TAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGA +TTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAG 
+GACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACT +TGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTA +TCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTAC +AAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTG +GAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATC +ATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTAC +TAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGG +GCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGT +TATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTA +TAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTA +TCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACA +ATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACT +TTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTT +GGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTAC +TGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTAC +TGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGG +TGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCA +GGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTG +GCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGC +TGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAG +TTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCAT +TGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGC +CATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAA +GACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTT +GTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGA +ACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACC +AATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAG +CAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTT +CATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACA +AAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATA +CACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGC +ATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACA 
+GAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAA +AATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAA +CCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAAT +TTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAAT +TGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAAT +TAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGT +ACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCC +TCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAA +GAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGG +TGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGAATTTTTATGAACCACA +AATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGT +CAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGA +TAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAA +TGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTT +AAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCC +ATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTAT +GCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTG +CTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACAC +ATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAG +CAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCG +ATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTT +CAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGT +GTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTC +GTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAG +AGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAA +AACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTAT +TGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACA +AGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGA +GTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCA +ACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATT +GTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTT +AATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAA 
+GCACAAGCTGATGAGTACGAACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTA +ATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCC +ATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTA +AAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGAT +CTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAG +CCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAAT +GGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATG +CCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAG +TAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAA +TTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTT +TCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTC +TCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAA +TCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTG +ACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACA +AATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACA +GGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGC +TTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAG +ATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATA +AACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGAT +GAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTG +ATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTA +CTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTA +GCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGAC +GGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGA +CAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTT +ATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACT +TCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTT +GGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAAT +TTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTAC +AGTCATGTACTCAACATCAACCATATGTAGTTGATGACCCGTGTCCTATTCACTTCTATT +CTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGG +ATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCT 
+GTTTACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTT +CGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAA +CGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTAC +GTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCG +ATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTCT +CACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACAC +CAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGG +TGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGG +GCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGA +GGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGC +AATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAG +CAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAA +TTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGA +TGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGG +TAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAA +GAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAG +ACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAAC +TGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGG +AATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGC +CATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCA +TATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGC +TGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCC +TGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTC +AACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGC +TTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGC +ACAAGTAGATGTAGTTAACTTTAATCTCACATAGCAATCTTTAATCAGTGTGTAACATTA +GGGAGGACTTGAAAGAGCCACCACATTTTCACCGAGGCCACGCGGAGTACGATCGAGTGT +ACAGTGAACAATGCTAGGGAGAGCTGCCTATATGGAAGAGCCCTAATGTGTAAAATTAAT +TTTAGTAGTGCTATCCCCATGTGATTTTAATAGCTTCTTAGGAGAATGACAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAA diff --git a/tests/data/sars_cov_2.fa.fai b/tests/data/sars_cov_2.fa.fai index a814005..36a25ab 100644 --- a/tests/data/sars_cov_2.fa.fai +++ b/tests/data/sars_cov_2.fa.fai @@ 
-1 +1 @@ -NC_045512.2 29903 97 70 71 +MN908947.3 29903 87 60 61 diff --git a/tests/data/sars_cov_2.gff3.gz b/tests/data/sars_cov_2.gff3.gz new file mode 100644 index 0000000000000000000000000000000000000000..2a2a48cfd8c6f4112f08665b45f71fae7fd2e975 GIT binary patch literal 1384 zcmV-u1(*6CiwFb&00000{{{d;LjnMt08>^@PfOD+OD!tS%+FIW=2BKJPAx1=%}Y+z zElN!Xi76=f`dJ!SSelsY87mkn7+G2x0F^1Gr{?A7rs^h@X6B?QxcWINI0pM#T9{f` zSe7A4!wkkMnUYwNs$gVbWT0zch-RX0Vsdh7F<7ggvy-QlyR&1wfdRw_Q$3I~l}W|` z9T+Ab>i_@%ABzYC000000RIL6LPG)ot_0PZU2mI66o#+qukcN!U2PRH1M?Nj7jBwe z^&)On@7=1h96WVVZLok?oBj12Vv22G&Kv?an<&T`!SJ4Qp7VM%@UT$6RH3*Dy!rA$ zNhYN_241GOahiC7|$H^+*%-T~x8Yp3mIW$m=1`2BE%!B^dB=|ZJHy$e7#d@!+o7`WHM`@B5 zaq(2dM?E9c>P9P_Y*vTw>s-(LdaVB6h&W5Wn$1Y~VP-k%EH@>e5io{xB2h6=OAK_> z7$_=+txj~S4hDvDyi)#GEBl|m{7Kf2)5kb1o)3EVtN2m6iIeSgqfddC_k1~Xa!7a2 z)k$xI_ou5b2Pgk5)2A%bw~M>?A1CFlC%=75v%SuC&y)M4$n<~Z;P2+*c@*7-fe3vj zK$%)-F|}}1 zQww{TS}O``A6MF!m=?~TvTy51Pqw?~C>_$U9BSBbP{Vdb4%ZH(-<2K*_#py(7AT{b zQ|ddw=NRxg2tK!i-;jX$K=LAK1YzxX~@hm6ci?(_z3dNNdWo82UfAyGBD(un{|?J#z!RI%+_Na+l<0S zeytrty-tpLmmc&W@P%&&->4b1KI@1Yt{VitGc_>u=5hq|jG9&zL=lJG3yj?h(C!5Q z{k05~-3yD|i&5=f^ntz^h1KpW?GG5}_vQ{p!w3r*7a!^rOl9WEYd`L6sp>|P+q z7u2Wb7)ar;dxEih0@^*WY0;2@vU_5&dorrslOE)^qOjUMslUuQa-GA6xjhBko(y8X zD`>cEpxYxd<1lxd5ls~@)U+7&KbFwp8TIdOPBW)oA$vQH&-dcDMW zy#&2p0>od-KzY5ic)c9e>t!F}n^9Q3Ue?u>NZ{o#rx$?J%VFGi1PzxBbb3T)97Zn@ z*h?ymXvG58V_k;9o`KjiJN6A3DD0U9dp0WetOxe3D6H65+7}q+lc#8>(`bFj9~ zj^(8SGU`hO1ou+GiT_p<)`iZ+^Hd-E2-+WJ{2W;Dy!OHSiTbXv;lhCn9!zc=3!cko q%|8K%D{>PqBme*(iwFb&00000{{{d;LjnLB00RI3000000000#bf9Md literal 0 HcmV?d00001 diff --git a/tests/data/sars_cov_2.gff3.gz.tbi b/tests/data/sars_cov_2.gff3.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..9b7e8059674addaf4507828e885072cea810f411 GIT binary patch literal 118 zcmb2|=3rp}f&Xj_PR>jWkqq30pHfm%8W;r5G96jtu!cwUFAQPff-~E7ytl% CARP+; literal 0 HcmV?d00001 diff --git a/tests/shared_data.py b/tests/shared_data.py index b66a0c7..56b5a32 100644 --- 
a/tests/shared_data.py +++ b/tests/shared_data.py @@ -5,22 +5,28 @@ "SARS_COV_2_GENOME_ID", "SARS_COV_2_FASTA_PATH", "SARS_COV_2_FAI_PATH", + "SARS_COV_2_GFF3_GZ_PATH", + "SARS_COV_2_GFF3_GZ_TBI_PATH", "TEST_GENOME_OF_FILE_URIS", ] DATA_DIR = (pathlib.Path(__file__).parent / "data").absolute() -SARS_COV_2_GENOME_ID = "NC_045512.2" +SARS_COV_2_GENOME_ID = "MN908947.3" SARS_COV_2_FASTA_PATH = DATA_DIR / "sars_cov_2.fa" SARS_COV_2_FAI_PATH = DATA_DIR / "sars_cov_2.fa.fai" +SARS_COV_2_GFF3_GZ_PATH = DATA_DIR / "sars_cov_2.gff3.gz" +SARS_COV_2_GFF3_GZ_TBI_PATH = DATA_DIR / "sars_cov_2.gff3.gz.tbi" TEST_GENOME_OF_FILE_URIS = { "id": SARS_COV_2_GENOME_ID, "aliases": [], - "md5": "825ab3c54b7a67ff2db55262eb532438", - "ga4gh": "SQ.mMg8qNej7pU84juQQWobw9JyUy09oYdd", + "md5": "b98334cd0015ee1b1d2dc3b9d81b325e", + "ga4gh": "SQ.F4O8uhlkMQ76rmE6SmUFFjp04UV25Ybn", "fasta": f"file://{SARS_COV_2_FASTA_PATH}", "fai": f"file://{SARS_COV_2_FAI_PATH}", + "gff3_gz": f"file://{SARS_COV_2_GFF3_GZ_PATH}", + "gff3_gz_tbi": f"file://{SARS_COV_2_GFF3_GZ_TBI_PATH}", "taxon": {"id": "NCBITaxon:2697049", "label": "Severe acute respiratory syndrome coronavirus 2"}, "contigs": [ { diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index 0c62437..c2bede8 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -5,7 +5,14 @@ from fastapi.testclient import TestClient from httpx import Response -from .shared_data import SARS_COV_2_GENOME_ID, SARS_COV_2_FASTA_PATH, SARS_COV_2_FAI_PATH, TEST_GENOME_OF_FILE_URIS +from .shared_data import ( + SARS_COV_2_GENOME_ID, + SARS_COV_2_FASTA_PATH, + SARS_COV_2_FAI_PATH, + SARS_COV_2_GFF3_GZ_PATH, + SARS_COV_2_GFF3_GZ_TBI_PATH, + TEST_GENOME_OF_FILE_URIS, +) # all tests are async so that db_cleanup (an async fixture) properly works. not sure why it's this way. 
@@ -102,7 +109,19 @@ async def test_genome_detail_endpoints(test_client: TestClient, aioresponse: aio res = test_client.get(f"/genomes/{SARS_COV_2_GENOME_ID}.fa.fai", headers={"Range": "bytes=0-0"}) assert res.status_code == status.HTTP_206_PARTIAL_CONTENT assert res.headers.get("Content-Type") == "text/plain; charset=utf-8" - assert res.content == b"N" + assert res.content == b"M" + + # - Feature GFF3 + res = test_client.get(f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz") + assert res.status_code == status.HTTP_200_OK + with open(SARS_COV_2_GFF3_GZ_PATH, "rb") as fh: + assert res.content == fh.read() + + # - Feature GFF3 TBI + res = test_client.get(f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz.tbi") + assert res.status_code == status.HTTP_200_OK + with open(SARS_COV_2_GFF3_GZ_TBI_PATH, "rb") as fh: + assert res.content == fh.read() async def test_genome_delete(test_client: TestClient, aioresponse: aioresponses, db_cleanup): @@ -128,3 +147,18 @@ async def test_genome_delete(test_client: TestClient, aioresponse: aioresponses, aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[False]]}) res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}", headers={"Authorization": "Token bearer"}) assert res.status_code == status.HTTP_403_FORBIDDEN + + +async def test_genome_feature_ingest(test_client: TestClient, aioresponse: aioresponses, db_cleanup): + # setup: create genome TODO: fixture + create_genome_with_permissions(test_client, aioresponse) + + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + + with open(SARS_COV_2_GFF3_GZ_PATH, "rb") as gff3_fh, open(SARS_COV_2_GFF3_GZ_TBI_PATH, "rb") as tbi_fh: + res = test_client.put( + f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz", + files={"gff3_gz": gff3_fh, "gff3_gz_tbi": tbi_fh}, + headers={"Authorization": "Token bearer"}, + ) + assert res.status_code == status.HTTP_204_NO_CONTENT From e99b2bfa32368d00d876cd9d2d40544f571e331d Mon Sep 17 00:00:00 
2001 From: David Lougheed Date: Wed, 8 May 2024 17:51:56 -0400 Subject: [PATCH 028/114] fix(workflows): issues with fasta_ref workflow + lint --- bento_reference_service/db.py | 5 +++-- bento_reference_service/workflows/wdls/fasta_ref.wdl | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 8bce7e3..561cc8b 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -406,8 +406,9 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): conn: asyncpg.Connection async with self.connect() as conn: async with conn.transaction(): - await conn.executemany("INSERT INTO genome_feature_types(type_id) VALUES ($1) ON CONFLICT DO NOTHING", - feature_types) + await conn.executemany( + "INSERT INTO genome_feature_types(type_id) VALUES ($1) ON CONFLICT DO NOTHING", feature_types + ) await conn.copy_records_to_table( "genome_features", diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index bf82650..fb0ce4a 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -34,11 +34,11 @@ workflow fasta_ref { validate_ssl = validate_ssl } - if (genome_gff3) { + if (defined(genome_gff3)) { call normalize_and_compress_gff3_and_index as gi { input: genome_id = genome_id, - gff3 = genome_gff3 + gff3 = select_first([genome_gff3]) # Coerce File? into File via select_first } call ingest_into_drs as drs_gff3 { @@ -71,12 +71,12 @@ workflow fasta_ref { validate_ssl = validate_ssl } - if (genome_gff3) { + if (defined(genome_gff3)) { call ingest_gff3_into_ref { input: genome_id = genome_id, - gff3_gz = gi.sorted_gff3_gz, - gff3_gz_tbi = gi.sorted_gff3_gz_tbi, + gff3_gz = select_first([gi.sorted_gff3_gz]), # Coerce File? 
into File via select_first + gff3_gz_tbi = select_first([gi.sorted_gff3_gz_tbi]), # " reference_url = reference_url, token = access_token, validate_ssl = validate_ssl From d9842bfc2ab59193e1644c3b4311b083864658bc Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 18:29:55 -0400 Subject: [PATCH 029/114] fix(workflows): GFF3 pattern for fasta_ref workflow --- bento_reference_service/routers/genomes.py | 5 +++-- bento_reference_service/workflows/metadata.py | 16 +++++----------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 56ae132..7259203 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -257,8 +257,9 @@ async def genomes_detail_features_ingest_gff3( await ingest_gene_feature_annotation(genome_id, fn, fn_tbi, db, logger) finally: - fn.unlink(missing_ok=True) - fn_tbi.unlink(missing_ok=True) + pass + # fn.unlink(missing_ok=True) + # fn_tbi.unlink(missing_ok=True) @genome_router.get("/{genome_id}/features.gff3.gz.tbi", dependencies=[authz_middleware.dep_public_endpoint()]) diff --git a/bento_reference_service/workflows/metadata.py b/bento_reference_service/workflows/metadata.py index 2758ac6..1a37113 100644 --- a/bento_reference_service/workflows/metadata.py +++ b/bento_reference_service/workflows/metadata.py @@ -7,6 +7,9 @@ WORKFLOW_FASTA_REFERENCE = "fasta_ref" WORKFLOW_GFF3_ANNOTATION = "gff3_annot" +GFF3_PATTERN = r"^.*\.(gff|gff3|gff.gz|gff3.gz)$" +GFF3_HELP = "GFF3-formatted annotation file for the reference genome." 
+ workflow_set = WorkflowSet(Path(__file__).parent / "wdls") workflow_set.add_workflow( @@ -48,12 +51,7 @@ pattern=r"^.*\.(fa|fa.gz|fna|fna.gz|fas|fas.gz|fasta|fasta.gz)$", help="FASTA file for the reference genome, either gzipped or uncompressed.", ), - wm.WorkflowFileInput( - id="genome_gff3", - pattern=r"^.*\.(gff|gff3)$", - required=False, - help="GFF3-formatted annotation file for the reference genome.", - ), + wm.WorkflowFileInput(id="genome_gff3", pattern=GFF3_PATTERN, required=False, help=GFF3_HELP), ], ), ) @@ -81,11 +79,7 @@ values="{{ serviceUrls.reference }}/genomes?response_format=id_list", help="The reference genome to annotate with the GFF3 file.", ), - wm.WorkflowFileInput( - id="genome_gff3", - pattern=r"^.*\.(gff|gff3|gff.gz|gff3.gz)$", - help="GFF3-formatted annotation file for the reference genome.", - ), + wm.WorkflowFileInput(id="genome_gff3", pattern=GFF3_PATTERN, help=GFF3_HELP), ], ), ) From 820b603596ae7688936b485f42a88183e3ea0053 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 8 May 2024 18:30:38 -0400 Subject: [PATCH 030/114] fix: undo debug thing --- bento_reference_service/routers/genomes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 7259203..56ae132 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -257,9 +257,8 @@ async def genomes_detail_features_ingest_gff3( await ingest_gene_feature_annotation(genome_id, fn, fn_tbi, db, logger) finally: - pass - # fn.unlink(missing_ok=True) - # fn_tbi.unlink(missing_ok=True) + fn.unlink(missing_ok=True) + fn_tbi.unlink(missing_ok=True) @genome_router.get("/{genome_id}/features.gff3.gz.tbi", dependencies=[authz_middleware.dep_public_endpoint()]) From 2296e30d17e35e7d2ce1f876208082bb63c41259 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 9 May 2024 11:15:58 -0400 Subject: [PATCH 031/114] fix(features): 
use contig-batches to keep parents with child features --- bento_reference_service/features.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 0be7ab3..1c0ed2f 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -1,4 +1,3 @@ -import itertools import logging import pysam import traceback @@ -17,7 +16,6 @@ GFF_CAPTURED_ATTRIBUTES = frozenset({"ID", "Parent"}) GFF_SKIPPED_FEATURE_TYPES = frozenset({"stop_codon_redefined_as_selenocysteine"}) -GFF_BATCH_SIZE = 5000 GFF_LOG_PROGRESS_INTERVAL = 1000 @@ -107,7 +105,7 @@ async def ingest_gene_feature_annotation( if genome is None: raise AnnotationIngestError(f"Genome with ID {genome_id} not found") - def _iter_features() -> Generator[m.GenomeFeature, None, None]: + def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) total_processed: int = 0 @@ -186,7 +184,7 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: if total_processed % GFF_LOG_PROGRESS_INTERVAL == 0: logger.info(f"Processed {total_processed} features") - yield from features_by_id.values() + yield tuple(features_by_id.values()) features_by_id.clear() finally: @@ -196,7 +194,10 @@ def _iter_features() -> Generator[m.GenomeFeature, None, None]: n_ingested: int = 0 - while data := tuple(itertools.islice(features_to_ingest, GFF_BATCH_SIZE)): # take features in batches + # take features in contig batches + # - we use contigs as batches rather than a fixed batch size so that we are guaranteed to get parents alongside + # their child features in the same batch. 
+ while data := next(features_to_ingest, ()): await db.bulk_ingest_genome_features(data) n_ingested += len(data) From 1e76c7be9465d1d1194003bdd15b17b3f72c935d Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 9 May 2024 12:36:23 -0400 Subject: [PATCH 032/114] chore: more logging for feature ingest --- bento_reference_service/db.py | 11 ++++++++++- bento_reference_service/features.py | 4 +++- bento_reference_service/routers/genomes.py | 1 + 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 561cc8b..38a1ff4 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -1,5 +1,6 @@ import asyncpg import json +import logging from bento_lib.db.pg_async import PgAsyncDatabase from fastapi import Depends from functools import lru_cache @@ -359,7 +360,7 @@ async def clear_genome_features(self, g_id: str): await conn.execute("DELETE FROM genome_feature_parents WHERE genome_id = $1", g_id) await conn.execute("DELETE FROM genome_features WHERE genome_id = $1", g_id) - async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): + async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature], logger: logging.Logger): feature_types: list[tuple[str]] = [] entries: list[tuple[str, str, int, int, str, float | None, int | None]] = [] attributes: list[tuple[str, str, str, str]] = [] @@ -406,10 +407,12 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): conn: asyncpg.Connection async with self.connect() as conn: async with conn.transaction(): + logger.debug(f"bulk_ingest_genome_features: have {len(feature_types)} feature types for batch") await conn.executemany( "INSERT INTO genome_feature_types(type_id) VALUES ($1) ON CONFLICT DO NOTHING", feature_types ) + logger.debug(f"bulk_ingest_genome_features: have {len(feature_tuples)} features for batch") await conn.copy_records_to_table( "genome_features", columns=[ @@ 
-423,6 +426,8 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): ], records=feature_tuples, ) + + logger.debug(f"bulk_ingest_genome_features: have {len(attributes)} feature attribute records for batch") await conn.copy_records_to_table( "genome_feature_attributes", columns=[ @@ -433,6 +438,8 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): ], records=attributes, ) + + logger.debug(f"bulk_ingest_genome_features: have {len(entries)} feature entries for batch") await conn.copy_records_to_table( "genome_feature_entries", columns=[ @@ -446,6 +453,8 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature]): ], records=entries, ) + + logger.debug(f"bulk_ingest_genome_features: have {len(parents)} feature parent records for batch") await conn.copy_records_to_table( "genome_feature_parents", columns=[ diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 1c0ed2f..0af83a5 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -105,6 +105,8 @@ async def ingest_gene_feature_annotation( if genome is None: raise AnnotationIngestError(f"Genome with ID {genome_id} not found") + logger.info(f"Ingesting gene features for genome {genome_id}...") + def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) total_processed: int = 0 @@ -198,7 +200,7 @@ def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: # - we use contigs as batches rather than a fixed batch size so that we are guaranteed to get parents alongside # their child features in the same batch. 
while data := next(features_to_ingest, ()): - await db.bulk_ingest_genome_features(data) + await db.bulk_ingest_genome_features(data, logger) n_ingested += len(data) if n_ingested == 0: diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 56ae132..43ff4df 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -251,6 +251,7 @@ async def genomes_detail_features_ingest_gff3( logger.debug(f"Wrote GFF.gz.tbi data to {fn_tbi}; size={fn_tbi.stat().st_size}") # clear existing gene features for this genome + logger.info(f"Clearing gene features for genome {genome_id} in preparation for feature (re-)ingestion...") await db.clear_genome_features(genome_id) # ingest gene features into the database From b9344e341cf16063a81aaa6c8ffc1438aeaf8327 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 9 May 2024 12:57:47 -0400 Subject: [PATCH 033/114] fix(db): syntax for feature_id in-clause --- bento_reference_service/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 38a1ff4..6ab606f 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -276,7 +276,7 @@ async def get_genome_features_by_ids( WHERE gfp.genome_id = gf.genome_id AND gfp.feature_id = gf.feature_id ) parents FROM genome_features gf - WHERE gf.genome_id = $1 AND feature_id IN $2 + WHERE gf.genome_id = $1 AND feature_id = any($2::text[]) OFFSET $3 LIMIT $4 """ From dd4ef44baa3e480303b72bdfeb13854466a96d3b Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 9 May 2024 14:55:10 -0400 Subject: [PATCH 034/114] fix: logger config --- bento_reference_service/logger.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bento_reference_service/logger.py b/bento_reference_service/logger.py index acfb882..3343b70 100644 --- a/bento_reference_service/logger.py +++ b/bento_reference_service/logger.py @@ 
-12,6 +12,8 @@ "LoggerDependency", ] +logging.basicConfig(level=logging.NOTSET) + @lru_cache def get_logger(config: ConfigDependency) -> logging.Logger: From 6572837d37589a6297cccad7e8d362679eadb2af Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 9 May 2024 14:55:24 -0400 Subject: [PATCH 035/114] chore: log time taken to ingest batch of features --- bento_reference_service/features.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 0af83a5..e69c6fe 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -2,6 +2,7 @@ import pysam import traceback +from datetime import datetime from pathlib import Path from typing import Generator from urllib.parse import unquote as url_unquote @@ -200,10 +201,13 @@ def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: # - we use contigs as batches rather than a fixed batch size so that we are guaranteed to get parents alongside # their child features in the same batch. 
while data := next(features_to_ingest, ()): + s = datetime.now() + logger.debug(f"ingest_gene_feature_annotation: ingesting batch of {len(data)} features") await db.bulk_ingest_genome_features(data, logger) n_ingested += len(data) + logger.debug(f"ingest_gene_feature_annotation: batch took {(datetime.now() - s).total_seconds():.1f} seconds") if n_ingested == 0: raise AnnotationIngestError("No gene features could be ingested - is this a valid GFF3 file?") - logger.info(f"Ingested {n_ingested} gene features") + logger.info(f"ingest_gene_feature_annotation: ingested {n_ingested} gene features") From c5bb40971ac4766024b85c2aadadc66ffa9216fe Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 9 May 2024 16:21:37 -0400 Subject: [PATCH 036/114] chore(db): implement attribute fetch, fix offset/limit perf issue --- bento_reference_service/db.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 6ab606f..1fd4596 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -237,9 +237,9 @@ def deserialize_genome_feature(rec: asyncpg.Record) -> GenomeFeature: feature_name=rec["feature_name"], feature_type=rec["feature_type"], source=rec["source"], - entries=tuple(map(Database.deserialize_genome_feature_entry, json.loads(rec["entries"]))), - annotations=json.loads(rec["annotations"]), # TODO - parents=tuple(rec["parents"]), # tuple of parent IDs + entries=tuple(map(Database.deserialize_genome_feature_entry, json.loads(rec["entries"] or "[]"))), + attributes=json.loads(rec["attributes"] or "{}"), + parents=tuple(rec["parents"] or ()), # tuple of parent IDs ) @staticmethod @@ -257,8 +257,6 @@ async def get_genome_features_by_ids( self, g_id: str, f_ids: list[str], - offset: int = 0, - limit: int = 10, existing_conn: asyncpg.Connection | None = None, ): final_query = f""" @@ -274,19 +272,26 @@ async def get_genome_features_by_ids( ( SELECT 
array_agg(gfp.parent_id) FROM genome_feature_parents gfp WHERE gfp.genome_id = gf.genome_id AND gfp.feature_id = gf.feature_id - ) parents + ) parents, + ( + WITH attrs_tmp AS ( + SELECT attr_tag, array_agg(gfa.attr_val) attr_vals FROM genome_feature_attributes gfa + WHERE gfa.genome_id = gf.genome_id AND gfa.feature_id = gf.feature_id + GROUP BY gfa.attr_tag + ) + SELECT jsonb_object_agg(attrs_tmp.attr_tag, attrs_tmp.attr_vals) FROM attrs_tmp + ) attributes FROM genome_features gf WHERE gf.genome_id = $1 AND feature_id = any($2::text[]) - OFFSET $3 LIMIT $4 """ conn: asyncpg.Connection async with self.connect(existing_conn) as conn: - final_res = await conn.fetch(final_query, g_id, f_ids, offset, limit) + final_res = await conn.fetch(final_query, g_id, f_ids) return [self.deserialize_genome_feature(r) for r in final_res] async def get_genome_feature_by_id(self, g_id: str, f_id: str) -> GenomeFeature | None: - res = await self.get_genome_features_by_ids(g_id, [f_id], 0, 1) + res = await self.get_genome_features_by_ids(g_id, [f_id]) return res[0] if res else None async def query_genome_features( @@ -307,7 +312,7 @@ async def query_genome_features( def _q_param(pv: str | int) -> str: q_params.append(pv) - return f"${len(gf_where_items) + 2}" + return f"${len(q_params) + 1}" if q: param = _q_param(q) @@ -339,15 +344,16 @@ def _q_param(pv: str | int) -> str: FROM genome_features gf WHERE gf.genome_id = $1 - AND jsonb_array_length(gf.entries) > 0 - AND {where_clause}; + AND {where_clause} + OFFSET {_q_param(max(offset, 0))} + LIMIT {_q_param(max(limit, 0))} """ conn: asyncpg.Connection async with self.connect() as conn: id_res = await conn.fetch(id_query, g_id, *q_params) final_list = await self.get_genome_features_by_ids( - g_id, [r["feature_id"] for r in id_res], offset, limit, conn + g_id, [r["feature_id"] for r in id_res], conn ) return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} From 2ae4ee39afd986fae5b5414081e21380cb0ba7b3 Mon Sep 17 
00:00:00 2001 From: David Lougheed Date: Thu, 9 May 2024 21:04:02 -0400 Subject: [PATCH 037/114] perf: add some missing SQL indices --- bento_reference_service/sql/schema.sql | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 102e28e..7e70629 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -72,7 +72,7 @@ CREATE TABLE IF NOT EXISTS genome_features ( -- Feature characteristics / attributes: -- - technically, there can be multiple rows in a GFF3 file with the same ID, for discontinuous features. -- however, let's not support this, since it becomes tricky and doesn't help us much for our use cases. - feature_id VARCHAR(63) NOT NULL, + feature_id VARCHAR(63) NOT NULL, -- Feature ID from the GFF3 file, in the context of the genome feature_name TEXT NOT NULL, feature_type VARCHAR(15) NOT NULL REFERENCES genome_feature_types, source TEXT NOT NULL, @@ -80,7 +80,6 @@ CREATE TABLE IF NOT EXISTS genome_features ( PRIMARY KEY (genome_id, feature_id), FOREIGN KEY (genome_id, contig_name) REFERENCES genome_contigs ); - CREATE INDEX IF NOT EXISTS genome_features_feature_id_trgm_gin ON genome_features USING GIN (feature_id gin_trgm_ops); CREATE INDEX IF NOT EXISTS genome_features_feature_name_trgm_gin ON genome_features USING GIN (feature_name gin_trgm_ops); @@ -95,7 +94,7 @@ CREATE TABLE IF NOT EXISTS genome_feature_entries ( -- Keys: FOREIGN KEY (genome_id, feature_id) REFERENCES genome_features ); - +CREATE INDEX IF NOT EXISTS genome_feature_entries_genome_feature_idx ON genome_feature_entries (genome_id, feature_id); CREATE INDEX IF NOT EXISTS genome_feature_entries_position_text_trgm_gin ON genome_feature_entries USING GIN (position_text gin_trgm_ops); @@ -109,6 +108,7 @@ CREATE TABLE IF NOT EXISTS genome_feature_parents ( FOREIGN KEY (genome_id, feature_id) REFERENCES genome_features, FOREIGN KEY (genome_id, 
parent_id) REFERENCES genome_features ); +CREATE INDEX IF NOT EXISTS genome_feature_parents_genome_feature_idx ON genome_feature_parents (genome_id, feature_id); -- attributes can also have multiple values, so we don't enforce uniqueness on (genome_id, feature_id, attr_tag) -- these are non-Parent, non-ID attributes @@ -120,5 +120,7 @@ CREATE TABLE IF NOT EXISTS genome_feature_attributes ( attr_val TEXT NOT NULL, FOREIGN KEY (genome_id, feature_id) REFERENCES genome_features ); +CREATE INDEX IF NOT EXISTS genome_feature_attributes_genome_feature_idx + ON genome_feature_parents (genome_id, feature_id); CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_idx ON genome_feature_attributes (genome_id, feature_id, attr_tag); From cc00733c065aff4feba03196f7876bd33ab3e1e1 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 10:12:44 -0400 Subject: [PATCH 038/114] fix: handle missing genome when ingesting annotations --- bento_reference_service/features.py | 7 ++++++- bento_reference_service/routers/genomes.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index e69c6fe..46ceecb 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -11,6 +11,7 @@ from .db import Database __all__ = [ + "AnnotationGenomeNotFoundError", "ingest_gene_feature_annotation", ] @@ -20,6 +21,10 @@ GFF_LOG_PROGRESS_INTERVAL = 1000 +class AnnotationGenomeNotFoundError(Exception): + pass + + class AnnotationIngestError(Exception): pass @@ -104,7 +109,7 @@ async def ingest_gene_feature_annotation( genome: m.GenomeWithURIs | None = await db.get_genome(genome_id) if genome is None: - raise AnnotationIngestError(f"Genome with ID {genome_id} not found") + raise AnnotationGenomeNotFoundError(f"Genome with ID {genome_id} not found") logger.info(f"Ingesting gene features for genome {genome_id}...") diff --git 
a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 43ff4df..7375f58 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -13,7 +13,7 @@ from ..authz import authz_middleware from ..config import ConfigDependency from ..db import Database, DatabaseDependency -from ..features import ingest_gene_feature_annotation +from ..features import AnnotationGenomeNotFoundError, ingest_gene_feature_annotation from ..logger import LoggerDependency from ..streaming import generate_uri_streaming_response @@ -28,12 +28,16 @@ genome_router = APIRouter(prefix="/genomes") +def exc_genome_not_found(genome_id: str) -> HTTPException: + return HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") + + async def get_genome_or_raise_404( db: Database, genome_id: str, external_resource_uris: bool = True ) -> m.GenomeWithURIs: genome: m.GenomeWithURIs = await db.get_genome(genome_id, external_resource_uris=external_resource_uris) if genome is None: - raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") + raise exc_genome_not_found(genome_id) return genome @@ -257,6 +261,9 @@ async def genomes_detail_features_ingest_gff3( # ingest gene features into the database await ingest_gene_feature_annotation(genome_id, fn, fn_tbi, db, logger) + except AnnotationGenomeNotFoundError: + raise exc_genome_not_found(genome_id) + finally: fn.unlink(missing_ok=True) fn_tbi.unlink(missing_ok=True) From d3af68e5fb59bfae7e6f4ba20085b7f8f27467e5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 12:21:28 -0400 Subject: [PATCH 039/114] refact!: rewrite schema for better feature storage/indexing --- bento_reference_service/db.py | 88 +++++++++++++++----------- bento_reference_service/features.py | 12 ++-- bento_reference_service/models.py | 5 ++ bento_reference_service/sql/schema.sql | 47 +++++++------- 4 
files changed, 88 insertions(+), 64 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 1fd4596..abcb9ce 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -8,15 +8,26 @@ from typing import Annotated, AsyncIterator, Iterable from .config import Config, ConfigDependency -from .models import Alias, ContigWithRefgetURI, Genome, GenomeWithURIs, OntologyTerm, GenomeFeatureEntry, GenomeFeature +from .logger import LoggerDependency +from .models import ( + Alias, + ContigWithRefgetURI, + Genome, + GenomeWithURIs, + OntologyTerm, + GenomeFeatureEntry, + GenomeFeature, + GenomeFeatureWithInternalID, +) SCHEMA_PATH = Path(__file__).parent / "sql" / "schema.sql" class Database(PgAsyncDatabase): - def __init__(self, config: Config): + def __init__(self, config: Config, logger: logging.Logger): self._config: Config = config + self.logger: logging.Logger = logger super().__init__(config.database_uri, SCHEMA_PATH) @staticmethod @@ -201,6 +212,8 @@ async def create_genome(self, g: Genome, return_external_resource_uris: bool) -> contig_alias_tuples, ) + self.logger.debug(f"Created genome: {g}") + return await self.get_genome(g.id, external_resource_uris=return_external_resource_uris) async def genome_feature_types_summary(self, g_id: str): @@ -248,7 +261,7 @@ def _feature_inner_entries_query(where_expr: str | None = None) -> str: return f""" WITH entries_tmp AS ( SELECT start_pos, end_pos, score, phase FROM genome_feature_entries gfe - WHERE gfe.genome_id = gf.genome_id AND gfe.feature_id = gf.feature_id {where_clause} + WHERE gfe.feature = gf.id {where_clause} ) SELECT jsonb_agg(entries_tmp.*) FROM entries_tmp """ @@ -270,13 +283,14 @@ async def get_genome_features_by_ids( source, ({self._feature_inner_entries_query()}) entries, ( - SELECT array_agg(gfp.parent_id) FROM genome_feature_parents gfp - WHERE gfp.genome_id = gf.genome_id AND gfp.feature_id = gf.feature_id + SELECT array_agg(gffp.feature_id) + FROM 
genome_feature_parents gfp JOIN genome_features gffp ON gfp.parent = gffp.id + WHERE gfp.feature = gf.id ) parents, ( WITH attrs_tmp AS ( SELECT attr_tag, array_agg(gfa.attr_val) attr_vals FROM genome_feature_attributes gfa - WHERE gfa.genome_id = gf.genome_id AND gfa.feature_id = gf.feature_id + WHERE gfa.feature = gf.id GROUP BY gfa.attr_tag ) SELECT jsonb_object_agg(attrs_tmp.attr_tag, attrs_tmp.attr_vals) FROM attrs_tmp @@ -349,41 +363,40 @@ def _q_param(pv: str | int) -> str: LIMIT {_q_param(max(limit, 0))} """ + from datetime import datetime + conn: asyncpg.Connection async with self.connect() as conn: id_res = await conn.fetch(id_query, g_id, *q_params) - final_list = await self.get_genome_features_by_ids( - g_id, [r["feature_id"] for r in id_res], conn - ) + final_list = await self.get_genome_features_by_ids(g_id, [r["feature_id"] for r in id_res], conn) return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} async def clear_genome_features(self, g_id: str): conn: asyncpg.Connection async with self.connect() as conn: - await conn.execute("DELETE FROM genome_feature_attributes WHERE genome_id = $1", g_id) - await conn.execute("DELETE FROM genome_feature_entries WHERE genome_id = $1", g_id) - await conn.execute("DELETE FROM genome_feature_parents WHERE genome_id = $1", g_id) await conn.execute("DELETE FROM genome_features WHERE genome_id = $1", g_id) - async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature], logger: logging.Logger): - feature_types: list[tuple[str]] = [] - entries: list[tuple[str, str, int, int, str, float | None, int | None]] = [] - attributes: list[tuple[str, str, str, str]] = [] - parents: list[tuple[str, str, str]] = [] - feature_tuples: list[tuple[str, str, str, str, str, str, str]] = [] + async def bulk_ingest_genome_features(self, features: tuple[GenomeFeatureWithInternalID, ...]): + feature_types: set[tuple[str]] = set() + entries: list[tuple[int, int, int, str, float | None, int | None]] = [] 
+ attributes: list[tuple[int, str, str]] = [] + parents: list[tuple[int, int]] = [] + feature_tuples: list[tuple[int, str, str, str, str, str, str, str]] = [] + + parent_row_ids_by_nat_id = {p.feature_id: p.id for p in features} for feature in features: + row_id = feature.id genome_id = feature.genome_id contig_name = feature.contig_name feature_id = feature.feature_id - feature_types.append((feature.feature_type,)) + feature_types.add((feature.feature_type,)) entries.extend( ( - genome_id, - feature_id, + row_id, e.start_pos, e.end_pos, f"{contig_name}:{e.start_pos}-{e.end_pos}", @@ -394,12 +407,13 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature], l ) for attr_tag, attr_vals in feature.attributes.items(): - attributes.extend((genome_id, feature_id, attr_tag, attr_val) for attr_val in attr_vals) + attributes.extend((row_id, attr_tag, attr_val) for attr_val in attr_vals) - parents.extend((genome_id, feature_id, p) for p in feature.parents) + parents.extend((row_id, parent_row_ids_by_nat_id[p]) for p in feature.parents) feature_tuples.append( ( + row_id, genome_id, contig_name, feature.strand, @@ -413,15 +427,16 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature], l conn: asyncpg.Connection async with self.connect() as conn: async with conn.transaction(): - logger.debug(f"bulk_ingest_genome_features: have {len(feature_types)} feature types for batch") + self.logger.debug(f"bulk_ingest_genome_features: have {len(feature_types)} feature types for batch") await conn.executemany( "INSERT INTO genome_feature_types(type_id) VALUES ($1) ON CONFLICT DO NOTHING", feature_types ) - logger.debug(f"bulk_ingest_genome_features: have {len(feature_tuples)} features for batch") + self.logger.debug(f"bulk_ingest_genome_features: have {len(feature_tuples)} features for batch") await conn.copy_records_to_table( "genome_features", columns=[ + "id", "genome_id", "contig_name", "strand", @@ -433,24 +448,24 @@ async def 
bulk_ingest_genome_features(self, features: Iterable[GenomeFeature], l records=feature_tuples, ) - logger.debug(f"bulk_ingest_genome_features: have {len(attributes)} feature attribute records for batch") + self.logger.debug( + f"bulk_ingest_genome_features: have {len(attributes)} feature attribute records for batch" + ) await conn.copy_records_to_table( "genome_feature_attributes", columns=[ - "genome_id", - "feature_id", + "feature", "attr_tag", "attr_val", ], records=attributes, ) - logger.debug(f"bulk_ingest_genome_features: have {len(entries)} feature entries for batch") + self.logger.debug(f"bulk_ingest_genome_features: have {len(entries)} feature entries for batch") await conn.copy_records_to_table( "genome_feature_entries", columns=[ - "genome_id", - "feature_id", + "feature", "start_pos", "end_pos", "position_text", @@ -460,21 +475,20 @@ async def bulk_ingest_genome_features(self, features: Iterable[GenomeFeature], l records=entries, ) - logger.debug(f"bulk_ingest_genome_features: have {len(parents)} feature parent records for batch") + self.logger.debug(f"bulk_ingest_genome_features: have {len(parents)} feature parent records for batch") await conn.copy_records_to_table( "genome_feature_parents", columns=[ - "genome_id", - "feature_id", - "parent_id", + "feature", + "parent", ], records=parents, ) @lru_cache() -def get_db(config: ConfigDependency) -> Database: # pragma: no cover - return Database(config) +def get_db(config: ConfigDependency, logger: LoggerDependency) -> Database: # pragma: no cover + return Database(config, logger) DatabaseDependency = Annotated[Database, Depends(get_db)] diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 46ceecb..49fab06 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -113,12 +113,14 @@ async def ingest_gene_feature_annotation( logger.info(f"Ingesting gene features for genome {genome_id}...") - def _iter_features() -> 
Generator[tuple[m.GenomeFeature, ...], None, None]: + def _iter_features() -> Generator[tuple[m.GenomeFeatureWithInternalID, ...], None, None]: gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) total_processed: int = 0 + current_feature_row_id: int = 1 # We generate a numeric ID for features to save space and improve lookup time. + try: - features_by_id: dict[str, m.GenomeFeature] = {} + features_by_id: dict[str, m.GenomeFeatureWithInternalID] = {} for contig in genome.contigs: logger.info(f"Indexing features from contig {contig.name}") @@ -163,7 +165,8 @@ def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: if feature_id in features_by_id: features_by_id[feature_id].entries.append(entry) else: - features_by_id[feature_id] = m.GenomeFeature( + features_by_id[feature_id] = m.GenomeFeatureWithInternalID( + id=current_feature_row_id, genome_id=genome_id, contig_name=contig.name, strand=record.strand or ".", # None/"." <=> unstranded @@ -180,6 +183,7 @@ def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: }, parents=tuple(p for p in record_attributes.get("Parent", ()) if p), ) + current_feature_row_id += 1 except Exception as e: logger.error( @@ -208,7 +212,7 @@ def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: while data := next(features_to_ingest, ()): s = datetime.now() logger.debug(f"ingest_gene_feature_annotation: ingesting batch of {len(data)} features") - await db.bulk_ingest_genome_features(data, logger) + await db.bulk_ingest_genome_features(data) n_ingested += len(data) logger.debug(f"ingest_gene_feature_annotation: batch took {(datetime.now() - s).total_seconds():.1f} seconds") diff --git a/bento_reference_service/models.py b/bento_reference_service/models.py index 954bad6..1d641d5 100644 --- a/bento_reference_service/models.py +++ b/bento_reference_service/models.py @@ -10,6 +10,7 @@ "GenomeWithURIs", "GenomeFeatureEntry", "GenomeFeature", + "GenomeFeatureWithInternalID", 
] # Pydantic/dict models, not database models @@ -92,3 +93,7 @@ class GenomeFeature(BaseModel): attributes: dict[str, list[str]] parents: tuple[str, ...] + + +class GenomeFeatureWithInternalID(GenomeFeature): + id: int diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 7e70629..ac91d2b 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -24,6 +24,7 @@ CREATE TABLE IF NOT EXISTS genome_aliases ( naming_authority VARCHAR(63) NOT NULL, PRIMARY KEY (genome_id, alias) ); +CREATE INDEX IF NOT EXISTS genome_aliases_genome_idx ON genome_aliases (genome_id); CREATE TABLE IF NOT EXISTS genome_contigs ( genome_id VARCHAR(31) NOT NULL REFERENCES genomes ON DELETE CASCADE, @@ -40,6 +41,7 @@ CREATE TABLE IF NOT EXISTS genome_contigs ( UNIQUE (genome_id, md5_checksum), UNIQUE (genome_id, ga4gh_checksum) ); +CREATE INDEX IF NOT EXISTS genome_contigs_genome_idx ON genome_contigs (genome_id); CREATE INDEX IF NOT EXISTS genome_contigs_md5_checksum_idx ON genome_contigs (md5_checksum); CREATE INDEX IF NOT EXISTS genome_contigs_ga4gh_checksum_idx ON genome_contigs (ga4gh_checksum); @@ -65,7 +67,9 @@ EXCEPTION END $$; CREATE TABLE IF NOT EXISTS genome_features ( - genome_id VARCHAR(31) NOT NULL REFERENCES genomes, + -- Don't use SERIAL, since we need to keep track of these during ingest for bulk ingestion: + id INTEGER NOT NULL PRIMARY KEY, + genome_id VARCHAR(31) NOT NULL REFERENCES genomes ON DELETE CASCADE, -- Feature location information, on the genome: contig_name VARCHAR(63) NOT NULL, strand strand_type NOT NULL, @@ -77,24 +81,26 @@ CREATE TABLE IF NOT EXISTS genome_features ( feature_type VARCHAR(15) NOT NULL REFERENCES genome_feature_types, source TEXT NOT NULL, -- Keys: - PRIMARY KEY (genome_id, feature_id), + UNIQUE (genome_id, feature_id), FOREIGN KEY (genome_id, contig_name) REFERENCES genome_contigs ); +CREATE INDEX IF NOT EXISTS genome_features_genome_idx ON genome_features 
(genome_id); CREATE INDEX IF NOT EXISTS genome_features_feature_id_trgm_gin ON genome_features USING GIN (feature_id gin_trgm_ops); -CREATE INDEX IF NOT EXISTS genome_features_feature_name_trgm_gin ON genome_features USING GIN (feature_name gin_trgm_ops); +CREATE INDEX IF NOT EXISTS genome_features_feature_name_trgm_gin + ON genome_features USING GIN (feature_name gin_trgm_ops); CREATE TABLE IF NOT EXISTS genome_feature_entries ( - genome_id VARCHAR(31) NOT NULL REFERENCES genomes, - feature_id VARCHAR(63) NOT NULL, + id SERIAL PRIMARY KEY, + feature INTEGER NOT NULL REFERENCES genome_features ON DELETE CASCADE, start_pos INTEGER NOT NULL, -- 1-based, inclusive end_pos INTEGER NOT NULL, -- 1-based, exclusive - if start_pos == end_pos then it's a 0-length feature position_text TEXT NOT NULL, -- chr:start-end style searchable string - cached for indexing purposes score FLOAT, - phase SMALLINT, - -- Keys: - FOREIGN KEY (genome_id, feature_id) REFERENCES genome_features + phase SMALLINT ); -CREATE INDEX IF NOT EXISTS genome_feature_entries_genome_feature_idx ON genome_feature_entries (genome_id, feature_id); +CREATE INDEX IF NOT EXISTS genome_feature_entries_feature_idx ON genome_feature_entries (feature); +CREATE INDEX IF NOT EXISTS genome_feature_entries_start_end_pos_idx ON genome_feature_entries (start_pos, end_pos); +CREATE INDEX IF NOT EXISTS genome_feature_entries_end_pos_idx ON genome_feature_entries (end_pos); CREATE INDEX IF NOT EXISTS genome_feature_entries_position_text_trgm_gin ON genome_feature_entries USING GIN (position_text gin_trgm_ops); @@ -102,25 +108,20 @@ CREATE INDEX IF NOT EXISTS genome_feature_entries_position_text_trgm_gin -- in GFF3 files, features can have one or multiple parents within the same annotation file -- - facilitate this via a many-to-many table CREATE TABLE IF NOT EXISTS genome_feature_parents ( - genome_id VARCHAR(31) NOT NULL REFERENCES genomes, - feature_id VARCHAR(63) NOT NULL, - parent_id VARCHAR(63) NOT NULL, - FOREIGN KEY 
(genome_id, feature_id) REFERENCES genome_features, - FOREIGN KEY (genome_id, parent_id) REFERENCES genome_features + feature INTEGER NOT NULL REFERENCES genome_features ON DELETE CASCADE, + parent INTEGER NOT NULL REFERENCES genome_features ON DELETE CASCADE, + PRIMARY KEY (feature, parent) ); -CREATE INDEX IF NOT EXISTS genome_feature_parents_genome_feature_idx ON genome_feature_parents (genome_id, feature_id); +CREATE INDEX IF NOT EXISTS genome_feature_parents_feature_idx ON genome_feature_parents (feature); +CREATE INDEX IF NOT EXISTS genome_feature_parents_parent_idx ON genome_feature_parents (parent); -- attributes can also have multiple values, so we don't enforce uniqueness on (genome_id, feature_id, attr_tag) -- these are non-Parent, non-ID attributes CREATE TABLE IF NOT EXISTS genome_feature_attributes ( - annotation_id SERIAL PRIMARY KEY, - genome_id VARCHAR(31) NOT NULL REFERENCES genomes, - feature_id VARCHAR(63) NOT NULL, + id SERIAL PRIMARY KEY, + feature INTEGER NOT NULL REFERENCES genome_features ON DELETE CASCADE, attr_tag VARCHAR(63) NOT NULL, - attr_val TEXT NOT NULL, - FOREIGN KEY (genome_id, feature_id) REFERENCES genome_features + attr_val TEXT NOT NULL ); -CREATE INDEX IF NOT EXISTS genome_feature_attributes_genome_feature_idx - ON genome_feature_parents (genome_id, feature_id); CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_idx - ON genome_feature_attributes (genome_id, feature_id, attr_tag); + ON genome_feature_attributes (feature, attr_tag); From c4325881320247bf60b187e2ea217a2d819137cb Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 12:29:13 -0400 Subject: [PATCH 040/114] fix: issue with v0.2 migration SQL --- bento_reference_service/sql/migrate_v0_2.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/sql/migrate_v0_2.sql b/bento_reference_service/sql/migrate_v0_2.sql index 354ce70..a889dfc 100644 --- a/bento_reference_service/sql/migrate_v0_2.sql +++ 
b/bento_reference_service/sql/migrate_v0_2.sql @@ -1,7 +1,7 @@ -- Run these commands before migrating to v0.2.x DROP TABLE genome_feature_annotations CASCADE; -DROP TABLE genome_feature_features CASCADE; +DROP TABLE genome_features CASCADE; DROP TABLE IF EXISTS genome_feature_type_synonyms; -- from v0.1, now unused DROP TYPE strand_type CASCADE; From edf6e9c616209f635bcf25bdefe106e24d32d83b Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 15:06:40 -0400 Subject: [PATCH 041/114] lint --- bento_reference_service/db.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index abcb9ce..338025a 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -363,8 +363,6 @@ def _q_param(pv: str | int) -> str: LIMIT {_q_param(max(limit, 0))} """ - from datetime import datetime - conn: asyncpg.Connection async with self.connect() as conn: id_res = await conn.fetch(id_query, g_id, *q_params) From 938f6b5c97e12e786742337259963696a377f60f Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 15:06:57 -0400 Subject: [PATCH 042/114] perf(db): release connections earlier in db manager --- bento_reference_service/db.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 338025a..d8ae86e 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -119,8 +119,8 @@ async def _select_genomes(self, g_id: str | None, external_resource_uris: bool) *((g_id,) if g_id is not None else ()), ) - for r in map(lambda g: self.deserialize_genome(g, external_resource_uris), res): - yield r + for r in map(lambda g: self.deserialize_genome(g, external_resource_uris), res): + yield r async def get_genomes(self, external_resource_uris: bool = False) -> tuple[GenomeWithURIs, ...]: return tuple([r async for r in self._select_genomes(None, external_resource_uris)]) @@ 
-143,12 +143,13 @@ async def get_genome_and_contig_by_checksum_str( contig_res = await conn.fetchrow( "SELECT * FROM genome_contigs WHERE md5_checksum = $1 OR ga4gh_checksum = $1", chk_norm ) - genome_res = ( - (await anext(self._select_genomes(contig_res["genome_id"], False), None)) if contig_res else None - ) - if genome_res is None or contig_res is None: - return None - return genome_res, self.deserialize_contig(contig_res) + + genome_res = ( + (await anext(self._select_genomes(contig_res["genome_id"], False), None)) if contig_res else None + ) + if genome_res is None or contig_res is None: + return None + return genome_res, self.deserialize_contig(contig_res) async def create_genome(self, g: Genome, return_external_resource_uris: bool) -> GenomeWithURIs | None: conn: asyncpg.Connection @@ -302,7 +303,7 @@ async def get_genome_features_by_ids( conn: asyncpg.Connection async with self.connect(existing_conn) as conn: final_res = await conn.fetch(final_query, g_id, f_ids) - return [self.deserialize_genome_feature(r) for r in final_res] + return [self.deserialize_genome_feature(r) for r in final_res] async def get_genome_feature_by_id(self, g_id: str, f_id: str) -> GenomeFeature | None: res = await self.get_genome_features_by_ids(g_id, [f_id]) From e8e5bed05da9b51fa06e94c4de7a20165fd87f7c Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 15:32:39 -0400 Subject: [PATCH 043/114] fix(workflows): fix jumping the gun ingesting fasta+gff3 at once --- bento_reference_service/workflows/wdls/fasta_ref.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index fb0ce4a..c2bab74 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -79,7 +79,8 @@ workflow fasta_ref { gff3_gz_tbi = select_first([gi.sorted_gff3_gz_tbi]), # " reference_url = reference_url, token = 
access_token, - validate_ssl = validate_ssl + validate_ssl = validate_ssl, + wait_for_ref_ingest = ingest_metadata_into_ref.out } } } @@ -219,6 +220,7 @@ task ingest_gff3_into_ref { String reference_url String token Boolean validate_ssl + File wait_for_ref_ingest # dummy file to force this task to wait for the reference to be ingested first } command <<< From 6d9ad33a33dcc16d779d997d27e7ac7bbabb67c3 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 16:46:01 -0400 Subject: [PATCH 044/114] refact!: rewrite feature ingest to use async task-based flow perf: reduce size of attribute table --- bento_reference_service/db.py | 230 ++++++++++++++----- bento_reference_service/features.py | 76 ++++-- bento_reference_service/main.py | 3 + bento_reference_service/models.py | 12 +- bento_reference_service/routers/constants.py | 11 + bento_reference_service/routers/genomes.py | 54 ++--- bento_reference_service/routers/tasks.py | 23 ++ bento_reference_service/sql/schema.sql | 46 +++- tests/conftest.py | 15 +- tests/test_genome_routes.py | 30 ++- 10 files changed, 380 insertions(+), 120 deletions(-) create mode 100644 bento_reference_service/routers/constants.py create mode 100644 bento_reference_service/routers/tasks.py diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index d8ae86e..ae230aa 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -5,7 +5,7 @@ from fastapi import Depends from functools import lru_cache from pathlib import Path -from typing import Annotated, AsyncIterator, Iterable +from typing import Annotated, AsyncIterator, Literal from .config import Config, ConfigDependency from .logger import LoggerDependency @@ -17,7 +17,7 @@ OntologyTerm, GenomeFeatureEntry, GenomeFeature, - GenomeFeatureWithInternalID, + Task, ) @@ -144,9 +144,7 @@ async def get_genome_and_contig_by_checksum_str( "SELECT * FROM genome_contigs WHERE md5_checksum = $1 OR ga4gh_checksum = $1", chk_norm ) - genome_res = ( - 
(await anext(self._select_genomes(contig_res["genome_id"], False), None)) if contig_res else None - ) + genome_res = (await anext(self._select_genomes(contig_res["genome_id"], False), None)) if contig_res else None if genome_res is None or contig_res is None: return None return genome_res, self.deserialize_contig(contig_res) @@ -290,9 +288,12 @@ async def get_genome_features_by_ids( ) parents, ( WITH attrs_tmp AS ( - SELECT attr_tag, array_agg(gfa.attr_val) attr_vals FROM genome_feature_attributes gfa + SELECT gfak.attr_key AS attr_key, array_agg(gfav.attr_val) attr_vals + FROM genome_feature_attributes gfa + JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id + JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id WHERE gfa.feature = gf.id - GROUP BY gfa.attr_tag + GROUP BY gfak.attr_key ) SELECT jsonb_object_agg(attrs_tmp.attr_tag, attrs_tmp.attr_vals) FROM attrs_tmp ) attributes @@ -376,56 +377,104 @@ async def clear_genome_features(self, g_id: str): async with self.connect() as conn: await conn.execute("DELETE FROM genome_features WHERE genome_id = $1", g_id) - async def bulk_ingest_genome_features(self, features: tuple[GenomeFeatureWithInternalID, ...]): - feature_types: set[tuple[str]] = set() - entries: list[tuple[int, int, int, str, float | None, int | None]] = [] - attributes: list[tuple[int, str, str]] = [] - parents: list[tuple[int, int]] = [] - feature_tuples: list[tuple[int, str, str, str, str, str, str, str]] = [] - - parent_row_ids_by_nat_id = {p.feature_id: p.id for p in features} - - for feature in features: - row_id = feature.id - genome_id = feature.genome_id - contig_name = feature.contig_name - feature_id = feature.feature_id - - feature_types.add((feature.feature_type,)) - - entries.extend( - ( - row_id, - e.start_pos, - e.end_pos, - f"{contig_name}:{e.start_pos}-{e.end_pos}", - e.score, - e.phase, - ) - for e in feature.entries - ) - - for attr_tag, attr_vals in feature.attributes.items(): - 
attributes.extend((row_id, attr_tag, attr_val) for attr_val in attr_vals) - - parents.extend((row_id, parent_row_ids_by_nat_id[p]) for p in feature.parents) - - feature_tuples.append( - ( - row_id, - genome_id, - contig_name, - feature.strand, - feature_id, - feature.feature_name, - feature.feature_type, - feature.source, - ) - ) + async def bulk_ingest_genome_features(self, features: tuple[GenomeFeature, ...]): + # Manually generate sequential IDs + # This requires an exclusive write lock on the database, so we don't get conflicting IDs conn: asyncpg.Connection async with self.connect() as conn: async with conn.transaction(): + fr = await conn.fetchrow("SELECT COALESCE(MAX(id), 0) + 1 AS next_id FROM genome_features") + kr = await conn.fetchrow( + "SELECT COALESCE(MAX(id), 0) + 1 AS next_id FROM genome_feature_attribute_keys" + ) + vr = await conn.fetchrow( + "SELECT COALESCE(MAX(id), 0) + 1 AS next_id FROM genome_feature_attribute_values" + ) + + assert fr + assert kr + assert vr + + # We generate a numeric ID for features to save space and improve lookup time.; + current_feature_row_id: int = fr["next_id"] + current_attr_key_id: int = kr["next_id"] + current_attr_value_id: int = vr["next_id"] + + feature_row_ids: dict[str, int] = {} + attr_key_ids: dict[str, int] = {} + attr_value_ids: dict[str, int] = {} + + # ------------------------------------------------------------------------------------------------------ + + feature_types: set[tuple[str]] = set() + entries: list[tuple[int, int, int, str, float | None, int | None]] = [] + attributes: list[tuple[int, int, int]] = [] + parents: list[tuple[int, int]] = [] + feature_tuples: list[tuple[int, str, str, str, str, str, str, str, str | None]] = [] + + for f in features: # iter 1: populate row ID lookup dict + feature_row_ids[f.feature_id] = current_feature_row_id + current_feature_row_id += 1 + + for feature in features: + feature_id = feature.feature_id + + row_id = feature_row_ids[feature_id] + genome_id = 
feature.genome_id + contig_name = feature.contig_name + + feature_types.add((feature.feature_type,)) + + entries.extend( + ( + row_id, + e.start_pos, + e.end_pos, + f"{contig_name}:{e.start_pos}-{e.end_pos}", + e.score, + e.phase, + ) + for e in feature.entries + ) + + # to reduce attribute storage, we deduplicate storage of keys and values by giving them integer IDs, + # and put these in the attributes table. we also have to put the key/value lookups into their + # respective tables. + for attr_key, attr_vals in feature.attributes.items(): + if attr_key in attr_key_ids: + ak = attr_key_ids[attr_key] + else: + ak = current_attr_key_id + attr_key_ids[attr_key] = current_attr_key_id + current_attr_key_id += 1 + + for attr_val in attr_vals: + if attr_val in attr_value_ids: + av = attr_value_ids[attr_val] + else: + av = current_attr_value_id + attr_value_ids[attr_val] = current_attr_value_id + current_attr_value_id += 1 + + attributes.append((row_id, ak, av)) + + parents.extend((row_id, feature_row_ids[p]) for p in feature.parents) + + feature_tuples.append( + ( + row_id, + genome_id, + contig_name, + feature.strand, + feature_id, + feature.feature_name, + feature.feature_type, + feature.source, + feature_row_ids.get(feature.gene_id) if feature.gene_id else None, + ) + ) + self.logger.debug(f"bulk_ingest_genome_features: have {len(feature_types)} feature types for batch") await conn.executemany( "INSERT INTO genome_feature_types(type_id) VALUES ($1) ON CONFLICT DO NOTHING", feature_types @@ -443,10 +492,27 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeatureWithInt "feature_name", "feature_type", "source", + "gene_id", ], records=feature_tuples, ) + attribute_keys: list[tuple[int, str]] = [(ik, sk) for sk, ik in attr_key_ids.items()] + self.logger.debug( + f"bulk_ingest_genome_features: have {len(attribute_keys)} feature attribute keys for batch" + ) + await conn.copy_records_to_table( + "genome_feature_attribute_keys", columns=["id", 
"attr_key"], records=attribute_keys + ) + + attribute_values: list[tuple[int, str]] = [(iv, sv) for sv, iv in attr_value_ids.items()] + self.logger.debug( + f"bulk_ingest_genome_features: have {len(attribute_keys)} feature attribute values for batch" + ) + await conn.copy_records_to_table( + "genome_feature_attribute_values", columns=["id", "attr_val"], records=attribute_values + ) + self.logger.debug( f"bulk_ingest_genome_features: have {len(attributes)} feature attribute records for batch" ) @@ -454,7 +520,7 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeatureWithInt "genome_feature_attributes", columns=[ "feature", - "attr_tag", + "attr_key", "attr_val", ], records=attributes, @@ -484,6 +550,62 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeatureWithInt records=parents, ) + @staticmethod + def deserialize_task(rec: asyncpg.Record | dict) -> Task: + return Task( + id=rec["id"], + genome_id=rec["genome_id"], + kind=rec["kind"], + status=rec["status"], + message=rec["message"], + created=rec["created"], + ) + + async def get_task(self, t_id: int) -> Task | None: + conn: asyncpg.Connection + async with self.connect() as conn: + res = await conn.fetchrow("SELECT * FROM tasks WHERE id = $1", t_id) + return self.deserialize_task(res) if res is not None else None + + async def query_tasks(self, g_id: str | None, task_kind: Literal["ingest_features"] | None) -> tuple[Task, ...]: + conn: asyncpg.Connection + async with self.connect() as conn: + where_clauses: list[str] = [] + params: list[int | str] = [] + + if g_id is not None: + where_clauses.append(f"genome_id = ${len(where_clauses) + 1}") + params.append(g_id) + + if task_kind is not None: + where_clauses.append(f"kind = ${len(where_clauses) + 1}::task_kind") + params.append(task_kind) + + where_part = " AND ".join(where_clauses) if where_clauses else "true" + + res = await conn.fetch(f"SELECT * FROM tasks WHERE genome_id = $1 {where_part}", *params) + return 
tuple(self.deserialize_task(r) for r in res) + + async def update_task_status( + self, t_id: int, status: Literal["queued", "running", "success", "error"], message: str = "" + ): + conn: asyncpg.Connection + async with self.connect() as conn: + await conn.execute( + "UPDATE tasks SET status = $2::task_status, message = $3 WHERE id = $1", t_id, status, message + ) + + async def create_task(self, g_id: str, task_kind: Literal["ingest_features"]) -> int: + conn: asyncpg.Connection + async with self.connect() as conn: + res = await conn.fetchrow( + "INSERT INTO tasks (genome_id, kind) VALUES ($1, $2::task_kind) RETURNING id", + g_id, + task_kind, + ) + assert res is not None + return res["id"] + @lru_cache() def get_db(config: ConfigDependency, logger: LoggerDependency) -> Database: # pragma: no cover diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 49fab06..50acb83 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -11,12 +11,18 @@ from .db import Database __all__ = [ + "INGEST_FEATURES_TASK_KIND", "AnnotationGenomeNotFoundError", - "ingest_gene_feature_annotation", + "ingest_features", + "ingest_features_task", ] +INGEST_FEATURES_TASK_KIND = "ingest_features" -GFF_CAPTURED_ATTRIBUTES = frozenset({"ID", "Parent"}) +GFF_ID_ATTR = "ID" +GFF_PARENT_ATTR = "ID" +GFF_GENCODE_GENE_ID_ATTR = "gene_id" +GFF_CAPTURED_ATTRIBUTES = frozenset({GFF_ID_ATTR, GFF_PARENT_ATTR, GFF_GENCODE_GENE_ID_ATTR}) GFF_SKIPPED_FEATURE_TYPES = frozenset({"stop_codon_redefined_as_selenocysteine"}) GFF_LOG_PROGRESS_INTERVAL = 1000 @@ -36,14 +42,14 @@ def parse_attributes(raw_attributes: dict[str, str]) -> dict[str, list[str]]: def extract_feature_id(record, attributes: dict[str, list[str]]) -> str | None: feature_type = record.feature.lower() - feature_id = attributes.get("ID", (None,))[0] + feature_id = attributes.get(GFF_ID_ATTR, (None,))[0] if feature_id: return feature_id match feature_type: case "gene": - 
return attributes.get("gene_id", (None,))[0] + return attributes.get(GFF_GENCODE_GENE_ID_ATTR, (None,))[0] case "transcript": return attributes.get("transcript_id", (None,))[0] case "exon": @@ -63,7 +69,7 @@ def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None match feature_type: case "gene": - return attributes.get("gene_name", attributes.get("gene_id", (None,)))[0] + return attributes.get("gene_name", attributes.get(GFF_GENCODE_GENE_ID_ATTR, (None,)))[0] case "transcript": return transcript_name case "5utr" | "five_prime_utr": # 5' untranslated region (UTR) @@ -85,7 +91,7 @@ def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None return None -async def ingest_gene_feature_annotation( +async def ingest_features( # parameters: genome_id: str, gff_path: Path, @@ -93,7 +99,7 @@ async def ingest_gene_feature_annotation( # dependencies: db: Database, logger: logging.Logger, -) -> None: +) -> int: """ Given a genome ID and a path to an external GTF gene/exon/transcript annotation file, this function copies the GTF into the relevant .bentoGenome directory and ingests the annotations into an ElasticSearch index for fuzzy text @@ -113,14 +119,12 @@ async def ingest_gene_feature_annotation( logger.info(f"Ingesting gene features for genome {genome_id}...") - def _iter_features() -> Generator[tuple[m.GenomeFeatureWithInternalID, ...], None, None]: + def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) total_processed: int = 0 - current_feature_row_id: int = 1 # We generate a numeric ID for features to save space and improve lookup time. 
- try: - features_by_id: dict[str, m.GenomeFeatureWithInternalID] = {} + features_by_id: dict[str, m.GenomeFeature] = {} for contig in genome.contigs: logger.info(f"Indexing features from contig {contig.name}") @@ -165,8 +169,14 @@ def _iter_features() -> Generator[tuple[m.GenomeFeatureWithInternalID, ...], Non if feature_id in features_by_id: features_by_id[feature_id].entries.append(entry) else: - features_by_id[feature_id] = m.GenomeFeatureWithInternalID( - id=current_feature_row_id, + attributes: dict[str, list[str]] = { + # skip attributes which have been captured in the above information: + k: vs + for k, vs in record_attributes.items() + if k not in GFF_CAPTURED_ATTRIBUTES + } + + features_by_id[feature_id] = m.GenomeFeature( genome_id=genome_id, contig_name=contig.name, strand=record.strand or ".", # None/"." <=> unstranded @@ -175,15 +185,10 @@ def _iter_features() -> Generator[tuple[m.GenomeFeatureWithInternalID, ...], Non feature_type=feature_type, source=record.source, entries=[entry], - attributes={ - # skip attributes which have been captured in the above information - k: v - for k, v in record_attributes.items() - if k not in GFF_CAPTURED_ATTRIBUTES - }, - parents=tuple(p for p in record_attributes.get("Parent", ()) if p), + gene_id=record_attributes.get(GFF_GENCODE_GENE_ID_ATTR, (None,))[0], + attributes=attributes, + parents=tuple(p for p in record_attributes.get(GFF_PARENT_ATTR, ()) if p), ) - current_feature_row_id += 1 except Exception as e: logger.error( @@ -220,3 +225,32 @@ def _iter_features() -> Generator[tuple[m.GenomeFeatureWithInternalID, ...], Non raise AnnotationIngestError("No gene features could be ingested - is this a valid GFF3 file?") logger.info(f"ingest_gene_feature_annotation: ingested {n_ingested} gene features") + + return n_ingested + + +async def ingest_features_task( + genome_id: str, gff3_gz_path: Path, gff3_gz_tbi_path: Path, task_id: int, db: Database, logger: logging.Logger +): + await db.update_task_status(task_id, 
"running") + + # clear existing gene features for this genome + logger.info(f"Clearing gene features for genome {genome_id} in preparation for feature (re-)ingestion...") + await db.clear_genome_features(genome_id) + + try: + # ingest gene features into the database + n_ingested = await ingest_features(genome_id, gff3_gz_path, gff3_gz_tbi_path, db, logger) + await db.update_task_status(task_id, "success", message=f"ingested {n_ingested} features") + + except Exception as e: + err = ( + f"task {task_id}: encountered exception while ingesting features: {e}; traceback: {traceback.format_exc()}" + ) + logger.error(err) + await db.update_task_status(task_id, "error", message=err) + + finally: + # unlink temporary files + gff3_gz_path.unlink(missing_ok=True) + gff3_gz_tbi_path.unlink(missing_ok=True) diff --git a/bento_reference_service/main.py b/bento_reference_service/main.py index df56e95..74c8f46 100644 --- a/bento_reference_service/main.py +++ b/bento_reference_service/main.py @@ -15,6 +15,7 @@ from .logger import get_logger, LoggerDependency from .routers.genomes import genome_router from .routers.refget import refget_router +from .routers.tasks import task_router from .routers.workflows import workflow_router @@ -22,9 +23,11 @@ # Attach different routers to the app, for: # - genome listing +# - asynchronous task querying # - our RefGet API implementation # - our workflow metadata and WDLs app.include_router(genome_router) +app.include_router(task_router) app.include_router(refget_router) app.include_router(workflow_router) diff --git a/bento_reference_service/models.py b/bento_reference_service/models.py index 1d641d5..e6225ae 100644 --- a/bento_reference_service/models.py +++ b/bento_reference_service/models.py @@ -1,3 +1,4 @@ +from datetime import datetime from pydantic import BaseModel from typing import Literal @@ -10,7 +11,7 @@ "GenomeWithURIs", "GenomeFeatureEntry", "GenomeFeature", - "GenomeFeatureWithInternalID", + "Task", ] # Pydantic/dict models, not 
database models @@ -90,10 +91,17 @@ class GenomeFeature(BaseModel): source: str entries: list[GenomeFeatureEntry] # mutable to allow us to gradually build up entry list during ingestion + + gene_id: str | None # extracted from attributes, since for GENCODE GFF3s this is a standardized and useful field attributes: dict[str, list[str]] parents: tuple[str, ...] -class GenomeFeatureWithInternalID(GenomeFeature): +class Task(BaseModel): id: int + genome_id: str + kind: Literal["ingest_features"] + status: Literal["active", "success", "error"] + message: str + created: datetime diff --git a/bento_reference_service/routers/constants.py b/bento_reference_service/routers/constants.py new file mode 100644 index 0000000..13882c1 --- /dev/null +++ b/bento_reference_service/routers/constants.py @@ -0,0 +1,11 @@ +from bento_lib.auth.permissions import P_INGEST_REFERENCE_MATERIAL +from bento_lib.auth.resources import RESOURCE_EVERYTHING + +from ..authz import authz_middleware + +__all__ = ["DEPENDENCY_INGEST_REFERENCE_MATERIAL"] + + +DEPENDENCY_INGEST_REFERENCE_MATERIAL = authz_middleware.dep_require_permissions_on_resource( + frozenset({P_INGEST_REFERENCE_MATERIAL}), RESOURCE_EVERYTHING +) diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 7375f58..7ed7e3a 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -2,9 +2,9 @@ import asyncpg import traceback -from bento_lib.auth.permissions import P_INGEST_REFERENCE_MATERIAL, P_DELETE_REFERENCE_MATERIAL +from bento_lib.auth.permissions import P_DELETE_REFERENCE_MATERIAL from bento_lib.auth.resources import RESOURCE_EVERYTHING -from fastapi import APIRouter, HTTPException, Query, Request, UploadFile, status +from fastapi import APIRouter, BackgroundTasks, HTTPException, Query, Request, UploadFile, status from fastapi.responses import StreamingResponse from typing import Annotated from uuid import uuid4 @@ -13,18 +13,15 @@ from 
..authz import authz_middleware from ..config import ConfigDependency from ..db import Database, DatabaseDependency -from ..features import AnnotationGenomeNotFoundError, ingest_gene_feature_annotation +from ..features import INGEST_FEATURES_TASK_KIND, ingest_features_task from ..logger import LoggerDependency from ..streaming import generate_uri_streaming_response +from .constants import DEPENDENCY_INGEST_REFERENCE_MATERIAL __all__ = ["genome_router"] -DEPENDENCY_INGEST_REFERENCE_MATERIAL = authz_middleware.dep_require_permissions_on_resource( - frozenset({P_INGEST_REFERENCE_MATERIAL}), RESOURCE_EVERYTHING -) - genome_router = APIRouter(prefix="/genomes") @@ -226,9 +223,10 @@ async def genomes_detail_features_gff3( @genome_router.put( "/{genome_id}/features.gff3.gz", dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL], - status_code=status.HTTP_204_NO_CONTENT, + status_code=status.HTTP_202_ACCEPTED, ) async def genomes_detail_features_ingest_gff3( + background_tasks: BackgroundTasks, config: ConfigDependency, db: DatabaseDependency, logger: LoggerDependency, @@ -236,37 +234,29 @@ async def genomes_detail_features_ingest_gff3( gff3_gz: UploadFile, gff3_gz_tbi: UploadFile, ): + # Verify that genome exists + await get_genome_or_raise_404(db, genome_id=genome_id, external_resource_uris=False) + fn = config.file_ingest_tmp_dir / f"{uuid4()}.gff3.gz" fn_tbi = config.file_ingest_tmp_dir / f"{fn}.tbi" - try: - # copy .gff3.gz to temporary directory for ingestion - async with aiofiles.open(fn, "wb") as fh: - while data := (await gff3_gz.read(config.file_response_chunk_size)): - await fh.write(data) + # copy .gff3.gz to temporary directory for ingestion + async with aiofiles.open(fn, "wb") as fh: + while data := (await gff3_gz.read(config.file_response_chunk_size)): + await fh.write(data) - logger.debug(f"Wrote GFF.gz data to {fn}; size={fn.stat().st_size}") + logger.debug(f"Wrote GFF.gz data to {fn}; size={fn.stat().st_size}") - # copy .gff3.gz.tbi to temporary directory 
for ingestion - async with aiofiles.open(fn_tbi, "wb") as fh: - while data := (await gff3_gz_tbi.read(config.file_response_chunk_size)): - await fh.write(data) + # copy .gff3.gz.tbi to temporary directory for ingestion + async with aiofiles.open(fn_tbi, "wb") as fh: + while data := (await gff3_gz_tbi.read(config.file_response_chunk_size)): + await fh.write(data) - logger.debug(f"Wrote GFF.gz.tbi data to {fn_tbi}; size={fn_tbi.stat().st_size}") - - # clear existing gene features for this genome - logger.info(f"Clearing gene features for genome {genome_id} in preparation for feature (re-)ingestion...") - await db.clear_genome_features(genome_id) - - # ingest gene features into the database - await ingest_gene_feature_annotation(genome_id, fn, fn_tbi, db, logger) - - except AnnotationGenomeNotFoundError: - raise exc_genome_not_found(genome_id) + logger.debug(f"Wrote GFF.gz.tbi data to {fn_tbi}; size={fn_tbi.stat().st_size}") - finally: - fn.unlink(missing_ok=True) - fn_tbi.unlink(missing_ok=True) + task_id = await db.create_task(genome_id, INGEST_FEATURES_TASK_KIND) + background_tasks.add_task(ingest_features_task, genome_id, fn, fn_tbi, task_id, db, logger) + return {"task": f"{config.service_url_base_path}/tasks/{task_id}"} @genome_router.get("/{genome_id}/features.gff3.gz.tbi", dependencies=[authz_middleware.dep_public_endpoint()]) diff --git a/bento_reference_service/routers/tasks.py b/bento_reference_service/routers/tasks.py new file mode 100644 index 0000000..3367847 --- /dev/null +++ b/bento_reference_service/routers/tasks.py @@ -0,0 +1,23 @@ +from fastapi import APIRouter, HTTPException, status + +from ..db import DatabaseDependency +from ..models import Task +from .constants import DEPENDENCY_INGEST_REFERENCE_MATERIAL + +__all__ = ["task_router"] + + +task_router = APIRouter(prefix="/tasks") + + +@task_router.get("", dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL]) +async def tasks_list(db: DatabaseDependency): + return await db.query_tasks(None, None) + 
+ +@task_router.get("/{task_id}", dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL]) +async def tasks_detail(task_id: int, db: DatabaseDependency) -> Task: + task = await db.get_task(task_id) + if task is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Task with ID {task_id} not found") + return task diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index ac91d2b..88503fa 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -80,9 +80,11 @@ CREATE TABLE IF NOT EXISTS genome_features ( feature_name TEXT NOT NULL, feature_type VARCHAR(15) NOT NULL REFERENCES genome_feature_types, source TEXT NOT NULL, + -- extracted from attributes (especially GENCODE GFF3s) - gene context (NULL if not in a gene and not a gene): + gene_id INTEGER REFERENCES genome_features ON DELETE CASCADE, -- Keys: UNIQUE (genome_id, feature_id), - FOREIGN KEY (genome_id, contig_name) REFERENCES genome_contigs + FOREIGN KEY (genome_id, contig_name) REFERENCES genome_contigs ON DELETE CASCADE ); CREATE INDEX IF NOT EXISTS genome_features_genome_idx ON genome_features (genome_id); CREATE INDEX IF NOT EXISTS genome_features_feature_id_trgm_gin ON genome_features USING GIN (feature_id gin_trgm_ops); @@ -115,13 +117,45 @@ CREATE TABLE IF NOT EXISTS genome_feature_parents ( CREATE INDEX IF NOT EXISTS genome_feature_parents_feature_idx ON genome_feature_parents (feature); CREATE INDEX IF NOT EXISTS genome_feature_parents_parent_idx ON genome_feature_parents (parent); --- attributes can also have multiple values, so we don't enforce uniqueness on (genome_id, feature_id, attr_tag) --- these are non-Parent, non-ID attributes +-- attributes can also have multiple values, so we don't enforce uniqueness on (feature, attr_tag) +-- - these are non-Parent, non-ID attributes. 
+-- - since we have a lot of repetition, we can normalize both keys and values into their own deduplicated tables and do +-- this set-processing at ingestion time. + +CREATE TABLE IF NOT EXISTS genome_feature_attribute_keys ( + id INTEGER NOT NULL PRIMARY KEY, -- attribute-key surrogate key + attr_key VARCHAR(63) NOT NULL -- attribute-key text value +); +CREATE INDEX IF NOT EXISTS genome_feature_attribute_keys_attr_idx + ON genome_feature_attribute_keys (attr_key); + +CREATE TABLE IF NOT EXISTS genome_feature_attribute_values ( + id INTEGER NOT NULL PRIMARY KEY, -- attribute-value surrogate key + attr_val TEXT NOT NULL -- attribute value +); + CREATE TABLE IF NOT EXISTS genome_feature_attributes ( id SERIAL PRIMARY KEY, feature INTEGER NOT NULL REFERENCES genome_features ON DELETE CASCADE, - attr_tag VARCHAR(63) NOT NULL, - attr_val TEXT NOT NULL + attr_key INTEGER NOT NULL REFERENCES genome_feature_attribute_keys, + attr_val INTEGER NOT NULL REFERENCES genome_feature_attribute_values ); CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_idx - ON genome_feature_attributes (feature, attr_tag); + ON genome_feature_attributes (feature, attr_key); + + +DO $$ BEGIN + CREATE TYPE task_kind AS ENUM ('ingest_features'); + CREATE TYPE task_status AS ENUM ('queued', 'running', 'success', 'error'); +EXCEPTION + WHEN duplicate_object THEN null; +END $$; + +CREATE TABLE IF NOT EXISTS tasks ( + id SERIAL PRIMARY KEY, + genome_id VARCHAR(31) NOT NULL REFERENCES genomes, + kind task_kind NOT NULL, + status task_status NOT NULL DEFAULT 'queued'::task_status, + message TEXT NOT NULL DEFAULT '', + created TIMESTAMP DEFAULT (now() AT TIME ZONE 'utc') +); diff --git a/tests/conftest.py b/tests/conftest.py index efaa42f..731983a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,11 +15,13 @@ from bento_reference_service.config import get_config from bento_reference_service.db import Database, get_db +from bento_reference_service.logger import get_logger from 
bento_reference_service.main import app async def get_test_db() -> AsyncGenerator[Database, None]: - db_instance = Database(get_config()) + config = get_config() + db_instance = Database(config, get_logger(config)) await db_instance.initialize(pool_size=1) # Small pool size for testing yield db_instance @@ -34,13 +36,22 @@ async def db_cleanup(db: Database): async with db.connect() as conn: await conn.execute( """ + DROP TABLE IF EXISTS tasks; + DROP TYPE IF EXISTS task_kind; + DROP TYPE IF EXISTS task_status; + DROP INDEX IF EXISTS genome_features_feature_id_trgm_gin; DROP INDEX IF EXISTS genome_features_feature_name_trgm_gin; DROP INDEX IF EXISTS genome_feature_entries_position_text_trgm_gin; DROP INDEX IF EXISTS genome_feature_attributes_attr_idx; - + DROP INDEX IF EXISTS genome_feature_attribute_keys_attr_idx; + DROP INDEX IF EXISTS genome_feature_parents_feature_idx; + DROP INDEX IF EXISTS genome_feature_parents_parent_idx; + DROP TABLE IF EXISTS genome_feature_entries; DROP TABLE IF EXISTS genome_feature_attributes; + DROP TABLE IF EXISTS genome_feature_attribute_keys; + DROP TABLE IF EXISTS genome_feature_attribute_values; DROP TABLE IF EXISTS genome_feature_parents; DROP TABLE IF EXISTS genome_features; diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index c2bede8..4d55323 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -153,12 +153,36 @@ async def test_genome_feature_ingest(test_client: TestClient, aioresponse: aiore # setup: create genome TODO: fixture create_genome_with_permissions(test_client, aioresponse) - aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + hs = {"Authorization": "Token bearer"} + + # Test we can create a task for ingesting features + + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}, repeat=True) with open(SARS_COV_2_GFF3_GZ_PATH, "rb") as gff3_fh, open(SARS_COV_2_GFF3_GZ_TBI_PATH, "rb") as tbi_fh: res = 
test_client.put( f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz", files={"gff3_gz": gff3_fh, "gff3_gz_tbi": tbi_fh}, - headers={"Authorization": "Token bearer"}, + headers=hs, ) - assert res.status_code == status.HTTP_204_NO_CONTENT + + assert res.status_code == status.HTTP_202_ACCEPTED + data = res.json() + assert "task" in data + task_id = data["task"].split("/")[-1] + + # Test we can access the task and that it eventually succeeds + + finished: bool = False + task_status: str = "" + task_msg: str = "" + while not finished: + res = test_client.get(f"/tasks/{task_id}", headers=hs) + assert res.status_code == status.HTTP_200_OK + rd = res.json() + task_status = rd["status"] + task_msg = rd["message"] + finished = task_status in {"success", "error"} + + assert task_status == "success" + assert task_msg == "ingested 49 features" From dd6b998a537f6c7043d410fadc4c4ef7781c8ce3 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 16:55:50 -0400 Subject: [PATCH 045/114] chore: increase log interval for feature ingest --- bento_reference_service/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 50acb83..b2e28c1 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -24,7 +24,7 @@ GFF_GENCODE_GENE_ID_ATTR = "gene_id" GFF_CAPTURED_ATTRIBUTES = frozenset({GFF_ID_ATTR, GFF_PARENT_ATTR, GFF_GENCODE_GENE_ID_ATTR}) GFF_SKIPPED_FEATURE_TYPES = frozenset({"stop_codon_redefined_as_selenocysteine"}) -GFF_LOG_PROGRESS_INTERVAL = 1000 +GFF_LOG_PROGRESS_INTERVAL = 100000 class AnnotationGenomeNotFoundError(Exception): From c5d6165ef378bed713754792da6a92df488e25bd Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 16:56:01 -0400 Subject: [PATCH 046/114] refact: cleanup genomes router exc fn used once --- bento_reference_service/routers/genomes.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) 
diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 7ed7e3a..12b99d5 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -25,16 +25,12 @@ genome_router = APIRouter(prefix="/genomes") -def exc_genome_not_found(genome_id: str) -> HTTPException: - return HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") - - async def get_genome_or_raise_404( db: Database, genome_id: str, external_resource_uris: bool = True ) -> m.GenomeWithURIs: genome: m.GenomeWithURIs = await db.get_genome(genome_id, external_resource_uris=external_resource_uris) if genome is None: - raise exc_genome_not_found(genome_id) + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Genome with ID {genome_id} not found") return genome From b416d5cba79006f83acd1a9a5dee2589354003eb Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 20:25:16 -0400 Subject: [PATCH 047/114] chore: update gff3 ingest workflow for new task system --- .../workflows/wdls/gff3_annot.wdl | 51 ++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/bento_reference_service/workflows/wdls/gff3_annot.wdl b/bento_reference_service/workflows/wdls/gff3_annot.wdl index b7fc75d..16772de 100644 --- a/bento_reference_service/workflows/wdls/gff3_annot.wdl +++ b/bento_reference_service/workflows/wdls/gff3_annot.wdl @@ -68,13 +68,50 @@ task ingest_gff3_into_ref { } command <<< - curl ~{true="" false="-k" validate_ssl} \ - -X PUT \ - -F "gff3_gz=@~{gff3_gz}" \ - -F "gff3_gz_tbi=@~{gff3_gz_tbi}" \ - -H "Authorization: Bearer ~{token}" \ - --fail-with-body \ - "~{reference_url}/genomes/~{genome_id}/features.gff3.gz" + task_res=$( + curl ~{true="" false="-k" validate_ssl} \ + -X PUT \ + -F "gff3_gz=@~{gff3_gz}" \ + -F "gff3_gz_tbi=@~{gff3_gz_tbi}" \ + -H "Authorization: Bearer ~{token}" \ + --fail-with-body \ + 
"~{reference_url}/genomes/~{genome_id}/features.gff3.gz" + ) + exit_code=$? + if [[ "${exit_code}" == 0 ]]; then + task_url=$(jq -r '.task' <<< "${task_res}") + while true; do + task_status_res=$( + curl ~{true="" false="-k" validate_ssl} \ + -H "Authorization: Bearer ~{token}" \ + --fail-with-body \ + "${task_url}" + ) + + task_exit_code=$? + if [[ "${task_exit_code}" != 0 ]]; then + echo "task status response returned non-success status code" >&2 + exit 1 + fi + + task_status=$(jq -r '.status' <<< "${task_status_res}") + task_message=$(jq -r '.message' <<< "${task_status_res}") + + if [[ "${task_status}" == 'success' ]]; then + echo "task succeeded with message: ${task_message}" + break # success + fi + if [[ "${task_status}" == 'error' ]]; then + echo "task failed with message: ${task_message}" >&2 + exit 1 + fi + + # otherwise, running - wait + sleep 10 + done + else + exit "${exit_code}" + fi >>> output { From 122c40dcc60bbee09930ecfd636e47a27b46a37b Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 20:25:34 -0400 Subject: [PATCH 048/114] fix: bad task status literal in Task model --- bento_reference_service/db.py | 5 ++--- bento_reference_service/models.py | 6 +++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index ae230aa..106b56b 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -17,6 +17,7 @@ OntologyTerm, GenomeFeatureEntry, GenomeFeature, + TaskStatus, Task, ) @@ -586,9 +587,7 @@ async def query_tasks(self, g_id: str | None, task_kind: Literal["ingest_feature res = await conn.fetch(f"SELECT * FROM tasks WHERE genome_id = $1 {where_part}", *params) return tuple(self.deserialize_task(r) for r in res) - async def update_task_status( - self, t_id: int, status: Literal["queued", "running", "success", "error"], message: str = "" - ): + async def update_task_status(self, t_id: int, status: TaskStatus, message: str = ""): conn: 
asyncpg.Connection async with self.connect() as conn: await conn.execute( diff --git a/bento_reference_service/models.py b/bento_reference_service/models.py index e6225ae..b3dc607 100644 --- a/bento_reference_service/models.py +++ b/bento_reference_service/models.py @@ -11,6 +11,7 @@ "GenomeWithURIs", "GenomeFeatureEntry", "GenomeFeature", + "TaskStatus", "Task", ] @@ -98,10 +99,13 @@ class GenomeFeature(BaseModel): parents: tuple[str, ...] +TaskStatus = Literal["queued", "running", "success", "error"] + + class Task(BaseModel): id: int genome_id: str kind: Literal["ingest_features"] - status: Literal["active", "success", "error"] + status: TaskStatus message: str created: datetime From 1c9c34ad46943735a3dd991be378dfe06fc1088c Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 10 May 2024 20:27:55 -0400 Subject: [PATCH 049/114] fix(db): issues with genome feature select query --- bento_reference_service/db.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 106b56b..0227926 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -251,6 +251,7 @@ def deserialize_genome_feature(rec: asyncpg.Record) -> GenomeFeature: feature_type=rec["feature_type"], source=rec["source"], entries=tuple(map(Database.deserialize_genome_feature_entry, json.loads(rec["entries"] or "[]"))), + gene_id=rec["gene_nat_id"], attributes=json.loads(rec["attributes"] or "{}"), parents=tuple(rec["parents"] or ()), # tuple of parent IDs ) @@ -281,6 +282,7 @@ async def get_genome_features_by_ids( feature_name, feature_type, source, + (SELECT ggf.feature_id FROM genome_features ggf WHERE ggf.id = gf.gene_id) gene_nat_id, ({self._feature_inner_entries_query()}) entries, ( SELECT array_agg(gffp.feature_id) @@ -296,7 +298,7 @@ async def get_genome_features_by_ids( WHERE gfa.feature = gf.id GROUP BY gfak.attr_key ) - SELECT jsonb_object_agg(attrs_tmp.attr_tag, attrs_tmp.attr_vals) FROM 
attrs_tmp + SELECT jsonb_object_agg(attrs_tmp.attr_key, attrs_tmp.attr_vals) FROM attrs_tmp ) attributes FROM genome_features gf WHERE gf.genome_id = $1 AND feature_id = any($2::text[]) From ea3009fa67981286520d8a3127f8c03dc783efeb Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 13 May 2024 13:57:36 -0400 Subject: [PATCH 050/114] perf: add missing indices to aid in querying+deletion --- bento_reference_service/sql/schema.sql | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 88503fa..083a9e1 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -90,6 +90,8 @@ CREATE INDEX IF NOT EXISTS genome_features_genome_idx ON genome_features (genome CREATE INDEX IF NOT EXISTS genome_features_feature_id_trgm_gin ON genome_features USING GIN (feature_id gin_trgm_ops); CREATE INDEX IF NOT EXISTS genome_features_feature_name_trgm_gin ON genome_features USING GIN (feature_name gin_trgm_ops); +CREATE INDEX IF NOT EXISTS genome_features_feature_type_idx ON genome_features (feature_type); +CREATE INDEX IF NOT EXISTS genome_features_gene_idx ON genome_features (gene_id); CREATE TABLE IF NOT EXISTS genome_feature_entries ( id SERIAL PRIMARY KEY, @@ -133,6 +135,8 @@ CREATE TABLE IF NOT EXISTS genome_feature_attribute_values ( id INTEGER NOT NULL PRIMARY KEY, -- attribute-value surrogate key attr_val TEXT NOT NULL -- attribute value ); +CREATE INDEX IF NOT EXISTS genome_feature_attribute_values_attr_val_idx + ON genome_feature_attribute_values (attr_val); CREATE TABLE IF NOT EXISTS genome_feature_attributes ( id SERIAL PRIMARY KEY, @@ -140,8 +144,11 @@ CREATE TABLE IF NOT EXISTS genome_feature_attributes ( attr_key INTEGER NOT NULL REFERENCES genome_feature_attribute_keys, attr_val INTEGER NOT NULL REFERENCES genome_feature_attribute_values ); -CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_idx +CREATE INDEX IF 
NOT EXISTS genome_feature_attributes_feature_idx ON genome_feature_attributes (feature); +CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_key_idx ON genome_feature_attributes (feature, attr_key); +CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_val_idx + ON genome_feature_attributes (feature, attr_val); DO $$ BEGIN @@ -159,3 +166,5 @@ CREATE TABLE IF NOT EXISTS tasks ( message TEXT NOT NULL DEFAULT '', created TIMESTAMP DEFAULT (now() AT TIME ZONE 'utc') ); +CREATE INDEX IF NOT EXISTS tasks_genome_idx ON tasks (genome_id); +CREATE INDEX IF NOT EXISTS tasks_kind_idx ON tasks (kind); From 75c4983756f7dffcaedfc1f62e567eee6930cf73 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 13 May 2024 17:21:08 -0400 Subject: [PATCH 051/114] feat: allow deleting features via tested endpoint --- bento_reference_service/routers/constants.py | 8 ++++-- bento_reference_service/routers/genomes.py | 19 +++++++------ tests/test_genome_routes.py | 30 ++++++++++++++++---- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/bento_reference_service/routers/constants.py b/bento_reference_service/routers/constants.py index 13882c1..2a50989 100644 --- a/bento_reference_service/routers/constants.py +++ b/bento_reference_service/routers/constants.py @@ -1,11 +1,15 @@ -from bento_lib.auth.permissions import P_INGEST_REFERENCE_MATERIAL +from bento_lib.auth.permissions import P_DELETE_REFERENCE_MATERIAL, P_INGEST_REFERENCE_MATERIAL from bento_lib.auth.resources import RESOURCE_EVERYTHING from ..authz import authz_middleware -__all__ = ["DEPENDENCY_INGEST_REFERENCE_MATERIAL"] +__all__ = ["DEPENDENCY_DELETE_REFERENCE_MATERIAL", "DEPENDENCY_INGEST_REFERENCE_MATERIAL"] +DEPENDENCY_DELETE_REFERENCE_MATERIAL = authz_middleware.dep_require_permissions_on_resource( + frozenset({P_DELETE_REFERENCE_MATERIAL}), RESOURCE_EVERYTHING +) + DEPENDENCY_INGEST_REFERENCE_MATERIAL = authz_middleware.dep_require_permissions_on_resource( frozenset({P_INGEST_REFERENCE_MATERIAL}), 
RESOURCE_EVERYTHING ) diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 12b99d5..4c02291 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -2,8 +2,6 @@ import asyncpg import traceback -from bento_lib.auth.permissions import P_DELETE_REFERENCE_MATERIAL -from bento_lib.auth.resources import RESOURCE_EVERYTHING from fastapi import APIRouter, BackgroundTasks, HTTPException, Query, Request, UploadFile, status from fastapi.responses import StreamingResponse from typing import Annotated @@ -16,7 +14,7 @@ from ..features import INGEST_FEATURES_TASK_KIND, ingest_features_task from ..logger import LoggerDependency from ..streaming import generate_uri_streaming_response -from .constants import DEPENDENCY_INGEST_REFERENCE_MATERIAL +from .constants import DEPENDENCY_DELETE_REFERENCE_MATERIAL, DEPENDENCY_INGEST_REFERENCE_MATERIAL __all__ = ["genome_router"] @@ -123,11 +121,7 @@ async def genomes_detail(genome_id: str, db: DatabaseDependency) -> m.GenomeWith @genome_router.delete( "/{genome_id}", status_code=status.HTTP_204_NO_CONTENT, - dependencies=[ - authz_middleware.dep_require_permissions_on_resource( - frozenset({P_DELETE_REFERENCE_MATERIAL}), RESOURCE_EVERYTHING - ) - ], + dependencies=[DEPENDENCY_DELETE_REFERENCE_MATERIAL], ) async def genomes_delete(genome_id: str, db: DatabaseDependency): # TODO: also delete DRS objects!! 
@@ -185,6 +179,15 @@ async def genomes_detail_features( } +@genome_router.delete( + "/{genome_id}/features", + dependencies=[DEPENDENCY_DELETE_REFERENCE_MATERIAL], + status_code=status.HTTP_204_NO_CONTENT, +) +async def genomes_detail_features_delete(db: DatabaseDependency, genome_id: str): + await db.clear_genome_features(genome_id) + + @genome_router.get("/{genome_id}/features/{feature_id}", dependencies=[authz_middleware.dep_public_endpoint()]) async def genomes_detail_features_detail(db: DatabaseDependency, genome_id: str, feature_id: str): return await db.get_genome_feature_by_id(genome_id, feature_id) diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index 4d55323..a51b4d8 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -149,16 +149,11 @@ async def test_genome_delete(test_client: TestClient, aioresponse: aioresponses, assert res.status_code == status.HTTP_403_FORBIDDEN -async def test_genome_feature_ingest(test_client: TestClient, aioresponse: aioresponses, db_cleanup): - # setup: create genome TODO: fixture - create_genome_with_permissions(test_client, aioresponse) - +def _ingest_features(test_client: TestClient): hs = {"Authorization": "Token bearer"} # Test we can create a task for ingesting features - aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}, repeat=True) - with open(SARS_COV_2_GFF3_GZ_PATH, "rb") as gff3_fh, open(SARS_COV_2_GFF3_GZ_TBI_PATH, "rb") as tbi_fh: res = test_client.put( f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz", @@ -186,3 +181,26 @@ async def test_genome_feature_ingest(test_client: TestClient, aioresponse: aiore assert task_status == "success" assert task_msg == "ingested 49 features" + + +async def test_genome_feature_ingest(test_client: TestClient, aioresponse: aioresponses, db_cleanup): + hs = {"Authorization": "Token bearer"} + + # setup: create genome TODO: fixture + create_genome_with_permissions(test_client, aioresponse) + + # Test 
we can ingest features + + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}, repeat=True) + _ingest_features(test_client) + + # Test we can delete + res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}/features", headers=hs) + assert res.status_code == status.HTTP_204_NO_CONTENT + + # Test we can ingest again + _ingest_features(test_client) + + # Test we can delete again + res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}/features", headers=hs) + assert res.status_code == status.HTTP_204_NO_CONTENT From ee19841e0a99951bad12c7768949c5112cfb1484 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 13 May 2024 17:22:15 -0400 Subject: [PATCH 052/114] fix(schema): missing on delete cascade for tasks genome fk --- bento_reference_service/sql/schema.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 083a9e1..01f6e3e 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -160,7 +160,7 @@ END $$; CREATE TABLE IF NOT EXISTS tasks ( id SERIAL PRIMARY KEY, - genome_id VARCHAR(31) NOT NULL REFERENCES genomes, + genome_id VARCHAR(31) NOT NULL REFERENCES genomes ON DELETE CASCADE, kind task_kind NOT NULL, status task_status NOT NULL DEFAULT 'queued'::task_status, message TEXT NOT NULL DEFAULT '', From 140810a1962b9dbec6950cf4e2a86fc3cbe27220 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Mon, 13 May 2024 17:22:24 -0400 Subject: [PATCH 053/114] chore: update lockfile --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index e4b59fe..f137c95 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2163,13 +2163,13 @@ test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)" [[package]] name = "virtualenv" -version = "20.26.1" +version = "20.26.2" description = "Virtual Python Environment builder" 
optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.26.1-py3-none-any.whl", hash = "sha256:7aa9982a728ae5892558bff6a2839c00b9ed145523ece2274fad6f414690ae75"}, - {file = "virtualenv-20.26.1.tar.gz", hash = "sha256:604bfdceaeece392802e6ae48e69cec49168b9c5f4a44e483963f9242eb0e78b"}, + {file = "virtualenv-20.26.2-py3-none-any.whl", hash = "sha256:a624db5e94f01ad993d476b9ee5346fdf7b9de43ccaee0e0197012dc838a0e9b"}, + {file = "virtualenv-20.26.2.tar.gz", hash = "sha256:82bf0f4eebbb78d36ddaee0283d43fe5736b53880b8a8cdcd37390a07ac3741c"}, ] [package.dependencies] From 2089a4bde4c90fc06f540ba398cec0dd24a597b7 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 11:09:18 -0400 Subject: [PATCH 054/114] fix: genome alias handling --- bento_reference_service/db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 0227926..e6f31b0 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -63,7 +63,7 @@ def deserialize_genome(self, rec: asyncpg.Record, external_resource_uris: bool) return GenomeWithURIs( id=rec["id"], # aliases is [None] if no aliases defined: - aliases=tuple(map(Database.deserialize_alias, filter(None, rec["aliases"]))), + aliases=tuple(map(Database.deserialize_alias, filter(None, json.loads(rec["aliases"])))), uri=genome_uri, contigs=tuple(map(self.deserialize_contig, json.loads(rec["contigs"]))), md5=rec["md5_checksum"], @@ -99,7 +99,7 @@ async def _select_genomes(self, g_id: str | None, external_resource_uris: bool) gff3_gz_tbi_uri, taxon_id, taxon_label, - array( + ( SELECT json_agg(ga.*) FROM genome_aliases ga WHERE g.id = ga.genome_id ) aliases, ( From a4dc701b8e5361d24d7e65335d6589ecbfe32dfc Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 11:12:59 -0400 Subject: [PATCH 055/114] fix(db): contig alias deserialization --- bento_reference_service/db.py | 2 +- tests/shared_data.py | 6 
++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index e6f31b0..b6bd49a 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -106,7 +106,7 @@ async def _select_genomes(self, g_id: str | None, external_resource_uris: bool) WITH contigs_tmp AS ( SELECT contig_name, contig_length, circular, md5_checksum, ga4gh_checksum, - array( + ( SELECT json_agg(gca.*) FROM genome_contig_aliases gca WHERE g.id = gca.genome_id AND gc.contig_name = gca.contig_name diff --git a/tests/shared_data.py b/tests/shared_data.py index 56b5a32..c167c08 100644 --- a/tests/shared_data.py +++ b/tests/shared_data.py @@ -13,6 +13,8 @@ DATA_DIR = (pathlib.Path(__file__).parent / "data").absolute() SARS_COV_2_GENOME_ID = "MN908947.3" +SARS_COV_2_ALIAS = {"alias": "NC_045512.2", "naming_authority": "refseq"} +SARS_COV_2_FAKE_ALIAS = {"alias": "sars-cov-2", "naming_authority": "me-myself-and-i"} SARS_COV_2_FASTA_PATH = DATA_DIR / "sars_cov_2.fa" SARS_COV_2_FAI_PATH = DATA_DIR / "sars_cov_2.fa.fai" SARS_COV_2_GFF3_GZ_PATH = DATA_DIR / "sars_cov_2.gff3.gz" @@ -20,7 +22,7 @@ TEST_GENOME_OF_FILE_URIS = { "id": SARS_COV_2_GENOME_ID, - "aliases": [], + "aliases": [SARS_COV_2_ALIAS, SARS_COV_2_FAKE_ALIAS], "md5": "b98334cd0015ee1b1d2dc3b9d81b325e", "ga4gh": "SQ.F4O8uhlkMQ76rmE6SmUFFjp04UV25Ybn", "fasta": f"file://{SARS_COV_2_FASTA_PATH}", @@ -31,7 +33,7 @@ "contigs": [ { "name": SARS_COV_2_GENOME_ID, - "aliases": [], + "aliases": [SARS_COV_2_ALIAS, SARS_COV_2_FAKE_ALIAS], "md5": "105c82802b67521950854a851fc6eefd", "ga4gh": "SQ.SyGVJg_YRedxvsjpqNdUgyyqx7lUfu_D", "length": 29903, From dac9a7c34bf3e3a5a33e6d032aac55e3d9039472 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 11:13:19 -0400 Subject: [PATCH 056/114] chore(db): update queued/running tasks to error on db startup --- bento_reference_service/db.py | 21 +++++++++++++++++++++ tests/test_db.py | 19 +++++++++++++++++++ 2 files 
changed, 40 insertions(+) create mode 100644 tests/test_db.py diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index b6bd49a..088b5e1 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -31,6 +31,13 @@ def __init__(self, config: Config, logger: logging.Logger): self.logger: logging.Logger = logger super().__init__(config.database_uri, SCHEMA_PATH) + async def initialize(self, pool_size: int = 10): + await super().initialize(pool_size) + + # If we have any tasks that are still marked as "running" on application startup, we need to move them to the + # error state. + await self.move_running_tasks_to_error() + @staticmethod def deserialize_alias(rec: asyncpg.Record | dict) -> Alias: return Alias(alias=rec["alias"], naming_authority=rec["naming_authority"]) @@ -607,6 +614,20 @@ async def create_task(self, g_id: str, task_kind: Literal["ingest_features"]) -> assert res is not None return res["id"] + async def move_running_tasks_to_error(self): + update_q = """ + UPDATE tasks + SET + status = 'error'::task_status, + message = 'This task had an invalid status at application startup: "' || $1 || '"' + WHERE status = $1::task_status + """ + + conn: asyncpg.Connection + async with self.connect() as conn: + await conn.execute(update_q, "queued") + await conn.execute(update_q, "running") + @lru_cache() def get_db(config: ConfigDependency, logger: LoggerDependency) -> Database: # pragma: no cover diff --git a/tests/test_db.py b/tests/test_db.py new file mode 100644 index 0000000..ec1af1f --- /dev/null +++ b/tests/test_db.py @@ -0,0 +1,19 @@ +from bento_reference_service.db import Database + +from .shared_data import TEST_GENOME_OF_FILE_URIS + + +async def test_mark_running_as_error(db: Database, db_cleanup): + g = await db.create_genome(TEST_GENOME_OF_FILE_URIS, return_external_resource_uris=False) + + t1 = await db.create_task(g.id, "ingest_features") + t2 = await db.create_task(g.id, "ingest_features") + await 
db.update_task_status(t2, "running") + + assert (await db.get_task(t1)).status == "queued" + assert (await db.get_task(t2)).status == "running" + + await db.move_running_tasks_to_error() + + assert (await db.get_task(t1)).status == "error" + assert (await db.get_task(t2)).status == "error" From cdc781cc25c79ff429f86dadd630d1ea6df196d5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 11:18:13 -0400 Subject: [PATCH 057/114] chore(workflows): update fasta workflow gff3 ingest fn --- .../workflows/wdls/fasta_ref.wdl | 51 ++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index c2bab74..1bf5a42 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -224,13 +224,50 @@ task ingest_gff3_into_ref { } command <<< - curl ~{true="" false="-k" validate_ssl} \ - -X PUT \ - -F "gff3_gz=@~{gff3_gz}" \ - -F "gff3_gz_tbi=@~{gff3_gz_tbi}" \ - -H "Authorization: Bearer ~{token}" \ - --fail-with-body \ - "~{reference_url}/genomes/~{genome_id}/features.gff3.gz" + task_res=$( + curl ~{true="" false="-k" validate_ssl} \ + -X PUT \ + -F "gff3_gz=@~{gff3_gz}" \ + -F "gff3_gz_tbi=@~{gff3_gz_tbi}" \ + -H "Authorization: Bearer ~{token}" \ + --fail-with-body \ + "~{reference_url}/genomes/~{genome_id}/features.gff3.gz" + ) + exit_code=$? + if [[ "${exit_code}" == 0 ]]; then + task_url=$(jq -r '.task' <<< "${task_res}") + while true; do + task_status_res=$( + curl ~{true="" false="-k" validate_ssl} \ + -H "Authorization: Bearer ~{token}" \ + --fail-with-body \ + "${task_url}" + ) + + task_exit_code=$? 
+ if [[ "${task_exit_code}" != 0 ]]; then + echo "task status response returned non-success status code" >&2 + exit 1 + fi + + task_status=$(jq -r '.status' <<< "${task_status_res}") + task_message=$(jq -r '.message' <<< "${task_status_res}") + + if [[ "${task_status}" == 'success' ]]; then + echo "task succeeded with message: ${task_message}" + break # success + fi + if [[ "${task_status}" == 'error' ]]; then + echo "task failed with message: ${task_message}" >&2 + exit 1 + fi + + # otherwise, running - wait + sleep 10 + done + else + exit "${exit_code}" + fi >>> output { From 16298c6affe5a63896def14afe1a4e3c5c366429 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 11:18:22 -0400 Subject: [PATCH 058/114] chore(workflows): don't output any artifacts --- bento_reference_service/workflows/wdls/fasta_ref.wdl | 2 ++ bento_reference_service/workflows/wdls/gff3_annot.wdl | 2 ++ 2 files changed, 4 insertions(+) diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index 1bf5a42..cc7342d 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -83,6 +83,8 @@ workflow fasta_ref { wait_for_ref_ingest = ingest_metadata_into_ref.out } } + + output {} } task uncompress_fasta_and_generate_fai_if_needed { diff --git a/bento_reference_service/workflows/wdls/gff3_annot.wdl b/bento_reference_service/workflows/wdls/gff3_annot.wdl index 16772de..dcaef12 100644 --- a/bento_reference_service/workflows/wdls/gff3_annot.wdl +++ b/bento_reference_service/workflows/wdls/gff3_annot.wdl @@ -27,6 +27,8 @@ workflow gff3_annot { token = access_token, validate_ssl = validate_ssl } + + output {} } # TODO: shared file with this task From 35a855e5686702ea96a4e2e372047be661636921 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 11:35:20 -0400 Subject: [PATCH 059/114] chore(deps): add fasta-checksum-utils as a dev util --- 
poetry.lock | 18 +++++++++++++++++- pyproject.toml | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index f137c95..e2094d9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -645,6 +645,22 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "fasta-checksum-utils" +version = "0.4.1" +description = "Library and command-line utility for checksumming FASTA files and individual contigs." +optional = false +python-versions = ">=3.9.1,<4.0.0" +files = [ + {file = "fasta_checksum_utils-0.4.1-py3-none-any.whl", hash = "sha256:ea16ec14d167d226a3ca167a35841f692290f83914c34ad520bf76bd006f7464"}, + {file = "fasta_checksum_utils-0.4.1.tar.gz", hash = "sha256:647095614541563f3915ea921f3988021c76abd2ef743241599fb3fe4ea4d474"}, +] + +[package.dependencies] +aiofiles = ">=23.2.1,<24.0.0" +aiohttp = ">=3.9.3,<4.0.0" +pysam = ">=0.22.0,<0.23.0" + [[package]] name = "fastapi" version = "0.111.0" @@ -2472,4 +2488,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10.0" -content-hash = "aa5fc61efa4cffe360fa3ef373c04e2f74b7a9b5c8fc758d07acef9af8c65e5d" +content-hash = "81b5ed469e55ebb6ffffa38a41efdf9fb755d5e06314fdf7c7a526ffb9a1975d" diff --git a/pyproject.toml b/pyproject.toml index 3884d58..064d9c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ pytest = "^7.4.4" pytest-asyncio = "^0.23.5" pytest-cov = "^4.1.0" tox = "^4.12.1" +fasta-checksum-utils = "^0.4.1" [tool.black] line_length = 120 From e06649c62477d27eed696fe84064f9f85aae3894 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 12:00:09 -0400 Subject: [PATCH 060/114] test: add hg38 chr1:1-100000 test "genome" --- tests/data/gencode.v45.first-few.gff3.gz | Bin 0 -> 1167 bytes tests/data/gencode.v45.first-few.gff3.gz.tbi | Bin 0 -> 110 bytes tests/data/hg38.chr1.f100k.fa | 2001 ++++++++++++++++++ tests/data/hg38.chr1.f100k.fa.fai | 1 + tests/shared_data.py | 21 + tests/test_db.py | 10 +- 6 files changed, 
2032 insertions(+), 1 deletion(-) create mode 100644 tests/data/gencode.v45.first-few.gff3.gz create mode 100644 tests/data/gencode.v45.first-few.gff3.gz.tbi create mode 100644 tests/data/hg38.chr1.f100k.fa create mode 100644 tests/data/hg38.chr1.f100k.fa.fai diff --git a/tests/data/gencode.v45.first-few.gff3.gz b/tests/data/gencode.v45.first-few.gff3.gz new file mode 100644 index 0000000000000000000000000000000000000000..70d45f06df28d6ac4fba584bbad3aa4e1afe4d69 GIT binary patch literal 1167 zcmV;A1aSKwiwFb&00000{{{d;LjnM$0dY?TB^ zm6WnfSw_~9Bgy;y;V4L_VJGn5a(J#-*O$I;KJvU=r_oSVpYzhq*ZW#N4<>(H&w1=} zb844-Htjf0Yb#r3>diW2GdynXXfAo2ZrMD&{dgTHK7TU*^VpfED>Upb5-Bx zNiyeyc{_f4{r=@x)!j6%ZMPmIrkl>W8S?x6b3R>P+V17?yQ=$XzO}O7_cCC+=7T|t z)Y#NOs_J^le;(VIbH2#2-7tdzgB+9djJS{Bh9D$zsH9LqAVo~hIR)Vju?Sq4eZ&b8 z!BIpl#S0Q_v@Qh&34*vpToBqvq9E*mq9?zPBta-r;s8NWXE%mI3aUGRjd3=5uQK>A z2ZJN4)wCG~CeMUXVK&7!Sf&6PvpKeq563NL@xjidQfk=*oE--_Fh~#J~lzFuj zk?f*};6bSY#j)qIz$-PNC}tmQN>S+>#c=5WB6~&5&+V`AmTh?p`^YZ|35bC~A-+n8 zcRp2r0aYgQIxhqO03VA81ONa4009360763o0L%jAm}_sFFcgMAXMbgFzr0xJBs!q4_(f&SPgs!D`Il3J)w{al}gD&=$YcE_#L+uhHRVc%sC zrHjXjDMG0JRd>H!28YBo=Qg^Tc`~zb=N9E6&$D74RZTX5_&%J(s++MukD@IH-_^N8EQ4>ik z?8;^}w?{5TTKAloyJ(T-MV6Sdw8xvd;3*BB(g$3mo&s13(BenXJP;fX@uQ{vaGxe_ z^zdr*sYePddejw&*wEw+@d6+pv#(}*GbHIr2V}3PS5!e{o(UP{#{tK6y2jWfehS*pHfm%8W;r5G96jtu!cvJdE;c`E*{Al4Ltse9ZGC% tsR6b}6~6MsO_n(D>BXZOo)rwC`am;|&7OL;5onM+nu*d3%pmi@007$d9JT-e literal 0 HcmV?d00001 diff --git a/tests/data/hg38.chr1.f100k.fa b/tests/data/hg38.chr1.f100k.fa new file mode 100644 index 0000000..9e9c08a --- /dev/null +++ b/tests/data/hg38.chr1.f100k.fa @@ -0,0 +1,2001 @@ +>chr1:1-100000 +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +taaccctaaccctaaccctaaccctaaccctaaccctaaccctaacccta +accctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaac +cctaacccaaccctaaccctaaccctaaccctaaccctaaccctaacccc +taaccctaaccctaaccctaaccctaacctaaccctaaccctaaccctaa +ccctaaccctaaccctaaccctaaccctaacccctaaccctaaccctaaa +ccctaaaccctaaccctaaccctaaccctaaccctaaccccaaccccaac +cccaaccccaaccccaaccccaaccctaacccctaaccctaaccctaacc +ctaccctaaccctaaccctaaccctaaccctaaccctaacccctaacccc +taaccctaaccctaaccctaaccctaaccctaaccctaacccctaaccct +aaccctaaccctaaccctcgcggtaccctcagccggcccgcccgcccggg +tctgacctgaggagaactgtgctccgccttcagagtaccaccgaaatctg +tgcagaggacaacgcagctccgccctcgcggtgctctccgggtctgtgct +gaggagaacgcaactccgccgttgcaaaggcgcgccgcgccggcgcaggc +gcagagaggcgcgccgcgccggcgcaggcgcagagaggcgcgccgcgccg +gcgcaggcgcagagaggcgcgccgcgccggcgcaggcgcagagaggcgcg +ccgcgccggcgcaggcgcagagaggcgcgccgcgccggcgcaggcgcaga +cacatgctagcgcgtcggggtggaggcgtggcgcaggcgcagagaggcgc +gccgcgccggcgcaggcgcagagacacatgctaccgcgtccaggggtgga +ggcgtggcgcaggcgcagagaggcgcaccgcgccggcgcaggcgcagaga +cacatgctagcgcgtccaggggtggaggcgtggcgcaggcgcagagacgc +aagcctacgggcgggggttgggggggcgtgtgttgcaggagcaaagtcgc +acggcgccgggctggggcggggggagggtggcgccgtgcacgcgcagaaa +ctcacgtcacggtggcgcggcgcagagacgggtagaacctcagtaatccg +aaaagccgggatcgaccgccccttgcttgcagccgggcactacaggaccc +gcttgctcacggtgctgtgccagggcgccccctgctggcgactagggcaa +ctgcagggctctcttgcttagagtggtggccagcgccccctgctggcgcc +ggggcactgcagggccctcttgcttactgtatagtggtggcacgccgcct +gctggcagctagggacattgcagggtcctcttgctcaaggtgtagtggca +gcacgcccacctgctggcagctggggacactgccgggccctcttgctCCA +ACAGTACTGGCGGATTATAGGGAAACACCCGGAGCATATGCTGTTTGGTC +TCAGtagactcctaaatatgggattcctgggtttaaaagtaaaaaataaa +tatgtttaatttgtgaactgattaccatcagaattgtactgttctgtatc +ccaccagcaatgtctaggaatgcctgtttctccacaaagtgtttactttt +ggatttttgccagtctaacaggtgaAGccctggagattcttattagtgat 
+ttgggctggggcctggccatgtgtatttttttaaatttccactgatgatt +ttgctgcatggccggtgttgagaatgactgCGCAAATTTGCCGGATTTCC +TTTGCTGTTCCTGCATGTAGTTTAAACGAGATTGCCAGCACCGGGTATCA +TTCACCATTTTTCTTTTCGTTAACTTGCCGTCAGCCTTTTCTTTGACCTC +TTCTTTCTGTTCATGTGTATTTGCTGTCTCTTAGCCCAGACTTCCCGTGT +CCTTTCCACCGGGCCTTTGAGAGGTCACAGGGTCTTGATGCTGTGGTCTT +CATCTGCAGGTGTCTGACTTCCAGCAACTGCTGGCCTGTGCCAGGGTGCA +AGCTGAGCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG +TGGGATGGGCCATTGTTCATCTTCTGGCCCCTGTTGTCTGCATGTAACTT +AATACCACAACCAGGCATAGGGGAAAGATTGGAGGAAAGATGAGTGAGAG +CATCAACTTCTCTCACAACCTAGGCCAGTAAGTAGTGCTTGTGCTCATCT +CCTTGGCTGTGATACGTGGCCGGCCCTCGCTCCAGCAGCTGGACCCCTAC +CTGCCGTCTGCTGCCATCGGAGCCCAAAGCCGGGCTGTGACTGCTCAGAC +CAGCCGGCTGGAGGGAGGGGCTCAGCAGGTCTGGCTTTGGCCCTGGGAGA +GCAGGTGGAAGATCAGGCAGGCCATCGCTGCCACAGAACCCAGTGGATTG +GCCTAGGTGGGATCTCTGAGCTCAACAAGCCCTCTCTGGGTGGTAGGTGC +AGAGACGGGAGGGGCAGAGCCGCAGGCACAGCCAAGAGGGCTGAAGAAAT +GGTAGAACGGAGCAGCTGGTGATGTGTGGGCCCACCGGCCCCAGGCTCCT +GTCTCCCCCCAGGTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCA +GGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTTGTG +AGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGAGTGGGAG +TGGCGTCGCCCCTAGGGCTCTACGGGGCCGGCGTCTCCTGTCTCCTGGAG +AGGCTTCGATGCCCCTCCACACCCTCTTGATCTTCCCTGTGATGTCATCT +GGAGCCCTGCTGCTTGCGGTGGCCTATAAAGCCTCCTAGTCTGGCTCCAA +GGCCTGGCAGAGTCTTTCCCAGGGAAAGCTACAAGCAGCAAACAGTCTGC +ATGGGTCATCCCCTTCACTCCCAGCTCAGAGCCCAGGCCAGGGGCCCCCA +AGAAAGGCTCTGGTGGAGAACCTGTGCATGAAGGCTGTCAACCAGTCCAT +AGGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGAAG +GGGAGAAGAGGAAAGTGAGGTTGCCTGCCCTGTCTCCTACCTGAGGCTGA +GGAAGGAGAAGGGGATGCACTGTTGGGGAGGCAGCTGTAACTCAAAGCCT +TAGCCTCTGTTCCCACGAAGGCAGGGCCATCAGGCACCAAAGGGATTCTG +CCAGCATAGTGCTCCTGGACCAGTGATACACCCGGCACCCTGTCCTGGAC +ACGCTGTTGGCCTGGATCTGAGCCCTGGTGGAGGTCAAAGCCACCTTTGG +TTCTGCCATTGCTGCTGTGTGGAAGTTCACTCCTGCCTTTTCCTTTCCCT +AGAGCCTCCACCACCCCGAGATCACATTTCTCACTGCCTTTTGTCTGCCC +AGTTTCACCAGAAGTAGGCCTCTTCCTGACAGGCAGCTGCACCACTGCCT +GGCGCTGTGCCCTTCCTTTGCTCTGCCCGCTGGAGACGGTGTTTGTCATG +GGCCTGGTCTGCAGGGATCCTGCTACAAAGGTGAAACCCAGGAGAGTGTG 
+GAGTCCAGAGTGTTGCCAGGACCCAGGCACAGGCATTAGTGCCCGTTGGA +GAAAACAGGGGAATCCCGAAGAAATGGTGGGTCCTGGCCATCCGTGAGAT +CTTCCCAGGGCAGCTCCCCTCTGTGGAATCCAATCTGTCTTCCATCCTGC +GTGGCCGAGGGCCAGGCTTCTCACTGGGCCTCTGCAGGAGGCTGCCATTT +GTCCTGCCCACCTTCTTAGAAGCGAGACGGAGCAGACCCATCTGCTACTG +CCCTTTCTATAATAACTAAAGTTAGCTGCCCTGGACTATTCACCCCCTAG +TCTCAATTTAAGAAGATCCCCATGGCCACAGGGCCCCTGCCTGGGGGCTT +GTCACCTCCCCCACCTTCTTCCTGAGTCATTCCTGCAGCCTTGCTCCCTA +ACCTGCCCCACAGCCTTGCCTGGATTTCTATCTCCCTGGCTTGGTGCCAG +TTCCTCCAAGTCGATGGCACCTCCCTCCCTCTCAACCACTTGAGCAAACT +CCAAGACATCTTCTACCCCAACACCAGCAATTGTGCCAAGGGCCATTAGG +CTCTCAGCATGACTATTTTTAGAGACCCCGTGTCTGTCACTGAAACCTTT +TTTGTGGGAGACTATTCCTCCCATCTGCAACAGCTGCCCCTGCTGACTGC +CCTTCTCTCCTCCCTCTCATCCCAGAGAAACAGGTCAGCTGGGAGCTTCT +GCCCCCACTGCCTAGGGACCAACAGGGGCAGGAGGCAGTCACTGACCCCG +AGACGTTTGCATCCTGCACAGCTAGAGATCCTTTATTAAAAGCACACTGT +TGGTTTCTGCTCAGTTCTTTATTGATTGGTGTGCCGTTTTCTCTGGAAGC +CTCTTAAGAACACAGTGGCGCAGGCTGGGTGGAGCCGTCCCCCCATGGAG +CACAGGCAGACAGAAGTCCCCGCCCCAGCTGTGTGGCCTCAAGCCAGCCT +TCCGCTCCTTGAAGCTGGTCTCCACACAGTGCTGGTTCCGTCACCCCCTC +CCAAGGAAGTAGGTCTGAGCAGCTTGTCCTGGCTGTGTCCATGTCAGAGC +AACGGCCCAAGTCTGGGTCTGGGGGGGAAGGTGTCATGGAGCCCCCTACG +ATTCCCAGTCGTCCTCGTCCTCCTCTGCCTGTGGCTGCTGCGGTGGCGGC +AGAGGAGGGATGGAGTCTGACACGCGGGCAAAGGCTCCTCCGGGCCCCTC +ACCAGCCCCAGGTCCTTTCCCAGAGATGCCTGGAGGGAAAAGGCTGAGTG +AGGGTGGTTGGTGGGAAACCCTGGTTCCCCCAGCCCCCGGAGACTTAAAT +ACAGGAAGAAAAAGGCAGGACAGAATTACAAGGTGCTGGCCCAGGGCGGG +CAGCGGCCCTGCCTCCTACCCTTGCGCCTCATGACCAGCTTGTTGAAGAG +ATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTGCAACGGGAA +AGCCACAGACTGGGGTGAAGAGTTCAGTCACATGCGACCGGTGACTCCCT +GTCCCCACCCCCATGACACTCCCCAGCCCTCCAAGGCCACTGTGTTTCCC +AGTTAGCTCAGAGCCTCAGTCGATCCCTGACCCAGCACCGGGCACTGATG +AGACAGCGGCTGTTTGAGGAGCCACCTCCCAGCCACCTCGGGGCCAGGGC +CAGGGTGTGCAGCAccactgtacaatggggaaactggcccagagaggtga +ggcagcttgcctggggtcacagagcaaggcaaaagcagcgctgggtacaa +gctcaAAACCATAGTGCCCAGGGCACTGCCGCTGCAGGCGCAGGCATCGC +ATCACACCAGTGTCTGCGTTCACAGCAGGCATCATCAGTAGCCTCCAGAG +GCCTCAGGTCCAGTCTCTAAAAATATCTCAGGAGGCTGCAGTGGCTGACC 
+ATTGCCTTGGACCGCTCTTGGCAGTCGAAGAAGATTCTCCTGTCAGTTTG +AGCTGGGTGAGCTTAGAGAGGAAAGCTCCACTATGGCTCCCAAACCAGGA +AGGAGCCATAGCCCAGGCAGGAGGGCTGAGGACCTCTGGTGGCGGCCCAG +GGCTTCCAGCATGTGCCCTAGGGGAAGCAGGGGCCAGCTGGCAAGAGCAG +GGGGTGGGCAGAAAGCACCCGGTGGACTCAGGGCTGGAGGGGAGGAGGCG +ATCTTGCCCAAGGCCCTCCGACTGCAAGCTCCAGGGCCCGCTCACCTtgc +tcctgctccttctgctgctgcttctccagctttcgctccttcatgctgcG +CAGCTTGGCCTTGCCGATGCCCCCAGCTTGGCGGATGGACTCTAGCAGAG +TGGCCAGCCACCGGAGGGGTCAACCACTTCCCTGGGAGCTCCCTGGACTG +GAGCCGGGAGGTGGGGAACAGGGCAAGGAGGAAAGGCTGCTCAGGCAGGG +CTGGGGAAGCTTACTGTGTCCAAGAGCCTGCTGGGAGGGAAGTCACCTCC +CCTCAAACGAGGAGCCCTGCGCTGGGGAGGCCGGACCTTTGGAGACTGTG +TGTGGGGGCCTGGGCACTGACTTCTGCAACCACCTGAGCGCGGGCATCCT +GTGTGCAGATACTCCCTGCTTCCTCTCTAGCCCCCACCCTGCAGAGCTGG +ACCCCTGAGCTAGCCATGCTCTGACAGTCTCAGTTGCACACACGAGCCAG +CAGAGGGGTTTTGTGCCACTTCTGGATGCTAGGGTTACACTGGGAGACAC +AGCAGTGAAGCTGAAATGAAAAATGTGTTGCTGTAGTTTGTTATTAGACC +CCTTCTTTCCATTGGTTTAATTAGGAATGGGGAACCCAGAGCCTCACTTG +TTCAGGCTCCCTCTGCCCTAGAAGTGAGAAGTCCAGAGCTCTACAGTTTG +AAAACCACTATTTTATGAACCAAGTAGAACAAGATATTTGAAATGGAAAC +TATTCAAAAAATTGAGAATTTCTGACCACTTAACAAACCCACAGAAAATC +CACCCGAGTGCACTGAGCACGCCAGAAATCAGGTGGCCTCAAAGAGCTGC +TCCCACCTGAAGGAGACGCGCTGCTGCTGCTGTCGTCCTGCCTGGCGCCT +TGGCCTACAGGGGCCGCGGTTGAGGGTGGGAGTGGGGGTGCACTGGCCAG +CACCTCAGGAGCtgggggtggtggtgggggcggtgggggtggtgTTAGTA +CCCCATCTTGTAGGTCTGAAACACAAAGTGTGGGGTGTCTAGGGAAGAAG +GTGTGTGACCAGGGAGGTCCCCGGCCCAGCTCCCATCCCAGAACCCAGCT +CACCTACCTTGAGAGGCTCGGCTACCTCAGTGTGGAAGGTGGGCAGTTCT +GGAATGGTGCCAGGGGCAGAGGGGGCAATGCCGGGGCCCAGGTCGGCAAT +GTACATGAGGTCGTTGGCAATGCCGGGCAGGTCAGGCAGGTAGGATGGAA +CATCAATCTCAGGCACCTGGCCCAGGTCTGGCACATAGAAGTAGTTCTCT +GGGACCTGCAAGATTAGGCAGGGACATGTGAGAGGTGACAGGGACCTGCA +GGGGCAGCCAACAAGACCTTGTGTGCACCTCCCATGGGTGGAATAAGGGG +CCCAACAGCCTTGACTGGAGAGGAGCTCTGGCAAGGCCCTGGGCCACTGC +ACCTGTCTCCACCTCTGTCCCACCCCTCCCACCTGCTGTTCCAGCTGCTC +TCTCTTGCTGATGGACAAGGGGGCATCAAACAGCTTCTCCTCTGTCTCTG +CCCCCAGCATCACATGGGTCTTTGTTACAGCACCAGCCAGGGGGTCCAGG +AAGACATACTTCTTCTACCTACAGAGGCGACATGGGGGTCAGGCAAGCTG 
+ACACCCGCTGTCCTGAGCCCATGTTCCTCTCCCACATCATCAGGGGCACA +GCGTGCACTGTGGGGTCCCAGGCCTCCCGAGCCGAGCCACCCGTCACCCC +CTGGCTCCTGGCCTATGTGCTGTACCTGTGTCTGATGCCCTGGGTCCCCA +CTAAGCCAGGCCGGGCCTCCCGCCCACACCCCTCGGCCCTGCCCTCTGGC +CATACAGGTTCTCGGTGGTGTTGAAGAGCAGCAAGGAGCTGACAGAGCTG +ATGTTGCTGGGAAGACCCCCAAGTCCCTCTTCTGCATCGTCCTCGGGCTC +CGGCTTGGTGCTCACGCACACAGGAAAGTCCTTCAGCTTCTCCTGAGAGG +GCCAGGATGGCCAAGGGATGGTGAATATTTGGTGCTGGGCCTAATCAGCT +GCCATCCCATCCCAGTCAGCCTCCTCTGGGGGACAGAACCCTATGGTGGC +CCCGGCTCCTCCCCAGTATCCAGTCCTCCTGGTGTGTGACAGGCTATATG +CGCGGCCAGCAGACCTGCAGGGCCCGCTCGTCCAGGGGGCGGTGCTTGCT +CTGGATCCTGTGGCGGGGGCGTCTCTGCAGGCCAGGGTCCTGGGCGCCCG +TGAAGATGGAGCCATATTCCTGCAGGCGCCCTGGAGCAGGGTACTTGGCA +CTGGAGAACACCTGTGGACACAGGGACAAGTCTGAGGGGGCCCCAAGAGG +CTCAGAGGGCTAGGATTGCTTGGCAGGAGAGGGTGGAGTTGGAAGCCTGG +GCGAGAAGAAAGCTCAAGGTACAGGTGGGCAGCAGGGCAGAGACTGGGCA +GCCTCAGAGGCACGGGGAAATGGAGGGACTGCCCAGTAGCCTCAGGACAC +AGGGGTATGGGGACTACCTTGATGGCCTTCTTGCTGCCCTTGATCTTCTC +AATCTTGGCCTGGGCCAAGGAGACCTTCTCTCCAATGGCCTGCACCTGGC +TCCGGCTCTGCTCTACCTGCTGGGAGATCCTGCCATGGAGAAGATCACAG +AGGCTGGGCTGCTCCCCACCCTCTGCACACCTCCTGCTTCTAACAGCAGA +GCTGCCAGGCCAGGCCCTCAGGCAAGGGCTCTGAAGTCAGGGTCACCTAC +TTGCCAGGGCCGATCTTGGTGCCATCCAGGGGGCCTCTACAAGGATAATC +TGACCTGCAGGGTCGAGGAGTTGACGGTGCTGAGTTCCCTGCACTCTCAG +TAGGGACAGGCCCTATGCTGCCACCTGTACATGCTATCTGAAGGACAGCC +TCCAGGGCACACAGAGGATGGTATTTACACATGCACACATGGCTACTGAT +GGGGCAAGCACTTCACAACCCCTCATGATCACGTGCAGCAGACAATGTGG +CCTCTGCAGAGGGGGAACGGAGACCGGAGGCTGAGACTGGCAAGGCTGGA +CCTGAGTGTCGTCACCTAAATTCAGACGGGGAACTGCCCCTGCACATACT +GAACGGCTCACTGAGCAAACCCCGAGTCCCGACCACCGCCTCAGTGTGGT +CTAGCTcctcacctgcttccatcctccctggtgcggggtgggcccagtga +tatcagctgcctgctgttccccagatgtgccaagtgcattcttgtgtgct +tgcatctcatggaacgccatttccccagacatccctgtggctggctccTG +ATGCCCGAGGCCCAAGTGTCTGATGCTTTAAGGCACATCACCCCACTCAT +GCTTTTCCATGTTCTTTGGCCGCAGCAAGGCCGCTCTCACTGCAAAGTTA +ACTCTGATGCGTGTGTAACACAACATCCTCCTCCCAGTCGCCCCTGTAGC +TCCCCTACCTCCAAGAGCCCAGCCCTTGCCCACAGGGCCACACTCCACGT +GCAGAGCAGCCTCAGCACTCACCGGGCACGAGCGAGCCTGTGTGGTGCGC 
+AGGGATGAGAAGGCAGAGGCGCGACTGGGGTTCATGAGGAAGGGCAGGAG +GAGGGTGTGGGATGGTGGAGGGGTTTGAGAAGGCAGAGGCGCGACTGGGG +TTCATGAGGAAAGGGAGGGGGAGGATGTGGGATGGTGGAGGGGCTGCAGA +CTCTGGGCTAGGGAAAGCTGGGATGTCTCTAAAGGTTGGAATGAATGGCC +TAGAATCCGACCCAATAAGCCAAAGCCACTTCCACCAACGTTAGAAGGCC +TTGGCCCCCAGAGAGCCAATTTCACAATCCAGAAGTCCCCGTGCCCTAAA +GGGTCTGCCCTGATTACTCCTGGCTCCTTGTGTGCAGGGGGCTCAGGCAT +GGCAGGGCTGGGAGTACCAGCAGGCACTCAAGCGGCTTAAGTGTTCCATG +ACAGACTGGTATGAAGGTGGCCACAATTCAGAAAGAAAAAAGAAGAGCAC +CATCTCCTTCCAGTGAGGAAGCGGGACCACCACCCAGCGTGTGCTCCATC +TTTTCTGGCTGGGGAGAGGCCTTCATCTGCTGTAAAGGGTCCTCCAGCAC +AAGCTGTCTTAATTGACCCTAGTTCCCAGGGCAGCCTCGTTCTGCCTTGG +GTGCTGACACGACCTTCGGTAGGTGCATAAGCTCTGCATTCGAGGTCCAC +AGGGGCAGTGGGAGGGAACTGagactggggagggacaaaggctgctctgt +cctggtgctcccacaaaggagaagggctgatcactcaaagttgcgaacac +caagctcaacaatgagccctggaaaatttctggaatggattattaaacag +agagtctgtaagcacttagaaaaggccgcggtgagtcccaggggccagca +ctgctcgaaatgtacagcatttctctttgtaacaggattattagcctgct +gtgcccggggaaaacatgcagcacagtgcatctcgagtcagcaggatttt +gacggcttctaacaaaatcttgtagacaagatggagctatgggggttgga +ggagagaacatataggaaaaatcagagccaaatgaaccacagccccaaag +ggcacagttgaacaatggactgattccagccttgcacggagggatctggc +agagtCCATCCAGTTCATTCAACACCTGGTTAGAAAACTGGGGCCAGCAC +ACAGGGGAAGGGTAAGCTGGTTTCATGATCGAATCAAGGCTCAGACAATT +TTTAAAGGCCAGAGGGTAGACTGCAATCACcaagatgaaatttacaagga +acaaatgtgaagcccaacatttaggttttaaaaatcaagcgtataaatac +agaaggtggagggaacttgctttagacacagttcaggtgaagaaagacct +ggaaacttctgttaactataagctcagtaGGGGCTAAAAGCATGTTAATC +GGCATAAAAAGGCAATGAGATCTTAGGGCACACAGCTCCCCGCCCCTCTT +CTGCCCTTCATCCTTCTTTCAATCAGCAGGGACCGTGCACTCTCTTGGAG +CCACCACAGAAAACAGAGGTGCATCCAGCACCACAGAAAACAGAGCCACC +ACAGAAAACAGAGGGTGACTGTCATCCCCTCCAGTCTCTGCACACTCCCA +GCTGCAGCAGAGCAGGAGGAGAGAGCACAGCCTGCAATGCTAATTTGCCA +GGAGCTCACCTGCCTGCGTCACTGGGCACAGACGCCAGTGAGGCCAGAGG +CCGGGCTGTGCTGGGGCCTGAGCCGGGTGGTGGGGAGAGAGTCTCTCCCC +TGCCCCTGTCTCTTCCGTGCAGGAGGAGCATGTTTAAGGGGACGGGTTCA +AAGCTGGTCACATCCCCACCGAAAAAGCCCATGGACAACGAAAAGCCCAC +TAGCTTGTCCAGTGCCACAGGAGGGGCAAGTGGAGGAGGAGAGGTGGCGG 
+TGCTCCCCACTCCACTGCCAGTCGTCACTGGCTCTCCCTTCCCTTCATCC +TCGTTCCCTATCTGTCACCATTTCCTGTCGTCGTTTCCTCTGAATGTCTC +ACCCTGCCCTCCCTGCTTGCAAGTCCCCTGTCTGTAGCCTCACCCCTGTC +GTATCCTGACTACAATAACAGCTTCTGGGTGTCCCTGGCATCCACTCTCT +CTCCCTTCTTGTCCCTTCCGTGACGGATGCCTGAGGAACCTTCCCCAAAC +TCTTCTGTCCCATCCCTGCCCTGCTCAAAATCCAATCACAGCTCCCTAAC +ACGCCTGAATCAACTTGAAGTCCTGTCTTGAGTAATCCGTGGGCCCTAAC +TCACTCATCCCAACTCTTCACTCACTGCCCTGCCCCACACCCTGCCAGGG +AGCCTCCCGTGGCACCGTGGGGACACAAAGGAACCAGGGCAAAGCTCCCT +CAGCCCCATTCAAAGAGGCCTGGCCCACAGGCTCACGGAAAGTCAGCCTC +TCATGCCCCGAGAGCTGAGTGCAAGGGAGAGGCAGCGCTGTCTGTGCTTC +CCATGCAGAAGCACCCCCCTCCCACCCCTGTGCAGGCCGGCCTTCGCGGC +AGACCACCATACACCACGTTCCAAGCCACACTGAGGCCTCCCTCCAAGCC +TGCAGCCCCCATTTCCAGACCCTGCCAGGGCAACCTGCATATCCACCTCC +CTACCCTGCCCCCCTCTTCCAGGAGTCTGCCCTATGTGGAGTAAGCACgt +ggttttcctcttcagcaactatttcctttttactcaagcaatggccccat +ttcccttggggaatccatctctctcgcaggcttagtcccagagcttcagg +tggggctgcccacagagctcctcagTCTAAGCCAAGTGGTGTGTCATAGT +CCCCTGGCCCCATTAATGGATTCTGGGATAGACATGAGGACCAAGCCAGG +TGGGATGAGTGAGTGTGGCTTCTGGAGGAAGTGGGGACACAGGACAGCAT +TCTTTCCTGCTGGACCTGACCCTGTGTCATGTCACCTTGCTACCACGAGA +GCATGGCCTGTCTGGGAATGCAGCCAGACCCAAAGAAGCAAACTGACATG +GAAGGAAAGCAAAACCAGGCCCTGAGGACATCATTTTAGCCCTTACTCCG +AAGGCTGCTCTACTGATTGGTTAATTTTTGCTTAGCTTGGTCTGGGGAGT +TCTGACAGGCGTGCCACCAATTCTTACCGATTTCTCTCCACTCTAGACCC +TGAGAAGCCCACGCGGTTCATGCTAGCAATTAACAATCAATCTCGCCCTA +TGTGTTCCCATTCCAGCCTCTAGGACACAGTGGCAGCCACATAATTGGTA +TCTCTTAAGGTCCAGCACGAGGTGGAGCACATGGTGGAGAGACAGATGCA +GTGACCTGGAACCCAGGAGTGAGGGAGCCAGGACTCAGGCCCAAGGCTCC +TGAGAGGCATCTGGCCCTCCCTGCGCTGTGCCAGCAGCTTGGAGAACCCA +CACTCAATGAACGCAGCACTCCACTACCCAGGAAATGCCTTCCTGCCCTC +TCCTCATCCCATCCCTGGGCAGGGGACATGCAACTGTCTACAAGGTGCCA +AGTACCAGGACAGGAAAGGAAAGACGCCAAAAATCCAGCGCTGCCCTCAG +AGAAGGGCAACCACGCAGTCCCCATCTTGGCAAGGAAACACAATTTCCGA +GGGAATGGTTTTGGCCTCCATTCTAAGTGCTGGACATGGGGTGGCCATAA +TCTGGAGCTGATGGCTCTTAAAGACCTGCATCCTCTTCCCTAGGTGTCCC +TCGGGCACATTTAGCACAAAGATAAGCACAAAAGGTGCATCCAGCACTTT +GTTACTATTGGTGGCAGGTTTATGAATGGCAACCAAAGGCAGTGTACGGG 
+TCAAGATTATCAACAGGGAagagatagcatttcctgaaggcttcctaggt +gccaggcactgttccattcctttgcatgttttgattaatttaatatttaa +aataattctaccaggaagctaccattattaccacaacttcacaaatgaga +acaccgaggcttagaggggttgggttgcccaaggttacagaggaagaaaa +caggggagctggatctgagccaaggcatcaactccaaggtaacccctcag +tcacttcactgtgtgtcccctGGTTACTGGGACATTCTTGACAAACTCGG +GGCAAGCCGGTGAGTCAGTGGGGGAGGACTTTCAGGAAGAGGTGGGTTCC +CAGTTGGTGACAGAAGAGGAGGCTGCAAAGTGAAGGAGCAGGGGCTCCAG +GTCTGGCGACAACCAGGGAAGGGACAGGGCAGGGATGGCTTGGACCACGA +GAGGCACCTGAGTCAGGCAGTCACATACTTCCCACTGGGGTCTACCATGT +GAGGCATGGTGTGGGATCCTGGGAAGGAGACCAAGCCTCATTTCAGTTTG +CTTATGGCCAAAGACAGGACCTGTGTACCCGACAACCCCTGGGACCTTTA +CCAAAAAAAGAGCAAACACCATTCACTCACTCATGTTAGATAAACACTGA +GTGAAGTCACTGGAGCCCAAGGACTGTGCGAGGTCAGCACTGCCAATACA +AGAagctgcagccctccagctcgcctccctcaatggccactccgtgctcc +agccatgctggcttccttttaggtcctccacctccaggctgtagttcatg +tgcttctttctggaatgttcttcccaacctacccactcaaccctcagact +ttaccataaatgtcatttcctcacgtctgccttccctgacctgagaccaa +gccaggcttcccatgacgagcctcacagtaccccatctCCCCTGAACAGA +TGCAGTAATAACCTACATAACCCGGGGCCATGATCTAtggctttgaatcc +tggctctgtcactaggccaggtctctcagcccttctgtgcctcagtttcc +tcatctataaaatgagatgacggcagtgcctgctcatgaagtgtgagtta +atgcactcaaatcaatggttgtgcacggtttatatgaatattagtgatta +CAAAATATTATCAATAGACCTTGTCACAACTGTTATTGAAGAACtaatca +tctattgcttatttaggtctttctctcctgccagaatgtgcgctccaggt +ggagaggtatgttgccttatccgtggctggatatatagagattcccacac +tgccttgcacacgagcactgctgggtaaatatttgttggctgcaggaaAA +CGTGAAGGAATAGGCCCTCCAATGGGAGGAAAAGCATGAGTTGTGAGAGC +AGAGCCACCACAGGAAACCAGGAGGCTAAGTGGGGTGGAAGGGAGTGAGC +TCTCGGACTCCCAGGAGTAAAAGCTTCCAAGTTGGGCTCTCACTTCAGCC +CCTCCCACACAGGGAAGCCAGATGGGTTCCCCAGGACCGGGATTCCCCAA +GGGGGCTGCTCCCAGAGGGTGTGTTGCTGGGATTGCCCAGGACAGGGATG +GCCCTCTCATCAGGTGGGGGTGAGTGGCAGCACCCACCTGCTGAAGATGT +CTCCAGAGACCTTCTGCAGGTACTGCAGGGCATCCGCCATCTGCTGGACG +GCCTCCTCTCGCCGCAGGTCTGGCTGGATGAAGGGCACGGCATAGGTCTG +ACCTGCCAGGGAGTGCTGCATCCTCACAGGAGTCATGGTGCCTGTGGGTC +GGAGCCGGAGCGTCAGAGCCACCCACGACCACCGGCACGCCCCCACCACA +GGGCAGCGTGGTGTTGAGACAACACAGCCCTCATCCCAACTATGCACATA 
+GCTTCAGCCTGCACAGATAGGGGAGTAGGGGACAGAGCATTTGCTGAGAG +GCCAGGAGCGCATAGATGGGACTCTGCTGATGCCTGCTGAGTGAATGAGG +GAAAGGGCAGGGCCCGGGACTGGGGAATCTGTAGGGTCAATGGAGGAGTT +CAGAGAAGGTGCAACATTTCTGACCCCCTACAAGGTGCTTGCTACCTGCC +AGGCACCCTTTCCATACCTTGTCTCAGTTCAGCTCCCCACCTTGGATAAA +CAAGAAACCTTGGTTGCAGAGGAAAAAAGAGGCTGGAAACAAAGGGGTAG +AAATGGGGTAGCAGGGGAGATTGCCTGATCAACTGCCAAATGGTACACAG +TTCTGGAAAAGCACAAAAAATGTGCACACACGGGTTCTTCCCACTTTAAC +CCCTGAGGAATCTGAGGCCTGCTCCTGAAACAGACTGGGCAGTGGCTAGT +GACTCTAGGTATAGGAGTATCCAGCCCTGCTCACCCAGGCTAGAGCTTAG +GGGGACAAGAGGAAAGAGGTGCCTGTGGGGGTGGAGGACAGGAAGGAAAA +ACACTCCTGGAATTGCAAAGTGAGGGCAGAGTCTATTTATATTGGGTTTA +ATTAACTCCTCTCCCTGGTGCCACTAAAGCAGCAATCACACTGCAGACAG +CACTGATTTGATTGGCAAGAGATGCACCAGGCAGAATATTAAGGGACCAG +GCCCCTATAAATAGGCCTAATCACAGCCCCTCACTGGAAAATGGTAAGGA +AGACATTAATCAGGCCTGGCACTGTGCCCTAGACCTGCTCCCCTAGGCAC +TACAGTGGGGCCCTTGGTTGCAACACAAGTAGGTAGGGATGGATGAGTGT +GGCATGAAGGGCCTAGGAGATTTCACTTGGGTTTAAAATGCTGTGACCTT +GAGTAAGTTGCCGTCTCTGAATCTGATCCTTTCGATTTCCCATTCTCCAA +ACTGAGAACTAGCACTGCTGAGACGTGGTTATTTCCAATAATAATTTGTA +TATTTTACATAACGCACCACACCAACATCTTCACCCAGTTGGAGCCTACT +CCTTTGCTCCCGCTGCTGGCTTCCCCAGCCCTCCCTTCTGCCCTCCTCAG +GCCAGCACTTTTCAGTGAGTTCCTCCTTTGCATACAGGCTTTCCAGATCT +GTACTTGCCTTGAATACTCATCAGAGCCCAGGAGTTACTCCTCACCTCCC +ACTTATTTTTCCTCCCATCAAATAACTAAAGCATGGCCAGCTGATGCCCA +GCCAACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAAGCAT +GTTCCTCCCTCCCCTTCTTTGTATTTATACTGATGCAAGTTTGCTGGCTG +TCCTAacttatttctgtgcctcagttctcccatatgtaagatcacaaagg +gggtaaagatgcAAGATATTTCCTGTGCACATCTTCAGATGAATTTCTTG +TTAGTGTGTGTGTGTTTGCTCACACATATGCGTGAAAGAAGAGTACATAC +ACAGATCTCCTCAAAAAGGAGGCAGCAAGCCCGTTCAAGAATGGGACTGA +ATACACCTGATGAGTGGTTTACTTTCTGTCTGcaaacatctactgatcat +ctgttaggtgcaggccatgatcacaacaaagacgaataagacactacact +agccagggagagtctcaaaaacaactaaactcaaattaaattcattctac +tccagtcatgggtacaaagctaaggagtgacaaatccctcttggagttag +gggagtcaggaaaaagctcttagcagaatgtgtgcctctcggccgggcgc +agcggctcacgcctgtaatcccagcactttgggaggcgaaggcaggcaga +tcacctgaggtcgggagttcgagaccagtctgaccaacatggtgaaactc 
+catctctactaaaaatacaaaattagccaggcgtggtggtgcatgcctgt +aatccccgctactcgggaggctgaggaaggagaatcacttgaaccaggaa +ggtggaggttgcagtgtgccaagatcgcgccatggcactccagcctaggc +aacgagggtgaaccaggtccaggaagaaggtgcaaagacagcattccagg +taaaagaaacagcttgaacaaaaagtgtgtaggggaaCCGCAAGCGGTCT +TGAGTGCTGAGGGTACAATCATCCTTGGGGAAGTACTAGAAGAAAGAATG +ATAAACAGAGGCCAGTTTGTTAAAAACACTCAAAATTAAAGCTAGGAGTT +TGGACTTGTGGCAGGAATgaaatccttagacctgtgctgtccaatatggt +agccaccaggcacatgcagccactgagcacttgaaatgtggatagtctga +attgagatgtgccataagtgtaaaatatgcaccaaatttcaaaggctaga +aaaaaagaatgtaaaatatcttattattttatattgattacgtgctaaaa +taaccatatttgggatatactggattttaaaaatatatcactaatttcat +ctgtttctttttacttttAGAAATCACATATGTGACTTAAATATTTCTTT +TCTTTTTCTTTCCTCTCACTCAGCGTCCTGTGATTCCAAAGAAATGAGTC +TCTGCTGTTTTTGGGCAGCAGATATCCTAGAATGGACTCTGACCTAAGCA +TCAAAATTAATCATCATAACGTTATCATTTTATGGCCCCTTCTTCCTATA +TCTGGTAGCTTTTAAATGATGACCATGTAGATAATCTTTATTGTCCCTCT +TTCAGCAGACGGTATTTTCTTATGCTACAGTATGACTGCTAATAATACCT +ACACATGTTAGAACCATTCTGACTCCTCAAGAatctcatttaactcttat +tatcagtgaatttatcatcatcccctattttacataaggaaatggggtta +gaaagaccaaataacattttttcaacatcaaaacactagcttgagatcaa +gcccagacttggatctgtcgtctgaattccaagctttttgttatttattg +atatgttttgttgtTTTCATGCAATAATGCAAATCTTAGCCCAAACATTT +TGTTAGTAGTACCAACTGTAAGTCACCTTATCTTCATACTTTGTCTTTAT +GTAAACCTAAATTAGATCTGTTTTTGATACTGAGGGAAAAACAAGGGAAT +ctaacactaaccagcccgtagtgtgtggtcaacactttcgttactttagt +atacatcaccccaattgtttgtcttcaccacacactttggagttaggtag +tagtatctatttttacaaataagaaaacccaggcacaaaggggttgatta +gcAATTATCTTTTGAAAAGCCTGTAGTTGCTCATCTGAAGAAGTGACGGA +CCACCTCTTATTTAGTGGACAGACAGTAACTAGTTGAGAAGACAGGGGAT +TTTGTTGGCGGAAAAAAAAATTTATCAAAAGTCGTCTTCTATCAGGGAGT +TTTATGAGAAACCCTAGCTCCTCAGTTCCACAGTGGGTAACTGTAATTCA +TTCTAGGTCTGCGATATTTCCTGCCTATCCATTTTGTTAACTCTTCAATG +CATTCCACAAATACCTAAGTATTCTTTAATAATGGTGGTTTTTTTTTTTT +TTTGCATCTATGAAGTTTTTTCAAATTCTTTTTAAGTGACAAAACTTGTA +CATGTGTATCGCTCAATATTTCTAGTCGACAGCACTGCTTTCGAGAATGT +AAACCGTGCACTCCCAGGAAAATGCAGACACAGCACGCCTCTTTGGGACC +GCGGTTTATACTTTCGAAGTGCTCGGAGCCCTTCCTCCAGACCGTTCTCC 
+CACACCCCGCTCCAGGGTCTCTCCCGGAGTTACAAGCCTCGCTGTAGGCC +CCGGGAACCCAACGCGGTGTCAGAGAAGTGGGGTCCCCTACGAGGGACCA +GGAGCTCCGGGCGGGCAGCAGCTGCGGAAGAGCCGCGCGAGGCTTCCCAG +AACCCGGCAGGGGCGGGAAGACGCAGGAGTGGGGAGGCGGAACCGGGACC +CCGCAGAGCCCGGGTCCCTGCGCCCCACAAGCCTTGGCTTCCCTGCTAGG +GCCGGGCAAGGCCGGGTGCAGGGCGCGGCTCCAGGGAGGAAGCTCCGGGG +CGAGCCCAAGACGCCTCCCGGGCGGTCGGGGCCCAGCGGCGGCGTTCGCA +GTGGAGCCGGGCACCGGGCAGCGGCCGCGGAACACCAGCTTGGCGCAGGC +TTCTCGGTCAGGAACGGTCCCGGGCCTCCCGCCCGCCTCCCTCCAGCCCC +TCCGGGTCCCCTACTTCGCCCCGCCAGGCCCCCACGACCCTACTTCCCGC +GGCCCCGGACGCCTCCTCACCTGCGAGCCGCCCTCCCGGAAGCTCCCGCC +GCCGCTTCCGCTCTGCCGGAGCCGCTGGGTCCTAGCCCCGCCGCCCCCAG +TCCGCCCGCGCCTCCGGGTCCTAACGCCGCCGCTCGCCCTCCACTGCGCC +CTCCCCGAGCGCGGCTCCAGGACCCCGTCGACCCGGAGCGCTGTCCTGTC +GGGCCGAGTCGCGGGCCTGGGCACGGAACTCACGCTCACTCCGAGCTCCC +GACGTGCACACGGCTCCCATGCGTTGTCTTCCGAGCGTCAGGCCGCCCCT +ACCCGTGCTTTCTGCTCTGCAGACCCTCTTCCTAGACCTCCGTCCTTTGT +CCCATCGCTGCCTTCCCCTCAAGCTCAGGGCCAAGCTGTCCGCCAACCTC +GGCTCCTCCGGGCAGCCCTCGCCCGGGGTGCGCCCCGGGGCAGGaccccc +agcccacgcccagggcccgcccctgccctccagccctacgccTTGACCCG +CTTTCCTGCGTCTCTCAGCCTACCTGACCTTGTCTTTACCTCTGTGGGCA +GCTCCCTTGTGATCTGCTTAGTTCCCACCCCCCTTTAAGAATTCAATAGA +Gaagccagacgcaaaactacagatatcgtatgagtccagttttgtgaagt +gcctagaatagtcaaaattcacagagacagaagcagtggtcgccaggaat +ggggaagcaaggcggagttgggcagctcgtgttcaatgggtagagtttca +ggctggggtgatggaagggtgctggaaatgagtggtagtgatggcggcac +aacagtgtgaatctacttaatcccactgaactgtatgctgaaaaatggtt +tagacggtgaattttaggttatgtatgttttaccacaatttttaaaaaGC +TAGTGAAAAGCTGGTAAAAAGAAAGAAAAGAGGCTTTTTTAAAAAGTTAA +ATATATAAAAAGAGCATCATCAGTCCAAAGTCCAGCAGTTGTCCCTCCTG +GAATCCGTTGGCTTGCCTCCGGCATTTTTGGCCCTTGCCTTTtagggttg +ccagattaaaagacaggatgcccagctagtttgaattttagataaacaac +gaataatttcgtagcataaatatgtcccaagcttagtttgggacatactt +atgctaaaaaacattattggttgtttatctgagattcagaattaagcatt +ttatattttatttgctgcctctggccaccctaCTCTCTTCCTAACACTCT +CTCCCTCTCCCAGTTTTGTCCGCCTTCCCTGCCTCCTCTTCTGGGGGAGT +TAGATCGAGTTGTAACAAGAACATGCCACTGTCTCGCTGGCTGCAGCGTG +TGGTCCCCTTACCAGAGGTAAAGAAGAGATGGATCTCCACTCAtgttgta 
+gacagaatgtttatgtcctctccaaatgcttatgttgaaaccctaacccc +taatgtgatggtatgtggagatgggcctttggtaggtaattacggttaga +tgaggtcatggggtggggccctcattatagatctggtaagaaaagagaGC +ATTGtctctgtgtctccctctctctctctctctctctctctcatttctct +ctatctcatttctctctctctcgctatctcatttttctctctctctcttt +ctctcctctgtcttttcccaccaagtgaggatgcgaagagaaggtggctg +tctgcaaaccaggaagagagccctcaccgggaacccgtccagctgccacc +ttgaacttggacttccaagcctccagaactgtgagggataaatgtatgat +tttaaagtcgcccagtgtgtggtattttgttTTGACTAATACAACCTGAA +AACATTTTCCCCTCACTCCACCTGAGCAATATCTGAGTGGCTTAAGGTAC +TCAGGACACAACAAAGGAGAAATGTCCCATGCACAAGGTGCACCCATGCC +TGGGTAAAGCAGCCTGGCACAGAGGGAAGCACACAGGCTCAGggatctgc +tattcattctttgtgtgaccctgggcaagccatgaatggagcttcagtca +ccccatttgtaatgggatttaattgtgcttgccctgcctccttttgaggg +ctgtagagaaaagatgtcaaagtattttgtaatctggctgggcgtggtgg +ctcatgcctgtaatcctagcactttggtaggctgacgcgagaggactgct +tgagcccaagagtttgagatcagcctgggcaatattgtgagattccatct +ctacaaaaataaaataaaatagccagtcatggtgtcacacacctgtagtc +ccagctacatgggaggctgaggcgggaggatcacttgagcttgggagatc +gaggctgcagtgagctatgattgtaccactgcactccaggctgggcgaca +gagagagaccctgtctcagaaaaaaaaaaaaaagtactttgtaatctgta +aggtTTATTTCAACACACACAAAAAAAGTGTATATGCTCCACGATGCCTG +TGAATATACACACACACCACATCATATACCAAGCCTGGCTGTGTCTTCTC +ACAAATGCACTGCTAGGCACCACCCCCAGTTCTAGAATCACACCAGCCAG +TTCACCCTCCAGATGGTTCACCCTCAACTTCATAAAAGTTCCCTACCTAA +TCTACTGACAGGCTCATCCCCGACCTAATTTTAAAGATTTCCTAGGAGCT +GCAGTGGGAATCCTGGACCTCAGCCTGGACAAAGAACAGCTGCAGGTCAT +TCTCATGTGTGGACACAGAAGCTCTGCCTGCCTTTGCTGGCCAGCTGGGC +TGAGCGGGCCTGGGAATTAAGGCTGCAGGGTTGGTCCCAGGCAGTCTTGC +TGAAGCTTGCCACATCCCCCAGCCTCCTGGATTTGCCAGGATCCAAGAGC +ATGGACTTTAGGAATTCCTGGTGGAGGAGTGAAGAAAATGTGACAGGGTG +TCCTAAGCCCCGATCTACAGGAAGAAAACTGGAAATAAGACTGAGGACTT +AGTTTAAGATGTTCCTACTCAGCCTCTAGCTTTTGTGCTACAGTTCTGGG +AACAGACTCCTCTCTCCTGAAAACCACTTCCCTCCGCAGCATTAGATTTC +ACCAAGATGTCTTGCTTGTGGGAAAGACTTCCAAGGATGCCTGGAGAGAG +GAGGATGGAAATGTCCTGCTCTCTAAACAGATAGACAGATGCAGCCAGAC +AGAAAATAGTTTATCTTGCTGAGGTTTCTAATGTATTTGAAAGAGGCCTG +GGTCTAGAAGTCTACCCAGAGGGCTCTGTGTTGTGCACGCAAAGATAAGA 
+ACCTTCCCTGTGGGAGTTCCAGAGCCAGTTTTCATAAACACCCATCGGTG +ACTGTGTTCAGAGTGAGTTCACACCATCCTGACCTGCCCTGAGTTAGACC +TTACATGGTCTTCCTCCTCTAGGAAGCCTCTGCAGCCCAGGAACCTCCCC +TTATCGGAAATGAACAGCATTTGAAGCTTCACCAGACAGACCAGACAGCT +TAGCCCTCGTGTTGTGCCATGTGGGTTGTTCTCTGAGAGGcaggagagca +tagtggttactaggaagggaaggactttgggactagactgcctcggctgg +agtcctctttctgcttcatagccacgtgatcctaggcatgttacctgtgc +ctcagttttcactctgtcaatatgtaataactgaatctgtctttgtggtg +aggattcagtgagttaacatatttgaagtgcttaaaaATGAGGCTTGtgt +ccatagattaatgagtgaatacacaaatggtgatatggacatacagtgga +gtattagtcataaaaaggaaggcagagctgatccatggcaccatgtgaca +gaacctcaaaagcattaggttaagtggaagaagccagacacaggtcacct +attgtgtaattccatttataggaaatatacagaatatgtaaatccgtgga +gaaagaaagccgatttccaggggctaaggggaggggagaatgggaagtgg +ctgcttcatgggtacaaggtttcattttgagctgatgaaaatgttttgga +actacatagagatagtgttggcacaacatggtgaatgtactgaatgccac +tgattgttcaatttaaaatggtcaaacttatatgaatttcacctccatta +aaaaaaAAAAAAAAGgaccagatgtggttgctcacacccataatcccaac +actttggaaAAAGGTGAAAGTTTTTTTTtctttttttttttatatactta +agttctagggtacatgtgcataatgtgcaggttggatacatagatatgcg +tgtgccatgttggtttgctgcacccatcaacttgtcatttacattaggta +tttcttctaatgctatccctcccccagccccccacccactgacaggcccc +agtgtatgatgttctctgccccatgtccaagcgttctcattgttcaattc +ccacctgtgagtgagaacatgcagtgtttggttttctgtctttgtgatag +tttgctcagaatgatggtttccagcttcatccatgtccctgcaaaggaca +tgaactcatcctttttaatggctgcatagtatcccatggtatatatgtgc +cacattctcttaatccagtctgtcattgatggacatttgggttggttcaa +agtctttgctattgtgaatactgccacaataaacatacatgtgcatgtgt +ctttatagtagcacgatttataatcctttgggtatatacccTAAGACctg +ggacgcatttaaagcagtgtgtaaagagacatttatagcactaaatgccc +acaagagaCCTCTGCCTGAGAACGTGGGTTTCAGCCTAAGAGTTGTAATA +TGTGTGCCCATTCACAGGTGCTGCATCAGAGTCCCAGGTGGGAAGAAGGC +AAGCATACACAAAAATGGTAAAAGGCAGAAAGGAGCCCAGTCTCGTTCTT +TTTAAGAAGTTTTCCTAAGAATCTCCACCCAGCGACTTGCTCTCACATCT +TCTTGGCCAGCACTGGACCACACAACTCCTTCTAGATACAGAGGAGTCCT +AGGATTCTATGAGAAAGAAGGGGAGGGTGGGCAAAGGGCAGCCAGCTGTG +CAGCATCTGCTGGAGACACCTAACCCTTGGTGGAGGGGTTGTGGTGCTGG +gagaaggctttctggacggtgtgacagcagagataaacttaaaggccaag 
+taggagttaccctggtgaagcagggcagggttacaagcattccagcaaca +tgaagcagcaGGAGtgttttaattaaaagaaggcagttgctgtaaccaac +tataaacaaataaaggcttaaacacaatggaagtttatttctcactaagg +gaacatccaaatccatgatactttaagtcagggacccaggttcctcccat +ctatggttctgccatcactaatctgggtcttccacaattgccgtgctcct +tggaggtgggaagagcaggcggaggacacgtgggaggttttagggacaag +cctggaggcagcatgcgtcactcccatgcagagtccattggccaatgctg +gctccgatggccacatctcactgcaggggcagctgggaaatacagtctgg +ctgtctacccaggaggaagagCAGCCAGTTTCTGCTGCTGATGATCAGGA +GGTGGAGAAAATGTTCAGTCAGGCAGGGAGTGGGAATAGACAAGACCACA +AGCAGCTTGGTGCCTCTGAAAGGGAGAGGGGTGGAGGGGAGACTAGAGAG +GTGGGTAGGAATACTGGATTCCACTGACCACGTGCTGGATGTCACGCTTA +GCCCTCCTGCTCTGTGCCGGGTTAGGCACCTGGTGTTTTACGTACATAAT +CTCAATTCTGTGAGGGCATCCGACCTGTGGGAAAAGAGCTGTTTGTTTCA +AATGCCAGTCCTGCTTcctaacaagtgtttagagcttaatcgtgttcaaa +atacatatacaatgtttaatacttacaagaatttggtggggaaaatatta +ccatctttcccttttgtgattggagaaaaatgaggctttgaagggtttaa +gaacttgcccaaggtcggccaggtgcagtggctcatgtctataatcccaa +cactttgggaggctgaggtgggaggatcgcttgaggccaggagttcaaga +ccagcctgagcaacatagtgagactttgtctctataaaaaataaataaaT +AAATAAAAACAACTTGTCCAAGGTCAGACAGGCAGCCTCTTAGTAAGCAC +ACATATCCTCTATATTATACTACCTCTCATGGAGGATCTCCTGTGTTCTA +CAAATAGTCTGGACTTGAGCCAGAATGTGTTATAATCCTGGGATCACGGC +CAGTGGGCTTAGAAGAAGCCATCTCTTTCTCATGCCAAGATGAGGCTCCC +CCAGATTTGCTCAGACTTACCTATAGTCAGCAGCATCGGGGGTCAGGAAA +GACTTCACGAAGCCATAAATGCATCCTTCTCGGGGCAGCACCTGGCTCTC +CCAGGTGAGAGAGGACTCCATTTTCACAGGCAGGCGTGGGAGCTTCAGCA +CCCATCTCTGGGCCCAGAATGACCCACTGGAGACCTTACAGCTCTCCTGT +CACCCCCAATTCCTGCCCCCTCTGCAGCCTTGGAGGAGAATGGAGCTGAA +GGGCCTGCCCTCTGTAGGGTGAGAAAGGGAGGCTAAAGCCTGGTGCCCAC +TGCCCTGGCTGCTCCGCATTGCAGGAGCTGCGCCCTTCCTTTCCTGGCAC +AGGGTCCACAGCCCCGAAACCCCGTTGTGTGGGAGCTGGGCACAGGGCAG +CAGGACTAATCCTTGGAACAGCTCAGGGAGGATTATCCCAGCCACTGTCA +GCAGCGGTGCAGCTGGCTCATTCCCATATAGGGGGAGGCCAGAGCCAGGG +GCCTGCCACAAGTTGGAAGGCTGGGGAAGGGGAGGCCAGCAGAGGTGTCC +TGGCTGTGGGTGGCTCTGAGGGGGCTCTCAGGGGTGGGGCTAAATCTCAG +GGGCAGGATTATGTAAATCAAACCAATTCTAGCCACAGATTTAAAGTTTG +GAAAAAAAAAAAAACCCAGCCTGGCGGAAAGAATTTAAATTATAAAAACT 
+TAGAAGTATGGAATGTGAAATCATCCTGTAGGTGCTTATTTAACAACGAA +ATCATCCCGACACAATGAGCCATATGTGAAAAGTCATCCTTCCCCAACAC +ATCCCCCAACAGGCACTCCTCAAGCCTCTCCCACCCAAGTGCTGGCATCC +TCCCTGTCCTGCTTCACCTGAGACACCCCTTGTCTCATTAGACATGCAAC +TACGGGAGGGGTGACAGGAAGACAAGACACTATTTCCTCAGGCCCAGTTT +GGTGTGGGGAGAAAGCCTCCTGATCCTGAAAGCAAGAATTTGACCAGAGC +AGAAGTAATCAGTATGCAGATTGATTCTGTGGTATGTTAATGTTTATGCA +TAGATTATGAGGACCAGGTGAAAAGTGGGCCAGGGGAGCCAGATGTGTGT +GTGAGTCATGGGTGGCTGAGATGAGGACAGGAGGGAAACTGGTTTGGAGG +GTGCTGGCGATGGGGTGGGGGTGCCAGGAGGAAGGGAGGCTAGTTGTTTG +AATGTCTGCATGAAAAAGCGGACGACAGCGGGGTCTGGGTGAATTCGGGC +AACCATTTGGACCGTGGAGAAAACTGCCTGCGTGCGGCTGAGGACCTGCA +CTATTAATTTGTTTTTTAGCTAAGGCAAAGATAAATATAAAAACtgatac +tccacccagttaccagaaaacatttaggtatgtgtgagacaacttgggta +tgtgaacctaccttttcaatgtaaattcagtgaaatctaagtgcagatcc +catatttccaataaaaaggtaacatccaaactcagatgtcctatgagtat +aaaatacacaaagatcttctggacttagtatgaaaagggatttttttttt +gtcaggtacctcactagttatttttaaaataggattgcatgttgaaatga +taatcttttggatatattgggttaaataaatttattattaaagttaattt +cacttaaaaatgtttaatgtagctactagaaattttaaaattaagcatgt +tgctcaccttatgtttctattggacggctctCTCTAGATACAAAGGCTGC +CAAGAGGGACCTCACTCTAGCTTCAGGGAGAAGAGAGGAATTAGCAAGGC +CAAGCAGAGGCTCCTGAGGGCAGGGCCAAGGGCGGCTTGGTGGGGTGGGG +ATGGGATGCACAGAGATAACTCCAACCCTTAAGAAGGTGTTTCCTAGAGC +AGGCTGTGACCTGTCAGTTTATATACTGAGGCTTAGGAGCCTCTTGGATG +CCCCCAGATCTGCACCCCTGAATTGCCCTGTGCCCCTGCCGTCTTTGTTC +CTGTGCTGGCATAGTGGTCTCACCTCCGGCAGtatcaccaccactgggca +caagcttctccagcacagcaactgtgtcttatttctccttgtactcccag +tgttcacaccatgctgcactcacagaagactcttcgttgatattttgtgg +acagagagaatGCCTGTGAGAGTGGGCTGAAGTGTGCGTTGGGCTCCAGA +GACCTTAAGGAGGGGAGACCAGGTCCTGAGTAAAGTTGAAGGGGAGGGGC +TGAGTCCTGCTAGCCAGGAGTCTCATCCCCTGGGGAAGTTCCAGGGACCC +CTCAGAAGTGCAAGGGGACGGTGTTAGTGTTAGTCCAGTAACACAGCCCA +GAGCCTGCcttccacgtgggtttgacaggagcctcctaactgctcttctg +cttccatttttgccccttcagtctattctcaacagggaagccagaggcat +ccttaaccatgtcagatcatgtggctcctcagctcaaagccTCATCTCAG +AGGAAAGCTCTGGTCCCTTAGAAATGGCCCAAGTGGTGACAGACAGACTC +TAAGGtgagcagactgttgctagatatctgggctcggaggactcgccact 
+gctcaaaggcagtgaggattttcgcactagaagctggaggacagggatcc +ttgttaggtaggagcagaaagcttagaaaagtggtctcctgcagttacgt +ggcaaacacatcatgtaagtgataaattgggtatgcagttgaggagattt +ccaagtaaaatgttgaggatgctgcctggtttcttcttactgcttataat +atagtgtgagagaagagagataaattgagaaagagactggtttttaaact +gttaaaattgaatcaggacttgatgattttgaaaattgtcagtctcccca +catggaaaaagatgctgaaattaacaaatggcttctgagcatgtggcata +gggtgtaactgtacagtcttttgtgattatgcataaagatcaaaggatgg +gagtagcaatgagtcacacagaggtctgttgcaagagattacaagggtgt +accatgcagaacctctccaccaaaccttagggcccttgggaagcttcagt +gagttaccctgggggccatcttggcaggagctgaaggtagaaaggtagag +tttatctctaaaagattcatgggtatggctcttgacaaatcgactatgag +ccccaccgaaacccacagaggacaggcaaagggtttgggaaagctgtttc +acccacagtgctggcagattggtctgtaggggacagagtgcaaaatgaaa +gaagactgtcagagaccccaaactctgctgtcaagaagaaggctgataaa +actacttggctgcaaacacgtggatctttcgtgagaaaagaaggatgacc +cagaggcagaagcccagaaggcagagccaagagacatggaatcttcccac +atcttaaaacctgtttagggaacaccagcatctgtccagctggatttcag +aaccaccattccttcatccttcccctgctgcctctttctgaacagcaatg +tctcaagctttacccaccattgtgtgttgcatatgtagggggcagatagc +ttgtatctttagttttccagatcagaggaacatccaaagaaatctgttct +acacctaaacccgatttagatgagattcgggactgtgagcatgaagggat +ctcaagaggggtgaatgtgttttgcatgcacaagggacaggagtcttggg +gacagaggacaggctgtggtggcagatactaaggtgacccccacaacccc +cacctctgccattcacacccttgaataatccccttctctggttgtaagca +gaacctgtggcttgcttatgaaggaggcggtatatatgtgattcatgtac +tgatcatattgtataagatcactggctggatgcagtggctcgtgcctgta +atcccaacactttgggaggctgaggcgggtggatcacctgaggtcaggag +ttcgagaccaggctggccaacatggcaaaaccccgcctctactaaaaata +caaaaattagccaggcatagtggtgcacgcctgtaatcacagctactcaa +gaggctgaagcaggagaattgcttgaactcaggaggtggaggtggcagtg +agccaagatcgtgccactgcactccagcctcagtgacagagcgagactct +gtctcaaaaaataaataaataaaatgttaagatcataacctgtctttctg +gggactctctcttgacgcctttgaagaagcaggctgccatgttgcaagct +gcctcatggaggggatcagctgcgaggagctaagagccccctccagtcga +tgctcaccaggaagctgaggtcttgtgtccagcaccctgcatagaactga +atgctgccatgtgagcttggaagcagagccatccacacagctgagcccta +gatgagaacccagtgctggctgacaccctgatggcaccttacagaggacc 
+agttaggctgtgccaactcctgacctgcagaagctggggaacactgggtc +gtatttgcagctgctggatttgtgggaatttgtcacacagcaatTGGGAG +TCACACAgcctgtgacgccccaacaatccacacctcctgcatctccctgc +cttcacttcctagcacactgccctgactccctctgccgcagccacgctgg +ccctctgctgttcttcgaagccaccagggctgcattggctcccagccttt +gctctcactgctttctcctcctagagagcccttcctgcatgtatatgttt +gactcactcccttgcctccttcagacttgtacttaaaaatctcagtaagc +atttccctggctacccttttaaaaattgcaacccacttccatccccatcc +ccaacatgccatatttcctttcttctTCttccttcttccttttttttttt +ttttttttgacacaggttctctgtcacccagcctggagtgcagtgacatg +atctcggctcactgcaacctctgcctcccCAGGCAagaaaaggggaggat +gccaataaaggatgcattgatttgtatttactacagtggacatcaagggc +acattcttgctgtggccatcaagagactgtataaattctatgacttgtag +ttgtcccacttaagaaacaaagaagctgTGCATTTCTTTACTGGTCTAGA +GCTGCTCTAGGGCATTTTCTCTACAGCAATTCTAGGTTTCCCCACCTTGT +GAGTTTAGCTTTTTCTATATTCAAAGAAAAGTCCTCAGCCAGAGATTCTC +AGGAGCTTATAGAACAATCCAAACTCTTGGGAATATTAAGTGGAGAGGGG +TACGTGCAAGACACCAACAGCACTAGAAACAGTCCACATCTTTCCATGCG +TGGAGGAGTTTATGCTCTATGTGAGTTCACTCCATCATTAATTCTTCAAA +CACAAGAGTGTTAAAGGAACAAGAGTTAATGGGTCCTGTCATTACACTTG +TTCCCAGGATGACATTCTTCATCTTCCTCTTCTACAACCTGTTCTATATT +CCCCTCATGTTTATCCAGTGCTTCTGCTAGTCTAGTTCACTTCCAAAGAC +CCATGATTACCATGGCCCTGTCAGGCTGTAATTGCTGCAATTTCCAATTT +ACAATTGTCATCATCTATGGTTGATAAAGGTATAGCAATATTTctatttc +ctcatgataatgaaggtcaattacaactgccagtataataacttatttct +ttgtctgccaacctacatacacaaggaagccaaaatgacagggagctact +aaaactttattcttattggaatgcttactatgtacccagaagaagcattc +tccctactccagcagagcttaatgctgtaggtccaggaagctcaaattct +ccaagggagttttagtgagaggagccactctcaccctctgcccttggttt +acaaacctgtatattctaggacccaatatcttacaatgtccattggttca +aagtataacatgttaaagcacagagccccaactctgaaaagtaccatccc +taaattggcatttagttgcacctttatatccacctttaaaagaaatatct +tttaatgttctatcagactgatagattctgtttaatatagtatattatag +caccagtggatcatttggttgtatgcatattattgtaccttctctgctac +aaaatatattcctttgtcctaaggtgtgttacaaagaacattaggcattc +tatgcatctttggatagtttaatggccaagacattgatggcaggagagtc +aaagccacaggtggaaaacacatttatcccagtaagaacaaattgctatt +cttccactgtagagagggtaaacaatgtgccattacgttgccaattgaat 
+gcctcaatcatgtcaagggctgaacatctatgactgtttctgaaaggtca +aacattcaacagaggctgtagctagaactgccttaatgataagagatcat +gctgaattacccatgcaaaaccttaatacttgacacttatcactacttta +ttcaagagcctattgtgcaagcaTAAGTGGCTGAGTCAGGTTCTCAACTC +TGCTCATTAATACTATGCTTGGAGTATACAGTAAGATAAGAAACATAAAT +AAGAAGTGTACATTTGTTTcttcctgttttcttctggctattggatcaat +tacatcccatcttaagctgacccctgtgtaattaatcaatatccgtttta +agcagcaatccatagttgtgcagaaattagaaaactgacccacacagaaa +aactAATTGTGAGAACCAATATTATACTAAATTCATTTGACAATTCTCAG +CAAAGTGCTGGGTTGATCTCTATTTACGCTTTTCTTAAACACACAAAATA +CAAAAGTTAACCCATATGGAATGCAATGGAGGAAATCAATGACATATCAG +ATCTAGAAACTAATCAATTAGCAATCAGGAAGGAGTTGTGGTAGGAAGTC +TGTGCTGTTGAATGTACACTAATCAATGATTCCTTAAATTATTCACAATA +AAAAAAAAGATTAGAATAGTTTTTTTAAAAAAAAAGCCCAGAAACTAATC +TAAGTTTTGTCTGGTAATAAAGGTATATTTTCAAAAGAGAGGTAAATAGA +TCCACATACTGTGGAGGGAATAAAATACTTTTTGAAAAACAAACAACAAG +TTGGATTTTTAGACACATAGAAATTGAATATGTACATTTATAAATATTTT +TGGATTGAACTATTTCAAAATTATACCATAAAATAACTTGTAAAAATGTA +GGCAAAATGTATATAATTATGGCATGAGGTATGCAACTTTAGGCAAGGAA +GCAAAAGCAGAAACCATGAAAAAAGTCTAAATTTTACCATATTGAATTTA +AATTTTCAAAAACAAAAATAAAGACAAAGTGGGAAAAATATGTATGCTTC +ATGTGTGACAAGCCACTGATACCTATTAAATATGAAGAATATTATAAATC +ATATCAATAACCACAACATTCAAGCTGTCAGTTTGAATAGACaatgtaaa +tgacaaaactacatactcaacaagataacagcaaaccagcttcgacagca +cgttaaaggggtcatacaacataatcgagtagaatttatctctgagatgc +aagaatggttcaaaatatggaaaccaataaatgtgatatgccacactaac +agaataaaaaataaaaatcatattatcatctcaatagatgcagaaaaagc +attaacaaaagtaaacattctttcataataagacatcagataaaacaaat +taggaatagaaggaatgtaccgcaacacaataaaggccatatataacaag +cccacagctaacatcataatagtaaaatcatcacactggtaaaaaaaatg +aaagcttttcctctaaggtcagaaataatataaaggttcccactcttgct +atttctattccatatcgtactaaaagtcctagccaggacaattagacaaa +ataaaaataaaaacacccaaattggaaagatagaagcaaacttttctgtt +tacagataacataatcttatatgtagaaaccccttaaaacttcagcaaaa +aaaaaaaaaaaactacagagctagtaaattcagtgaagttgcagaataca +aaatcaacatacaaaaatcagtagtgtctctatacactaataaggactta +acagagaaagaagttaagaaaacaataccactaacaatagaatccaaaaa +ataaaatacttaggaataaattttaccaaacatctgtacactaaaaacta 
+taaaacattgaaaaaagaagttgaataagacacatataaatagaaagcta +tctcatgttaatagattagaaaaagtaatattgttaagatgtcctcacta +cttaaagcaatttatagatctaatgcatttattgcaatctcttcaaaatc +ccaaaggtatttttgacagaaataaaaaaaaaattctaaaatatgcatga +aaccacaaaagactgtgaatagctaaagcaatcttgagcaagatgaacaa +cactggaagcatcacactaccttatttcaaaatctactacaaagctatag +tgatcaaagcaacatgatactgtcataaaaacacacagataaacctatgg +aatggaataaagagcacagaaataagtccacacatttacattcaattgat +tttcaacaacaatgtcaagaagacaatggggaaaagacaatctcttcaat +aaatgatgctggaaaaactatatatccacatgcagaagaatgcagttgaa +tcctgatttcataccatatgcaaaattcaactggaaatggattaaataca +aatttaaaacatgaaatggtataactattagaacaaaacatagaaaatat +tcttcctgacattggtttgggccatcatttttctgatatgactctaaaag +cacaggcaaaaaaagaaaaaatagacaaatgagactatgccaaattaaaa +aatttctaacaacaaaagaaacgatcaatagagtgaaaaagataacctct +tgaatgggagaaatatttgcaaactactcatccaaccggggattgatatc +cagaatatacaagtaacacaaatatgtcaaaagtaaaataaataaataaa +taaataaataaataaattaaataaattatttaaaaatcggcagaggacag +gaatagacatttctcaggagacaacatacaaagggccacagatacatcaa +aaaatgctcaacatcactatttgtcagggaagtactaattaaaaccaaaa +tgagatgtcccctcaaacctgttagaatggctattatcaaaaagatgaaa +gatagcaactatcagagaggatgatagaaaagggaacccttgcatcatgt +acaaattaaaaatagaactatcacatgatccaagaatcctacttctgggt +atatagccaaaggaattgaaatcaatatgtcaaagggatatctgcactcc +tatgttattgcagcatgttcacaatggccaagatatagaatcaacctaac +tgttcatagacagatgaatggataaatgaaatgtgatatggaaaattatt +cagccttaaaaacagtaggaaattctgtcatttgagacaacgtggatgaa +cctagaggacattaagctaagtgaaataagctagacacagaaagacaaat +attgcatgatctcacttagaatctaaaaaatctgaactcatagaagcaga +gaatagtatgatggttactagggttatctggcagggagaggatgaggaaa +tgggacattgttaataaaaggaaaaaaattcaattagtaggattacattc +aggggacccaatatacgacatgttgactgtaattaataatgtattgtatg +cttgaaaattgctaatacagtatattgtaaatgttaatatgaggtaatat +atgtgttaattaacttgatttattcattcaacaacatacacatatattaa +aacatcacactgtattccacaaatatatataatttttgtcaattaaaaaa +taaTTTTTAAAAATGAGAAACAAAAAAGCTGACATTTTCAGATTAAAAAA +ATTATACAGAAGAATTAATTCATTAAAGTAAAAACAAATGTGGGAAAATG +GTTTTTAAATATAATTTAAACCAAATTTAAAATAAGcatataaagactat 
+ggacaaaacaagaaatccaaataaaaaataaacatatgaagaatattcaa +actcactttttatcaaagaaatgtaaattttaaaataTAGCATTGCTATT +GTGTTTTCATAAATAATAATATATCATGGATGAGCCTGTGAGGAAACAGA +CACTCATACTCTGCAAAGCAATGACTAAgataattatgtcagatcatgaa +ttacgttaattagcttgatggtggtcactgtttcacgataaatatacata +tgtatcaaaacatcacattacacaccataaagatatataacttgttatCA +AAAAGAAATATAGCAGTTAAAATTTAAAATTTTTAAAAAACGTCTTTTTG +AGGTTCGTACCTCACTTAAGTCACACTGTTCAAAATATTCATGCACTCAT +TTCTCTCATTCATGTGTTAATGTACAGGGTACGGGCCACTATAAATTCCT +TCAGCAACTGGAAAGGAAACTTTATGTACTGAGTGCTCAGAGTTGTATTA +ACTTTTTTTTTTTTTtgagcagcagcaagatttattgtgaagagtgaaag +aacaaagcttccacagtgtggaaggggacccgagcggtttgccCAGTTGT +ATTAACTTCTAATTCAACACTTTAAGATTCTTAGCATTATTGCAGACAAC +ATcagcttcacaagtgtgtgtcctgtgcagttgaacaagatcccacactt +aaaaggatcctacactttttttaatgctctgctgtttctgccttgaaatt +cttaacaatttttttaaccaaagtcctcacaaattcagtttacattagcc +ctgcaatcatgtagacatcctgATTCCAGACAATGTGTCTGGAGGCAGGG +TTTACAGGACTTCAAGAACCTTACCTTCTCAACTTTCATCTGCATCTTTA +CTCCCAACTATATATGAAGATGATGAAGATAGATATGGATGGTGCTTCTA +CCATACCCTCTTCCTCTGCCAAACTTCCTTGATCTAGGATAAggtcagta +aacttcttccgtaaaaggccaaaagtaaatattataggctctacaggccc +tagagtgtctgtcataactactcaactcttattgtagcataaaaactgtc +aacagacaatacagaaacaaatgagtgtgactgggttccagtgaaacttt +atttacaaaagatttgtcccatgagtcaaatttaccacctccAGATCTAG +AGAAACAGTTTTGAGCCCTTTTATTTTGCTCAACAGTTAAGCATGGCTCC +ATGTCCCTTATATTTAGTCAGAACTCGGTATGTTTTAAGGAAAGAATGGT +TACACGAAGACATACATTCATTCATTTATACAACACATTTTCAGTGTTGA +ATGATAAATTTTGGAATAGTTAACAGATGATAAAAGTGTTGTTTTCAGTC +ATCCCTATCCAATGAAGTAAAAAAAAAAGTGTTGAATGGGAAGAAATCAA +GAATAGTTATACGAATATCACCATTGCATTAAAGCTCTCTTCCTTGTTTC +TAAAAGAATATCTTGACACACATTAAGCTCACTGACCCCCACACCATGAA +TGAGGGCATCTTCAACAATGGTGGATGACGTCTTAGTTTCCCTCAACTCA +GTTAATCTAAGTAAGCTCATGGTATCACTTTCCTGTCCTAGAGGGAACAT +ATTTCCTGCATTTTTCTTTTTTTCCTTACTTTCCATCACCAAGTAACTCT +TCTGATATTTTTTCTCTTGAGAAAATTAATATGACTCATAGATCTGGTTC +CCAAGAGAAATCAATGGAGGCCTGGTTACAAGGATCTAAGAAGCATCAAT +GGGTCACTAACATCTAGTGGTACTAATTAACTCTGTTAATCATTGGGAAG +AAAATGTATATATACTTTTGTCTTGGAGCTGATTCTACTAGAAAGCAGAA 
+ATCAAAATGATCAGTTTCCCAGTGTCACTACTGCACACCCTGGAACAGAA +CAGGTAGGTCAGAAAAACGCTCCCAAAGTTTAGCAATGTCAAGGCAATCT +CTCTCTTCTTACATTTCCCTTCAACCTTCTATCTCCTCCACTTTTCTGTT +TTCCTCCTATCTCCAATTATTTCAATCCTCAGAGCATTATTCTTACAATC +TTAATCACTAAATTATATTACACCCGTTAAAGGAGAGATTTCTAAATGCA +TTGACATTTGTACTGTCTCTCTTTGGAGAATTAGTATTATAAGGATCTGT +TATCTCTTGTCACCTTCCTTATGTCATATGATATGTCACATTTCCCACTG +CGGAGACCAAACATGTTCACATCGTGTGCGTTCCATTTTCCTAATGGAAA +GTGGGGGGAAGTGATTTTCTGTCCTCATATAGAGAATGCTGGGGCCATTC +CCTCTGTATGCCATATTTGATAAAGCATTTGATAATCTTAGTCAATGCCT +GGGCCAAGAATTAAAGGGGTAATTATCAGAATGAAAATGGTTTAATGAAA +CTGTGTCTATCAGTTCTGAAAAGGGCCTCTATCACAATGAACTAAGGTAG +TTATGAATAGAGCTAAaacttaggcaacaccatcctggacataggaacgg +gcaaagatttcatgacaaagacacggaaaccaatcacaacaaaagcaaaa +attgagaagtggaatctaataaaacaatagcttctgcacagcaaaagaag +ctaccaacaaagtaaacagacaacctacagaatgggagaaaatatttgcc +aactgtaagtctgacaaaaatctaatatctggcagctataaggaacttaa +atttacaagacaaaaacaaccccattaaaaagtgggcaaagaacatgaat +agacactctcaaaagaagatatacatatggttaacaagcatatgaaaaaa +aagctcaatatactgagcattagagaaatgcaaatcaaaaccatattgag +atatcatctcataccaggcagaatggctattattaaaaagtcaaaaataa +cagatatcggtgaggttacagagaaaagggaacacttatacactgttggt +gggactgtaaattatttcaaccattgtggaaagcagtatgggatggcgat +tcctcaaaaagccaaaaacagaactatcattcaacccagcaattccatta +ctgggtatatacccagaagaatataaatcgttctaccataaagacgcatg +catgagaatgttcattgcagcactactcacaatagcagagacatggaatc +aacttaaatgcccatcagtaacagactggataaagaaagtgtggtacaga +tacaccgtggattactatgcagccataaaaaagaacaagatcatgtcttt +gacaggaacatggatggagctggaggctactatccttagcaagctaaggc +aggaacagaaatccaaataccgcatgttctcacttatgagcgtgagataa +atgatgagaacttgtaaacacaaagaaggaaacaacaggcagtggggtct +acttgaggacgacgggaagagggagaggagcagaaaagataactactgac +taccgggcgctacctgggggatgaaacaatctgtacaacgaacccccagg +acatgagtttacctatgtaacaaaccttcacgtgtacccccgaacctaaa +ataaaagtcaaaaagaaaAAGAAAAAAAGAAAAATCCATGCATATGATAC +ATCAGTTAACAAGGCACTGGTGAAATTAATTTTAAGTATTATTGTCTCTT +TGTGTTTTTGGTCTCAGAAAAGTTACGATTTCCCTTAGTTCCTTAGGGCA +GAGAGAATCTTCAATCACTGAAGTCAGGAGACACACATTCTATCTGATTT 
+TCTACATTATCTGTTTGAAAAGGTTACCCACTTATTAGTGTTAAAGCCAA +GATATCCAGCAAGGATAGCAACCAACTCTTAAGGTACTCTCCCTTAGGAG +GATTCCTGATTCTTTAATGTTTTCTaaaaaagcaaaacaaacaaacaaac +aaaacaaaacaCTAAATGTTTTCTCTTTCAACTTATTTGAATACACTCTT +TTCTCACTGCTCTGAGCATGAATTCAATATTTCAGGGCAAACTAACTGAA +TGTTAGAACCAACTCCTGATAAGTCTTGAACAAAAGATAGGATCCTCTAT +AAACAGGTTAATCGCCACGACATAGTAGTATTTAGAGTTACTAGTAAGCC +TGATGCCACTACACAATTCTAGCTTTTCTCTTTAGGATGATTGTTTCATT +CAGTCTTATCTCTTTTAGAAAACATAGGAAAAAATTATTTAATAATAAAA +TTTAATTGGCAAAATGAAGGTATGGCTTATAAGAGTGTTTTCCTATTGTT +TTCAGTGTAGGACTCACTGTTCTAAATAACTGGGACACCCAAGGATTCTG +TAAAATGCCATCCAGTTATCATTTATATTCCCTAACTCAAAATTCATTCA +CATGTATTCATTTTTTTCTAAACAAATTAGCATGTAGAATTCTGGTTAAA +ATTTGGCATAGAACACCCGGGTATTTTTTCATAATGCACCCAATAACTGT +CATTCACTAATTGAGAATGGTGATTTAACAAAGGATAATAAAGTTATGAA +ACCAATGCCACAAAACATCTGTCTCTAACTGgtgtgtgtgtgtgtgtgtg +tgtgtgtgtgtgtAAGAGGGAGAGAGAGAAAATTTCACTCCCTCCATAAA +TCTCACAGTAttcttttctttttcctttcctttccttgctcttctttctc +tcctattgctttcctttcatttccttCTCATAAAAGAAAAATAACAATAT +AGAAAATAACAAAATATAGATGGTCAACCTTTTTAATATTAAGGTTACCT +AAAATGCCATTATCCAAAGTGGTTCTCTAGAGATGCTGATGTATATACTT +ACATATTTTACAGTGTATTCAAATAAAGAGTATATTACATAAGACATATC +CTTTTGTAACCAACTTTTGTCATTAACAATTTACTGGACTTGTCAACAAA +CCTAAATCTGTATCGTCTATAATGGCTACGTTCATTTTGGTATGAATCTT +AATTACCCCTTTCTGCATTATTTAATGATTTTCTCATATGTCACTCTTAA +ATGTACTTCTAATTTTTCACTTTACATCACATAATGAATGGATCCAAATA +TGTTATGGATAGATATCTTCAAACTTTCTACTTACAAGTAGTGATAATAA +CAGATGTTCTCTCTAAAGTGTAGTTGGTATCAATTTTACTGACCTTTAAA +AATATCTTAATGGGACAAAGTTCAAATATTTGATGACCAGCTATCGTGAC +CTTTATCTCTGTGGCTCTGTGGGCCTGTAGTTTTTACGTGCTTTTAGTGT +ATCATGATTAAATATTTTGTTTTAGTAAAGACACCATTATTTCCCAACTT +CATATTCAAATTGTCAAAGGTATTAATCCTAGAGCAGAACTCTCAAAAGC +ACCAACTCTGATTCCTAACAAAGCATGGAAAAGCCCTCTCTCTGAGTTTC +AGATACTCTTTTTTGTGGGGGTTGAGTTTCACTTTATTTAAAGTGAGTCT +TAATCCTCCAACAAGTCAACAAGTGATTGGCTGGAATCACACGTATTGGA +AAACCAGCGGAAGAGTAAGTCTTTGTATTTTATGCTACTGTACCTCTGGG +ATTAATTGCTCTTTCCCTCATTGGCCAGTCACTCTTAGTGTGTGATTAAT +GCCTGAGACTGTGTGAAGTAAGAGATGGATCAGAggccgggcgcgggggc 
+tcgcgcctgtcatcccagcactttgggaggccgaggcgggcggatcacga +ggtcaggagatcgagaccatcctggctaacacggggaaaccccgtctcca +ctaaaaatacaaaaagttagccgggcgcggtggcgggcgcctgcggtccc +agctgctggggaggccgaggcgggagcatggcgggaaccgggaggcggag +cctgcagtgagccgagatggcgccaccgcactccagcctgggcgacccag +cgagactccgcctcaaaaaaaaaaaaagaaGATTGATCAGAGAGTACCTC +CCCTAAGGGTACATGCAGATAAATACAGTTAAGGCGATTAACATTTCAAA +TACGGTGACTGTTTCTTACGTGGACGACGTTGTGTTGAACATGGGTGAGT +AAGACTGAAGCAGCCGTAATTACTGCACGATGCGCATGGTAAAGAAGCAC +TCCGTTAGGGAAATTATATTCTTTGCCCCTCTAATCCTTCACTCCACCTG +CCATATTCCCACATGATTTTTTTCTTTGCTGTTCTTGTCTAATTGTTATT +AATAATTAATAAATAACTTATGATCTAATTGTTATTAATAATAACTTATC +ATCACATGATTTATTAATAAATTAATAAATAACTTATTATCACCGCATTT +CCCCAAttcatttatctttctttcattttctctctttgtgtgttttctgt +cttCATATTTCAGCACTTGCCACATATTTCCCACAAAATCATTTATGGTC +AAACAACACTTCAACGTGTAGCATTTGTATTTCTCAATTCTTCCTCACTT +TCTTCCTTCAGAATACTAAAGCTTCTTCTCTACTGACTGAGTCAATGGCC +AATGGATAGAGTAAATAATTCTGCGGTATCTAAATTTGTATTGATTGGAC +TTTCAAGCTCTTGGGAGATGCATCTTTTTCTTTTTTGGTTCTTCTCTGTG +TTCTACATGGGAATTATCCTGGAAAATCTCTTCATTGTGTTCACAGTAAT +TATTGACTCTCATTTAAATTCCCCAGGTACTGCCTACTGGCCAACATTTA +TCTTCTTGATCTGGGTCTTCTCCTACAGTTCTGACTTTTTCACTAACTGC +AGCATCATTTCTTTTCCAAGATGCATCATACAGATATTTTTCATTTGTGT +CATGCGTAAAAATTGAGATGGTGCTGCTCATAACCATGGCATAGAGCAGG +TACACTGCCAATCTGTAAGCCTCCCCATTACCTGACCACAATGAACCCCA +AAATGTGTGTTTCCTTTGTTGGAGGCATCCTGGATAGTCAGGATAATCCA +TGCTGTATCTCAGTTTGTTTTTGCCATAAACTTGCCTTTTTGTGGCCCTA +ATAGAGTAGGTAGTTTTCACTGTGATTTTCCTTATGTCATGAAACTTGCT +TGTGTAGACACTTACAAACTAGAGGTTGTAGTCACTGCTAACAGTGGGCT +TATATCCATAGCTACCTGTTTCTTATTAATAATATCCTATATTTTCATTT +CGGTAACCGTCTAGAATCCTTCTTCAGGAGACTTATCTAAAGCATTTGTG +TCATGTTAGATCACATCACAGTAGGGATTTTGTTTTTTATGCCATGTATA +TTTCTGTATGTGTAGCCTTTGCCTAAAACAACACATGATTAATATTTGTT +CATTGTTCCTTTTGCTATCACCCCTGTCTAGGATCTACACATTAAGAAAC +AAAGACATGAACGTCTCCATGGAAAGACTGGGAAAATGGATTGCAGGTTC +TAGCAGGATGTCATAATAAATGGTGCATATCCAGAGTGCAAGATGATTCA +GTCTCACCAAGAACACTGAAAGTCACATGGCTACCAGCATTATTGTGATA +AGAACTACTATTTTGGGAGATAGTTTAGCAAAGGTGCCATGTAGAAATTG 
+ATTAAGTCAGAGGTATCTTTAACTTGCCACCACAGAGAAGAGATTAATTT +CATATACTTCCATTGAGAAGAGAGATAAGAATACAAAACCAAGCTGATTT +GCAGGAGTAAACTTGATATTCAAATACTATTTCCTGAATGACATTTTCTG +AGACATGCTAATTGTAATTACTTTCAGCTTCAAAACATAATAAATTTATC +TCATAGTAAGCATATAGATGGAATAAATAAAATGTGAACTTAGGtaaatt +ataaattaataaagtatatttttaaaatttccattttaatttCTGTTTAA +ATTAGAATAAGAAACAAAAACAACTATGTAATACGTGTGCAAAGCCCTGA +ACTGAGATTTGACTTTACCTTGAGCTTTGTCAGTTTACGATGCTATTTCA +GTTTTGTGCTCAGATTTGAGTGATTGCAGGAAGAGAATAAATTTCTTTAA +TGCTGTCAAGACTTTAAATAGATACAGACAGAGCATTTTCACTTTTTCCT +ACATCTCTATTATTCTAAAAATGAGAACATTCCAAAAGTCAACCATCCAA +GTTTATTCTAAATAGATGTGTAGAAATAACAGTTGTTTCACAGGAGACTA +ATCGCCCAAGGATATGTGTTTAGAGGTACTGGTTTCTTAAATAAGGTTTT +CTAGTCAGGCAAAAGATTCCCTGGAGCTTATGCATCTGTGGTTGATATTT +TGGGATAAGAATAAAGCTAGAAATGGTGAGGCATATTCAATTTCATTGAA +GATTTCTGCATTCAAAATAAAAACTCTATTGAAGTTACACATACTTTTTT +CATGTATTTGTTTCTACTGCTTTGTAAATTATAACAGCTCAATTAAGAGA +AACCGTACCTATGCTATTTTGTCCTGTGATTCTCCAAGAACCTTCCTAAG +TTATTCTACTTAATTGCTTTATCACTCATATGAATGGGAATTTCTTCTCT +TAATTGCTGCTAATctcccccatcttcaaatactctaccgggcttctgga +acaccacagcttcctggctttttctcctacctcctgggcaagtccttccc +tgtgtcttttgttgagtgttcctcatctgcttaactaccaatcaacctat +tgcccctaatttgatctttggcctgttttcacttagattctatccctacg +tatcacccattcccacagctttaatcaccatctaaacactaggggctctc +aaaCCTTGTATTtttctttctttctttctttctttctttctttctttctt +tctttctttctttctttctttcttcctccttttctttccttttctttctt +tcattctttctttctttTttaaggggcagggtctcactatgttgctgagg +ctggtctcaaactcctgacctcaagcaatctgtctgcttcagcctcccaa +gtagctgagaatacagggacaagccattgcacctgaccCTGGTActattt +cttgagttcctgatccacagatctaacctcctactttcctggatgccaca +caagatcttccactcaacaagtctgcaactaaactagccttcctcttttc +aaacctactcttctttcagtgttctcagtcacaataatttgtaccaacta +gttacctagttgcacaacccaaaatctgggaaaaataatagatttctttc +tccatagtacccccaaatcaataaatcatcaagtcttattctaccttcca +aagagccttacatatgttcctttattttcatctgtaacaccactattcct +gtctaagcctacctatgtcatttttggaagagaatatagtcacctatgcg +accttcccacttaaaatcctactatttacgcttcagtaaaagaaaaaaaa +tttttaatctaagtatgtaattcttttgctgaagacacttcacttgcttc 
+tgtgcccttaaactggtatgttatcatggtatagtaggccatccaagacc +tggcttccttcctttttttcagtctcagagaataacatactctttccctg +caactccagatccaatttggttttcttttacttgcctggaaactccaaaa +tctatcaactctggggctttccactagctaatcattttgtatacaatatt +tgtccttcATGTTTTGCCTCTTAACATCTCAGCTTTCAGTTTCATCATTT +TACCAGGGAGGCCTCCCAGAACCTGAGTCCAGAAGAGTTCCTTCCATTGT +ATATTCCTCTAGCACTACCTATTACCTCTTTTGTAAGACTAACAGCCCTC +AAAATTTTTCATTCAGTGATGTCTTCCTCATTGCATTTTAAGTTCAACAT +GAGCAGGACTTTGTCGTGTTCACCTCTATCACATCATAAATATAGCAAAC +AGTAAAACTATTGCAACATGACTAATGTATTGAACGATGCTTCAGCTTTC +TTCTTACGTTCAATCACAGGTCATATGACTAAAGAACTTCCTTTTTAATC +TCCTTTTCTATTCTCAATTAATTTCTTCTGCCTGCATCACCTCAAGTCTC +TGGGGTGAAATCCACTAATGAATTCCTTTTGCAGCTTAAGCCAATTCCAA +TCTTGAGCCAATCTCAGGTGAAGAAGCCTGTAAATTATCACTCTCAGTCC +TCTCTTGTACTACTAGGTCTCATGAACTCTTCATTAACAACTCCAGCTTC +TCTGTTAGCCCAAAAGCCTTTTGCTGCCTAGAAAACCCATGATTCATGCC +TCAGGAAACAGCCTTCAAATCACAACATGTTCTGTATCTGGCTGGCCAAC +TCCCTGCAACTTATTTCTGCCTAGATTCTCCCTCATTCATTTCAATACGC +TGTTCGGCCTGCTACCCCAGTTTCCCACTTAGAACAATGGCACACAGGAC +AGGAGCACATTGGCACATCAGAATGACTTATGTACTGCTCATTGTGTTGC +AGAAGAGACCTCTGTGGGGGCAATAGAACAGATTTTCCTCTCACGTCACT +GTAGTTGTGGTTTCCCTAAGCACCTACACTGTTTCACCTCATCTTAGGTA +GACAATAATCCATGTAACTGACTGTGTATCCTAATTTTAAAAAATATTTC +TGCCCACATTATTCTGCAGTTTTTATCTTGCTTACGTATTTTTGGAATGT +TACTATTTTTCAAAAATTAATTTGGGATCAACCAACACTTCTTATTCTGC +TGCTGTTCTAGAGAAAATCATTTTCCTCATTTCTGAACAAGAGAAAATGA +AATACAGCTCTAAACAAATGCCACTGTAAACCAAGGTGGAGCCTTTGCAC +TTTCAGGCCACCATGATAACCTGGAGATTAGATTTTTCTGTGTCTTTATA +TCAATAATAAAGCCAAGCTTCTCCAGGGGTATCCACTAGGCTTGTCTCAA +TGGCTCAATACAGGTCCTTTTGTGAATGATTACCTCACCCTCATGGAAAC +ACACTCTTGTTACAGAAACTCAGAATGATTCTATTTTTTCTTTTATATTT +GTATATGTTTTTCCAATACCTCTGAAAAAACTGATCCAAAAAAAATACAA +ATTTTAATTGTAGCCAGTCAATTCAGGAAGGATAAAGGTCAAAAACTTTC +AAAGAAACCTTCAGCCCCAACACACTAAACTTTGGGAGCACAGGTTGGCA +TCCAGAGGTAAACATTTGCTATAACTGATAACAGGAGAAGGATCCATTTA +TTCACCTGTTATCAATTACAGGCATTGTATTTAAAGATCAGATGTTTTAT +ATTTATTTCTTCAAATTTCATTCATGGTGCCATAAGTGAAGGTATCTCTG +TCCACCCTGAATATATTTTCACTCCCTCATCTCAGTCATTCCGAACAATT 
+CACACACTAAGATTACCCATGCTAAATGGGGATTCTTTTTTACTAGCCAA +TGTAGTACCTCAAATCCTTCCTTCCCTCCCCCTATTTCATCAGCAGGCAA +TTCTTTTGATACTTTTGTCAAGGGGAAATTGTGTGACTCAGAGATCTAGT +CCCCAAGAGAAACTAATAATGGGCTGGGTATTGTCTGTCTCAGCAGCATC +AGTGGGTCCCTCTCCTGTGCAGCTAATTAGCTTCCTTTCCAATATGAAGA +ATCTTATATATAGCTTTGTCTTTGGGGTATTACATAAATGAAGATTAAGC +TATCTGAATTTCTCCTTCTCCTAAAAATGCACATCCTATGACTGAAAAGA +CAGGTAAAAGAGATGCTTTTAATTACAAAACTTTCCCTGTCGTGGTTGCT +TCTCTCTATCCTTCTAAACTCCCTTTCAATTTCTTCTCTTCTGTAACATA +TTTGTGCCCAAAATCTTCTGCTTTCTGAAATATTTTATCTTTTTCTTCCA +CACTATCTCTTATTTTCCAATTTTAATCATTAAATTATATTATGTCTTAT +AAAACTAATCCCACATATAAACCCCTATGATAATTTCAGTTTGTCCCTAG +TATGAAGTTCTTTAAAGATGTGTAGTTTTCTAACTTTCATGCTCTCCAAT +TAATTATAAACTTCATTTTCCACTCTGAAAAGGAGATGTCTGATCTCAGC +TATTTCCATCCTATTTGAAAACCAGATTTAGTTTTAAACCAGAGGAAGGG +AATCTCAAGTCTTTACCTCCCACAGTCTGGTGTGATTCTCTCTCTTTTGG +TATTACCTTCCTCCACATTGGAACACTCCAGCCAATGCATAGGCTGAGAG +GCTATCTCAGATTCAGAAAGATTTGGCCTCATCCCAGGGGAGGGTACAGA +GGAGCTGATGACTATGAATTCTGAAATGGAACTGTTCCAGGTTGAAGAAA +TAAGAAAGGGAATTGGGAAGAGCAATGCCCAGTGAAAAAGAAGAAATAAT +ATTTTAGGAAGTGAATGCTAATTTTATTTTAAACAAAATAAGAACTCAAG +GAATAAGAGGGTTCTTCCAATAGGTTAGAGTGATCCTGTCAAACATATAT +GCTTCTAGATTTTTTTAAAGACTGTTTCTACTAAGAAAGCATAGACCGCT +ATTGAGAAAGATCATTAAACTGGAATTTAGGAGGTCTGCCTTCTGATTCT +GACTTCTTGAATGTATTGTTAGCCATTTAACCACACTGTGTTGTTTCTCA +TTCTACCTGTAGAATCTCAAAGTTCTTTCCCACTTCTATACAAAACTATA +ATTCTGAACATCCTTTTTGTTTAATATAAGTCTGCATTTCCTGTTTGAAG +ATATGTGTCCCAGACCCTAAATGACTGACAAATTTTAAATCTCCAATAGG +AAAGATGACAAACTCTATGGAAACTTGGCTTCTGAAGAACTCCTAGAAGC +TTTCCAAAGTCATCAGTGTTTCCTAAGAAGGCAGAGAAATCAAACACATG +GTCTTTTCCTCCAGACAAGCTCCTTTGGGTCATCAGGATTTCTTCAACAA +TAAAATGTAATAATTCCAAATGTTTGTAACAGAATGGGTAGGACTTTCTT +CACTTATTTAAATACTCCCTTTTTTATGCAACTGAGTTTTCATCAACAAG +TACAAGCTTGTGAAGGAGTACTTTAAAATGCAATTTCTCTCTATTTTTGT +GGGGGCTAATATTTTATTTCTCATATTGACAATTTATTATGCTGTTTTTA +AAAAGttcattcatcaagtatttcttgagctttttctatgagacaggcac +tgttttaggcaagtaattatgcactgaacaatgcaaaaagtttccctgca +ctcatggactttaattttacatttatgaaaagctacaaatattagaataa 
+gtaaaataCTGCCTGGAGGCTAAAGCATATTTTGATCACTTATTCCCTAA +TTCTTTTAGAAGAGAACTCACCTGTCGGTTAGCTGAACCACTGCCAGTGA +TATCCAACTATACATTCAATCCCACCATACCTCATTATCACACCTATTCA +CTCACAAGCTTAAACTCTTAACTTTTCTCCACATATCAGTGACTATTTCC +TACAGCTTTTCTTTTACTTTCCATGTTTGCAGTGACAATATACATAAACA +GTGTATGAAAACTCAAGTAAAATCTACTCTCTCAGGTGTTCATAATGTAT +CAATGTATATTGCTTTAAGCCTGAAGGTAACCTAAGTAAAGATGTACCAT +GTTCCACCAATGCTTCTTTTGATCATCATTTTATCCTGTTTTTTCTTTAG +GATTCTTTCTTATTCCTTCCCCTGACCCTTCTTTTATTCTCCAAATTTCT +TTCCAATTCATCTTTGTTCTTCCCTTTCCTTTTTACTCTCTTTAAACATT +CTATGGACTCTGCCTCCTTCACACTGATATTGAACGCCCATAGTTTCATA +TTTTGGATTGCGATTGTTTTATTTTAAAATGGCAAATGTTCATGTTATAA +AGAGAATTTTTCAGTCTTTAGACTAATAGGTTCATGTAGTTTGGGATTTT +CCTCTTTAAGAAAATTAATTATCACTCACACTCCAAGACAAACACCATTT +CAGTAGCAATATGAATTTCAGTAGTAATAGGAATCTCCAAATATGACAAA +GTAATTCAGACATTAATTGCTTTTGTTTTGGAATTGCTCTTATAAGATGA +AATATCACTTTCATGATGAGAGTCCTAGAGTGCTTGGTTTATATATTGTA +TCTTAGTTTTAACAGGATAAAACACTTGATCCTAAGCAGTAAACATGATT +CTTCAGCTTCAACTTCATTTCTTTATAAATAACTATTTATGAATTGGTGT +TGAGCTTAGTAAGTCACCAAACACCTTCTGCTCAGCAGCATAAAGGACAT +TTCCATGAAACCTCCCAGGGATAATCTTATTTACTCTATAATGTTTCCCG +GGTTCAATTCCTCTCCCAAAATTCTTTGTTCTTAAGCCCCTATGATCTGG +GTGATCTAAATATGGGTAAGAAGTCCAGGGATAGCACTATGAATGAAGTG +AAAATAGTAAAACATAGTTAAAAATGTAcagatgctctctgacttataat +agggttacgtcctgataaatccatcataagtcaaaaatgcatttaatatt +cctaatgtacctcacatcatagtttggcctagcctaccttaaatgtgctc +agaacactttcattagcttatataagatcacctaatacaaagcctatttt +ataataaaatattgaatagctcacgtaatatactgactactatactcaag +tacagtttcttctgaatgcatgtcactttctcaccattgtaaagtcaaac +aattataagtcaaactatcacaagccagggaccatcCATATGTATTTCAT +TCAGAAAATGCTGGAAAGAGCATTTCGGAGAATATCTAGATGAGAGAAGG +TAGAAAGCCATGCACAAATTCACTGAGAGTTTAAAAAAATACATGCATAT +TGTGGAGATAGAAATCAAATCTATTTGTCTCCATCTGCTGTATTCTTCCC +AAAATATTATCTCTTCTTATCCCATTGTACTATATTGCATTTCTTTGACC +ATTTATTGTGTATCTCTTAATATTTCCCACTTCATCATTACTAACCTCAC +TCACTCTGAACTTGATGAGAGCACCTGAGCATTAATTTTTCTTATAATTA +TTTAATGATTACCAGAATTCGTTCAGTATGGCCAGCTCTGGTCAAAGTGA +GGCAGGCAAGATGCTTTGTCAACTGCCTGGATGGAATGTCTCAAAAGGTT 
+TCCATTTCATGGTAGCATTATGCAAAGTTCAAGACGTTTAATCAAGACCC +TTCACTTACTTAACTATACCTCCTTGAGAATCCCATCTATGAAAAAATTC +TAGTCATTATAAAAATGATTGATTAAATGAGGGAAGTAGTAGAGTTCTTC +ATTTCTTTAGTTGGTTTAGTCTCCTATGAGTCAATCCTATTTTCAAAATT +CTTAATAAACCATTTATTCCTTCAACTTTCTATGCCATTTGATGTTTTGT +AAAAAAAAAAATATAATATGTATACAAAAAGATATTTCAAAATCTAGAAA +GAGAGCTTTAGAGCTTTGTAAAGCTCTTTTAAAAATCAAAAACAACTACT +GTTAATTAACATGTTGTACTATGCAATTTGTTTACCATTATTACTCTTGG +TATTTTTAAGAAAAGTCTTTCCATTGTTATTATAAATGCTTCTATTGATA +TTTATTTTAATAACTGTTATTACAGTCCGTCATGTACATACACTATACTT +AAAcctaatgtttggtatttaaatcgtttcaagattttatcactgtcaac +aaagtatgatgaatatttttatgctgaaaacttctgtaaaaatagaattc +caagagtattattgcaccaaaaggcatggacttaaaattcttgatacatg +atttcaaaatattttctttaaggtttgaatcagtctatattccctccagc +agcgtataaaagtgccaatttctctgatccttagccagtttgggtaataa +taattgtaaaacttttttttctttttttttgagacagagtctccctctgt +cgccaggctgaagtgcagtggcgcaatctcggctcactgcaacctccgcc +tcccggggtcaagctattctcctgcctcagcctcccaagtagctgggact +acaggcatgcaccaccatgcccagctaatttttgttatttttagtagaga +tggagtttccccatgttggacaggatggtctcgatctcttgacctcgtga +tccaccctcctcggcctcccaaagtgctgggataacaggcgtgaacaacc +atgcccggcctgtaaaactttttcctaatttaacagaaaaataatagtat +tatattttatcatatttctttgatttctaAGacacacatacacacacaca +cacacatatctgtatatacaaatacacgtatagcttacaTTTTAATTCTT +CATTTCATTTGTTCATTTATTAGGTCTTGGAGATTTTGTGAAACTGTTTA +AATTCTTTTTTATACTATGAAGATATCAACCTTTTGTCTCTACAGCATTT +CAAATTCAAGTATGATTCACGTGTTGGTTTGGGGTAGATCATTATAGGCA +CATGTAGGAAACAGCTTTCAGAGATGCCTTAACCGTAATTATGCATTTGT +ATTCTAATTTTTATTTAATGTTATTATTGATTGCATTTTTAAAGATTCTG +TATTTTTTAAACCATTTATTTGTATATGTTGGTATACAATCTTGCCATTT +TCTGGGATTTCATATTTCCTTATTTTTGTTTTTTACCTTTTTTGGCTTGA +ATTTTTTGAGTTTTTATGCATTCTTTTCCAGTTTCTTAAGATGCTAATAA +GTTCATGTATTTGAGCAATTGAGAACATTTAAAGCAATAGACTGCCTCTG +AGCACAGCTTTGTCCATATTACATTAACCTTTTATACCCTGGGTTCCCAC +TAGTTTTTAAATAATCTACTATCAAATAAAAGATTTGTTAATAATAAATT +TTAAATCATTAACACTTAACGCATTATTTTCAGTCACACTAAGTTGATTC +CTTCGTTTCTTTCAGGTTGCTTCAGAGTCTTCCCTTCTATCTGATTCAGT +GGACCAAGTAAATGACTCTCTGGTAACAGAATTTGTATTACTTGGACTTG 
+CACAATCCTTGGAAATGCAGTTTTTCCTTTTTCTCTTCTTCTCTTTATTC +TATGTGGGAATTATCCTGGGAAAACTCTTCATTGTGTTCACAGTGATCTT +TGATCCTCACTTACACTCCCCCATGTATATTCTGCTGGCCAACCTATCGC +TCATTGACTTGAGCCTTTCATCTACCACAGTTCCTAGGTTGATCTACGAT +CTTTTTACTGATTGTAAAGTTATTTCCTTCCATAATTGCATGATACAAAA +GTTCTTTATCCATGTTATGGGAGGAGTTGAAATGGTGCTGCTGATAGTCA +TGGCATATGATAGGTACACTGCGATCTGCAAGCCTCTCCACTATCCAACT +ATTATGAATCCCAAAATGTGCATGTTTTTGGTAGCAGCAGCTTGGGTCAT +TGGGGTGATTCATGCTATGTCTCAGTTTGTTTTTGTCATAAATTTACCCT +TCTGTGGCCCTAATAATGTGGGGAGCTTTTATTGTGATTTTCCTCGGGTT +ATTAAACTTGCATGCATGGACACTTATGGGCTAGAATTTGTGGTCACTGC +CAACAGTGGATTCATATCGATGGGCACCTTCTTTTTCTTAATTGTATCAT +ACATTTTTATTCTGGTCACTGTCCAACGACATTCCTCAAATGATTTATCC +AAAGCATTCTTCACTTCGTCGGCTCACATCACCGTAGTGGTTTTGTTTTT +TGCTCCATGCATGTTTCTCTACGTGTGGCCTTTCCCTACTAAGTCATTGG +ATAAATTTTTTGCCATCATGAACTTTGTTGTCACCCCTGTCGTAAATCCT +GCCATCTATACTTTAAGGAACAAAGATATGAAGTTTGCAATGAGAAGGCT +GAATCAACATATTTTAAATTCTATGGAGACGACATAACACATTTGGTTGA +TGAGAGCACAGGATAAATGCCATGGACCATCAAGACTCCTGTGATCACCA +TGATCACTATGGAACGCGCACATTTTTAGTATTGCCTGAAAAAACTGAAA +AATCTGCAAAAAGGATGCATTAAATCTAAGAATTGTATTTCAGATAAAGT +TGCAACATTTTTTGTTAATCATAAAAAGTATATATTTCTATCTAATGTGT +GTATCTAATTAACAGCAATGACTACCTTTAATTTTGATGTAGTTATTTTA +TATCTGTATATAAGCACATACACATATATATGACCTAGGTTTATTTATCA +GTATTTTTATGCTGATAATAAGCATCACTGGAAATTAATTTTCTTATGGA +AATTATGTGGATCCAATGGATAAAATATGAGTTTATATAAATTAGTAAAT +GCCAAAATCAAGGAAGAAACAATTTTTATTTTAATTGTACTTTAAGTTAG +ATAAATGGTAAGGTCAACAGCTTGTTACAACCCTTAAGTATTATTTTCAG +GCTGATTGTCAATATGTTTTGTACAatgttctcacttataggtgggaatt +gaacaatgagaacacatggacacaggaaggggaacatcacacaccggggc +ctgttgtggggtggggggaagggggagggatagcattaggagatataact +agtgttaaatgacgagttaatgggtgcagcacacccacatggcacatgta +tacatatgtaactaacctgcacattgtgcacatgtaccctagaacttaaa +gtataataaaaaaaaaTAGACTCTAGTACTCTGTATTATGCAAAATTTGT +CTATGTTACACTTTTTTAACAACACAATCCTATTGCCCTTGAAATCTTCT +TCAAAGCATTTCTCGAGTCACTCTTAAAAAGCATCTACAACCTAAAAGTA +TAGGAAGAGATTTATTTCCTGGAGAAGAGACCCCATTGAGATCTTAAAAG +CACATTTAATGTGCCTGTGCTTAACTTAAGGTGCTTAGGACAAAGAAGGC 
+GATTGACATCTTTCAGGTAAAACCTGGTAAGTTTGGTGGTCAAGGAACAC +AACTGAGACATCACTTGGATGTATTCCTATGACTATTTTAAGAAACATAA +ATTGTGGTGACTCACTCAGCTCACTTTTAACTACTGCATGGTAATTAAAG +ATGCAAAATAAAATAAGTTACAAGAAGTGAGGTTTTTTATTGGTTAAAGC +AATTTTTCTATATTTTCTCCGCAAGTTGGTCATAAAAGTTCTAAGCATTC +CTCTTTTTATAAAATCGAAGCATTATTACTTACTCTCTTGTTAACCTATC +TGGATTTTAATTTTGTAACTTTATTATATTTGTTTTGCTGTGATTCTTTA +AAAAGCACCTTTAGACTCAGTGAGATAGCAAAAATATCCAAATAGGCCAA +AAAATTGTGGCAATGTCCTCTCACTCAGGAAAATTCTGTGTGTTTTCTCT +AATGGCCAAGGGAAAACTTGTGAGACTATAAAAGTTAGTCTCAGTACACA +AAGCTCAGACTGGCTATTCCCAGATCTCTTCAGGTACATCTAGTCCATTC +ATAAAGGGCTTTTAATTAACCAAGTGGTTTACTAAAAAGGACAATTCACT +ACATATTATTCTCTTACAGTTTTTATGCCTCATTCTGTGAAAATTGCTGT +AGTCTCTTCCAGTTATGAAGAAGGTAGGTGGAAACAAAGACAAAACACAT +ATATTAGAAGAATGAATGAAATTGTAGCATTTTATTGACAATGAGATGGT +TCTATTAGTAGGAATCTATTCTGCATAATTCCATTTTGTGTTTACCTTCT +GGAAAAATGAAAGGATTCTGTATGGTTAACTTAAATACTTAGAGAAATTA +ATATGAATAATGTTAGCAAGAATAACCCTTGTTATAAGTATTATGCTGGC +AACAATTGTCGAGTCCTCCTCCTCACTCTTCTGGGCTAATTTGTTCTTTT +CTCCCCATTTAATAGTCCTTTTCCCCATCTTTCCCCAGGTCCGGTGTTTT +CTTACCCACCTCCTTCCCTCCTTTTTATAATACCAGTGAAACTTGGTTTG +GAGCATTTCTTTCACATAAAGGTACAaatcatactgctagagttgtgagg +atttttacagcttttgaaagaataaactcattttaaaaacaggaaagcta +aggcccagagatttttaaatgatattcccatgatcacactgtgaatttgt +gccagaacccaaatgcctactcccatctcactgaGACTTACTATAAGGAC +ATAAGGCatttatatatatatatattatatatactatatatttatatata +ttacatattatatatataatatatattatataatatatattatattatat +aatatataatataaatataatataaattatattatataatatataatata +aatataatataaattatataaatataatatatattttattatataatata +atatatattatataaatataatatataaattatataatataatatatatt +atataatataatatattttattatataaatatatattatattatataata +tatattttattatataatatatattatatatttatagaatataatatata +ttttattatataatatatattatataatatatattatatttatatataac +atatattattatataaaatatgtataatatatattatataaatatattta +tatattatataaatatatatattatatataatTCTAATGGTTGAATTCCA +AGAATAATCTATGGCATGAAAGATTTTACCTGTCAACAGTGGCTGGCTCT +TCATGGTTGCTACAATGAGTGTGTAAGATTCTGAAGGACTCCTTTAATAA +GCCTAAACTTAATGTTCAACTTAGAATAAATACAATTCTTCTAATTTTTT 
+TTGAATAATTTTTAAAAAGTCAGAAATGAGCTTTGAAAGAATTATGGTGG +TGAAGGATCCCCTCAGCAGCACAAATTCAGGAGAGAGATGTCTTAACTAC +GTTAGCAAGAAATTCCTTTTGCTAAAGAATAGCATTCCTGAATTCTTACT +AACAGCCATGATAGAAAGTCTTTTGCTACAGATGAGAACCCTCGGGTCAA +CCTCATCCTTGGCATATTTCATGTGAAGATATAACTTCAAGATTGTCCTT +GCCTATCAATGAAATGAATTAATTTTATGTCAATGCATATTTAAGGTCTA +TTCTAAATTGCACACTTTGATTCAAAAGAAACAGTCCAACCAACCAGTCA +GGACAGAAATTATCTCACAATAAAAATCCTATCGTTTGTACTGTCAATGA +TTAGTATGATTATATTTATTACCGTGCTAAGCAGAAGAGAAATGAAGTGA +ATGTTCATGATTTATTCCACTATTAGACTTCTCTTTATTCTTAAAAATAT +TTAAGATCACTAAATTTTTATAGGACTTTAAAAACAGTAATGTGCTGCTT +TGAGTGTGTAGGACTAAGAAATGGGATTCAGAGTAGTAAAGAGAAAAGTG +GAATTTCCAAGCACTATGAATTACTGTTCTTTAAAAAACAGCAAAAATCA +AATAACAGTATTCCTCCAAAAAAGATGGCAAGTGTAAACTCTATACCTTC +ATGTCTCCCGTGGAATGTTAGTGATCAATTTCCACTTCTCTCTTTTACAT +CTTACTTGCCCATTAACTCTTATACCTAATCCAAAGATTGTTAATATGGC +TATGTCTCACTTTCAGGACACCTTTTATTTGTTACTTCTCTTCACTGCAA +AACTTCTTGAAACAGTACTTATTTTCTCTCCTCCATACACAATTGAAATG +GCTCTCAACTCATGCCCAGAAGTCAGTGTTCAGTCTCTCACCTGGCAGAT +AGCAACTTACAAAGATGCCCCAACAATACCTCCTTGTGTCTAGACAGTCA +TCATTATCCTTTACCTTTTTCTGTATTTATTTCTGCTCCTAAAAGGGATC +TCTATGTAAAGTATTGTTATACTAGTGCTTGTTATAATTATTATCAGAGT +TAAAGCCATCACAATGTTCCCAATTACTTAAAGACATTGGAATAACATTT +TTTTTATTTTCCACATCTTGCCAAAAAATATTTTGTTATCAGTACCTTaa +taatggctattatatattgaccattactatttgctagaaaatttatatac +ctggtcgtatccaatcctcacagaacttctataaagttgtgctattatca +cctatattttccagatgtggccgtaagactgaaatcacttaggtgacttg +tctaaggtcattcagatacatagtagataacccaggatttgaacacaggc +ctcctagcacacaagctcatatcttaactactttaatacgttgctcGATG +GGATCTTACAGGTCTTCATTCACCCCTTTCCTGCTCACACAACCACAACC +TGCAGCTATTACCTATTGTTAGGCTTAAAATAATTACTTGGCTTCATTTC +CAAGCTCCCTCCCTTCCAATTCACATTGAGTCCAGAGCTAAATTAAACAA +TCATTCAAAATTTTTCAGTAGTTCTTGTCTCTATAATAAAACAGAAATGC +TTTAGAAAGCATTCCAAAATCTCTTACCAGTTTTATCTCCTATGAAAGTC +CTTCACACTTTCTCTCATTTAAACTTTATTGCATTTTCCTCACTTTTTCT +CACTTCACTTTTGAATTCCCTATTCTTTTATCCTCTGTTAATTTTTAAGT +ATTATATTTGTGATATTATTTTTTCTTTTTTTCTATTTTTTATCTTTCAT +TTCATTTTGGCCTATTTTTTTCTCTTAAGAACTTTAATATCACCAAATAA 
+CATGTGTGCTACAAACTGTTTTGTAGTTCAAAGAAAAAGGAGATAAACAT +AGAGTTATGGCATAGACTTAATCTGGCAGAGAGACAAGCATAAATAATGG +TATTTTATATTAGGAATAAACCTAACATTAATGGAGACACTGAGAAGCCG +AGATAACTGAATTATAAGGCATAGCCAGGGAAGTAGTGCGAGATAGAATT +ATGATCTTGTTGAATTCTGAATGTCTTTAAGTAATAGATTATAGAAAGTC +ACTGTAAGAGTGAGCAGAATGATATAAAATGAGGCTTTGAATTTGAATAT +AATAATTCTGACTTCCTTCTCCTTCTCTTCTTCAAGGTAACTGCAGAGGC +TATTTCCTGGAATGAATCAACGAGTGAAACGAATAACTCTATGGTGACTG +AATTCATTTTTCTGGGTCTCTCTGATTCTCAGGAACTCCAGACCTTCCTA +TTTATGTTGTTTTTTGTATTCTATGGAGGAATCGTGTTTGGAAACCTTCT +TATTGTCATAACAGTGGTATCTGACTCCCACCTTCACTCTCCCATGTACT +TCCTGCTAGCCAACCTCTCACTCATTGATCTGTCTCTGTCTTCAGTCACA +GCCCCCAAGATGATTACTGACTTTTTCAGCCAGCGCAAAGTCATCTCTTT +CAAGGGCTGCCTTGTTCAGATATTTCTCCTTCACTTCTTTGGTGGGAGTG +AGATGGTGATCCTCATAGCCATGGGCTTTGACAGATATATAGCAATATGC +AAGCCCCTACACTACACTACAATTATGTGTGGCAACGCATGTGTCGGCAT +TATGGCTGTCACATGGGGAATTGGCTTTCTCCATTCGGTGAGCCAGTTGG +CGTTTGCCGTGCACTTACTCTTCTGTGGTCCCAATGAGGTCGATAGTTTT +TATTGTGACCTTCCTAGGGTAATCAAACTTGCCTGTACAGATACCTACAG +GCTAGATATTATGGTCATTGCTAACAGTGGTGTGCTCACTGTGTGTTCTT +TTGTTCTTCTAATCATCTCATACACTATCATCCTAATGACCATCCAGCAT +CGCCCTTTAGATAAGTCGTCCAAAGCTCTGTCCACTTTGACTGCTCACAT +TACAGTAGTTCTTTTGTTCTTTGGACCATGTGTCTTTATTTATGCCTGGC +CATTCCCCATCAAGTCATTAGATAAATTCCTTGCTGTATTTTATTCTGTG +ATCACCCCTCTCTTGAACCCAATTATATACACACTGAGGAACAAAGACAT +GAAGACGGCAATAAGACAGCTGAGAAAATGGGATGCACATTCTAGTGTAA +AGTTTTAGATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAA +ATATAGTGAAGTTGGTAAGTTATTTAGTAAAGCTCATGAAAATTGTGCCC +TCCATTCCCATATAATTTAGTAATTGTCTAGGAACTTCCACATACATTGC +CTCAATTTATCTTTCAACAACTTGTGTGTTATATTTTGGAATACAGATAC +AAAGTTATTATGCTTTCAAAATATTCTTTTGCTAATTCTTAGAACAAAGA +AAGGCATAAATATATTAGTATTTGTGTACACCTGTTCCTTCCTGTGTGAC +CCTAAGTTTAGTAGAAGAAAGGAGAGAAAATATAGCCTAGCTTATAAATT +TAAAAAAAAATTTATTTGGTCCATTTTGTGAAAAACATAAAAAAAGAACT +GTCACATCTTAATTTAAAAAATATATGCTTAGTGGTAAGGAGATATATGT +CAACTTTTAAGAGGTTGAAAAACAAACGCCTCCCATTATAAGTTTATACT +TCAcctcccaccactataacaacccagaatccatgagggcattatcagga +gtgagtggaagagtaagtttgccaatgtgaaatgtgccttctaggtccta 
+gacgtctgtggtataactgctcataagcagtagaaagaatttagagggat +ccaggctctcatcacgttggcacaaagtatattacttggatccatctatg +tcattttccatgGTTAATGTTTaaaagcacaggctttaaagtaaaaaaca +aagagctggattcaactctactgactcttattaatcatgattttgggcac +attacgtagctttcatgagctttagtttctacatttataaacaggagatt +atacctattatgcatggttattatgaaggaaaatgacaaaatagatataa +atcaaatagcccacttcgagacatattaagcatgaataaacattagatac +tattaAAATCCTATATATTAACAAAGCCAAAAGTTTCAAACTTTACTTTT +TCCCAACATTCTTGTGAAATATGACACATCCCAATCTTAACAGATGCTCA +TTTGGGATACTGTACTTGTGAGTGGAAGTGTGTATATTTGTGTGCAAGTG +TGTACTCATATACTTCCACCTTACCACCCTAGAAAGGCATGATGAAAATT +TAAGATAGAAGGAAAATATAAATTGAAAAAAAAAAACCTTAACAAATGAT +TCTGACAAATATCTTCTCTTTCCAGGGAGAATCACTGAGCCAGAATAAAA +TTGAACACTAAATATTCTAAGAAAAAAGGAATCTAGTTTGTCAAAATGTG +ACTTGAATTAATAGATAAGGAGAGTCAGATGATAAGAGGGTCAAAATTAT +GTTTATCTTAGGAAAAGTagaatagaaaatttataagcagattaaaaaca +cataataaaagtagtaaataataatgacagtatctcaaatcagtgcaggg +gggaaaggcctactaatgtgatggtgggataattggatagcaatatggga +aaagatatatttaatttatttgctacaccaaatgccaggacaatctctaa +gtgaattcaagacataactcttttttcaaaaaaactatgcaaatattaaa +agaaaacaagttaatgtttttataatctatgaatatggtaaagatGGATA +ACATTGACTATCAAATTAATTTTTAATGCGTAATAAAACTATGAGAAAAT +TTAAAAGTGAGAAGAAACTACTTGTAACTCACATAATAGActagtacttc +taacacatagggaacttctaaaacaaaacccaaaatattaataggaaaat +gggcaaaacagttaaacttacagttcataCATAAGGAGAATCAGTCTTTT +TTTTTTTTTTTACAGTTGTAGGCAGAAAACTTTTATTTTTCATTTATTTG +TAAAATTTACCCCTAATTTATTCATAATTCATTTAACTGCTAAGGGCATT +AATGTGTACAACGCCATGGGAGAAACCAGTATATTCAGAATTTCTCCTGA +AATTTGACCAGAAGTTATGGGCATCCCTCCCCTGGGAAGGAGGCAGGCAG +AAAAGTTTGGAATCTATGTAGTAAAATATGTTACTCTTTtatatatatac +atatatgtgtgtatatgtgtatatatatatacacacatatatacatacat +acatacatacatatTATCTGAATTAGGCCTGGTCttttttaatactttaa +gttctgggatacatgtgcagaatgtacaggtttgttacacaggtatacac +ctgccatggttgtttgctgcacccatcaactcaccatctacattaggtat +ttctcctaacgttatccctctccttgcctcccacctcccgacaggccctg +gtgtgtgatattcccttccctgtgcccatatgttctcattggtcaactcc +catttatgagtgagaacatgcggtgtttggttttctgttcttgtgttagt +ttgcggagaatgatggtttccagcttcatccatgtccctgcaaaggacat 
+gaactcattcttttttatggctgcaagaaatgcaaatcaaaaccacaatg +agatgccatctcacaccagttagaatggcaatcattaaaaagtcaggaaa +caatagatgctggagaggatgtggagaaataggaatgcttttacactgtt +ggtgggagcgtacattagttcaaccattgtggaagacagtgtggtgtttc +ctcaaggatctaaaactagaaataccatttgacccagcaatcccattact +gggtatatacccaaacgattgtaagtcattctactacaaagacacatgca +caggtatgtttattgcagcactattcacaatagggaagacttggaaccaa +cccaaatgcccgtcaatgttagactagataaaatgtggcacatagacCTG +GTCTTAAAATCAAGAACAGAGATTGTTACTTTTACATCCATTCCTAATTG +ATAAACCATTCAGTTATACCACATCTTAGCTTCTGGACTACAATGACCAT +ATTTGGGGttttctttctaatttcattataggttcagagggtacatgtgc +aggtttgagacaaaggtatattgcatgatactaaggtttggagtacaaat +gattccacctcccaggtagcaagaataatacccaatatgtagtttttcaa +ctctttcccctcttcctccatcctccctctgctactctgtggtgtctgtt +tttctcatctttatgtccatgtgtactcgatgtttagctcccccttgtta +ggtgagaacatgtggtatttggttttctgtttcagtgttaattcacttag +gataatggcctccaactgcattcatgctgctgcaaaggatgtgactttct +tcttattagctgcatatattttgtggtggatttgtaccacatttacttta +tctagtccaaagttgttgggcacccaggtggattccatgtctttgctatt +gtgaatagcactgggacaacccatacaagttcatgtgtctttttggtaaa +acaatgtattttcctttgggcatatatgcggtgatggaattgctggatcg +agtggtagtttaactcttagttctttgagaaatccccagactgttctcca +cagtggctggactaagttgcattcccaccagcagtgtagaagtgttcccc +attctctgtagcctcaccagcacatgttAAACTATCTttaaatatatgaa +aaaaatgttcaagtctctcagattaagatgcatgcaaagtaaaatgatac +ttaaatatcagttctaacctataaaatatcaaatatctgacctcaatatt +tgataatccaacctgttgatgaagctgtagagagaggcaccctTtttttt +ttttttaattatactttaagttttagggtacatgtgcaccttgtgcaggt +tagttacatatgtatacatgtgccatgctggtgcgctgaacccactaact +cgtcatctagcattaggtatatctcccaatgctatccctcccccctcccc +ccaccccacaacagtccccagagtgtgatattccccttcctgtgtccatg +tgatctcattgttcacttcccacctatgagtgagaatatgcggtgtttgg +ttttttgttcttgcgatagtttactgagaatgatgatttccagtttcatc +catgtccctacaaaggacatgaactcatcattttttatggctgcatagta +ttccatggtgtatatgtgccacattttcttaatccagtctatcattgttg +gacatttgggttggttccaagtctttgctattgtgaataatgccgcaata +aacatacgtgtgcatgtgtctttatagcagcatgatttatagtcctttgg +gtatatacccagtaatgggatggctgggtcaaatggtatttccagttcga 
+gatccctgaggaatcgccacactgacttccacaatggttgaactagttta +cagtcccaccaacagtgtaaaagtgttcctatttctccacatcctctcca +gcacctgttgtttcctgactttttaatgattgccattctaactggtgtga +gatgatatctcattgtggttttgatttgcatttctctgatggccagtgat +gatgagcattttttcatgtgttttttggctgcatagatgtcttcttttga +gaagtgtctgttcatgtccttcgcccacttgttgatggggttgtttgttt +ttttcttgtaaatttgtttgagttcattgtagattctggatattagccct +ttgtcagatgagtaggttgcaaaaattttctcccattttctgggttgcct +gttcactctgatggtagtttcttttgctgtgcagaagctctttagtttaa +ttagatcccatttgtcaattttgtcttttgttgccattgcttttgTCcca +ccgatcccacagaaatacaaactaccatcagagaatactacaaacacctc +tacgcaaataaactagaaaatctagaagaaatggataaattcctggacac +atacactctcccaagcctaaaccaggaagaagttgaatctctgaatagac +caataacagaagctgaaattgtggcaataatcaatagcttaccaaccaaa +aagagtccaggaccagatggattcacagccgaattctaccagaggtacaa +ggaggaactggtaccattccttctgaaactattccaatcaatagaaaaag +agggagtcctccctaactcattttatgaggccagcatcattctgatacca +aagccaggcagagacacaacaaaaaaagagaattttagaccaatatcctt +gatgaacattgatgcaaaaatcctcaataaaatactggcaaaacgaatcc +agcagcacatcaaaaagcttatccaccaagatcaagtgggcttcatccct +gggatgcaaggctggttcaatatacgcaaatcaataaatgtaatccagca +tataaacagagccaaagacaaaaaccacatgattatctcaatagatgcag +aaaaggcctttgacaaaattcaacaacccttcatgctaaaaactctcaat +aaattaggtattgatgggacgtatttcaaaataataagagctatctatga +caaacccacagccaatatcatactgaatgggcaaaaactggaagcattcc +ctttgaaaactggcacaagacagggatgccctctctcaccactcctattc +aacatagtgttggaagttctggccagggcaattaggcaggagaaggaaat +aaagggtattcagttaggaaaagaggaagtcaaattgtccctgtttgcag +acgacatgattgtatatctagaaaaccccattgtctcagcccaaaatctt +cctaagctgataagcaacttcagcaaagtctcaggatacaaaatcaatgt +acaaaaatcacaagcattcttatacaccaacaacagacaaacagagagcc +aaaccatgagtgaactcccattcacaattgtttcaaagagaataaaatac +ctaggaatccaacttacaagggacgtgaaggacctcttcaaggagaacta +caaatcactgctcaaggaaataaaagaggatacaaagaaatggaagaaca +ttccatgctcatgggtaggaagaatcaatatcgtgaaaatggccatactg +cccaaggtaatttacagattcaatgccatccccatcaagctaccaatgac +tttcttcacagaattggaaaaaactactttaaagttcatatggaaccaaa +aaagagcctgcattgccaagtcaatcctaagccaaaagaacaaagctgga 
+ggcatcacgctacctgacttcaaactatacgacaaggctacagtaaccaa +aacagcatggtactggtaccaaaacagagatatagatcaatggaacagaa +cagagccctcagaaataatgccgcatatctacaactatctgatctttgac +aaacctgagaaaaacaagcaatggggaaaggattccctatttaataaatg +gtgctgggaaaactggctagccatatgtagaaagctgaaactggatccct +tccttacaccttatacaaaaatcaattcaagatggattaaagacttaaac +gttagacctcaaaccataaaaaccctagaagaaaacctaggctttaccat +tcaggacataggcatgggcaaggacttcatgtctaaaacaccgagagagg +cactcttatgcattgttggtgagaatacaaaatggtacaactcttggcaa +tatcttaaaaaatttacatggtactgacttttggtctagcaatcctactt +ctatcctaaagatatattggcaaaaatacaaaataattgatgcactcaag +tctattcattgaagcattgtttttcatagtaaacggaaagtaggccgggc +gtggtggctcatgcctgtgatcccagcattttgggaggctgaggcgggca +gatcacttgaggccaggaattcaagaccagcgtggctaacatggcgaaac +cccatctctaccaaaaatacaaaaattagctgggcgtggtggtgcacact +tgtaattccagctacttgagaggctgaggtgggaggatcgcttgaacctg +ggaggcagaagtttcagtgagcccagaacgtgcctctgcactccagccag +gatgacagagcaagactccatctcaaaaaaaaaaaaaaaaaaaaaggaaa +ataaccaaatgacaattagtgagtactacttgcaaaacttgtacgcaata +gagtatgaagcaactataaaatgagagagaaatatctccaaatactactc +taaagtaatctacaaggtataccttaactgaaaagaaacaaaaaagtgac +accagaatgctatttttatgttaaaacagggataaatacattggatttac +atgcatatataagtatatattttataaatgtttaaataagcatacttaaa +atggcaaaaacgtaatacatatataattttcttatggcaggaggaggaaa +cagggcaaggcacagggataaaagttattctgaatacatcttattttata +tttttgactttgaaatcctgtagctgttttatgtaatataaaaatgtaat +taaattaacagaaaaaaattacaactgctaaaaatcaaGATCTGGCATTT +TAATTAAGTTATAAAACATCGGAGAAAAGAATTGTTTCATGGGACACTAA +CATACAGACAAATTCAtttggaacccaatgaattaatgggcctaagataa +caaccaatagaagctaaaatgacgaataactgtttcagaagaaaacatat +atggaatgaatcagctgaaaatacctgaacctactgatcaatttttatat +cacatgaagtgaatacacataaagtataatatggagcacatagaaccaac +tagaaatgagcctaattgttaaatattctctattttatgacaatatacag +gaaatatgtcgaagagagaaacatgcaagaacaccgtagggtttaataag +ataatcacaaggtatggaatattcaacaggatgagtatcctggattattc +agcaaatacacagagctaaaaagcaggagaaaggaattcatatatatttt +taaaaactaaaaagatatattagctgatgcaactttgaaacttctttaga +tcctgattcaaatagagcaaatttaacaaatatatttgaaactattaaaa 
+taatttaaaaatgaccaagtatttgattatatcaaatatagacaataata +accttgaatgtacatggattaaatgtccacttaGGggctgggtgtggtgg +ctcatgactataattccagcactttgggaggccaaggcagaaggattgct +tgaggtcagaggttcaagtgcagcctggtcaacacagtgaaaccctatct +ctacaaaaaacaaacaaaaatAAAAAATTAACTAATTTTAAAAAATATAT +ATTTCTTCtaaattctccacctgaaagatatagactgactgaatgaattt +taactatgatctgactatgtgcttccctgaacaaatgcactttacctgta +aaacacatattaactaaaagaaaagagatggaaaaaggtattccatgaac +agaaaccaaaatgagtaggagtagctatacttctgtcagacaaaacagac +tttaagtcaaaactagctttagaaaaaagacaaaaatgcttattatacaa +cgataaaggaatcaatccagaaagaggatataacaattttaaatatatat +gcagccaacactggagcagccagattcataaagcaaatactactagatca +aaacagagaggtagactcaaatataataatagtgaaggacttcaacaccc +cactttcagcattaaacagatcatctaataagaaaaccaatctcgcagcc +ctcaccctggagagtccacaggtaccaggggttggtctgaacccccagca +cagagcacctgcctcacagaagagtggctgcatttttcttcctgcagttt +tcagtcctcacttctccttaccaagcagggccacctggcctgggactccg +gtacaactaccctgccccccacctgacgacttcaataagaagtagcccag +catttctccaaggaggaaataccagagtcaattcacaaccactgcaattg +cagtggtaccaccataacagcccttgggctgcagaaggaactaagagtct +agtcactacagtggcaccttcagcacaccacagccaccatacagagagga +atccagccccctcccctgggaacccccaccacccactccaccaggcacag +cacccagctcataactgcagatcagttgccccacccacagctgagcttac +ctactggcagtggcccagactttccctagggagaggctcccagaggcaaa +cggcagcctctctgcccgtgtcacagcagcagttctatccatgctgtcct +caggcttggaaagaaacaaagcgcctgaaggctgcacctgaacttacagc +atgccacagttcccatatggagaggagaccagtctctcctcccagtgagc +cctaaaccccctgatccccaacaagcagagccctaacctcacaccagcag +tacagctgccccatcccccaggctgaacattcccagtaatagcagctcca +cctggagatggaacccccagggtcaactaaaagcccctctgccactgcct +ctacagtggtactacccctgctacccttgaactaacaaaggagcaaagac +cccagtgctttatccacacctccaacaagctgcagtcgaccacaaagaag +aaacacgtctgtctcccatgggtcctacccacaccccctgctgttcacca +tggatgatagagtcaacagtgtgaaaacgaccatactgccaaaagcaacc +tacaaattcaatgcaattcccatcaaaataccaccatcattcttcacaga +actagaaaaaacaaggctaaaattcacatggaaccaaaaaagagcccaca +tagccaaagcaagactaagcaaaaagaataaatctagaggcatcacatta +ctcgacttcaaactatactataaggccatagtcaccaaaacagcatggta 
+ctggtataaaaataggcatatagaccaatggaatagaataaagaacccag +aaataaagccaaatactttcagccaactgatctttgacaaagcaagcaaa +aacataaagtggggaaaggacaccctattcaacaaatggtgctggtataa +ttggcaagccacatgtagaagaatgcaactggatcctcatctctcacctt +ataaacaaatcaactcaagatggttcacagacttaaatctaagacctgaa +accataaaaattctagaagataagattggaaaaacccttctagacattgg +cttaggcaaagacttcacaatcaagaacccaaaagcaaacacaacaaaac +aaagataaatagatgggacttaattaaactgaaagccttctgcacatcaa +aataaataatcagcagagtaaacagacaacccacagagtgggagaaaatc +ttcacaaactatgcatccaacagaggactaatatccagaatctacaaaga +attggaacaaatcagcaagaaaaaaaaccaaacaCAAGGATGACAGTGGA +AATACAAAAACAAGACATAAATATTCTGAATAGTGATAATAAAACAGTGC +ATACCAGAATAcaaactgtttccaagttacaatggttcaaccatttttca +gctttatggtggtgtgaaagtgatatccattcattagaaaccatgctcca +ggatgggcgcagtgggtcacgcctgtaatcctagcactttgggaggccga +ggagggcggatcacaaggtcaagagatcaagaccatcctggccaacatgg +tgaaaccccgtctctcctaaaaatacaaaaattagctgggcattgtggtg +cgtgcctgtaatcccagctattcgggaggctgaggcaggagaatcacttg +aaccagggagtcggaggtgttgcagtgagccgagatcgtgccactgcctc +cagcctggcaacagagtgagactccatctcaaaaaaaagaaagaaaccct +actccgaattttgaattttgatattttcctggactaccaatatgtggcac +aatgctctctcacaatgttgtgcaacagcggtgagctgcagcttccagtc +agctaaatgataataaaggtagataatccatcttgatatcttcctgaaga +acataatgcctgcctaccatcaacaggcatcaatactttctaccagctat +tctcaaccctcatgatcggaagagacagagactgactgtgtcaaagtatt +agtcccatcattcagcaattaactttagctcaatgcttcaaaaattcttc +aggccctgtgtaatttcagctacgtacattaatgatgagtacccatacaa +ccattctgtttcttattttcagtaccatatttaataaatatcagttattc +aatactttatttagacattttgttagattattttgaccaactgaagtcta +atctaaatgttctgagcatgttcaaagtaagctaggccaacctataattt +tcggtgtgctaaatgcatttttaacttatgatattttcagtttacggggg +tttgttgagacataacttcatcatacatcaaggagcatctgTAtatggga +tatagttaaagcagtgatcagaggaaaatctatagccttaacacatttat +taataaaagtgtaggaattaaattatcagctgaaaaatgtaaaaagtatc +taaaagagtaagcagaaagtacaagaaagaacccaaagtagaaaaaagtg +aaaattaataaaataagaagccaaaaaacagatcaaatcagtaaaccaaa +aatcttgttctttaaacaaatcaacaaagttgacaaaaaaattagatctt +ttaatcatgaataaaaaaaagagaaagcacaaaaatgaataaggaatggt 
+gagagaaataactattgataatcagcaaataaaaaatcattaaaaacaat +gttgttcacatctatgaaaaacattgaaagctagagggaatgggtaattt +tctagaaaaatacaattcaccacaactgacttcaaaaaaaaaaaaaaaaa +aaaagaagtaccgcacttatgtgagcaatttccatagagaaatacagttg +tcatggaattataacacacacacaaacactaggtttagatgttttcacag +agaattccaccaaacctttaGAAATCAGATCGTCCAaaggcaaattaaca +actctcagccatttgaggcaaaatattacaattgaggcaagatatactgt +actgaaaacttgaggaaaaagcaggagagaaagttcctttgggaaattcg +aatactcaaaagtgcttacatacaatgaaaaatttggaaatccataagca +tggccaaggtgggacacatgctcagaaaaggcctgagaagacactaataa +ctcacctttagtaattcctaggctcacagcaagaaaaaatgaaggctaag +gcagaattatatatggctccgctaagtgttgagggagccccaatacagag +tcagtaagcaaagtctgggagaagtttttcatatttttttctttcttggc +tccttgcagtcaaggaaatcatttttaaatcactaaatgctaaatgaaca +caagctaaaggaaccgagccttcaaacatcaaatataaaaaagaatgcag +atattacaaaaccagtttacaaaagttactaaacaaataaaaactacatc +ccacagtgggtaacaaaaataaccttgaagaagggaaaaatttggtttcc +agaataaacacattataatatccaaaatgtccagttttcaacaaaaatta +agaagcatgcaaataaacacaaaactatggcccatttacagaagaaataa +atgagactctccctgagtaagcagatattgaaaatattagacaaaaactt +tatataactgtcttaaataaacttaaagagctaaagaaacccaagagaat +gacatataaataaataagaaatatgaatttttttaaaggtacaaaaaaat +tctgaggctgaaaagtacaataagtaaaaagttactttttacttagggtt +ccaatagaagatttgagcagctggaaaaaagaatcagtgaacttgataga +tcaaatgaaatgattcagtctgaagagcaggaaaatgaaagaatgacaac +aaaaaagaatagagcctaaagacctgtgtaacaacatcaagaatgcctac +atacagaatcctggtggggagtgaggggcaggaagactatttgaagaaat +gtgtttgaaagcttcccaaatttcactaaaaacaaatatatacattcaaa +aagctcagtgaacttcatcaaggaaatatacaaagatattcacaccaaga +cacactatgtttcaaattgtcaaaaggcaaagcgaatgtttgaaagcagc +aagagaaaggcaacgcgtcatttacaaaggatcctcaataagtttgacag +cagatagtgcattataagccatggatgccagaagAGCTTAGgaaaaaggc +aacgcgtcatttacaaaggatcctcagtaagtttgacagcagagagctca +ttataaaccatgggtgccagaagagcttaggatgacattttaaagttctg +aaagaaaaaaacactgtcaaccaaaaattctataacttggaagatgcccc +ttcaagtattaaggataaattacacattcccagattaaaaaaaagaaaga +gagagagagagaaagagaaagaaagaaagagaaagaaagaaagaaagaaa +gaaagaaagaaagaaagaaagaagagaaagaaagaaagaagaaagagaaa 
+gaaagaaagaaagagagagagaaagagagagaaagaaaaagaaggaaaga +aagaaagaaagaaaaaagaaagaaaaagaaagaaagaaagaaagaaagaa +agaaagaaagaaagaaagaaagaaagaaagaaagaaagaaaagcaagcaa +gCTTTAAAAGTTCATGTTTGGTAGGctgtacttcaagatacacttttaaa +aaaaagactccttcagatacaaactaaaaaacactagaaagtaactcaaa +accacataaagaaataactccagtaaagataactacataggtaaatataa +aagcaattatcacattttttgtaagtcttttttaatattctatatgtttt +aaaacaaatgtgtaaaataatgactataaatctatgttaatgaagcatga +tgtatacagatgtggtttgtgaaattaccaacataaagaaattcatagga +aactaaataataatagagattttgtatactattgaagttgtttcaattta +ctctaaattgttccaaattaagaatgttaattgtaaatccccatggtaac +cactaagttaatatcttttgaaaatacagaaaaggaaagcacagggtaaa +cacagtgatatgctacaaaatagcaactaaacacaaaagaaggcgataat +tgaggaaattaggaacaaaggaggtataagacatacagaaaacaaaagca +aaatggtaggagtaagcccctctttatcagtaattacattaaatacaaat +gaattaaactctccaatccaaagaaagagattaacagaatggatttttta +aaaatgatccaactatattgtccacaagatactcactttagatcaaaata +cacaatgagttgaaatgaaaggatgggagaaaatattccatgtaagtaat +aaccaaaggagatctgaggcaaatatacttatatcagacaaaatagactt +taagtcaaaaactgttacaaaatacaaagaacagtatatattgatttcaa +aattaattaagaagatataacaattataaatatatgtacaccaactaaca +gggctccaaaatatataatgtaaccattgagagaattaaagggagagaca +gacaattccacgaaaattgttgggcatttgaaaacccaactttaaataaa +agataaaacatctagagcaaatatcaagggaggaattagaggatttgaat +aaaactataagcaataactatagataacacttctctcaaaaactgcagag +tacacattcttctcaagtgaacatggaacattctccagcacagatgatat +gttaggccataagataagctcaataaacttaaaaagattgaaatcatgca +aagtatcttcactggccacaatggaatgaaataagatatcaataacaaaa +gaaaaactagaaaatttacaaatatttggaaattaaacaacacagtattt +accaaccaatgaatcaaagaacaaatcatgagggaaattagaaaatgttt +agagacgattgaaaacaaatatataacaagatgggtgtgatatatcaaaa +gcagtgctcagagttgtaacacctacattttaaaaaagaaacatgtcaaa +tcaataaccaaactttactcaataaaccgtaaaaggaagagcaaacaaaa +tccagagctagcagaaggaaggaaatgaagattagagcagagataaatga +aattgagaattaaaaaattatacagagatcaacaaaattaaaagttggtt +cttttaaaatatcaataaaattaatatacttttacatagactaagcaaaa +catctctattcagctgactttttttacaagggagccaacattattcagtg +gggaataatagctttttcaacaaaaagtgctgggaatactgaatattcat 
+atgcaaaaaaaatgaagctggacccctacctcacattatatacaaaatct +agattggatcaataatgtaaatataagagtgaaaaccatacatgcttaga +agaaaacatggaaataaaacattgctgtggattggcaatgcgttcttaga +taatacaccaaaaatacaagcatgaaacaaacaaatGCAGCCAAAATGTA +CCAGAATCTGAAAACATCTATTATCTATGaagaattagaggggaatttgg +tgaaagaaatatgggagaatgggacattgctctgtgaatgcttttgtgca +taattgtacatttttaattaagttaatcttttacactctcaaagtgtgat +attaagcaagcaaagataagttattacaagactctaaaaccgaatgcaat +gagaaacaagtgaatccaaatatatttcaaatgaatgaatgacataatca +aacttaaggggaaaataataattaatctgattaatttttgactgttcttt +tagttcaaattgacttttgaacatacttggactacataccattgcttgaa +aaaataaaatatctgcaaaaaattattaaatcttcatgataggctttttt +ctttttatattagtataaatataacaattctgaaacaaatgtatgtgcat +tgtaagattaagccaatgagtaaatattaatatatttgtattgctagaac +cccagattctcactgtgaaaggacagagatacagatatggaataagacaa +ggaaagaagcagcccactgagttacattagaatcagtattatcaacataa +atatgcaatgtgctctctcACATGCTCTTTCCTTCTCTTAAAAAAatata +atatggacatattatatattatatgcatagacacacgtgtgtctatacat +atcctatctatacatatTGAGGATTAACAGGTGCTAGTAGAAAATATTAA +CTTTCTTTGTATTAACAGGTGTTAGTAGAAAGTAGTAGTAGGTGCTAAGA +TAAAAGCCATAATTAAAcctcctggtgaatgaacacaccatcacctacaa +tcttaccaaaaatagaatcaagcacgtgtcctagtcaaacctctggattc +aactgtcatttggataaaacgcaaaggatagtgaaaatgtcgatcttcac +tgagagtctaaccagcaaatttcacagtgtggacatcaagtgacaaaaat +cccaaatttttcaacaaatatattgtatgggaaagaaaactttgaaaaga +aacctgtatgttagaagagattttaaaaacatgacaaATGAAAAAAAATG +GGCAAGACTAAAACTTTTAAAAAAGtttgagacagggtctcactctgtca +cccaggctggagtgcagtggtgtgaccatggctcactgtggcctcaacct +cctggctcaagtgatcctaccacctcagtcttccatgtagctgggactac +agctgcgtgccaccacatctggctcatttttttttcttttttaagtagag +acggggacttgctatgttgcccaggctagtctcaaactcctaagcacaag +cgatcctcccgcctcggcccctgaaagtgctgggattgcaggcatgagcc +accacacccggccAAAAGTTGCttttgaggagttattgctgtgtggatgt +gatataaccctttctgtcatctcttcacaaaactttctgtaaaACATAAA +AATCACCTGGACCTTCAGAGATGAGTTTGTTTATTTTTTTATTTTTTAAA +AAATTGCTAATTTACAGAACATGGAGATGAGTATGTTTTGAAGGCTTGGA +AGCATGCAAGTGGGAGAAGAAAGGAGTCAGCTACATTCTGGCTGTGTGCA +GAGGCAGGTCACTGTGGTGGGAGTGTTCCTGTTTCATGGACTCTGCAAAT 
+CGCAATGCTTGGCATGGCCTCCCGACCCTGATGGCAGAGAAGCAAACACC +AGTCGGAGAGCTGGGGTCCTCCCAGCCCTCTTGGCCCTGTGGCCAATTTT +TTCTTCAATAGCCTCATAAAATCACATTATTTGAGTGCCCATGGCTCCAA +AACAAGCAGGGATGCCCATGGACCCTGATTATCCATTGTCACCCTTCCCT +CCAAACAGCCACCTCTCCCCTGGAGACAGCCCCATACTCCACTCAGACCT +GTGCACTTTCTGGTATCCTTGTCACCTGCTTTTTATGTCTCATTTTACAA +ACACCAAATTGGAAGACAGCAGGAGCTGCCCCATAATACCAGTAAAGTGA +GAAGCAGAGATAAACTAGTCCTAGACAGCCGACTCATGTTGGGGGCAGCC +CACTCACAGTGGCCCTGACCCAACTCTGACTAGAGGCCACTTGctctcaa +caccagggtgctcaatggcccgtcctggtactctgctcttctctctccac +cttcgctttcctgcaatctatgcagcctgtgactccatccatgggctagt +gacccccagaccttctcctgggaccacaggcctgtgtctctatctgctgc +tcaatacctcccctcgaacatccatggctaaaactgagctcctgatactc +tctccctacccgcttctctgtggattccccacctccgcgaaggacagctt +catcctttcagctactcaggccagaagattgaagtcatctccttctccag +gaaatcgtattgagggagctacaaatatccaaaatccgatcgcttctcct +ccactacacccgaggcccgccacccatttttgcctgaattgctgcagcag +cctcctaaccgatctctgctttcacgtgggcacctcagttttttccagaa +caacaaccagagagatctgctcacacccaagtcagaccaggttactcctc +tgctctcatagcatttggaggaaaacccagagtgctcgtgttggccggca +gagccggcccccatctcctctgacctcctccccacctcttgccctcagca +cccagagtgctcgtgacggccagcagagccagcctccatctcctctgacc +tcccacctctcgccctcagcaccCAGAGTGCTCGTGTTGGCCAGCAAAGC +CGGCCCCCATCTCCTCTGACCTCCCACCTCTCGCCCTCTGCACCCAGAGT +GCTCGTGACGGCCAGCAGAGCCGGcccccatctcctctgacctcccacct +ctctccctcagctagtcctcgaacatgtctgatgtggtcccaccttggga +cccacattgctactcctctgcctgtaggggtacccacagttatccacaca +gttcactcctgtctttcaggtctttgtgcaaatatcaccttctcagtgga +gactacaccttcaggacttaggctgtgcctggcacatagtaggtgctcag +tagacactggttgtaggaaggaatCTACAGGTTGAAATAAGGAGATCATT +TCCCTGAGGTTCCGAAGCTCATATTTACTCACCATTTGTTGTTTACTGCT +AATATTGAGCACTATCAGTAAAATACATAAAACCCtttgccaatccagga +agtgaaaatgacactttactgttttagtttgcatttctctgcttacaaat +ggattacacgcattttcatgtgctgttggctACTTATTCATTCAGAAAAC +ATACTAAGTGCTGGCTCTTTTTCATGTCCTTTATCAAGTTTGGATCATGT +CATTTGCTGTTTTCTTTCTGATGTAAACTCTCAAAGTTTGAAGGGTATTG +TCTTTTCCTGACACATACGTTGTAAATAATTTTCTGGCTTACATTTTGAC +TTTTAATTTCATTCACGATGTTTTTAATGAATAATTTTAATTTTTATGAA 
+TGCAAGTTAAAATAATTCTTTCATTGTGGTTTCTGACATGTCATGCCAAT +AAGGGTCTTCTCCTCCAAGAGCACAGAAATATTTGCCAATACTGTCCTTA +AAATCGGTCACAGTTTCATTTTTTATATATGCATTTTACTTCAATTGGGG +CTTCATTTTACTGGCCCTATTTGAAGCAAGTTTCTCAGTTAATTCTTTTC +TCAAAGTGCTAAGTATGGTAGATTGCAAACATAAGTGGCCACATAATACT +CCCACCTCctttgcctcctctcccaggaggagatagcctccatctttcca +ctccttaatctgggcttggccaagtgacttacactggccaatgggatatt +aacaagtctgatgtgcacagaggctgtagaatgtgcactggggcttggtc +tctcttgctgccctggagaccagctgccccacgaaggaaacagagccaac +ctgctgCTTCCTGGGGGGAGACAGTCCCTCAGTCCCTCTGTCTCTGCCAA +TCAGTTAACCTGCTGCTTCCTGGAGGAAGACAGTCCCTCAGTCCCTCTGT +CTCTGCCAACCAGTTAACCTGCTGCTTCCTGGAGGAAGACAGTCCCTCAG +TCCCTCTGTCTCTGCCAACCAGTTAACCTGCTGCTTCATGGAGGAAGACA +GTCCCTCAGTCCCTCTGTCTCTGCCAACCAGTTAACCTGCTGCTTCCTGG +AGGAAGACAGTCCCTCTGTCCCTCTGTCTCTGCCAACCAGTTAACCTGCT +GCTTCCTGGAGGAAGACAGTCCCTCTGTCCCTCTGTCTCTGCCAACCAGT +TAACCTGCTGCTTCCTGGAGGAAGACAGTCACTCTGTCTCTGccaaccca +gttgaccgcagacatgcaggtctgctcaggtaagaccagcacagtccctg +ccctgtgagccaaaccaaatggtccagccacagaatcgtgagcaaataag +tgatgcttaagtcactaagatttgggCAAAAGCTGAGCATTTATCCCAAT +CCCAATActgtttgtccttctgtttatctgtctgtcCTTCTCTGCTCATT +TAAAATGCCCCCACTGCATCTAGTACATTTTTATAGGATCAGGGATCTGC +TCTTGGATTTATGTCATGTTCCCACCTCGAGGCAGCTTTGTAAGCTTCTG +AGCACTTCCCAATTCCGGGTGACTTCAGGCGCTGGGAGGCCTGTGCATCA +GCTGCTGCTGTCTGTAGCTGAGTTCCTTCACCCCTCTGCTGTCCTCAGCT +CCTTCGCCCCTGGGCCTCAGGAAATCAATGTCATGCTGACATCACTCTAG +ATCTAAAACTTGGGTTCTTGgaccaggtgcggtggctcacatctgtaatc +ccagcaatttgggaggccgaggcgggtggatcacaaggtcaggagatcaa +gacgatcctggctaacacggtgaaaccccgtctctactaaaaatacaaaa +aaattagccgggtttggtggcaggtgcctgtagccccagctacttgggag +gctgaagcaggagaatggcgtgaacctgggaggtggagctggcagtgagc +caagatcacgccactgcactccagactgggagagagagcgagactttctc +aaaaaaaaaaaaaTCTTAGGTTCTTGGATGTTCGGGAAAGGGGGTTATTA +TCTAGAATCCTTGAAGCGCCCCCAAGGGCATCTTCTCAAAGTTGGATGTG +TGCATTTTCCTGAGAGGAAAGCTTTCCCACATTATACAGCTTCTGAAAGG +GTTGCTTGACCCACAGATGTGAAGCTGAGGCTGAAGGAGACTGATGTGGT +TTCTCCTCAGTTTCTCTGTGTGGCACCAGGTGGCAGCAGAGGTCAGCAAG +GCAAACCCGAGCCCAGGGATGCGGGGTGGGGGCAGGTACATCCTCTCTTG 
+AGCTACAGCAGATTAACTCTGTTCTGTTTCATTGTGGTTGTTTAGTTTGC +GTTTTTTTTTCTCCAACTTTGTGCTTCATCGGGAAAAGCTTTGGATCACA +ATTCCCAGtgctgaagaaaaggccaaactctggaaaaaatttgaatattt +tgagccaaatgtgaggaccacaacctgtgagaacggaaaataaatcctgg +gaccccagactcactaagccaaagggaaaagccaagctgggaactggctt +atgcaaacctgcttcccatctggttcctaaataagatagctattacacaa +agacaaaaaagctacatccctgcctctacctccatcgcatgcaaaatgtg +tattcagtgaacgctgaccaaagacagaagaatgcaaccatttgcctctg +atttacccacacccattttttccacttcttcccctttccccaatacccgc +acttttcccctttacttactgaggtccccagacaacctttgggaaaagca +cggaccacagtttttcctgtggttctctgttcttttctcaggtgtgtcct +taaccttgcaaatagatttcttgaaatgattgagactcaccttggttgtg +ttctttgattAGTgcctgtgacgcagcttcaggaggtcctgagaacgtgt +gcacagtttagtcggcagaaacttagggaaatgtaagaccaccatcagca +cataggagttctgcattggtttggtctgcattggtttggtctggaaggag +gaaaattcaaagtaatggggcttacaggtcatagatagattcaaagattt +tctgattgtcaattggttgaaagaattattatctacagacctgctatcaa +tagaaaggagagtctgggttaagataagagactgtggagaccGTGCATAG +TTGCTTCCTGATCAGCTCTTTATTTGATTGAGAGTGAGGCAGGGAAGATT +AGAGGGAAGCTTACAGTGGAATTCAGGGCTGAGGCTGCTATTCTTTTGCT +CCTTGTAACTTCCTACAGTGTTGTCAGCATCCACATACTTCTCTGTGGGG +TTggtctcagagccaggttaccttgtcttaggtccagtggcaccctgact +ggcttggtgtccttgaacaagttacctaacctctccaaacctcagtccct +cagttgtaaaattaaaaaaaaaaaaaagaagaagaagagtacctactgta +tagcattgatttgaagattgaatgagctggtattatacaacgtttagaag +cagtgcctgacacgcaaaaggctctcaacaaatACTATCCTTTACTAATA +TCCTGTGTGTCTGTATCAGAGCTGGTGGGGTGGAGGGACAGAAACAAGTG +GGAGAAGGTaaagagatggacaaatgatctctaaagtctctctggcacta +acaCAATTCTTTATTATGTGTTTTGTCTGGCTCTTTATATTGATAGCTGT +TCCAGAGGCAATCAATAGCTATTAGTCGGTTTTATTCTTATTTTTCTGTC +TGATCTTACAGGGGAGCAAACTGTGGCAAAGTATGAACTTACTTCTCAGG +AAATTAACCATTATATTGGCAATCACTGTGATTATTTGAACTTCAGCGTC +TGGACAAATTTAGTCACATGAAATACAGAAGAGAGATTTCTCATGGTTAA +AACGAAGCtctctttatttgcttctgctaattaaaaaatcagagctaaag +atacttaaacactacagttaaaatgccatggttgtctattggcttaacga +attctcttatgaaatcaactctaaaatgctatccatcataaatcatgaaa +cgcaatttttcttattctctttagagctttacaattcatcttaaagacca +gtgtttacactctcttctgtaggttgtacaataacttttggcgagaaaaa 
+ataaaagtctggctttctgacTCATAGGTGTGTTCCCTTTAACAGAAAAA +GAAAATATGTCCTCTTTAAAACTGATGATCATTGGTCACCTCAATTTTAT +TGAAGTTCACTTCTGACCTCTTTAGATGTAGTTCTCTACATAAAACTGCC +CAACAGAATTCTCTGTCTGAATGTCTCCTCCACAAACAAAATTTTAAGAA +CTAAAATTATCATCTTTCCTTCCAAATATGCTCTCCCTATGTCCCCAGGG +CTCTCCATGTGTAGAGCTGAGACCATTTGCCACTCAGTTTCCTCACCCAA +TTAATTACAAGTCCCAACAATTTTCCGGtttttttgtttttgtttttgtt +tttagacggagtcttgctctgtcaccaggctggtgtgcggtggtgcaatc +tcagctcactgcaacctccgctgcctgtgttcaagcgattctcctgcctc +agcttcccaagtagctgggattataggtgtgtgccactacatccagataa +tttttgtatttttagtagagaggggatttcaccatattggcccagatgat +ctcaatctcttgacctcatgatctgcccaccttggcctcccaaagtgctg +ggattacaggtgtgagccgccatccctggccCAGTTTTGCCTTTTTAACA +TCCCTCAGCTCTTCAAATCCATTTTCTcttctctaacacctccccattcc +ccagctcgtaatgaactcgtaagtagattactacaatcacctcccaaatg +gtcttcctggctccatcagccttgtgaccttcaagttcattttccacatg +gatgtcagagtaactttctaaaatgaaaatctgaccacgttactctcttg +cctaaatccgcctatggccgctgttaggatcaagtctaaactcccgaccc +tggaacatcaggtcttcgtgctctgttcactgcttctctacctcacctgc +aaccaACACCACTCCCACATCCATATGCTGCTCACCGTGTATCAACATGA +ACAGGAGGTGGGTGTTTCAGTCCCCAGGAAGACACTGGGCCTTTTCAATC +ATCTACTGCTGTGTAATAACCACCCCGCAAACTGACCACATGATTTCATT +TTGCAAGGGTTCCTTCCTTgggctgtgttcagcaaaagggtttactgagc +tggcaggtccaagatggcctcactcacaggactggctgttgatgggagcc +ttgatgctcttgggctcaccccttatcctccagtaggttagagcttctta +cagtggtttcaggcagcatctgaagacagtaaaagcagaagctccaaggc +ttcttacattctagcctggaaaatcacatcacattgcttccttcatattt +ttttggcaaatcaggttgcaaggcttgcccagattagggtaaagaggcaa +agaggctccttttcttTTCttttctttttttttttttttttttttttgag +tcagaatctcgctctgttgcccaggctggagtgcagtggtgcaatctagg +ctcactgcaagctctgcctcctgggttcacgtcattctcctgcctcaggc +tcccaagtagctgagactacaggcacctaccaccacgcctggctaatttt +tttttattttttattttttagtagagactgtgtttcactgtgttagccag +gatggtctccatctcctgacctcgtgatccTTGCAAAGGGACATGCAGAC +CACATTAGTGAGAATATGTGCCTGTATTTTGCAATCTGTAACATGGGCAT +AAACTAAATGTTTTCCAAAGGGAATAGGGCAAAACAAAAAGGACCTTGAC +CACTCCTTGGCCCTGAATAAATCCAGGAAGCCTAAGAGTATGACTATCCT +GAGGTAGAAAGAGGGTCACATGCTGGATAAGAGGTACCTGGGCTCTCCAC 
+TTACAAGAAGAGAGCATGGTTACATTTATAATCACCATTCCCAACATGCT +GTGAGTGCAGGCAGCTACCAGGAGGAGAACAAAGGAAATAACCAGGACAC +TCATCTCTAAACCTGTTAATTTAATCACACGGAACACTTCTATTTAAAAT +TCCCGAGAGTTAAGATGTAAGAATGCTTATCAAGGTAAATGCTGTTCACA +CTGCTTGGAGTGTCAGGCCTAGATCTCTATCCATCAGAaacaacaatatc +aataacaacaacagcaacaTGATGATGGGGCAATTTCTTAAAAGCACCAT +GTATTTTATCGATACATGTCCGTTGCAGAAAATCCAGGTGAATCCAAAGA +AGAAATAAATGTCTTCCACAATCCCATAGCCCAGAGCTAACTAACCACTA +TAAAGAACCCAGCGTGGTTTTAACTAATGGATCAAAAGATGCTCATCAAA +GGCTCTGAGCTTTCCTGAGTGCTAACAGGAAACATCCAGCATCACTGGTC +TCTCCAAGGCTGCAGGTGTCTTTGCCCATAGTGCCTGTTTTGTGTCAGGG +AAAGAATCAACCTGGGAGCCAAGCCCAGGAATCAGGATGACCAAGACATA +CTGCACAAGGAGGGAACAAACCCATCCAAGGACACTCAAGGACAAATCAA +GCAAATGAATTTAAGGGAGACGTGCTCATGGTCTGCTTTGCTGCTCAGCA +TGGCTGGGAGGCACAGTGGAAGATCATGCATCCTGCCCCTGGGACTCCTC +TGCCAGAGCCTGAGAGCTTTCTCCTGCCCACAGGCTAGGGGTAGGGCAGT +TGGAATTGATCCATGCCTTCTAGCTAGACTGTGGGTCCCCTCAGTCTTGG +GCATGGTGACAGCCCAGCATCAGACAGAGGTCAGTATCAAACTAGAAAAT +TTAATAAATACTGTCAGATTTGTAGACCCAAGAAAATATAAACTGCCAAT +CACGGAGGAAAAAAATCTCTCAATGATCTTATCTTTATATGATTCCCTTG +CTGCCTGGAGATTGACATTTCCTTGGGGATAATCTGGTCATAGGATTGGT +GAAGGTGGAAGGGAGGCAACCTCCAAAGGTGGGGCCCTCTGCTCACCTGG +GACAGGGAGGGCCTGAGGTAGGTGTCTGTGTGGGCTGGGGAGGAGGATGG +GAGCAGTGCTTCTAGATGTTTCCACTTTCTCCTCATTAGATAATAACGAA +TGGGTGATTTCCCTAGTCACTGCAGTGTGAGGAAATCTACAAAATTAATT +TCACAATACGCTTTACAGGATAGGTGGAGAAACACATGAAGTACAACTGC +AGTGGGTTATAAAAAACGGCCTTTCGAGTTGAGCAATAAATTCGTTCAAG +CAGCCATTCTGAAGGACAAACTGGCTCTGTATTTAAGAGGGGCATTCCAG +CACTTCTCTAGCCACTGGGTTGACAATGACTCACCAAAGCCTCTGGTAGC +CACCACAGGACACCCAGAGCATATGTTTTAAAGCTGAACACCAAACTGCG +GACTTCGGGAGTAAGTGAACTGACTGGTTTTTATTTTGTTTTACTGCTTT +TAACATTACAGTAACTGTTACAGGTTCCAGCAGGCTAACTGGGTGGAAAT +GAGTTTGGTTTCACTTAGTCTCTCTAAAGAGAAAGCAAGTCGGTAGACTA +ATACCTAATAAAAGCAAAGCTGCCAACAATTGAAATTGCCTAGGCTGCTC +TGTGTGTCCCACATGCAtgggtgtgggtgccagtgtgtgtgcgtgtgtgc +atgcatgtgcatgtgtgtTGGGATAGAGTGGTAAGAAAATGGGAAATAAT +AAGAATGTTCAGTCCATAGCCCTTCATTATAAAAAGGTGAGCTGTAATAA +ATACTAGTGCCACATTTAGCCAAAACTTTACTCCAGCCAAAGGTGATATT 
+TTCATGATAACATCCTGTGATTGCTTTGTTCTTCGTCTTTTATGTTCTTC +CTAGATGGGCTCAGAACATACAAGAATTAAGTACACATCTTATTTTCCAG +TGATAATGCTACCGGCAAATTCTGTTGTTTGTATAAACATCAGCCATGTT +TATATAACTAAACTAGTGTTTTGTTTTGTCAATTCAGCAAGAAATTAGAC +CAAATGGTGGCTTAATGCTGCATTGATTTGGCTATCAATTTGTTTTCACT +TTTCTGCAAAATAATTAATACATTATTAAATTGAATTGTGCTGATGCCAC +AGTTGTTCTTATCTCAAGTGTCTTAAAATTCATTTAATTTGTTTTTCCTT +TGGTTTCATTATTCAAATTTTAACTTCAGTTCTCAAGATTTTATCTGATG +GAAGAGATGGAGTCCATTACTAAGGACTCCATTGTGCTCCATCATGCCAG +AGTTGTAAAATAGATCTTTTAAAGGAAATTTACTGTGATTTTTTTTCTAT +TTAAGAGCTTCCTCTCCAGTTGAGCATGTAAGAAAATTATACCAGGAGAA +TACAGTAAACTCTATGAGGCAAGCTATAAACATGTAGCATTGTGATTAGG +Gctggttctccttctagagacatggtaggattgcaatttcataccatcct +tgaagttagagagagccacgtgactcatttagccaatgaactgtgagcag +aatgacatgtcacttccagctgaagctttaacaatctgagagacattcat +acattttccatgtgctgtagccttatacccaaagcctgggtcccaagtga +ccatgacaggcagagctccctgttgagccacagagatttagagaatggct +gttaacacagcataatccagcccatcctgactaatCTGATATTAACATGT +ATAATAAAGAATTCTATCAATGCTGAGGGAAGATGACTAGTTAAGGTCCT +AGGTTGCAAGTCTCAAAACCTCTTCTAAGGATTGTAGACAGGAAATTAAA +TGACTTCTAGTCCCTAGAGTTCCCAATCTCCTACCATCCCATCCTAATAT +GACAGAAGTAATTCCTGAGTTGCTTCTGAAACCAGAGCTTCCCTCAGAAC +CCTTAGCCTGCCAGATGGCTTCTTGGAGAGCCCTCACTCACTTTTCTCCT +TCTGCTATTGCTGCTCATTCATTCCAGTTTTTAAAAATTCATCTTTATCC +AGGAACCTCGCTTCTAGAAAAGTCATACAGGTGCTTCCAGGAGGCTACAT +GGGCACCCATATTTTTCTAGCCACTTTCATTAGACCAATGCAGCAGAGAA +GAAAAGCCTCAATAATTATTATGACATGGCATGTTAGGATACCAAGTAAA +TTGCATTTGTAAAATGTGATTTTCTGTTGGTGTTCACTTCAGCTCTACTG +ACATTTGGTAAGTATTATTGACTGACTGACTAACTAATGTGGTCATTAGT +CTTCATAAAGAAAGGCTCTCTACAAAAACGGAGGGATGCCCTTTTTCTGG +CATTTAATACGTAAGAAATTGCCTCCAATAGAAACCAGAGTTGCCTGATT +ACTATCAGCACAGGAGAAATGTATTAATGTGCCTTTCTAGTAACAGGTTT +TTAGAAAGTCAAATATAAACAAATCTGTCTATTTGTGTGTGTGCATGTGG +TAGTGGGGAGGGAAGAAAAAAGGAGGGGGAGAGAAAGAGAAATAAGAACC +AAGTTTATTATACTGTATTCAGGGGGAAAAAATTTTCCCAAGGTCCTAAC +AGAAGAGCAAAGTGCCACTGTCAATAGCCTCAGTAGTGTTAGGGTTGCTt +ttatttatttatttatttatttatttatttatttatttatttTTCCtttt +ttttctttctctttttttcttcttttttttttcttttctttctttttttt 
+ttttttttttttttttGgacagagtctcacactgtcacctgggctggagt +gcattggtgcaatctcgactcactgcaacttctgcctcccaggttcaagt +gattctcctgcctcagccgcccaagtagctgggattacaggtgtctgcca +ccgtgcctagctaatttttttgtatttttagtagagatgaggtttcacta +tgttggccaggctggtctcaaactcctgacctcatgatccacccacgttg +gcctcccaaagtgctgggattacaggcgtgagccaccgcccctggccAGG +ATTGCTTTTACAGCCAGTCTTCAGGTGCCCACTGTAGGAACAATGTCATT +TAACCCTCGGGATTATTCTGTGCCAAATATGGATAATGACTAATATCCAA +CACAGATATTCTCAGCTCAGAAGAGCAATTAGCAAATTCATAAATTAAGT +GCTTGCTTCCTCTTTAGTCAAATACAAACGTTTGTTAAAAGATATTATTT +TGCTTTACACTTTTTCTCTCAGAAATAAGCAGATGCTTGAATTCCCACAG +TGCTGCTTGAGCCTCACACCATGTCATCCTGCCAGGCACCCAGATCCAGT +TCTAGAGTTTCACATGATCGTGAgtgttggttaataagtcaatgtgaact +gggaggggagatttttcaggagtgccacagggctctccctttaatcACAT +ACACTCCCTGCTTTCATTGGAAAGTGTATAATGATGTCAGAGTGCCCCAG +AATGGAGCTAGTTGGAAGACTGCCGTCATAGGGAtgccttagtgaattaa +taaggttttaatttctggctctcaactttgtagatgtaaaagttgattta +tcaatatgtgagaaaggatgaatctttctgaaggttatgtcatcacactc diff --git a/tests/data/hg38.chr1.f100k.fa.fai b/tests/data/hg38.chr1.f100k.fa.fai new file mode 100644 index 0000000..4397900 --- /dev/null +++ b/tests/data/hg38.chr1.f100k.fa.fai @@ -0,0 +1 @@ +chr1:1-100000 100000 15 50 51 diff --git a/tests/shared_data.py b/tests/shared_data.py index c167c08..5ed502a 100644 --- a/tests/shared_data.py +++ b/tests/shared_data.py @@ -8,6 +8,7 @@ "SARS_COV_2_GFF3_GZ_PATH", "SARS_COV_2_GFF3_GZ_TBI_PATH", "TEST_GENOME_OF_FILE_URIS", + "TEST_GENOME_HG38_CHR1_F100K", ] DATA_DIR = (pathlib.Path(__file__).parent / "data").absolute() @@ -41,3 +42,23 @@ }, ], } + +TEST_GENOME_HG38_CHR1_F100K = { + "id": "hg38-chr1-f100k", + "fasta": f"file://{DATA_DIR / 'hg38.chr1.f100k.fa'}", + "fai": f"file://{DATA_DIR / 'hg38.chr1.f100k.fa.fai'}", + "gff3_gz": f"file://{DATA_DIR / 'gencode.v45.first-few.gff3.gz'}", + "gff3_gz_tbi": f"file://{DATA_DIR / 'gencode.v45.first-few.gff3.gz.tbi'}", + "md5": "021db6573bbb7373345e6c3eec307632", + "ga4gh": "SQ.sY74le7UyqmFWoC1FWbvt8zHxjnpS8e2", + "contigs": [ + { + "name": "chr1:1-100000", + 
"aliases": [], + "md5": "d12b28d76aa3c1c6bb143b8da8cce642", + "ga4gh": "SQ.jTVrjy4tzSYmexXZs_cfFWNuRKpvpVBI", + "length": 100000, + "circular": False, + } + ], +} diff --git a/tests/test_db.py b/tests/test_db.py index ec1af1f..94ba3d0 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -1,6 +1,14 @@ from bento_reference_service.db import Database -from .shared_data import TEST_GENOME_OF_FILE_URIS +from .shared_data import TEST_GENOME_OF_FILE_URIS, TEST_GENOME_HG38_CHR1_F100K + + +async def test_create_genome(db: Database, db_cleanup): + # SARS-CoV-2 + await db.create_genome(TEST_GENOME_OF_FILE_URIS, return_external_resource_uris=False) + + # hg38 chr1:1-100000 + await db.create_genome(TEST_GENOME_HG38_CHR1_F100K, return_external_resource_uris=False) async def test_mark_running_as_error(db: Database, db_cleanup): From 380775a3585f705b39ce0f0be5d6b4e368392a3e Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 12:11:58 -0400 Subject: [PATCH 061/114] fix(db): alias deserialization --- bento_reference_service/db.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 088b5e1..c5b5ee4 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -50,7 +50,7 @@ def deserialize_contig(self, rec: asyncpg.Record | dict) -> ContigWithRefgetURI: return ContigWithRefgetURI( name=rec["contig_name"], # aliases is [None] if no aliases defined: - aliases=tuple(map(Database.deserialize_alias, filter(None, rec["aliases"]))), + aliases=tuple(map(Database.deserialize_alias, rec["aliases"])) if rec["aliases"] else (), md5=md5, ga4gh=ga4gh, length=rec["contig_length"], @@ -70,7 +70,7 @@ def deserialize_genome(self, rec: asyncpg.Record, external_resource_uris: bool) return GenomeWithURIs( id=rec["id"], # aliases is [None] if no aliases defined: - aliases=tuple(map(Database.deserialize_alias, filter(None, json.loads(rec["aliases"])))), + 
aliases=tuple(map(Database.deserialize_alias, json.loads(rec["aliases"]))) if rec["aliases"] else (), uri=genome_uri, contigs=tuple(map(self.deserialize_contig, json.loads(rec["contigs"]))), md5=rec["md5_checksum"], From 2d50000cb8524f04ff91eaef87fcfc2086487697 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 12:12:05 -0400 Subject: [PATCH 062/114] test: fix async db tests --- tests/shared_data.py | 5 +++-- tests/test_db.py | 11 ++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/shared_data.py b/tests/shared_data.py index 5ed502a..5d1583a 100644 --- a/tests/shared_data.py +++ b/tests/shared_data.py @@ -45,12 +45,13 @@ TEST_GENOME_HG38_CHR1_F100K = { "id": "hg38-chr1-f100k", + "md5": "021db6573bbb7373345e6c3eec307632", + "ga4gh": "SQ.sY74le7UyqmFWoC1FWbvt8zHxjnpS8e2", "fasta": f"file://{DATA_DIR / 'hg38.chr1.f100k.fa'}", "fai": f"file://{DATA_DIR / 'hg38.chr1.f100k.fa.fai'}", "gff3_gz": f"file://{DATA_DIR / 'gencode.v45.first-few.gff3.gz'}", "gff3_gz_tbi": f"file://{DATA_DIR / 'gencode.v45.first-few.gff3.gz.tbi'}", - "md5": "021db6573bbb7373345e6c3eec307632", - "ga4gh": "SQ.sY74le7UyqmFWoC1FWbvt8zHxjnpS8e2", + "taxon": {"id": "NCBITaxon:9606", "label": "Homo sapiens"}, "contigs": [ { "name": "chr1:1-100000", diff --git a/tests/test_db.py b/tests/test_db.py index 94ba3d0..ae22d76 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -1,18 +1,23 @@ +import pytest + from bento_reference_service.db import Database +from bento_reference_service.models import Genome from .shared_data import TEST_GENOME_OF_FILE_URIS, TEST_GENOME_HG38_CHR1_F100K +pytestmark = pytest.mark.asyncio() + async def test_create_genome(db: Database, db_cleanup): # SARS-CoV-2 - await db.create_genome(TEST_GENOME_OF_FILE_URIS, return_external_resource_uris=False) + await db.create_genome(Genome(**TEST_GENOME_OF_FILE_URIS), return_external_resource_uris=False) # hg38 chr1:1-100000 - await db.create_genome(TEST_GENOME_HG38_CHR1_F100K, 
return_external_resource_uris=False) + await db.create_genome(Genome(**TEST_GENOME_HG38_CHR1_F100K), return_external_resource_uris=False) async def test_mark_running_as_error(db: Database, db_cleanup): - g = await db.create_genome(TEST_GENOME_OF_FILE_URIS, return_external_resource_uris=False) + g = await db.create_genome(Genome(**TEST_GENOME_OF_FILE_URIS), return_external_resource_uris=False) t1 = await db.create_task(g.id, "ingest_features") t2 = await db.create_task(g.id, "ingest_features") From f47c761324e89fc27ec138adf1d3cf95c2676e23 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 14:30:47 -0400 Subject: [PATCH 063/114] test: creation of covid + hg38 subset genomes --- tests/test_genome_routes.py | 73 +++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index a51b4d8..274f0a2 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -12,12 +12,15 @@ SARS_COV_2_GFF3_GZ_PATH, SARS_COV_2_GFF3_GZ_TBI_PATH, TEST_GENOME_OF_FILE_URIS, + TEST_GENOME_HG38_CHR1_F100K, ) # all tests are async so that db_cleanup (an async fixture) properly works. not sure why it's this way. 
pytestmark = pytest.mark.asyncio() +AUTHORIZATION_HEADER = {"Authorization": "Token bearer"} + async def test_genome_list(test_client: TestClient): res = test_client.get("/genomes") @@ -46,9 +49,15 @@ async def test_404s_with_no_genomes(test_client: TestClient): assert res.status_code == status.HTTP_404_NOT_FOUND -def create_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses) -> Response: +def create_covid_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses) -> Response: + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.post("/genomes", json=TEST_GENOME_OF_FILE_URIS, headers=AUTHORIZATION_HEADER) + return res + + +def create_hg38_subset_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses) -> Response: aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) - res = test_client.post("/genomes", json=TEST_GENOME_OF_FILE_URIS, headers={"Authorization": "Token bearer"}) + res = test_client.post("/genomes", json=TEST_GENOME_HG38_CHR1_F100K, headers=AUTHORIZATION_HEADER) return res @@ -57,19 +66,39 @@ async def test_genome_create(test_client: TestClient, aioresponse: aioresponses, assert res.status_code == status.HTTP_401_UNAUTHORIZED aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[False]]}) - res = test_client.post("/genomes", json=TEST_GENOME_OF_FILE_URIS, headers={"Authorization": "Token bearer"}) + res = test_client.post("/genomes", json=TEST_GENOME_OF_FILE_URIS, headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_403_FORBIDDEN - res = create_genome_with_permissions(test_client, aioresponse) + # SARS-CoV-2 + + res = create_covid_genome_with_permissions(test_client, aioresponse) + assert res.status_code == status.HTTP_201_CREATED + + res = create_covid_genome_with_permissions(test_client, aioresponse) # test we cannot recreate + assert res.status_code == 
status.HTTP_400_BAD_REQUEST + + # - test list has one entry + res = test_client.get("/genomes") + assert res.status_code == status.HTTP_200_OK + assert len(res.json()) == 1 + + # hg38 subset + + res = create_hg38_subset_genome_with_permissions(test_client, aioresponse) assert res.status_code == status.HTTP_201_CREATED - res = create_genome_with_permissions(test_client, aioresponse) # test we cannot recreate + res = create_hg38_subset_genome_with_permissions(test_client, aioresponse) # test we cannot recreate assert res.status_code == status.HTTP_400_BAD_REQUEST + # - test list has two entries + res = test_client.get("/genomes") + assert res.status_code == status.HTTP_200_OK + assert len(res.json()) == 2 + async def test_genome_detail_endpoints(test_client: TestClient, aioresponse: aioresponses, db_cleanup): # setup: create genome TODO: fixture - create_genome_with_permissions(test_client, aioresponse) + create_covid_genome_with_permissions(test_client, aioresponse) # tests @@ -126,17 +155,17 @@ async def test_genome_detail_endpoints(test_client: TestClient, aioresponse: aio async def test_genome_delete(test_client: TestClient, aioresponse: aioresponses, db_cleanup): # setup: create genome TODO: fixture - create_genome_with_permissions(test_client, aioresponse) + create_covid_genome_with_permissions(test_client, aioresponse) aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) - res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}", headers={"Authorization": "Token bearer"}) + res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_204_NO_CONTENT aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) - res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}", headers={"Authorization": "Token bearer"}) + res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}", headers=AUTHORIZATION_HEADER) assert res.status_code == 
status.HTTP_404_NOT_FOUND # already deleted - res = create_genome_with_permissions(test_client, aioresponse) # test we can re-create + res = create_covid_genome_with_permissions(test_client, aioresponse) # test we can re-create assert res.status_code == status.HTTP_201_CREATED # test that we cannot delete with no token @@ -145,20 +174,18 @@ async def test_genome_delete(test_client: TestClient, aioresponse: aioresponses, # test that we cannot delete with no permission aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[False]]}) - res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}", headers={"Authorization": "Token bearer"}) + res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_403_FORBIDDEN -def _ingest_features(test_client: TestClient): - hs = {"Authorization": "Token bearer"} - +def _ingest_covid_features(test_client: TestClient): # Test we can create a task for ingesting features with open(SARS_COV_2_GFF3_GZ_PATH, "rb") as gff3_fh, open(SARS_COV_2_GFF3_GZ_TBI_PATH, "rb") as tbi_fh: res = test_client.put( f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz", files={"gff3_gz": gff3_fh, "gff3_gz_tbi": tbi_fh}, - headers=hs, + headers=AUTHORIZATION_HEADER, ) assert res.status_code == status.HTTP_202_ACCEPTED @@ -172,7 +199,7 @@ def _ingest_features(test_client: TestClient): task_status: str = "" task_msg: str = "" while not finished: - res = test_client.get(f"/tasks/{task_id}", headers=hs) + res = test_client.get(f"/tasks/{task_id}", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_200_OK rd = res.json() task_status = rd["status"] @@ -184,23 +211,21 @@ def _ingest_features(test_client: TestClient): async def test_genome_feature_ingest(test_client: TestClient, aioresponse: aioresponses, db_cleanup): - hs = {"Authorization": "Token bearer"} - - # setup: create genome TODO: fixture - create_genome_with_permissions(test_client, aioresponse) + # 
setup: create SARS-CoV-2 genome TODO: fixture + create_covid_genome_with_permissions(test_client, aioresponse) # Test we can ingest features aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}, repeat=True) - _ingest_features(test_client) + _ingest_covid_features(test_client) # Test we can delete - res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}/features", headers=hs) + res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}/features", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_204_NO_CONTENT # Test we can ingest again - _ingest_features(test_client) + _ingest_covid_features(test_client) # Test we can delete again - res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}/features", headers=hs) + res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}/features", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_204_NO_CONTENT From 49c3a805d8b4daa8d99b25700e1dec53a4ed8cf2 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 14:58:31 -0400 Subject: [PATCH 064/114] test: iter test for features for hg38 subset + sars cov 2 --- tests/shared_data.py | 9 +++-- tests/test_db.py | 6 ++-- tests/test_genome_routes.py | 70 ++++++++++++++++++++++++++----------- 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/tests/shared_data.py b/tests/shared_data.py index 5d1583a..db086da 100644 --- a/tests/shared_data.py +++ b/tests/shared_data.py @@ -1,4 +1,5 @@ import pathlib +from bento_reference_service.models import Genome __all__ = [ "DATA_DIR", @@ -7,8 +8,10 @@ "SARS_COV_2_FAI_PATH", "SARS_COV_2_GFF3_GZ_PATH", "SARS_COV_2_GFF3_GZ_TBI_PATH", - "TEST_GENOME_OF_FILE_URIS", + "TEST_GENOME_SARS_COV_2", + "TEST_GENOME_SARS_COV_2_OBJ", "TEST_GENOME_HG38_CHR1_F100K", + "TEST_GENOME_HG38_CHR1_F100K_OBJ", ] DATA_DIR = (pathlib.Path(__file__).parent / "data").absolute() @@ -21,7 +24,7 @@ SARS_COV_2_GFF3_GZ_PATH = DATA_DIR / "sars_cov_2.gff3.gz" 
SARS_COV_2_GFF3_GZ_TBI_PATH = DATA_DIR / "sars_cov_2.gff3.gz.tbi" -TEST_GENOME_OF_FILE_URIS = { +TEST_GENOME_SARS_COV_2 = { "id": SARS_COV_2_GENOME_ID, "aliases": [SARS_COV_2_ALIAS, SARS_COV_2_FAKE_ALIAS], "md5": "b98334cd0015ee1b1d2dc3b9d81b325e", @@ -42,6 +45,7 @@ }, ], } +TEST_GENOME_SARS_COV_2_OBJ = Genome(**TEST_GENOME_SARS_COV_2) TEST_GENOME_HG38_CHR1_F100K = { "id": "hg38-chr1-f100k", @@ -63,3 +67,4 @@ } ], } +TEST_GENOME_HG38_CHR1_F100K_OBJ = Genome(**TEST_GENOME_HG38_CHR1_F100K) diff --git a/tests/test_db.py b/tests/test_db.py index ae22d76..b44d3f6 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -3,21 +3,21 @@ from bento_reference_service.db import Database from bento_reference_service.models import Genome -from .shared_data import TEST_GENOME_OF_FILE_URIS, TEST_GENOME_HG38_CHR1_F100K +from .shared_data import TEST_GENOME_SARS_COV_2, TEST_GENOME_HG38_CHR1_F100K pytestmark = pytest.mark.asyncio() async def test_create_genome(db: Database, db_cleanup): # SARS-CoV-2 - await db.create_genome(Genome(**TEST_GENOME_OF_FILE_URIS), return_external_resource_uris=False) + await db.create_genome(Genome(**TEST_GENOME_SARS_COV_2), return_external_resource_uris=False) # hg38 chr1:1-100000 await db.create_genome(Genome(**TEST_GENOME_HG38_CHR1_F100K), return_external_resource_uris=False) async def test_mark_running_as_error(db: Database, db_cleanup): - g = await db.create_genome(Genome(**TEST_GENOME_OF_FILE_URIS), return_external_resource_uris=False) + g = await db.create_genome(Genome(**TEST_GENOME_SARS_COV_2), return_external_resource_uris=False) t1 = await db.create_task(g.id, "ingest_features") t2 = await db.create_task(g.id, "ingest_features") diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index 274f0a2..29d0ead 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -5,14 +5,18 @@ from fastapi.testclient import TestClient from httpx import Response +from bento_reference_service.models import Genome + from 
.shared_data import ( SARS_COV_2_GENOME_ID, SARS_COV_2_FASTA_PATH, SARS_COV_2_FAI_PATH, SARS_COV_2_GFF3_GZ_PATH, SARS_COV_2_GFF3_GZ_TBI_PATH, - TEST_GENOME_OF_FILE_URIS, + TEST_GENOME_SARS_COV_2, + TEST_GENOME_SARS_COV_2_OBJ, TEST_GENOME_HG38_CHR1_F100K, + TEST_GENOME_HG38_CHR1_F100K_OBJ, ) # all tests are async so that db_cleanup (an async fixture) properly works. not sure why it's this way. @@ -49,24 +53,26 @@ async def test_404s_with_no_genomes(test_client: TestClient): assert res.status_code == status.HTTP_404_NOT_FOUND -def create_covid_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses) -> Response: +def create_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses, genome: dict) -> Response: aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) - res = test_client.post("/genomes", json=TEST_GENOME_OF_FILE_URIS, headers=AUTHORIZATION_HEADER) + res = test_client.post("/genomes", json=genome, headers=AUTHORIZATION_HEADER) return res +def create_covid_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses) -> Response: + return create_genome_with_permissions(test_client, aioresponse, TEST_GENOME_SARS_COV_2) + + def create_hg38_subset_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses) -> Response: - aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) - res = test_client.post("/genomes", json=TEST_GENOME_HG38_CHR1_F100K, headers=AUTHORIZATION_HEADER) - return res + return create_genome_with_permissions(test_client, aioresponse, TEST_GENOME_HG38_CHR1_F100K) async def test_genome_create(test_client: TestClient, aioresponse: aioresponses, db_cleanup): - res = test_client.post("/genomes", json=TEST_GENOME_OF_FILE_URIS) + res = test_client.post("/genomes", json=TEST_GENOME_SARS_COV_2) assert res.status_code == status.HTTP_401_UNAUTHORIZED aioresponse.post("https://authz.local/policy/evaluate", payload={"result": 
[[False]]}) - res = test_client.post("/genomes", json=TEST_GENOME_OF_FILE_URIS, headers=AUTHORIZATION_HEADER) + res = test_client.post("/genomes", json=TEST_GENOME_SARS_COV_2, headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_403_FORBIDDEN # SARS-CoV-2 @@ -178,16 +184,27 @@ async def test_genome_delete(test_client: TestClient, aioresponse: aioresponses, assert res.status_code == status.HTTP_403_FORBIDDEN -def _ingest_covid_features(test_client: TestClient): - # Test we can create a task for ingesting features +def _file_uri_to_path(uri: str) -> str: + return uri.replace("file://", "") + + +def _put_genome_features(test_client: TestClient, genome: Genome) -> Response: + gff3_gz = _file_uri_to_path(genome.gff3_gz) + gff3_gz_tbi = _file_uri_to_path(genome.gff3_gz_tbi) - with open(SARS_COV_2_GFF3_GZ_PATH, "rb") as gff3_fh, open(SARS_COV_2_GFF3_GZ_TBI_PATH, "rb") as tbi_fh: - res = test_client.put( - f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz", + with open(gff3_gz, "rb") as gff3_fh, open(gff3_gz_tbi, "rb") as tbi_fh: + return test_client.put( + f"/genomes/{genome.id}/features.gff3.gz", files={"gff3_gz": gff3_fh, "gff3_gz_tbi": tbi_fh}, headers=AUTHORIZATION_HEADER, ) + +def _test_ingest_genome_features(test_client: TestClient, genome: Genome, expected_features: int): + # Test we can create a task for ingesting features + + res = _put_genome_features(test_client, genome) + assert res.status_code == status.HTTP_202_ACCEPTED data = res.json() assert "task" in data @@ -207,25 +224,36 @@ def _ingest_covid_features(test_client: TestClient): finished = task_status in {"success", "error"} assert task_status == "success" - assert task_msg == "ingested 49 features" + assert task_msg == f"ingested {expected_features} features" -async def test_genome_feature_ingest(test_client: TestClient, aioresponse: aioresponses, db_cleanup): - # setup: create SARS-CoV-2 genome TODO: fixture - create_covid_genome_with_permissions(test_client, aioresponse) 
+@pytest.mark.parametrize( + "genome,expected_features", [(TEST_GENOME_SARS_COV_2_OBJ, 49), (TEST_GENOME_HG38_CHR1_F100K_OBJ, 14)] +) +async def test_genome_feature_ingest( + test_client: TestClient, aioresponse: aioresponses, db_cleanup, genome: Genome, expected_features: int +): + # setup: create genome + create_genome_with_permissions(test_client, aioresponse, genome.model_dump(mode="json")) + + # Test we cannot ingest without permissions + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[False]]}) + res = _put_genome_features(test_client, genome) + assert res.status_code == status.HTTP_403_FORBIDDEN # Test we can ingest features aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}, repeat=True) - _ingest_covid_features(test_client) + _test_ingest_genome_features(test_client, genome, expected_features) # Test we can delete - res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}/features", headers=AUTHORIZATION_HEADER) + res = test_client.delete(f"/genomes/{genome.id}/features", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_204_NO_CONTENT # Test we can ingest again - _ingest_covid_features(test_client) + _test_ingest_genome_features(test_client, genome, expected_features) # Test we can delete again - res = test_client.delete(f"/genomes/{SARS_COV_2_GENOME_ID}/features", headers=AUTHORIZATION_HEADER) + + res = test_client.delete(f"/genomes/{genome.id}/features", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_204_NO_CONTENT From e5037ef2f0316f7771ae7a4d123f0278680b62c6 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 15:13:23 -0400 Subject: [PATCH 065/114] fix(features): not properly removing Parent attr on ingest --- bento_reference_service/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index b2e28c1..e0d5162 100644 --- 
a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -20,7 +20,7 @@ INGEST_FEATURES_TASK_KIND = "ingest_features" GFF_ID_ATTR = "ID" -GFF_PARENT_ATTR = "ID" +GFF_PARENT_ATTR = "Parent" GFF_GENCODE_GENE_ID_ATTR = "gene_id" GFF_CAPTURED_ATTRIBUTES = frozenset({GFF_ID_ATTR, GFF_PARENT_ATTR, GFF_GENCODE_GENE_ID_ATTR}) GFF_SKIPPED_FEATURE_TYPES = frozenset({"stop_codon_redefined_as_selenocysteine"}) From 8339defd66ff54ea270371674206237bfb4c62ab Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 15:37:33 -0400 Subject: [PATCH 066/114] chore(features): move standard GFF Name attr to constant --- bento_reference_service/features.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index e0d5162..c3f9001 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -20,6 +20,7 @@ INGEST_FEATURES_TASK_KIND = "ingest_features" GFF_ID_ATTR = "ID" +GFF_NAME_ATTR = "Name" GFF_PARENT_ATTR = "Parent" GFF_GENCODE_GENE_ID_ATTR = "gene_id" GFF_CAPTURED_ATTRIBUTES = frozenset({GFF_ID_ATTR, GFF_PARENT_ATTR, GFF_GENCODE_GENE_ID_ATTR}) @@ -60,7 +61,7 @@ def extract_feature_id(record, attributes: dict[str, list[str]]) -> str | None: def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None: feature_type = record.feature.lower() - feature_name: str | None = attributes.get("Name", (None,))[0] + feature_name: str | None = attributes.get(GFF_NAME_ATTR, (None,))[0] if feature_name: return feature_name From 0d4097a6655d9f002639d721002b95895673dc92 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 15:51:29 -0400 Subject: [PATCH 067/114] test: fix issues with test gencode file --- tests/data/gencode.v45.first-few.gff3.gz | Bin 1167 -> 1088 bytes tests/data/gencode.v45.first-few.gff3.gz.tbi | Bin 110 -> 110 bytes ...ncode.v45.first-few.missing-parent.gff3.gz | Bin 0 -> 1167 bytes 
...e.v45.first-few.missing-parent.gff3.gz.tbi | Bin 0 -> 110 bytes tests/test_genome_routes.py | 2 +- 5 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 tests/data/gencode.v45.first-few.missing-parent.gff3.gz create mode 100644 tests/data/gencode.v45.first-few.missing-parent.gff3.gz.tbi diff --git a/tests/data/gencode.v45.first-few.gff3.gz b/tests/data/gencode.v45.first-few.gff3.gz index 70d45f06df28d6ac4fba584bbad3aa4e1afe4d69..f73374c44a48653b4cd389feffe160379650ae1b 100644 GIT binary patch delta 651 zcmV;60(AY43BU-jwgG>A0_9g-ubMy*eYSt4w}#o-4_3p2sYaW$-kV$7^o<~H)C3m- zYTAF_UC}EFTuhqOs3Z_zc46k6IWxQAB8Aqd_p>+dwbbuew*Xu&TR;dAt#<39IluHh zYv~Ne-zSbeDC0=nHt9h3B}kQCdnHqMuC(%(Giw>{EJQjkVQqituG2tgVY*D#dwbQ) z^`As}kq%$Sz5Im^R?6%5e*t_Y#ZMFm7G4*JerXL@Rek{{3 zm`eoSA^m@XK;ycP0g)pxi?UA%dL~3X*I~MSaXpwJm0s}h>ky&=y+Ll lE=QB)x4RSn3+*^I_Tdx&03VA81ONa400936078?11P>h1IgkJV delta 742 zcmVP^*oU z0)nL4{{8HbG#5jOI%!oCq#%NAIOl!*9s|ik0iEIC$6z$*nBQ620SKh-03xbRxAW2d zK3{~crFS{H9W%FaAqj&n_Mm?g7bcyDUQ_?xq$Zl)xt6PC3u=EBpy)z>Uc_mc6w`dZ zYOfnP|C5_=mL}gugX%;YKbr93;upXd+`hzi$F0-b-OrI>-(?V`i^qv6La6>#cfVW) zhr~7KHoBR4GP7{!7Ud$(vtk}qO*Vn}KAgnmG&#FWOY=CHMM<1y=`@LF{fGD|PUC1Z zI=mi_hu^s++MukD@IH-_^N8EQ4>ik?8;^}w?{5TTKAlo zyJ(T-MV6Sdw8xvd;3*BB(g$3mo&s13(BenXJP;fX@uPpG{cxWqZuIbK^r=S*Eac3J zcXM^!TS~yh+4C{zL+HBjS6nNV?WQGI6<@}0aRj!EV2!~m{3I2tGx!=}Gv0s1x7M*n ztG1jQ3qwOnd-KjK4DGqTuc>?lQ*>3KK((&?s|tmuRiV(YP*Q;;w@MukehK9Ex*b2A z7Jl;aD-nO7q9eqw5L%!6>)ihip8I_llcT8VO`iK#s%s_e8p;H=nvS2A>3EIlwAOS~ zrX#d=10uy+@r%>KFKX~}Nn9Qu{@qvWhMHTqo1*1cBvuphBcSCp#Arj2GhyY zGM)5I7f7c&HUAm(bc|GkBIm^nknfxQZIz=-c-el=PV*cjT;lD5wN3zwP*=DZdF03VA81ONa4 Y009360763o02=@U000000000001B*caR2}S diff --git a/tests/data/gencode.v45.first-few.gff3.gz.tbi b/tests/data/gencode.v45.first-few.gff3.gz.tbi index d97ad542ccc2a70879cb5339b8c8c01a16bae55c..f1455de1f8180d22b9c891270474bb84f9db7375 100644 GIT binary patch delta 32 ocmd1Hn_woPrSO#}ZnDIIPcI(r;aR~Ds?Wl}AgaOMInhr60M6wLPXGV_ delta 32 
ocmd1Hn_wn!RN*U6++>LZpI$tw;aR~Ds?Wl}aBTL}vlIOk0Op+ypa1{> diff --git a/tests/data/gencode.v45.first-few.missing-parent.gff3.gz b/tests/data/gencode.v45.first-few.missing-parent.gff3.gz new file mode 100644 index 0000000000000000000000000000000000000000..70d45f06df28d6ac4fba584bbad3aa4e1afe4d69 GIT binary patch literal 1167 zcmV;A1aSKwiwFb&00000{{{d;LjnM$0dY?TB^ zm6WnfSw_~9Bgy;y;V4L_VJGn5a(J#-*O$I;KJvU=r_oSVpYzhq*ZW#N4<>(H&w1=} zb844-Htjf0Yb#r3>diW2GdynXXfAo2ZrMD&{dgTHK7TU*^VpfED>Upb5-Bx zNiyeyc{_f4{r=@x)!j6%ZMPmIrkl>W8S?x6b3R>P+V17?yQ=$XzO}O7_cCC+=7T|t z)Y#NOs_J^le;(VIbH2#2-7tdzgB+9djJS{Bh9D$zsH9LqAVo~hIR)Vju?Sq4eZ&b8 z!BIpl#S0Q_v@Qh&34*vpToBqvq9E*mq9?zPBta-r;s8NWXE%mI3aUGRjd3=5uQK>A z2ZJN4)wCG~CeMUXVK&7!Sf&6PvpKeq563NL@xjidQfk=*oE--_Fh~#J~lzFuj zk?f*};6bSY#j)qIz$-PNC}tmQN>S+>#c=5WB6~&5&+V`AmTh?p`^YZ|35bC~A-+n8 zcRp2r0aYgQIxhqO03VA81ONa4009360763o0L%jAm}_sFFcgMAXMbgFzr0xJBs!q4_(f&SPgs!D`Il3J)w{al}gD&=$YcE_#L+uhHRVc%sC zrHjXjDMG0JRd>H!28YBo=Qg^Tc`~zb=N9E6&$D74RZTX5_&%J(s++MukD@IH-_^N8EQ4>ik z?8;^}w?{5TTKAloyJ(T-MV6Sdw8xvd;3*BB(g$3mo&s13(BenXJP;fX@uQ{vaGxe_ z^zdr*sYePddejw&*wEw+@d6+pv#(}*GbHIr2V}3PS5!e{o(UP{#{tK6y2jWfehS*pHfm%8W;r5G96jtu!cvJdE;c`E*{Al4Ltse9ZGC% tsR6b}6~6MsO_n(D>BXZOo)rwC`am;|&7OL;5onM+nu*d3%pmi@007$d9JT-e literal 0 HcmV?d00001 diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index 29d0ead..ee0766d 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -228,7 +228,7 @@ def _test_ingest_genome_features(test_client: TestClient, genome: Genome, expect @pytest.mark.parametrize( - "genome,expected_features", [(TEST_GENOME_SARS_COV_2_OBJ, 49), (TEST_GENOME_HG38_CHR1_F100K_OBJ, 14)] + "genome,expected_features", [(TEST_GENOME_SARS_COV_2_OBJ, 49), (TEST_GENOME_HG38_CHR1_F100K_OBJ, 13)] ) async def test_genome_feature_ingest( test_client: TestClient, aioresponse: aioresponses, db_cleanup, genome: Genome, expected_features: int From bd64c4f0880839cd8658c0a2027af320224e1aa6 Mon Sep 17 00:00:00 2001 From: David Lougheed 
Date: Tue, 14 May 2024 15:51:47 -0400 Subject: [PATCH 068/114] chore(db): log feature types being ingested --- bento_reference_service/db.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index c5b5ee4..de75189 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -485,7 +485,10 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeature, ...]) ) ) - self.logger.debug(f"bulk_ingest_genome_features: have {len(feature_types)} feature types for batch") + self.logger.debug( + f"bulk_ingest_genome_features: have {len(feature_types)} feature types for batch " + f"({[ft[0] for ft in feature_types][:20]})" + ) await conn.executemany( "INSERT INTO genome_feature_types(type_id) VALUES ($1) ON CONFLICT DO NOTHING", feature_types ) From bf80fc35db42d7323c42eb3996796881f7258553 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 15:52:08 -0400 Subject: [PATCH 069/114] test: fix ssl errors in streaming response tests --- tests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/conftest.py b/tests/conftest.py index 731983a..8af7ae0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,6 +10,7 @@ import os os.environ["BENTO_DEBUG"] = "true" +os.environ["BENTO_VALIDATE_SSL"] = "false" os.environ["CORS_ORIGINS"] = "*" os.environ["BENTO_AUTHZ_SERVICE_URL"] = "https://authz.local" From 2cc60441791b852c9d658d6c889942bb49463363 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 15:52:28 -0400 Subject: [PATCH 070/114] chore(db): log better error message when feature has missing parent --- bento_reference_service/db.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index de75189..dc41ca2 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -323,12 +323,13 @@ async def 
get_genome_feature_by_id(self, g_id: str, f_id: str) -> GenomeFeature async def query_genome_features( self, g_id: str, - q: str | None, - name: str | None, - position: str | None, - start: int | None, - end: int | None, - feature_types: list[str] | None, + /, + q: str | None = None, + name: str | None = None, + position: str | None = None, + start: int | None = None, + end: int | None = None, + feature_types: list[str] | None = None, offset: int = 0, limit: int = 10, ) -> tuple[list[GenomeFeature], dict]: # list of genome features + pagination dict object @@ -469,7 +470,12 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeature, ...]) attributes.append((row_id, ak, av)) - parents.extend((row_id, feature_row_ids[p]) for p in feature.parents) + for p in feature.parents: + try: + parents.append((row_id, feature_row_ids[p])) + except KeyError as e: + self.logger.error(f"Could not find parent row ID '{p}' for feature {feature.feature_id}") + raise e feature_tuples.append( ( From 07b82492add8af147d9ba213c67fbc80bc16b89e Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 15:52:55 -0400 Subject: [PATCH 071/114] test: start testing querying of genome features --- tests/test_db.py | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/tests/test_db.py b/tests/test_db.py index b44d3f6..c6aa1dd 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -1,23 +1,27 @@ +import logging + import pytest +from pathlib import Path + from bento_reference_service.db import Database -from bento_reference_service.models import Genome +from bento_reference_service.features import ingest_features -from .shared_data import TEST_GENOME_SARS_COV_2, TEST_GENOME_HG38_CHR1_F100K +from .shared_data import TEST_GENOME_SARS_COV_2_OBJ, TEST_GENOME_HG38_CHR1_F100K_OBJ pytestmark = pytest.mark.asyncio() async def test_create_genome(db: Database, db_cleanup): # SARS-CoV-2 - await 
db.create_genome(Genome(**TEST_GENOME_SARS_COV_2), return_external_resource_uris=False) + await db.create_genome(TEST_GENOME_SARS_COV_2_OBJ, return_external_resource_uris=False) # hg38 chr1:1-100000 - await db.create_genome(Genome(**TEST_GENOME_HG38_CHR1_F100K), return_external_resource_uris=False) + await db.create_genome(TEST_GENOME_HG38_CHR1_F100K_OBJ, return_external_resource_uris=False) async def test_mark_running_as_error(db: Database, db_cleanup): - g = await db.create_genome(Genome(**TEST_GENOME_SARS_COV_2), return_external_resource_uris=False) + g = await db.create_genome(TEST_GENOME_SARS_COV_2_OBJ, return_external_resource_uris=False) t1 = await db.create_task(g.id, "ingest_features") t2 = await db.create_task(g.id, "ingest_features") @@ -30,3 +34,33 @@ async def test_mark_running_as_error(db: Database, db_cleanup): assert (await db.get_task(t1)).status == "error" assert (await db.get_task(t2)).status == "error" + + +async def test_query_genome_features(db: Database, db_cleanup): + logger = logging.getLogger(__name__) + + # prerequesite: create genome + await db.create_genome(TEST_GENOME_SARS_COV_2_OBJ, return_external_resource_uris=False) + + # prerequesite: ingest features + gff3_gz_path = Path(TEST_GENOME_SARS_COV_2_OBJ.gff3_gz.replace("file://", "")) + gff3_gz_tbi_path = Path(TEST_GENOME_SARS_COV_2_OBJ.gff3_gz_tbi.replace("file://", "")) + await ingest_features(TEST_GENOME_SARS_COV_2_OBJ.id, gff3_gz_path, gff3_gz_tbi_path, db, logger) + + g_id = TEST_GENOME_SARS_COV_2_OBJ.id + + # - should get back 2 genes and 1 transcript + res, page = await db.query_genome_features(g_id, q="ORF1ab") + assert len(res) == 3 + assert page["total"] == 3 + + # - should get back 2 genes and 1 transcript + res, page = await db.query_genome_features(g_id, name="ORF1ab") + assert len(res) == 3 + assert page["total"] == 3 + + # - filter by q and name - should get back 1 gene + res, page = await db.query_genome_features(g_id, q="ENSSASG00005000002", name="ORF1ab") + assert 
len(res) == 1 + assert page["total"] == 1 + assert res[0].feature_id == "gene:ENSSASG00005000002" From e0611ef8e6c9076d24c384a072dd067b07601cde Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 15:54:38 -0400 Subject: [PATCH 072/114] docs: update status of annotation service --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 997ec5a..7d9da20 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Reference data (genomes & annotations) service for the Bento platform. * Bento-style genome ingestion: **DONE** * API endpoint permissions: **DONE** * RefGet implementation: _Partially done_ -* Annotation service: **Not done** +* Annotation service: _Partially done_ * Tests: _Partially done_ * Documentation: **Not done** From 3a7cedd5fd2c4d91229215ca5a8448918f2c44f4 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 16:36:24 -0400 Subject: [PATCH 073/114] fix(streaming): deprecation notice for streaming tcp connector init --- bento_reference_service/streaming.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/streaming.py b/bento_reference_service/streaming.py index ab8fb6e..be39f81 100644 --- a/bento_reference_service/streaming.py +++ b/bento_reference_service/streaming.py @@ -55,7 +55,7 @@ class StreamingUnsupportedURIScheme(Exception): def tcp_connector(config: Config) -> aiohttp.TCPConnector: - return aiohttp.TCPConnector(verify_ssl=config.bento_validate_ssl) + return aiohttp.TCPConnector(ssl=config.bento_validate_ssl) async def stream_file( From 3ca266e3bfc62242020cf0a77e90b437ae6d24c4 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 16:36:49 -0400 Subject: [PATCH 074/114] chore(db): consistent use of jsonb --- bento_reference_service/db.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index dc41ca2..09490ea 100644 --- 
a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -107,20 +107,20 @@ async def _select_genomes(self, g_id: str | None, external_resource_uris: bool) taxon_id, taxon_label, ( - SELECT json_agg(ga.*) FROM genome_aliases ga WHERE g.id = ga.genome_id + SELECT jsonb_agg(ga.*) FROM genome_aliases ga WHERE g.id = ga.genome_id ) aliases, ( WITH contigs_tmp AS ( SELECT contig_name, contig_length, circular, md5_checksum, ga4gh_checksum, ( - SELECT json_agg(gca.*) + SELECT jsonb_agg(gca.*) FROM genome_contig_aliases gca WHERE g.id = gca.genome_id AND gc.contig_name = gca.contig_name ) aliases FROM genome_contigs gc WHERE g.id = gc.genome_id ) - SELECT json_agg(contigs_tmp.*) FROM contigs_tmp + SELECT jsonb_agg(contigs_tmp.*) FROM contigs_tmp ) contigs FROM genomes g {where_clause} """, From ce0317cc5e2715de4031fcc99eeb29da0a7d5b1e Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 16:51:39 -0400 Subject: [PATCH 075/114] fix(db): correctly deduplicate attribute keys/value lookups --- bento_reference_service/db.py | 44 +++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 09490ea..0f7caed 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -388,6 +388,22 @@ async def clear_genome_features(self, g_id: str): async with self.connect() as conn: await conn.execute("DELETE FROM genome_features WHERE genome_id = $1", g_id) + async def get_genome_feature_attribute_keys( + self, existing_conn: asyncpg.Connection | None + ) -> list[tuple[int, str]]: + conn: asyncpg.Connection + async with self.connect(existing_conn) as conn: + res = await conn.fetch("SELECT id, attr_key FROM genome_feature_attribute_keys") + return [(row["id"], row["attr_key"]) for row in res] + + async def get_genome_feature_attribute_values( + self, existing_conn: asyncpg.Connection | None + ) -> list[tuple[int, str]]: + conn: 
asyncpg.Connection + async with self.connect(existing_conn) as conn: + res = await conn.fetch("SELECT id, attr_val FROM genome_feature_attribute_values") + return [(row["id"], row["attr_val"]) for row in res] + async def bulk_ingest_genome_features(self, features: tuple[GenomeFeature, ...]): # Manually generate sequential IDs # This requires an exclusive write lock on the database, so we don't get conflicting IDs @@ -413,8 +429,12 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeature, ...]) current_attr_value_id: int = vr["next_id"] feature_row_ids: dict[str, int] = {} - attr_key_ids: dict[str, int] = {} - attr_value_ids: dict[str, int] = {} + attr_key_ids: dict[str, int] = {t[1]: t[0] for t in await self.get_genome_feature_attribute_keys(conn)} + new_attr_key_ids: dict[str, int] = {} + attr_value_ids: dict[str, int] = { + t[1]: t[0] for t in await self.get_genome_feature_attribute_values(conn) + } + new_attr_value_ids: dict[str, int] = {} # ------------------------------------------------------------------------------------------------------ @@ -455,17 +475,21 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeature, ...]) for attr_key, attr_vals in feature.attributes.items(): if attr_key in attr_key_ids: ak = attr_key_ids[attr_key] + elif attr_key in new_attr_key_ids: + ak = new_attr_key_ids[attr_key] else: ak = current_attr_key_id - attr_key_ids[attr_key] = current_attr_key_id + new_attr_key_ids[attr_key] = current_attr_key_id current_attr_key_id += 1 for attr_val in attr_vals: if attr_val in attr_value_ids: av = attr_value_ids[attr_val] + elif attr_val in new_attr_value_ids: + av = new_attr_value_ids[attr_val] else: av = current_attr_value_id - attr_value_ids[attr_val] = current_attr_value_id + new_attr_value_ids[attr_val] = current_attr_value_id current_attr_value_id += 1 attributes.append((row_id, ak, av)) @@ -516,20 +540,20 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeature, ...]) 
records=feature_tuples, ) - attribute_keys: list[tuple[int, str]] = [(ik, sk) for sk, ik in attr_key_ids.items()] + new_attribute_keys: list[tuple[int, str]] = [(ik, sk) for sk, ik in new_attr_key_ids.items()] self.logger.debug( - f"bulk_ingest_genome_features: have {len(attribute_keys)} feature attribute keys for batch" + f"bulk_ingest_genome_features: have {len(new_attribute_keys)} new feature attribute keys for batch" ) await conn.copy_records_to_table( - "genome_feature_attribute_keys", columns=["id", "attr_key"], records=attribute_keys + "genome_feature_attribute_keys", columns=["id", "attr_key"], records=new_attribute_keys ) - attribute_values: list[tuple[int, str]] = [(iv, sv) for sv, iv in attr_value_ids.items()] + new_attribute_values: list[tuple[int, str]] = [(iv, sv) for sv, iv in new_attr_value_ids.items()] self.logger.debug( - f"bulk_ingest_genome_features: have {len(attribute_keys)} feature attribute values for batch" + f"bulk_ingest_genome_features: have {len(new_attribute_keys)} feature attribute values for batch" ) await conn.copy_records_to_table( - "genome_feature_attribute_values", columns=["id", "attr_val"], records=attribute_values + "genome_feature_attribute_values", columns=["id", "attr_val"], records=new_attribute_values ) self.logger.debug( From e2122e6481883ce0788a5bfad6d5e68db505bad8 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 14 May 2024 16:51:56 -0400 Subject: [PATCH 076/114] chore(db): create GIN index on genome feature attr values --- bento_reference_service/sql/schema.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index 01f6e3e..b305874 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -137,6 +137,8 @@ CREATE TABLE IF NOT EXISTS genome_feature_attribute_values ( ); CREATE INDEX IF NOT EXISTS genome_feature_attribute_values_attr_val_idx ON genome_feature_attribute_values (attr_val); 
+CREATE INDEX IF NOT EXISTS genome_feature_attribute_values_attr_val_trgm_gin + ON genome_feature_attribute_values USING GIN (attr_val gin_trgm_ops); CREATE TABLE IF NOT EXISTS genome_feature_attributes ( id SERIAL PRIMARY KEY, From 1932d287847e868a3181d689cdc0962dcedacfd1 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 13:19:49 -0400 Subject: [PATCH 077/114] test: use mock instead of real site for streaming tests --- tests/test_streaming.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/test_streaming.py b/tests/test_streaming.py index 7346e74..c4dc101 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -1,22 +1,23 @@ import logging - import pytest +from aioresponses import aioresponses from fastapi import HTTPException -from bento_reference_service import config as c -from bento_reference_service import streaming as s +from bento_reference_service import config as c, streaming as s -HTTP_TEST_URI_1 = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/md5sum.txt" +HTTP_TEST_URI = "https://test.local/file.txt" logger = logging.getLogger(__name__) @pytest.mark.asyncio() -async def test_http_streaming(): +async def test_http_streaming(aioresponse: aioresponses): + aioresponse.get(HTTP_TEST_URI, body=b"test page") + # test that we get back content as expected - stream = s.stream_http(c.get_config(), HTTP_TEST_URI_1, {}) - assert (await anext(stream))[:32] == b"f069c41e7cc8c2d3a7655cbb2d4186b8" # MD5 sum for chr1 + stream = s.stream_http(c.get_config(), HTTP_TEST_URI, {}) + assert (await anext(stream))[:9] == b"test page" # test that we can consume the entire stream async for chunk in stream: @@ -24,28 +25,29 @@ async def test_http_streaming(): @pytest.mark.asyncio() -async def test_http_streaming_404_1(): +async def test_http_streaming_404_1(aioresponse: aioresponses): + aioresponse.get(HTTP_TEST_URI, status=404, body=b"Not Found") with 
pytest.raises(s.StreamingProxyingError): - stream = s.stream_http(c.get_config(), "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/DOES_NOT_EXIST", {}) + stream = s.stream_http(c.get_config(), HTTP_TEST_URI, {}) await anext(stream) @pytest.mark.asyncio() -async def test_http_streaming_404_2(): +async def test_http_streaming_404_2(aioresponse: aioresponses): + aioresponse.get(HTTP_TEST_URI, status=404, body=b"Not Found") with pytest.raises(s.StreamingProxyingError): - _, stream = await s.stream_from_uri( - c.get_config(), logger, "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/DOES_NOT_EXIST", None, False - ) + _, stream = await s.stream_from_uri(c.get_config(), logger, HTTP_TEST_URI, None, False) await anext(stream) @pytest.mark.asyncio() -async def test_http_streaming_404_3(): +async def test_http_streaming_404_3(aioresponse: aioresponses): + aioresponse.get(HTTP_TEST_URI, status=404, body=b"Not Found") with pytest.raises(HTTPException): res = await s.generate_uri_streaming_response( c.get_config(), logger, - "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/DOES_NOT_EXIST", + HTTP_TEST_URI, None, "text/plain", False, From bb93de514727369387e1af89b776c492d886e4e8 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 13:34:41 -0400 Subject: [PATCH 078/114] test: feature_types summary route --- tests/test_genome_routes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index ee0766d..7539930 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -246,6 +246,10 @@ async def test_genome_feature_ingest( aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}, repeat=True) _test_ingest_genome_features(test_client, genome, expected_features) + sr = test_client.get(f"/genomes/{genome.id}/feature_types") + srd = sr.json() + assert sum(srd.values()) == expected_features + # Test we can delete res = 
test_client.delete(f"/genomes/{genome.id}/features", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_204_NO_CONTENT From 6032870aa14e257f6dd9b0307abd76e68ff09723 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 13:40:54 -0400 Subject: [PATCH 079/114] chore(features): don't keep Name attribute in feature attrs --- bento_reference_service/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index c3f9001..878212e 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -23,7 +23,7 @@ GFF_NAME_ATTR = "Name" GFF_PARENT_ATTR = "Parent" GFF_GENCODE_GENE_ID_ATTR = "gene_id" -GFF_CAPTURED_ATTRIBUTES = frozenset({GFF_ID_ATTR, GFF_PARENT_ATTR, GFF_GENCODE_GENE_ID_ATTR}) +GFF_CAPTURED_ATTRIBUTES = frozenset({GFF_ID_ATTR, GFF_NAME_ATTR, GFF_PARENT_ATTR, GFF_GENCODE_GENE_ID_ATTR}) GFF_SKIPPED_FEATURE_TYPES = frozenset({"stop_codon_redefined_as_selenocysteine"}) GFF_LOG_PROGRESS_INTERVAL = 100000 From 5583bc89f63d397630e1ce95d47f2fd43d4e21ad Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 13:54:53 -0400 Subject: [PATCH 080/114] fix(db): issues with fetching contigs by checksum --- bento_reference_service/db.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 0f7caed..221b505 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -45,12 +45,18 @@ def deserialize_alias(rec: asyncpg.Record | dict) -> Alias: def deserialize_contig(self, rec: asyncpg.Record | dict) -> ContigWithRefgetURI: service_base_url = self._config.service_url_base_path.rstrip("/") refget_uri_base = f"{service_base_url}/sequence" + md5 = rec["md5_checksum"] ga4gh = rec["ga4gh_checksum"] + + aliases = rec["aliases"] + if isinstance(aliases, str): + aliases = json.loads(aliases) + 
return ContigWithRefgetURI( name=rec["contig_name"], # aliases is [None] if no aliases defined: - aliases=tuple(map(Database.deserialize_alias, rec["aliases"])) if rec["aliases"] else (), + aliases=tuple(map(Database.deserialize_alias, aliases)) if aliases else (), md5=md5, ga4gh=ga4gh, length=rec["contig_length"], @@ -144,12 +150,24 @@ async def delete_genome(self, g_id: str) -> None: async def get_genome_and_contig_by_checksum_str( self, checksum_str: str ) -> tuple[GenomeWithURIs, ContigWithRefgetURI] | None: - chk_norm: str = checksum_str.rstrip("ga4gh:").rstrip("md5:") # strip optional checksum prefixes if present + # strip optional checksum prefixes if present: + chk_norm: str = checksum_str.removeprefix("ga4gh:").removeprefix("md5:") + conn: asyncpg.Connection async with self.connect() as conn: # TODO: these SQL statements could be optimized into one for performance reasons if it becomes necessary contig_res = await conn.fetchrow( - "SELECT * FROM genome_contigs WHERE md5_checksum = $1 OR ga4gh_checksum = $1", chk_norm + """ + SELECT + genome_id, contig_name, contig_length, circular, md5_checksum, ga4gh_checksum, + ( + SELECT jsonb_agg(gca.*) + FROM genome_contig_aliases gca + WHERE gc.genome_id = gca.genome_id AND gc.contig_name = gca.contig_name + ) aliases + FROM genome_contigs gc + WHERE md5_checksum = $1 OR ga4gh_checksum = $1 + """, chk_norm ) genome_res = (await anext(self._select_genomes(contig_res["genome_id"], False), None)) if contig_res else None From 3190f30f57552b7566f6949f6ef2e54362af615a Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 13:55:24 -0400 Subject: [PATCH 081/114] refact: separate query and filter genome feature fns --- bento_reference_service/db.py | 84 ++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 20 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 221b505..8244d4d 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py 
@@ -282,12 +282,12 @@ def deserialize_genome_feature(rec: asyncpg.Record) -> GenomeFeature: ) @staticmethod - def _feature_inner_entries_query(where_expr: str | None = None) -> str: + def _feature_inner_entries_query(where_expr: str | None = None, gf_table_name: str = "gf") -> str: where_clause = f"AND {where_expr}" if where_expr else "" return f""" WITH entries_tmp AS ( SELECT start_pos, end_pos, score, phase FROM genome_feature_entries gfe - WHERE gfe.feature = gf.id {where_clause} + WHERE gfe.feature = {gf_table_name}.id {where_clause} ) SELECT jsonb_agg(entries_tmp.*) FROM entries_tmp """ @@ -338,11 +338,54 @@ async def get_genome_feature_by_id(self, g_id: str, f_id: str) -> GenomeFeature res = await self.get_genome_features_by_ids(g_id, [f_id]) return res[0] if res else None + async def _run_feature_id_query( + self, id_query: str, g_id: str, offset: int, limit: int, *args + ) -> tuple[list[GenomeFeature], dict]: # results, pagination dict + offset = max(offset, 0) + limit = max(limit, 0) + + conn: asyncpg.Connection + async with self.connect() as conn: + id_res = await conn.fetch(id_query, g_id, offset, limit, *args) + final_list = await self.get_genome_features_by_ids(g_id, [r["feature_id"] for r in id_res], conn) + + return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} + async def query_genome_features( + self, g_id: str, q: str, offset: int = 0, limit: int = 10 + ) -> tuple[list[GenomeFeature], dict]: # results, pagination dict: + id_query = f""" + SELECT feature_id FROM ( + SELECT + feature_id, + feature_name, + feature_type, + ({self._feature_inner_entries_query(None, "gf_tmp")}) entries, + ( + SELECT array_agg(gfav.attr_val) + FROM genome_feature_attributes gfa + JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id + JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id + WHERE gfa.feature = gf_tmp.id AND gfav.attr_val ~ $4 + ) attributes + FROM genome_features gf_tmp + WHERE + gf_tmp.genome_id = $1 + ) 
gf + WHERE + array_length(gf.attributes, 1) > 0 + OR gf.feature_id ~ $4 + OR gf.feature_name ~ $4 + OFFSET $2 + LIMIT $3 + """ + + return await self._run_feature_id_query(id_query, g_id, offset, limit, q) + + async def filter_genome_features( self, g_id: str, /, - q: str | None = None, name: str | None = None, position: str | None = None, start: int | None = None, @@ -351,17 +394,15 @@ async def query_genome_features( offset: int = 0, limit: int = 10, ) -> tuple[list[GenomeFeature], dict]: # list of genome features + pagination dict object + # TODO: refactor to use standard Bento search in the future, when Bento search makes more sense + gf_where_items: list[str] = [] gfe_where_items: list[str] = [] q_params: list[str | int] = [] def _q_param(pv: str | int) -> str: q_params.append(pv) - return f"${len(q_params) + 1}" - - if q: - param = _q_param(q) - gf_where_items.append(f"(gf.feature_id ~ {param} OR gf.feature_name ~ {param})") + return f"${len(q_params) + 3}" # plus 3: g_id, offset, limit at start if name: gf_where_items.append(f"gf.feature_name = {_q_param(name)}") @@ -385,21 +426,24 @@ def _q_param(pv: str | int) -> str: gfe_where_clause = " AND ".join(gfe_where_items) if gfe_where_items else None id_query = f""" - SELECT feature_id, ({self._feature_inner_entries_query(gfe_where_clause)}) entries - FROM genome_features gf + SELECT feature_id FROM ( + SELECT + feature_id, + feature_name, + feature_type, + ({self._feature_inner_entries_query(gfe_where_clause, "gf_tmp")}) entries + FROM genome_features gf_tmp + WHERE + gf_tmp.genome_id = $1 + ) gf WHERE - gf.genome_id = $1 - AND {where_clause} - OFFSET {_q_param(max(offset, 0))} - LIMIT {_q_param(max(limit, 0))} + {"jsonb_array_length(gf.entries) > 0 AND" if gfe_where_clause else ""} + {where_clause} + OFFSET $2 + LIMIT $3 """ - conn: asyncpg.Connection - async with self.connect() as conn: - id_res = await conn.fetch(id_query, g_id, *q_params) - final_list = await self.get_genome_features_by_ids(g_id, 
[r["feature_id"] for r in id_res], conn) - - return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} + return await self._run_feature_id_query(id_query, g_id, offset, limit, *q_params) async def clear_genome_features(self, g_id: str): conn: asyncpg.Connection From bf2999a000787a881365beb6c4d9fc600987d06f Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 13:55:32 -0400 Subject: [PATCH 082/114] test: more db tests --- tests/test_db.py | 88 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 70 insertions(+), 18 deletions(-) diff --git a/tests/test_db.py b/tests/test_db.py index c6aa1dd..fe4737d 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -7,24 +7,58 @@ from bento_reference_service.db import Database from bento_reference_service.features import ingest_features -from .shared_data import TEST_GENOME_SARS_COV_2_OBJ, TEST_GENOME_HG38_CHR1_F100K_OBJ +from .shared_data import SARS_COV_2_GENOME_ID, TEST_GENOME_SARS_COV_2_OBJ, TEST_GENOME_HG38_CHR1_F100K_OBJ pytestmark = pytest.mark.asyncio() -async def test_create_genome(db: Database, db_cleanup): - # SARS-CoV-2 +async def _set_up_sars_cov_2_genome(db: Database): await db.create_genome(TEST_GENOME_SARS_COV_2_OBJ, return_external_resource_uris=False) - # hg38 chr1:1-100000 + +async def _set_up_hg38_subset_genome(db: Database): await db.create_genome(TEST_GENOME_HG38_CHR1_F100K_OBJ, return_external_resource_uris=False) +async def test_create_genome(db: Database, db_cleanup): + await _set_up_sars_cov_2_genome(db) + await _set_up_hg38_subset_genome(db) + + +@pytest.mark.parametrize( + "checksum,genome_id,contig_name", + [ + ("SQ.SyGVJg_YRedxvsjpqNdUgyyqx7lUfu_D", SARS_COV_2_GENOME_ID, "MN908947.3"), + ("ga4gh:SQ.SyGVJg_YRedxvsjpqNdUgyyqx7lUfu_D", SARS_COV_2_GENOME_ID, "MN908947.3"), + ("105c82802b67521950854a851fc6eefd", SARS_COV_2_GENOME_ID, "MN908947.3"), + ("md5:105c82802b67521950854a851fc6eefd", SARS_COV_2_GENOME_ID, "MN908947.3"), + 
("d12b28d76aa3c1c6bb143b8da8cce642", TEST_GENOME_HG38_CHR1_F100K_OBJ.id, "chr1:1-100000"), + ("md5:d12b28d76aa3c1c6bb143b8da8cce642", TEST_GENOME_HG38_CHR1_F100K_OBJ.id, "chr1:1-100000"), + ], +) +async def test_get_genome_and_contig_by_checksum_str(db: Database, db_cleanup, checksum, genome_id, contig_name): + # start with two genomes, so we validate that we get the right one + await _set_up_sars_cov_2_genome(db) + await _set_up_hg38_subset_genome(db) + + res = await db.get_genome_and_contig_by_checksum_str(checksum) + assert res is not None + g_res, c_res = res + assert g_res.id == genome_id + assert c_res.name == contig_name + + +async def test_get_genome_and_contig_by_checksum_str_dne(db: Database, db_cleanup): + await _set_up_sars_cov_2_genome(db) + res = await db.get_genome_and_contig_by_checksum_str("DOES_NOT_EXIST") + assert res is None + + async def test_mark_running_as_error(db: Database, db_cleanup): - g = await db.create_genome(TEST_GENOME_SARS_COV_2_OBJ, return_external_resource_uris=False) + await _set_up_sars_cov_2_genome(db) - t1 = await db.create_task(g.id, "ingest_features") - t2 = await db.create_task(g.id, "ingest_features") + t1 = await db.create_task(SARS_COV_2_GENOME_ID, "ingest_features") + t2 = await db.create_task(SARS_COV_2_GENOME_ID, "ingest_features") await db.update_task_status(t2, "running") assert (await db.get_task(t1)).status == "queued" @@ -36,31 +70,49 @@ async def test_mark_running_as_error(db: Database, db_cleanup): assert (await db.get_task(t2)).status == "error" -async def test_query_genome_features(db: Database, db_cleanup): - logger = logging.getLogger(__name__) - - # prerequesite: create genome - await db.create_genome(TEST_GENOME_SARS_COV_2_OBJ, return_external_resource_uris=False) +# TODO: fixture +async def _set_up_sars_cov_2_genome_and_features(db: Database, logger: logging.Logger): + await _set_up_sars_cov_2_genome(db) # prerequesite: ingest features gff3_gz_path = 
Path(TEST_GENOME_SARS_COV_2_OBJ.gff3_gz.replace("file://", "")) gff3_gz_tbi_path = Path(TEST_GENOME_SARS_COV_2_OBJ.gff3_gz_tbi.replace("file://", "")) - await ingest_features(TEST_GENOME_SARS_COV_2_OBJ.id, gff3_gz_path, gff3_gz_tbi_path, db, logger) + await ingest_features(SARS_COV_2_GENOME_ID, gff3_gz_path, gff3_gz_tbi_path, db, logger) - g_id = TEST_GENOME_SARS_COV_2_OBJ.id + +async def test_genome_features_summary(db: Database, db_cleanup): + logger = logging.getLogger(__name__) + await _set_up_sars_cov_2_genome_and_features(db, logger) + s = await db.genome_feature_types_summary(SARS_COV_2_GENOME_ID) + assert sum(s.values()) == 49 # total # of features, divided by type in summary response + + +async def test_filter_and_query_genome_features(db: Database, db_cleanup): + logger = logging.getLogger(__name__) + await _set_up_sars_cov_2_genome_and_features(db, logger) # - should get back 2 genes and 1 transcript - res, page = await db.query_genome_features(g_id, q="ORF1ab") + res, page = await db.filter_genome_features(SARS_COV_2_GENOME_ID, name="ORF1ab") assert len(res) == 3 assert page["total"] == 3 + +async def test_query_genome_features(db: Database, db_cleanup): + logger = logging.getLogger(__name__) + await _set_up_sars_cov_2_genome_and_features(db, logger) + # - should get back 2 genes and 1 transcript - res, page = await db.query_genome_features(g_id, name="ORF1ab") + res, page = await db.query_genome_features(SARS_COV_2_GENOME_ID, q="ORF1ab") assert len(res) == 3 assert page["total"] == 3 - # - filter by q and name - should get back 1 gene - res, page = await db.query_genome_features(g_id, q="ENSSASG00005000002", name="ORF1ab") + # - filter by q - should get back 1 gene + res, page = await db.query_genome_features(SARS_COV_2_GENOME_ID, q="ENSSASG00005000002") assert len(res) == 1 assert page["total"] == 1 assert res[0].feature_id == "gene:ENSSASG00005000002" + + # - query by attribute value + res, page = await db.query_genome_features(SARS_COV_2_GENOME_ID, 
q="protein_coding", limit=100) # protein_coding + assert len(res) == 24 + assert page["total"] == 24 From ba52f99736683dbcb08d8f52fccb5baf697044f5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 14:18:47 -0400 Subject: [PATCH 083/114] fix: genome feature endpoints - either query or filter for now test: genome feature endpoints --- bento_reference_service/routers/genomes.py | 9 +++-- tests/test_genome_routes.py | 41 +++++++++++++++++++--- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 4c02291..31df4ca 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -169,9 +169,12 @@ async def genomes_detail_features( offset: int = 0, limit: int = 10, ): - results, pagination = await db.query_genome_features( - genome_id, q, name, position, start, end, feature_type, offset, limit - ) + if q: + results, pagination = await db.query_genome_features(genome_id, q, offset, limit) + else: + results, pagination = await db.filter_genome_features( + genome_id, name, position, start, end, feature_type, offset, limit + ) return { "results": results, diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index 7539930..484d60d 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -185,7 +185,7 @@ async def test_genome_delete(test_client: TestClient, aioresponse: aioresponses, def _file_uri_to_path(uri: str) -> str: - return uri.replace("file://", "") + return uri.removeprefix("file://") def _put_genome_features(test_client: TestClient, genome: Genome) -> Response: @@ -246,10 +246,6 @@ async def test_genome_feature_ingest( aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}, repeat=True) _test_ingest_genome_features(test_client, genome, expected_features) - sr = test_client.get(f"/genomes/{genome.id}/feature_types") - srd = sr.json() - 
assert sum(srd.values()) == expected_features - # Test we can delete res = test_client.delete(f"/genomes/{genome.id}/features", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_204_NO_CONTENT @@ -261,3 +257,38 @@ async def test_genome_feature_ingest( res = test_client.delete(f"/genomes/{genome.id}/features", headers=AUTHORIZATION_HEADER) assert res.status_code == status.HTTP_204_NO_CONTENT + + +async def test_genome_feature_endpoints(test_client: TestClient, aioresponse: aioresponses, db_cleanup): + genome = TEST_GENOME_SARS_COV_2_OBJ + expected_features = 49 + + # setup: create genome + create_genome_with_permissions(test_client, aioresponse, genome.model_dump(mode="json")) + + # setup: ingest features + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}, repeat=True) + _test_ingest_genome_features(test_client, genome, expected_features) + + # Test we can query genome features + sr = test_client.get(f"/genomes/{genome.id}/feature_types") + srd = sr.json() + assert sum(srd.values()) == expected_features + + # Test we can query genome features + sr = test_client.get(f"/genomes/{genome.id}/features", params={"q": "ENSSASP00005000003"}) + srd = sr.json() + assert len(srd["results"]) == 1 + assert srd["pagination"]["total"] == 1 + + # Test we can list genome features - we get back the first 10 + sr = test_client.get(f"/genomes/{genome.id}/features") + srd = sr.json() + assert len(srd["results"]) == 10 + assert srd["pagination"]["offset"] == 0 + assert srd["pagination"]["total"] == 10 + + # Test we can get a feature by ID + sr = test_client.get(f"/genomes/{genome.id}/features/CDS:ENSSASP00005000003") + assert sr.status_code == 200 + assert sr.json()["feature_id"] == "CDS:ENSSASP00005000003" From b2dd3321ee8a951e9508f2a1e9c0a228fac05ae7 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 14:27:24 -0400 Subject: [PATCH 084/114] fix(features): fix bad feature name extraction in some cases --- 
bento_reference_service/features.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 878212e..09a7e90 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -74,20 +74,20 @@ def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None case "transcript": return transcript_name case "5utr" | "five_prime_utr": # 5' untranslated region (UTR) - return f"{transcript_name} 5' UTR" + return f"{transcript_name} 5' UTR" if transcript_name else None case "3utr" | "three_prime_utr": # 3' untranslated region (UTR) - return f"{transcript_name} 3' UTR" + return f"{transcript_name} 3' UTR" if transcript_name else None case "start_codon": - return f"{transcript_name} start codon" + return f"{transcript_name} start codon" if transcript_name else None case "stop_codon": - return f"{transcript_name} stop codon" + return f"{transcript_name} stop codon" if transcript_name else None case "exon": - if "exon_id" in attributes: - return attributes["exon_id"][0] - else: - return attributes["ID"][0] + exon_number = attributes.get("exon_number", (None,))[0] + if transcript_name is None or exon_number is None: + return None + return f"{transcript_name} exon {exon_number}" case "cds": # coding sequence - return f"{transcript_name} CDS" + return f"{transcript_name} CDS" if transcript_name else None case _: return None From b2bf06639d5fa02ed63ee97775b84ba78947a089 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 14:29:45 -0400 Subject: [PATCH 085/114] test: filter genome features based on name --- tests/test_genome_routes.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index 484d60d..0a1b7ea 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -281,6 +281,12 @@ async def test_genome_feature_endpoints(test_client: 
TestClient, aioresponse: ai assert len(srd["results"]) == 1 assert srd["pagination"]["total"] == 1 + # Test we can filter genome features (ID used as name) + sr = test_client.get(f"/genomes/{genome.id}/features", params={"name": "CDS:ENSSASP00005000003"}) + srd = sr.json() + assert len(srd["results"]) == 1 + assert srd["pagination"]["total"] == 1 + # Test we can list genome features - we get back the first 10 sr = test_client.get(f"/genomes/{genome.id}/features") srd = sr.json() From 2ca7bbb656e68edaf0bacc34ff6edc6fc6dd2514 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 14:30:54 -0400 Subject: [PATCH 086/114] lint --- bento_reference_service/db.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 8244d4d..2946af3 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -167,7 +167,8 @@ async def get_genome_and_contig_by_checksum_str( ) aliases FROM genome_contigs gc WHERE md5_checksum = $1 OR ga4gh_checksum = $1 - """, chk_norm + """, + chk_norm, ) genome_res = (await anext(self._select_genomes(contig_res["genome_id"], False), None)) if contig_res else None From 77cb8dfa050b13c9b8671041367538074488a7f9 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 14:41:21 -0400 Subject: [PATCH 087/114] chore(features): add docstrings --- bento_reference_service/features.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 09a7e90..ccfee8c 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -37,11 +37,20 @@ class AnnotationIngestError(Exception): def parse_attributes(raw_attributes: dict[str, str]) -> dict[str, list[str]]: + """ + Parse the raw GFF3 attribute dictionary into a properly list-ified dictionary - every attribute in GFF3 except a few + standard ones can be lists (although most are 
not, in reality.) + """ + # See "attributes" in http://gmod.org/wiki/GFF3 return {k: [url_unquote(e) for e in str(v).split(",") if e] for k, v in raw_attributes.items()} def extract_feature_id(record, attributes: dict[str, list[str]]) -> str | None: + """ + Given a GFF3 record and an extracted dictionary of attributes, extract a natural-key ID for the feature. + """ + feature_type = record.feature.lower() feature_id = attributes.get(GFF_ID_ATTR, (None,))[0] @@ -60,6 +69,11 @@ def extract_feature_id(record, attributes: dict[str, list[str]]) -> str | None: def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None: + """ + Given a GFF3 record and an extracted dictionary of attributes, either extract or infer a (not necessarily unique) + name for the feature. + """ + feature_type = record.feature.lower() feature_name: str | None = attributes.get(GFF_NAME_ATTR, (None,))[0] From df2de4392a3afff1314d1a3688b5a78ea5af7d5f Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 14:45:07 -0400 Subject: [PATCH 088/114] refact(features): factor out iter_features closure --- bento_reference_service/features.py | 189 +++++++++++++++------------- 1 file changed, 99 insertions(+), 90 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index ccfee8c..082570a 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -106,6 +106,104 @@ def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None return None +def iter_features( + # parameters: + genome: m.Genome, + gff_path: Path, + gff_index_path: Path, + # dependencies: + logger: logging.Logger, +) -> Generator[tuple[m.GenomeFeature, ...], None, None]: + """ + Given genome and a GFF3 for the genome, iterate through the lines of the GFF3 and build genome feature objects. 
+ """ + + genome_id = genome.id + + gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) + total_processed: int = 0 + + try: + features_by_id: dict[str, m.GenomeFeature] = {} + + for contig in genome.contigs: + contig_name = contig.name + + logger.info(f"Indexing features from contig {contig_name}") + + try: + fetch_iter = gff.fetch(contig.name, parser=pysam.asGFF3()) + except ValueError as e: + logger.warning(f"Could not find contig with name {contig_name} in GFF3; skipping... ({e})") + continue + + for i, rec in enumerate(fetch_iter): + feature_type = rec.feature + + if feature_type in GFF_SKIPPED_FEATURE_TYPES: + continue # Don't ingest stop_codon_redefined_as_selenocysteine annotations + + # for some reason, dict(...) returns the attributes dict: + feature_raw_attributes = dict(rec) + + try: + record_attributes = parse_attributes(feature_raw_attributes) + feature_id = extract_feature_id(rec, record_attributes) + feature_name = extract_feature_name(rec, record_attributes) + + if feature_id is None: + logger.warning(f"Skipping unsupported feature {i}: type={feature_type}, no ID retrieval; {rec}") + continue + + if feature_name is None: + logger.warning(f"Using ID as name for feature {i}: {rec}") + feature_name = feature_id + + # 'phase' is misnamed / legacy-named as 'frame' in PySAM's GFF3 parser: + entry = m.GenomeFeatureEntry(start_pos=rec.start, end_pos=rec.end, score=rec.score, phase=rec.frame) + + if feature_id in features_by_id: + features_by_id[feature_id].entries.append(entry) + else: + attributes: dict[str, list[str]] = { + # skip attributes which have been captured in the above information: + k: vs + for k, vs in record_attributes.items() + if k not in GFF_CAPTURED_ATTRIBUTES + } + + features_by_id[feature_id] = m.GenomeFeature( + genome_id=genome_id, + contig_name=contig_name, + strand=rec.strand or ".", # None/"." 
<=> unstranded + feature_id=feature_id, + feature_name=feature_name, + feature_type=feature_type, + source=rec.source, + entries=[entry], + gene_id=record_attributes.get(GFF_GENCODE_GENE_ID_ATTR, (None,))[0], + attributes=attributes, + parents=tuple(p for p in record_attributes.get(GFF_PARENT_ATTR, ()) if p), + ) + + except Exception as e: + logger.error( + f"Could not process feature {i}: {feature_type=}, {feature_raw_attributes=}; encountered " + f"exception: {e}" + ) + logger.error(traceback.format_exc()) + + total_processed += 1 + if total_processed % GFF_LOG_PROGRESS_INTERVAL == 0: + logger.info(f"Processed {total_processed} features") + + yield tuple(features_by_id.values()) + features_by_id.clear() + + finally: + gff.close() + + async def ingest_features( # parameters: genome_id: str, @@ -134,96 +232,7 @@ async def ingest_features( logger.info(f"Ingesting gene features for genome {genome_id}...") - def _iter_features() -> Generator[tuple[m.GenomeFeature, ...], None, None]: - gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) - total_processed: int = 0 - - try: - features_by_id: dict[str, m.GenomeFeature] = {} - - for contig in genome.contigs: - logger.info(f"Indexing features from contig {contig.name}") - - try: - fetch_iter = gff.fetch(contig.name, parser=pysam.asGFF3()) - except ValueError as e: - logger.warning(f"Could not find contig with name {contig.name} in GFF3; skipping... ({e})") - continue - - for i, record in enumerate(fetch_iter): - feature_type = record.feature - - if feature_type in GFF_SKIPPED_FEATURE_TYPES: - continue # Don't ingest stop_codon_redefined_as_selenocysteine annotations - - # for some reason, dict(...) 
returns the attributes dict: - feature_raw_attributes = dict(record) - - try: - record_attributes = parse_attributes(feature_raw_attributes) - feature_id = extract_feature_id(record, record_attributes) - feature_name = extract_feature_name(record, record_attributes) - - if feature_id is None: - logger.warning( - f"Skipping unsupported feature {i}: type={feature_type}, no ID retrieval; {record}" - ) - continue - - if feature_name is None: - logger.warning(f"Using ID as name for feature {i}: {record}") - feature_name = feature_id - - entry = m.GenomeFeatureEntry( - start_pos=record.start, - end_pos=record.end, - score=record.score, - phase=record.frame, # misnamed in PySAM's GFF3 parser - ) - - if feature_id in features_by_id: - features_by_id[feature_id].entries.append(entry) - else: - attributes: dict[str, list[str]] = { - # skip attributes which have been captured in the above information: - k: vs - for k, vs in record_attributes.items() - if k not in GFF_CAPTURED_ATTRIBUTES - } - - features_by_id[feature_id] = m.GenomeFeature( - genome_id=genome_id, - contig_name=contig.name, - strand=record.strand or ".", # None/"." 
<=> unstranded - feature_id=feature_id, - feature_name=feature_name, - feature_type=feature_type, - source=record.source, - entries=[entry], - gene_id=record_attributes.get(GFF_GENCODE_GENE_ID_ATTR, (None,))[0], - attributes=attributes, - parents=tuple(p for p in record_attributes.get(GFF_PARENT_ATTR, ()) if p), - ) - - except Exception as e: - logger.error( - f"Could not process feature {i}: {feature_type=}, {feature_raw_attributes=}; encountered " - f"exception: {e}" - ) - logger.error(traceback.format_exc()) - - total_processed += 1 - if total_processed % GFF_LOG_PROGRESS_INTERVAL == 0: - logger.info(f"Processed {total_processed} features") - - yield tuple(features_by_id.values()) - features_by_id.clear() - - finally: - gff.close() - - features_to_ingest = _iter_features() - + features_to_ingest = iter_features(genome, gff_path, gff_index_path, logger) n_ingested: int = 0 # take features in contig batches From 12630a0cdad0a256d312c2dd3b56eb121fb4a630 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 14:45:16 -0400 Subject: [PATCH 089/114] chore: update dependencies --- poetry.lock | 153 +++++++++++++++++++++++++++---------------------- pyproject.toml | 2 +- 2 files changed, 84 insertions(+), 71 deletions(-) diff --git a/poetry.lock b/poetry.lock index e2094d9..36cc979 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1277,13 +1277,13 @@ files = [ [[package]] name = "platformdirs" -version = "4.2.1" +version = "4.2.2" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.8" files = [ - {file = "platformdirs-4.2.1-py3-none-any.whl", hash = "sha256:17d5a1161b3fd67b390023cb2d3b026bbd40abde6fdb052dfbd3a29c3ba22ee1"}, - {file = "platformdirs-4.2.1.tar.gz", hash = "sha256:031cd18d4ec63ec53e82dceaac0417d218a6863f7745dfcc9efe7793b7039bdf"}, + {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"}, + {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"}, ] [package.extras] @@ -2018,76 +2018,89 @@ files = [ [[package]] name = "ujson" -version = "5.9.0" +version = "5.10.0" description = "Ultra fast JSON encoder and decoder for Python" optional = false python-versions = ">=3.8" files = [ - {file = "ujson-5.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ab71bf27b002eaf7d047c54a68e60230fbd5cd9da60de7ca0aa87d0bccead8fa"}, - {file = "ujson-5.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a365eac66f5aa7a7fdf57e5066ada6226700884fc7dce2ba5483538bc16c8c5"}, - {file = "ujson-5.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e015122b337858dba5a3dc3533af2a8fc0410ee9e2374092f6a5b88b182e9fcc"}, - {file = "ujson-5.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:779a2a88c53039bebfbccca934430dabb5c62cc179e09a9c27a322023f363e0d"}, - {file = "ujson-5.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10ca3c41e80509fd9805f7c149068fa8dbee18872bbdc03d7cca928926a358d5"}, - {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a566e465cb2fcfdf040c2447b7dd9718799d0d90134b37a20dff1e27c0e9096"}, - {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f833c529e922577226a05bc25b6a8b3eb6c4fb155b72dd88d33de99d53113124"}, - {file = "ujson-5.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:b68a0caab33f359b4cbbc10065c88e3758c9f73a11a65a91f024b2e7a1257106"}, - {file = "ujson-5.9.0-cp310-cp310-win32.whl", hash = "sha256:7cc7e605d2aa6ae6b7321c3ae250d2e050f06082e71ab1a4200b4ae64d25863c"}, - {file = "ujson-5.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:a6d3f10eb8ccba4316a6b5465b705ed70a06011c6f82418b59278fbc919bef6f"}, - {file = "ujson-5.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b23bbb46334ce51ddb5dded60c662fbf7bb74a37b8f87221c5b0fec1ec6454b"}, - {file = "ujson-5.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6974b3a7c17bbf829e6c3bfdc5823c67922e44ff169851a755eab79a3dd31ec0"}, - {file = "ujson-5.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5964ea916edfe24af1f4cc68488448fbb1ec27a3ddcddc2b236da575c12c8ae"}, - {file = "ujson-5.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ba7cac47dd65ff88571eceeff48bf30ed5eb9c67b34b88cb22869b7aa19600d"}, - {file = "ujson-5.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6bbd91a151a8f3358c29355a491e915eb203f607267a25e6ab10531b3b157c5e"}, - {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:829a69d451a49c0de14a9fecb2a2d544a9b2c884c2b542adb243b683a6f15908"}, - {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:a807ae73c46ad5db161a7e883eec0fbe1bebc6a54890152ccc63072c4884823b"}, - {file = "ujson-5.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8fc2aa18b13d97b3c8ccecdf1a3c405f411a6e96adeee94233058c44ff92617d"}, - {file = "ujson-5.9.0-cp311-cp311-win32.whl", hash = "sha256:70e06849dfeb2548be48fdd3ceb53300640bc8100c379d6e19d78045e9c26120"}, - {file = "ujson-5.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:7309d063cd392811acc49b5016728a5e1b46ab9907d321ebbe1c2156bc3c0b99"}, - {file = "ujson-5.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:20509a8c9f775b3a511e308bbe0b72897ba6b800767a7c90c5cca59d20d7c42c"}, 
- {file = "ujson-5.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b28407cfe315bd1b34f1ebe65d3bd735d6b36d409b334100be8cdffae2177b2f"}, - {file = "ujson-5.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d302bd17989b6bd90d49bade66943c78f9e3670407dbc53ebcf61271cadc399"}, - {file = "ujson-5.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f21315f51e0db8ee245e33a649dd2d9dce0594522de6f278d62f15f998e050e"}, - {file = "ujson-5.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5635b78b636a54a86fdbf6f027e461aa6c6b948363bdf8d4fbb56a42b7388320"}, - {file = "ujson-5.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:82b5a56609f1235d72835ee109163c7041b30920d70fe7dac9176c64df87c164"}, - {file = "ujson-5.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5ca35f484622fd208f55041b042d9d94f3b2c9c5add4e9af5ee9946d2d30db01"}, - {file = "ujson-5.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:829b824953ebad76d46e4ae709e940bb229e8999e40881338b3cc94c771b876c"}, - {file = "ujson-5.9.0-cp312-cp312-win32.whl", hash = "sha256:25fa46e4ff0a2deecbcf7100af3a5d70090b461906f2299506485ff31d9ec437"}, - {file = "ujson-5.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:60718f1720a61560618eff3b56fd517d107518d3c0160ca7a5a66ac949c6cf1c"}, - {file = "ujson-5.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d581db9db9e41d8ea0b2705c90518ba623cbdc74f8d644d7eb0d107be0d85d9c"}, - {file = "ujson-5.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ff741a5b4be2d08fceaab681c9d4bc89abf3c9db600ab435e20b9b6d4dfef12e"}, - {file = "ujson-5.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdcb02cabcb1e44381221840a7af04433c1dc3297af76fde924a50c3054c708c"}, - {file = "ujson-5.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e208d3bf02c6963e6ef7324dadf1d73239fb7008491fdf523208f60be6437402"}, - {file = 
"ujson-5.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4b3917296630a075e04d3d07601ce2a176479c23af838b6cf90a2d6b39b0d95"}, - {file = "ujson-5.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0c4d6adb2c7bb9eb7c71ad6f6f612e13b264942e841f8cc3314a21a289a76c4e"}, - {file = "ujson-5.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0b159efece9ab5c01f70b9d10bbb77241ce111a45bc8d21a44c219a2aec8ddfd"}, - {file = "ujson-5.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0cb4a7814940ddd6619bdce6be637a4b37a8c4760de9373bac54bb7b229698b"}, - {file = "ujson-5.9.0-cp38-cp38-win32.whl", hash = "sha256:dc80f0f5abf33bd7099f7ac94ab1206730a3c0a2d17549911ed2cb6b7aa36d2d"}, - {file = "ujson-5.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:506a45e5fcbb2d46f1a51fead991c39529fc3737c0f5d47c9b4a1d762578fc30"}, - {file = "ujson-5.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d0fd2eba664a22447102062814bd13e63c6130540222c0aa620701dd01f4be81"}, - {file = "ujson-5.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bdf7fc21a03bafe4ba208dafa84ae38e04e5d36c0e1c746726edf5392e9f9f36"}, - {file = "ujson-5.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2f909bc08ce01f122fd9c24bc6f9876aa087188dfaf3c4116fe6e4daf7e194f"}, - {file = "ujson-5.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd4ea86c2afd41429751d22a3ccd03311c067bd6aeee2d054f83f97e41e11d8f"}, - {file = "ujson-5.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:63fb2e6599d96fdffdb553af0ed3f76b85fda63281063f1cb5b1141a6fcd0617"}, - {file = "ujson-5.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:32bba5870c8fa2a97f4a68f6401038d3f1922e66c34280d710af00b14a3ca562"}, - {file = "ujson-5.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:37ef92e42535a81bf72179d0e252c9af42a4ed966dc6be6967ebfb929a87bc60"}, - {file = 
"ujson-5.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f69f16b8f1c69da00e38dc5f2d08a86b0e781d0ad3e4cc6a13ea033a439c4844"}, - {file = "ujson-5.9.0-cp39-cp39-win32.whl", hash = "sha256:3382a3ce0ccc0558b1c1668950008cece9bf463ebb17463ebf6a8bfc060dae34"}, - {file = "ujson-5.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:6adef377ed583477cf005b58c3025051b5faa6b8cc25876e594afbb772578f21"}, - {file = "ujson-5.9.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ffdfebd819f492e48e4f31c97cb593b9c1a8251933d8f8972e81697f00326ff1"}, - {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4eec2ddc046360d087cf35659c7ba0cbd101f32035e19047013162274e71fcf"}, - {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbb90aa5c23cb3d4b803c12aa220d26778c31b6e4b7a13a1f49971f6c7d088e"}, - {file = "ujson-5.9.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba0823cb70866f0d6a4ad48d998dd338dce7314598721bc1b7986d054d782dfd"}, - {file = "ujson-5.9.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:4e35d7885ed612feb6b3dd1b7de28e89baaba4011ecdf995e88be9ac614765e9"}, - {file = "ujson-5.9.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b048aa93eace8571eedbd67b3766623e7f0acbf08ee291bef7d8106210432427"}, - {file = "ujson-5.9.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323279e68c195110ef85cbe5edce885219e3d4a48705448720ad925d88c9f851"}, - {file = "ujson-5.9.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ac92d86ff34296f881e12aa955f7014d276895e0e4e868ba7fddebbde38e378"}, - {file = "ujson-5.9.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:6eecbd09b316cea1fd929b1e25f70382917542ab11b692cb46ec9b0a26c7427f"}, - {file = "ujson-5.9.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:473fb8dff1d58f49912323d7cb0859df5585cfc932e4b9c053bf8cf7f2d7c5c4"}, - {file = "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f91719c6abafe429c1a144cfe27883eace9fb1c09a9c5ef1bcb3ae80a3076a4e"}, - {file = "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7b1c0991c4fe256f5fdb19758f7eac7f47caac29a6c57d0de16a19048eb86bad"}, - {file = "ujson-5.9.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a8ea0f55a1396708e564595aaa6696c0d8af532340f477162ff6927ecc46e21"}, - {file = "ujson-5.9.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:07e0cfdde5fd91f54cd2d7ffb3482c8ff1bf558abf32a8b953a5d169575ae1cd"}, - {file = "ujson-5.9.0.tar.gz", hash = "sha256:89cc92e73d5501b8a7f48575eeb14ad27156ad092c2e9fc7e3cf949f07e75532"}, + {file = "ujson-5.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2601aa9ecdbee1118a1c2065323bda35e2c5a2cf0797ef4522d485f9d3ef65bd"}, + {file = "ujson-5.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:348898dd702fc1c4f1051bc3aacbf894caa0927fe2c53e68679c073375f732cf"}, + {file = "ujson-5.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22cffecf73391e8abd65ef5f4e4dd523162a3399d5e84faa6aebbf9583df86d6"}, + {file = "ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26b0e2d2366543c1bb4fbd457446f00b0187a2bddf93148ac2da07a53fe51569"}, + {file = "ujson-5.10.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:caf270c6dba1be7a41125cd1e4fc7ba384bf564650beef0df2dd21a00b7f5770"}, + {file = "ujson-5.10.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a245d59f2ffe750446292b0094244df163c3dc96b3ce152a2c837a44e7cda9d1"}, + {file = "ujson-5.10.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:94a87f6e151c5f483d7d54ceef83b45d3a9cca7a9cb453dbdbb3f5a6f64033f5"}, + 
{file = "ujson-5.10.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:29b443c4c0a113bcbb792c88bea67b675c7ca3ca80c3474784e08bba01c18d51"}, + {file = "ujson-5.10.0-cp310-cp310-win32.whl", hash = "sha256:c18610b9ccd2874950faf474692deee4223a994251bc0a083c114671b64e6518"}, + {file = "ujson-5.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:924f7318c31874d6bb44d9ee1900167ca32aa9b69389b98ecbde34c1698a250f"}, + {file = "ujson-5.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a5b366812c90e69d0f379a53648be10a5db38f9d4ad212b60af00bd4048d0f00"}, + {file = "ujson-5.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:502bf475781e8167f0f9d0e41cd32879d120a524b22358e7f205294224c71126"}, + {file = "ujson-5.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b91b5d0d9d283e085e821651184a647699430705b15bf274c7896f23fe9c9d8"}, + {file = "ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:129e39af3a6d85b9c26d5577169c21d53821d8cf68e079060602e861c6e5da1b"}, + {file = "ujson-5.10.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f77b74475c462cb8b88680471193064d3e715c7c6074b1c8c412cb526466efe9"}, + {file = "ujson-5.10.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7ec0ca8c415e81aa4123501fee7f761abf4b7f386aad348501a26940beb1860f"}, + {file = "ujson-5.10.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ab13a2a9e0b2865a6c6db9271f4b46af1c7476bfd51af1f64585e919b7c07fd4"}, + {file = "ujson-5.10.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:57aaf98b92d72fc70886b5a0e1a1ca52c2320377360341715dd3933a18e827b1"}, + {file = "ujson-5.10.0-cp311-cp311-win32.whl", hash = "sha256:2987713a490ceb27edff77fb184ed09acdc565db700ee852823c3dc3cffe455f"}, + {file = "ujson-5.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:f00ea7e00447918ee0eff2422c4add4c5752b1b60e88fcb3c067d4a21049a720"}, + {file = "ujson-5.10.0-cp312-cp312-macosx_10_9_x86_64.whl", 
hash = "sha256:98ba15d8cbc481ce55695beee9f063189dce91a4b08bc1d03e7f0152cd4bbdd5"}, + {file = "ujson-5.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a9d2edbf1556e4f56e50fab7d8ff993dbad7f54bac68eacdd27a8f55f433578e"}, + {file = "ujson-5.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6627029ae4f52d0e1a2451768c2c37c0c814ffc04f796eb36244cf16b8e57043"}, + {file = "ujson-5.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8ccb77b3e40b151e20519c6ae6d89bfe3f4c14e8e210d910287f778368bb3d1"}, + {file = "ujson-5.10.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3caf9cd64abfeb11a3b661329085c5e167abbe15256b3b68cb5d914ba7396f3"}, + {file = "ujson-5.10.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6e32abdce572e3a8c3d02c886c704a38a1b015a1fb858004e03d20ca7cecbb21"}, + {file = "ujson-5.10.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a65b6af4d903103ee7b6f4f5b85f1bfd0c90ba4eeac6421aae436c9988aa64a2"}, + {file = "ujson-5.10.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:604a046d966457b6cdcacc5aa2ec5314f0e8c42bae52842c1e6fa02ea4bda42e"}, + {file = "ujson-5.10.0-cp312-cp312-win32.whl", hash = "sha256:6dea1c8b4fc921bf78a8ff00bbd2bfe166345f5536c510671bccececb187c80e"}, + {file = "ujson-5.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:38665e7d8290188b1e0d57d584eb8110951a9591363316dd41cf8686ab1d0abc"}, + {file = "ujson-5.10.0-cp313-cp313-macosx_10_9_x86_64.whl", hash = "sha256:618efd84dc1acbd6bff8eaa736bb6c074bfa8b8a98f55b61c38d4ca2c1f7f287"}, + {file = "ujson-5.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38d5d36b4aedfe81dfe251f76c0467399d575d1395a1755de391e58985ab1c2e"}, + {file = "ujson-5.10.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67079b1f9fb29ed9a2914acf4ef6c02844b3153913eb735d4bf287ee1db6e557"}, + {file = 
"ujson-5.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7d0e0ceeb8fe2468c70ec0c37b439dd554e2aa539a8a56365fd761edb418988"}, + {file = "ujson-5.10.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:59e02cd37bc7c44d587a0ba45347cc815fb7a5fe48de16bf05caa5f7d0d2e816"}, + {file = "ujson-5.10.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a890b706b64e0065f02577bf6d8ca3b66c11a5e81fb75d757233a38c07a1f20"}, + {file = "ujson-5.10.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:621e34b4632c740ecb491efc7f1fcb4f74b48ddb55e65221995e74e2d00bbff0"}, + {file = "ujson-5.10.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b9500e61fce0cfc86168b248104e954fead61f9be213087153d272e817ec7b4f"}, + {file = "ujson-5.10.0-cp313-cp313-win32.whl", hash = "sha256:4c4fc16f11ac1612f05b6f5781b384716719547e142cfd67b65d035bd85af165"}, + {file = "ujson-5.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:4573fd1695932d4f619928fd09d5d03d917274381649ade4328091ceca175539"}, + {file = "ujson-5.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a984a3131da7f07563057db1c3020b1350a3e27a8ec46ccbfbf21e5928a43050"}, + {file = "ujson-5.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73814cd1b9db6fc3270e9d8fe3b19f9f89e78ee9d71e8bd6c9a626aeaeaf16bd"}, + {file = "ujson-5.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61e1591ed9376e5eddda202ec229eddc56c612b61ac6ad07f96b91460bb6c2fb"}, + {file = "ujson-5.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2c75269f8205b2690db4572a4a36fe47cd1338e4368bc73a7a0e48789e2e35a"}, + {file = "ujson-5.10.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7223f41e5bf1f919cd8d073e35b229295aa8e0f7b5de07ed1c8fddac63a6bc5d"}, + {file = "ujson-5.10.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = 
"sha256:d4dc2fd6b3067c0782e7002ac3b38cf48608ee6366ff176bbd02cf969c9c20fe"}, + {file = "ujson-5.10.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:232cc85f8ee3c454c115455195a205074a56ff42608fd6b942aa4c378ac14dd7"}, + {file = "ujson-5.10.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:cc6139531f13148055d691e442e4bc6601f6dba1e6d521b1585d4788ab0bfad4"}, + {file = "ujson-5.10.0-cp38-cp38-win32.whl", hash = "sha256:e7ce306a42b6b93ca47ac4a3b96683ca554f6d35dd8adc5acfcd55096c8dfcb8"}, + {file = "ujson-5.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:e82d4bb2138ab05e18f089a83b6564fee28048771eb63cdecf4b9b549de8a2cc"}, + {file = "ujson-5.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:dfef2814c6b3291c3c5f10065f745a1307d86019dbd7ea50e83504950136ed5b"}, + {file = "ujson-5.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4734ee0745d5928d0ba3a213647f1c4a74a2a28edc6d27b2d6d5bd9fa4319e27"}, + {file = "ujson-5.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d47ebb01bd865fdea43da56254a3930a413f0c5590372a1241514abae8aa7c76"}, + {file = "ujson-5.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dee5e97c2496874acbf1d3e37b521dd1f307349ed955e62d1d2f05382bc36dd5"}, + {file = "ujson-5.10.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7490655a2272a2d0b072ef16b0b58ee462f4973a8f6bbe64917ce5e0a256f9c0"}, + {file = "ujson-5.10.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ba17799fcddaddf5c1f75a4ba3fd6441f6a4f1e9173f8a786b42450851bd74f1"}, + {file = "ujson-5.10.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2aff2985cef314f21d0fecc56027505804bc78802c0121343874741650a4d3d1"}, + {file = "ujson-5.10.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ad88ac75c432674d05b61184178635d44901eb749786c8eb08c102330e6e8996"}, + {file = "ujson-5.10.0-cp39-cp39-win32.whl", hash = "sha256:2544912a71da4ff8c4f7ab5606f947d7299971bdd25a45e008e467ca638d13c9"}, + 
{file = "ujson-5.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:3ff201d62b1b177a46f113bb43ad300b424b7847f9c5d38b1b4ad8f75d4a282a"}, + {file = "ujson-5.10.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:5b6fee72fa77dc172a28f21693f64d93166534c263adb3f96c413ccc85ef6e64"}, + {file = "ujson-5.10.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:61d0af13a9af01d9f26d2331ce49bb5ac1fb9c814964018ac8df605b5422dcb3"}, + {file = "ujson-5.10.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecb24f0bdd899d368b715c9e6664166cf694d1e57be73f17759573a6986dd95a"}, + {file = "ujson-5.10.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbd8fd427f57a03cff3ad6574b5e299131585d9727c8c366da4624a9069ed746"}, + {file = "ujson-5.10.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:beeaf1c48e32f07d8820c705ff8e645f8afa690cca1544adba4ebfa067efdc88"}, + {file = "ujson-5.10.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:baed37ea46d756aca2955e99525cc02d9181de67f25515c468856c38d52b5f3b"}, + {file = "ujson-5.10.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7663960f08cd5a2bb152f5ee3992e1af7690a64c0e26d31ba7b3ff5b2ee66337"}, + {file = "ujson-5.10.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:d8640fb4072d36b08e95a3a380ba65779d356b2fee8696afeb7794cf0902d0a1"}, + {file = "ujson-5.10.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78778a3aa7aafb11e7ddca4e29f46bc5139131037ad628cc10936764282d6753"}, + {file = "ujson-5.10.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0111b27f2d5c820e7f2dbad7d48e3338c824e7ac4d2a12da3dc6061cc39c8e6"}, + {file = "ujson-5.10.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:c66962ca7565605b355a9ed478292da628b8f18c0f2793021ca4425abf8b01e5"}, + {file = 
"ujson-5.10.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ba43cc34cce49cf2d4bc76401a754a81202d8aa926d0e2b79f0ee258cb15d3a4"}, + {file = "ujson-5.10.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:ac56eb983edce27e7f51d05bc8dd820586c6e6be1c5216a6809b0c668bb312b8"}, + {file = "ujson-5.10.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44bd4b23a0e723bf8b10628288c2c7c335161d6840013d4d5de20e48551773b"}, + {file = "ujson-5.10.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c10f4654e5326ec14a46bcdeb2b685d4ada6911050aa8baaf3501e57024b804"}, + {file = "ujson-5.10.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0de4971a89a762398006e844ae394bd46991f7c385d7a6a3b93ba229e6dac17e"}, + {file = "ujson-5.10.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:e1402f0564a97d2a52310ae10a64d25bcef94f8dd643fcf5d310219d915484f7"}, + {file = "ujson-5.10.0.tar.gz", hash = "sha256:b3cd8f3c5d8c7738257f1018880444f7b7d9b66232c64649f562d7ba86ad4bc1"}, ] [[package]] @@ -2488,4 +2501,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10.0" -content-hash = "81b5ed469e55ebb6ffffa38a41efdf9fb755d5e06314fdf7c7a526ffb9a1975d" +content-hash = "c46a7f9fb29a9e4e3eef8da4c9bbd210bef32ca2196a9e0c81d8822017f43c5b" diff --git a/pyproject.toml b/pyproject.toml index 064d9c8..06281f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ fastapi = "^0.111.0" pydantic = "^2.6.1" bento-lib = {extras = ["fastapi"], version = "^11.7.2"} aiofiles = "^23.2.1" -pysam = "^0.22.0" +pysam = "~0.22.0" jsonschema = "^4.21.1" pydantic-settings = "^2.1.0" asyncpg = "^0.29.0" From 863e9d72a675a4994fcd31e9c89ad502278f7ac6 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 14:52:59 -0400 Subject: [PATCH 090/114] lint(features): add comments --- bento_reference_service/features.py | 9 ++++++--- 1 file changed, 6 
insertions(+), 3 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 082570a..764dd45 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -54,7 +54,7 @@ def extract_feature_id(record, attributes: dict[str, list[str]]) -> str | None: feature_type = record.feature.lower() feature_id = attributes.get(GFF_ID_ATTR, (None,))[0] - if feature_id: + if feature_id: # If the standardized GFF `ID` attribute is set, we can use it and skip any deriving logic. return feature_id match feature_type: @@ -77,7 +77,7 @@ def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None feature_type = record.feature.lower() feature_name: str | None = attributes.get(GFF_NAME_ATTR, (None,))[0] - if feature_name: + if feature_name: # If the standardized GFF `Name` attribute is set, we can use it and skip any deriving logic. return feature_name transcript_name = attributes.get("transcript_name", attributes.get("transcript_id", (None,)))[0] @@ -236,8 +236,9 @@ async def ingest_features( n_ingested: int = 0 # take features in contig batches + # - these contig batches are created by the generator produced by iter_features(...) # - we use contigs as batches rather than a fixed batch size so that we are guaranteed to get parents alongside - # their child features in the same batch. + # their child features in the same batch, so we can assign surrogate keys correctly.
while data := next(features_to_ingest, ()): s = datetime.now() logger.debug(f"ingest_gene_feature_annotation: ingesting batch of {len(data)} features") @@ -256,6 +257,8 @@ async def ingest_features( async def ingest_features_task( genome_id: str, gff3_gz_path: Path, gff3_gz_tbi_path: Path, task_id: int, db: Database, logger: logging.Logger ): + # the ingest_features task moves from queued -> running -> (success | error) + await db.update_task_status(task_id, "running") # clear existing gene features for this genome From 8f19204501820eff900cb9188e043241738cdac9 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 15:18:35 -0400 Subject: [PATCH 091/114] chore(features): fall back to none right away w/ gene name as feature name --- bento_reference_service/features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 764dd45..234a01e 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -84,7 +84,7 @@ def extract_feature_name(record, attributes: dict[str, list[str]]) -> str | None match feature_type: case "gene": - return attributes.get("gene_name", attributes.get(GFF_GENCODE_GENE_ID_ATTR, (None,)))[0] + return attributes.get("gene_name", (None,))[0] case "transcript": return transcript_name case "5utr" | "five_prime_utr": # 5' untranslated region (UTR) From 736e7053bfb98d8cbbdf6390e8d3260186e2a6fe Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 15:58:41 -0400 Subject: [PATCH 092/114] fix(db): typo handling feature type filter queries for features --- bento_reference_service/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 2946af3..e19862e 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -420,7 +420,7 @@ def _q_param(pv: str | int) -> str: if feature_types: or_items = [] for ft 
in feature_types: - gf_where_items.append(f"gf.feature_type = f{_q_param(ft)}") + or_items.append(f"gf.feature_type = {_q_param(ft)}") gf_where_items.append(f"({' OR '.join(or_items)})") where_clause = " AND ".join(gf_where_items) if gf_where_items else "true" From 5ae8d3841f9933f783847fba9904b9abffd40345 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 16:34:42 -0400 Subject: [PATCH 093/114] test: fix hg38 subset genome contig consistency --- tests/data/hg38.chr1.f100k.fa | 2 +- tests/data/hg38.chr1.f100k.fa.fai | 2 +- tests/shared_data.py | 10 ++++++---- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/data/hg38.chr1.f100k.fa b/tests/data/hg38.chr1.f100k.fa index 9e9c08a..d2cf3ec 100644 --- a/tests/data/hg38.chr1.f100k.fa +++ b/tests/data/hg38.chr1.f100k.fa @@ -1,4 +1,4 @@ ->chr1:1-100000 +>chr1 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN diff --git a/tests/data/hg38.chr1.f100k.fa.fai b/tests/data/hg38.chr1.f100k.fa.fai index 4397900..0447a32 100644 --- a/tests/data/hg38.chr1.f100k.fa.fai +++ b/tests/data/hg38.chr1.f100k.fa.fai @@ -1 +1 @@ -chr1:1-100000 100000 15 50 51 +chr1 100000 6 50 51 diff --git a/tests/shared_data.py b/tests/shared_data.py index db086da..69bbc15 100644 --- a/tests/shared_data.py +++ b/tests/shared_data.py @@ -10,6 +10,7 @@ "SARS_COV_2_GFF3_GZ_TBI_PATH", "TEST_GENOME_SARS_COV_2", "TEST_GENOME_SARS_COV_2_OBJ", + "HG38_CHR1_F100K_GENOME_ID", "TEST_GENOME_HG38_CHR1_F100K", "TEST_GENOME_HG38_CHR1_F100K_OBJ", ] @@ -47,10 +48,11 @@ } TEST_GENOME_SARS_COV_2_OBJ = Genome(**TEST_GENOME_SARS_COV_2) +HG38_CHR1_F100K_GENOME_ID = "hg38-chr1-f100k" TEST_GENOME_HG38_CHR1_F100K = { - "id": "hg38-chr1-f100k", - "md5": "021db6573bbb7373345e6c3eec307632", - "ga4gh": "SQ.sY74le7UyqmFWoC1FWbvt8zHxjnpS8e2", + "id": HG38_CHR1_F100K_GENOME_ID, + "md5": "80c4a2f1d70d2ca5babe40ca24e47e85", + "ga4gh": 
"SQ.Sd58mcdOdfBAdpwaLFeI5bHwjspHd2D6", "fasta": f"file://{DATA_DIR / 'hg38.chr1.f100k.fa'}", "fai": f"file://{DATA_DIR / 'hg38.chr1.f100k.fa.fai'}", "gff3_gz": f"file://{DATA_DIR / 'gencode.v45.first-few.gff3.gz'}", @@ -58,7 +60,7 @@ "taxon": {"id": "NCBITaxon:9606", "label": "Homo sapiens"}, "contigs": [ { - "name": "chr1:1-100000", + "name": "chr1", "aliases": [], "md5": "d12b28d76aa3c1c6bb143b8da8cce642", "ga4gh": "SQ.jTVrjy4tzSYmexXZs_cfFWNuRKpvpVBI", From 2f8a8d55328b4242162a8422f219602f66550ceb Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 16:39:36 -0400 Subject: [PATCH 094/114] fix(features): bad handling of gff fetch contig + coord off-by-1 --- bento_reference_service/features.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 234a01e..4f2d1fe 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -132,7 +132,7 @@ def iter_features( logger.info(f"Indexing features from contig {contig_name}") try: - fetch_iter = gff.fetch(contig.name, parser=pysam.asGFF3()) + fetch_iter = gff.fetch(reference=contig.name, parser=pysam.asGFF3()) except ValueError as e: logger.warning(f"Could not find contig with name {contig_name} in GFF3; skipping... 
({e})") continue @@ -159,8 +159,19 @@ def iter_features( logger.warning(f"Using ID as name for feature {i}: {rec}") feature_name = feature_id - # 'phase' is misnamed / legacy-named as 'frame' in PySAM's GFF3 parser: - entry = m.GenomeFeatureEntry(start_pos=rec.start, end_pos=rec.end, score=rec.score, phase=rec.frame) + # - coordinates from PySAM are 0-based, semi-open + # - to convert to 1-based semi-open coordinates like in the original GFF3, we add 1 to start + # (we should have to add 1 to end too, but the GFF3 parser is busted in PySAM I guess, so we + # leave it as-is) + start_pos = rec.start + 1 + end_pos = rec.end + entry = m.GenomeFeatureEntry( + start_pos=start_pos, + end_pos=end_pos, + score=rec.score, + # - 'phase' is misnamed / legacy-named as 'frame' in PySAM's GFF3 parser + phase=rec.frame, + ) if feature_id in features_by_id: features_by_id[feature_id].entries.append(entry) From 114a1ff0e201208c00b41eb54ca653b8e4704a03 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Wed, 15 May 2024 16:39:49 -0400 Subject: [PATCH 095/114] test: more tests for filtering features --- tests/test_db.py | 55 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/tests/test_db.py b/tests/test_db.py index fe4737d..569dbf6 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -7,7 +7,12 @@ from bento_reference_service.db import Database from bento_reference_service.features import ingest_features -from .shared_data import SARS_COV_2_GENOME_ID, TEST_GENOME_SARS_COV_2_OBJ, TEST_GENOME_HG38_CHR1_F100K_OBJ +from .shared_data import ( + SARS_COV_2_GENOME_ID, + TEST_GENOME_SARS_COV_2_OBJ, + HG38_CHR1_F100K_GENOME_ID, + TEST_GENOME_HG38_CHR1_F100K_OBJ, +) pytestmark = pytest.mark.asyncio() @@ -32,8 +37,8 @@ async def test_create_genome(db: Database, db_cleanup): ("ga4gh:SQ.SyGVJg_YRedxvsjpqNdUgyyqx7lUfu_D", SARS_COV_2_GENOME_ID, "MN908947.3"), ("105c82802b67521950854a851fc6eefd", SARS_COV_2_GENOME_ID, "MN908947.3"), 
("md5:105c82802b67521950854a851fc6eefd", SARS_COV_2_GENOME_ID, "MN908947.3"), - ("d12b28d76aa3c1c6bb143b8da8cce642", TEST_GENOME_HG38_CHR1_F100K_OBJ.id, "chr1:1-100000"), - ("md5:d12b28d76aa3c1c6bb143b8da8cce642", TEST_GENOME_HG38_CHR1_F100K_OBJ.id, "chr1:1-100000"), + ("d12b28d76aa3c1c6bb143b8da8cce642", TEST_GENOME_HG38_CHR1_F100K_OBJ.id, "chr1"), + ("md5:d12b28d76aa3c1c6bb143b8da8cce642", TEST_GENOME_HG38_CHR1_F100K_OBJ.id, "chr1"), ], ) async def test_get_genome_and_contig_by_checksum_str(db: Database, db_cleanup, checksum, genome_id, contig_name): @@ -80,6 +85,21 @@ async def _set_up_sars_cov_2_genome_and_features(db: Database, logger: logging.L await ingest_features(SARS_COV_2_GENOME_ID, gff3_gz_path, gff3_gz_tbi_path, db, logger) +async def _set_up_hg38_subset_genome_and_features(db: Database, logger: logging.Logger): + await _set_up_hg38_subset_genome(db) + + # prerequisite: ingest features + gff3_gz_path = Path(TEST_GENOME_HG38_CHR1_F100K_OBJ.gff3_gz.replace("file://", "")) + gff3_gz_tbi_path = Path(TEST_GENOME_HG38_CHR1_F100K_OBJ.gff3_gz_tbi.replace("file://", "")) + await ingest_features(HG38_CHR1_F100K_GENOME_ID, gff3_gz_path, gff3_gz_tbi_path, db, logger) + + +GENOME_ID_TO_SET_UP_FN = { + SARS_COV_2_GENOME_ID: _set_up_sars_cov_2_genome_and_features, + HG38_CHR1_F100K_GENOME_ID: _set_up_hg38_subset_genome_and_features, +} + + async def test_genome_features_summary(db: Database, db_cleanup): logger = logging.getLogger(__name__) await _set_up_sars_cov_2_genome_and_features(db, logger) @@ -87,14 +107,27 @@ async def test_genome_features_summary(db: Database, db_cleanup): assert sum(s.values()) == 49 # total # of features, divided by type in summary response -async def test_filter_and_query_genome_features(db: Database, db_cleanup): - logger = logging.getLogger(__name__) - await _set_up_sars_cov_2_genome_and_features(db, logger) - - # - should get back 2 genes and 1 transcript - res, page = await db.filter_genome_features(SARS_COV_2_GENOME_ID, 
name="ORF1ab") - assert len(res) == 3 - assert page["total"] == 3 +@pytest.mark.parametrize( + "genome_id,filters,n_results", + [ + # SARS-CoV-2 + (SARS_COV_2_GENOME_ID, dict(name="ORF1ab"), 3), # should get back 2 genes and 1 transcript + (SARS_COV_2_GENOME_ID, dict(start=1, end=1000), 9), # region + 8 related to ORF1ab + # hg38 subset + (HG38_CHR1_F100K_GENOME_ID, dict(position="chr1:11869-"), 3), + (HG38_CHR1_F100K_GENOME_ID, dict(start=12000), 10), + (HG38_CHR1_F100K_GENOME_ID, dict(start=11869, end=11869), 3), + (HG38_CHR1_F100K_GENOME_ID, dict(start=12000, end=13000), 7), + (HG38_CHR1_F100K_GENOME_ID, dict(start=13000, end=13000), 0), + (HG38_CHR1_F100K_GENOME_ID, dict(feature_types=["gene"]), 2), + (HG38_CHR1_F100K_GENOME_ID, dict(limit=20), 13), + ], +) +async def test_filter_genome_features(db: Database, db_cleanup, genome_id: str, filters: dict, n_results: int): + await (GENOME_ID_TO_SET_UP_FN[genome_id])(db, logging.getLogger(__name__)) + res, page = await db.filter_genome_features(genome_id, **filters) + assert len(res) == n_results + assert page["total"] == n_results async def test_query_genome_features(db: Database, db_cleanup): From 33b630eb844da62277c459e246e0696a0e630472 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 16 May 2024 10:21:21 -0400 Subject: [PATCH 096/114] perf: add a maximum feature response length limit --- bento_reference_service/config.py | 2 ++ bento_reference_service/db.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/config.py b/bento_reference_service/config.py index c63e400..c000839 100644 --- a/bento_reference_service/config.py +++ b/bento_reference_service/config.py @@ -26,6 +26,8 @@ class Config(BentoBaseConfig): file_response_chunk_size: int = 1024 * 16 # 16 KiB at a time response_substring_limit: int = 10000 # TODO: Refine default + feature_response_record_limit: int = 1000 + @lru_cache() def get_config(): diff --git a/bento_reference_service/db.py 
b/bento_reference_service/db.py index e19862e..2a6072d 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -343,7 +343,7 @@ async def _run_feature_id_query( self, id_query: str, g_id: str, offset: int, limit: int, *args ) -> tuple[list[GenomeFeature], dict]: # results, pagination dict offset = max(offset, 0) - limit = max(limit, 0) + limit = min(max(limit, 0), self._config.feature_response_record_limit) conn: asyncpg.Connection async with self.connect() as conn: From 0c7276edd21f601dcc69eed76f01085e81e00d23 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 16 May 2024 16:54:48 -0400 Subject: [PATCH 097/114] refact: move running tasks to error on startup instead of db init --- bento_reference_service/db.py | 7 ------- bento_reference_service/main.py | 20 +++++++++++++++++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 2a6072d..20dfea3 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -31,13 +31,6 @@ def __init__(self, config: Config, logger: logging.Logger): self.logger: logging.Logger = logger super().__init__(config.database_uri, SCHEMA_PATH) - async def initialize(self, pool_size: int = 10): - await super().initialize(pool_size) - - # If we have any tasks that are still marked as "running" on application startup, we need to move them to the - # error state. 
- await self.move_running_tasks_to_error() - @staticmethod def deserialize_alias(rec: asyncpg.Record | dict) -> Alias: return Alias(alias=rec["alias"], naming_authority=rec["naming_authority"]) diff --git a/bento_reference_service/main.py b/bento_reference_service/main.py index 74c8f46..3272c4a 100644 --- a/bento_reference_service/main.py +++ b/bento_reference_service/main.py @@ -1,3 +1,4 @@ +from contextlib import asynccontextmanager from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.exceptions import RequestValidationError, StarletteHTTPException @@ -12,6 +13,7 @@ from .authz import authz_middleware from .config import get_config, ConfigDependency from .constants import BENTO_SERVICE_KIND, SERVICE_TYPE +from .db import get_db from .logger import get_logger, LoggerDependency from .routers.genomes import genome_router from .routers.refget import refget_router @@ -19,6 +21,21 @@ from .routers.workflows import workflow_router +# TODO: Find a way to DI this +config_for_setup = get_config() + + +@asynccontextmanager +async def lifespan(_app: FastAPI): + db = get_db(config_for_setup, get_logger(config_for_setup)) + + # If we have any tasks that are still marked as "running" on application startup, we need to move them to the error + # state. 
+ await db.move_running_tasks_to_error() + + yield + + app = FastAPI() # Attach different routers to the app, for: @@ -31,9 +48,6 @@ app.include_router(refget_router) app.include_router(workflow_router) -# TODO: Find a way to DI this -config_for_setup = get_config() - app.add_middleware( CORSMiddleware, allow_origins=config_for_setup.cors_origins, From 0d76715fdd80adde35b48ef9d2cacbb699818e85 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 16 May 2024 16:55:48 -0400 Subject: [PATCH 098/114] fix: bad query for tasks querying --- bento_reference_service/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 20dfea3..223d4aa 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -682,7 +682,7 @@ async def query_tasks(self, g_id: str | None, task_kind: Literal["ingest_feature where_part = " AND ".join(where_clauses) if where_clauses else "true" - res = await conn.fetch(f"SELECT * FROM tasks WHERE genome_id = $1 {where_part}", *params) + res = await conn.fetch(f"SELECT * FROM tasks WHERE {where_part}", *params) return tuple(self.deserialize_task(r) for r in res) async def update_task_status(self, t_id: int, status: TaskStatus, message: str = ""): From 1121e4dea2d3941739e59dc6991386ea633b4544 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 16 May 2024 16:56:41 -0400 Subject: [PATCH 099/114] feat: add name_q argument for text searching feature names --- bento_reference_service/db.py | 76 +++++++++++++--------- bento_reference_service/routers/genomes.py | 10 ++- tests/test_db.py | 31 ++------- 3 files changed, 55 insertions(+), 62 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 223d4aa..30c9640 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -346,41 +346,12 @@ async def _run_feature_id_query( return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} 
async def query_genome_features( - self, g_id: str, q: str, offset: int = 0, limit: int = 10 - ) -> tuple[list[GenomeFeature], dict]: # results, pagination dict: - id_query = f""" - SELECT feature_id FROM ( - SELECT - feature_id, - feature_name, - feature_type, - ({self._feature_inner_entries_query(None, "gf_tmp")}) entries, - ( - SELECT array_agg(gfav.attr_val) - FROM genome_feature_attributes gfa - JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id - JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id - WHERE gfa.feature = gf_tmp.id AND gfav.attr_val ~ $4 - ) attributes - FROM genome_features gf_tmp - WHERE - gf_tmp.genome_id = $1 - ) gf - WHERE - array_length(gf.attributes, 1) > 0 - OR gf.feature_id ~ $4 - OR gf.feature_name ~ $4 - OFFSET $2 - LIMIT $3 - """ - - return await self._run_feature_id_query(id_query, g_id, offset, limit, q) - - async def filter_genome_features( self, g_id: str, /, + q: str | None = None, name: str | None = None, + name_q: str | None = None, position: str | None = None, start: int | None = None, end: int | None = None, @@ -398,9 +369,42 @@ def _q_param(pv: str | int) -> str: q_params.append(pv) return f"${len(q_params) + 3}" # plus 3: g_id, offset, limit at start + if q: + query_param = _q_param(q) + gf_where_items.append( + f""" + gf.feature_id IN ( + SELECT feature_id FROM ( + SELECT + feature_id, + feature_name, + feature_type, + ({self._feature_inner_entries_query(None, "gf_tmp_1")}) entries, + ( + SELECT array_agg(gfav.attr_val) + FROM genome_feature_attributes gfa + JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id + JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id + WHERE gfa.feature = gf_tmp_1.id AND gfav.attr_val ~ {query_param} + ) attributes + FROM genome_features gf_tmp_1 + WHERE + gf_tmp_1.genome_id = $1 + ) gf_tmp_2 + WHERE + array_length(gf_tmp_2.attributes, 1) > 0 + OR gf_tmp_2.feature_id ~ {query_param} + OR gf_tmp_2.feature_name ~ {query_param} + ) + 
""" + ) + if name: gf_where_items.append(f"gf.feature_name = {_q_param(name)}") + if name_q: + gf_where_items.append(f"gf.feature_name ~ {_q_param(name_q)}") + if position: gfe_where_items.append(f"gfe.position_text ~ {_q_param(position)}") @@ -437,7 +441,15 @@ def _q_param(pv: str | int) -> str: LIMIT $3 """ - return await self._run_feature_id_query(id_query, g_id, offset, limit, *q_params) + offset = max(offset, 0) + limit = min(max(limit, 0), self._config.feature_response_record_limit) + + conn: asyncpg.Connection + async with self.connect() as conn: + id_res = await conn.fetch(id_query, g_id, offset, limit, *q_params) + final_list = await self.get_genome_features_by_ids(g_id, [r["feature_id"] for r in id_res], conn) + + return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} async def clear_genome_features(self, g_id: str): conn: asyncpg.Connection diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 31df4ca..69a8772 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -162,6 +162,7 @@ async def genomes_detail_features( genome_id: str, q: str | None = None, name: str | None = None, + name_q: str | None = None, position: str | None = None, start: int | None = None, end: int | None = None, @@ -169,12 +170,9 @@ async def genomes_detail_features( offset: int = 0, limit: int = 10, ): - if q: - results, pagination = await db.query_genome_features(genome_id, q, offset, limit) - else: - results, pagination = await db.filter_genome_features( - genome_id, name, position, start, end, feature_type, offset, limit - ) + results, pagination = await db.query_genome_features( + genome_id, q, name, name_q, position, start, end, feature_type, offset, limit + ) return { "results": results, diff --git a/tests/test_db.py b/tests/test_db.py index 569dbf6..27f688d 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -108,11 +108,15 @@ async def 
test_genome_features_summary(db: Database, db_cleanup): @pytest.mark.parametrize( - "genome_id,filters,n_results", + "genome_id,args,n_results", [ # SARS-CoV-2 (SARS_COV_2_GENOME_ID, dict(name="ORF1ab"), 3), # should get back 2 genes and 1 transcript + (SARS_COV_2_GENOME_ID, dict(name_q="ORF1"), 6), # ORF1ab, ORF1a, ORF10 (SARS_COV_2_GENOME_ID, dict(start=1, end=1000), 9), # region + 8 related to ORF1ab + (SARS_COV_2_GENOME_ID, dict(q="ORF1ab"), 3), + (SARS_COV_2_GENOME_ID, dict(q="ENSSASG00005000002"), 1), + (SARS_COV_2_GENOME_ID, dict(q="protein_coding", limit=100), 24), # hg38 subset (HG38_CHR1_F100K_GENOME_ID, dict(position="chr1:11869-"), 3), (HG38_CHR1_F100K_GENOME_ID, dict(start=12000), 10), @@ -123,29 +127,8 @@ async def test_genome_features_summary(db: Database, db_cleanup): (HG38_CHR1_F100K_GENOME_ID, dict(limit=20), 13), ], ) -async def test_filter_genome_features(db: Database, db_cleanup, genome_id: str, filters: dict, n_results: int): +async def test_query_genome_features(db: Database, db_cleanup, genome_id: str, args: dict, n_results: int): await (GENOME_ID_TO_SET_UP_FN[genome_id])(db, logging.getLogger(__name__)) - res, page = await db.filter_genome_features(genome_id, **filters) + res, page = await db.query_genome_features(genome_id, **args) assert len(res) == n_results assert page["total"] == n_results - - -async def test_query_genome_features(db: Database, db_cleanup): - logger = logging.getLogger(__name__) - await _set_up_sars_cov_2_genome_and_features(db, logger) - - # - should get back 2 genes and 1 transcript - res, page = await db.query_genome_features(SARS_COV_2_GENOME_ID, q="ORF1ab") - assert len(res) == 3 - assert page["total"] == 3 - - # - filter by q - should get back 1 gene - res, page = await db.query_genome_features(SARS_COV_2_GENOME_ID, q="ENSSASG00005000002") - assert len(res) == 1 - assert page["total"] == 1 - assert res[0].feature_id == "gene:ENSSASG00005000002" - - # - query by attribute value - res, page = await 
db.query_genome_features(SARS_COV_2_GENOME_ID, q="protein_coding", limit=100) # protein_coding - assert len(res) == 24 - assert page["total"] == 24 From 7a18f65c50493fa7d02b3acdb407bc4270e591b5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 16 May 2024 16:57:13 -0400 Subject: [PATCH 100/114] fix(features): bad log for feature attribute values --- bento_reference_service/db.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 30c9640..a184b39 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -618,7 +618,8 @@ async def bulk_ingest_genome_features(self, features: tuple[GenomeFeature, ...]) new_attribute_values: list[tuple[int, str]] = [(iv, sv) for sv, iv in new_attr_value_ids.items()] self.logger.debug( - f"bulk_ingest_genome_features: have {len(new_attribute_keys)} feature attribute values for batch" + f"bulk_ingest_genome_features: have {len(new_attribute_values)} new feature attribute values for " + f"batch" ) await conn.copy_records_to_table( "genome_feature_attribute_values", columns=["id", "attr_val"], records=new_attribute_values From 864f267d09fc3ecd42837dbdc9b234dfa88fecba Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 16 May 2024 16:57:44 -0400 Subject: [PATCH 101/114] test: task list route --- tests/shared_data.py | 3 +++ tests/shared_functions.py | 13 +++++++++++++ tests/test_genome_routes.py | 10 ++-------- tests/test_task_routes.py | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 8 deletions(-) create mode 100644 tests/shared_functions.py create mode 100644 tests/test_task_routes.py diff --git a/tests/shared_data.py b/tests/shared_data.py index 69bbc15..5be21a7 100644 --- a/tests/shared_data.py +++ b/tests/shared_data.py @@ -13,6 +13,7 @@ "HG38_CHR1_F100K_GENOME_ID", "TEST_GENOME_HG38_CHR1_F100K", "TEST_GENOME_HG38_CHR1_F100K_OBJ", + "AUTHORIZATION_HEADER", ] DATA_DIR = 
(pathlib.Path(__file__).parent / "data").absolute() @@ -70,3 +71,5 @@ ], } TEST_GENOME_HG38_CHR1_F100K_OBJ = Genome(**TEST_GENOME_HG38_CHR1_F100K) + +AUTHORIZATION_HEADER = {"Authorization": "Token bearer"} diff --git a/tests/shared_functions.py b/tests/shared_functions.py new file mode 100644 index 0000000..2f2e915 --- /dev/null +++ b/tests/shared_functions.py @@ -0,0 +1,13 @@ +from aioresponses import aioresponses +from fastapi.testclient import TestClient +from httpx import Response + +from .shared_data import AUTHORIZATION_HEADER + +__all__ = ["create_genome_with_permissions"] + + +def create_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses, genome: dict) -> Response: + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.post("/genomes", json=genome, headers=AUTHORIZATION_HEADER) + return res diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index 0a1b7ea..6bdb466 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -17,14 +17,14 @@ TEST_GENOME_SARS_COV_2_OBJ, TEST_GENOME_HG38_CHR1_F100K, TEST_GENOME_HG38_CHR1_F100K_OBJ, + AUTHORIZATION_HEADER, ) +from .shared_functions import create_genome_with_permissions # all tests are async so that db_cleanup (an async fixture) properly works. not sure why it's this way. 
pytestmark = pytest.mark.asyncio() -AUTHORIZATION_HEADER = {"Authorization": "Token bearer"} - async def test_genome_list(test_client: TestClient): res = test_client.get("/genomes") @@ -53,12 +53,6 @@ async def test_404s_with_no_genomes(test_client: TestClient): assert res.status_code == status.HTTP_404_NOT_FOUND -def create_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses, genome: dict) -> Response: - aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) - res = test_client.post("/genomes", json=genome, headers=AUTHORIZATION_HEADER) - return res - - def create_covid_genome_with_permissions(test_client: TestClient, aioresponse: aioresponses) -> Response: return create_genome_with_permissions(test_client, aioresponse, TEST_GENOME_SARS_COV_2) diff --git a/tests/test_task_routes.py b/tests/test_task_routes.py new file mode 100644 index 0000000..996a052 --- /dev/null +++ b/tests/test_task_routes.py @@ -0,0 +1,33 @@ +import pytest +from aioresponses import aioresponses +from fastapi.testclient import TestClient + +from bento_reference_service.db import Database + +from .shared_data import SARS_COV_2_GENOME_ID, TEST_GENOME_SARS_COV_2, AUTHORIZATION_HEADER +from .shared_functions import create_genome_with_permissions + + +@pytest.mark.asyncio() +async def test_task_routes(test_client: TestClient, aioresponse: aioresponses, db: Database, db_cleanup): + # prerequisite: set up a genome + create_genome_with_permissions(test_client, aioresponse, TEST_GENOME_SARS_COV_2) + + # prerequisite: initialize the database for the web app + validate there aren't any tasks + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.get("/tasks", headers=AUTHORIZATION_HEADER) + assert res.status_code == 200 + rd = res.json() + assert len(rd) == 0 + + # prerequisite: set up a dummy task + await db.create_task(SARS_COV_2_GENOME_ID, "ingest_features") + + # make sure the task now shows up 
in the list of tasks in the initial state + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.get("/tasks", headers=AUTHORIZATION_HEADER) + assert res.status_code == 200 + rd = res.json() + assert len(rd) == 1 + assert rd[0]["genome_id"] == SARS_COV_2_GENOME_ID + assert rd[0]["status"] == "queued" From 3204c079b00fdb984d0ede028372713ac6657f50 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 16 May 2024 17:04:14 -0400 Subject: [PATCH 102/114] lint(db): rm unused function --- bento_reference_service/db.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index a184b39..5a554ad 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -332,19 +332,6 @@ async def get_genome_feature_by_id(self, g_id: str, f_id: str) -> GenomeFeature res = await self.get_genome_features_by_ids(g_id, [f_id]) return res[0] if res else None - async def _run_feature_id_query( - self, id_query: str, g_id: str, offset: int, limit: int, *args - ) -> tuple[list[GenomeFeature], dict]: # results, pagination dict - offset = max(offset, 0) - limit = min(max(limit, 0), self._config.feature_response_record_limit) - - conn: asyncpg.Connection - async with self.connect() as conn: - id_res = await conn.fetch(id_query, g_id, offset, limit, *args) - final_list = await self.get_genome_features_by_ids(g_id, [r["feature_id"] for r in id_res], conn) - - return final_list, {"offset": offset, "limit": limit, "total": len(id_res)} - async def query_genome_features( self, g_id: str, From 01ae03ff5ff6e880cf3c863b2d9e5eba9f322a08 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 17 May 2024 09:38:38 -0400 Subject: [PATCH 103/114] feat: include time taken in features query response --- bento_reference_service/routers/genomes.py | 4 ++++ tests/test_genome_routes.py | 2 ++ 2 files changed, 6 insertions(+) diff --git 
a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 69a8772..8e9f278 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -2,6 +2,7 @@ import asyncpg import traceback +from datetime import datetime from fastapi import APIRouter, BackgroundTasks, HTTPException, Query, Request, UploadFile, status from fastapi.responses import StreamingResponse from typing import Annotated @@ -170,6 +171,8 @@ async def genomes_detail_features( offset: int = 0, limit: int = 10, ): + st = datetime.now() + results, pagination = await db.query_genome_features( genome_id, q, name, name_q, position, start, end, feature_type, offset, limit ) @@ -177,6 +180,7 @@ async def genomes_detail_features( return { "results": results, "pagination": pagination, + "time": (datetime.now() - st).total_seconds(), } diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index 6bdb466..cba98e0 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -274,6 +274,8 @@ async def test_genome_feature_endpoints(test_client: TestClient, aioresponse: ai srd = sr.json() assert len(srd["results"]) == 1 assert srd["pagination"]["total"] == 1 + assert isinstance(srd.get("time"), float) + assert srd["time"] < 0.2 # this is a very basic operation on a small dataset and should be fast. 
# Test we can filter genome features (ID used as name) sr = test_client.get(f"/genomes/{genome.id}/features", params={"name": "CDS:ENSSASP00005000003"}) From b575d44f129811d510510087563ebede09fc1dfd Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 17 May 2024 09:57:44 -0400 Subject: [PATCH 104/114] chore(features): shorter logs for fallback to feature name/missing ID --- bento_reference_service/features.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 4f2d1fe..c2ecfc9 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -148,16 +148,6 @@ def iter_features( try: record_attributes = parse_attributes(feature_raw_attributes) - feature_id = extract_feature_id(rec, record_attributes) - feature_name = extract_feature_name(rec, record_attributes) - - if feature_id is None: - logger.warning(f"Skipping unsupported feature {i}: type={feature_type}, no ID retrieval; {rec}") - continue - - if feature_name is None: - logger.warning(f"Using ID as name for feature {i}: {rec}") - feature_name = feature_id # - coordinates from PySAM are 0-based, semi-open # - to convert to 1-based semi-open coordinates like in the original GFF3, we add 1 to start @@ -165,6 +155,22 @@ def iter_features( # leave it as-is) start_pos = rec.start + 1 end_pos = rec.end + + feature_id = extract_feature_id(rec, record_attributes) + if feature_id is None: + logger.warning( + f"Skipping unsupported feature {i}: type={feature_type}, no ID retrieval; " + f"{contig_name}:{start_pos}-{end_pos}" + ) + continue + + feature_name = extract_feature_name(rec, record_attributes) + if feature_name is None: + logger.warning( + f"Using ID as name for feature {i}: {feature_id} {contig_name}:{start_pos}-{end_pos}" + ) + feature_name = feature_id + entry = m.GenomeFeatureEntry( start_pos=start_pos, end_pos=end_pos, From 
c772412516ed613c46556fe71c6aeef5a9925b45 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 17 May 2024 09:58:10 -0400 Subject: [PATCH 105/114] fix: lifespan function for moving tasks --- bento_reference_service/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bento_reference_service/main.py b/bento_reference_service/main.py index 3272c4a..c489ddc 100644 --- a/bento_reference_service/main.py +++ b/bento_reference_service/main.py @@ -32,11 +32,12 @@ async def lifespan(_app: FastAPI): # If we have any tasks that are still marked as "running" on application startup, we need to move them to the error # state. await db.move_running_tasks_to_error() + await db.close() yield -app = FastAPI() +app = FastAPI(lifespan=lifespan) # Attach different routers to the app, for: # - genome listing From 14253994f7f47c9064638eac2356f6c6518baeda Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 17 May 2024 09:59:25 -0400 Subject: [PATCH 106/114] test: tasks detail endpoint --- tests/test_task_routes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_task_routes.py b/tests/test_task_routes.py index 996a052..b83ed27 100644 --- a/tests/test_task_routes.py +++ b/tests/test_task_routes.py @@ -31,3 +31,12 @@ async def test_task_routes(test_client: TestClient, aioresponse: aioresponses, d assert len(rd) == 1 assert rd[0]["genome_id"] == SARS_COV_2_GENOME_ID assert rd[0]["status"] == "queued" + + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.get(f"/tasks/{rd[0]['id']}", headers=AUTHORIZATION_HEADER) + rd2 = res.json() + assert rd[0] == rd2 + + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.get(f"/tasks/0", headers=AUTHORIZATION_HEADER) + assert res.status_code == 404 From b73f11ae8138a49e9b52dc58f603cac69981cca6 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 17 May 2024 10:14:43 -0400 Subject: [PATCH 
107/114] feat: fuzzy search params for feature search args: q/name --- bento_reference_service/db.py | 55 +++++++++++----------- bento_reference_service/routers/genomes.py | 5 +- bento_reference_service/sql/schema.sql | 6 +++ tests/conftest.py | 2 + tests/test_db.py | 6 ++- tests/test_genome_routes.py | 12 +++++ 6 files changed, 55 insertions(+), 31 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 5a554ad..025367d 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -310,12 +310,10 @@ async def get_genome_features_by_ids( ) parents, ( WITH attrs_tmp AS ( - SELECT gfak.attr_key AS attr_key, array_agg(gfav.attr_val) attr_vals - FROM genome_feature_attributes gfa - JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id - JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id - WHERE gfa.feature = gf.id - GROUP BY gfak.attr_key + SELECT gfav.attr_key, array_agg(attr_val) attr_vals + FROM genome_feature_attributes_view gfav + WHERE gfav.feature = gf.id + GROUP BY gfav.attr_key ) SELECT jsonb_object_agg(attrs_tmp.attr_key, attrs_tmp.attr_vals) FROM attrs_tmp ) attributes @@ -337,8 +335,9 @@ async def query_genome_features( g_id: str, /, q: str | None = None, + q_fzy: bool = False, name: str | None = None, - name_q: str | None = None, + name_fzy: bool = False, position: str | None = None, start: int | None = None, end: int | None = None, @@ -348,7 +347,9 @@ async def query_genome_features( ) -> tuple[list[GenomeFeature], dict]: # list of genome features + pagination dict object # TODO: refactor to use standard Bento search in the future, when Bento search makes more sense + gf_select_items: list[str] = [] gf_where_items: list[str] = [] + gf_order_items: list[str] = [] gfe_where_items: list[str] = [] q_params: list[str | int] = [] @@ -358,6 +359,7 @@ def _q_param(pv: str | int) -> str: if q: query_param = _q_param(q) + q_op = "%" if q_fzy else "~" gf_where_items.append( f""" 
gf.feature_id IN ( @@ -365,35 +367,33 @@ def _q_param(pv: str | int) -> str: SELECT feature_id, feature_name, - feature_type, - ({self._feature_inner_entries_query(None, "gf_tmp_1")}) entries, - ( - SELECT array_agg(gfav.attr_val) - FROM genome_feature_attributes gfa - JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id - JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id - WHERE gfa.feature = gf_tmp_1.id AND gfav.attr_val ~ {query_param} - ) attributes + feature_type FROM genome_features gf_tmp_1 WHERE - gf_tmp_1.genome_id = $1 + gf_tmp_1.genome_id = $1 AND ( + gf_tmp_1.feature_id {q_op} {query_param} + OR gf_tmp_1.feature_name {q_op} {query_param} + OR EXISTS ( + SELECT attr_val FROM genome_feature_attributes_view gfav + WHERE gfav.feature = gf_tmp_1.id AND gfav.attr_val {q_op} {query_param} + ) + ) ) gf_tmp_2 - WHERE - array_length(gf_tmp_2.attributes, 1) > 0 - OR gf_tmp_2.feature_id ~ {query_param} - OR gf_tmp_2.feature_name ~ {query_param} ) """ ) if name: - gf_where_items.append(f"gf.feature_name = {_q_param(name)}") - - if name_q: - gf_where_items.append(f"gf.feature_name ~ {_q_param(name_q)}") + param = _q_param(name) + if name_fzy: + gf_select_items.append(f"similarity(gf.feature_name, {param}) gf_fn_sml") + gf_where_items.append(f"gf.feature_name % {param}") + gf_order_items.append("gf_fn_sml DESC") + else: + gf_where_items.append(f"gf.feature_name = {param}") if position: - gfe_where_items.append(f"gfe.position_text ~ {_q_param(position)}") + gfe_where_items.append(f"gfe.position_text ILIKE {_q_param(position + '%')}") if start is not None: gfe_where_items.append(f"gfe.start_pos >= {_q_param(start)}") @@ -411,7 +411,7 @@ def _q_param(pv: str | int) -> str: gfe_where_clause = " AND ".join(gfe_where_items) if gfe_where_items else None id_query = f""" - SELECT feature_id FROM ( + SELECT feature_id {", " + ", ".join(gf_select_items) if gf_select_items else ""} FROM ( SELECT feature_id, feature_name, @@ -424,6 +424,7 @@ def 
_q_param(pv: str | int) -> str: WHERE {"jsonb_array_length(gf.entries) > 0 AND" if gfe_where_clause else ""} {where_clause} + {"ORDER BY " + ", ".join(gf_order_items) if gf_order_items else ""} OFFSET $2 LIMIT $3 """ diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index 8e9f278..a51aff5 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -162,8 +162,9 @@ async def genomes_detail_features( db: DatabaseDependency, genome_id: str, q: str | None = None, + q_fzy: bool = False, name: str | None = None, - name_q: str | None = None, + name_fzy: bool = False, position: str | None = None, start: int | None = None, end: int | None = None, @@ -174,7 +175,7 @@ async def genomes_detail_features( st = datetime.now() results, pagination = await db.query_genome_features( - genome_id, q, name, name_q, position, start, end, feature_type, offset, limit + genome_id, q, q_fzy, name, name_fzy, position, start, end, feature_type, offset, limit ) return { diff --git a/bento_reference_service/sql/schema.sql b/bento_reference_service/sql/schema.sql index b305874..8a2d2e6 100644 --- a/bento_reference_service/sql/schema.sql +++ b/bento_reference_service/sql/schema.sql @@ -152,6 +152,12 @@ CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_key_idx CREATE INDEX IF NOT EXISTS genome_feature_attributes_attr_val_idx ON genome_feature_attributes (feature, attr_val); +CREATE OR REPLACE VIEW genome_feature_attributes_view AS + SELECT gfa.feature feature, gfak.attr_key attr_key, gfav.attr_val attr_val + FROM genome_feature_attributes gfa + JOIN genome_feature_attribute_keys gfak ON gfa.attr_key = gfak.id + JOIN genome_feature_attribute_values gfav ON gfa.attr_val = gfav.id; + DO $$ BEGIN CREATE TYPE task_kind AS ENUM ('ingest_features'); diff --git a/tests/conftest.py b/tests/conftest.py index 8af7ae0..d15ea08 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,6 +41,8 @@ async 
def db_cleanup(db: Database): DROP TYPE IF EXISTS task_kind; DROP TYPE IF EXISTS task_status; + DROP VIEW genome_feature_attributes_view; + DROP INDEX IF EXISTS genome_features_feature_id_trgm_gin; DROP INDEX IF EXISTS genome_features_feature_name_trgm_gin; DROP INDEX IF EXISTS genome_feature_entries_position_text_trgm_gin; diff --git a/tests/test_db.py b/tests/test_db.py index 27f688d..8742858 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -112,11 +112,13 @@ async def test_genome_features_summary(db: Database, db_cleanup): [ # SARS-CoV-2 (SARS_COV_2_GENOME_ID, dict(name="ORF1ab"), 3), # should get back 2 genes and 1 transcript - (SARS_COV_2_GENOME_ID, dict(name_q="ORF1"), 6), # ORF1ab, ORF1a, ORF10 + # ORF1ab, ORF1a, ORF10 should be top 6 results, but we get more back since it's fuzzy + # (ORF3a, ORF6, ORF7[a|b], ORF8): + (SARS_COV_2_GENOME_ID, dict(name="ORF1", name_fzy=True, limit=100), 16), (SARS_COV_2_GENOME_ID, dict(start=1, end=1000), 9), # region + 8 related to ORF1ab (SARS_COV_2_GENOME_ID, dict(q="ORF1ab"), 3), (SARS_COV_2_GENOME_ID, dict(q="ENSSASG00005000002"), 1), - (SARS_COV_2_GENOME_ID, dict(q="protein_coding", limit=100), 24), + (SARS_COV_2_GENOME_ID, dict(q="protein_coding", q_fzy=True, limit=100), 24), # hg38 subset (HG38_CHR1_F100K_GENOME_ID, dict(position="chr1:11869-"), 3), (HG38_CHR1_F100K_GENOME_ID, dict(start=12000), 10), diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index cba98e0..a1dc078 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -266,25 +266,37 @@ async def test_genome_feature_endpoints(test_client: TestClient, aioresponse: ai # Test we can query genome features sr = test_client.get(f"/genomes/{genome.id}/feature_types") + assert sr.status_code == 200 srd = sr.json() assert sum(srd.values()) == expected_features # Test we can query genome features + + # - regular expression sr = test_client.get(f"/genomes/{genome.id}/features", params={"q": "ENSSASP00005000003"}) + assert 
sr.status_code == 200 srd = sr.json() assert len(srd["results"]) == 1 assert srd["pagination"]["total"] == 1 assert isinstance(srd.get("time"), float) assert srd["time"] < 0.2 # this is a very basic operation on a small dataset and should be fast. + # - fuzzy search + sr = test_client.get(f"/genomes/{genome.id}/features", params={"q": "ENSSASP00005000003", "q_fzy": "true"}) + assert sr.status_code == 200 + srd = sr.json() + assert len(srd["results"]) == 10 # fuzzy search yields many results + # Test we can filter genome features (ID used as name) sr = test_client.get(f"/genomes/{genome.id}/features", params={"name": "CDS:ENSSASP00005000003"}) + assert sr.status_code == 200 srd = sr.json() assert len(srd["results"]) == 1 assert srd["pagination"]["total"] == 1 # Test we can list genome features - we get back the first 10 sr = test_client.get(f"/genomes/{genome.id}/features") + assert sr.status_code == 200 srd = sr.json() assert len(srd["results"]) == 10 assert srd["pagination"]["offset"] == 0 From fcbe4fa277d733fd3a9c7f2cf224ce0dec173c10 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 21 May 2024 09:13:53 -0400 Subject: [PATCH 108/114] refact(features): simplify feature generator a bit --- bento_reference_service/features.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index c2ecfc9..fddb2cd 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -120,17 +120,15 @@ def iter_features( genome_id = genome.id - gff = pysam.TabixFile(str(gff_path), index=str(gff_index_path)) total_processed: int = 0 - try: - features_by_id: dict[str, m.GenomeFeature] = {} - + with pysam.TabixFile(str(gff_path), index=str(gff_index_path)) as gff: for contig in genome.contigs: contig_name = contig.name - logger.info(f"Indexing features from contig {contig_name}") + contig_features_by_id: dict[str, m.GenomeFeature] = {} + try: 
fetch_iter = gff.fetch(reference=contig.name, parser=pysam.asGFF3()) except ValueError as e: @@ -179,8 +177,8 @@ def iter_features( phase=rec.frame, ) - if feature_id in features_by_id: - features_by_id[feature_id].entries.append(entry) + if feature_id in contig_features_by_id: + contig_features_by_id[feature_id].entries.append(entry) else: attributes: dict[str, list[str]] = { # skip attributes which have been captured in the above information: @@ -189,7 +187,7 @@ def iter_features( if k not in GFF_CAPTURED_ATTRIBUTES } - features_by_id[feature_id] = m.GenomeFeature( + contig_features_by_id[feature_id] = m.GenomeFeature( genome_id=genome_id, contig_name=contig_name, strand=rec.strand or ".", # None/"." <=> unstranded @@ -214,11 +212,7 @@ def iter_features( if total_processed % GFF_LOG_PROGRESS_INTERVAL == 0: logger.info(f"Processed {total_processed} features") - yield tuple(features_by_id.values()) - features_by_id.clear() - - finally: - gff.close() + yield tuple(contig_features_by_id.values()) async def ingest_features( From b7b8f68d9a154dea6c51a10d0d80598c289fb4ea Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 21 May 2024 13:42:44 -0400 Subject: [PATCH 109/114] refact: rewrite feature ingestion to be a bit more RESTy --- bento_reference_service/db.py | 11 +++ bento_reference_service/features.py | 64 ++++++++++---- bento_reference_service/models.py | 14 ++- bento_reference_service/routers/genomes.py | 52 ++--------- bento_reference_service/routers/tasks.py | 35 +++++++- .../workflows/wdls/fasta_ref.wdl | 17 ++-- .../workflows/wdls/gff3_annot.wdl | 87 ++++++++++++++++--- tests/test_db.py | 4 +- tests/test_genome_routes.py | 59 ++++++++++--- tests/test_task_routes.py | 9 ++ 10 files changed, 254 insertions(+), 98 deletions(-) diff --git a/bento_reference_service/db.py b/bento_reference_service/db.py index 025367d..9bc86f3 100644 --- a/bento_reference_service/db.py +++ b/bento_reference_service/db.py @@ -14,6 +14,7 @@ ContigWithRefgetURI, Genome, 
GenomeWithURIs, + GenomeGFF3Patch, OntologyTerm, GenomeFeatureEntry, GenomeFeature, @@ -235,6 +236,16 @@ async def create_genome(self, g: Genome, return_external_resource_uris: bool) -> return await self.get_genome(g.id, external_resource_uris=return_external_resource_uris) + async def update_genome(self, g_id: str, patch: GenomeGFF3Patch): + conn: asyncpg.Connection + async with self.connect() as conn: + await conn.execute( + "UPDATE genomes SET gff3_gz_uri = $2, gff3_gz_tbi_uri = $3 WHERE id = $1", + g_id, + patch.gff3_gz, + patch.gff3_gz_tbi, + ) + async def genome_feature_types_summary(self, g_id: str): conn: asyncpg.Connection async with self.connect() as conn: diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index fddb2cd..056a9fd 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -1,3 +1,4 @@ +import aiofiles import logging import pysam import traceback @@ -6,9 +7,12 @@ from pathlib import Path from typing import Generator from urllib.parse import unquote as url_unquote +from uuid import uuid4 from . import models as m +from .config import Config from .db import Database +from .streaming import stream_from_uri __all__ = [ "INGEST_FEATURES_TASK_KIND", @@ -217,7 +221,7 @@ def iter_features( async def ingest_features( # parameters: - genome_id: str, + genome: m.GenomeWithURIs, gff_path: Path, gff_index_path: Path, # dependencies: @@ -228,7 +232,7 @@ async def ingest_features( Given a genome ID and a path to an external GTF gene/exon/transcript annotation file, this function copies the GTF into the relevant .bentoGenome directory and ingests the annotations into an ElasticSearch index for fuzzy text querying of features. - :param genome_id: The ID of the genome to attach the annotation to. + :param genome: The genome to attach the annotation to. :param gff_path: The path to an external GTF.gz-formatted annotation file to copy and read from. 
:param gff_index_path: The path to an external index file for the above .gtf.gz. :param db: Database connection/management object. @@ -236,12 +240,7 @@ async def ingest_features( :return: None """ - genome: m.GenomeWithURIs | None = await db.get_genome(genome_id) - - if genome is None: - raise AnnotationGenomeNotFoundError(f"Genome with ID {genome_id} not found") - - logger.info(f"Ingesting gene features for genome {genome_id}...") + logger.info(f"Ingesting gene features for genome {genome.id}...") features_to_ingest = iter_features(genome, gff_path, gff_index_path, logger) n_ingested: int = 0 @@ -265,20 +264,55 @@ async def ingest_features( return n_ingested -async def ingest_features_task( - genome_id: str, gff3_gz_path: Path, gff3_gz_tbi_path: Path, task_id: int, db: Database, logger: logging.Logger -): +async def download_feature_files(genome: m.GenomeWithURIs, config: Config, logger: logging.Logger): + _, gff3_gz_iter = await stream_from_uri( + config, logger, genome.gff3_gz, range_header=None, impose_response_limit=False + ) + + _, gff3_gz_tbi_iter = await stream_from_uri( + config, logger, genome.gff3_gz_tbi, range_header=None, impose_response_limit=False + ) + + fn = config.file_ingest_tmp_dir / f"{uuid4()}.gff3.gz" + fn_tbi = config.file_ingest_tmp_dir / f"{fn}.tbi" + + # copy .gff3.gz to temporary directory for ingestion + async with aiofiles.open(fn, "wb") as fh: + while data := (await anext(gff3_gz_iter, None)): + await fh.write(data) + + logger.debug(f"Wrote GFF.gz data to {fn}; size={fn.stat().st_size}") + + # copy .gff3.gz.tbi to temporary directory for ingestion + async with aiofiles.open(fn_tbi, "wb") as fh: + while data := (await anext(gff3_gz_tbi_iter, None)): + await fh.write(data) + + logger.debug(f"Wrote GFF.gz.tbi data to {fn_tbi}; size={fn_tbi.stat().st_size}") + + return fn, fn_tbi + + +async def ingest_features_task(genome_id: str, task_id: int, config: Config, db: Database, logger: logging.Logger): # the ingest_features task moves from 
queued -> running -> (success | error) await db.update_task_status(task_id, "running") - # clear existing gene features for this genome - logger.info(f"Clearing gene features for genome {genome_id} in preparation for feature (re-)ingestion...") - await db.clear_genome_features(genome_id) + genome: m.GenomeWithURIs | None = await db.get_genome(genome_id) + if genome is None: + raise AnnotationGenomeNotFoundError(f"Genome with ID {genome_id} not found") + + # download GFF3 + GFF3 TBI file for this genome + logger.info(f"Downloading gene feature files for genome {genome_id}") + gff3_gz_path, gff3_gz_tbi_path = await download_feature_files(genome, config, logger) try: + # clear existing gene features for this genome + logger.info(f"Clearing gene features for genome {genome_id} in preparation for feature (re-)ingestion") + await db.clear_genome_features(genome_id) + # ingest gene features into the database - n_ingested = await ingest_features(genome_id, gff3_gz_path, gff3_gz_tbi_path, db, logger) + n_ingested = await ingest_features(genome, gff3_gz_path, gff3_gz_tbi_path, db, logger) await db.update_task_status(task_id, "success", message=f"ingested {n_ingested} features") except Exception as e: diff --git a/bento_reference_service/models.py b/bento_reference_service/models.py index b3dc607..f97eac5 100644 --- a/bento_reference_service/models.py +++ b/bento_reference_service/models.py @@ -9,9 +9,11 @@ "ContigWithRefgetURI", "Genome", "GenomeWithURIs", + "GenomeGFF3Patch", "GenomeFeatureEntry", "GenomeFeature", "TaskStatus", + "TaskParams", "Task", ] @@ -72,6 +74,11 @@ class GenomeWithURIs(Genome): contigs: tuple[ContigWithRefgetURI, ...] 
+class GenomeGFF3Patch(BaseModel): + gff3_gz: str # URI + gff3_gz_tbi: str # URI + + class GenomeFeatureEntry(BaseModel): start_pos: int # 1-based, inclusive end_pos: int # 1-based, exclusive @@ -102,10 +109,13 @@ class GenomeFeature(BaseModel): TaskStatus = Literal["queued", "running", "success", "error"] -class Task(BaseModel): - id: int +class TaskParams(BaseModel): genome_id: str kind: Literal["ingest_features"] + + +class Task(TaskParams): + id: int status: TaskStatus message: str created: datetime diff --git a/bento_reference_service/routers/genomes.py b/bento_reference_service/routers/genomes.py index a51aff5..4f4e6a2 100644 --- a/bento_reference_service/routers/genomes.py +++ b/bento_reference_service/routers/genomes.py @@ -1,18 +1,15 @@ -import aiofiles import asyncpg import traceback from datetime import datetime -from fastapi import APIRouter, BackgroundTasks, HTTPException, Query, Request, UploadFile, status +from fastapi import APIRouter, HTTPException, Query, Request, status from fastapi.responses import StreamingResponse from typing import Annotated -from uuid import uuid4 from .. 
import models as m from ..authz import authz_middleware from ..config import ConfigDependency from ..db import Database, DatabaseDependency -from ..features import INGEST_FEATURES_TASK_KIND, ingest_features_task from ..logger import LoggerDependency from ..streaming import generate_uri_streaming_response from .constants import DEPENDENCY_DELETE_REFERENCE_MATERIAL, DEPENDENCY_INGEST_REFERENCE_MATERIAL @@ -119,6 +116,14 @@ async def genomes_detail(genome_id: str, db: DatabaseDependency) -> m.GenomeWith return await get_genome_or_raise_404(db, genome_id) +@genome_router.patch( + "/{genome_id}", status_code=status.HTTP_204_NO_CONTENT, dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL] +) +async def genomes_patch(genome_id: str, genome_patch: m.GenomeGFF3Patch, db: DatabaseDependency): + await get_genome_or_raise_404(db, genome_id) + await db.update_genome(genome_id, genome_patch) + + @genome_router.delete( "/{genome_id}", status_code=status.HTTP_204_NO_CONTENT, @@ -225,45 +230,6 @@ async def genomes_detail_features_gff3( ) -@genome_router.put( - "/{genome_id}/features.gff3.gz", - dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL], - status_code=status.HTTP_202_ACCEPTED, -) -async def genomes_detail_features_ingest_gff3( - background_tasks: BackgroundTasks, - config: ConfigDependency, - db: DatabaseDependency, - logger: LoggerDependency, - genome_id: str, - gff3_gz: UploadFile, - gff3_gz_tbi: UploadFile, -): - # Verify that genome exists - await get_genome_or_raise_404(db, genome_id=genome_id, external_resource_uris=False) - - fn = config.file_ingest_tmp_dir / f"{uuid4()}.gff3.gz" - fn_tbi = config.file_ingest_tmp_dir / f"{fn}.tbi" - - # copy .gff3.gz to temporary directory for ingestion - async with aiofiles.open(fn, "wb") as fh: - while data := (await gff3_gz.read(config.file_response_chunk_size)): - await fh.write(data) - - logger.debug(f"Wrote GFF.gz data to {fn}; size={fn.stat().st_size}") - - # copy .gff3.gz.tbi to temporary directory for ingestion - async with 
aiofiles.open(fn_tbi, "wb") as fh: - while data := (await gff3_gz_tbi.read(config.file_response_chunk_size)): - await fh.write(data) - - logger.debug(f"Wrote GFF.gz.tbi data to {fn_tbi}; size={fn_tbi.stat().st_size}") - - task_id = await db.create_task(genome_id, INGEST_FEATURES_TASK_KIND) - background_tasks.add_task(ingest_features_task, genome_id, fn, fn_tbi, task_id, db, logger) - return {"task": f"{config.service_url_base_path}/tasks/{task_id}"} - - @genome_router.get("/{genome_id}/features.gff3.gz.tbi", dependencies=[authz_middleware.dep_public_endpoint()]) async def genomes_detail_gene_features_gff3_index( config: ConfigDependency, db: DatabaseDependency, logger: LoggerDependency, request: Request, genome_id: str diff --git a/bento_reference_service/routers/tasks.py b/bento_reference_service/routers/tasks.py index 3367847..c59b614 100644 --- a/bento_reference_service/routers/tasks.py +++ b/bento_reference_service/routers/tasks.py @@ -1,7 +1,10 @@ -from fastapi import APIRouter, HTTPException, status +from fastapi import APIRouter, BackgroundTasks, HTTPException, status +from ..config import ConfigDependency from ..db import DatabaseDependency -from ..models import Task +from ..features import ingest_features_task +from ..logger import LoggerDependency +from ..models import TaskParams, Task from .constants import DEPENDENCY_INGEST_REFERENCE_MATERIAL __all__ = ["task_router"] @@ -15,6 +18,34 @@ async def tasks_list(db: DatabaseDependency): return await db.query_tasks(None, None) +@task_router.post("", status_code=status.HTTP_201_CREATED, dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL]) +async def tasks_create( + task: TaskParams, + background_tasks: BackgroundTasks, + config: ConfigDependency, + db: DatabaseDependency, + logger: LoggerDependency, +) -> Task: + genome_id = task.genome_id + + g = await db.get_genome(genome_id) + if g is None: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=f"Genome with ID {genome_id} not found.") + + 
task_id = await db.create_task(genome_id, task.kind) + task = await db.get_task(task_id) + if task is None: # pragma: no cover + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Something went wrong when creating the task." + ) + + # currently, ingest_features is the only task type, so we don't need an if-statement to decide which task to + # dispatch. + background_tasks.add_task(ingest_features_task, genome_id, task_id, config, db, logger) + + return task + + @task_router.get("/{task_id}", dependencies=[DEPENDENCY_INGEST_REFERENCE_MATERIAL]) async def tasks_detail(task_id: int, db: DatabaseDependency) -> Task: task = await db.get_task(task_id) diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index cc7342d..ee4fd9e 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -66,6 +66,8 @@ workflow fasta_ref { fai = s1.fai, fasta_drs_uri = drs_fasta.drs_uri, fai_drs_uri = drs_fai.drs_uri, + gff3_gz_drs_uri = drs_gff3.drs_uri, + gff3_gz_tbi_drs_uri = drs_gff3_tbi.drs_uri, reference_url = reference_url, token = access_token, validate_ssl = validate_ssl @@ -75,8 +77,6 @@ workflow fasta_ref { call ingest_gff3_into_ref { input: genome_id = genome_id, - gff3_gz = select_first([gi.sorted_gff3_gz]), # Coerce File? 
into File via select_first - gff3_gz_tbi = select_first([gi.sorted_gff3_gz_tbi]), # " reference_url = reference_url, token = access_token, validate_ssl = validate_ssl, @@ -217,8 +217,6 @@ task ingest_metadata_into_ref { task ingest_gff3_into_ref { input { String genome_id - File gff3_gz - File gff3_gz_tbi String reference_url String token Boolean validate_ssl @@ -228,22 +226,21 @@ task ingest_gff3_into_ref { command <<< task_res=$( curl ~{true="" false="-k" validate_ssl} \ - -X PUT \ - -F "gff3_gz=@~{gff3_gz}" \ - -F "gff3_gz_tbi=@~{gff3_gz_tbi}" \ + -X POST \ + --json '{"genome_id": "~{genome_id}", "kind": "ingest_features"}' -H "Authorization: Bearer ~{token}" \ --fail-with-body \ - "~{reference_url}/genomes/~{genome_id}/features.gff3.gz" + "~{reference_url}/tasks" ) exit_code=$? if [[ "${exit_code}" == 0 ]]; then - task_url=$(jq -r '.task' <<< "${task_res}") + task_id=$(jq -r '.id' <<< "${task_res}") while true; do task_status_res=$( curl ~{true="" false="-k" validate_ssl} \ -H "Authorization: Bearer ~{token}" \ --fail-with-body \ - "${task_url}" + "~{reference_url}/tasks/${task_id}" ) task_exit_code=$? 
diff --git a/bento_reference_service/workflows/wdls/gff3_annot.wdl b/bento_reference_service/workflows/wdls/gff3_annot.wdl index dcaef12..df248db 100644 --- a/bento_reference_service/workflows/wdls/gff3_annot.wdl +++ b/bento_reference_service/workflows/wdls/gff3_annot.wdl @@ -16,13 +16,27 @@ workflow gff3_annot { gff3 = genome_gff3 } - # TODO: DRS ingestion + updating reference metadata record + call ingest_into_drs as drs_gff3 { + input: + file = gi.sorted_gff3_gz, + drs_url = drs_url, + access_token = access_token, + validate_ssl = validate_ssl + } + + call ingest_into_drs as drs_gff3_tbi { + input: + file = gi.sorted_gff3_gz_tbi, + drs_url = drs_url, + access_token = access_token, + validate_ssl = validate_ssl + } call ingest_gff3_into_ref { input: genome_id = genome_id, - gff3_gz = gi.sorted_gff3_gz, - gff3_gz_tbi = gi.sorted_gff3_gz_tbi, + gff3_gz_drs_uri = drs_gff3.drs_uri, + gff3_gz_tbi_drs_uri = drs_gff3_tbi.drs_uri, reference_url = reference_url, token = access_token, validate_ssl = validate_ssl @@ -59,35 +73,85 @@ task normalize_and_compress_gff3_and_index { } } +# TODO: shared file with this task +task ingest_into_drs { + input { + File file + String drs_url + String access_token + Boolean validate_ssl + } + + command <<< + drs_res=$( + curl ~{true="" false="-k" validate_ssl} \ + -X POST \ + -F "file=@~{file}" \ + -F "project_id=$project_id" \ + -F "dataset_id=$dataset_id" \ + -F "public=true" \ + -H "Authorization: Bearer ~{access_token}" \ + --fail-with-body \ + "~{drs_url}/ingest" + ) + exit_code=$? 
+ rm '~{file}' + if [[ "${exit_code}" == 0 ]]; then + jq -r '.self_uri' <<< "${drs_res}" + else + exit "${exit_code}" + fi + >>> + + output { + String drs_uri = read_string(stdout()) + } +} + task ingest_gff3_into_ref { input { String genome_id - File gff3_gz - File gff3_gz_tbi + String gff3_gz_drs_uri + String gff3_gz_tbi_drs_uri String reference_url String token Boolean validate_ssl } command <<< + patch_res=$( + curl ~{true="" false="-k" validate_ssl} \ + -X PATCH \ + --json '{"gff3_gz": "~{gff3_gz_drs_uri}", "gff3_gz_tbi": "~{gff3_gz_tbi_drs_uri}"}' \ + -H "Authorization: Bearer ~{token}" \ + --fail-with-body \ + "~{reference_url}/genomes/~{genome_id}" + ) + + exit_code=$? + if [[ "${exit_code}" != 0 ]]; then + echo "patch failed with body: ${patch_res}" >&2 + exit "${exit_code}" + fi + task_res=$( curl ~{true="" false="-k" validate_ssl} \ - -X PUT \ - -F "gff3_gz=@~{gff3_gz}" \ - -F "gff3_gz_tbi=@~{gff3_gz_tbi}" \ + -X POST \ + --json '{"genome_id": "~{genome_id}", "kind": "ingest_features"}' \ -H "Authorization: Bearer ~{token}" \ --fail-with-body \ - "~{reference_url}/genomes/~{genome_id}/features.gff3.gz" + "~{reference_url}/tasks" ) exit_code=$? if [[ "${exit_code}" == 0 ]]; then - task_url=$(jq -r '.task' <<< "${task_res}") + echo "task created: ${task_res}" + task_id=$(jq -r '.id' <<< "${task_res}") while true; do task_status_res=$( curl ~{true="" false="-k" validate_ssl} \ -H "Authorization: Bearer ~{token}" \ --fail-with-body \ - "${task_url}" + "~{reference_url}/tasks/${task_id}" ) task_exit_code=$? 
@@ -112,6 +176,7 @@ task ingest_gff3_into_ref { sleep 10 done else + echo "task creation failed: ${task_res}" >&2 exit "${exit_code}" fi >>> diff --git a/tests/test_db.py b/tests/test_db.py index 8742858..b55352c 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -82,7 +82,7 @@ async def _set_up_sars_cov_2_genome_and_features(db: Database, logger: logging.L # prerequesite: ingest features gff3_gz_path = Path(TEST_GENOME_SARS_COV_2_OBJ.gff3_gz.replace("file://", "")) gff3_gz_tbi_path = Path(TEST_GENOME_SARS_COV_2_OBJ.gff3_gz_tbi.replace("file://", "")) - await ingest_features(SARS_COV_2_GENOME_ID, gff3_gz_path, gff3_gz_tbi_path, db, logger) + await ingest_features(await db.get_genome(SARS_COV_2_GENOME_ID), gff3_gz_path, gff3_gz_tbi_path, db, logger) async def _set_up_hg38_subset_genome_and_features(db: Database, logger: logging.Logger): @@ -91,7 +91,7 @@ async def _set_up_hg38_subset_genome_and_features(db: Database, logger: logging. # prerequesite: ingest features gff3_gz_path = Path(TEST_GENOME_HG38_CHR1_F100K_OBJ.gff3_gz.replace("file://", "")) gff3_gz_tbi_path = Path(TEST_GENOME_HG38_CHR1_F100K_OBJ.gff3_gz_tbi.replace("file://", "")) - await ingest_features(HG38_CHR1_F100K_GENOME_ID, gff3_gz_path, gff3_gz_tbi_path, db, logger) + await ingest_features(await db.get_genome(HG38_CHR1_F100K_GENOME_ID), gff3_gz_path, gff3_gz_tbi_path, db, logger) GENOME_ID_TO_SET_UP_FN = { diff --git a/tests/test_genome_routes.py b/tests/test_genome_routes.py index a1dc078..e6bbc2f 100644 --- a/tests/test_genome_routes.py +++ b/tests/test_genome_routes.py @@ -153,6 +153,45 @@ async def test_genome_detail_endpoints(test_client: TestClient, aioresponse: aio assert res.content == fh.read() +async def test_genome_without_gff3_and_then_patch(test_client: TestClient, aioresponse: aioresponses, db_cleanup): + covid_genome_without_gff3 = {**TEST_GENOME_SARS_COV_2} + del covid_genome_without_gff3["gff3_gz"] + del covid_genome_without_gff3["gff3_gz_tbi"] + + # ingest a genome without 
GFF3/TBI URIs (we'll add them in later) + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.post("/genomes", json=covid_genome_without_gff3, headers=AUTHORIZATION_HEADER) + assert res.status_code == status.HTTP_201_CREATED + + # check that the genome ingested + res = test_client.get(f"/genomes/{SARS_COV_2_GENOME_ID}") + assert res.status_code == status.HTTP_200_OK + + # check that we get 404s for the gff3 files, since we haven't ingested them yet + res = test_client.get(f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz") + assert res.status_code == status.HTTP_404_NOT_FOUND + res = test_client.get(f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz.tbi") + assert res.status_code == status.HTTP_404_NOT_FOUND + + # update the genome with GFF3/TBI URIs + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.patch( + f"/genomes/{SARS_COV_2_GENOME_ID}", + json={ + "gff3_gz": TEST_GENOME_SARS_COV_2["gff3_gz"], + "gff3_gz_tbi": TEST_GENOME_SARS_COV_2["gff3_gz_tbi"], + }, + headers=AUTHORIZATION_HEADER, + ) + assert res.status_code == status.HTTP_204_NO_CONTENT + + # check that we can now access the GFF3/TBI + res = test_client.get(f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz") + assert res.status_code == status.HTTP_200_OK + res = test_client.get(f"/genomes/{SARS_COV_2_GENOME_ID}/features.gff3.gz.tbi") + assert res.status_code == status.HTTP_200_OK + + async def test_genome_delete(test_client: TestClient, aioresponse: aioresponses, db_cleanup): # setup: create genome TODO: fixture create_covid_genome_with_permissions(test_client, aioresponse) @@ -183,15 +222,11 @@ def _file_uri_to_path(uri: str) -> str: def _put_genome_features(test_client: TestClient, genome: Genome) -> Response: - gff3_gz = _file_uri_to_path(genome.gff3_gz) - gff3_gz_tbi = _file_uri_to_path(genome.gff3_gz_tbi) - - with open(gff3_gz, "rb") as gff3_fh, open(gff3_gz_tbi, "rb") as tbi_fh: - 
return test_client.put( - f"/genomes/{genome.id}/features.gff3.gz", - files={"gff3_gz": gff3_fh, "gff3_gz_tbi": tbi_fh}, - headers=AUTHORIZATION_HEADER, - ) + return test_client.post( + "/tasks", + json={"genome_id": genome.id, "kind": "ingest_features"}, + headers=AUTHORIZATION_HEADER, + ) def _test_ingest_genome_features(test_client: TestClient, genome: Genome, expected_features: int): @@ -199,10 +234,8 @@ def _test_ingest_genome_features(test_client: TestClient, genome: Genome, expect res = _put_genome_features(test_client, genome) - assert res.status_code == status.HTTP_202_ACCEPTED - data = res.json() - assert "task" in data - task_id = data["task"].split("/")[-1] + assert res.status_code == status.HTTP_201_CREATED + task_id = res.json()["id"] # Test we can access the task and that it eventually succeeds diff --git a/tests/test_task_routes.py b/tests/test_task_routes.py index b83ed27..e900ed6 100644 --- a/tests/test_task_routes.py +++ b/tests/test_task_routes.py @@ -8,6 +8,15 @@ from .shared_functions import create_genome_with_permissions +@pytest.mark.asyncio() +async def test_task_create_no_genome(test_client: TestClient, aioresponse: aioresponses, db_cleanup): + aioresponse.post("https://authz.local/policy/evaluate", payload={"result": [[True]]}) + res = test_client.post("/tasks", json={"genome_id": "DNE", "kind": "ingest_features"}, headers=AUTHORIZATION_HEADER) + assert res.status_code == 400 # 400: no genome + err = res.json() + assert err["errors"][0]["message"] == f"Genome with ID DNE not found." 
+ + @pytest.mark.asyncio() async def test_task_routes(test_client: TestClient, aioresponse: aioresponses, db: Database, db_cleanup): # prerequesite: set up a genome From 93699a5e98ab53c06e499a7c7e1c857010fbb439 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 21 May 2024 22:03:46 -0400 Subject: [PATCH 110/114] chore(streaming): add additional debug log when starting to stream URL --- bento_reference_service/streaming.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bento_reference_service/streaming.py b/bento_reference_service/streaming.py index be39f81..b791c0c 100644 --- a/bento_reference_service/streaming.py +++ b/bento_reference_service/streaming.py @@ -196,6 +196,7 @@ async def stream_from_uri( ) # Don't pass Authorization header to possibly external sources + logger.debug(f"Streaming from HTTP URL: {url}") stream = stream_http( config, url, From 24b341d8464c4669f764b7c2d544a6bfb2772c42 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 21 May 2024 22:03:56 -0400 Subject: [PATCH 111/114] fix: typo in fasta ref WDL --- bento_reference_service/workflows/wdls/fasta_ref.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index ee4fd9e..a3b80ae 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -227,7 +227,7 @@ task ingest_gff3_into_ref { task_res=$( curl ~{true="" false="-k" validate_ssl} \ -X POST \ - --json '{"genome_id": "~{genome_id}", "kind": "ingest_features"}' + --json '{"genome_id": "~{genome_id}", "kind": "ingest_features"}' \ -H "Authorization: Bearer ~{token}" \ --fail-with-body \ "~{reference_url}/tasks" From de199e38131c1235f3d41da6a3359d2e072975fd Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 21 May 2024 22:04:12 -0400 Subject: [PATCH 112/114] chore(config): increase default file response chunk size --- 
bento_reference_service/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/config.py b/bento_reference_service/config.py index c000839..46433b2 100644 --- a/bento_reference_service/config.py +++ b/bento_reference_service/config.py @@ -23,7 +23,7 @@ class Config(BentoBaseConfig): file_ingest_tmp_dir: Path = Path(__file__).parent.parent / "tmp" # Default to repository `tmp` folder file_ingest_chunk_size: int = 1024 * 256 # 256 KiB at a time - file_response_chunk_size: int = 1024 * 16 # 16 KiB at a time + file_response_chunk_size: int = 1024 * 256 # 256 KiB at a time response_substring_limit: int = 10000 # TODO: Refine default feature_response_record_limit: int = 1000 From 3320bbfc1986af36863db3af7a2330d224241648 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 21 May 2024 22:10:55 -0400 Subject: [PATCH 113/114] fix(workflows): another typo in fasta_ref --- bento_reference_service/workflows/wdls/fasta_ref.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bento_reference_service/workflows/wdls/fasta_ref.wdl b/bento_reference_service/workflows/wdls/fasta_ref.wdl index a3b80ae..ddeb024 100644 --- a/bento_reference_service/workflows/wdls/fasta_ref.wdl +++ b/bento_reference_service/workflows/wdls/fasta_ref.wdl @@ -193,7 +193,7 @@ task ingest_metadata_into_ref { if [[ '~{gff3_gz_drs_uri}' != '' ]]; then # assume if this is set then both gff3 variables are set. 
cat metadata.json | \ - jq '.gff3_gz = "~{gff3_gz_drs_uri}" | .gff_gz_tbi = "~{gff3_gz_tbi_drs_uri}"' > metadata.json.tmp + jq '.gff3_gz = "~{gff3_gz_drs_uri}" | .gff3_gz_tbi = "~{gff3_gz_tbi_drs_uri}"' > metadata.json.tmp mv metadata.json.tmp metadata.json fi From 77b9e63a6d580fcfb964623ed641c73a95502ed5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Tue, 21 May 2024 22:13:37 -0400 Subject: [PATCH 114/114] fix: feature ingestion --- bento_reference_service/features.py | 57 ++++++++++++++++++----------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/bento_reference_service/features.py b/bento_reference_service/features.py index 056a9fd..d88fcbe 100644 --- a/bento_reference_service/features.py +++ b/bento_reference_service/features.py @@ -264,31 +264,32 @@ async def ingest_features( return n_ingested -async def download_feature_files(genome: m.GenomeWithURIs, config: Config, logger: logging.Logger): - _, gff3_gz_iter = await stream_from_uri( - config, logger, genome.gff3_gz, range_header=None, impose_response_limit=False - ) - - _, gff3_gz_tbi_iter = await stream_from_uri( - config, logger, genome.gff3_gz_tbi, range_header=None, impose_response_limit=False - ) +async def download_uri_into_temporary_file(uri: str, tmp: Path, config: Config, logger: logging.Logger): + logger.debug(f"Saving data from URI {uri} into temporary file {tmp}") - fn = config.file_ingest_tmp_dir / f"{uuid4()}.gff3.gz" - fn_tbi = config.file_ingest_tmp_dir / f"{fn}.tbi" + _, stream_iter = await stream_from_uri(config, logger, uri, range_header=None, impose_response_limit=False) # copy .gff3.gz to temporary directory for ingestion - async with aiofiles.open(fn, "wb") as fh: - while data := (await anext(gff3_gz_iter, None)): + async with aiofiles.open(tmp, "wb") as fh: + while data := (await anext(stream_iter, None)): await fh.write(data) - logger.debug(f"Wrote GFF.gz data to {fn}; size={fn.stat().st_size}") + logger.debug(f"Wrote downloaded data to {tmp}; 
size={tmp.stat().st_size}") - # copy .gff3.gz.tbi to temporary directory for ingestion - async with aiofiles.open(fn_tbi, "wb") as fh: - while data := (await anext(gff3_gz_tbi_iter, None)): - await fh.write(data) - logger.debug(f"Wrote GFF.gz.tbi data to {fn_tbi}; size={fn_tbi.stat().st_size}") +async def download_feature_files(genome: m.GenomeWithURIs, config: Config, logger: logging.Logger): + tmp_file_id = str(uuid4()) + + if genome.gff3_gz is None: + raise AnnotationIngestError(f"Genome {genome.id} is missing a GFF3 file") + if genome.gff3_gz_tbi is None: + raise AnnotationIngestError(f"Genome {genome.id} is missing a GFF3 Tabix index") + + fn = config.file_ingest_tmp_dir / f"{tmp_file_id}.gff3.gz" + await download_uri_into_temporary_file(genome.gff3_gz, fn, config, logger) + + fn_tbi = config.file_ingest_tmp_dir / f"{tmp_file_id}.gff3.gz.tbi" + await download_uri_into_temporary_file(genome.gff3_gz_tbi, fn_tbi, config, logger) return fn, fn_tbi @@ -300,11 +301,23 @@ async def ingest_features_task(genome_id: str, task_id: int, config: Config, db: genome: m.GenomeWithURIs | None = await db.get_genome(genome_id) if genome is None: - raise AnnotationGenomeNotFoundError(f"Genome with ID {genome_id} not found") + err = f"task {task_id}: genome with ID {genome_id} not found" + logger.error(err) + await db.update_task_status(task_id, "error", message=err) + raise AnnotationGenomeNotFoundError(err) - # download GFF3 + GFF3 TBI file for this genome - logger.info(f"Downloading gene feature files for genome {genome_id}") - gff3_gz_path, gff3_gz_tbi_path = await download_feature_files(genome, config, logger) + try: + # download GFF3 + GFF3 TBI file for this genome + logger.info(f"Downloading gene feature files for genome {genome_id}") + gff3_gz_path, gff3_gz_tbi_path = await download_feature_files(genome, config, logger) + except Exception as e: + err = ( + f"task {task_id}: encountered exception while downloading feature files: {e}; traceback: " + 
f"{traceback.format_exc()}" + ) + logger.error(err) + await db.update_task_status(task_id, "error", message=err) + raise AnnotationIngestError(err) try: # clear existing gene features for this genome