From e3c407beebfa327062c16161e041a81c6f57e5b5 Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 13 Dec 2024 14:16:41 -0500 Subject: [PATCH 01/18] rm fetch_expressions --- transcriptomics_data_service/db.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/transcriptomics_data_service/db.py b/transcriptomics_data_service/db.py index 636677a..289e885 100644 --- a/transcriptomics_data_service/db.py +++ b/transcriptomics_data_service/db.py @@ -111,8 +111,6 @@ async def create_gene_expressions(self, expressions: list[GeneExpression], trans await transaction_conn.executemany(query, records) self.logger.info(f"Inserted {len(records)} gene expression records.") - async def fetch_expressions(self) -> tuple[GeneExpression, ...]: - return tuple([r async for r in self._select_expressions(None)]) async def _select_expressions(self, exp_id: str | None) -> AsyncIterator[GeneExpression]: conn: asyncpg.Connection @@ -123,25 +121,6 @@ async def _select_expressions(self, exp_id: str | None) -> AsyncIterator[GeneExp for r in map(lambda g: self._deserialize_gene_expression(g), res): yield r - async def fetch_gene_expressions_by_experiment_id(self, experiment_result_id: str) -> Tuple[GeneExpression, ...]: - """ - Fetch gene expressions for a specific experiment_result_id. - """ - conn: asyncpg.Connection - async with self.connect() as conn: - query = """ - SELECT * FROM gene_expressions WHERE experiment_result_id = $1 - """ - res = await conn.fetch(query, experiment_result_id) - return tuple([self._deserialize_gene_expression(record) for record in res]) - - async def fetch_gene_expressions(self, experiments: list[str], method: str = "raw", paginate: bool = False) -> Tuple[Tuple[GeneExpression, ...], int]: - if not experiments: - return (), 0 - # TODO: refactor this fetch_gene_expressions_by_experiment_id and implement pagination - experiment_result_id = experiments[0] - expressions = await self.fetch_gene_expressions_by_experiment_id(experiment_result_id) - return expressions, len(expressions) def _deserialize_gene_expression(self, rec: asyncpg.Record) -> GeneExpression: return GeneExpression( From 7453f1094db40db75be38969e4f9eadbb93ccd3d Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 13 Dec 2024 14:18:09 -0500 Subject: [PATCH 02/18] implement fetch_gene_expressions accepting several params --- transcriptomics_data_service/db.py | 78 ++++++++++++++++++++++++- transcriptomics_data_service/main.py | 4 +- transcriptomics_data_service/models.py | 80 ++++++++++++++++++++++---- 3 files changed, 149 insertions(+), 13 deletions(-) diff --git a/transcriptomics_data_service/db.py b/transcriptomics_data_service/db.py index 289e885..7ba984c 100644 --- a/transcriptomics_data_service/db.py +++ b/transcriptomics_data_service/db.py @@ -1,5 +1,5 @@ import logging -from typing import Annotated, AsyncIterator, List, Tuple +from typing import Annotated, AsyncIterator, List, Tuple, Optional import asyncpg from bento_lib.db.pg_async import PgAsyncDatabase from contextlib import asynccontextmanager @@ -196,6 +196,82 @@ async def transaction_connection(self): # operations must be made using this connection for the transaction to apply yield conn + async def fetch_gene_expressions( + self, + genes: Optional[List[str]] = None, + experiments: Optional[List[str]] = None, + sample_ids: Optional[List[str]] = None, + method: str = "raw", + page: int = 1, + page_size: int = 100, + paginate: bool = True, + ) -> Tuple[List[GeneExpression], int]: + """ + Fetch gene expressions based on genes, experiments, sample_ids, and method, with optional pagination. + Returns a tuple of (expressions list, total_records count). + """ + conn: asyncpg.Connection + async with self.connect() as conn: + # Query builder + base_query = """ + SELECT gene_code, sample_id, experiment_result_id, raw_count, tpm_count, tmm_count, getmm_count + FROM gene_expressions + """ + count_query = "SELECT COUNT(*) FROM gene_expressions" + conditions = [] + params = [] + param_counter = 1 + + if genes: + conditions.append(f"gene_code = ANY(${param_counter}::text[])") + params.append(genes) + param_counter += 1 + + if experiments: + conditions.append(f"experiment_result_id = ANY(${param_counter}::text[])") + params.append(experiments) + param_counter += 1 + + if sample_ids: + conditions.append(f"sample_id = ANY(${param_counter}::text[])") + params.append(sample_ids) + param_counter += 1 + + if method != "raw": + conditions.append(f"{method}_count IS NOT NULL") + + where_clause = " WHERE " + " AND ".join(conditions) if conditions else "" + + order_clause = " ORDER BY gene_code, sample_id" + + query = base_query + where_clause + order_clause + count_query += where_clause + + # Pagination + if paginate: + limit_offset_clause = f" LIMIT ${param_counter} OFFSET ${param_counter + 1}" + params.extend([page_size, (page - 1) * page_size]) + query += limit_offset_clause + + total_records_params = params[:-2] if paginate else params + total_records = await conn.fetchval(count_query, *total_records_params) + + res = await conn.fetch(query, *params) + + expressions = [ + GeneExpression( + gene_code=record["gene_code"], + sample_id=record["sample_id"], + experiment_result_id=record["experiment_result_id"], + raw_count=record["raw_count"], + tpm_count=record["tpm_count"], + tmm_count=record["tmm_count"], + getmm_count=record["getmm_count"], + ) + for record in res + ] + + return expressions, total_records @lru_cache() def get_db(config: ConfigDependency, logger: LoggerDependency) -> Database: diff --git a/transcriptomics_data_service/main.py b/transcriptomics_data_service/main.py index e76ccc0..783462b 100644 --- a/transcriptomics_data_service/main.py +++ b/transcriptomics_data_service/main.py @@ -4,9 +4,9 @@ from transcriptomics_data_service.db import get_db from transcriptomics_data_service.routers.experiment_results import experiment_router -from transcriptomics_data_service.routers.expressions import expression_router from transcriptomics_data_service.routers.ingest import ingest_router from transcriptomics_data_service.routers.normalization import normalization_router +from transcriptomics_data_service.routers.query import query_router from . import __version__ from .config import get_config from .constants import BENTO_SERVICE_KIND, SERVICE_TYPE @@ -42,7 +42,7 @@ async def lifespan(_app: FastAPI): lifespan=lifespan, ) -app.include_router(expression_router) app.include_router(ingest_router) app.include_router(experiment_router) app.include_router(normalization_router) +app.include_router(query_router) diff --git a/transcriptomics_data_service/models.py b/transcriptomics_data_service/models.py index ab4317b..d7fb431 100644 --- a/transcriptomics_data_service/models.py +++ b/transcriptomics_data_service/models.py @@ -1,22 +1,82 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field, validator +from typing import List, Optional +from enum import Enum __all__ = [ "ExperimentResult", "GeneExpression", + "GeneExpressionData", + "PaginationMeta", + "GeneExpressionResponse", + "MethodEnum", + "QueryParameters", ] class ExperimentResult(BaseModel): - experiment_result_id: str - assembly_id: str | None = None - assembly_name: str | None = None + experiment_result_id: str = Field(..., min_length=1, max_length=255) + assembly_id: Optional[str] = Field(None, max_length=255) + assembly_name: Optional[str] = Field(None, max_length=255) class GeneExpression(BaseModel): - gene_code: str - sample_id: str - experiment_result_id: str + gene_code: str = Field(..., min_length=1, max_length=255) + sample_id: str = Field(..., min_length=1, max_length=255) + experiment_result_id: str = Field(..., min_length=1, max_length=255) raw_count: int - tpm_count: float | None = None - tmm_count: float | None = None - getmm_count: float | None = None + tpm_count: Optional[float] = None + tmm_count: Optional[float] = None + getmm_count: Optional[float] = None + + +class GeneExpressionData(BaseModel): + gene_code: str = Field(..., min_length=1, max_length=255, description="Gene code") + sample_id: str = Field(..., min_length=1, max_length=255, description="Sample ID") + experiment_result_id: str = Field(..., min_length=1, max_length=255, description="Experiment result ID") + count: float = Field(..., description="Expression count") + method: str = Field(..., description="Method used to calculate the expression count") + + +class PaginationMeta(BaseModel): + total_records: int = Field(..., ge=0, description="Total number of records") + page: int = Field(..., ge=1, description="Current page number") + page_size: int = Field(..., ge=1, le=1000, description="Number of records per page") + total_pages: int = Field(..., ge=1, description="Total number of pages") + + +class GeneExpressionResponse(BaseModel): + expressions: List[GeneExpressionData] + pagination: PaginationMeta + + +class MethodEnum(str, Enum): + raw = "raw" + tpm = "tpm" + tmm = "tmm" + getmm = "getmm" + + +class QueryParameters(BaseModel): + genes: Optional[List[str]] = Field(None, description="List of gene codes to retrieve") + experiments: Optional[List[str]] = Field(None, description="List of experiment result IDs to retrieve data from") + sample_ids: Optional[List[str]] = Field(None, description="List of sample IDs to retrieve data from") + method: MethodEnum = Field(MethodEnum.raw, description="Data method to retrieve: 'raw', 'tpm', 'tmm', 'getmm'") + page: int = Field( + 1, + ge=1, + description="Page number for pagination (must be greater than or equal to 1)", + ) + page_size: int = Field( + 100, + ge=1, + le=1000, + description="Number of records per page (between 1 and 1000)", + ) + + @validator("genes", "experiments", "sample_ids", each_item=True) + def validate_identifiers(cls, value): + if not (1 <= len(value) <= 255): + raise ValueError("Each identifier must be between 1 and 255 characters long.") + if not value.replace("_", "").isalnum(): + raise ValueError("Identifiers must contain only alphanumeric characters and underscores.") + return value From 0f41887d0c9d530ae5f0d76f18a68091e79658d6 Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 13 Dec 2024 14:19:15 -0500 Subject: [PATCH 03/18] implement query router --- transcriptomics_data_service/routers/query.py | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 transcriptomics_data_service/routers/query.py diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py new file mode 100644 index 0000000..10a9344 --- /dev/null +++ b/transcriptomics_data_service/routers/query.py @@ -0,0 +1,127 @@ +from fastapi import APIRouter, HTTPException, status, Query + +from transcriptomics_data_service.db import DatabaseDependency +from transcriptomics_data_service.logger import LoggerDependency +from transcriptomics_data_service.models import ( + GeneExpressionData, + GeneExpressionResponse, + PaginationMeta, + MethodEnum, + QueryParameters, +) + +query_router = APIRouter(prefix="/query") + + +async def get_expressions_handler( + params: QueryParameters, + db: DatabaseDependency, + logger: LoggerDependency, +): + """ + Handler for fetching and returning gene expression data. + """ + logger.info(f"Received query parameters: {params}") + + expressions, total_records = await db.fetch_gene_expressions( + genes=params.genes, + experiments=params.experiments, + sample_ids=params.sample_ids, + method=params.method.value, + page=params.page, + page_size=params.page_size, + ) + + if not expressions: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="No gene expression data found for the given parameters.", + ) + + response_data = [] + method_field = f"{params.method.value}_count" if params.method != MethodEnum.raw else "raw_count" + for expr in expressions: + count = getattr(expr, method_field) + response_item = GeneExpressionData( + gene_code=expr.gene_code, + sample_id=expr.sample_id, + experiment_result_id=expr.experiment_result_id, + count=count, + method=method_field, + ) + response_data.append(response_item) + + total_pages = (total_records + params.page_size - 1) // params.page_size + pagination_meta = PaginationMeta( + total_records=total_records, + page=params.page, + page_size=params.page_size, + total_pages=total_pages, + ) + + return GeneExpressionResponse(expressions=response_data, pagination=pagination_meta) + + +@query_router.get( + "/expressions_all", + status_code=status.HTTP_200_OK, + response_model=GeneExpressionResponse, +) +async def get_expressions_all( + db: DatabaseDependency, + logger: LoggerDependency, + method: MethodEnum = Query( + MethodEnum.raw, + description="Data method to retrieve: 'raw', 'tpm', 'tmm', 'getmm'", + ), + page: int = Query( + 1, + ge=1, + description="Page number for pagination (must be greater than or equal to 1)", + ), + page_size: int = Query( + 100, + ge=1, + le=1000, + description="Number of records per page (between 1 and 1000)", + ), +): + """ + Retrieve all gene expression data via GET request with pagination. + """ + params = QueryParameters( + genes=None, + experiments=None, + sample_ids=None, + method=method, + page=page, + page_size=page_size, + ) + + return await get_expressions_handler(params, db, logger) + + +@query_router.post( + "/expressions", + status_code=status.HTTP_200_OK, + response_model=GeneExpressionResponse, +) +async def get_expressions_post( + params: QueryParameters, + db: DatabaseDependency, + logger: LoggerDependency, +): + """ + Retrieve gene expression data via POST request. + + Example JSON body: + { + "genes": ["gene1", "gene2"], + "experiments": ["exp1"], + "sample_ids": ["sample1"], + "method": "tmm", + "page": 1, + "page_size": 100 + } + """ + return await get_expressions_handler(params, db, logger) From 4e062bf029b4535704d83b636ea89268fab7e32c Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 13 Dec 2024 14:21:39 -0500 Subject: [PATCH 04/18] lint --- transcriptomics_data_service/db.py | 145 +++++++++--------- .../routers/normalization.py | 4 +- transcriptomics_data_service/routers/query.py | 2 +- .../scripts/normalize.py | 15 +- 4 files changed, 87 insertions(+), 79 deletions(-) diff --git a/transcriptomics_data_service/db.py b/transcriptomics_data_service/db.py index 7ba984c..c839f4f 100644 --- a/transcriptomics_data_service/db.py +++ b/transcriptomics_data_service/db.py @@ -111,7 +111,6 @@ async def create_gene_expressions(self, expressions: list[GeneExpression], trans await transaction_conn.executemany(query, records) self.logger.info(f"Inserted {len(records)} gene expression records.") - async def _select_expressions(self, exp_id: str | None) -> AsyncIterator[GeneExpression]: conn: asyncpg.Connection where_clause = "WHERE experiment_result_id = $1" if exp_id is not None else "" @@ -121,7 +120,6 @@ async def _select_expressions(self, exp_id: str | None) -> AsyncIterator[GeneExp for r in map(lambda g: self._deserialize_gene_expression(g), res): yield r - def _deserialize_gene_expression(self, rec: asyncpg.Record) -> GeneExpression: return GeneExpression( gene_code=rec["gene_code"], @@ -197,81 +195,82 @@ async def transaction_connection(self): yield conn async def fetch_gene_expressions( - self, - genes: Optional[List[str]] = None, - experiments: Optional[List[str]] = None, - sample_ids: Optional[List[str]] = None, - method: str = "raw", - page: int = 1, - page_size: int = 100, - paginate: bool = True, - ) -> Tuple[List[GeneExpression], int]: - """ - Fetch gene expressions based on genes, experiments, sample_ids, and method, with optional pagination. - Returns a tuple of (expressions list, total_records count). - """ - conn: asyncpg.Connection - async with self.connect() as conn: - # Query builder - base_query = """ + self, + genes: Optional[List[str]] = None, + experiments: Optional[List[str]] = None, + sample_ids: Optional[List[str]] = None, + method: str = "raw", + page: int = 1, + page_size: int = 100, + paginate: bool = True, + ) -> Tuple[List[GeneExpression], int]: + """ + Fetch gene expressions based on genes, experiments, sample_ids, and method, with optional pagination. + Returns a tuple of (expressions list, total_records count). + """ + conn: asyncpg.Connection + async with self.connect() as conn: + # Query builder + base_query = """ SELECT gene_code, sample_id, experiment_result_id, raw_count, tpm_count, tmm_count, getmm_count FROM gene_expressions """ - count_query = "SELECT COUNT(*) FROM gene_expressions" - conditions = [] - params = [] - param_counter = 1 - - if genes: - conditions.append(f"gene_code = ANY(${param_counter}::text[])") - params.append(genes) - param_counter += 1 - - if experiments: - conditions.append(f"experiment_result_id = ANY(${param_counter}::text[])") - params.append(experiments) - param_counter += 1 - - if sample_ids: - conditions.append(f"sample_id = ANY(${param_counter}::text[])") - params.append(sample_ids) - param_counter += 1 - - if method != "raw": - conditions.append(f"{method}_count IS NOT NULL") - - where_clause = " WHERE " + " AND ".join(conditions) if conditions else "" - - order_clause = " ORDER BY gene_code, sample_id" - - query = base_query + where_clause + order_clause - count_query += where_clause - - # Pagination - if paginate: - limit_offset_clause = f" LIMIT ${param_counter} OFFSET ${param_counter + 1}" - params.extend([page_size, (page - 1) * page_size]) - query += limit_offset_clause - - total_records_params = params[:-2] if paginate else params - total_records = await conn.fetchval(count_query, *total_records_params) - - res = await conn.fetch(query, *params) - - expressions = [ - GeneExpression( - gene_code=record["gene_code"], - sample_id=record["sample_id"], - experiment_result_id=record["experiment_result_id"], - raw_count=record["raw_count"], - tpm_count=record["tpm_count"], - tmm_count=record["tmm_count"], - getmm_count=record["getmm_count"], - ) - for record in res - ] + count_query = "SELECT COUNT(*) FROM gene_expressions" + conditions = [] + params = [] + param_counter = 1 + + if genes: + conditions.append(f"gene_code = ANY(${param_counter}::text[])") + params.append(genes) + param_counter += 1 + + if experiments: + conditions.append(f"experiment_result_id = ANY(${param_counter}::text[])") + params.append(experiments) + param_counter += 1 + + if sample_ids: + conditions.append(f"sample_id = ANY(${param_counter}::text[])") + params.append(sample_ids) + param_counter += 1 + + if method != "raw": + conditions.append(f"{method}_count IS NOT NULL") + + where_clause = " WHERE " + " AND ".join(conditions) if conditions else "" + + order_clause = " ORDER BY gene_code, sample_id" + + query = base_query + where_clause + order_clause + count_query += where_clause + + # Pagination + if paginate: + limit_offset_clause = f" LIMIT ${param_counter} OFFSET ${param_counter + 1}" + params.extend([page_size, (page - 1) * page_size]) + query += limit_offset_clause + + total_records_params = params[:-2] if paginate else params + total_records = await conn.fetchval(count_query, *total_records_params) + + res = await conn.fetch(query, *params) + + expressions = [ + GeneExpression( + gene_code=record["gene_code"], + sample_id=record["sample_id"], + experiment_result_id=record["experiment_result_id"], + raw_count=record["raw_count"], + tpm_count=record["tpm_count"], + tmm_count=record["tmm_count"], + getmm_count=record["getmm_count"], + ) + for record in res + ] + + return expressions, total_records - return expressions, total_records @lru_cache() def get_db(config: ConfigDependency, logger: LoggerDependency) -> Database: diff --git a/transcriptomics_data_service/routers/normalization.py b/transcriptomics_data_service/routers/normalization.py index d845ce7..439ed3a 100644 --- a/transcriptomics_data_service/routers/normalization.py +++ b/transcriptomics_data_service/routers/normalization.py @@ -93,9 +93,7 @@ async def _fetch_raw_counts(db, experiment_result_id: str) -> pd.DataFrame: Fetch raw counts from the database for the given experiment_result_id. Returns a DataFrame with genes as rows and samples as columns. """ - expressions, _ = await db.fetch_gene_expressions( - experiments=[experiment_result_id], method="raw", paginate=False - ) + expressions, _ = await db.fetch_gene_expressions(experiments=[experiment_result_id], method="raw", paginate=False) if not expressions: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Experiment result not found.") diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py index 10a9344..fc6752c 100644 --- a/transcriptomics_data_service/routers/query.py +++ b/transcriptomics_data_service/routers/query.py @@ -113,7 +113,7 @@ async def get_expressions_post( ): """ Retrieve gene expression data via POST request. - + Example JSON body: { "genes": ["gene1", "gene2"], diff --git a/transcriptomics_data_service/scripts/normalize.py b/transcriptomics_data_service/scripts/normalize.py index d211dbf..55f9799 100644 --- a/transcriptomics_data_service/scripts/normalize.py +++ b/transcriptomics_data_service/scripts/normalize.py @@ -2,12 +2,14 @@ import numpy as np from joblib import Parallel, delayed + def filter_counts(counts_df): """Filter out genes (rows) and samples (columns) with zero total counts.""" row_filter = counts_df.sum(axis=1) > 0 col_filter = counts_df.sum(axis=0) > 0 return counts_df.loc[row_filter, col_filter] + def prepare_counts_and_lengths(counts_df, gene_lengths, scale_length=None): """Align counts and gene_lengths, drop zeros, and optionally scale gene lengths.""" counts_df = counts_df.loc[gene_lengths.index] @@ -18,11 +20,13 @@ def prepare_counts_and_lengths(counts_df, gene_lengths, scale_length=None): gene_lengths = gene_lengths / scale_length return filter_counts(counts_df), gene_lengths + def parallel_apply(columns, func, n_jobs=-1): """Apply a function to each column in parallel and combine results.""" results = Parallel(n_jobs=n_jobs)(delayed(func)(col) for col in columns) return pd.concat(results, axis=1) + def trim_values(log_ratio, log_mean, w, logratio_trim, sum_trim): """Perform log ratio and sum trimming.""" n = len(log_ratio) @@ -43,6 +47,7 @@ def trim_values(log_ratio, log_mean, w, logratio_trim, sum_trim): return lr_t[final_idx], w_t[final_idx] + def compute_TMM_normalization_factors(counts_df, logratio_trim=0.3, sum_trim=0.05, weighting=True, n_jobs=-1): """Compute TMM normalization factors for counts data.""" lib_sizes = counts_df.sum(axis=0) @@ -53,7 +58,7 @@ def compute_TMM_normalization_factors(counts_df, logratio_trim=0.3, sum_trim=0.0 sample_names = counts_df.columns data_values = counts_df.values - norm_factors = pd.Series(index=sample_names, dtype='float64') + norm_factors = pd.Series(index=sample_names, dtype="float64") norm_factors[ref_sample] = 1.0 def compute_norm_factor(sample): @@ -81,7 +86,7 @@ def compute_norm_factor(sample): lr_final, w_final = trim_values(log_ratio, log_mean, w, logratio_trim, sum_trim) mean_M = np.sum(w_final * lr_final) / np.sum(w_final) - norm_factor = 2 ** mean_M + norm_factor = 2**mean_M return sample, norm_factor samples = [s for s in sample_names if s != ref_sample] @@ -93,6 +98,7 @@ def compute_norm_factor(sample): norm_factors = norm_factors / np.exp(np.mean(np.log(norm_factors))) return norm_factors + def tmm_normalization(counts_df, logratio_trim=0.3, sum_trim=0.05, weighting=True, n_jobs=-1): """Perform TMM normalization on counts data.""" counts_df = filter_counts(counts_df) @@ -101,21 +107,26 @@ def tmm_normalization(counts_df, logratio_trim=0.3, sum_trim=0.05, weighting=Tru normalized_data = counts_df.div(lib_sizes, axis=1).div(norm_factors, axis=1) * lib_sizes.mean() return normalized_data + def getmm_normalization(counts_df, gene_lengths, logratio_trim=0.3, sum_trim=0.05, weighting=True, n_jobs=-1): """Perform GeTMM normalization on counts data.""" counts_df, gene_lengths = prepare_counts_and_lengths(counts_df, gene_lengths) rpk = counts_df.mul(1e3).div(gene_lengths, axis=0) return tmm_normalization(rpk, logratio_trim, sum_trim, weighting, n_jobs) + def compute_rpk(counts_df, gene_lengths_scaled, n_jobs=-1): """Compute RPK values in parallel.""" columns = counts_df.columns + def rpk_col(col): return counts_df[col] / gene_lengths_scaled + rpk = parallel_apply(columns, rpk_col, n_jobs) rpk.columns = columns return rpk + def tpm_normalization(counts_df, gene_lengths, scale_library=1e6, scale_length=1e3, n_jobs=-1): """Convert raw read counts to TPM in parallel.""" counts_df, gene_lengths_scaled = prepare_counts_and_lengths(counts_df, gene_lengths, scale_length=scale_length) From 0a45a5e07d4a240d4dc74f1c9c9cb7d8e5600933 Mon Sep 17 00:00:00 2001 From: Julian Date: Wed, 18 Dec 2024 13:57:58 -0500 Subject: [PATCH 05/18] rename endpoint --- transcriptomics_data_service/routers/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py index fc6752c..e9966b5 100644 --- a/transcriptomics_data_service/routers/query.py +++ b/transcriptomics_data_service/routers/query.py @@ -63,7 +63,7 @@ async def get_expressions_handler( @query_router.get( - "/expressions_all", + "/expressions", status_code=status.HTTP_200_OK, response_model=GeneExpressionResponse, ) From d7b3cfd26a21529ce3f416a21da3347e5fa2d3fd Mon Sep 17 00:00:00 2001 From: Julian Date: Wed, 18 Dec 2024 14:11:00 -0500 Subject: [PATCH 06/18] remove get method for expressions --- transcriptomics_data_service/routers/query.py | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py index e9966b5..069baaa 100644 --- a/transcriptomics_data_service/routers/query.py +++ b/transcriptomics_data_service/routers/query.py @@ -62,45 +62,6 @@ async def get_expressions_handler( return GeneExpressionResponse(expressions=response_data, pagination=pagination_meta) -@query_router.get( - "/expressions", - status_code=status.HTTP_200_OK, - response_model=GeneExpressionResponse, -) -async def get_expressions_all( - db: DatabaseDependency, - logger: LoggerDependency, - method: MethodEnum = Query( - MethodEnum.raw, - description="Data method to retrieve: 'raw', 'tpm', 'tmm', 'getmm'", - ), - page: int = Query( - 1, - ge=1, - description="Page number for pagination (must be greater than or equal to 1)", - ), - page_size: int = Query( - 100, - ge=1, - le=1000, - description="Number of records per page (between 1 and 1000)", - ), -): - """ - Retrieve all gene expression data via GET request with pagination. - """ - params = QueryParameters( - genes=None, - experiments=None, - sample_ids=None, - method=method, - page=page, - page_size=page_size, - ) - - return await get_expressions_handler(params, db, logger) - - @query_router.post( "/expressions", status_code=status.HTTP_200_OK, From dea2fe4d62fc2c0304c1f58c30dffa47993581ef Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 20 Dec 2024 14:21:00 -0500 Subject: [PATCH 07/18] rm prefix --- transcriptomics_data_service/routers/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py index 069baaa..2c05383 100644 --- a/transcriptomics_data_service/routers/query.py +++ b/transcriptomics_data_service/routers/query.py @@ -10,7 +10,7 @@ QueryParameters, ) -query_router = APIRouter(prefix="/query") +query_router = APIRouter() async def get_expressions_handler( From 3b0128961fbf940365b3c9e1ba561c46310e7731 Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 20 Dec 2024 15:08:09 -0500 Subject: [PATCH 08/18] refactor pagination --- transcriptomics_data_service/models.py | 41 ++++++++++--------- transcriptomics_data_service/routers/query.py | 7 ++-- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/transcriptomics_data_service/models.py b/transcriptomics_data_service/models.py index d7fb431..bebfe07 100644 --- a/transcriptomics_data_service/models.py +++ b/transcriptomics_data_service/models.py @@ -13,6 +13,25 @@ ] +class PaginatedRequest(BaseModel): + page: int = Field( + 1, + ge=1, + description="Current page number" + ) + page_size: int = Field( + 100, + ge=1, + le=1000, + description="Number of records per page" + ) + + +class PaginatedResponse(PaginatedRequest): + total_records: int = Field(..., ge=0, description="Total number of records") + total_pages: int = Field(..., ge=1, description="Total number of pages") + + class ExperimentResult(BaseModel): experiment_result_id: str = Field(..., min_length=1, max_length=255) assembly_id: Optional[str] = Field(None, max_length=255) @@ -37,16 +56,9 @@ class GeneExpressionData(BaseModel): method: str = Field(..., description="Method used to calculate the expression count") -class PaginationMeta(BaseModel): - total_records: int = Field(..., ge=0, description="Total number of records") - page: int = Field(..., ge=1, description="Current page number") - page_size: int = Field(..., ge=1, le=1000, description="Number of records per page") - total_pages: int = Field(..., ge=1, description="Total number of pages") - -class GeneExpressionResponse(BaseModel): +class GeneExpressionResponse(PaginatedResponse): expressions: List[GeneExpressionData] - pagination: PaginationMeta class MethodEnum(str, Enum): @@ -56,22 +68,11 @@ class MethodEnum(str, Enum): getmm = "getmm" -class QueryParameters(BaseModel): +class QueryParameters(PaginatedRequest): genes: Optional[List[str]] = Field(None, description="List of gene codes to retrieve") experiments: Optional[List[str]] = Field(None, description="List of experiment result IDs to retrieve data from") sample_ids: Optional[List[str]] = Field(None, description="List of sample IDs to retrieve data from") method: MethodEnum = Field(MethodEnum.raw, description="Data method to retrieve: 'raw', 'tpm', 'tmm', 'getmm'") - page: int = Field( - 1, - ge=1, - description="Page number for pagination (must be greater than or equal to 1)", - ) - page_size: int = Field( - 100, - ge=1, - le=1000, - description="Number of records per page (between 1 and 1000)", - ) @validator("genes", "experiments", "sample_ids", each_item=True) def validate_identifiers(cls, value): diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py index 2c05383..8700b23 100644 --- a/transcriptomics_data_service/routers/query.py +++ b/transcriptomics_data_service/routers/query.py @@ -5,7 +5,6 @@ from transcriptomics_data_service.models import ( GeneExpressionData, GeneExpressionResponse, - PaginationMeta, MethodEnum, QueryParameters, ) @@ -52,15 +51,15 @@ async def get_expressions_handler( response_data.append(response_item) total_pages = (total_records + params.page_size - 1) // params.page_size - pagination_meta = PaginationMeta( + + return GeneExpressionResponse( + expressions=response_data, total_records=total_records, page=params.page, page_size=params.page_size, total_pages=total_pages, ) - return GeneExpressionResponse(expressions=response_data, pagination=pagination_meta) - @query_router.post( "/expressions", From afe2e0d6d228bf2fe81c4c38bafdf86029847fc5 Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 20 Dec 2024 15:09:10 -0500 Subject: [PATCH 09/18] lint --- transcriptomics_data_service/models.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/transcriptomics_data_service/models.py b/transcriptomics_data_service/models.py index bebfe07..3b031ed 100644 --- a/transcriptomics_data_service/models.py +++ b/transcriptomics_data_service/models.py @@ -14,17 +14,8 @@ class PaginatedRequest(BaseModel): - page: int = Field( - 1, - ge=1, - description="Current page number" - ) - page_size: int = Field( - 100, - ge=1, - le=1000, - description="Number of records per page" - ) + page: int = Field(1, ge=1, description="Current page number") + page_size: int = Field(100, ge=1, le=1000, description="Number of records per page") class PaginatedResponse(PaginatedRequest): @@ -56,7 +47,6 @@ class GeneExpressionData(BaseModel): method: str = Field(..., description="Method used to calculate the expression count") - class GeneExpressionResponse(PaginatedResponse): expressions: List[GeneExpressionData] From 44d6dee192ba0f3bd3dc102bb78b3623b6e7a3b4 Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 20 Dec 2024 15:13:38 -0500 Subject: [PATCH 10/18] rm line --- transcriptomics_data_service/routers/query.py | 1 - 1 file changed, 1 deletion(-) diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py index 8700b23..4a40497 100644 --- a/transcriptomics_data_service/routers/query.py +++ b/transcriptomics_data_service/routers/query.py @@ -11,7 +11,6 @@ query_router = APIRouter() - async def get_expressions_handler( params: QueryParameters, db: DatabaseDependency, From 58aaa3372c1b9103abfb6f27008570cf322b428b Mon Sep 17 00:00:00 2001 From: Julian Date: Fri, 20 Dec 2024 15:16:45 -0500 Subject: [PATCH 11/18] lint --- transcriptomics_data_service/routers/query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py index 4a40497..8700b23 100644 --- a/transcriptomics_data_service/routers/query.py +++ b/transcriptomics_data_service/routers/query.py @@ -11,6 +11,7 @@ query_router = APIRouter() + async def get_expressions_handler( params: QueryParameters, db: DatabaseDependency, From e02e1f40dcf3d5334f1d56e21a42976a161db5cb Mon Sep 17 00:00:00 2001 From: v-rocheleau Date: Fri, 20 Dec 2024 16:32:45 -0500 Subject: [PATCH 12/18] lint --- transcriptomics_data_service/scripts/normalize.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transcriptomics_data_service/scripts/normalize.py b/transcriptomics_data_service/scripts/normalize.py index 9bcd402..dd7f169 100644 --- a/transcriptomics_data_service/scripts/normalize.py +++ b/transcriptomics_data_service/scripts/normalize.py @@ -108,7 +108,9 @@ def tmm_normalization(counts_df, logratio_trim=0.3, sum_trim=0.05, weighting=Tru return normalized_data -def getmm_normalization(counts_df, gene_lengths, logratio_trim=0.3, sum_trim=0.05, scaling_factor=1e3, weighting=True, n_jobs=-1): +def getmm_normalization( + counts_df, gene_lengths, logratio_trim=0.3, sum_trim=0.05, scaling_factor=1e3, weighting=True, n_jobs=-1 +): """Perform GeTMM normalization on counts data.""" counts_df, gene_lengths = prepare_counts_and_lengths(counts_df, gene_lengths) rpk = counts_df.mul(scaling_factor).div(gene_lengths, axis=0) From 5afce03e4185fdb9b33b934cf3afe6b6be3cb779 Mon Sep 17 00:00:00 2001 From: Julian Date: Mon, 30 Dec 2024 14:51:38 -0500 Subject: [PATCH 13/18] rf GeneExpressionData --- transcriptomics_data_service/models.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/transcriptomics_data_service/models.py b/transcriptomics_data_service/models.py index 3b031ed..7e2a79b 100644 --- a/transcriptomics_data_service/models.py +++ b/transcriptomics_data_service/models.py @@ -12,6 +12,11 @@ "QueryParameters", ] +class MethodEnum(str, Enum): + raw = "raw" + tpm = "tpm" + tmm = "tmm" + getmm = "getmm" class PaginatedRequest(BaseModel): page: int = Field(1, ge=1, description="Current page number") @@ -44,20 +49,13 @@ class GeneExpressionData(BaseModel): sample_id: str = Field(..., min_length=1, max_length=255, description="Sample ID") experiment_result_id: str = Field(..., min_length=1, max_length=255, description="Experiment result ID") count: float = Field(..., description="Expression count") - method: str = Field(..., description="Method used to calculate the expression count") + method: MethodEnum = Field(..., description="Method used to calculate the expression count") class GeneExpressionResponse(PaginatedResponse): expressions: List[GeneExpressionData] -class MethodEnum(str, Enum): - raw = "raw" - tpm = "tpm" - tmm = "tmm" - getmm = "getmm" - - class QueryParameters(PaginatedRequest): genes: Optional[List[str]] = Field(None, description="List of gene codes to retrieve") experiments: Optional[List[str]] = Field(None, description="List of experiment result IDs to retrieve data from") From e23cddc5e4a04d88fd36259c8119a1fd58df3b8a Mon Sep 17 00:00:00 2001 From: Julian Date: Mon, 30 Dec 2024 14:54:51 -0500 Subject: [PATCH 14/18] rename QueryParameters --- transcriptomics_data_service/models.py | 4 ++-- transcriptomics_data_service/routers/query.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/transcriptomics_data_service/models.py b/transcriptomics_data_service/models.py index 7e2a79b..e3cef80 100644 --- a/transcriptomics_data_service/models.py +++ b/transcriptomics_data_service/models.py @@ -9,7 +9,7 @@ "PaginationMeta", "GeneExpressionResponse", "MethodEnum", - "QueryParameters", + "ExpressionQueryBody", ] class MethodEnum(str, Enum): @@ -56,7 +56,7 @@ class GeneExpressionResponse(PaginatedResponse): expressions: List[GeneExpressionData] -class QueryParameters(PaginatedRequest): +class ExpressionQueryBody(PaginatedRequest): genes: Optional[List[str]] = Field(None, description="List of gene codes to retrieve") experiments: Optional[List[str]] = Field(None, description="List of experiment result IDs to retrieve data from") sample_ids: Optional[List[str]] = Field(None, description="List of sample IDs to retrieve data from") diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py index 8700b23..65e1466 100644 --- a/transcriptomics_data_service/routers/query.py +++ b/transcriptomics_data_service/routers/query.py @@ -6,14 +6,14 @@ GeneExpressionData, GeneExpressionResponse, MethodEnum, - QueryParameters, + ExpressionQueryBody, ) query_router = APIRouter() async def get_expressions_handler( - params: QueryParameters, + params: ExpressionQueryBody, db: DatabaseDependency, logger: LoggerDependency, ): @@ -67,7 +67,7 @@ async def get_expressions_handler( response_model=GeneExpressionResponse, ) async def get_expressions_post( - params: QueryParameters, + params: ExpressionQueryBody, db: DatabaseDependency, logger: LoggerDependency, ): From aa6bc94f015cb06b4360aa677b5272ef1cda6f27 Mon Sep 17 00:00:00 2001 From: Julian Date: Mon, 30 Dec 2024 15:10:22 -0500 Subject: [PATCH 15/18] lint --- transcriptomics_data_service/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transcriptomics_data_service/models.py b/transcriptomics_data_service/models.py index e3cef80..9b80760 100644 --- a/transcriptomics_data_service/models.py +++ b/transcriptomics_data_service/models.py @@ -12,12 +12,14 @@ "ExpressionQueryBody", ] + class MethodEnum(str, Enum): raw = "raw" tpm = "tpm" tmm = "tmm" getmm = "getmm" + class PaginatedRequest(BaseModel): page: int = Field(1, ge=1, description="Current page number") page_size: int = Field(100, ge=1, le=1000, description="Number of records per page") From 4cf215306c4fc9d63cd00ba494e84e9d41581b20 Mon Sep 17 00:00:00 2001 From: v-rocheleau Date: Tue, 7 Jan 2025 15:57:00 -0500 Subject: [PATCH 16/18] fix: expression response method field --- docker-compose.dev.yaml | 2 +- transcriptomics_data_service/main.py | 4 +- transcriptomics_data_service/models.py | 10 +-- .../routers/expressions.py | 89 +++++++++++++++++-- transcriptomics_data_service/routers/query.py | 87 ------------------ 5 files changed, 87 insertions(+), 105 deletions(-) delete mode 100644 transcriptomics_data_service/routers/query.py diff --git a/docker-compose.dev.yaml b/docker-compose.dev.yaml index fe3e103..f34ab7b 100644 --- a/docker-compose.dev.yaml +++ b/docker-compose.dev.yaml @@ -9,7 +9,7 @@ services: depends_on: - tds-db environment: - - BENTO_UID=1001 + - BENTO_UID=${UID} - DATABASE_URI=postgres://tds_user:tds_password@tds-db:5432/tds_db - CORS_ORIGINS="*" - BENTO_AUTHZ_SERVICE_URL="" diff --git a/transcriptomics_data_service/main.py b/transcriptomics_data_service/main.py index 783462b..db9a047 100644 --- a/transcriptomics_data_service/main.py +++ b/transcriptomics_data_service/main.py @@ -6,7 +6,7 @@ from transcriptomics_data_service.routers.experiment_results import experiment_router from transcriptomics_data_service.routers.ingest import ingest_router from transcriptomics_data_service.routers.normalization import normalization_router -from transcriptomics_data_service.routers.query import query_router +from transcriptomics_data_service.routers.expressions import expressions_router from . import __version__ from .config import get_config from .constants import BENTO_SERVICE_KIND, SERVICE_TYPE @@ -45,4 +45,4 @@ async def lifespan(_app: FastAPI): app.include_router(ingest_router) app.include_router(experiment_router) app.include_router(normalization_router) -app.include_router(query_router) +app.include_router(expressions_router) diff --git a/transcriptomics_data_service/models.py b/transcriptomics_data_service/models.py index 9b80760..f074b07 100644 --- a/transcriptomics_data_service/models.py +++ b/transcriptomics_data_service/models.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, Field, validator +from pydantic import BaseModel, Field, validator, field_validator from typing import List, Optional from enum import Enum @@ -63,11 +63,3 @@ class ExpressionQueryBody(PaginatedRequest): experiments: Optional[List[str]] = Field(None, description="List of experiment result IDs to retrieve data from") sample_ids: Optional[List[str]] = Field(None, description="List of sample IDs to retrieve data from") method: MethodEnum = Field(MethodEnum.raw, description="Data method to retrieve: 'raw', 'tpm', 'tmm', 'getmm'") - - @validator("genes", "experiments", "sample_ids", each_item=True) - def validate_identifiers(cls, value): - if not (1 <= len(value) <= 255): - raise ValueError("Each identifier must be between 1 and 255 characters long.") - if not value.replace("_", "").isalnum(): - raise ValueError("Identifiers must contain only alphanumeric characters and underscores.") - return value diff --git a/transcriptomics_data_service/routers/expressions.py b/transcriptomics_data_service/routers/expressions.py index b2d5feb..a46b8eb 100644 --- a/transcriptomics_data_service/routers/expressions.py +++ b/transcriptomics_data_service/routers/expressions.py @@ -1,12 +1,89 @@ -from fastapi import APIRouter, status +from fastapi import APIRouter, HTTPException, status, Query from transcriptomics_data_service.db import DatabaseDependency +from transcriptomics_data_service.logger import LoggerDependency +from transcriptomics_data_service.models import ( + GeneExpressionData, + GeneExpressionResponse, + MethodEnum, + ExpressionQueryBody, +) -__all__ = ["expression_router"] +expressions_router = APIRouter(prefix="/expressions") -expression_router = APIRouter(prefix="/expressions") +async def get_expressions_handler( + params: ExpressionQueryBody, + db: DatabaseDependency, + logger: LoggerDependency, +): + """ + Handler for fetching and returning gene expression data. + """ + logger.info(f"Received query parameters: {params}") -@expression_router.get("", status_code=status.HTTP_200_OK) -async def expressions_list(db: DatabaseDependency): - return await db.fetch_expressions() + expressions, total_records = await db.fetch_gene_expressions( + genes=params.genes, + experiments=params.experiments, + sample_ids=params.sample_ids, + method=params.method.value, + page=params.page, + page_size=params.page_size, + ) + + if not expressions: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="No gene expression data found for the given parameters.", + ) + + response_data = [] + method = params.method.value + method_count_field = f"{params.method.value}_count" if params.method != MethodEnum.raw else "raw_count" + for expr in expressions: + count = getattr(expr, method_count_field) + response_item = GeneExpressionData( + gene_code=expr.gene_code, + sample_id=expr.sample_id, + experiment_result_id=expr.experiment_result_id, + count=count, + method=method, + ) + response_data.append(response_item) + + total_pages = (total_records + params.page_size - 1) // params.page_size + + return GeneExpressionResponse( + expressions=response_data, + total_records=total_records, + page=params.page, + page_size=params.page_size, + total_pages=total_pages, + ) + + +@expressions_router.post( + "", + status_code=status.HTTP_200_OK, + response_model=GeneExpressionResponse, +) +async def get_expressions_post( + params: ExpressionQueryBody, + db: DatabaseDependency, + logger: LoggerDependency, +): + """ + Retrieve gene expression data via POST request. + Using POST instead of GET in order to add a body of type ExpressionQueryBody + + Example JSON body: + { + "genes": ["gene1", "gene2"], + "experiments": ["exp1"], + "sample_ids": ["sample1"], + "method": "tmm", + "page": 1, + "page_size": 100 + } + """ + return await get_expressions_handler(params, db, logger) diff --git a/transcriptomics_data_service/routers/query.py b/transcriptomics_data_service/routers/query.py deleted file mode 100644 index 65e1466..0000000 --- a/transcriptomics_data_service/routers/query.py +++ /dev/null @@ -1,87 +0,0 @@ -from fastapi import APIRouter, HTTPException, status, Query - -from transcriptomics_data_service.db import DatabaseDependency -from transcriptomics_data_service.logger import LoggerDependency -from transcriptomics_data_service.models import ( - GeneExpressionData, - GeneExpressionResponse, - MethodEnum, - ExpressionQueryBody, -) - -query_router = APIRouter() - - -async def get_expressions_handler( - params: ExpressionQueryBody, - db: DatabaseDependency, - logger: LoggerDependency, -): - """ - Handler for fetching and returning gene expression data. - """ - logger.info(f"Received query parameters: {params}") - - expressions, total_records = await db.fetch_gene_expressions( - genes=params.genes, - experiments=params.experiments, - sample_ids=params.sample_ids, - method=params.method.value, - page=params.page, - page_size=params.page_size, - ) - - if not expressions: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail="No gene expression data found for the given parameters.", - ) - - response_data = [] - method_field = f"{params.method.value}_count" if params.method != MethodEnum.raw else "raw_count" - for expr in expressions: - count = getattr(expr, method_field) - response_item = GeneExpressionData( - gene_code=expr.gene_code, - sample_id=expr.sample_id, - experiment_result_id=expr.experiment_result_id, - count=count, - method=method_field, - ) - response_data.append(response_item) - - total_pages = (total_records + params.page_size - 1) // params.page_size - - return GeneExpressionResponse( - expressions=response_data, - total_records=total_records, - page=params.page, - page_size=params.page_size, - total_pages=total_pages, - ) - - -@query_router.post( - "/expressions", - status_code=status.HTTP_200_OK, - response_model=GeneExpressionResponse, -) -async def get_expressions_post( - params: ExpressionQueryBody, - db: DatabaseDependency, - logger: LoggerDependency, -): - """ - Retrieve gene expression data via POST request. - - Example JSON body: - { - "genes": ["gene1", "gene2"], - "experiments": ["exp1"], - "sample_ids": ["sample1"], - "method": "tmm", - "page": 1, - "page_size": 100 - } - """ - return await get_expressions_handler(params, db, logger) From daa1754e8e1b48cf4f22d718b0a84ce03f205e5f Mon Sep 17 00:00:00 2001 From: v-rocheleau Date: Tue, 7 Jan 2025 16:38:17 -0500 Subject: [PATCH 17/18] refact: factor out repetitive method values in expressions queries --- transcriptomics_data_service/models.py | 10 ++--- .../routers/expressions.py | 43 ++++++++++--------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/transcriptomics_data_service/models.py b/transcriptomics_data_service/models.py index f074b07..51f8e5c 100644 --- a/transcriptomics_data_service/models.py +++ b/transcriptomics_data_service/models.py @@ -51,11 +51,6 @@ class GeneExpressionData(BaseModel): sample_id: str = Field(..., min_length=1, max_length=255, description="Sample ID") experiment_result_id: str = Field(..., min_length=1, max_length=255, description="Experiment result ID") count: float = Field(..., description="Expression count") - method: MethodEnum = Field(..., description="Method used to calculate the expression count") - - -class GeneExpressionResponse(PaginatedResponse): - expressions: List[GeneExpressionData] class ExpressionQueryBody(PaginatedRequest): @@ -63,3 +58,8 @@ class ExpressionQueryBody(PaginatedRequest): experiments: Optional[List[str]] = Field(None, description="List of experiment result IDs to retrieve data from") sample_ids: Optional[List[str]] = Field(None, description="List of sample IDs to retrieve data from") method: MethodEnum = Field(MethodEnum.raw, description="Data method to retrieve: 'raw', 'tpm', 'tmm', 'getmm'") + + +class GeneExpressionResponse(PaginatedResponse): + query: ExpressionQueryBody = Field(..., description="The query that produced this response") + expressions: List[GeneExpressionData] = Field(..., description="List of gene expressions") diff --git a/transcriptomics_data_service/routers/expressions.py b/transcriptomics_data_service/routers/expressions.py index a46b8eb..8a3e7e1 100644 --- a/transcriptomics_data_service/routers/expressions.py +++ b/transcriptomics_data_service/routers/expressions.py @@ -13,22 +13,22 @@ async def get_expressions_handler( - params: ExpressionQueryBody, + query_body: ExpressionQueryBody, db: DatabaseDependency, logger: LoggerDependency, ): """ Handler for fetching and returning gene expression data. """ - logger.info(f"Received query parameters: {params}") + logger.info(f"Received query parameters: {query_body}") expressions, total_records = await db.fetch_gene_expressions( - genes=params.genes, - experiments=params.experiments, - sample_ids=params.sample_ids, - method=params.method.value, - page=params.page, - page_size=params.page_size, + genes=query_body.genes, + experiments=query_body.experiments, + sample_ids=query_body.sample_ids, + method=query_body.method.value, + page=query_body.page, + page_size=query_body.page_size, ) if not expressions: @@ -37,28 +37,29 @@ async def get_expressions_handler( detail="No gene expression data found for the given parameters.", ) - response_data = [] - method = params.method.value - method_count_field = f"{params.method.value}_count" if params.method != MethodEnum.raw else "raw_count" - for expr in expressions: - count = getattr(expr, method_count_field) - response_item = GeneExpressionData( + method = query_body.method.value + method_count_field = f"{method}_count" + response_data = [ + GeneExpressionData( gene_code=expr.gene_code, sample_id=expr.sample_id, experiment_result_id=expr.experiment_result_id, - count=count, - method=method, + count=getattr(expr, method_count_field), ) - response_data.append(response_item) + for expr in expressions + ] - total_pages = (total_records + params.page_size - 1) // params.page_size + total_pages = (total_records + query_body.page_size - 1) // query_body.page_size return GeneExpressionResponse( - expressions=response_data, + # pagination + page=query_body.page, + page_size=query_body.page_size, total_records=total_records, - page=params.page, - page_size=params.page_size, total_pages=total_pages, + # data + expressions=response_data, + query=query_body ) From b154ffe6b27e6b92e794d7123be05dd3ac00a751 Mon Sep 17 00:00:00 2001 From: v-rocheleau Date: Tue, 7 Jan 2025 16:40:19 -0500 Subject: [PATCH 18/18] lint --- transcriptomics_data_service/routers/expressions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transcriptomics_data_service/routers/expressions.py b/transcriptomics_data_service/routers/expressions.py index 8a3e7e1..2942be5 100644 --- a/transcriptomics_data_service/routers/expressions.py +++ b/transcriptomics_data_service/routers/expressions.py @@ -5,7 +5,6 @@ from transcriptomics_data_service.models import ( GeneExpressionData, GeneExpressionResponse, - MethodEnum, ExpressionQueryBody, ) @@ -59,7 +58,7 @@ async def get_expressions_handler( total_pages=total_pages, # data expressions=response_data, - query=query_body + query=query_body, )