diff --git a/src/modules/site-v2/base/utils/trait.py b/src/modules/site-v2/base/utils/trait.py
new file mode 100644
index 00000000..1375c773
--- /dev/null
+++ b/src/modules/site-v2/base/utils/trait.py
@@ -0,0 +1,214 @@
+import bleach
+import csv
+
+from caendr.models.error import FileUploadError
+from caendr.models.datastore import TraitFile, Species, DatasetType
+from caendr.models.status import PublishStatus
+from caendr.models.sql import PhenotypeMetadata, PhenotypeDatabase
+from caendr.services.cloud.storage import upload_blob_from_file_object, check_blob_exists, get_blob_if_exists
+from caendr.services.cloud.postgresql import rollback_on_error_handler
+from caendr.services.logger import logger
+from caendr.services.validate import validate_file, StrainValidator, NumberValidator
+from caendr.services.cloud.datastore import delete_ds_entity_by_ref
+from caendr.api.phenotype import get_trait
+from caendr.utils.data import unique_id
+from caendr.utils.env import get_env_var
+from caendr.utils.local_files import LocalUploadFile
+from constants import TOOL_INPUT_DATA_VALID_FILE_EXTENSIONS
+
+MODULE_DB_OPERATIONS_BUCKET_NAME = get_env_var('MODULE_DB_OPERATIONS_BUCKET_NAME')
+MODULE_DB_OPERATIONS_TRAITFILE_PUBLIC_FILEPATH = get_env_var('MODULE_DB_OPERATIONS_TRAITFILE_PUBLIC_FILEPATH')
+
+
+def add_trait(form_data, user):
+    """
+    Add a trait to the database by performing the following operations:
+      1. Create a new TraitFile object with the user submitted data and save it to Datastore.
+      2. Seed the trait data to Phenotype Metadata SQL table.
+      3. Save the file to GCP bucket.
+      4. Parse and seed the file data to Phenotype Database SQL table.
+    On the failure of any of the above operations rolls back to the initial state and returns an error message.
+
+    Args:
+      form_data: The submitted trait form; every field is read through its `.data` attribute.
+      user: The submitting user; set as the owner of the TraitFile, and `user.name` is used in the blob path.
+
+    Returns:
+      A `(body, status_code)` tuple, where `body` is a dict holding a user-facing `message`.
+    """
+    try:
+        # Create a new TraitFile object
+        tf = TraitFile(unique_id())
+
+        # Create a unique filename for file upload
+        filename_hash = f'{unique_id()}.tsv'
+        tf.set_properties(**{
+            # User submitted data
+            'trait_name_user': bleach.clean(form_data.trait_name_user.data),
+            'trait_name_display_1': bleach.clean(form_data.trait_name_display_1.data),
+            'trait_name_display_2': bleach.clean(form_data.trait_name_display_2.data),
+            'trait_name_display_3': bleach.clean(form_data.trait_name_display_3.data),
+            'filename': bleach.clean(form_data.file.data.filename),
+            'species': bleach.clean(form_data.species.data),
+            'description_short': bleach.clean(form_data.description_short.data),
+            'description_long': bleach.clean(form_data.description_long.data),
+            'units': bleach.clean(form_data.units.data),
+            'tags': [ bleach.clean(tag) for tag in form_data.tags.data ],
+            'institution': bleach.clean(form_data.institution.data),
+            'source_lab': bleach.clean(form_data.source_lab.data),
+            'protocols': bleach.clean(form_data.protocols.data),
+            'publication': bleach.clean(form_data.publication.data),
+
+            # Internally used data
+            'dataset': DatasetType.PUBLIC,
+            'publish_status': PublishStatus.UPLOADED,
+            'is_bulk_file': False,
+            'filename_hash': filename_hash,
+        })
+
+        tf.set_user(user)
+
+        # Save the TraitFile object to Datastore
+        tf.save()
+    except Exception as ex:
+        logger.error(f'Failed to create a trait file {form_data.trait_name_user.data}: {ex}')
+        return {'message': 'Failed to submit a form. Please try again later.'}, 500
+
+    try:
+        # Seed to Phenotype Metadata SQL table
+        with rollback_on_error_handler():
+            new_trait = PhenotypeMetadata()
+            new_trait.add(tf)
+    except Exception as ex:
+        rollback_submission_on_error(tf.name)
+        logger.error(f'Failed to seed the trait to the database: {ex}')
+        return {'message': 'Failed to submit a form. Please try again later.'}, 500
+
+    try:
+        # Work out where the file goes in the GCP bucket, and whether that name is already taken
+        species_name = Species.get(form_data.species.data).name
+        blob_name = f'{MODULE_DB_OPERATIONS_TRAITFILE_PUBLIC_FILEPATH}/{species_name}/{user.name}/{filename_hash}'
+        blob_exists = check_blob_exists(MODULE_DB_OPERATIONS_BUCKET_NAME, blob_name)
+    except Exception as ex:
+        rollback_submission_on_error(tf.name)
+        logger.error(f'Failed to check the upload location for {form_data.file.data.filename}: {ex}')
+        return {'message': 'Failed to submit a form. Please try again later.'}, 500
+
+    if blob_exists:
+        # Undo the records created above. The blob name is deliberately not passed: that blob was not created here.
+        rollback_submission_on_error(tf.name)
+        return {'message': 'File already exists.'}, 400
+
+    try:
+        # Save file to GCP bucket
+        upload_blob_from_file_object(MODULE_DB_OPERATIONS_BUCKET_NAME, form_data.file.data, blob_name)
+
+        # Reset the file pointer
+        form_data.file.data.seek(0)
+    except Exception as ex:
+        rollback_submission_on_error(tf.name, blob_name)
+        logger.error(f'Failed to upload a file {form_data.file.data.filename}: {ex}')
+        return {'message': 'Failed to submit a form. Please try again later.'}, 500
+
+    try:
+        # Seed the file data to Phenotype Database SQL table
+        with LocalUploadFile(form_data.file.data, valid_file_extensions=TOOL_INPUT_DATA_VALID_FILE_EXTENSIONS) as file:
+            # Validate the file
+            try:
+                validate_file(file, [
+                    StrainValidator( 'strain', species=tf['species'], force_unique=True, force_unique_msgs={} ),
+                    NumberValidator( None, accept_float=True, accept_na=True ),
+                ])
+            except Exception as ex:
+                # Not every exception defines `msg`, so fall back to the exception's string form
+                error_msg = getattr(ex, 'msg', str(ex))
+                rollback_submission_on_error(tf.name, blob_name)
+                logger.error(f'Failed to validate the file: {error_msg}')
+                return {'message': f'Failed to validate the file: {error_msg}'}, 400
+
+            # Parse the trait file, skipping the header row and any blank lines (csv.reader yields [] for those)
+            with open(file) as f:
+                reader = csv.reader(f, delimiter='\t')
+                next(reader, None)
+                trait_data_list = [
+                    {
+                        'trait_name': tf['trait_name_user'],
+                        'strain_name': row[0],
+                        'trait_value': row[1],
+                        'metadata_id': tf.name
+                    }
+                    for row in reader if row
+                ]
+
+            try:
+                with rollback_on_error_handler():
+                    trait_data = PhenotypeDatabase()
+                    trait_data.add_trait_data(trait_data_list)
+            except Exception as ex:
+                rollback_submission_on_error(tf.name, blob_name)
+                logger.error(f'Failed to seed the file data to the database: {ex}')
+                # Report the failure instead of falling through to the success response
+                return {'message': 'Failed to submit a form. Please try again later.'}, 500
+
+    except FileUploadError as ex:
+        rollback_submission_on_error(tf.name, blob_name)
+        logger.error(f'Failed to upload a file {form_data.file.data.filename}: {ex}')
+        return {'message': 'Failed to submit a form. Please try again later.'}, 500
+
+    except Exception as ex:
+        rollback_submission_on_error(tf.name, blob_name)
+        logger.error(f'Failed to upload the file data to the database: {ex}')
+        return {'message': 'Failed to submit a form. Please try again later.'}, 500
+
+    return {'message': 'Trait submitted successfully.'}, 200
+
+
+def _attempt_rollback_step(description, action):
+    """ Run one rollback step, logging instead of raising on failure so that the remaining steps still run. """
+    try:
+        action()
+    except Exception as ex:
+        logger.error(f'Rollback step failed ({description}): {ex}')
+
+
+def rollback_submission_on_error(trait_id, blob_name=None):
+    """
+    Rollback the trait submission on error by performing the following operations:
+      1. Delete the file data from Phenotype Database SQL table.
+      2. Delete the trait from Phenotype Metadata SQL table.
+      3. Delete the file from GCP bucket (only if `blob_name` is given).
+      4. Delete the TraitFile object from Datastore.
+    Each step is attempted independently: a failure is logged and the remaining steps still run.
+    """
+    try:
+        tf = TraitFile.get_ds(trait_id)
+    except Exception as ex:
+        logger.error(f'Failed to retrieve the trait file {trait_id}: {ex}')
+        return
+    if tf is None:
+        logger.error(f'Failed to retrieve the trait file {trait_id}')
+        return
+
+    def delete_metadata():
+        trait = get_trait(tf.name)
+        if trait:
+            trait.delete()
+
+    def delete_blob():
+        blob = get_blob_if_exists(MODULE_DB_OPERATIONS_BUCKET_NAME, blob_name)
+        if blob:
+            blob.delete()
+
+    # Delete the file data before the metadata row, in case metadata_id is a foreign key (TODO confirm)
+    _attempt_rollback_step('phenotype data', lambda: PhenotypeDatabase.delete_by_metadata_id(tf.name))
+
+    # Delete the new trait from Phenotype Metadata SQL table
+    _attempt_rollback_step('phenotype metadata', delete_metadata)
+
+    # Delete the file from GCP bucket
+    if blob_name:
+        _attempt_rollback_step('uploaded file', delete_blob)
+
+    # Delete the previously created TraitFile object
+    _attempt_rollback_step('trait file entity', lambda: delete_ds_entity_by_ref(TraitFile.kind, tf.name))
+
diff --git a/src/modules/site-v2/base/views/data/data.py b/src/modules/site-v2/base/views/data/data.py index f20054e4..ec947aee 100644 --- a/src/modules/site-v2/base/views/data/data.py +++ b/src/modules/site-v2/base/views/data/data.py @@ -1,32 +1,23 @@ import yaml -import bleach import csv -from flask import render_template, Blueprint, redirect, url_for, request, flash, jsonify, abort +from flask import render_template, Blueprint, redirect, url_for, request, flash, jsonify from extensions import cache from config import config from caendr.models.error import EnvVarError, FileUploadError -from caendr.models.datastore import TraitFile, Species -from caendr.models.status import PublishStatus 
-from caendr.services.cloud.storage import get_blob, upload_blob_from_file_object, check_blob_exists +from caendr.models.datastore import Species +from caendr.services.cloud.storage import get_blob from caendr.services.logger import logger from caendr.services.validate import validate_file, StrainValidator, NumberValidator -from caendr.utils.data import unique_id -from base.utils.auth import jwt_required, get_current_user -from caendr.utils.env import get_env_var from caendr.utils.local_files import LocalUploadFile +from base.utils.auth import jwt_required, get_current_user +from base.utils.trait import add_trait from base.forms import TraitSubmissionForm from constants import TOOL_INPUT_DATA_VALID_FILE_EXTENSIONS -from caendr.models.sql import PhenotypeMetadata, PhenotypeDatabase - - -MODULE_DB_OPERATIONS_BUCKET_NAME = get_env_var('MODULE_DB_OPERATIONS_BUCKET_NAME') -MODULE_DB_OPERATIONS_TRAITFILE_PUBLIC_FILEPATH = get_env_var('MODULE_DB_OPERATIONS_TRAITFILE_PUBLIC_FILEPATH') - data_bp = Blueprint( 'data', __name__, template_folder='templates' ) @@ -80,7 +71,7 @@ def protocols(): # # Submit Trait # -@data_bp.route('/submit-trait') +@data_bp.route('/trait/start-submit') @jwt_required() def submit_trait_start(): """ Submit Trait start page """ @@ -92,7 +83,7 @@ def submit_trait_start(): # # Submit Trait Form # -@data_bp.route('/submit-trait/new-submission', methods=['GET', 'POST']) +@data_bp.route('/trait/create', methods=['GET', 'POST']) @jwt_required() def submit_trait_form(): """ Trait Submission Form """ @@ -112,111 +103,13 @@ def submit_trait_form(): flash('Please fill out all required fields.', 'warning') else: - try: - # Create a new TraitFile object - tf = TraitFile(unique_id()) - - # Create a unique filename for file upload - hashed_filename = f'{unique_id()}.tsv' - - tf.set_properties(**{ - # User submitted data - 'trait_name_user': bleach.clean(form.trait_name_user.data), - 'trait_name_display_1': bleach.clean(form.trait_name_display_1.data), - 
'trait_name_display_2': bleach.clean(form.trait_name_display_2.data), - 'trait_name_display_3': bleach.clean(form.trait_name_display_2.data), - 'filename': bleach.clean(form.file.data.filename), - 'species': bleach.clean(form.species.data), - 'description_short': bleach.clean(form.description_short.data), - 'description_long': bleach.clean(form.description_long.data), - 'units': bleach.clean(form.units.data), - 'tags': [ bleach.clean(tag) for tag in form.tags.data ], - 'institution': bleach.clean(form.institution.data), - 'source_lab': bleach.clean(form.source_lab.data), - 'protocols': bleach.clean(form.protocols.data), - 'publication': bleach.clean(form.publication.data), - - # Internally used data - 'dataset': 'public', - 'publish_status': PublishStatus.UPLOADED, - 'is_bulk_file': False, - 'hashed_filename': hashed_filename, - }) - - tf.set_user(user) - - # Save the TraitFile object to Datastore - tf.save() - - # Seed to Phenotype Metadata SQL table - new_trait = PhenotypeMetadata() - new_trait.add_trait(tf) - - except Exception as ex: - logger.error(f'Failed to create a trait file {form.trait_name_user.data}: {ex}') - flash('Failed to submit a form. 
Please try again later.', 'danger') - abort(500) - - # Save file to GCP bucket - species_name = Species.get(form.species.data).name - blob_name = f'{MODULE_DB_OPERATIONS_TRAITFILE_PUBLIC_FILEPATH}/{species_name}/{user.name}/{hashed_filename}' - - # Check if the file already exists - if check_blob_exists(MODULE_DB_OPERATIONS_BUCKET_NAME, blob_name): - flash('File already exists.', 'danger') + # Add the trait to the database + resp, code = add_trait(form, user) + if code != 200: + flash(resp['message'], 'danger') else: - upload_blob_from_file_object(MODULE_DB_OPERATIONS_BUCKET_NAME, form.file.data, blob_name) - - # Reset the file pointer - form.file.data.seek(0) - - try: - # Seed the file data to Phenotype Database SQL table - with LocalUploadFile(form.file.data, valid_file_extensions=TOOL_INPUT_DATA_VALID_FILE_EXTENSIONS) as file: - - # Validate the file - try: - validate_file(file, [ - StrainValidator( 'strain', species=tf['species'], force_unique=True, force_unique_msgs={} ), - NumberValidator( None, accept_float=True, accept_na=True ), - ]) - except Exception as ex: - flash(f'Failed to validate the file: {ex.msg}', 'danger') - return render_template('data/submit-trait-form.html', **{ - # Page Info - 'title': 'Phenotype Database Trait Submission', - 'tool_alt_parent_breadcrumb': {"title": "Submit Trait", "url": url_for('data.submit_trait_start')}, - - # Data - 'form': form, - }) - - # Parse the trait file - trait_data_list = [] - with open(file) as f: - for idx, row in enumerate( csv.reader(f, delimiter='\t') ): - if idx == 0: - continue - else: - trait_data = { - 'trait_name': tf['trait_name_user'], - 'strain_name': row[0], - 'trait_value': row[1], - 'metadata_id': tf.name - } - trait_data_list.append(trait_data) - - trait_data = PhenotypeDatabase() - trait_data.add_trait_data(trait_data_list) - - except FileUploadError as ex: - logger.error(f'Failed to upload a file {form.file.data.filename}: {ex}') - flash('Failed to submit a form. 
Please try again later.', 'danger') - abort(500) - - flash('Trait submitted successfully.', 'success') - # TODO: change the redirect to MTL - return redirect(url_for('data.submit_trait_start')) + flash('Trait submitted successfully.', 'success') + return redirect(url_for('data.submit_trait_start')) return render_template('data/submit-trait-form.html', **{ # Page Info @@ -230,7 +123,7 @@ def submit_trait_form(): # # File Upload # -@data_bp.route('/submit-trait/parse-file', methods=['POST']) +@data_bp.route('/trait/parse-file', methods=['POST']) @jwt_required() def parse_trait_file(): """ Parse the trait file and return the data """ @@ -259,3 +152,4 @@ def parse_trait_file(): except Exception as ex: logger.error(f'Failed to parse the file: {ex}') return jsonify({ 'message': 'Failed to parse the file. Please try again later.' }), 500 + diff --git a/src/modules/site-v2/templates/_includes/breadcrumb.html b/src/modules/site-v2/templates/_includes/breadcrumb.html index 4f79e6e1..619e10a3 100644 --- a/src/modules/site-v2/templates/_includes/breadcrumb.html +++ b/src/modules/site-v2/templates/_includes/breadcrumb.html @@ -11,7 +11,7 @@ {% elif alt_parent_breadcrumb %} {% elif tool_alt_parent_breadcrumb %} - {% if tool_alt_parent_breadcrumb["title"] == 'Strain Catalog' or tool_alt_parent_breadcrumb["title"] == 'Submit Trait'%} + {% if request.blueprint == 'request_strains' or request.blueprint == 'data'%} {% endif %} diff --git a/src/modules/site-v2/templates/data/submit-trait-form.html b/src/modules/site-v2/templates/data/submit-trait-form.html index 03a6d04c..cb8556c9 100644 --- a/src/modules/site-v2/templates/data/submit-trait-form.html +++ b/src/modules/site-v2/templates/data/submit-trait-form.html @@ -124,6 +124,10 @@

Trait Information

// Upload the file and display the content $('#file-upload').on('click', function(e) { e.preventDefault() + handleFileUpload() + }) + + function handleFileUpload() { const file = $('#file')[0].files[0] const species = $('#speciesSelect').val() if (file && species) { @@ -153,8 +157,8 @@

Trait Information

} } }) - } - }) + } + } function display_file_content(file) { $('#file-content').parent().removeClass('d-none') diff --git a/src/pkg/caendr/caendr/models/datastore/__init__.py b/src/pkg/caendr/caendr/models/datastore/__init__.py index 3461d2f7..aebc57a4 100644 --- a/src/pkg/caendr/caendr/models/datastore/__init__.py +++ b/src/pkg/caendr/caendr/models/datastore/__init__.py @@ -22,6 +22,7 @@ from .browser_track import BrowserTrackDefault # Subclasses FileRecordEntity (from BrowserTrack) from .browser_track import BrowserTrackTemplate # Subclasses FileRecordEntity (from BrowserTrack) from .trait_file import TraitFile # Subclasses FileRecordEntity, PublishableEntity, SpeciesEntity, UserOwnedEntity +from .trait_file import DatasetType # Job template classes from .job_entity import JobEntity # Subclasses StatusEntity; imports Container diff --git a/src/pkg/caendr/caendr/models/datastore/file_record_entity.py b/src/pkg/caendr/caendr/models/datastore/file_record_entity.py index b08cda23..e6c37dcd 100644 --- a/src/pkg/caendr/caendr/models/datastore/file_record_entity.py +++ b/src/pkg/caendr/caendr/models/datastore/file_record_entity.py @@ -24,7 +24,7 @@ def get_props_set(cls): return { *super().get_props_set(), 'filename', - 'hashed_filename', + 'filename_hash', } @@ -82,24 +82,24 @@ def filename(self, v): return self._set_raw_prop('filename', v) @property - def hashed_filename(self) -> TokenizedString: + def filename_hash(self) -> TokenizedString: ''' The hashed name of the file. Returns as a `TokenizedString`. ''' - if self._get_raw_prop('hashed_filename') is None: + if self._get_raw_prop('filename_hash') is None: return None - return TokenizedString( self._get_raw_prop('hashed_filename') ) + return TokenizedString( self._get_raw_prop('filename_hash') ) - @hashed_filename.setter - def hashed_filename(self, v): + @filename_hash.setter + def filename_hash(self, v): ''' Save the hashed name of the file itself. Saves internally as a raw string. 
''' if isinstance(v, TokenizedString): v = v.raw_string if not (isinstance(v, str) or v is None): - raise ValueError(f'Cannot set prop "hashed_filename" to "{v}" (type {type(v)}): must be a string') - return self._set_raw_prop('hashed_filename', v) + raise ValueError(f'Cannot set prop "filename_hash" to "{v}" (type {type(v)}): must be a string') + return self._set_raw_prop('filename_hash', v) # @@ -121,7 +121,7 @@ def get_filepath_hashed(self, schema: BlobURISchema = None, check_if_exists: boo ''' if check_if_exists and not self.check_exists(**kwargs): return None - return generate_blob_uri( self.bucket, self.prefix.get_string(**kwargs), self['hashed_filename'].get_string(**kwargs), schema=schema ) + return generate_blob_uri( self.bucket, self.prefix.get_string(**kwargs), self['filename_hash'].get_string(**kwargs), schema=schema ) def get_filepath_template(self, schema: BlobURISchema = None) -> TokenizedString: diff --git a/src/pkg/caendr/caendr/models/datastore/trait_file.py b/src/pkg/caendr/caendr/models/datastore/trait_file.py index 477d61a5..f268a37a 100644 --- a/src/pkg/caendr/caendr/models/datastore/trait_file.py +++ b/src/pkg/caendr/caendr/models/datastore/trait_file.py @@ -1,4 +1,5 @@ from typing import Tuple, Optional +from enum import Enum from caendr.utils.env import get_env_var @@ -10,12 +11,16 @@ DB_BUCKET_NAME = get_env_var('MODULE_DB_OPERATIONS_BUCKET_NAME') +class DatasetType(Enum): + """ Identifier for trait files folder in GCP Buckets """ + CAENDR = 'caendr' + PUBLIC = 'public' + ZHANG = 'zhang' class TraitFile(FileRecordEntity, PublishableEntity, SpeciesEntity, UserOwnedEntity): kind = 'trait_file' - # # Properties # @@ -74,9 +79,9 @@ def bucket(self): @property def prefix(self): - if self.dataset == 'public': - return TokenizedString(join_path('trait_files', self['dataset'], '${SPECIES}', '${USER_ID}')) - return TokenizedString(join_path('trait_files', self['dataset'], '${SPECIES}')) + if self.dataset == DatasetType.PUBLIC: + return 
TokenizedString(join_path('trait_files', self['dataset'].value, '${SPECIES}', '${USER_ID}')) + return TokenizedString(join_path('trait_files', self['dataset'].value, '${SPECIES}')) # @@ -85,7 +90,7 @@ def prefix(self): # The species is always determined by this entity itself, so we fill it in instead of letting the calling function supply it def get_filepath(self, schema: BlobURISchema = None, check_if_exists: bool = False): - if self.dataset == 'public': + if self.dataset == DatasetType.PUBLIC: return super().get_filepath_hashed(schema=schema, check_if_exists=check_if_exists, SPECIES=self['species'].name, USER_ID=self['username']) return super().get_filepath(schema=schema, check_if_exists=check_if_exists, SPECIES=self['species'].name) @@ -111,3 +116,13 @@ def display_name(self) -> Tuple[str, Optional[str], Optional[str]]: Combines `trait_name_display_1`, `trait_name_display_2`, and `trait_name_display_3` into a single tuple. ''' return self['trait_name_display_1'], self['trait_name_display_2'], self['trait_name_display_3'] + + @property + def dataset(self): + return self._get_enum_prop(DatasetType, 'dataset', None) + + @dataset.setter + def dataset(self, val): + if isinstance(val, str): + val = val.upper() + return self._set_enum_prop(DatasetType, 'dataset', val) \ No newline at end of file diff --git a/src/pkg/caendr/caendr/models/sql/phenotype.py b/src/pkg/caendr/caendr/models/sql/phenotype.py index e0e0053b..7dd2fdb6 100644 --- a/src/pkg/caendr/caendr/models/sql/phenotype.py +++ b/src/pkg/caendr/caendr/models/sql/phenotype.py @@ -1,4 +1,4 @@ -from caendr.services.cloud.postgresql import db +from caendr.services.cloud.postgresql import db, rollback_on_error from caendr.models.sql.dict_serializable import DictSerializable class PhenotypeDatabase(DictSerializable, db.Model): @@ -21,3 +21,14 @@ def add_trait_data(self, trait_data): """ db.session.bulk_insert_mappings(PhenotypeDatabase, trait_data) db.session.commit() + + @classmethod + @rollback_on_error + def 
delete_by_metadata_id(cls, metadata_id): + """ + Deletes entries from the Phenotype Database table for the given trait + """ + + del_statement = PhenotypeDatabase.__table__.delete().where(PhenotypeDatabase.metadata_id == metadata_id) + db.session.execute(del_statement) + db.session.commit() \ No newline at end of file diff --git a/src/pkg/caendr/caendr/models/sql/phenotype_metadata.py b/src/pkg/caendr/caendr/models/sql/phenotype_metadata.py index b471046d..ebf41330 100644 --- a/src/pkg/caendr/caendr/models/sql/phenotype_metadata.py +++ b/src/pkg/caendr/caendr/models/sql/phenotype_metadata.py @@ -61,7 +61,7 @@ def get_tags(self): return { tg.strip() for tg in self.tags.split(',') } - def add_trait(self, trait_obj): + def add(self, trait_obj): new_trait = PhenotypeMetadata( id = trait_obj.name, trait_name_user = trait_obj['trait_name_user'], @@ -81,8 +81,12 @@ def add_trait(self, trait_obj): tags = ', '.join(trait_obj['tags']), created_on = datetime.now(timezone.utc), modified_on = datetime.now(timezone.utc), - dataset = trait_obj['dataset'], + dataset = trait_obj['dataset'].value, is_bulk_file = trait_obj['is_bulk_file'] ) db.session.add(new_trait) db.session.commit() + + def delete(self): + db.session.delete(self) + db.session.commit()