From f9656ae16c8c4253bd0e67d70c063cb430bb72c0 Mon Sep 17 00:00:00 2001
From: Yamil Suarez
Date: Mon, 30 Oct 2023 22:44:31 -0400
Subject: [PATCH 1/5] Add .lower() to make string comparisons case-insensitive

---
 workbench          | 4 ++--
 workbench_utils.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/workbench b/workbench
index 9c00c66..f6ead89 100755
--- a/workbench
+++ b/workbench
@@ -2120,9 +2120,9 @@ if config['check'] is False and 'bootstrap' in config and len(config['bootstrap'
     else:
         logging.error(f"Bootstrap script {command} failed with exit code {str(return_code)}.")
 
-if config['task'] != 'create_from_files' and config['input_csv'].startswith('http') is True:
+if config['task'] != 'create_from_files' and config['input_csv'].lower().startswith('http') is True:
     get_csv_from_google_sheet(config)
-if config['task'] != 'create_from_files' and config['input_csv'].endswith('.xlsx') is True:
+if config['task'] != 'create_from_files' and config['input_csv'].lower().endswith('.xlsx') is True:
     get_csv_from_excel(config)
 
 validate_input_dir(config)
diff --git a/workbench_utils.py b/workbench_utils.py
index d667a8a..18a8c5a 100644
--- a/workbench_utils.py
+++ b/workbench_utils.py
@@ -79,7 +79,7 @@ def set_media_type(config, filepath, file_fieldname, csv_row):
         if oembed_media_type is not None:
             return oembed_media_type
 
-    if filepath.strip().startswith('http'):
+    if filepath.strip().lower().startswith('http'):
         preprocessed_file_path = get_preprocessed_file_path(config, file_fieldname, csv_row)
         filename = preprocessed_file_path.split('/')[-1]
         extension = filename.split('.')[-1]
@@ -124,7 +124,7 @@ def get_oembed_url_media_type(config, filepath):
     for oembed_provider in config['oembed_providers']:
         for mtype, provider_urls in oembed_provider.items():
             for provider_url in provider_urls:
-                if filepath.startswith(provider_url):
+                if filepath.lower().startswith(provider_url):
                     return mtype
 
     return None
@@ -2214,7 +2214,7 @@ def check_input(config, args):
                     rows_with_missing_files.append(file_check_row[config['id_field']])
                     logging.warning(message)
                 # Check for URLs.
-                elif file_check_row['file'].startswith('http'):
+                elif file_check_row['file'].lower().startswith('http'):
                     http_response_code = ping_remote_file(config, file_check_row['file'])
                     if http_response_code != 200 or ping_remote_file(config, file_check_row['file']) is False:
                         message = 'Remote file "' + file_check_row['file'] + '" identified in CSV "file" column for record with ID "' \
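
A note on what this first patch fixes: `str.startswith()` compares characters exactly, so a CSV value with an uppercase scheme such as `HTTP://...` was previously treated as a local file path. A minimal sketch of the before/after behavior (hypothetical URL, not taken from the patch):

```python
url = "HTTP://Example.com/File.jpg"

# str.startswith() is case-sensitive, so the uppercase scheme is missed...
print(url.startswith('http'))          # False
# ...while lowering a copy of the string first catches it.
print(url.lower().startswith('http'))  # True
```

Since URL schemes are case-insensitive per RFC 3986, normalizing before the prefix test matches the intent of these checks.
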
From 155863d0da8378d267fb941543e6e6f79057d72a Mon Sep 17 00:00:00 2001
From: Yamil Suarez
Date: Wed, 1 Nov 2023 21:42:19 -0400
Subject: [PATCH 2/5] Add additional lower() calls

---
 workbench_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/workbench_utils.py b/workbench_utils.py
index 18a8c5a..942532f 100644
--- a/workbench_utils.py
+++ b/workbench_utils.py
@@ -3887,7 +3887,7 @@ def get_csv_data(config, csv_file_target='node_fields', file_path=None):
         if os.path.exists(input_csv_path):
             os.remove(input_csv_path)
         get_csv_from_google_sheet(config)
-    elif file_path.endswith('.xlsx') is True:
+    elif file_path.lower().endswith('.xlsx') is True:
         input_csv_path = get_extracted_csv_file_path(config)
         if os.path.exists(input_csv_path):
             os.remove(input_csv_path)
@@ -6037,7 +6037,7 @@ def get_extracted_csv_file_path(config):
     """
     if config['input_csv'].startswith('http'):
         exported_csv_filename = config['google_sheets_csv_filename']
-    elif config['input_csv'].endswith('xlsx'):
+    elif config['input_csv'].lower().endswith('xlsx'):
         exported_csv_filename = config['excel_csv_filename']
     else:
         return False
@@ -6563,7 +6563,7 @@ def check_csv_file_exists(config, csv_file_target, file_path=None):
         message = "Extracting CSV data from " + config['input_csv'] + " (worksheet gid " + str(config['google_sheets_gid']) + ") to " + input_csv + '.'
         print(message)
         logging.info(message)
-    elif config['input_csv'].endswith('xlsx'):
+    elif config['input_csv'].lower().endswith('xlsx'):
         input_csv = get_extracted_csv_file_path(config)
         message = "Extracting CSV data from " + config['input_csv'] + " to " + input_csv + '.'
         print(message)
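
The same reasoning applies to the extension checks in this patch: `str.endswith()` is also case-sensitive, so a spreadsheet named, say, `metadata.XLSX` would not be recognized as an Excel file. A short sketch (hypothetical filename):

```python
import os

filename = "metadata.XLSX"

print(filename.endswith('.xlsx'))          # False: uppercase extension is missed
print(filename.lower().endswith('.xlsx'))  # True

# An equivalent idiom that normalizes only the extension:
print(os.path.splitext(filename)[1].lower() == '.xlsx')  # True
```
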
From a53c9d537b562b2761a1710e57df770eaf86668f Mon Sep 17 00:00:00 2001
From: Yamil Suarez
Date: Wed, 1 Nov 2023 21:51:18 -0400
Subject: [PATCH 3/5] Revert some lower() calls pending testing

---
 workbench          | 2 +-
 workbench_utils.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/workbench b/workbench
index f6ead89..6ab7378 100755
--- a/workbench
+++ b/workbench
@@ -2120,7 +2120,7 @@ if config['check'] is False and 'bootstrap' in config and len(config['bootstrap'
     else:
         logging.error(f"Bootstrap script {command} failed with exit code {str(return_code)}.")
 
-if config['task'] != 'create_from_files' and config['input_csv'].lower().startswith('http') is True:
+if config['task'] != 'create_from_files' and config['input_csv'].startswith('http') is True:
     get_csv_from_google_sheet(config)
 if config['task'] != 'create_from_files' and config['input_csv'].lower().endswith('.xlsx') is True:
     get_csv_from_excel(config)
diff --git a/workbench_utils.py b/workbench_utils.py
index 942532f..a40a4ab 100644
--- a/workbench_utils.py
+++ b/workbench_utils.py
@@ -79,7 +79,7 @@ def set_media_type(config, filepath, file_fieldname, csv_row):
         if oembed_media_type is not None:
             return oembed_media_type
 
-    if filepath.strip().lower().startswith('http'):
+    if filepath.strip().startswith('http'):
         preprocessed_file_path = get_preprocessed_file_path(config, file_fieldname, csv_row)
         filename = preprocessed_file_path.split('/')[-1]
         extension = filename.split('.')[-1]
@@ -124,7 +124,7 @@ def get_oembed_url_media_type(config, filepath):
     for oembed_provider in config['oembed_providers']:
         for mtype, provider_urls in oembed_provider.items():
             for provider_url in provider_urls:
-                if filepath.lower().startswith(provider_url):
+                if filepath.startswith(provider_url):
                     return mtype
 
     return None
@@ -2214,7 +2214,7 @@ def check_input(config, args):
                     rows_with_missing_files.append(file_check_row[config['id_field']])
                     logging.warning(message)
                 # Check for URLs.
-                elif file_check_row['file'].lower().startswith('http'):
+                elif file_check_row['file'].startswith('http'):
                     http_response_code = ping_remote_file(config, file_check_row['file'])
                     if http_response_code != 200 or ping_remote_file(config, file_check_row['file']) is False:
                         message = 'Remote file "' + file_check_row['file'] + '" identified in CSV "file" column for record with ID "' \
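
Reverting the oEmbed provider check in particular is defensible: lowercasing only one side of a comparison changes behavior whenever the other side can contain uppercase, and `provider_url` comes from the user-editable `oembed_providers` configuration. A sketch of the pitfall (hypothetical config value):

```python
# Hypothetical mixed-case value of the kind a config file could contain.
provider_url = "https://www.YouTube.com/"
filepath = "https://www.youtube.com/watch?v=xyz"

print(filepath.startswith(provider_url))                  # False: cases differ
print(filepath.lower().startswith(provider_url))          # still False: only one side normalized
print(filepath.lower().startswith(provider_url.lower()))  # True: both sides normalized
```

A truly case-insensitive match has to normalize both operands, which is presumably why these call sites were set aside for testing rather than changed wholesale.
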
From 125e4950dcfd5420045fd80965d57ce17b19010b Mon Sep 17 00:00:00 2001
From: Yamil Suarez
Date: Sun, 12 Nov 2023 16:54:07 -0500
Subject: [PATCH 4/5] Make check for input_csv URL case-insensitive

---
 workbench          | 2 +-
 workbench_utils.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/workbench b/workbench
index 6ab7378..f6ead89 100755
--- a/workbench
+++ b/workbench
@@ -2120,7 +2120,7 @@ if config['check'] is False and 'bootstrap' in config and len(config['bootstrap'
     else:
         logging.error(f"Bootstrap script {command} failed with exit code {str(return_code)}.")
 
-if config['task'] != 'create_from_files' and config['input_csv'].startswith('http') is True:
+if config['task'] != 'create_from_files' and config['input_csv'].lower().startswith('http') is True:
     get_csv_from_google_sheet(config)
 if config['task'] != 'create_from_files' and config['input_csv'].lower().endswith('.xlsx') is True:
     get_csv_from_excel(config)
diff --git a/workbench_utils.py b/workbench_utils.py
index a40a4ab..eccb7f0 100644
--- a/workbench_utils.py
+++ b/workbench_utils.py
@@ -3882,7 +3882,7 @@ def get_csv_data(config, csv_file_target='node_fields', file_path=None):
 
     if os.path.isabs(file_path):
         input_csv_path = file_path
-    elif file_path.startswith('http') is True:
+    elif file_path.lower().startswith('http') is True:
         input_csv_path = get_extracted_csv_file_path(config)
         if os.path.exists(input_csv_path):
             os.remove(input_csv_path)
@@ -6035,7 +6035,7 @@
         A file path with the current config file's unique ID appended to it. False if config['input_csv'] is not a Google Sheet or Excel file.
     """
-    if config['input_csv'].startswith('http'):
+    if config['input_csv'].lower().startswith('http'):
         exported_csv_filename = config['google_sheets_csv_filename']
     elif config['input_csv'].lower().endswith('xlsx'):
         exported_csv_filename = config['excel_csv_filename']
     else:
         return False
@@ -6558,7 +6558,7 @@ def check_csv_file_exists(config, csv_file_target, file_path=None):
     if os.path.isabs(config['input_csv']):
         input_csv = config['input_csv']
     # For Google Sheets, the "extraction" is fired over in workbench.
-    elif config['input_csv'].startswith('http'):
+    elif config['input_csv'].lower().startswith('http'):
         input_csv = get_extracted_csv_file_path(config)
         message = "Extracting CSV data from " + config['input_csv'] + " (worksheet gid " + str(config['google_sheets_gid']) + ") to " + input_csv + '.'
         print(message)
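
One property of the pattern used throughout this series is worth spelling out: `.lower()` returns a new string that is used only inside the test, so the original, case-preserved value is what gets passed to downstream functions. That matters because only the scheme and host of a URL are case-insensitive; the path may be case-sensitive depending on the server. A sketch (hypothetical value):

```python
input_csv = "HTTPS://docs.google.com/spreadsheets/d/AbC123/edit"  # hypothetical

# The lowered copy exists only for the duration of the check...
if input_csv.lower().startswith('http'):
    # ...and the original string, with its case-sensitive path intact,
    # is what later code (e.g. the actual HTTP request) receives.
    print(input_csv)
```
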
From ae52e72e283fcf3ccaddf2e0a937031aa92f4d25 Mon Sep 17 00:00:00 2001
From: Yamil Suarez
Date: Sun, 10 Dec 2023 16:21:03 -0500
Subject: [PATCH 5/5] Add several lower() calls to make code more case-insensitive

---
 workbench_utils.py | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/workbench_utils.py b/workbench_utils.py
index eccb7f0..29e0bfa 100644
--- a/workbench_utils.py
+++ b/workbench_utils.py
@@ -79,7 +79,7 @@ def set_media_type(config, filepath, file_fieldname, csv_row):
         if oembed_media_type is not None:
             return oembed_media_type
 
-    if filepath.strip().startswith('http'):
+    if filepath.strip().lower().startswith('http'):
         preprocessed_file_path = get_preprocessed_file_path(config, file_fieldname, csv_row)
         filename = preprocessed_file_path.split('/')[-1]
         extension = filename.split('.')[-1]
@@ -185,7 +185,7 @@ def set_model_from_extension(file_name, config):
     normalized_extension = extension.lower()
     for model_tids in config['models']:
         for tid, extensions in model_tids.items():
-            if str(tid).startswith('http'):
+            if str(tid).lower().startswith('http'):
                 tid = get_term_id_from_uri(config, tid)
             if normalized_extension in extensions:
                 return tid
@@ -2214,7 +2214,7 @@ def check_input(config, args):
                     rows_with_missing_files.append(file_check_row[config['id_field']])
                     logging.warning(message)
                 # Check for URLs.
-                elif file_check_row['file'].startswith('http'):
+                elif file_check_row['file'].lower().startswith('http'):
                     http_response_code = ping_remote_file(config, file_check_row['file'])
                     if http_response_code != 200 or ping_remote_file(config, file_check_row['file']) is False:
                         message = 'Remote file "' + file_check_row['file'] + '" identified in CSV "file" column for record with ID "' \
@@ -2273,7 +2273,7 @@
 
         # Check that each file's extension is allowed for the current media type. 'file' is the only
         # CSV field to check here. Files added using the 'additional_files' setting are checked below.
-        if file_check_row['file'].startswith('http'):
+        if file_check_row['file'].lower().startswith('http'):
             # First check to see if the file has an extension.
             extension = os.path.splitext(file_check_row['file'])[1]
             if len(extension) > 0:
@@ -2325,7 +2325,7 @@
                     else:
                         logging.warning(message)
 
-                if file_check_row[additional_file_field].startswith('http'):
+                if file_check_row[additional_file_field].lower().startswith('http'):
                     http_response_code = ping_remote_file(config, file_check_row[additional_file_field])
                     if http_response_code != 200 or ping_remote_file(config, file_check_row[additional_file_field]) is False:
                         message = 'Remote file ' + file_check_row[additional_file_field] + ' identified in CSV "' + additional_file_field + '" column for record with ID ' \
@@ -2372,7 +2372,7 @@
                 additional_filenames = file_check_row[additional_file_field].split(config['subdelimiter'])
                 media_type_file_field = config['media_type_file_fields'][media_type]
                 for additional_filename in additional_filenames:
-                    if additional_filename.startswith('http'):
+                    if additional_filename.lower().startswith('http'):
                         # First check to see if the file has an extension.
                         extension = os.path.splitext(additional_filename)[1]
                         if len(extension) > 0:
@@ -2967,9 +2967,9 @@ def validate_media_use_tid_in_additional_files_setting(config, media_use_tid_val
             media_use_tids.append(media_use_tid_value)
 
     for media_use_tid in media_use_tids:
-        if not value_is_numeric(media_use_tid) and media_use_tid.strip().startswith('http'):
+        if not value_is_numeric(media_use_tid) and media_use_tid.strip().lower().startswith('http'):
             media_use_tid = get_term_id_from_uri(config, media_use_tid.strip())
-        if not value_is_numeric(media_use_tid) and not media_use_tid.strip().startswith('http'):
+        if not value_is_numeric(media_use_tid) and not media_use_tid.strip().lower().startswith('http'):
             media_use_tid = find_term_in_vocab(config, 'islandora_media_use', media_use_tid.strip())
 
         term_endpoint = config['host'] + '/taxonomy/term/' + str(media_use_tid).strip() + '?_format=json'
@@ -3016,7 +3016,7 @@ def validate_media_use_tid(config, media_use_tid_value_from_csv=None, csv_row_id
 
     media_use_terms = str(media_use_tid_value).split(config['subdelimiter'])
     for media_use_term in media_use_terms:
-        if value_is_numeric(media_use_term) is not True and media_use_term.strip().startswith('http'):
+        if value_is_numeric(media_use_term) is not True and media_use_term.strip().lower().startswith('http'):
             media_use_tid = get_term_id_from_uri(config, media_use_term.strip())
             if csv_row_id is None:
                 if media_use_tid is False:
@@ -3041,7 +3041,7 @@
                     "derivative media. You should temporarily disable the Context or Action that generates those derivatives."
                 logging.warning(message)
 
-        elif value_is_numeric(media_use_term) is not True and media_use_term.strip().startswith('http') is not True:
+        elif value_is_numeric(media_use_term) is not True and media_use_term.strip().lower().startswith('http') is not True:
            media_use_tid = find_term_in_vocab(config, 'islandora_media_use', media_use_term.strip())
            if csv_row_id is None:
                if media_use_tid is False:
@@ -3235,7 +3235,7 @@ def create_file(config, filename, file_fieldname, node_csv_row, node_id):
     is_remote = False
     filename = filename.strip()
 
-    if filename.startswith('http'):
+    if filename.lower().startswith('http'):
         remote_file_http_response_code = ping_remote_file(config, filename)
         if remote_file_http_response_code != 200:
             return False
@@ -3371,7 +3371,7 @@ def create_media(config, filename, file_fieldname, node_id, csv_row, media_use_t
     else:
         file_result = create_file(config, filename, file_fieldname, csv_row, node_id)
 
-    if filename.startswith('http'):
+    if filename.lower().startswith('http'):
         if file_result > 0:
             filename = get_preprocessed_file_path(config, file_fieldname, csv_row, node_id, False)
 
@@ -3389,9 +3389,9 @@
     for media_use_term in media_use_terms:
         if value_is_numeric(media_use_term):
             media_use_tids.append(media_use_term)
-        if not value_is_numeric(media_use_term) and media_use_term.strip().startswith('http'):
+        if not value_is_numeric(media_use_term) and media_use_term.strip().lower().startswith('http'):
             media_use_tids.append(get_term_id_from_uri(config, media_use_term))
-        if not value_is_numeric(media_use_term) and not media_use_term.strip().startswith('http'):
+        if not value_is_numeric(media_use_term) and not media_use_term.strip().lower().startswith('http'):
             media_use_tids.append(find_term_in_vocab(config, 'islandora_media_use', media_use_term.strip()))
 
     media_bundle_response_code = ping_media_bundle(config, media_type)
@@ -4580,7 +4580,7 @@ def prepare_term_id(config, vocab_ids, field_name, term):
         return None
     # Special case: if the term starts with 'http', assume it's a Linked Data URI
     # and get its term ID from the URI.
-    elif term.startswith('http'):
+    elif term.lower().startswith('http'):
         # Note: get_term_id_from_uri() will return False if the URI doesn't match a term.
         tid_from_uri = get_term_id_from_uri(config, term)
         if value_is_numeric(tid_from_uri):
@@ -4949,7 +4949,7 @@ def validate_media_track_fields(config, csv_data):
 
     # Confirm that config['media_use_tid'] and row-level media_use_term is for Service File (http://pcdm.org/use#ServiceFile).
     if 'media_use_tid' in row:
-        if row['media_use_tid'].startswith('http') and row['media_use_tid'] != 'http://pcdm.org/use#ServiceFile':
+        if row['media_use_tid'].lower().startswith('http') and row['media_use_tid'] != 'http://pcdm.org/use#ServiceFile':
             message = f"{row['media_use_tid']} cannot be used as a \"media_use_tid\" value in your CSV when creating media tracks."
             logging.error(message)
             sys.exit('Error: ' + message)
@@ -5545,7 +5545,7 @@ def validate_taxonomy_reference_value(config, field_definitions, csv_field_name,
         # If this is a multi-taxonomy field, all term names (not IDs or URIs) must be namespaced using the vocab_id:term_name pattern,
         # regardless of whether config['allow_adding_terms'] is True. Also, we need to accommodate terms that are namespaced
         # and also contain a ':'.
-        if len(this_fields_vocabularies) > 1 and value_is_numeric(field_value) is False and not field_value.startswith('http'):
+        if len(this_fields_vocabularies) > 1 and value_is_numeric(field_value) is False and not field_value.lower().startswith('http'):
             split_field_values = field_value.split(config['subdelimiter'])
             for split_field_value in split_field_values:
                 if ':' in field_value:
@@ -5583,7 +5583,7 @@
                     logging.error(message + message_2)
                     sys.exit('Error: ' + message + message_2)
         # Then check values that are URIs.
-        elif field_value.strip().startswith('http'):
+        elif field_value.strip().lower().startswith('http'):
             field_value = field_value.strip()
             tid_from_uri = get_term_id_from_uri(config, field_value)
             if value_is_numeric(tid_from_uri):
@@ -5775,7 +5775,7 @@ def create_children_from_directory(config, parent_csv_record, parent_node_id):
         # Add field_model if that field exists in the child's content type.
         entity_fields = get_entity_fields(config, 'node', config['paged_content_page_content_type'])
         if 'field_model' in entity_fields:
-            if not value_is_numeric(config['paged_content_page_model_tid'].strip()) and config['paged_content_page_model_tid'].strip().startswith('http'):
+            if not value_is_numeric(config['paged_content_page_model_tid'].strip()) and config['paged_content_page_model_tid'].strip().lower().startswith('http'):
                 paged_content_model_tid = get_term_id_from_uri(config, config['paged_content_page_model_tid'].strip())
             else:
                 paged_content_model_tid = config['paged_content_page_model_tid'].strip()
@@ -6135,7 +6135,7 @@
         True if the file exists, false if not.
     """
     # It's a remote file.
-    if filename.startswith('http'):
+    if filename.lower().startswith('http'):
         try:
             head_response = requests.head(filename, allow_redirects=True, verify=config['secure_ssl_only'])
             if head_response.status_code == 200:
@@ -6201,7 +6201,7 @@
             return file_path_from_csv
 
     # It's a remote file.
-    if file_path_from_csv.startswith('http'):
+    if file_path_from_csv.lower().startswith('http'):
         if config['task'] == 'add_media':
             subdir = os.path.join(config['temp_dir'], re.sub('[^A-Za-z0-9]+', '_', str(node_csv_row['node_id'])))
         elif config['task'] == 'update_media':
@@ -6418,7 +6418,7 @@
         logging.warning(f'Node {node_id} has no media.')
         return False
 
-    if str(config['export_file_media_use_term_id']).startswith('http'):
+    if str(config['export_file_media_use_term_id']).lower().startswith('http'):
         config['export_file_media_use_term_id'] = get_term_id_from_uri(config, config['export_file_media_use_term_id'])
 
     if config['export_file_media_use_term_id'] is False:
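
Taken together, the series makes remote-file detection tolerant of any casing of the URL scheme. A quick way to exercise that behavior (a hypothetical pytest sketch, not part of the patch series):

```python
import pytest

@pytest.mark.parametrize("value, expected", [
    ("http://example.com/a.jpg", True),
    ("HTTP://example.com/a.jpg", True),
    ("HttPs://example.com/a.jpg", True),
    ("/tmp/local/a.jpg", False),
])
def test_remote_detection(value, expected):
    # Mirrors the normalized prefix check used throughout the patches.
    assert value.lower().startswith('http') == expected
```
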