Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize iteration in DatasetInstance model + SA2.0 fix #16776

Merged
merged 3 commits
Oct 10, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions lib/galaxy/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from collections.abc import Callable
from datetime import timedelta
from enum import Enum
from itertools import chain
from string import Template
from typing import (
Any,
Expand Down Expand Up @@ -4487,10 +4488,9 @@ def ok_to_edit_metadata(self):
# prevent modifying metadata when dataset is queued or running as input/output
# This code could be more efficient, e.g. by using mappers, but to prevent slowing down loading a History panel, we'll leave the code here for now
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this comment now go away?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well now that you mention it... I've looked at the generated queries. Given that this needs to be efficient, we can do better.

Currently, this is what's sent to the database before the loop (using dataset_id=42, job_id=1 as placeholder values):

SELECT
	job_to_input_dataset.id,
	job_to_input_dataset.job_id,
	job_to_input_dataset.dataset_id,
	job_to_input_dataset.dataset_version,
	job_to_input_dataset.name,
	dataset_1.id AS id_1,
	dataset_1.job_id AS job_id_1,
	dataset_1.create_time,
	dataset_1.update_time,
	dataset_1.state,
	dataset_1.deleted,
	dataset_1.purged,
	dataset_1.purgable,
	dataset_1.object_store_id,
	dataset_1.external_filename,
	dataset_1._extra_files_path,
	dataset_1.created_from_basename,
	dataset_1.file_size,
	dataset_1.total_size,
	dataset_1.uuid,
	history_dataset_association_1.id AS id_2,
	history_dataset_association_1.history_id,
	history_dataset_association_1.dataset_id AS dataset_id_1,
	history_dataset_association_1.create_time AS create_time_1,
	history_dataset_association_1.update_time AS update_time_1,
	history_dataset_association_1.state AS state_1,
	history_dataset_association_1.copied_from_history_dataset_association_id,
	history_dataset_association_1.copied_from_library_dataset_dataset_association_id,
	history_dataset_association_1.name AS name_1,
	history_dataset_association_1.info,
	history_dataset_association_1.blurb,
	history_dataset_association_1.peek,
	history_dataset_association_1.tool_version,
	history_dataset_association_1.extension,
	history_dataset_association_1.metadata_deferred,
	history_dataset_association_1.parent_id,
	history_dataset_association_1.designation,
	history_dataset_association_1.deleted AS deleted_1,
	history_dataset_association_1.visible,
	history_dataset_association_1.extended_metadata_id,
	history_dataset_association_1.version,
	history_dataset_association_1.hid,
	history_dataset_association_1.purged AS purged_1,
	history_dataset_association_1.validated_state,
	history_dataset_association_1.validated_state_message,
	history_dataset_association_1.hidden_beneath_collection_instance_id
FROM
	job_to_input_dataset
	LEFT JOIN history_dataset_association AS history_dataset_association_1 ON
			history_dataset_association_1.id = job_to_input_dataset.dataset_id
	LEFT JOIN dataset AS dataset_1 ON dataset_1.id = history_dataset_association_1.dataset_id
WHERE
	job_to_input_dataset.dataset_id = 42;

SELECT
	job_to_output_dataset.id,
	job_to_output_dataset.job_id,
	job_to_output_dataset.dataset_id,
	job_to_output_dataset.name,
	dataset_1.id AS id_1,
	dataset_1.job_id AS job_id_1,
	dataset_1.create_time,
	dataset_1.update_time,
	dataset_1.state,
	dataset_1.deleted,
	dataset_1.purged,
	dataset_1.purgable,
	dataset_1.object_store_id,
	dataset_1.external_filename,
	dataset_1._extra_files_path,
	dataset_1.created_from_basename,
	dataset_1.file_size,
	dataset_1.total_size,
	dataset_1.uuid,
	history_dataset_association_1.id AS id_2,
	history_dataset_association_1.history_id,
	history_dataset_association_1.dataset_id AS dataset_id_1,
	history_dataset_association_1.create_time AS create_time_1,
	history_dataset_association_1.update_time AS update_time_1,
	history_dataset_association_1.state AS state_1,
	history_dataset_association_1.copied_from_history_dataset_association_id,
	history_dataset_association_1.copied_from_library_dataset_dataset_association_id,
	history_dataset_association_1.name AS name_1,
	history_dataset_association_1.info,
	history_dataset_association_1.blurb,
	history_dataset_association_1.peek,
	history_dataset_association_1.tool_version,
	history_dataset_association_1.extension,
	history_dataset_association_1.metadata_deferred,
	history_dataset_association_1.parent_id,
	history_dataset_association_1.designation,
	history_dataset_association_1.deleted AS deleted_1,
	history_dataset_association_1.visible,
	history_dataset_association_1.extended_metadata_id,
	history_dataset_association_1.version,
	history_dataset_association_1.hid,
	history_dataset_association_1.purged AS purged_1,
	history_dataset_association_1.validated_state,
	history_dataset_association_1.validated_state_message,
	history_dataset_association_1.hidden_beneath_collection_instance_id
FROM
	job_to_output_dataset
	LEFT JOIN history_dataset_association AS history_dataset_association_1 ON
			history_dataset_association_1.id = job_to_output_dataset.dataset_id
	LEFT JOIN dataset AS dataset_1 ON dataset_1.id = history_dataset_association_1.dataset_id
WHERE
	job_to_output_dataset.dataset_id = 42;

Then, at each iteration step this is sent to the database:

SELECT
	job.id AS job_id,
	job.create_time AS job_create_time,
	job.update_time AS job_update_time,
	job.history_id AS job_history_id,
	job.library_folder_id AS job_library_folder_id,
	job.tool_id AS job_tool_id,
	job.tool_version AS job_tool_version,
	job.galaxy_version AS job_galaxy_version,
	job.dynamic_tool_id AS job_dynamic_tool_id,
	job.state AS job_state,
	job.info AS job_info,
	job.copied_from_job_id AS job_copied_from_job_id,
	job.command_line AS job_command_line,
	job.dependencies AS job_dependencies,
	job.job_messages AS job_job_messages,
	job.param_filename AS job_param_filename,
	job.runner_name AS job_runner_name_1,
	job.job_stdout AS job_job_stdout,
	job.job_stderr AS job_job_stderr,
	job.tool_stdout AS job_tool_stdout,
	job.tool_stderr AS job_tool_stderr,
	job.exit_code AS job_exit_code,
	job.traceback AS job_traceback,
	job.session_id AS job_session_id,
	job.user_id AS job_user_id,
	job.job_runner_name AS job_job_runner_name,
	job.job_runner_external_id AS job_job_runner_external_id,
	job.destination_id AS job_destination_id,
	job.destination_params AS job_destination_params,
	job.object_store_id AS job_object_store_id,
	job.imported AS job_imported,
	job.params AS job_params,
	job.handler AS job_handler,
	job.preferred_object_store_id AS job_preferred_object_store_id,
	job.object_store_id_overrides AS job_object_store_id_overrides,
	EXISTS(
		SELECT
			history_dataset_collection_association.id
		FROM
			history_dataset_collection_association, job_to_output_dataset_collection
		WHERE
			job.id = job_to_output_dataset_collection.job_id
			AND history_dataset_collection_association.id
				= job_to_output_dataset_collection.dataset_collection_id
			AND history_dataset_collection_association.deleted = true
	)
		AS anon_1,
	EXISTS(
		SELECT
			history_dataset_association.id
		FROM
			history_dataset_association, job_to_output_dataset
		WHERE
			job.id = job_to_output_dataset.job_id
			AND history_dataset_association.id = job_to_output_dataset.dataset_id
			AND history_dataset_association.deleted = true
	)
		AS anon_2
FROM
	job
WHERE
	job.id = 1;

So, instead, before the loop:

SELECT
	job_to_input_dataset.job_id
FROM
	job_to_input_dataset
WHERE
	job_to_input_dataset.dataset_id = 42;

SELECT
	job_to_output_dataset.job_id
FROM
	job_to_output_dataset
WHERE
	job_to_output_dataset.dataset_id = 42;

And at each iteration step:

SELECT job.state FROM job WHERE job.id = 1;

Fix coming up in next commit.

sa_session = object_session(self)
for job_to_dataset_association in (
sa_session.query(JobToInputDatasetAssociation).filter_by(dataset_id=self.id).all()
+ sa_session.query(JobToOutputDatasetAssociation).filter_by(dataset_id=self.id).all()
):
stmt1 = select(JobToInputDatasetAssociation).filter_by(dataset_id=self.id)
stmt2 = select(JobToOutputDatasetAssociation).filter_by(dataset_id=self.id)
for job_to_dataset_association in chain(sa_session.scalars(stmt1), sa_session.scalars(stmt2)):
if job_to_dataset_association.job.state not in Job.terminal_states:
return False
return True
Expand Down