Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Requeue updates #790

Merged
merged 6 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/openstudio-server-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ jobs:
if: |
github.ref == 'refs/heads/master' ||
github.ref == 'refs/heads/develop'
# github.ref == 'refs/heads/3.6.1-4'
# github.ref == 'refs/heads/3.6.1-3'
shell: bash
run: ./docker/deployment/scripts/deploy_docker_github_actions.sh
env:
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ services:
image: redis:6.0.9
ports:
- "6379:6379"
command: "redis-server --requirepass ${REDIS_PASSWORD}"
command: "redis-server --requirepass ${REDIS_PASSWORD} --maxclients 40000"
web:
image: nrel/openstudio-server:latest
ports:
Expand Down Expand Up @@ -72,7 +72,7 @@ services:
worker:
image: nrel/openstudio-server:latest
environment:
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=${REDIS_URL}
- MONGO_USER=${MONGO_USER}
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ services:
image: redis:6.0.9
ports:
- "6379:6379"
command: "redis-server --requirepass ${REDIS_PASSWORD}"
command: "redis-server --requirepass ${REDIS_PASSWORD} --maxclients 40000"
web:
image: 127.0.0.1:5000/openstudio-server
build:
Expand Down Expand Up @@ -69,7 +69,7 @@ services:
rails_env: docker
environment:
- OS_SERVER_NUMBER_OF_WORKERS=${OS_SERVER_NUMBER_OF_WORKERS}
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=${REDIS_URL}
- MONGO_USER=${MONGO_USER}
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ services:
image: redis:6.0.9
ports:
- "6379:6379"
command: "redis-server --requirepass ${REDIS_PASSWORD}"
command: "redis-server --requirepass ${REDIS_PASSWORD} --maxclients 40000"
web:
image: nrel/openstudio-server:latest
build:
Expand Down Expand Up @@ -75,7 +75,7 @@ services:
bundle_args: ''
environment:
- OS_SERVER_NUMBER_OF_WORKERS=1
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=${REDIS_URL}
- MONGO_USER=${MONGO_USER}
Expand Down
4 changes: 2 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ services:
image: redis:6.0.9
ports:
- "6379:6379"
command: "redis-server --requirepass ${REDIS_PASSWORD}"
command: "redis-server --requirepass ${REDIS_PASSWORD} --maxclients 40000"
web:
image: nrel/openstudio-server:latest
build:
Expand Down Expand Up @@ -69,7 +69,7 @@ services:
rails_env: docker
environment:
- OS_SERVER_NUMBER_OF_WORKERS=${OS_SERVER_NUMBER_OF_WORKERS}
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=${REDIS_URL}
- MONGO_USER=${MONGO_USER}
Expand Down
2 changes: 1 addition & 1 deletion docker/server/start-web-background.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ echo "Waiting for Redis to start"

#cd /opt/openstudio/server && bundle exec rake environment resque:work
echo "Startup two resque workers"
cd /opt/openstudio/server && COUNT=2 bundle exec rake environment resque:workers
cd /opt/openstudio/server && COUNT=6 bundle exec rake environment resque:workers
4 changes: 2 additions & 2 deletions local_setup_scripts/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ services:
resources:
reservations:
cpus: "1"
command: "redis-server --requirepass openstudio"
command: "redis-server --requirepass openstudio --maxclients 40000"
web:
image: 127.0.0.1:5000/openstudio-server
ports:
Expand Down Expand Up @@ -84,7 +84,7 @@ services:
worker:
image: 127.0.0.1:5000/openstudio-server
environment:
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=redis://:openstudio@queue:6379
- MONGO_USER=openstudio
Expand Down
4 changes: 2 additions & 2 deletions local_setup_scripts/win64/docker-compose-mock.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ services:
placement:
constraints:
- node.role == manager
command: "redis-server --requirepass openstudio"
command: "redis-server --requirepass openstudio --maxclients 40000"
web:
image: 127.0.0.1:5000/openstudio-server
ports:
Expand Down Expand Up @@ -72,7 +72,7 @@ services:
worker:
image: 127.0.0.1:5000/openstudio-server
environment:
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=redis://:openstudio@queue:6379
- MONGO_USER=openstudio
Expand Down
4 changes: 2 additions & 2 deletions local_setup_scripts/win64/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ services:
placement:
constraints:
- node.role == manager
command: "redis-server --requirepass openstudio"
command: "redis-server --requirepass openstudio --maxclients 40000"
web:
image: 127.0.0.1:5000/openstudio-server
ports:
Expand Down Expand Up @@ -72,7 +72,7 @@ services:
worker:
image: 127.0.0.1:5000/openstudio-server
environment:
- QUEUES=simulations
- QUEUES=requeued,simulations
- COUNT=1
- REDIS_URL=redis://:openstudio@queue:6379
- MONGO_USER=openstudio
Expand Down
40 changes: 40 additions & 0 deletions server/app/controllers/admin_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,46 @@ def index
@os_cli = version ? version.strip : 'Unknown'
end

def prune_resque_workers
Rails.logger.warn "Pruning Dead Resque Workers"

# Enqueue a new job
worker = Resque::Worker.new()
worker.prune_dead_workers
worker.shutdown

respond_to do |format|
format.html { redirect_to admin_index_path, notice: 'Resque Workers Pruned.' }
format.json { head :no_content }
end
end

# POST /jobs/requeue_failed
def requeue_failed
Rails.logger.warn "Requeueing Failed Jobs to 'requeued' Queue"

requeued_count = 0

Resque::Failure.each do |id, failed_job|
payload = failed_job['payload']
job_class = payload['class']
job_args = payload['args']

# Requeue the job to the 'requeued' queue
Resque.enqueue_to('requeued', job_class.constantize, *job_args)
# Remove the job from the failed queue
Resque::Failure.remove(id)
requeued_count += 1
end

Rails.logger.info "#{requeued_count} jobs successfully requeued to 'requeued' queue by requeue_failed"

respond_to do |format|
format.html { redirect_to admin_index_path, notice: "#{requeued_count} Resque jobs requeued." }
format.json { render json: { message: "#{requeued_count} jobs requeued to 'requeued' queue" }, status: :ok }
end
end

def backup_database
logger.info params
write_and_send_data
Expand Down
22 changes: 22 additions & 0 deletions server/app/controllers/analyses_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ def destroy
# stop analysis button action
def stop
@analysis = Analysis.find(params[:id])
if @analysis.nil?
logger.error "Analysis with ID #{params[:id]} not found."
redirect_to analyses_path, alert: 'Analysis not found.' and return
end
res = @analysis.stop_analysis

respond_to do |format|
Expand All @@ -208,6 +212,24 @@ def stop
end
end
end

# stop analysis button action
def soft_stop
@analysis = Analysis.find(params[:id])
if @analysis.nil?
logger.error "Analysis with ID #{params[:id]} not found."
redirect_to analyses_path, alert: 'Analysis not found.' and return
end
res = @analysis.soft_stop_analysis

respond_to do |format|
if res[0]
format.html { redirect_to @analysis, notice: 'Analysis flag changed to stop. Will NOT wait until the last submitted run finishes before killing.' }
else
format.html { redirect_to @analysis, notice: 'Analysis flag did NOT change.' }
end
end
end

# Controller for submitting the action via post. This right now only works with the API
# and will only return a JSON response based on whether or not the analysis has been
Expand Down
20 changes: 19 additions & 1 deletion server/app/controllers/data_points_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -302,13 +302,15 @@ def requeue
Rails.logger.warn "data_points_controller.REQUEUE"
@data_point = DataPoint.find(params[:id])
analysis_id = @data_point.analysis
Rails.logger.warn "data_points_contoller.REQUEUEing #{@data_point.id}"
Rails.logger.debug "data_points_controller.id: #{@data_point.id}"
Rails.logger.debug "data_points_controller.job_id: #{@data_point.job_id}"
# Destroy the existing job in Resque queue; this is tied to a worker_host:PID:uuid
Resque::Job.destroy(:requeue, 'ResqueJobs::RunSimulateDataPoint', @data_point.job_id)
Resque::Job.destroy(:simulations, 'ResqueJobs::RunSimulateDataPoint', @data_point.job_id)

# Enqueue a new job
Resque.enqueue(ResqueJobs::RunSimulateDataPoint, @data_point.job_id)
Resque.enqueue_to(:requeued, ResqueJobs::RunSimulateDataPoint, @data_point.job_id)

# Attempt to find the worker processing this job
#worker = find_resque_worker_by_job_id(@data_point.job_id)
Expand All @@ -335,6 +337,22 @@ def requeue
end
end

def requeue_started
@analysis = Analysis.find(params[:id])
Rails.logger.warn "Requeueing all NOT completed normal simulations for analysis #{@analysis.id}"
data_points_to_requeue = @analysis.data_points.where.not(status_message: 'completed normal')

data_points_to_requeue.each do |dp|
Rails.logger.warn "Requeueing DataPoint #{dp.id}"
Resque.enqueue_to(:requeued, ResqueJobs::RunSimulateDataPoint, dp.job_id)
end

respond_to do |format|
format.html { redirect_to analysis_path(@analysis), notice: 'Simulations were successfully requeued.' }
format.json { head :no_content }
end
end

def find_resque_worker_by_job_id(job_id)
Rails.logger.debug "data_points_controller.find_resque_worker_by_job_id"
Resque.workers.each do |worker|
Expand Down
31 changes: 29 additions & 2 deletions server/app/controllers/pages_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def dashboard
# data for dashboard header
@projects = Project.all
# sort works because the states are queued, started, completed, na. started is the last in the list...
@analyses = Analysis.all.order_by(:updated_at.asc)
#@analyses = Analysis.all.order_by(:updated_at.asc)
failed_runs = DataPoint.where(status_message: 'datapoint failure').count
total_runs = DataPoint.all.count
completed_cnt = DataPoint.where(status: 'completed').count
Expand All @@ -50,7 +50,34 @@ def dashboard

# Finding the current analysis
#candidates = Analysis.includes(:jobs).order_by(:updated_at.asc)
@current = @analyses.detect { |analysis| analysis.jobs.any? { |job| job.status == 'started' } }
#@current = @analyses.detect { |analysis| analysis.jobs.any? { |job| job.status == 'started' } }

# Step 1: Fetch all analyses ordered by updated_at descending (newest first)
all_analyses = Analysis.includes(:jobs).order_by(updated_at: :desc)

# Step 2: Select the first two 'started' analyses
#started_analyses = all_analyses.select { |analysis| analysis.jobs.any? { |job| job.status == 'started' } }.first(2)
# Step 2: Efficiently select the first two 'started' analyses, leveraging Ruby for fine-tuned sorting
started_analyses = all_analyses.select { |analysis| analysis.jobs.any? { |job| job.status == 'started' }
}.sort_by { |analysis|
# This finds the earliest start time among the started jobs for sorting
analysis.jobs.select { |job| job.status == 'started' }.min_by(&:created_at).created_at
}.first(2)

# Step 3: Set @current and prepare @analyses
if started_analyses.any?
# Assume @current is the most recently updated
@current = started_analyses.first
# Ensure @analyses includes other analyses without changing the original order too much
# Remove @current from all_analyses and prepend the second 'started' analysis if it exists
all_analyses -= started_analyses
all_analyses.prepend(started_analyses.second) if started_analyses.length > 1
else
# Fallback if no 'started' analyses found
@current = all_analyses.first
end

@analyses = all_analyses
# If no 'started' analysis is currently running, optionally set @current to the most recently updated analysis
@current ||= @analyses.first

Expand Down
18 changes: 11 additions & 7 deletions server/app/jobs/resque_jobs/run_simulate_data_point.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,25 @@ def self.perform(data_point_id, options = {})
if !(statuses[:status] == 'completed' && statuses[:status_message] == 'completed normal')
msg = "RUNNING DJ: #{statuses[:status]} and #{statuses[:status_message]}"
d.add_to_rails_log(msg)
puts msg
job = DjJobs::RunSimulateDataPoint.new(data_point_id, options)
job.perform
else
msg = "SKIPPING #{data_point_id} since it is #{statuses[:status]} and #{statuses[:status_message]}"
d.add_to_rails_log(msg)
puts msg
end
rescue Errno::ENOSPC, Resque::DirtyExit, Resque::TermException, Resque::PruneDeadWorkerDirtyExit => e
rescue SignalException, Errno::ENOSPC, Resque::DirtyExit, Resque::TermException, Resque::PruneDeadWorkerDirtyExit => e
# Log the termination and re-enqueue attempt
d.add_to_rails_log("Worker Caught Exception: #{e.inspect}: Re-enqueueing DataPoint ID #{data_point_id}")
Resque.enqueue(self, data_point_id, options)
d.add_to_rails_log("DataPoint #{data_point_id} re-enqueued.")
d.add_to_rails_log("Worker Caught Exception: #{e.inspect}")#: Re-enqueueing DataPoint ID #{data_point_id}")
#Resque.enqueue_to(:requeued, self, data_point_id, options)
#puts "DataPoint #{data_point_id} re-enqueued."
puts "Worker Caught Exception: #{e.inspect}"
rescue => e
d.add_to_rails_log("Worker Caught Unhandled Exception: #{e.message}: Re-enqueueing DataPoint ID #{data_point_id}")
Resque.enqueue(self, data_point_id, options)
d.add_to_rails_log("Unhandled exception, re-enqueued DataPoint.")
d.add_to_rails_log("Worker Caught Unhandled Exception: #{e.message}")#: Re-enqueueing DataPoint ID #{data_point_id}")
#Resque.enqueue_to(:requeued, self, data_point_id, options)
#puts "Unhandled exception, re-enqueued DataPoint."
puts "Worker Caught Unhandled Exception: #{e.message}"
end
end
end
20 changes: 20 additions & 0 deletions server/app/models/analysis.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,26 @@ def stop_analysis
[save!, errors]
end

def soft_stop_analysis
Rails.logger.info('attempting to stop analysis')

self.run_flag = false

jobs.each do |j|
unless j.status == 'completed'
j.status = 'completed'
j.end_time = Time.new
j.status_message = 'datapoint canceled'
j.save!
end
end

# Remove all the queued background jobs for this analysis
data_points.where(status: 'queued').each(&:set_soft_canceled_state)

[save!, errors]
end

# Method that pulls out the variables from the uploaded problem/analysis JSON.
def pull_out_os_variables
pat_json = false
Expand Down
11 changes: 11 additions & 0 deletions server/app/models/data_point.rb
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ def set_canceled_state
self.status_message = 'datapoint canceled'
save!
end

def set_soft_canceled_state
Rails.logger.debug "data_point.set_soft_canceled_state"
#destroy_background_job # destroy queued job
self.run_start_time ||= Time.now
self.run_end_time = Time.now
self.status = :completed
self.status_message = 'datapoint canceled'
save!
end

def set_queued_state
Rails.logger.debug "data_point.set_queued_state"
Expand Down Expand Up @@ -171,6 +181,7 @@ def destroy_background_job
elsif Rails.application.config.job_manager == :resque
if job_id
Resque::Job.destroy(:simulations, 'ResqueJobs::RunSimulateDataPoint', job_id)
Resque::Job.destroy(:requeued, 'ResqueJobs::RunSimulateDataPoint', job_id)
end
else
raise 'Rails.application.config.job_manager must be set to :resque or :delayed_job'
Expand Down
Loading
Loading