Skip to content

Commit

Permalink
feat: train_parallel.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
db0 committed Nov 19, 2024
1 parent 40a9b48 commit 8e846d1
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 8 deletions.
8 changes: 0 additions & 8 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,6 @@

random.seed()

# Database connection string for Optuna - don't use root :)
DB_CONNECTION_STRING = "mysql://root:root@localhost/optuna"

# # Where is our training data?
# TRAINING_DATA_FILENAME = "f:/ai/dev/AI-Horde-Worker/inference-time-data.json"
# VALIDATION_DATA_FILENAME = "f:/ai/dev/AI-Horde-Worker/inference-time-data-validation.json"

# Number of trials to run.
# Each trial generates a new neural network topology with new hyper parameters and trains it.
NUMBER_OF_STUDY_TRIALS = 300
Expand Down Expand Up @@ -664,7 +657,6 @@ def main():
study.optimize(
objective,
n_trials=NUMBER_OF_STUDY_TRIALS,
show_progress_bar=True,
callbacks=[TerminatorCallback(terminator)],
)
except KeyboardInterrupt:
Expand Down
50 changes: 50 additions & 0 deletions train_parallel.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash
#
# Launch several `python train.py` processes in parallel and wait for them.
#
# Usage:   train_parallel.sh <number_of_instances> <version>
# Example: train_parallel.sh 20 v26
#
# Arguments:
#   $1 - number of instances to start (positive integer)
#   $2 - version tag of the form v<number>; forwarded to train.py via -ev
#
# Outputs:
#   Progress messages on stdout, usage/errors on stderr. The combined
#   stdout/stderr of instance K goes to logs/train_<version>_<K>.log.
#
# Exit status:
#   0 if every instance exited successfully, 1 on bad arguments, on failure
#   to create the logs directory, or if at least one instance failed.

set -u

# Check if all required arguments were provided
if [[ $# -ne 2 ]]; then
  echo "Usage: $0 <number_of_instances> <version>" >&2
  echo "Example: $0 20 v26" >&2
  exit 1
fi

# Arguments
N=$1
VERSION=$2

# Validate number of instances is a positive number.
# 10# forces base 10 so that input with leading zeros (e.g. "08") is not
# rejected as an invalid octal literal by shell arithmetic.
if ! [[ "$N" =~ ^[0-9]+$ ]] || (( 10#$N < 1 )); then
  echo "Error: First argument must be a positive number" >&2
  exit 1
fi
N=$(( 10#$N ))

# Validate version has the form 'v' followed by one or more digits
if ! [[ "$VERSION" =~ ^v[0-9]+$ ]]; then
  echo "Error: Version must be in format 'v<number>' (e.g., v26)" >&2
  exit 1
fi

# Create logs directory if it doesn't exist
if ! mkdir -p logs; then
  echo "Error: could not create logs directory" >&2
  exit 1
fi

echo "Starting $N training instances with version $VERSION..."

# Start N instances in parallel, remembering each PID. The PIDs must be
# recorded here: once a job has been reaped it no longer shows up in
# `jobs -p`, so its exit status could not be queried afterwards.
pids=()
for (( i = 1; i <= N; i++ )); do
  # Run each instance with its output redirected to its own log file
  python train.py -ev "$VERSION" > "logs/train_${VERSION}_${i}.log" 2>&1 &
  pids+=("$!")
  echo "Started instance $i with version $VERSION"
done

# Wait for every instance individually so each exit status is collected
failures=()
for idx in "${!pids[@]}"; do
  pid=${pids[idx]}
  wait "$pid"
  status=$?
  if (( status != 0 )); then
    instance=$(( idx + 1 ))
    failures+=("Process $pid failed (instance $instance, exit status $status, log: logs/train_${VERSION}_${instance}.log)")
  fi
done

echo "All training instances have completed"

# Report failed instances and reflect them in the script's exit status
if (( ${#failures[@]} > 0 )); then
  printf '%s\n' "${failures[@]}" >&2
  exit 1
fi

exit 0

0 comments on commit 8e846d1

Please sign in to comment.