Spaces:
Sleeping
Sleeping
File size: 1,498 Bytes
7245cc5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
#!/bin/bash
#SBATCH --job-name=train_deblur
#SBATCH --nodes=1
#SBATCH --gpus-per-node=4
#SBATCH --qos=gpu4-8h
#SBATCH --signal=B:USR1@600
#SBATCH --cpus-per-task=24
#SBATCH --output=output/slurm-%j.out
#SBATCH --error=error/slurm-%j.err
#SBATCH --nodelist=lse-hpcnode[8]
# The --signal lead time (600 s, set above) must be larger than the sleep in the handler function (300 s).
# Prepare the runtime environment: load the user's shell configuration,
# activate the "gencam" conda environment and enter the training directory.
source ~/.bashrc
conda activate gencam
# Abort when the training directory is unavailable; the launch command later
# in this script uses paths relative to it.
cd /datasets/sai/gencam/cogvideox/training || {
  echo "Cannot change directory to /datasets/sai/gencam/cogvideox/training, quitting" >&2
  exit 1
}
# Make GPU indices 0-3 visible to CUDA (four GPUs, as requested from Slurm).
export CUDA_VISIBLE_DEVICES=0,1,2,3
# Allocator setting read by PyTorch's CUDA caching allocator.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# The training config file is a required first positional argument.
if [[ -z "${1:-}" ]]; then
  # Quit if no config file is passed; report on stderr so the message ends up
  # in the job's error file rather than its regular output.
  echo "No config file passed, quitting" >&2
  exit 1
fi
config_file=$1
#######################################
# SIGUSR1 trap handler: asks the running training processes to wind down and
# resubmits this script with the same config file.
# Globals:
#   accelerate_pid (read) - PID of the backgrounded accelerate launcher
#   config_file    (read) - config file forwarded to the resubmitted job
# Arguments:
#   None
# Outputs:
#   Progress messages on stdout, warnings on stderr.
#######################################
handler() {
  echo "function handler called at $(date)"
  if [[ -n "${accelerate_pid:-}" ]]; then
    echo "Sending SIGUSR1 to accelerate PID: $accelerate_pid"
    # Collect every direct child of the launcher. ps prints one padded PID per
    # line; read splits them on whitespace into the array. read returns
    # non-zero at end of input, which is expected here.
    local -a child_pids=()
    read -r -d '' -a child_pids < <(ps --ppid "$accelerate_pid" -o pid=) || true
    if (( ${#child_pids[@]} > 0 )); then
      # Send SIGUSR1 to the children of the accelerate job
      kill -USR1 "${child_pids[@]}"
      sleep 300 # Wait for 5 minutes
    else
      # Nothing to signal: skip the kill (which would fail without a target)
      # and the 5-minute wait.
      echo "No child processes found for accelerate PID: $accelerate_pid" >&2
    fi
  else
    echo "No accelerate PID found"
  fi
  echo "Resubmitting job with config file: $config_file"
  # Quoted so that paths containing spaces reach sbatch as single arguments.
  sbatch "${BASH_SOURCE[0]}" "$config_file"
}
# Register the signal handler for SIGUSR1.
trap handler SIGUSR1
echo "Starting job at $(date)"
#python train_controlnet.py #--config $config_file #& wait
# Run the launcher in the background and block in `wait`: bash only runs a trap
# promptly while it is waiting in the `wait` builtin, not while a foreground
# command is still running.
accelerate launch \
  --config_file accelerator_configs/accelerator_train_config.yaml \
  --multi_gpu \
  train_controlnet.py --config "$config_file" &
accelerate_pid=$!
# Wait for the launcher specifically so its exit status becomes the script's
# exit status (a bare `wait` returns 0 even when the launcher failed).
wait "$accelerate_pid"