#!/bin/bash
#SBATCH --job-name=train_deblur
#SBATCH --nodes=1
#SBATCH --gpus-per-node=4
#SBATCH --qos=gpu4-8h
#SBATCH --signal=B:USR1@600
#SBATCH --cpus-per-task=24
#SBATCH --output=output/slurm-%j.out
#SBATCH --error=error/slurm-%j.err
#SBATCH --nodelist=lse-hpcnode[8]

# SLURM sends SIGUSR1 to the batch shell (the B: prefix) 600 seconds before the
# time limit; this lead time must be larger than the sleep in the handler
# function below so the resubmission still happens before the job is killed.

# prepare your environment here
source ~/.bashrc
conda activate gencam
cd /datasets/sai/gencam/cogvideox/training
export CUDA_VISIBLE_DEVICES=0,1,2,3
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
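# expandable_segments lets PyTorch's CUDA caching allocator grow existing memory
# segments instead of reserving new fixed-size blocks, which can reduce
# fragmentation-related OOMs on long training runs.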

if [ -z "$1" ]; then
  # quit if no config file is passed
  echo "No config file passed, quitting"
  exit 1
else
  config_file=$1
fi
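
# Example invocation (script and config filenames below are illustrative):
#   sbatch this_script.sh configs/train_controlnet.yaml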

handler()
{
  echo "function handler called at $(date)"
  # Forward SIGUSR1 to the Python training processes launched by accelerate
  # (the children of the captured accelerate PID), not to accelerate itself.
  if [ -n "$accelerate_pid" ]; then
    echo "Sending SIGUSR1 to children of accelerate PID: $accelerate_pid"
    python_pids=$(ps --ppid "$accelerate_pid" -o pid=)
    kill -USR1 $python_pids  # unquoted on purpose: one PID per spawned process
    sleep 300                # give the training script 5 minutes to checkpoint
  else
    echo "No accelerate PID found"
  fi
  echo "Resubmitting job with config file: $config_file"
  sbatch "${BASH_SOURCE[0]}" "$config_file"
}
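# Note: this assumes train_controlnet.py installs its own SIGUSR1 handler that
# saves a checkpoint and exits cleanly; the 300 s sleep above is the window it
# gets to finish before the job is resubmitted.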

# register signal handler
trap handler SIGUSR1

echo "Starting job at $(date)"
# python train_controlnet.py --config $config_file & wait   # single-process alternative, kept for reference
accelerate launch --config_file accelerator_configs/accelerator_train_config.yaml --multi_gpu train_controlnet.py --config "$config_file" &
accelerate_pid=$!

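# Running accelerate in the background and blocking on the bash builtin `wait`
# lets the SIGUSR1 trap run promptly; a trap set while a foreground command is
# running would not execute until that command finished.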
wait