#!/bin/bash
#
# Launch the CREDIT training application across several GPU nodes with
# mpiexec, using NCCL as the distributed backend.
#
# Must run inside a PBS job: the node list is read from $PBS_NODEFILE.
# NOTE(review): no scheduler directives (#PBS ...) are present in this file;
# presumably resources are requested on the qsub command line -- confirm.

# Print a message to stderr and abort the job script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# ---- Software environment ---------------------------------------------------
module purge
module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
conda activate /glade/work/ksha/miniconda3/envs/credit-derecho

# ---- Job geometry -----------------------------------------------------------
# Single source of truth for the values echoed below and passed to mpiexec.
num_nodes=8
gpus_per_node=4
total_gpus=$((num_nodes * gpus_per_node))

# ---- Runtime environment ----------------------------------------------------
export LSCRATCH=/glade/derecho/scratch/ksha/
export LOGLEVEL=INFO
export CUDA_VISIBLE_DEVICES=0,1,2,3

# NCCL settings.
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=hsn
export NCCL_IB_DISABLE=1
export NCCL_CROSS_NIC=1
export NCCL_NCHANNELS_PER_NET_PEER=4
export NCCL_NET="AWS Libfabric"
export NCCL_NET_GDR_LEVEL=PBH

# Cray MPICH GPU settings.
export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
export MPICH_OFI_NIC_POLICY=GPU
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_RDMA_ENABLED_CUDA=1

# libfabric CXI provider settings.
export FI_CXI_DISABLE_HOST_REGISTER=1
export FI_CXI_OPTIMIZED_MRS=false
export FI_MR_CACHE_MONITOR=userfaultfd
export FI_CXI_DEFAULT_CQ_SIZE=131072

echo "Number of nodes: ${num_nodes}"
echo "Number of GPUs per node: ${gpus_per_node}"
echo "Total number of GPUs: ${total_gpus}"

# ---- Rendezvous host --------------------------------------------------------
# Fail early with a clear message instead of launching with an empty node list.
[[ -n "${PBS_NODEFILE:-}" && -r "${PBS_NODEFILE}" ]] \
  || die "PBS_NODEFILE is unset or unreadable; run this script inside a PBS job"

# One array element per line of the node file (no word-splitting or globbing).
mapfile -t nodes < "${PBS_NODEFILE}"
(( ${#nodes[@]} > 0 )) || die "PBS_NODEFILE (${PBS_NODEFILE}) is empty"

# Print the whole list; a bare $nodes would expand to the first element only.
echo "nodes: ${nodes[*]}"

# The first listed node's first reported address becomes MASTER_ADDR.
head_node=${nodes[0]}
head_node_ip=$(ssh "${head_node}" hostname -i | awk '{print $1}')
[[ -n "${head_node_ip}" ]] \
  || die "could not determine the IP address of head node ${head_node}"

# ---- Launch -----------------------------------------------------------------
# MASTER_ADDR/MASTER_PORT are set only in the environment of the mpiexec command.
MASTER_ADDR="${head_node_ip}" MASTER_PORT=1234 \
  mpiexec -n "${total_gpus}" --ppn "${gpus_per_node}" --cpu-bind none \
  python /glade/u/home/ksha/miles-credit/applications/train.py \
  -c /glade/work/ksha/CREDIT_runs/fuxi_6h/model_single.yml \
  --backend nccl
|