...
Code Block | ||||
---|---|---|---|---|
| ||||
#!/bin/bash
#
# Bootstrap a Ray cluster on the machines assigned to this scheduler job.
#
# The first distinct host in the machine file becomes the Ray head node; every
# other distinct host is started as a Ray worker over ssh and pointed at the
# head. The number of lines a host occupies in the machine file is passed to
# Ray as that host's --num-cpus.
#
# Environment read:
#   JOB_ID  - numeric job id, mixed into the choice of the head port
#   TMPDIR  - directory that contains the "machines" file
#
# Variables left set when the script finishes:
#   portnum, master_node, list_machines, master_arg, password_arg,
#   head_node and password (the last two keep the single-quoted form,
#   e.g. 'host:port', in case later lines of the job script read them).
#
# Shell options (set -e / -u / pipefail) are intentionally left untouched so
# that commands placed after this snippet run under unchanged settings;
# failures that matter are checked explicitly instead.

# Print a message on stderr and abort the job script.
die() {
  printf 'ray bootstrap: %s\n' "$*" >&2
  exit 1
}

# Print how many machine-file lines name exactly host $1.
# The comparison is on the whole first field, so "node1" never counts
# lines that belong to "node10".
count_slots() {
  awk -v host="$1" '$1 == host { n++ } END { print n + 0 }' "$machinefile"
}

jobid=${JOB_ID:-}
machinefile=${TMPDIR:-}/machines
head_node=''
password=''
master_arg=''
password_arg=''
portnum=0
declare -a list_machines=()

[[ "$jobid" =~ ^[0-9]+$ ]] \
  || die "JOB_ID must be a non-negative integer (got '${jobid}')"
[[ -r "$machinefile" ]] || die "cannot read machine file: $machinefile"

# ports are used by ray
declare -a skip_ports=(11123 10001 38717 44006)

while true; do
  # assign random port in range 20000 - 52767
  # RANDOM + 1 keeps the divisor in 1..32768, so the modulo can never divide
  # by zero; 10# forces base 10 in case the job id has leading zeros.
  portnum=$(( 10#$jobid % (RANDOM + 1) + 20000 ))
  # Compare against whole list elements, not substrings of the joined list.
  if [[ " ${skip_ports[*]} " != *" $portnum "* ]]; then
    break
  fi
done

# build uniq list of machines assigned by scheduler
# (first-seen order is preserved; repeated hosts are dropped even when their
# lines are not adjacent)
while IFS= read -r machine; do
  list_machines+=("$machine")
done < <(awk 'NF && !seen[$1]++ { print $1 }' "$machinefile")

(( ${#list_machines[@]} > 0 )) || die "machine file is empty: $machinefile"

# first node is head node
master_node=${list_machines[0]}

# head node bootstrap
if [[ "$(hostname)" == "$master_node" ]]; then
  numcpus=$(count_slots "$master_node")
  echo "Isabella Ray head - $numcpus cores @ $master_node port=$portnum"

  head_output=$(ray start --num-cpus "$numcpus" --port="$portnum" --head) \
    || die "ray start --head failed on $master_node"

  # The head prints the command workers should use to join, of the form
  #   ray start --address='host:port' [--redis-password='...']
  # Only the first such line is inspected.
  head_start_log=$(grep -m 1 'ray start' <<<"$head_output")
  head_start_log=${head_start_log#*ray start}

  addr_re="--address='?([^'[:space:]]+)"
  pass_re="--redis-password='?([^'[:space:]]+)"
  if [[ "$head_start_log" =~ $addr_re ]]; then
    master_arg=${BASH_REMATCH[1]}
  fi
  if [[ "$head_start_log" =~ $pass_re ]]; then
    password_arg=${BASH_REMATCH[1]}
  fi

  [[ -n "$master_arg" ]] \
    || die "could not find the head address in the 'ray start --head' output"

  # Quoted forms, as the script has always exposed them.
  head_node="'${master_arg}'"
  if [[ -n "$password_arg" ]]; then
    password="'${password_arg}'"
  fi
fi

# Workers cannot join without a head address; stop instead of launching them
# with an empty --address.
if (( ${#list_machines[@]} > 1 )) && [[ -z "$master_arg" ]]; then
  die "no head address available; run this on the head host ($master_node)"
fi

# worker nodes bootstrap
for machine in "${list_machines[@]:1}"; do
  numcpus=$(count_slots "$machine")
  echo "Isabella Ray worker - $numcpus cores @ $machine"

  worker_args=(--num-cpus "$numcpus" --address="$master_arg")
  # Pass the password only when the head actually reported one.
  if [[ -n "$password_arg" ]]; then
    worker_args+=(--redis-password="$password_arg")
  fi

  # Best effort per worker: report a failure and carry on with the rest.
  ssh "$machine" ray start "${worker_args[@]}" \
    || printf 'ray bootstrap: worker start failed on %s\n' "$machine" >&2
done
...