Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
language: bash
title: ray_isabella_start.sh
 #!/bin/bash

# Inputs taken from the environment: the job identifier and the path of the
# machine list that lives inside TMPDIR.
jobid="${JOB_ID}"
machinefile="${TMPDIR}/machines"

# Filled in further down once the head node has been started.
head_node=""
password=""

# Port for the head node; 0 until a usable one has been picked.
portnum=0

# Distinct machines of this job (populated further down).
declare -a list_machines

# Ports that ray itself uses and that must never be picked as head port.
skip_ports=(
	11123
	10001
	38717
	44006
)

# Pick the port for the Ray head: a pseudo-random value that is not listed in
# skip_ports. The loop repeats until such a value is found.
while true
do
	# assign random port in range 20000 - 52767
	# Only the digits of the job id are used, read as base 10 so that a
	# leading zero is not taken as octal; an empty job id counts as 0.
	jobdigits=${jobid//[!0-9]/}
	# (job id + RANDOM) % 32768 is always within 0..32767, so the result
	# covers exactly the documented range. Taking "jobid % RANDOM" instead
	# would abort with a division by zero whenever RANDOM yields 0.
	portnum=$(( (10#${jobdigits:-0} + RANDOM) % 32768 + 20000 ))
	# Pad both sides with blanks so that only a whole list entry can match.
	if [[ " ${skip_ports[*]} " != *" $portnum "* ]]
	then
		break
	fi
done

# build uniq list of machines assigned by scheduler

# Fill the global array list_machines with the distinct host names found in
# the given machine file, in order of first appearance.
#   $1 - path of the machine file (whitespace/newline separated host names)
# Returns 1 (with list_machines left empty) when the file cannot be read.
build_machine_list() {
	local hostfile=$1
	local hosts host

	list_machines=()
	if [[ ! -r "$hostfile" ]]
	then
		echo "machine file '$hostfile' is missing or not readable" >&2
		return 1
	fi

	# "uniq" only drops adjacent repeats; remembering every name already
	# seen also removes a host that shows up again later in the file, while
	# the order (and therefore the choice of head node) stays untouched.
	hosts=$(awk '{ for (i = 1; i <= NF; i++) if (!seen[$i]++) print $i }' "$hostfile") || return 1

	while read -r host
	do
		if [[ -n "$host" ]]
		then
			list_machines+=("$host")
		fi
	done <<< "$hosts"
	return 0
}

build_machine_list "$machinefile"

# first node is head node
master_node=${list_machines[0]}

# changedir
# Quoted so that a path containing blanks stays one argument and an empty
# TMPDIR does not silently turn into "cd" to the home directory; a failure
# is reported instead of being ignored.
cd "$TMPDIR" || echo "cannot change directory to '$TMPDIR'" >&2

# head node bootstrap

# Print the value of the "--<name>=<value>" option contained in a text, with
# surrounding single quotes removed.
#   $1 - option name without the leading dashes (e.g. address)
#   $2 - text to search
# Prints nothing and returns 1 when the option does not occur.
ray_option_value() {
	local name=$1 text=$2
	local re="--${name}='?([^'[:space:]]+)"

	[[ "$text" =~ $re ]] || return 1
	printf '%s\n' "${BASH_REMATCH[1]}"
}

# Only the machine listed first starts the head process. An empty master_node
# (no machine list) never qualifies, even if hostname itself prints nothing.
# NOTE(review): the comparison is an exact string match; confirm that the
# machine file and hostname use the same form (short name vs. FQDN).
if [[ -n "$master_node" && "$(hostname)" == "$master_node" ]]
then
	# Count the lines whose host field equals the head node exactly; a plain
	# grep would also count hosts that merely contain the name
	# (node1 vs. node10).
	numcpus=$(awk -v host="$master_node" '$1 == host { n++ } END { print n + 0 }' < "$machinefile")

	echo "Isabella Ray head - $numcpus cores @ $master_node port=$portnum"
	# Keep only the output lines that mention "ray start" and drop
	# everything up to and including the first "ray start".
	head_start_log=$(ray start --num-cpus "$numcpus" --port="$portnum" --head | grep "ray start")
	head_start_log=${head_start_log#*ray start}

	# Address and password are looked up by option name and stored wrapped
	# in single quotes. When an option is missing the variable keeps its
	# previous value and the problem is reported on stderr.
	if head_addr=$(ray_option_value address "$head_start_log")
	then
		head_node="'${head_addr}'"
	else
		echo "Isabella Ray head - no --address found in ray start output" >&2
	fi

	if head_pass=$(ray_option_value redis-password "$head_start_log")
	then
		password="'${head_pass}'"
	else
		echo "Isabella Ray head - no --redis-password found in ray start output" >&2
	fi
fi

sleep 10

# worker nodes bootstrap
# Every machine after the first one in list_machines gets a Ray worker that
# is started over ssh in the background and connects to the head.

# The single quotes kept around address and password are removed once, since
# the values do not change between workers.
master_arg="${head_node//\'/}"
password_arg="${password//\'/}"
worker_machines=("${list_machines[@]:1}")

# Without a head address a worker has nothing to connect to, so none is
# started in that case.
if (( ${#worker_machines[@]} > 0 )) && [[ -z "$master_arg" ]]
then
	echo "Isabella Ray worker - head address unknown, not starting workers" >&2
	worker_machines=()
fi

for machine in "${worker_machines[@]}"
do
	# Count the lines whose host field equals this machine exactly; a plain
	# grep would also count hosts that merely contain the name
	# (node1 vs. node10).
	numcpus=$(awk -v host="$machine" '$1 == host { n++ } END { print n + 0 }' < "$machinefile")

	echo "Isabella Ray worker - $numcpus cores @ $machine"
	# The modulecmd substitution sits inside double quotes, so it is expanded
	# by this shell and its output is sent as part of the remote command.
	# -n keeps the backgrounded ssh from reading this script's stdin.
	ssh -n "$machine" "eval $(/usr/bin/modulecmd bash load ray/1.10.0) ray start --num-cpus $numcpus --address=$master_arg --redis-password=$password_arg --block" &
done

sleep 10

...