This commit is contained in:
parent b791dcd584
commit 534ce0c6ed
@@ -22,7 +22,10 @@ sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
 sock.bind((UDP_IP, UDP_PORT))
 
 def nfsDone():
-    with open('myfile.txt') as myfile:
+    if MASTER is None:
+        return False
+
+    with open('/etc/fstab') as myfile:
         if MASTER["ip"] in myfile.read():
             return True
 
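Note: the new nfsDone() treats any substring hit for the master's IP in /etc/fstab as proof that the NFS entries are already present, so an IP that only appears in a comment would also count. Not part of this commit, but for comparison, a field-based check (illustrative sketch; the helper name is hypothetical) is stricter:

def nfs_entries_present(master_ip, fstab_path="/etc/fstab"):
    # Compare the device field of each real fstab entry instead of
    # searching the whole file, so comment lines cannot match.
    with open(fstab_path) as fstab:
        for line in fstab:
            fields = line.split()
            if fields and not fields[0].startswith("#"):
                # the device field looks like "10.0.0.1:/clusterfs" for NFS
                if fields[0].split(":")[0] == master_ip:
                    return True
    return False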
@@ -3,6 +3,9 @@ Description=SaturnArch discover Service
 After=network.target
 
 [Service]
+StandardError=journal
+StandardOutput=journal
+StandardInput=null
 Type=idle
 Restart=on-failure
 User=root
@@ -110,12 +110,12 @@ echo "Entering Chroot Environment"
 
 mkdir /mnt/etc/slurm-llnl
 
-cp fast_install_stage2.sh /mnt
-cp environment.sh /mnt
-cp discover.py /mnt/usr/bin/discover.py
-cp -rf slurm/* /mnt/etc/slurm-llnl
-cp saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service
-cp /usr/local/bin/apt /mnt/usr/bin/apt
+cp fast_install_stage2.sh /mnt
+cp environment.sh /mnt
+cp /installFiles/discover.py /mnt/usr/bin/discover.py
+cp -rf /installFiles/slurm/* /mnt/etc/slurm-llnl
+cp /installFiles/saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service
+cp /usr/local/bin/apt /mnt/usr/bin/apt
 
 arch-chroot /mnt /fast_install_stage2.sh
 
@@ -107,8 +107,8 @@ sudo chown nobody.nogroup /clusterfs
 sudo chmod -R 777 /clusterfs
 
 # todo security check here
-echo "/clusterfs 0.0.0.0/0(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
-echo "/home 0.0.0.0/0(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
+echo "/clusterfs (rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
+echo "/home (rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
 
 # copy keys
 cp /etc/munge/munge.key /clusterfs
@@ -1,171 +0,0 @@
-import socket
-import json
-import os, sys
-from random import randint
-from time import sleep, time
-
-HOSTS = {}
-UDP_IP = "255.255.255.255"
-UDP_PORT = 5005
-PROT_HDR = "SATURNARCH "
-SEND_TIME = None
-
-TYPE = "slave"
-
-if os.path.exists("/etc/slurm-llnl/MASTER"):
-    TYPE = "master"
-
-MASTER = None
-
-sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
-sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
-sock.bind((UDP_IP, UDP_PORT))
-
-def nfsDone():
-    with open('myfile.txt') as myfile:
-        if MASTER["ip"] in myfile.read():
-            return True
-
-    return False
-
-def get_ip():
-    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
-    s.settimeout(0)
-    try:
-        # doesn't even have to be reachable
-        s.connect(('8.8.8.8', 1))
-        IP = s.getsockname()[0]
-    except Exception:
-        IP = '127.0.0.1'
-    finally:
-        s.close()
-    return IP
-
-def selfInfo():
-    return {
-        "ip": get_ip(),
-        "type": TYPE,
-        "name": socket.gethostname(),
-        "cpus": os.cpu_count(),
-        "rams": os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
-    }
-
-
-def loadHosts():
-    global HOSTS, MASTER
-
-    try:
-        with open('/etc/slurm-llnl/hosts.json', 'r') as file:
-            HOSTS = json.load(file)
-    except:
-        HOSTS = {}
-
-    if TYPE == "master":
-        MASTER = selfInfo()
-    else:
-        for host in HOSTS.values():
-            if host["type"] == "master":
-                MASTER = host
-
-def updateHosts():
-    with open("/etc/slurm-llnl/hosts.json", "w") as outfile:
-        json.dump(HOSTS, outfile)
-
-def generateSlurmConfig(source, target):
-    if MASTER is None:
-        return
-
-    hosts = f"NodeName={socket.gethostname()} NodeAddr={get_ip()} CPUs={os.cpu_count()} State=UNKNOWN\n" # first is my self
-    noMasterHosts = ""
-    for host in HOSTS.values():
-        hosts += f"NodeName={host["name"]} NodeAddr={host["ip"]} CPUs={host["cpus"]} State=UNKNOWN\n"
-        noMasterHosts += f"{host["name"]}, "
-
-    if len(noMasterHosts) > 0:
-        noMasterHosts = noMasterHosts[:-2]
-
-    with open(source) as f:
-        newText=f.read().replace('{%hosts%}', hosts).replace('{%noMasterHosts%}', noMasterHosts).replace('{%masterName%}', MASTER["name"]).replace('{%masterIP%}', MASTER["ip"])
-
-    with open(target, "w") as f:
-        f.write(newText)
-
-def generateHosts(target):
-    fileStr = """# Auto generated by SaturnArch
-127.0.0.1\tlocalhost
-::1\tlocalhost ip6-localhost ip6-loopback
-ff02::1\tip6-allnodes
-ff02::2\tip6-allrouters
-
-"""
-    fileStr += f"{get_ip()}\t{socket.gethostname()}\n" # first is my self
-    for host in HOSTS.values():
-        fileStr += f"{host["ip"]}\t{host["name"]}\n"
-
-    with open(target, "w") as outfile:
-        outfile.write(fileStr)
-
-def self_announcement():
-    MESSAGE = (PROT_HDR + json.dumps(selfInfo())).encode("ASCII")
-    sock.sendto(MESSAGE, (UDP_IP, UDP_PORT))
-
-## Start program
-loadHosts()
-self_announcement()
-
-while True:
-    if SEND_TIME is not None and SEND_TIME < int(time()):
-        print(f"Sending self announcement")
-        self_announcement()
-        SEND_TIME = None
-        sock.settimeout(None)
-
-    data, addr = None, None
-    try:
-        data, addr = sock.recvfrom(1024)
-        data = data.decode("ASCII")
-    except socket.timeout:
-        continue
-
-    if not data.startswith(PROT_HDR):
-        continue
-
-    data = data[len(PROT_HDR):] # remove header
-    data = json.loads(data)
-
-    if data["ip"] == get_ip():
-        continue
-
-    if data["ip"] in HOSTS and data == HOSTS[data["ip"]]:
-        continue
-
-    print(f"Discover new HOST {data}")
-
-    if data["type"] == "master":
-        MASTER = data
-
-    HOSTS[data["ip"]] = data
-    updateHosts()
-    generateHosts("/etc/hosts")
-    generateSlurmConfig("/etc/slurm-llnl/slurm.conf.template", "/etc/slurm-llnl/slurm.conf")
-
-    # configure network disks
-    if TYPE == "slave" and MASTER is not None and not nfsDone():
-        os.system(f"echo \"{MASTER['ip']}:/clusterfs /clusterfs nfs defaults 0 0\" >> /etc/fstab")
-        os.system(f"echo \"{MASTER['ip']}:/home /home nfs defaults 0 0\" >> /etc/fstab")
-        os.system("mount -a")
-
-        os.system("cp -f /clusterfs/munge.key /etc/munge/munge.key")
-
-    # reset all services
-    os.system("systemctl restart munge")
-    os.system("systemctl restart slurmd")
-
-    if TYPE == "master":
-        os.system("systemctl restart slurmctld")
-
-    # plan next send
-    waitTime = randint(10,100)
-    print(f"Plan self announcement at T+{waitTime}s")
-    SEND_TIME = int(time()) + waitTime
-    sock.settimeout(waitTime / 2)
@@ -112,10 +112,10 @@ mkdir /mnt/etc/slurm-llnl
 
 cp fast_install_stage2.sh /mnt
 cp environment.sh /mnt
-cp discover.py /mnt/usr/bin/discover.py
-cp -rf slurm/* /mnt/etc/slurm-llnl
-cp saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service
-cp /usr/local/bin/apt /mnt/usr/bin/apt
+cp /installFiles/discover.py /mnt/usr/bin/discover.py
+cp -rf /installFiles/slurm/* /mnt/etc/slurm-llnl
+cp /installFiles/saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service
+cp /usr/local/bin/apt /mnt/usr/bin/apt
 
 arch-chroot /mnt /fast_install_stage2.sh
 
@@ -1,12 +0,0 @@
-[Unit]
-Description=SaturnArch discover Service
-After=network.target
-
-[Service]
-Type=idle
-Restart=on-failure
-User=root
-ExecStart=/usr/bin/python /usr/bin/discover.py
-
-[Install]
-WantedBy=multi-user.target
@@ -1,14 +0,0 @@
-CgroupMountpoint="/sys/fs/cgroup"
-#CgroupAutomount=yes
-#CgroupReleaseAgentDir="/etc/slurm/cgroup"
-AllowedDevicesFile="/etc/slurm/cgroup_allowed_devices_file.conf"
-ConstrainCores=no
-#TaskAffinity=no
-ConstrainRAMSpace=yes
-ConstrainSwapSpace=no
-ConstrainDevices=no
-AllowedRamSpace=100
-AllowedSwapSpace=0
-MaxRAMPercent=100
-MaxSwapPercent=100
-MinRAMSpace=30
@@ -1,7 +0,0 @@
-/dev/null
-/dev/urandom
-/dev/zero
-/dev/sda*
-/dev/cpu/*/*
-/dev/pts/*
-/home/*
@@ -1,58 +0,0 @@
-ClusterName=Betynda
-
-SlurmctldHost={%masterName%}({%masterIP%})
-
-ProctrackType=proctrack/linuxproc
-
-ReturnToService=2
-
-SlurmctldPidFile=/run/slurmctld.pid
-SlurmdPidFile=/run/slurmd.pid
-SlurmdSpoolDir=/var/lib/slurm/slurmd
-StateSaveLocation=/var/lib/slurm/slurmctld
-
-SlurmUser=slurm
-TaskPlugin=task/none
-
-SchedulerType=sched/backfill
-SelectType=select/cons_tres
-SelectTypeParameters=CR_Core
-
-AccountingStorageType=accounting_storage/none
-JobCompType=jobcomp/none
-JobAcctGatherType=jobacct_gather/none
-
-SlurmctldDebug=info
-SlurmctldLogFile=/var/log/slurm/slurmctld.log
-
-SlurmdDebug=info
-SlurmdLogFile=/var/log/slurm/slurmd.log
-
-{%hosts%}
-
-PartitionName=exp Nodes={%noMasterHosts%} Default=YES MaxTime=01:00:00 State=UP SuspendTime=3600 PriorityTier=100
-PartitionName=long Nodes={%noMasterHosts%} Default=NO MaxTime=168:00:00 State=UP SuspendTime=3600 PriorityTier=50
-PartitionName=debug Nodes=ALL Default=NO MaxTime=03:00:00 State=UP PriorityTier=150
-
-##
-## Power saving
-##
-
-# timeout for power on
-ResumeTimeout=600
-
-# timeout for power off
-SuspendTimeout=120
-
-# Up and down maximaly 1 per minute
-ResumeRate=1
-SuspendRate=1
-
-# poweroff and on programs
-ResumeProgram=/usr/local/bin/slurmResume
-SuspendProgram=/usr/local/bin/slurmSuspend
-
-TreeWidth=1000
-
-# wait until power on when reserve
-SchedulerParameters=salloc_wait_nodes,sbatch_wait_nodes
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-echo "`date` User $USER invoked Resume $*" >>/var/log/slurm/power_save.log
-
-sudo etherwake b0:83:fe:d8:a6:e0
@@ -1,5 +0,0 @@
-#!/usr/bin/env bash
-
-echo "`date` User $USER invoked Suspend $*" >>/var/log/slurm/power_save.log
-
-sshpass -p 4126 ssh -o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no" -t lukasplevac@10.0.0.101 "sudo /sbin/shutdown"
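The deleted slurmResume script wakes a node with etherwake; the same wake-on-LAN magic packet can be built by hand, which makes the mechanism explicit. A sketch (the MAC address is the one hard-coded in the script above; port 9 is the conventional discard port for WOL):

import socket

def wake_on_lan(mac="b0:83:fe:d8:a6:e0", port=9):
    # A magic packet is six 0xFF bytes followed by the target MAC
    # repeated sixteen times, sent as a UDP broadcast.
    payload = bytes.fromhex(mac.replace(":", ""))
    packet = b"\xff" * 6 + payload * 16
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
        s.sendto(packet, ("255.255.255.255", port))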