Fixed invalid generating of slurm config

This commit is contained in:
Lukáš Plevač 2024-12-30 23:00:47 +01:00
parent 52c7cedfdb
commit 03a80d19d1
10 changed files with 1485 additions and 1 deletions

File diff suppressed because it is too large

BIN
pkg/saturn-discover/.MTREE Normal file

Binary file not shown.

View File

@@ -0,0 +1,12 @@
# Generated by makepkg 7.0.0
# using fakeroot version 1.36
pkgname = saturn-discover
pkgbase = saturn-discover
xdata = pkgtype=pkg
pkgver = 1.7-8
pkgdesc =
url =
builddate = 1733738986
packager = Unknown Packager
size = 10663
arch = any

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python
import os
import sys
import json

def tryReadFile(fname, defval):
    try:
        with open(fname, 'r') as file:
            return file.read().replace('\n', '')
    except:
        pass

    return defval

def restartService():
    os.system("systemctl restart saturnDiscover")

def loadHosts():
    HOSTS = {}

    try:
        with open('/etc/slurm-llnl/hosts.json', 'r') as file:
            HOSTS = json.load(file)
    except:
        HOSTS = {}

    return HOSTS

def updateFile(filename, value):
    with open(filename, "w") as outfile:
        outfile.write(value)
def help():
    print("Saturn discover setting tool")
    print()
    print("Usage: saturnDiscover <command> [ops]")
    print("supported commands:")
    print("    add     Add value (to list)")
    print("    set     Set value")
    print("    list    List values (from list)")
    print("    get     Get value")
    print("    reload  Reload cluster config")
    print("    lookup  Look up value in list")
    print("")
    print("Every command has a sub-help, shown when you write help in ops")

def helpAdd():
    print("Saturn discover setting tool")
    print("command ADD - add value to list")
    print()
    print("Usage: saturnDiscover add <target> <value>")
    print("supported targets:")
    print("    res    Node resources list")

def helpSet():
    print("Saturn discover setting tool")
    print("command SET - set value")
    print()
    print("Usage: saturnDiscover set <target> <value>")
    print("supported targets:")
    print("    mac    Node mac address")

def helpList():
    print("Saturn discover setting tool")
    print("command LIST - print values in list")
    print()
    print("Usage: saturnDiscover list <target>")
    print("supported targets:")
    print("    res      Node resources list")
    print("    nodes    Discovered nodes list")

def helpGet():
    print("Saturn discover setting tool")
    print("command GET - get value")
    print()
    print("Usage: saturnDiscover get <target>")
    print("supported targets:")
    print("    mac    Node mac address")

def helpLookup():
    print("Saturn discover setting tool")
    print("command LOOKUP - get value from list")
    print()
    print("Usage: saturnDiscover lookup <targetList> <targetRow>")
    print("supported targetLists:")
    print("    hostmac    Node mac address lookup by hostname or ip address")

def helpReload():
    print("Saturn discover setting tool")
    print("command RELOAD - reload cluster configuration on all nodes")
    print()
    print("Usage: saturnDiscover reload")
argc = len(sys.argv)

if (argc <= 1):
    help()
    exit(0)

## parse command
if (sys.argv[1] == "add"):
    if (argc != 4):
        helpAdd()
        exit(1)

    if (sys.argv[2] == "res"):
        res = json.loads(tryReadFile("/etc/slurm-llnl/localRes", "[]"))
        res.append(sys.argv[3])
        updateFile("/etc/slurm-llnl/localRes", json.dumps(res))
        restartService()

elif (sys.argv[1] == "set"):
    if (argc != 4):
        helpSet()
        exit(1)

    if (sys.argv[2] == "mac"):
        updateFile("/etc/slurm-llnl/localMac", sys.argv[3])
        restartService()

elif (sys.argv[1] == "list"):
    if (argc != 3):
        helpList()
        exit(1)

    if (sys.argv[2] == "res"):
        res = json.loads(tryReadFile("/etc/slurm-llnl/localRes", "[]"))
        print(",".join(res))  # localRes holds a plain JSON list of resource strings

    if (sys.argv[2] == "nodes"):
        nodes = loadHosts()

        print("NAME    MASTER    IP                MAC                CPUs    RAM[GB]    VERSION    GRES")
        print("SELF")
        for node in nodes.values():
            isMaster = ""
            if node["type"] == "master":
                isMaster = "YES"

            print(f'{node["name"]: <8}{isMaster: <10}{node["ip"]: <18}{node["mac"]: <19}{node["cpus"]: <8}{node["rams"]: <11}{node["version"]: <11}{",".join(node["res"])}')

elif (sys.argv[1] == "get"):
    if (argc != 3):
        helpGet()
        exit(1)

    if (sys.argv[2] == "mac"):
        print(tryReadFile("/etc/slurm-llnl/localMac", "00:00:00:00:00:00"))

elif (sys.argv[1] == "lookup"):
    if (argc != 4):
        helpLookup()
        exit(1)

    if (sys.argv[2] == "hostmac"):
        nodes = loadHosts()

        for node in nodes.values():
            if node["ip"] == sys.argv[3] or node["name"] == sys.argv[3]:
                print(node["mac"])
                exit(0)

        print("00:00:00:00:00:00")

elif (sys.argv[1] == "reload"):
    if (argc != 2):
        helpReload()
        exit(1)

    ver = int(tryReadFile("/etc/slurm-llnl/confVer", 0))
    updateFile("/etc/slurm-llnl/confVer", str(ver + 1))
    restartService()

else:
    help()
    exit(1)
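
Both this tool and the daemon below read /etc/slurm-llnl/hosts.json, which maps a node's IP address to the announcement record that node broadcast. A minimal sketch of one entry, built from the same fields selfInfo() sends; the concrete host name, IP, and values here are made up for illustration:

import json

# Hypothetical hosts.json content, keyed by node IP the same way the daemon
# stores it (HOSTS[data["ip"]] = data); values are illustrative only.
hosts = {
    "10.0.0.11": {
        "ip": "10.0.0.11",
        "type": "slave",                 # "master" when /etc/slurm-llnl/MASTER exists
        "name": "node01",
        "cpus": 8,
        "rams": 16 * 1024**3,            # bytes, from SC_PAGE_SIZE * SC_PHYS_PAGES
        "res": ["gpu:1"],                # resources added via `saturnDiscover add res`
        "mac": "aa:bb:cc:dd:ee:01",
        "version": 3,                    # cluster config version (confVer)
    }
}

print(json.dumps(hosts, indent=2))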

View File

@@ -0,0 +1,197 @@
import socket
import json
import os, sys
from random import randint
from time import sleep, time

HOSTS = {}
UDP_IP = "255.255.255.255"
UDP_PORT = 5005
PROT_HDR = "SATURNARCH "
SEND_TIME = None

TYPE = "slave"
if os.path.exists("/etc/slurm-llnl/MASTER"):
    TYPE = "master"

MASTER = None

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
sock.bind((UDP_IP, UDP_PORT))

def nfsDone():
    if MASTER is None:
        return False

    with open('/etc/fstab') as myfile:
        if MASTER["ip"] in myfile.read():
            return True

    return False

def get_ip():
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    s.settimeout(0)
    try:
        # doesn't even have to be reachable
        s.connect(('8.8.8.8', 1))
        IP = s.getsockname()[0]
    except Exception:
        IP = '127.0.0.1'
    finally:
        s.close()
    return IP

def tryReadFile(fname, defval):
    try:
        with open(fname, 'r') as file:
            return file.read().replace('\n', '')
    except:
        pass

    return defval
def selfInfo():
    return {
        "ip": get_ip(),
        "type": TYPE,
        "name": socket.gethostname(),
        "cpus": os.cpu_count(),
        "rams": os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES'),
        "res": json.loads(tryReadFile("/etc/slurm-llnl/localRes", "[]")),
        "mac": tryReadFile("/etc/slurm-llnl/localMac", "00:00:00:00:00:00"),
        "version": int(tryReadFile("/etc/slurm-llnl/confVer", 0))
    }

def loadHosts():
    global HOSTS, MASTER

    try:
        with open('/etc/slurm-llnl/hosts.json', 'r') as file:
            HOSTS = json.load(file)
    except:
        HOSTS = {}

    if TYPE == "master":
        MASTER = selfInfo()
    else:
        for host in HOSTS.values():
            if host["type"] == "master":
                MASTER = host

def updateVersion(val):
    with open("/etc/slurm-llnl/confVer", "w") as outfile:
        outfile.write(str(val))

def updateHosts():
    with open("/etc/slurm-llnl/hosts.json", "w") as outfile:
        json.dump(HOSTS, outfile)
def generateSlurmConfig(source, target):
    if MASTER is None:
        return

    selfInfoData = selfInfo()
    hosts = f"NodeName={socket.gethostname()} NodeAddr={get_ip()} CPUs={os.cpu_count()} RealMemory={int(selfInfoData["rams"] / (1024.**2))} Gres={",".join(selfInfoData["res"])} State=UNKNOWN\n" # first is my self
    noMasterHosts = ""

    for host in HOSTS.values():
        hosts += f"NodeName={host["name"]} NodeAddr={host["ip"]} CPUs={host["cpus"]} RealMemory={int(host["rams"] / (1024.**2))} Gres={",".join(host["res"])} State=UNKNOWN\n"
        noMasterHosts += f"{host["name"]}, "

    if len(noMasterHosts) > 0:
        noMasterHosts = noMasterHosts[:-2]

    with open(source) as f:
        newText = f.read().replace('{%hosts%}', hosts).replace('{%noMasterHosts%}', noMasterHosts).replace('{%masterName%}', MASTER["name"]).replace('{%masterIP%}', MASTER["ip"])

    with open(target, "w") as f:
        f.write(newText)

def generateHosts(target):
    fileStr = """# Auto generated by SaturnArch
127.0.0.1\tlocalhost
::1\tlocalhost ip6-localhost ip6-loopback
ff02::1\tip6-allnodes
ff02::2\tip6-allrouters
"""

    fileStr += f"{get_ip()}\t{socket.gethostname()}\n" # first is my self

    for host in HOSTS.values():
        fileStr += f"{host["ip"]}\t{host["name"]}\n"

    with open(target, "w") as outfile:
        outfile.write(fileStr)

def self_announcement():
    MESSAGE = (PROT_HDR + json.dumps(selfInfo())).encode("ASCII")
    sock.sendto(MESSAGE, (UDP_IP, UDP_PORT))
## Start program
loadHosts()
self_announcement()

while True:
    if SEND_TIME is not None and SEND_TIME < int(time()):
        print(f"Sending self announcement")
        self_announcement()
        SEND_TIME = None
        sock.settimeout(None)

    data, addr = None, None
    try:
        data, addr = sock.recvfrom(1024)
        data = data.decode("ASCII")
    except socket.timeout:
        continue

    if not data.startswith(PROT_HDR):
        continue

    data = data[len(PROT_HDR):] # remove header
    data = json.loads(data)

    if data["ip"] == get_ip():
        continue

    if data["ip"] in HOSTS and data == HOSTS[data["ip"]]:
        continue

    print(f"Discover new HOST {data}")

    if data["type"] == "master":
        MASTER = data

    HOSTS[data["ip"]] = data
    updateHosts()
    generateHosts("/etc/hosts")

    # configure network disks
    if TYPE == "slave" and MASTER is not None and not nfsDone():
        os.system(f"echo \"{MASTER['ip']}:/clusterfs /clusterfs nfs defaults 0 0\" >> /etc/fstab")
        os.system(f"echo \"{MASTER['ip']}:/home /home nfs defaults 0 0\" >> /etc/fstab")
        os.system("mount -a")
        os.system("cp -f /clusterfs/config/munge.key /etc/munge/munge.key")
        os.system("cat /clusterfs/config/maintenance.pub >> /home/maintenance/.ssh/authorized_keys")

    generateSlurmConfig("/clusterfs/config/slurm.conf.template", "/etc/slurm-llnl/slurm.conf")

    if int(tryReadFile("/etc/slurm-llnl/confVer", 0)) < data["version"]:
        updateVersion(data["version"])

    # reset all services
    os.system("systemctl restart munge")
    os.system("systemctl restart slurmd")

    if TYPE == "master":
        os.system("systemctl restart slurmctld")

    # plan next send
    waitTime = randint(10,100)
    print(f"Plan self announcement at T+{waitTime}s")
    SEND_TIME = int(time()) + waitTime
    sock.settimeout(waitTime / 2)
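
The daemon above speaks a small UDP broadcast protocol: each announcement is the ASCII header "SATURNARCH " followed by the JSON dump of selfInfo(), sent to port 5005 on the broadcast address. A minimal standalone sender that could be used to watch or exercise discovery might look like the sketch below; only the header, port, and broadcast address come from the code above, while the host name and payload values are hypothetical:

import json
import socket

# Sketch of a hand-rolled announcement for testing discovery; the payload
# fields mirror selfInfo(), but every value here is made up.
PROT_HDR = "SATURNARCH "
payload = {
    "ip": "10.0.0.99", "type": "slave", "name": "testnode",
    "cpus": 4, "rams": 8 * 1024**3, "res": [],
    "mac": "aa:bb:cc:dd:ee:99", "version": 0,
}

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
sock.sendto((PROT_HDR + json.dumps(payload)).encode("ASCII"), ("255.255.255.255", 5005))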

View File

@@ -0,0 +1,15 @@
[Unit]
Description=SaturnArch discover Service
After=network.target
[Service]
StandardError=journal
StandardOutput=journal
StandardInput=null
Type=idle
Restart=on-failure
User=root
ExecStart=/usr/bin/python /usr/bin/saturnDiscoverDeamon.py
[Install]
WantedBy=multi-user.target

View File

@@ -100,7 +100,7 @@ def generateSlurmConfig(source, target):
     noMasterHosts = ""
     for host in HOSTS.values():
         hosts += f"NodeName={host["name"]} NodeAddr={host["ip"]} CPUs={host["cpus"]} RealMemory={int(host["rams"] / (1024.**2))} Gres={",".join(host["res"])} State=UNKNOWN\n"
-        noMasterHosts += f"{host["name"]}, "
+        noMasterHosts += f"{host["name"]},"
 
     if len(noMasterHosts) > 0:
         noMasterHosts = noMasterHosts[:-2]
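
This one-line change is the actual fix named in the commit message: Slurm expects node lists such as a partition's Nodes= value (presumably where {%noMasterHosts%} is substituted in the template) to be comma-separated with no spaces, so the old ", " separator produced an invalid list. A small sketch of the difference, using str.join to avoid trailing-separator trimming altogether; the node and partition names are made up and the real partition options live in slurm.conf.template:

# Hypothetical host names to illustrate the generated Nodes= list.
names = ["node01", "node02", "node03"]

# Before the fix: ", " separator plus [:-2] trimming leaves embedded spaces,
# which Slurm's key=value parsing does not accept.
old_list = "".join(f"{n}, " for n in names)[:-2]   # 'node01, node02, node03'

# Equivalent result without any separator bookkeeping: join once at the end.
new_list = ",".join(names)                          # 'node01,node02,node03'

# Illustrative only; the partition name and options here are not from the repo.
print(f"PartitionName=debug Nodes={new_list} Default=YES State=UP")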

1
src/saturnDiscover Symbolic link
View File

@@ -0,0 +1 @@
/home/lukasplevac/Plocha/SaturnArch-Discover/saturnDiscover

1
src/saturnDiscover.service Symbolic link
View File

@@ -0,0 +1 @@
/home/lukasplevac/Plocha/SaturnArch-Discover/saturnDiscover.service

1
src/saturnDiscoverDeamon.py Symbolic link
View File

@@ -0,0 +1 @@
/home/lukasplevac/Plocha/SaturnArch-Discover/saturnDiscoverDeamon.py