Repository testing
All checks were successful
Build / build (push) Successful in 22m10s

This commit is contained in:
Lukáš Plevač 2024-12-07 16:02:51 +01:00
parent 534ce0c6ed
commit 3962e10699
7 changed files with 10 additions and 197 deletions

View File

@ -1,174 +0,0 @@
import socket
import json
import os, sys
from random import randint
from time import sleep, time
# Peers discovered so far, keyed by the IP address each one announced.
HOSTS = {}

# Address and port that discovery datagrams are sent to and received on.
UDP_IP = "255.255.255.255"
UDP_PORT = 5005

# Prefix identifying a datagram as part of this discovery protocol.
PROT_HDR = "SATURNARCH "

# Epoch second after which the next self announcement is due (None = none planned).
SEND_TIME = None

# The presence of the marker file turns this node into the master; otherwise it is a slave.
TYPE = "master" if os.path.exists("/etc/slurm-llnl/MASTER") else "slave"

# Info dict of the master node, once one is known.
MASTER = None

# Broadcast-enabled UDP socket shared by the sender and the receive loop.
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
sock.bind((UDP_IP, UDP_PORT))
def nfsDone():
    """Tell whether /etc/fstab already mentions the master's IP address.

    Returns False while no master is known. Otherwise the whole of
    /etc/fstab is read and the result is a plain substring test for
    MASTER["ip"] anywhere in that text.
    """
    if MASTER is None:
        return False
    with open('/etc/fstab') as fstab:
        content = fstab.read()
    return MASTER["ip"] in content
def get_ip():
    """Return the local address the OS would use for outbound IPv4 traffic.

    A UDP socket is "connected" to a public address (no packet is sent by a
    UDP connect) and its local endpoint is read back. Any failure yields
    '127.0.0.1' instead.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
        probe.settimeout(0)
        try:
            # doesn't even have to be reachable
            probe.connect(('8.8.8.8', 1))
            return probe.getsockname()[0]
        except Exception:
            return '127.0.0.1'
def selfInfo():
    """Build the info dict this node announces about itself.

    Keys, in order: "ip" (from get_ip()), "type" (the module-level TYPE),
    "name" (the hostname), "cpus" (os.cpu_count()) and "rams" (page size
    multiplied by the number of physical pages, i.e. bytes of RAM).
    """
    info = {"ip": get_ip(), "type": TYPE}
    info["name"] = socket.gethostname()
    info["cpus"] = os.cpu_count()
    page_size = os.sysconf('SC_PAGE_SIZE')
    page_count = os.sysconf('SC_PHYS_PAGES')
    info["rams"] = page_size * page_count
    return info
def loadHosts():
    """Load the persisted host table and work out who the master is.

    HOSTS is read from /etc/slurm-llnl/hosts.json. A missing, unreadable or
    malformed file (or a JSON document that is not an object) results in an
    empty table rather than an error.

    MASTER is set to this node's own info when TYPE is "master"; otherwise
    it is set to the last entry in HOSTS whose "type" is "master", and left
    untouched when there is none.
    """
    global HOSTS, MASTER
    try:
        with open('/etc/slurm-llnl/hosts.json', 'r') as file:
            loaded = json.load(file)
    except (OSError, ValueError):
        # json.JSONDecodeError is a ValueError; catching only these keeps
        # KeyboardInterrupt/SystemExit and programming errors visible.
        loaded = {}
    # The rest of the script calls HOSTS.values(), so insist on a dict.
    HOSTS = loaded if isinstance(loaded, dict) else {}
    if TYPE == "master":
        MASTER = selfInfo()
    else:
        for host in HOSTS.values():
            # Tolerate damaged entries instead of raising KeyError/TypeError.
            if isinstance(host, dict) and host.get("type") == "master":
                MASTER = host
def updateHosts():
    """Persist the current HOSTS table as JSON to /etc/slurm-llnl/hosts.json."""
    with open("/etc/slurm-llnl/hosts.json", "w") as outfile:
        serialized = json.dumps(HOSTS)
        outfile.write(serialized)
def generateSlurmConfig(source, target):
    """Render the Slurm configuration from a template file.

    Does nothing while no master is known. Otherwise reads ``source`` and
    writes it to ``target`` with these placeholders substituted:

    - ``{%hosts%}``: one ``NodeName=... NodeAddr=... CPUs=... State=UNKNOWN``
      line for this node followed by one per entry in HOSTS.
    - ``{%noMasterHosts%}``: the names of all HOSTS entries joined by ", ".
    - ``{%masterName%}`` / ``{%masterIP%}``: MASTER's "name" and "ip".

    Args:
        source: path of the template file to read.
        target: path of the configuration file to (over)write.
    """
    if MASTER is None:
        return

    peers = list(HOSTS.values())

    # Inner subscripts use single quotes: reusing the enclosing double quote
    # inside an f-string is only valid syntax from Python 3.12 onwards.
    node_lines = [
        f"NodeName={socket.gethostname()} NodeAddr={get_ip()} CPUs={os.cpu_count()} State=UNKNOWN\n"  # first is my self
    ]
    node_lines.extend(
        f"NodeName={host['name']} NodeAddr={host['ip']} CPUs={host['cpus']} State=UNKNOWN\n"
        for host in peers
    )
    hosts = "".join(node_lines)

    # NOTE(review): despite its name this list holds every HOSTS entry,
    # including one of type "master" if present - confirm this is intended.
    noMasterHosts = ", ".join(f"{host['name']}" for host in peers)

    with open(source) as f:
        newText = (
            f.read()
            .replace('{%hosts%}', hosts)
            .replace('{%noMasterHosts%}', noMasterHosts)
            .replace('{%masterName%}', MASTER["name"])
            .replace('{%masterIP%}', MASTER["ip"])
        )
    with open(target, "w") as f:
        f.write(newText)
def generateHosts(target):
    """Write a hosts file listing this node and every known peer.

    The file starts with a fixed header (localhost and IPv6 entries),
    followed by ``<ip>\\t<name>`` for this node and then for each entry in
    HOSTS.

    Args:
        target: path of the hosts file to (over)write.
    """
    header = (
        "# Auto generated by SaturnArch\n"
        "127.0.0.1\tlocalhost\n"
        "::1\tlocalhost ip6-localhost ip6-loopback\n"
        "ff02::1\tip6-allnodes\n"
        "ff02::2\tip6-allrouters\n"
    )
    entries = [f"{get_ip()}\t{socket.gethostname()}\n"]  # first is my self
    # Inner subscripts use single quotes: reusing the enclosing double quote
    # inside an f-string is only valid syntax from Python 3.12 onwards.
    entries.extend(f"{host['ip']}\t{host['name']}\n" for host in HOSTS.values())
    with open(target, "w") as outfile:
        outfile.write(header + "".join(entries))
def self_announcement():
    """Broadcast this node's info as one protocol datagram.

    The datagram is PROT_HDR followed by the JSON form of selfInfo(),
    ASCII-encoded, and is sent through the shared socket to
    (UDP_IP, UDP_PORT).
    """
    payload = PROT_HDR + json.dumps(selfInfo())
    destination = (UDP_IP, UDP_PORT)
    sock.sendto(payload.encode("ASCII"), destination)
def _parse_announcement(payload):
    """Decode and validate one received datagram.

    Returns the announced info dict, or None when the datagram is not a
    well-formed announcement of this protocol. Datagrams come from the
    network and are therefore untrusted: the values end up in /etc/hosts,
    /etc/fstab and slurm.conf, so they are checked before use.
    """
    try:
        text = payload.decode("ASCII")
    except UnicodeDecodeError:
        return None
    if not text.startswith(PROT_HDR):
        return None
    try:
        info = json.loads(text[len(PROT_HDR):])  # remove header
    except ValueError:
        return None
    if not isinstance(info, dict):
        return None

    ip = info.get("ip")
    name = info.get("name")
    if not isinstance(ip, str) or not isinstance(name, str):
        return None
    try:
        # Strict dotted-quad check; rejects anything that is not an IPv4 address.
        socket.inet_pton(socket.AF_INET, ip)
    except (OSError, ValueError):
        return None
    # A name with whitespace/newlines could inject extra config lines.
    if not name or any(ch.isspace() for ch in name):
        return None
    if info.get("type") not in ("master", "slave"):
        return None
    if not isinstance(info.get("cpus"), int):
        return None
    return info


## Start program
# NOTE(review): the block nesting below was reconstructed from a listing that
# had lost its indentation - confirm against the original file.
loadHosts()
self_announcement()

while True:
    if SEND_TIME is not None and SEND_TIME < int(time()):
        print("Sending self announcement")
        self_announcement()
        SEND_TIME = None
        # Nothing planned any more: block until the next datagram arrives.
        sock.settimeout(None)

    try:
        payload, addr = sock.recvfrom(1024)
    except socket.timeout:
        # Woke up only to re-check whether the planned announcement is due.
        continue

    data = _parse_announcement(payload)
    if data is None:
        continue

    # Our own broadcast is received too; ignore it.
    if data["ip"] == get_ip():
        continue

    # Already known and unchanged: nothing to do.
    if data["ip"] in HOSTS and data == HOSTS[data["ip"]]:
        continue

    print(f"Discover new HOST {data}")

    if data["type"] == "master":
        MASTER = data

    HOSTS[data["ip"]] = data
    updateHosts()
    generateHosts("/etc/hosts")
    generateSlurmConfig("/etc/slurm-llnl/slurm.conf.template", "/etc/slurm-llnl/slurm.conf")

    # configure network disks
    if TYPE == "slave" and MASTER is not None and not nfsDone():
        # Append directly instead of going through a shell, so the announced
        # address can never be interpreted as a command.
        with open("/etc/fstab", "a") as fstab:
            fstab.write(f"{MASTER['ip']}:/clusterfs /clusterfs nfs defaults 0 0\n")
            fstab.write(f"{MASTER['ip']}:/home /home nfs defaults 0 0\n")
        os.system("mount -a")
        # NOTE(review): assumed to belong to the slave-only branch - confirm.
        os.system("cp -f /clusterfs/munge.key /etc/munge/munge.key")

    # reset all services
    os.system("systemctl restart munge")
    os.system("systemctl restart slurmd")
    if TYPE == "master":
        os.system("systemctl restart slurmctld")

    # plan next send
    waitTime = randint(10,100)
    print(f"Plan self announcement at T+{waitTime}s")
    SEND_TIME = int(time()) + waitTime
    # Wake up at half the delay so the due time is checked even when the
    # network stays quiet.
    sock.settimeout(waitTime / 2)

View File

@ -1,15 +0,0 @@
# systemd unit that runs the SaturnArch discovery script as root.
[Unit]
Description=SaturnArch discover Service
# Start only once the network is actually configured; network.target alone
# does not guarantee that interfaces have addresses yet.
Wants=network-online.target
After=network-online.target
[Service]
StandardError=journal
StandardOutput=journal
StandardInput=null
Type=idle
Restart=on-failure
# Pace restarts so a transient failure does not exhaust the start rate limit.
RestartSec=5
User=root
ExecStart=/usr/bin/python /usr/bin/discover.py
[Install]
WantedBy=multi-user.target

View File

@ -100,3 +100,7 @@ Include = /etc/pacman.d/mirrorlist
#[custom]
#SigLevel = Optional TrustAll
#Server = file:///home/custompkgs
[saturn_repo]
SigLevel = Optional TrustAll
Server = https://git.plevac.eu/Betynda/SaturnArch-REPO/raw/branch/main/$arch

View File

@ -112,9 +112,7 @@ mkdir /mnt/etc/slurm-llnl
cp fast_install_stage2.sh /mnt
cp environment.sh /mnt
cp /installFiles/discover.py /mnt/usr/bin/discover.py
cp -rf /installFiles/slurm/* /mnt/etc/slurm-llnl
cp /installFiles/saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service
cp /usr/local/bin/apt /mnt/usr/bin/apt
arch-chroot /mnt /fast_install_stage2.sh

View File

@ -21,7 +21,7 @@ pacman -Syy
pacman -S --noconfirm man-pages man-db dnsutils ethtool iputils net-tools iproute2 openssh wget \
usbutils usb_modeswitch tcpdump smartmontools gnu-netcat mc dosfstools exfat-utils \
partclone parted partimage gptfdisk iw dialog base-devel vim \
grub os-prober efivar efibootmgr efitools intel-ucode amd-ucode dmidecode htop nano python slurm-llnl nfs-utils
grub os-prober efivar efibootmgr efitools intel-ucode amd-ucode dmidecode htop nano python slurm-llnl nfs-utils saturn-discover lmod python-pipenv
cd /usr/bin/
ln -s vim vi
@ -110,8 +110,10 @@ sudo chmod -R 777 /clusterfs
echo "/clusterfs (rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
echo "/home (rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
# copy keys
cp /etc/munge/munge.key /clusterfs
# copy keys and config
mkdir /clusterfs/config
cp /etc/munge/munge.key /clusterfs/config
cp /etc/slurm-llnl/slurm.conf.template /clusterfs/config
systemctl enable munge
systemctl enable slurmd

View File

@ -112,9 +112,7 @@ mkdir /mnt/etc/slurm-llnl
cp fast_install_stage2.sh /mnt
cp environment.sh /mnt
cp /installFiles/discover.py /mnt/usr/bin/discover.py
cp -rf /installFiles/slurm/* /mnt/etc/slurm-llnl
cp /installFiles/saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service
cp /usr/local/bin/apt /mnt/usr/bin/apt
arch-chroot /mnt /fast_install_stage2.sh

View File

@ -21,7 +21,7 @@ pacman -Syy
pacman -S --noconfirm man-pages man-db dnsutils ethtool iputils net-tools iproute2 openssh wget \
usbutils usb_modeswitch tcpdump smartmontools gnu-netcat mc dosfstools exfat-utils \
partclone parted partimage gptfdisk iw dialog base-devel vim \
grub os-prober efivar efibootmgr efitools intel-ucode amd-ucode dmidecode htop nano python slurm-llnl nfs-utils
grub os-prober efivar efibootmgr efitools intel-ucode amd-ucode dmidecode htop nano python slurm-llnl nfs-utils saturn-discover lmod python-pipenv
cd /usr/bin/
ln -s vim vi