diff --git a/versions/master/airootfs/discover.py b/airootfs/installFiles/discover.py similarity index 98% rename from versions/master/airootfs/discover.py rename to airootfs/installFiles/discover.py index 64ee89e..ce71f80 100644 --- a/versions/master/airootfs/discover.py +++ b/airootfs/installFiles/discover.py @@ -22,7 +22,10 @@ sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) sock.bind((UDP_IP, UDP_PORT)) def nfsDone(): - with open('myfile.txt') as myfile: + if MASTER is None: + return False + + with open('/etc/fstab') as myfile: if MASTER["ip"] in myfile.read(): return True diff --git a/versions/master/airootfs/saturnDiscover.service b/airootfs/installFiles/saturnDiscover.service similarity index 75% rename from versions/master/airootfs/saturnDiscover.service rename to airootfs/installFiles/saturnDiscover.service index 04fd480..ef2d89f 100644 --- a/versions/master/airootfs/saturnDiscover.service +++ b/airootfs/installFiles/saturnDiscover.service @@ -3,6 +3,9 @@ Description=SaturnArch discover Service After=network.target [Service] +StandardError=journal +StandardOutput=journal +StandardInput=null Type=idle Restart=on-failure User=root diff --git a/versions/master/airootfs/slurm/cgroup.conf b/airootfs/installFiles/slurm/cgroup.conf similarity index 100% rename from versions/master/airootfs/slurm/cgroup.conf rename to airootfs/installFiles/slurm/cgroup.conf diff --git a/versions/master/airootfs/slurm/cgroup_allowed_devices_file.conf b/airootfs/installFiles/slurm/cgroup_allowed_devices_file.conf similarity index 100% rename from versions/master/airootfs/slurm/cgroup_allowed_devices_file.conf rename to airootfs/installFiles/slurm/cgroup_allowed_devices_file.conf diff --git a/versions/master/airootfs/slurm/slurm.conf.template b/airootfs/installFiles/slurm/slurm.conf.template similarity index 100% rename from versions/master/airootfs/slurm/slurm.conf.template rename to airootfs/installFiles/slurm/slurm.conf.template diff --git a/versions/master/airootfs/slurmResume b/airootfs/installFiles/slurmResume similarity index 100% rename from versions/master/airootfs/slurmResume rename to airootfs/installFiles/slurmResume diff --git a/versions/master/airootfs/slurmSuspend b/airootfs/installFiles/slurmSuspend similarity index 100% rename from versions/master/airootfs/slurmSuspend rename to airootfs/installFiles/slurmSuspend diff --git a/versions/master/airootfs/fast_install.sh b/versions/master/airootfs/fast_install.sh index c4f966e..2116982 100644 --- a/versions/master/airootfs/fast_install.sh +++ b/versions/master/airootfs/fast_install.sh @@ -110,12 +110,12 @@ echo "Entering Chroot Environment" mkdir /mnt/etc/slurm-llnl -cp fast_install_stage2.sh /mnt -cp environment.sh /mnt -cp discover.py /mnt/usr/bin/discover.py -cp -rf slurm/* /mnt/etc/slurm-llnl -cp saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service -cp /usr/local/bin/apt /mnt/usr/bin/apt +cp fast_install_stage2.sh /mnt +cp environment.sh /mnt +cp /installFiles/discover.py /mnt/usr/bin/discover.py +cp -rf /installFiles/slurm/* /mnt/etc/slurm-llnl +cp /installFiles/saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service +cp /usr/local/bin/apt /mnt/usr/bin/apt arch-chroot /mnt /fast_install_stage2.sh diff --git a/versions/master/airootfs/fast_install_stage2.sh b/versions/master/airootfs/fast_install_stage2.sh index d249ded..8f463eb 100644 --- a/versions/master/airootfs/fast_install_stage2.sh +++ b/versions/master/airootfs/fast_install_stage2.sh @@ -107,8 +107,8 @@ sudo chown nobody.nogroup /clusterfs sudo chmod -R 777 /clusterfs # todo security check here -echo "/clusterfs 0.0.0.0/0(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports -echo "/home 0.0.0.0/0(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports +echo "/clusterfs (rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports +echo "/home (rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports # copy keys cp /etc/munge/munge.key /clusterfs diff --git a/versions/slave/airootfs/discover.py b/versions/slave/airootfs/discover.py deleted file mode 100644 index 64ee89e..0000000 --- a/versions/slave/airootfs/discover.py +++ /dev/null @@ -1,171 +0,0 @@ -import socket -import json -import os, sys -from random import randint -from time import sleep, time - -HOSTS = {} -UDP_IP = "255.255.255.255" -UDP_PORT = 5005 -PROT_HDR = "SATURNARCH " -SEND_TIME = None - -TYPE = "slave" - -if os.path.exists("/etc/slurm-llnl/MASTER"): - TYPE = "master" - -MASTER = None - -sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP) -sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) -sock.bind((UDP_IP, UDP_PORT)) - -def nfsDone(): - with open('myfile.txt') as myfile: - if MASTER["ip"] in myfile.read(): - return True - - return False - -def get_ip(): - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.settimeout(0) - try: - # doesn't even have to be reachable - s.connect(('8.8.8.8', 1)) - IP = s.getsockname()[0] - except Exception: - IP = '127.0.0.1' - finally: - s.close() - return IP - -def selfInfo(): - return { - "ip": get_ip(), - "type": TYPE, - "name": socket.gethostname(), - "cpus": os.cpu_count(), - "rams": os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') - } - - -def loadHosts(): - global HOSTS, MASTER - - try: - with open('/etc/slurm-llnl/hosts.json', 'r') as file: - HOSTS = json.load(file) - except: - HOSTS = {} - - if TYPE == "master": - MASTER = selfInfo() - else: - for host in HOSTS.values(): - if host["type"] == "master": - MASTER = host - -def updateHosts(): - with open("/etc/slurm-llnl/hosts.json", "w") as outfile: - json.dump(HOSTS, outfile) - -def generateSlurmConfig(source, target): - if MASTER is None: - return - - hosts = f"NodeName={socket.gethostname()} NodeAddr={get_ip()} CPUs={os.cpu_count()} State=UNKNOWN\n" # first is my self - noMasterHosts = "" - for host in HOSTS.values(): - hosts += f"NodeName={host["name"]} NodeAddr={host["ip"]} CPUs={host["cpus"]} State=UNKNOWN\n" - noMasterHosts += f"{host["name"]}, " - - if len(noMasterHosts) > 0: - noMasterHosts = noMasterHosts[:-2] - - with open(source) as f: - newText=f.read().replace('{%hosts%}', hosts).replace('{%noMasterHosts%}', noMasterHosts).replace('{%masterName%}', MASTER["name"]).replace('{%masterIP%}', MASTER["ip"]) - - with open(target, "w") as f: - f.write(newText) - -def generateHosts(target): - fileStr = """# Auto generated by SaturnArch -127.0.0.1\tlocalhost -::1\tlocalhost ip6-localhost ip6-loopback -ff02::1\tip6-allnodes -ff02::2\tip6-allrouters - -""" - fileStr += f"{get_ip()}\t{socket.gethostname()}\n" # first is my self - for host in HOSTS.values(): - fileStr += f"{host["ip"]}\t{host["name"]}\n" - - with open(target, "w") as outfile: - outfile.write(fileStr) - -def self_announcement(): - MESSAGE = (PROT_HDR + json.dumps(selfInfo())).encode("ASCII") - sock.sendto(MESSAGE, (UDP_IP, UDP_PORT)) - -## Start program -loadHosts() -self_announcement() - -while True: - if SEND_TIME is not None and SEND_TIME < int(time()): - print(f"Sending self announcement") - self_announcement() - SEND_TIME = None - sock.settimeout(None) - - data, addr = None, None - try: - data, addr = sock.recvfrom(1024) - data = data.decode("ASCII") - except socket.timeout: - continue - - if not data.startswith(PROT_HDR): - continue - - data = data[len(PROT_HDR):] # remove header - data = json.loads(data) - - if data["ip"] == get_ip(): - continue - - if data["ip"] in HOSTS and data == HOSTS[data["ip"]]: - continue - - print(f"Discover new HOST {data}") - - if data["type"] == "master": - MASTER = data - - HOSTS[data["ip"]] = data - updateHosts() - generateHosts("/etc/hosts") - generateSlurmConfig("/etc/slurm-llnl/slurm.conf.template", "/etc/slurm-llnl/slurm.conf") - - # configure network disks - if TYPE == "slave" and MASTER is not None and not nfsDone(): - os.system(f"echo \"{MASTER['ip']}:/clusterfs /clusterfs nfs defaults 0 0\" >> /etc/fstab") - os.system(f"echo \"{MASTER['ip']}:/home /home nfs defaults 0 0\" >> /etc/fstab") - os.system("mount -a") - - os.system("cp -f /clusterfs/munge.key /etc/munge/munge.key") - - # reset all services - os.system("systemctl restart munge") - os.system("systemctl restart slurmd") - - if TYPE == "master": - os.system("systemctl restart slurmctld") - - # plan next send - waitTime = randint(10,100) - print(f"Plan self announcement at T+{waitTime}s") - SEND_TIME = int(time()) + waitTime - sock.settimeout(waitTime / 2) \ No newline at end of file diff --git a/versions/slave/airootfs/fast_install.sh b/versions/slave/airootfs/fast_install.sh index c4f966e..8b9b689 100644 --- a/versions/slave/airootfs/fast_install.sh +++ b/versions/slave/airootfs/fast_install.sh @@ -112,10 +112,10 @@ mkdir /mnt/etc/slurm-llnl cp fast_install_stage2.sh /mnt cp environment.sh /mnt -cp discover.py /mnt/usr/bin/discover.py -cp -rf slurm/* /mnt/etc/slurm-llnl -cp saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service -cp /usr/local/bin/apt /mnt/usr/bin/apt +cp /installFiles/discover.py /mnt/usr/bin/discover.py +cp -rf /installFiles/slurm/* /mnt/etc/slurm-llnl +cp /installFiles/saturnDiscover.service /mnt/lib/systemd/system/saturnDiscover.service +cp /usr/local/bin/apt /mnt/usr/bin/apt arch-chroot /mnt /fast_install_stage2.sh diff --git a/versions/slave/airootfs/saturnDiscover.service b/versions/slave/airootfs/saturnDiscover.service deleted file mode 100644 index 04fd480..0000000 --- a/versions/slave/airootfs/saturnDiscover.service +++ /dev/null @@ -1,12 +0,0 @@ -[Unit] -Description=SaturnArch discover Service -After=network.target - -[Service] -Type=idle -Restart=on-failure -User=root -ExecStart=/usr/bin/python /usr/bin/discover.py - -[Install] -WantedBy=multi-user.target \ No newline at end of file diff --git a/versions/slave/airootfs/slurm/cgroup.conf b/versions/slave/airootfs/slurm/cgroup.conf deleted file mode 100644 index 0c8c7c1..0000000 --- a/versions/slave/airootfs/slurm/cgroup.conf +++ /dev/null @@ -1,14 +0,0 @@ -CgroupMountpoint="/sys/fs/cgroup" -#CgroupAutomount=yes -#CgroupReleaseAgentDir="/etc/slurm/cgroup" -AllowedDevicesFile="/etc/slurm/cgroup_allowed_devices_file.conf" -ConstrainCores=no -#TaskAffinity=no -ConstrainRAMSpace=yes -ConstrainSwapSpace=no -ConstrainDevices=no -AllowedRamSpace=100 -AllowedSwapSpace=0 -MaxRAMPercent=100 -MaxSwapPercent=100 -MinRAMSpace=30 diff --git a/versions/slave/airootfs/slurm/cgroup_allowed_devices_file.conf b/versions/slave/airootfs/slurm/cgroup_allowed_devices_file.conf deleted file mode 100644 index e8681e1..0000000 --- a/versions/slave/airootfs/slurm/cgroup_allowed_devices_file.conf +++ /dev/null @@ -1,7 +0,0 @@ -/dev/null -/dev/urandom -/dev/zero -/dev/sda* -/dev/cpu/*/* -/dev/pts/* -/home/* diff --git a/versions/slave/airootfs/slurm/slurm.conf.template b/versions/slave/airootfs/slurm/slurm.conf.template deleted file mode 100644 index 25e9756..0000000 --- a/versions/slave/airootfs/slurm/slurm.conf.template +++ /dev/null @@ -1,58 +0,0 @@ -ClusterName=Betynda - -SlurmctldHost={%masterName%}({%masterIP%}) - -ProctrackType=proctrack/linuxproc - -ReturnToService=2 - -SlurmctldPidFile=/run/slurmctld.pid -SlurmdPidFile=/run/slurmd.pid -SlurmdSpoolDir=/var/lib/slurm/slurmd -StateSaveLocation=/var/lib/slurm/slurmctld - -SlurmUser=slurm -TaskPlugin=task/none - -SchedulerType=sched/backfill -SelectType=select/cons_tres -SelectTypeParameters=CR_Core - -AccountingStorageType=accounting_storage/none -JobCompType=jobcomp/none -JobAcctGatherType=jobacct_gather/none - -SlurmctldDebug=info -SlurmctldLogFile=/var/log/slurm/slurmctld.log - -SlurmdDebug=info -SlurmdLogFile=/var/log/slurm/slurmd.log - -{%hosts%} - -PartitionName=exp Nodes={%noMasterHosts%} Default=YES MaxTime=01:00:00 State=UP SuspendTime=3600 PriorityTier=100 -PartitionName=long Nodes={%noMasterHosts%} Default=NO MaxTime=168:00:00 State=UP SuspendTime=3600 PriorityTier=50 -PartitionName=debug Nodes=ALL Default=NO MaxTime=03:00:00 State=UP PriorityTier=150 - -## -## Power saving -## - -# timeout for power on -ResumeTimeout=600 - -# timeout for power off -SuspendTimeout=120 - -# Up and down maximaly 1 per minute -ResumeRate=1 -SuspendRate=1 - -# poweroff and on programs -ResumeProgram=/usr/local/bin/slurmResume -SuspendProgram=/usr/local/bin/slurmSuspend - -TreeWidth=1000 - -# wait until power on when reserve -SchedulerParameters=salloc_wait_nodes,sbatch_wait_nodes diff --git a/versions/slave/airootfs/slurmResume b/versions/slave/airootfs/slurmResume deleted file mode 100755 index 07923b3..0000000 --- a/versions/slave/airootfs/slurmResume +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash - -echo "`date` User $USER invoked Resume $*" >>/var/log/slurm/power_save.log - -sudo etherwake b0:83:fe:d8:a6:e0 diff --git a/versions/slave/airootfs/slurmSuspend b/versions/slave/airootfs/slurmSuspend deleted file mode 100755 index 8d6d36a..0000000 --- a/versions/slave/airootfs/slurmSuspend +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash - -echo "`date` User $USER invoked Suspend $*" >>/var/log/slurm/power_save.log - -sshpass -p 4126 ssh -o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no" -t lukasplevac@10.0.0.101 "sudo /sbin/shutdown"