def expand_nodelist(self, nodelist_field):
    """
    Expand a Flux nodelist string into a flat list of node names.

    Example: 'rzadams[1002,1005-1007]' becomes
    ['rzadams1002', 'rzadams1005', 'rzadams1006', 'rzadams1007'].
    Several comma-separated terms ('a[1-2],b7') are handled, and a term
    without brackets is returned unchanged.

    Zero padding is preserved: 'node[001-003]' yields 'node001' ... 'node003'.
    The padding width is taken from the lower bound of the range.

    Args:
        nodelist_field: The NODELIST text, e.g. the last column of a
            'flux resource list' row.

    Returns:
        List of node name strings, in the order they appear in the input.

    Raises:
        ValueError: If a range inside brackets is not of the form 'N-M'
            with integer bounds.
    """
    nodes = []
    # One term is "prefix" optionally followed by "[id,id-id,...]".
    pattern = re.compile(r'([a-zA-Z0-9_-]+)(?:\[(.*?)\])?')
    for match in pattern.finditer(nodelist_field):
        prefix = match.group(1)
        bracket = match.group(2)
        if not bracket:
            nodes.append(prefix)
            continue
        for part in bracket.split(','):
            part = part.strip()
            if not part:
                # Stray comma ("[1,]"): nothing to expand, and emitting the
                # bare prefix would invent a host that does not exist.
                continue
            if '-' in part:
                low_text, high_text = part.split('-')
                low, high = int(low_text), int(high_text)
                # int() drops leading zeros; remember the width so that
                # "001-003" expands to 001, 002, 003 rather than 1, 2, 3.
                width = len(low_text) if low_text.startswith('0') else 0
                nodes.extend(f"{prefix}{str(i).zfill(width)}" for i in range(low, high + 1))
            else:
                nodes.append(f"{prefix}{part}")
    return nodes


def get_physical_node(self, rel_index):
    """
    Given a relative node number, return the actual physical node within the flux allocation.
    Works for any node prefix (e.g., rzadams, elcap, tuo, syz).

    The first call runs 'flux resource list', expands the NODELIST column of
    the first row whose state starts with 'free', and stores the result on the
    class attribute FluxScheduled._cached_nodes. Later calls reuse that list.

    NOTE(review): only the 'free' row is read, so nodes that are allocated or
    down at the time of the first call are not part of the cached list --
    confirm that this is the intended mapping.

    Args:
        rel_index: Zero-based index into the cached node list.

    Returns:
        The node name at position rel_index.

    Raises:
        RuntimeError: If no usable 'free' row with a NODELIST column is found.
        IndexError: If rel_index is negative or past the end of the node list.
        subprocess.CalledProcessError: If 'flux resource list' exits non-zero.
    """
    if FluxScheduled._cached_nodes is None:
        # Fixed command, so pass an argv list instead of going through a shell.
        out = subprocess.check_output(["flux", "resource", "list"]).decode()
        nodelist_field = None
        for line in out.splitlines():
            parts = line.split()
            if parts and parts[0].startswith("free"):
                # Expected columns: STATE NNODES NCORES NGPUS NODELIST.
                # Fewer than five fields means the NODELIST column is empty.
                if len(parts) >= 5:
                    nodelist_field = parts[-1]
                break
        if nodelist_field is None:
            raise RuntimeError("Could not find NODELIST field in flux resource list output.")
        FluxScheduled._cached_nodes = self.expand_nodelist(nodelist_field)
        log(("Info: Physical Hardware Nodes: %s" % FluxScheduled._cached_nodes), echo=True)

    nodes = FluxScheduled._cached_nodes
    if rel_index < 0 or rel_index >= len(nodes):
        raise IndexError(f"Relative index {rel_index} out of range (0-{len(nodes)-1})")
    return nodes[rel_index]
But we also need to account for accessing diff --git a/ats/version.py b/ats/version.py index 6a9814f..f446323 100644 --- a/ats/version.py +++ b/ats/version.py @@ -2,7 +2,7 @@ # ATS_MAJOR = 7 ATS_MINOR = 0 -ATS_PATCH = 121 +ATS_PATCH = 122 # # This version, constructed from the above, is used internally by ATS diff --git a/pyproject.toml b/pyproject.toml index 12e3516..963f13b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ license = "BSD-3-Clause" publish = false readme = "README.md" repository = "https://github.com/LLNL/ATS" -version = "7.0.121" +version = "7.0.122" [tool.poetry.dependencies] python = ">=3.8" diff --git a/scripts/update-version.x b/scripts/update-version.x index 915f0be..5c47e18 100755 --- a/scripts/update-version.x +++ b/scripts/update-version.x @@ -1,8 +1,8 @@ -/usr/gapps/ats/scripts/replace 7.0.120 7.0.121 \ +/usr/gapps/ats/scripts/replace 7.0.121 7.0.122 \ pyproject.toml \ test/*/READ.ME -/usr/gapps/ats/scripts/replace "ATS_PATCH = ..." "ATS_PATCH = 121" ats/version.py +/usr/gapps/ats/scripts/replace "ATS_PATCH = ..." "ATS_PATCH = 122" ats/version.py diff --git a/test/HelloATS/READ.ME b/test/HelloATS/READ.ME index 3d584e4..670a131 100644 --- a/test/HelloATS/READ.ME +++ b/test/HelloATS/READ.ME @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script How to use: - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -54,7 +54,7 @@ Toss 4 Testing on slurm based toss4 machines such as rzwhippet -------------------------------------------------------------------------------- Toss 4 ATS-4 (rzvernal, rzadams, tioga, etc.) 
-------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH diff --git a/test/HelloCPUAffinity/READ.ME b/test/HelloCPUAffinity/READ.ME index c1da1bd..84a2087 100644 --- a/test/HelloCPUAffinity/READ.ME +++ b/test/HelloCPUAffinity/READ.ME @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script How to use: - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -26,7 +26,7 @@ an a.out executable in this directory like so: -------------------------------------------------------------------------------- Toss 3 (rzgenie, etc.). Only use Slurm -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -42,7 +42,7 @@ Toss 3 (rzgenie, etc.). 
Only use Slurm -------------------------------------------------------------------------------- Toss 4 Cray rzvernal/rzadams ATS-4 -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -59,7 +59,7 @@ Toss 4 Cray rzvernal/rzadams ATS-4 -------------------------------------------------------------------------------- Toss 4 Testing on slurm based toss4 machines such as rzwhippet -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -80,7 +80,7 @@ Toss 4 Testing on slurm based toss4 machines such as rzwhippet -------------------------------------------------------------------------------- Blueos (rzansel) Uses LSF -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH diff --git a/test/HelloGPU/READ.ME b/test/HelloGPU/READ.ME index 69edf39..a871a02 100644 --- a/test/HelloGPU/READ.ME +++ b/test/HelloGPU/READ.ME @@ -9,10 +9,10 @@ Blueos (rzansel) Uses LSF mpixlc-gpu -fopenmp -DHAVE_OPENMP -DHAVE_OPENMP_4 hello_gpu.cc -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun -verbose -verbose -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun_exclusive -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun -verbose -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun 
--lrun_pack -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun_exclusive -verbose -------------------------------------------------------------------------------- @@ -35,12 +35,12 @@ Sample Runs of the code stand alone srun -n 4 ./a.out Sample ATS runs of the code - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive --jsrun_np 4 --jsrun_ngpu 4 + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun --lrun_pack + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive --jsrun_np 4 --jsrun_ngpu 4 Clean rm -rf a.out blueos_3* diff --git a/test/HelloGPU2/READ.ME b/test/HelloGPU2/READ.ME index 9c3417e..7f134bf 100644 --- a/test/HelloGPU2/READ.ME +++ b/test/HelloGPU2/READ.ME @@ -25,7 +25,7 @@ flux run -N1 -n1 -c 96 ./a.out 5 flux run -N 2 --tasks-per-node 2 ./a.out # ats tests -/usr/apps/ats/7.0.121/bin/atsflux --flux test.ats +/usr/apps/ats/7.0.122/bin/atsflux --flux test.ats -------------------------------------------------------------------------------- Test a GPU code built with cuda/nvcc @@ -52,10 +52,10 @@ lrun --mpibind=off -N2 -n16 ./a.out <- same as above # the cpu and gpu affinity in the tes case, so look at them by hand for reasonableness # lrun will 'pack' the jobs. 
jsrun will use a resource list -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun -verbose -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun_exclusive -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun_exclusive -verbose -------------------------------------------------------------------------------- diff --git a/test/HelloGPU2/READ.ME.CPX b/test/HelloGPU2/READ.ME.CPX index 3b87dce..ffdc468 100644 --- a/test/HelloGPU2/READ.ME.CPX +++ b/test/HelloGPU2/READ.ME.CPX @@ -7,9 +7,9 @@ export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20 make clean make hip -/usr/apps/ats/7.0.120/bin/atsflux --flux test.ats +/usr/apps/ats/7.0.122/bin/atsflux --flux test.ats -/usr/apps/ats/7.0.120/bin/atsflux --CPX --cpx --flux test.ats \ +/usr/apps/ats/7.0.122/bin/atsflux --CPX --cpx --flux test.ats \ --test_np_max=4 --gpus_per_task=1 --num_concurrent_jobs=48 diff --git a/test/HelloOMP/READ.ME b/test/HelloOMP/READ.ME index c5956f4..42c4f7d 100644 --- a/test/HelloOMP/READ.ME +++ b/test/HelloOMP/READ.ME @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script How to use: - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -48,7 +48,7 @@ Toss 3 (rzgenie, etc.). 
-------------------------------------------------------------------------------- export PATH=${PATH}:/usr/gapps/ats/scripts module load python/3.8.2 - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -65,7 +65,7 @@ Toss 3 (rzgenie, etc.). -------------------------------------------------------------------------------- Toss 4 ATS-4 (rzvernal, rzadams, tioga, etc.) -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -89,7 +89,7 @@ Toss 4 (rzwhippet, etc.). module load python/3.9.12 export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH or - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH mpicxx -g -qopenmp -pthread -O2 -o omp_test omp_test.cc @@ -108,7 +108,7 @@ Toss 4 (rzwhippet, etc.). 
#!/usr/bin/env python3
"""Show how relative node indices map to host names in a Flux allocation."""

import subprocess
import sys
import re

# One hostlist term: "prefix" optionally followed by "[id,id-id,...]".
_HOSTLIST_TERM = re.compile(r'([a-zA-Z0-9_-]+)(?:\[(.*?)\])?')


def expand_nodelist(nodelist):
    """
    Expand a nodelist such as 'rzadams[1010-1013]' into individual node names.

    Works for any host prefix and keeps zero padding ('tuo[007-008]' gives
    'tuo007', 'tuo008'). Raises ValueError on a malformed range.
    """
    names = []
    for term in _HOSTLIST_TERM.finditer(nodelist):
        base, ids = term.group(1), term.group(2)
        if not ids:
            names.append(base)
            continue
        for part in ids.split(','):
            part = part.strip()
            if not part:
                continue
            if '-' in part:
                low_text, high_text = part.split('-')
                # int() drops leading zeros, so remember the original width.
                width = len(low_text) if low_text.startswith('0') else 0
                names.extend(
                    f"{base}{str(i).zfill(width)}"
                    for i in range(int(low_text), int(high_text) + 1)
                )
            else:
                names.append(f"{base}{part}")
    return names


def get_node_names(output=None):
    """
    Parse the output of 'flux resource list' and return a list of node names
    in the order the rows and hostlists appear.

    Args:
        output: Text of 'flux resource list'. When None (the default) the
            command is run; on failure an error is printed and the script
            exits with status 1.

    Returns:
        List of node names taken from the NODELIST column of every row.
    """
    if output is None:
        try:
            output = subprocess.check_output(["flux", "resource", "list"]).decode()
        except (OSError, subprocess.CalledProcessError) as e:
            print(f"Error running 'flux resource list': {e}")
            sys.exit(1)

    node_names = []
    for line in output.splitlines():
        if 'NODELIST' in line:
            # Column header row, not data.
            continue
        fields = line.split()
        # Expected columns: STATE NNODES NCORES NGPUS NODELIST. A row with an
        # empty NODELIST has fewer than five fields.
        if len(fields) < 5:
            continue
        nodelist = fields[-1]
        if nodelist.isdigit():
            # Defensive: never treat a bare count column as a host name.
            continue
        node_names.extend(expand_nodelist(nodelist))
    return node_names


def get_node_name(rel_index, nodes=None):
    """
    Return the node name at relative index rel_index.

    Args:
        rel_index: Zero-based index into the node list.
        nodes: Optional pre-computed node list; when None the list is
            obtained from get_node_names().

    Raises:
        IndexError: If rel_index is negative or past the end of the list.
    """
    if nodes is None:
        nodes = get_node_names()
    if rel_index < 0 or rel_index >= len(nodes):
        raise IndexError(f"Relative index {rel_index} out of range (0-{len(nodes)-1})")
    return nodes[rel_index]


def main():
    """Print the node list, then map one relative index (argv[1] or prompt) to its node."""
    nodes = get_node_names()
    print("Allocated nodes in order:")
    for idx, name in enumerate(nodes):
        print(f"  [{idx}] {name}")

    if len(sys.argv) > 1:
        raw_index = sys.argv[1]
    else:
        raw_index = input(f"Enter relative node index (0-{len(nodes)-1}): ")

    try:
        rel_index = int(raw_index)
    except ValueError:
        print(f"Usage: {sys.argv[0]} [relative_node_index]")
        sys.exit(1)

    try:
        # Reuse the list gathered above instead of running flux a second time.
        node_name = get_node_name(rel_index, nodes)
    except IndexError as e:
        print(f"Error: {e}")
        sys.exit(1)
    print(f"Node {rel_index} maps to: {node_name}")


if __name__ == "__main__":
    main()