Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 56 additions & 2 deletions ats/atsMachines/fluxScheduled.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import sys
import time
import subprocess
import re
from math import ceil

from ats import terminal
Expand All @@ -22,13 +23,13 @@
from ats import configuration
from ats import log


class FluxScheduled(lcMachines.LCMachineCore):
"""
A class to initialize Flux if necessary and return job statements
from ATS tests.
"""

_cached_nodes = None # static/class variable
debug = False
debug_canRunNow = False
debug_noteLaunch = False
Expand Down Expand Up @@ -103,6 +104,57 @@ def init(self):
log(("DEBUG: FluxScheduled init : self.numNodesAvailable =%i" % (self.numNodesAvailable)), echo=True)
log(("DEBUG: FluxScheduled init : self.numGPUsAvailable =%i" % (self.numGPUs)), echo=True)

# Call get_physical_node to cache the hardware node listing before starting jobs
self.get_physical_node(0)

def expand_nodelist(self, nodelist_field):
    """
    Expand a Flux nodelist string like 'rzadams[1002,1005-1007]' into a list of node names.

    Handles multiple comma-separated patterns (e.g. 'elcap[1-2],syz9') and
    plain host names with no bracket expression (e.g. 'tuo5').

    Zero padding in a range is preserved: 'node[008-010]' expands to
    node008, node009, node010. Empty items inside the brackets are ignored.

    Returns an empty list when nodelist_field is empty or None.
    Raises ValueError if a range inside the brackets is malformed.
    """
    nodes = []
    if not nodelist_field:
        return nodes

    # Regex to find patterns like prefix[range] or prefixNNNN
    pattern = re.compile(r'([a-zA-Z0-9_-]+)(?:\[(.*?)\])?')
    for match in pattern.finditer(nodelist_field):
        prefix, bracket = match.group(1), match.group(2)
        if not bracket:
            # Plain host name (or an empty '[]'): nothing to expand.
            nodes.append(prefix)
            continue
        for part in bracket.split(','):
            part = part.strip()
            if not part:
                # Stray comma such as '[1,,2]' -- there is no node to add.
                continue
            if '-' not in part:
                nodes.append(f"{prefix}{part}")
                continue
            first, last = (piece.strip() for piece in part.split('-'))
            # A leading zero on the lower bound means the names are fixed
            # width; pad every generated number to that same width.
            width = len(first) if first.startswith('0') else 0
            nodes.extend(
                f"{prefix}{str(i).zfill(width)}"
                for i in range(int(first), int(last) + 1)
            )
    return nodes

def get_physical_node(self, rel_index):
    """
    Given a relative node number, return the actual physical node within the flux allocation.
    Works for any node prefix (e.g., rzadams, elcap, tuo, syz).

    The node listing is obtained once from 'flux resource list' and cached in
    the class attribute FluxScheduled._cached_nodes, so the command is run at
    most once per process no matter how many times this method is called.

    Raises RuntimeError if no usable "free" row is found in the command output,
    and IndexError if rel_index does not address a node in the cached list.
    """
    if FluxScheduled._cached_nodes is None:
        # Fixed command: pass an argument list so no shell is involved.
        out = subprocess.check_output(["flux", "resource", "list"]).decode()
        nodelist_field = None
        for line in out.splitlines():
            parts = line.split()
            # Only the row whose first column starts with "free" is used.
            # The node list is taken from the last column, and a row with
            # fewer than 5 columns is treated as having no node list.
            if parts and parts[0].startswith("free") and len(parts) >= 5:
                nodelist_field = parts[-1]
                break
        if nodelist_field is None:
            raise RuntimeError("Could not find NODELIST field in flux resource list output.")
        FluxScheduled._cached_nodes = self.expand_nodelist(nodelist_field)
        log(("Info: Physical Hardware Nodes: %s" % FluxScheduled._cached_nodes), echo=True)

    nodes = FluxScheduled._cached_nodes
    if not nodes:
        # Avoid the meaningless "(0--1)" range in the message below.
        raise IndexError(f"Relative index {rel_index} out of range (no nodes available)")
    if rel_index < 0 or rel_index >= len(nodes):
        raise IndexError(f"Relative index {rel_index} out of range (0-{len(nodes)-1})")
    return nodes[rel_index]


def kill(self, test):
"""
Final cleanup if any. Not implemented for Flux yet.
Expand Down Expand Up @@ -288,7 +340,9 @@ def calculateCommandList(self, test):
if same_node is not None:
if same_node not in self.node_list:
self.node_list.append(same_node)
ret.append(f"--requires=-rank:{self.node_list.index(same_node) % self.numNodes}")
rel_node = self.node_list.index(same_node) % self.numNodes
physical_node = self.get_physical_node(rel_node)
ret.append(f"--requires=host:{physical_node}")

"""
Need to set -n{np} and -c{test.cpus_per_task}. But we also need to account for accessing
Expand Down
2 changes: 1 addition & 1 deletion ats/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
ATS_MAJOR = 7
ATS_MINOR = 0
ATS_PATCH = 121
ATS_PATCH = 122

#
# This version, constructed from the above, is used internally by ATS
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ license = "BSD-3-Clause"
publish = false
readme = "README.md"
repository = "https://github.com/LLNL/ATS"
version = "7.0.121"
version = "7.0.122"

[tool.poetry.dependencies]
python = ">=3.8"
Expand Down
4 changes: 2 additions & 2 deletions scripts/update-version.x
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
/usr/gapps/ats/scripts/replace 7.0.120 7.0.121 \
/usr/gapps/ats/scripts/replace 7.0.121 7.0.122 \
pyproject.toml \
test/*/READ.ME

/usr/gapps/ats/scripts/replace "ATS_PATCH = ..." "ATS_PATCH = 121" ats/version.py
/usr/gapps/ats/scripts/replace "ATS_PATCH = ..." "ATS_PATCH = 122" ats/version.py



4 changes: 2 additions & 2 deletions test/HelloATS/READ.ME
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script

How to use:

export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand Down Expand Up @@ -54,7 +54,7 @@ Toss 4 Testing on slurm based toss4 machines such as rzwhippet
--------------------------------------------------------------------------------
Toss 4 ATS-4 (rzvernal, rzadams, tioga, etc.)
--------------------------------------------------------------------------------
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand Down
10 changes: 5 additions & 5 deletions test/HelloCPUAffinity/READ.ME
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script

How to use:

export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand All @@ -26,7 +26,7 @@ an a.out executable in this directory like so:
--------------------------------------------------------------------------------
Toss 3 (rzgenie, etc.). Only use Slurm
--------------------------------------------------------------------------------
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand All @@ -42,7 +42,7 @@ Toss 3 (rzgenie, etc.). Only use Slurm
--------------------------------------------------------------------------------
Toss 4 Cray rzvernal/rzadams ATS-4
--------------------------------------------------------------------------------
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand All @@ -59,7 +59,7 @@ Toss 4 Cray rzvernal/rzadams ATS-4
--------------------------------------------------------------------------------
Toss 4 Testing on slurm based toss4 machines such as rzwhippet
--------------------------------------------------------------------------------
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand All @@ -80,7 +80,7 @@ Toss 4 Testing on slurm based toss4 machines such as rzwhippet
--------------------------------------------------------------------------------
Blueos (rzansel) Uses LSF
--------------------------------------------------------------------------------
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand Down
20 changes: 10 additions & 10 deletions test/HelloGPU/READ.ME
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ Blueos (rzansel) Uses LSF

mpixlc-gpu -fopenmp -DHAVE_OPENMP -DHAVE_OPENMP_4 hello_gpu.cc

/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun -verbose -verbose
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun_exclusive -verbose
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun -verbose -verbose
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun_exclusive -verbose


--------------------------------------------------------------------------------
Expand All @@ -35,12 +35,12 @@ Sample Runs of the code stand alone
srun -n 4 ./a.out

Sample ATS runs of the code
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive --jsrun_np 4 --jsrun_ngpu 4
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun --lrun_pack
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive --jsrun_np 4 --jsrun_ngpu 4

Clean
rm -rf a.out blueos_3*
Expand Down
10 changes: 5 additions & 5 deletions test/HelloGPU2/READ.ME
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ flux run -N1 -n1 -c 96 ./a.out 5
flux run -N 2 --tasks-per-node 2 ./a.out

# ats tests
/usr/apps/ats/7.0.121/bin/atsflux --flux test.ats
/usr/apps/ats/7.0.122/bin/atsflux --flux test.ats

--------------------------------------------------------------------------------
Test a GPU code built with cuda/nvcc
Expand All @@ -52,10 +52,10 @@ lrun --mpibind=off -N2 -n16 ./a.out <- same as above
# the cpu and gpu affinity in the test case, so look at them by hand for reasonableness
# lrun will 'pack' the jobs. jsrun will use a resource list

/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun -verbose
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose
/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun_exclusive -verbose
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun -verbose
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose
/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun_exclusive -verbose


--------------------------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions test/HelloGPU2/READ.ME.CPX
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
make clean
make hip

/usr/apps/ats/7.0.120/bin/atsflux --flux test.ats
/usr/apps/ats/7.0.122/bin/atsflux --flux test.ats

/usr/apps/ats/7.0.120/bin/atsflux --CPX --cpx --flux test.ats \
/usr/apps/ats/7.0.122/bin/atsflux --CPX --cpx --flux test.ats \
--test_np_max=4 --gpus_per_task=1 --num_concurrent_jobs=48


Expand Down
10 changes: 5 additions & 5 deletions test/HelloOMP/READ.ME
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script

How to use:

export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand Down Expand Up @@ -48,7 +48,7 @@ Toss 3 (rzgenie, etc.).
--------------------------------------------------------------------------------
export PATH=${PATH}:/usr/gapps/ats/scripts
module load python/3.8.2
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand All @@ -65,7 +65,7 @@ Toss 3 (rzgenie, etc.).
--------------------------------------------------------------------------------
Toss 4 ATS-4 (rzvernal, rzadams, tioga, etc.)
--------------------------------------------------------------------------------
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand All @@ -89,7 +89,7 @@ Toss 4 (rzwhippet, etc.).
module load python/3.9.12
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH


mpicxx -g -qopenmp -pthread -O2 -o omp_test omp_test.cc
Expand All @@ -108,7 +108,7 @@ Toss 4 (rzwhippet, etc.).
--------------------------------------------------------------------------------
Blueos (rzansel) Uses LSF
--------------------------------------------------------------------------------
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand Down
2 changes: 1 addition & 1 deletion test/HelloSameNode/READ.ME
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

How to use:

export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH
or
export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH

Expand Down
70 changes: 70 additions & 0 deletions test/HelloSameNode/test_get_node_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python3

import subprocess
import sys
import re

def _expand_hostlist(hostlist):
    """Expand one hostlist token such as 'rzadams[1010-1013],tuo5' into node names."""
    names = []
    for match in re.finditer(r'([A-Za-z0-9_-]+)(?:\[([0-9,-]+)\])?', hostlist):
        base, rng = match.group(1), match.group(2)
        if rng is None:
            names.append(base)
            continue
        for part in rng.split(','):
            if not part:
                continue
            if '-' in part:
                first, last = part.split('-')
                # Keep fixed-width numbering, e.g. [008-010] -> 008, 009, 010.
                width = len(first) if first.startswith('0') else 0
                names.extend(
                    f"{base}{str(i).zfill(width)}"
                    for i in range(int(first), int(last) + 1)
                )
            else:
                names.append(f"{base}{part}")
    return names


def get_node_names():
    """
    Parse the output of 'flux resource list' and return a list of node names
    in the order they are listed.

    Works for any cluster prefix, not only 'rzadams'. Every row other than the
    NODELIST header contributes the nodes named in its last column; a row whose
    last column does not start with a letter (for example a bare count, when
    the row has no node list) is skipped.

    Prints a message to stderr and exits with status 1 if the command cannot
    be run.
    """
    try:
        out = subprocess.check_output(["flux", "resource", "list"]).decode()
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        print(f"Error running 'flux resource list': {e}", file=sys.stderr)
        sys.exit(1)

    node_names = []
    for line in out.splitlines():
        # The header row names the columns; it carries no node data.
        if 'NODELIST' in line:
            continue
        fields = line.split()
        if len(fields) < 2:
            continue
        hostlist = fields[-1]
        if hostlist[0].isalpha():
            node_names.extend(_expand_hostlist(hostlist))
    return node_names

def get_node_name(rel_index):
    """
    Return the node name found at position rel_index in the list produced by
    get_node_names().

    Raises IndexError when rel_index is negative or past the end of that list.
    """
    nodes = get_node_names()
    if 0 <= rel_index < len(nodes):
        return nodes[rel_index]
    raise IndexError(f"Relative index {rel_index} out of range (0-{len(nodes)-1})")

def main():
    """
    List the nodes reported by get_node_names(), then map one relative index
    to its node name.

    The index is taken from the first command-line argument when present,
    otherwise it is read interactively. Exits with status 1 when no nodes are
    found, when the index is not an integer, or when it is out of range.
    """
    nodes = get_node_names()
    print("Allocated nodes in order:")
    for idx, n in enumerate(nodes):
        print(f" [{idx}] {n}")

    if not nodes:
        # Nothing to index into; prompting for "(0--1)" would be meaningless.
        print("Error: no nodes found in 'flux resource list' output")
        sys.exit(1)

    if len(sys.argv) > 1:
        raw_index = sys.argv[1]
    else:
        try:
            raw_index = input(f"Enter relative node index (0-{len(nodes)-1}): ")
        except EOFError:
            raw_index = ""

    # One validation path for both the argument and the interactive answer.
    try:
        rel_index = int(raw_index)
    except ValueError:
        print(f"Usage: {sys.argv[0]} <relative_index>")
        sys.exit(1)

    try:
        node_name = get_node_name(rel_index)
    except IndexError as e:
        print(f"Error: {e}")
        sys.exit(1)
    print(f"Node {rel_index} maps to: {node_name}")


if __name__ == "__main__":
    main()