Skip to content

Commit f39ecf8

Browse files
lukeseawalker authored and demartinofra committed
Enable byos HeadClusterUpdate event
Recipes have been refactored so that common code is now used by both the slurm and the byos plugin cookbooks. The configuration fetching is now implemented as a custom resource instead of a recipe, and it is now used by both config (at cluster creation) and update. The entrypoint for the update process has been renamed from "update_head_node" to just "update", so that the logic to call the right cookbook/recipe based on the node type is now moved into the cookbook itself. Signed-off-by: Luca Carrogu <[email protected]>
1 parent c3b3be6 commit f39ecf8

File tree

13 files changed

+192
-101
lines changed

13 files changed

+192
-101
lines changed

attributes/default.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
default['cluster']['cluster_config_version'] = nil
3030
default['cluster']['instance_types_data_s3_key'] = nil
3131
default['cluster']['cluster_config_path'] = "#{node['cluster']['shared_dir']}/cluster-config.yaml"
32+
default['cluster']['previous_cluster_config_path'] = "#{node['cluster']['shared_dir']}/previous-cluster-config.yaml"
3233
default['cluster']['launch_templates_config_path'] = "#{node['cluster']['shared_dir']}/launch-templates-config.json"
3334
default['cluster']['instance_types_data_path'] = "#{node['cluster']['shared_dir']}/instance-types-data.json"
3435
default['cluster']['reserved_base_uid'] = 400

cookbooks/aws-parallelcluster-config/recipes/config.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
# limitations under the License.
1717

1818
include_recipe 'aws-parallelcluster-config::base'
19-
include_recipe "aws-parallelcluster-config::fetch_config" unless node['cluster']['scheduler'] == 'awsbatch'
19+
20+
fetch_config 'Fetch and load cluster configs' unless node['cluster']['scheduler'] == 'awsbatch'
2021

2122
include_recipe 'aws-parallelcluster-slurm::config' if node['cluster']['scheduler'] == 'slurm'
2223
include_recipe 'aws-parallelcluster-scheduler-plugin::config' if node['cluster']['scheduler'] == 'plugin'

cookbooks/aws-parallelcluster-config/recipes/fetch_config.rb

Lines changed: 0 additions & 48 deletions
This file was deleted.

cookbooks/aws-parallelcluster-config/recipes/finalize.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
include_recipe "aws-parallelcluster-config::fetch_config" unless node['cluster']['scheduler'] == 'awsbatch'
18+
fetch_config 'Fetch and load cluster configs' unless node['cluster']['scheduler'] == 'awsbatch'
1919

2020
# Restart supervisord
2121
service "supervisord" do

cookbooks/aws-parallelcluster-config/recipes/init.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060

6161
include_recipe "aws-parallelcluster-config::mount_shared" if node['cluster']['node_type'] == "ComputeFleet"
6262

63-
include_recipe "aws-parallelcluster-config::fetch_config" unless node['cluster']['scheduler'] == 'awsbatch'
63+
fetch_config 'Fetch and load cluster configs' unless node['cluster']['scheduler'] == 'awsbatch'
6464

6565
include_recipe "aws-parallelcluster-slurm::init" if node['cluster']['scheduler'] == 'slurm'
6666
include_recipe "aws-parallelcluster-scheduler-plugin::init" if node['cluster']['scheduler'] == 'plugin'
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# frozen_string_literal: true
2+
3+
#
4+
# Cookbook:: aws-parallelcluster-config
5+
# Recipe:: update
6+
#
7+
# Copyright:: 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
8+
#
9+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
10+
# License. A copy of the License is located at
11+
#
12+
# http://aws.amazon.com/apache2.0/
13+
#
14+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
15+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
unless node['cluster']['scheduler'] == 'awsbatch'
19+
fetch_config 'Fetch and load cluster configs' do
20+
update true
21+
end
22+
end
23+
24+
include_recipe 'aws-parallelcluster-slurm::update' if node['cluster']['scheduler'] == 'slurm'
25+
include_recipe 'aws-parallelcluster-scheduler-plugin::update' if node['cluster']['scheduler'] == 'plugin'
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# frozen_string_literal: true
2+
3+
resource_name :fetch_config
4+
provides :fetch_config
5+
unified_mode true
6+
7+
property :update, [true, false],
8+
default: false
9+
10+
default_action :run
11+
12+
action :run do
13+
Chef::Log.debug("Called fetch_config with update (#{new_resource.update})")
14+
unless virtualized?
15+
if new_resource.update
16+
Chef::Log.info("Backing up old configuration from (#{node['cluster']['cluster_config_path']}) to (#{node['cluster']['previous_cluster_config_path']})")
17+
::FileUtils.cp(node['cluster']['cluster_config_path'], node['cluster']['previous_cluster_config_path'])
18+
fetch_cluster_config(node['cluster']['cluster_config_path'])
19+
fetch_instance_type_data unless ::FileUtils.identical?(node['cluster']['previous_cluster_config_path'], node['cluster']['cluster_config_path'])
20+
else
21+
fetch_cluster_config(node['cluster']['cluster_config_path']) unless ::File.exist?(node['cluster']['cluster_config_path'])
22+
fetch_instance_type_data unless ::File.exist?(node['cluster']['instance_types_data_path'])
23+
end
24+
25+
# load cluster config into node object
26+
load_cluster_config
27+
end
28+
end
29+
30+
action_class do # rubocop:disable Metrics/BlockLength
31+
def fetch_cluster_config(config_path)
32+
# Copy cluster config file from S3 URI
33+
fetch_config_command = "#{node['cluster']['cookbook_virtualenv_path']}/bin/aws s3api get-object" \
34+
" --bucket #{node['cluster']['cluster_s3_bucket']}" \
35+
" --key #{node['cluster']['cluster_config_s3_key']}" \
36+
" --region #{node['cluster']['region']}" \
37+
" #{config_path}"
38+
fetch_config_command += " --version-id #{node['cluster']['cluster_config_version']}" unless node['cluster']['cluster_config_version'].nil?
39+
execute "copy_cluster_config_from_s3" do
40+
command fetch_config_command
41+
retries 3
42+
retry_delay 5
43+
end
44+
end
45+
46+
def fetch_instance_type_data
47+
# Copy instance type infos file from S3 URI
48+
fetch_config_command = "#{node['cluster']['cookbook_virtualenv_path']}/bin/aws s3api get-object" \
49+
" --bucket #{node['cluster']['cluster_s3_bucket']}" \
50+
" --key #{node['cluster']['instance_types_data_s3_key']}" \
51+
" --region #{node['cluster']['region']}" \
52+
" #{node['cluster']['instance_types_data_path']}"
53+
execute "copy_instance_type_data_from_s3" do
54+
command fetch_config_command
55+
retries 3
56+
retry_delay 5
57+
end
58+
end
59+
end

cookbooks/aws-parallelcluster-scheduler-plugin/files/default/event_handler/invoke-scheduler-plugin-event-handler.sh

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ ORIGINAL_SHARED_DIR="/opt/parallelcluster/shared"
55
CLUSTER_CONFIGURATION_FILE="cluster-config.yaml"
66
LAUNCH_TEMPLATES_CONFIG_FILE="launch-templates-config.json"
77
INSTANCE_TYPES_DATA_FILE="instance-types-data.json"
8+
HANDLER_STATIC_ENV_FILE="handler-env.json"
89
SCHEDULER_PLUGIN_SUBSTACK_OUTPUTS_FILE="scheduler-plugin-substack-outputs.json"
910
ORIGINAL_CLUSTER_CONFIGURATION="${ORIGINAL_SHARED_DIR}/${CLUSTER_CONFIGURATION_FILE}"
1011
DUMMY_LAUNCH_TEMPLATES_CONFIG="${DUMMY_DIR}/${LAUNCH_TEMPLATES_CONFIG_FILE}"
@@ -13,6 +14,7 @@ DUMMY_INSTANCE_TYPES_DATA="${DUMMY_DIR}/${INSTANCE_TYPES_DATA_FILE}"
1314
ORIGINAL_INSTANCE_TYPES_DATA="${ORIGINAL_SHARED_DIR}/${INSTANCE_TYPES_DATA_FILE}"
1415
DUMMY_SCHEDULER_PLUGIN_SUBSTACK_OUTPUTS="${DUMMY_DIR}/${SCHEDULER_PLUGIN_SUBSTACK_OUTPUTS_FILE}"
1516
ORIGINAL_SCHEDULER_PLUGIN_SUBSTACK_OUTPUTS="${ORIGINAL_SHARED_DIR}/${SCHEDULER_PLUGIN_SUBSTACK_OUTPUTS_FILE}"
17+
HANDLER_STATIC_ENV="${ORIGINAL_SHARED_DIR}/${HANDLER_STATIC_ENV_FILE}"
1618

1719
syntax() {
1820
echo
@@ -23,6 +25,7 @@ syntax() {
2325
echo "--event-name The name of the event to trigger, possible values are:"
2426
echo " HeadInit, HeadConfigure, HeadFinalize, ComputeInit, ComputeConfigure, ComputeFinalize, HeadClusterUpdate, HeadComputeFleetStart or HeadComputeFleetStop."
2527
echo "--cluster-configuration Required local path to cluster configuration file, in YAML format."
28+
echo "--previous-cluster-configuration Required for the HeadClusterUpdate event. Local path to previous cluster configuration file, in YAML format."
2629
echo "--launch-templates-config Local path to launch templates config file, in JSON format. When not set, if instance is not created by cluster creation, dummy launch templates config is created, otherwise the one retrieved from cluster will be used"
2730
echo "--instance-types-data Local path to instance types data file, in JSON format. When not set, if instance is not created by cluster creation, dummy instance types data is created, otherwise the one retrieved from cluster will be used"
2831
echo "--scheduler-plugin-substack-outputs Local path to scheduler plugin substack outputs file, in JSON format. When not set, if instance is not created by cluster creation, dummy scheduler plugin substack outputs is created, otherwise the one retrieved from cluster will be used"
@@ -65,39 +68,46 @@ while [ $# -gt 0 ]; do
6568
set -x
6669
;;
6770
--event-name)
68-
event_name+=("$2")
71+
event_name="$2"
6972
shift
7073
;;
7174
--event-name=*)
72-
event_name+=("${1#*=}")
75+
event_name="${1#*=}"
7376
;;
7477
--cluster-configuration)
75-
cluster_configuration+=("$2")
78+
cluster_configuration="$2"
7679
shift
7780
;;
7881
--cluster-configuration=*)
79-
cluster_configuration+=("${1#*=}")
82+
cluster_configuration="${1#*=}"
83+
;;
84+
--previous-cluster-configuration)
85+
previous_cluster_configuration="$2"
86+
shift
87+
;;
88+
--previous-cluster-configuration=*)
89+
previous_cluster_configuration="${1#*=}"
8090
;;
8191
--launch-templates-config)
82-
launch_templates_config+=("$2")
92+
launch_templates_config="$2"
8393
shift
8494
;;
8595
--launch-templates-config=*)
86-
launch_templates_config+=("${1#*=}")
96+
launch_templates_config="${1#*=}"
8797
;;
8898
--instance-types-data)
89-
instance_types_data+=("$2")
99+
instance_types_data="$2"
90100
shift
91101
;;
92102
--instance-types-data=*)
93-
instance_types_data+=("${1#*=}")
103+
instance_types_data="${1#*=}"
94104
;;
95105
--scheduler-plugin-substack-outputs)
96-
scheduler_plugin_substack_outputs+=("$2")
106+
scheduler_plugin_substack_outputs="$2"
97107
shift
98108
;;
99109
--scheduler-plugin-substack-outputs=*)
100-
scheduler_plugin_substack_outputs+=("${1#*=}")
110+
scheduler_plugin_substack_outputs="${1#*=}"
101111
;;
102112
*)
103113
fail_syntax "Unrecognized option ($1)"
@@ -119,9 +129,13 @@ if [[ ! "${event_name}" =~ ^(HeadInit|HeadConfigure|HeadFinalize|ComputeInit|Com
119129
fail "Event name ${event_name} not supported"
120130
fi
121131

132+
if [[ "${event_name}" == "HeadClusterUpdate" ]] && [[ -z ${previous_cluster_configuration} ]]; then
133+
fail "Option --previous-cluster-configuration is required when event name is (${event_name})"
134+
fi
135+
122136
build_dummy_dna_json() {
123-
log "Building dummy dna.json in ${source_dna_json}"
124-
cat << EOF > ${source_dna_json}
137+
log "Building dummy dna.json in (${source_dna_json})"
138+
cat << EOF > "${source_dna_json}"
125139
{
126140
"cluster": {
127141
"stack_name": "dummy-stack-name",
@@ -143,6 +157,7 @@ build_dna_json() {
143157
"cluster": {
144158
"node_type": "${node_type}",
145159
"cluster_config_path": "$(readlink -f ${cluster_configuration})",
160+
"previous_cluster_config_path": "$(readlink -f ${previous_cluster_configuration})",
146161
"launch_templates_config_path": "$(readlink -f ${launch_templates_config})",
147162
"instance_types_data_path": "$(readlink -f ${instance_types_data})",
148163
"event_name": "${event_name}",
@@ -153,7 +168,7 @@ build_dna_json() {
153168
}
154169
EOF
155170

156-
jq --argfile f1 ${source_dna_json} --argfile f2 /tmp/extra.json -n '$f1 + $f2 | .cluster = $f1.cluster + $f2.cluster' > /tmp/dna.json
171+
jq --argfile f1 "${source_dna_json}" --argfile f2 /tmp/extra.json -n '$f1 + $f2 | .cluster = $f1.cluster + $f2.cluster' > /tmp/dna.json
157172
log "Generated dna.json:"
158173
log "$(cat /tmp/dna.json)"
159174
}
@@ -320,6 +335,13 @@ create_dummy_scheduler_plugin_substack_outputs() {
320335
EOF
321336
}
322337

338+
cleanup_previous_env() {
339+
if [[ -f "${HANDLER_STATIC_ENV}" ]]; then
340+
log "Found previously built event environment, deleting it..."
341+
rm -f "${HANDLER_STATIC_ENV}"
342+
fi
343+
}
344+
323345
# Main
324346
if [[ -f "/etc/chef/dna.json" ]]; then
325347
log "Instance created as part of cluster creation"
@@ -367,5 +389,5 @@ else
367389
fi
368390

369391
build_dna_json
370-
392+
cleanup_previous_env
371393
call_chef_run
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# frozen_string_literal: true
2+
3+
#
4+
# Cookbook:: aws-parallelcluster-scheduler-plugin
5+
# Recipe:: update
6+
#
7+
# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
8+
#
9+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
10+
# License. A copy of the License is located at
11+
#
12+
# http://aws.amazon.com/apache2.0/
13+
#
14+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
15+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
case node['cluster']['node_type']
19+
when 'HeadNode'
20+
include_recipe 'aws-parallelcluster-scheduler-plugin::update_head_node'
21+
else
22+
raise "node_type must be HeadNode"
23+
end

cookbooks/aws-parallelcluster-scheduler-plugin/resources/execute_event_handler.rb

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,14 @@ def build_env
5252
target_cluster_config = "#{node['cluster']['scheduler_plugin']['handler_dir']}/cluster-config.yaml"
5353
copy_config("cluster configuration", node.dig(:cluster, :cluster_config_path), target_cluster_config)
5454

55+
# copy the previous cluster config if it exists, otherwise delete the old target
56+
target_previous_cluster_config = "#{node['cluster']['scheduler_plugin']['handler_dir']}/previous-cluster-config.yaml"
57+
if ::File.exist?(node['cluster']['previous_cluster_config_path'])
58+
copy_config("previous cluster configuration", node.dig(:cluster, :previous_cluster_config_path), target_previous_cluster_config)
59+
elsif ::File.exist?(target_previous_cluster_config)
60+
::File.delete(target_previous_cluster_config)
61+
end
62+
5563
# copy launch templates config
5664
target_launch_templates = "#{node['cluster']['scheduler_plugin']['handler_dir']}/launch-templates-config.json"
5765
copy_config("launch templates", node.dig(:cluster, :launch_templates_config_path), target_launch_templates)
@@ -105,7 +113,7 @@ def build_env
105113
end
106114

107115
# Merge env with dynamic env
108-
env.merge!(build_dynamic_env)
116+
env.merge!(build_dynamic_env(target_previous_cluster_config))
109117
env
110118
end
111119

@@ -117,25 +125,21 @@ def copy_config(config_type, source_config, target_config)
117125
FileUtils.chown(node['cluster']['scheduler_plugin']['user'], node['cluster']['scheduler_plugin']['group'], target_config)
118126
end
119127

120-
def build_dynamic_env
128+
def build_dynamic_env(target_previous_cluster_config)
121129
Chef::Log.info("Building dynamic handler environment")
122130
env = {}
123131

124-
# PCLUSTER_EC2_INSTANCE_TYPE
132+
if ::File.exist?(target_previous_cluster_config)
133+
env.merge!({ 'PCLUSTER_CLUSTER_CONFIG_OLD' => target_previous_cluster_config })
134+
end
125135
env.merge!(build_hash_from_node('PCLUSTER_EC2_INSTANCE_TYPE', true, :ec2, :instance_type))
126136

127137
case node['cluster']['node_type']
128138
when 'ComputeFleet'
129-
# PCLUSTER_QUEUE_NAME
130139
env.merge!(build_hash_from_node('PCLUSTER_QUEUE_NAME', false, :cluster, :scheduler_queue_name))
131-
132-
# PCLUSTER_COMPUTE_RESOURCE_NAME
133140
env.merge!(build_hash_from_node('PCLUSTER_COMPUTE_RESOURCE_NAME', false, :cluster, :scheduler_compute_resource_name))
134-
135-
# PCLUSTER_NODE_TYPE
136141
env.merge!({ 'PCLUSTER_NODE_TYPE' => 'compute' })
137142
when 'HeadNode'
138-
# PCLUSTER_NODE_TYPE
139143
env.merge!({ 'PCLUSTER_NODE_TYPE' => 'head' })
140144
end
141145

@@ -165,8 +169,6 @@ def build_static_env(target_cluster_config, target_launch_templates, target_inst
165169
env.merge!({ 'PCLUSTER_PYTHON_ROOT' => "#{node['cluster']['scheduler_plugin']['virtualenv_path']}/bin" })
166170
env.merge!({ 'PATH' => "#{node['cluster']['scheduler_plugin']['virtualenv_path']}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/aws/bin:#{node['cluster']['scheduler_plugin']['home']}/.local/bin:#{node['cluster']['scheduler_plugin']['home']}/bin" })
167171
env.merge!(setup_proxy(:cluster, :proxy))
168-
# PCLUSTER_CLUSTER_CONFIG_OLD
169-
# TODO: to be implemented
170172

171173
env
172174
end

0 commit comments

Comments
 (0)