mirror of
https://github.com/ysoftdevs/terraform-aws-eks.git
synced 2026-01-16 16:47:20 +01:00
feat: Add support for creating efa-only network interfaces (#3196)
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
repos:
|
repos:
|
||||||
- repo: https://github.com/antonbabenko/pre-commit-terraform
|
- repo: https://github.com/antonbabenko/pre-commit-terraform
|
||||||
rev: v1.96.1
|
rev: v1.96.2
|
||||||
hooks:
|
hooks:
|
||||||
- id: terraform_fmt
|
- id: terraform_fmt
|
||||||
- id: terraform_docs
|
- id: terraform_docs
|
||||||
|
|||||||
@@ -132,9 +132,11 @@ module "eks_managed_node_group" {
|
|||||||
| <a name="input_disable_api_termination"></a> [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no |
|
| <a name="input_disable_api_termination"></a> [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no |
|
||||||
| <a name="input_disk_size"></a> [disk\_size](#input\_disk\_size) | Disk size in GiB for nodes. Defaults to `20`. Only valid when `use_custom_launch_template` = `false` | `number` | `null` | no |
|
| <a name="input_disk_size"></a> [disk\_size](#input\_disk\_size) | Disk size in GiB for nodes. Defaults to `20`. Only valid when `use_custom_launch_template` = `false` | `number` | `null` | no |
|
||||||
| <a name="input_ebs_optimized"></a> [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance(s) will be EBS-optimized | `bool` | `null` | no |
|
| <a name="input_ebs_optimized"></a> [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance(s) will be EBS-optimized | `bool` | `null` | no |
|
||||||
|
| <a name="input_efa_indices"></a> [efa\_indices](#input\_efa\_indices) | The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true` | `list(number)` | <pre>[<br/> 0<br/>]</pre> | no |
|
||||||
| <a name="input_elastic_gpu_specifications"></a> [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no |
|
| <a name="input_elastic_gpu_specifications"></a> [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no |
|
||||||
| <a name="input_elastic_inference_accelerator"></a> [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no |
|
| <a name="input_elastic_inference_accelerator"></a> [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no |
|
||||||
| <a name="input_enable_bootstrap_user_data"></a> [enable\_bootstrap\_user\_data](#input\_enable\_bootstrap\_user\_data) | Determines whether the bootstrap configurations are populated within the user data template. Only valid when using a custom AMI via `ami_id` | `bool` | `false` | no |
|
| <a name="input_enable_bootstrap_user_data"></a> [enable\_bootstrap\_user\_data](#input\_enable\_bootstrap\_user\_data) | Determines whether the bootstrap configurations are populated within the user data template. Only valid when using a custom AMI via `ami_id` | `bool` | `false` | no |
|
||||||
|
| <a name="input_enable_efa_only"></a> [enable\_efa\_only](#input\_enable\_efa\_only) | Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later | `bool` | `false` | no |
|
||||||
| <a name="input_enable_efa_support"></a> [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no |
|
| <a name="input_enable_efa_support"></a> [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no |
|
||||||
| <a name="input_enable_monitoring"></a> [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no |
|
| <a name="input_enable_monitoring"></a> [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no |
|
||||||
| <a name="input_enclave_options"></a> [enclave\_options](#input\_enclave\_options) | Enable Nitro Enclaves on launched instances | `map(string)` | `{}` | no |
|
| <a name="input_enclave_options"></a> [enclave\_options](#input\_enclave\_options) | Enable Nitro Enclaves on launched instances | `map(string)` | `{}` | no |
|
||||||
|
|||||||
@@ -44,13 +44,14 @@ locals {
|
|||||||
efa_instance_type = try(element(var.instance_types, 0), "")
|
efa_instance_type = try(element(var.instance_types, 0), "")
|
||||||
num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0)
|
num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0)
|
||||||
|
|
||||||
|
# Primary network interface must be EFA, remaining can be EFA or EFA-only
|
||||||
efa_network_interfaces = [
|
efa_network_interfaces = [
|
||||||
for i in range(local.num_network_cards) : {
|
for i in range(local.num_network_cards) : {
|
||||||
associate_public_ip_address = false
|
associate_public_ip_address = false
|
||||||
delete_on_termination = true
|
delete_on_termination = true
|
||||||
device_index = i == 0 ? 0 : 1
|
device_index = i == 0 ? 0 : 1
|
||||||
network_card_index = i
|
network_card_index = i
|
||||||
interface_type = "efa"
|
interface_type = var.enable_efa_only ? contains(concat([0], var.efa_indices), i) ? "efa" : "efa-only" : "efa"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -285,6 +285,19 @@ variable "enable_efa_support" {
|
|||||||
default = false
|
default = false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# TODO - make this true by default at next breaking change (remove variable, only pass indices)
|
||||||
|
variable "enable_efa_only" {
|
||||||
|
description = "Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later"
|
||||||
|
type = bool
|
||||||
|
default = false
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "efa_indices" {
|
||||||
|
description = "The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true`"
|
||||||
|
type = list(number)
|
||||||
|
default = [0]
|
||||||
|
}
|
||||||
|
|
||||||
variable "network_interfaces" {
|
variable "network_interfaces" {
|
||||||
description = "Customize network interfaces to be attached at instance boot time"
|
description = "Customize network interfaces to be attached at instance boot time"
|
||||||
type = list(any)
|
type = list(any)
|
||||||
|
|||||||
@@ -120,8 +120,10 @@ module "self_managed_node_group" {
|
|||||||
| <a name="input_desired_size_type"></a> [desired\_size\_type](#input\_desired\_size\_type) | The unit of measurement for the value specified for `desired_size`. Supported for attribute-based instance type selection only. Valid values: `units`, `vcpu`, `memory-mib` | `string` | `null` | no |
|
| <a name="input_desired_size_type"></a> [desired\_size\_type](#input\_desired\_size\_type) | The unit of measurement for the value specified for `desired_size`. Supported for attribute-based instance type selection only. Valid values: `units`, `vcpu`, `memory-mib` | `string` | `null` | no |
|
||||||
| <a name="input_disable_api_termination"></a> [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no |
|
| <a name="input_disable_api_termination"></a> [disable\_api\_termination](#input\_disable\_api\_termination) | If true, enables EC2 instance termination protection | `bool` | `null` | no |
|
||||||
| <a name="input_ebs_optimized"></a> [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance will be EBS-optimized | `bool` | `null` | no |
|
| <a name="input_ebs_optimized"></a> [ebs\_optimized](#input\_ebs\_optimized) | If true, the launched EC2 instance will be EBS-optimized | `bool` | `null` | no |
|
||||||
|
| <a name="input_efa_indices"></a> [efa\_indices](#input\_efa\_indices) | The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true` | `list(number)` | <pre>[<br/> 0<br/>]</pre> | no |
|
||||||
| <a name="input_elastic_gpu_specifications"></a> [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no |
|
| <a name="input_elastic_gpu_specifications"></a> [elastic\_gpu\_specifications](#input\_elastic\_gpu\_specifications) | The elastic GPU to attach to the instance | `any` | `{}` | no |
|
||||||
| <a name="input_elastic_inference_accelerator"></a> [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no |
|
| <a name="input_elastic_inference_accelerator"></a> [elastic\_inference\_accelerator](#input\_elastic\_inference\_accelerator) | Configuration block containing an Elastic Inference Accelerator to attach to the instance | `map(string)` | `{}` | no |
|
||||||
|
| <a name="input_enable_efa_only"></a> [enable\_efa\_only](#input\_enable\_efa\_only) | Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later | `bool` | `false` | no |
|
||||||
| <a name="input_enable_efa_support"></a> [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no |
|
| <a name="input_enable_efa_support"></a> [enable\_efa\_support](#input\_enable\_efa\_support) | Determines whether to enable Elastic Fabric Adapter (EFA) support | `bool` | `false` | no |
|
||||||
| <a name="input_enable_monitoring"></a> [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no |
|
| <a name="input_enable_monitoring"></a> [enable\_monitoring](#input\_enable\_monitoring) | Enables/disables detailed monitoring | `bool` | `true` | no |
|
||||||
| <a name="input_enabled_metrics"></a> [enabled\_metrics](#input\_enabled\_metrics) | A list of metrics to collect. The allowed values are `GroupDesiredCapacity`, `GroupInServiceCapacity`, `GroupPendingCapacity`, `GroupMinSize`, `GroupMaxSize`, `GroupInServiceInstances`, `GroupPendingInstances`, `GroupStandbyInstances`, `GroupStandbyCapacity`, `GroupTerminatingCapacity`, `GroupTerminatingInstances`, `GroupTotalCapacity`, `GroupTotalInstances` | `list(string)` | `[]` | no |
|
| <a name="input_enabled_metrics"></a> [enabled\_metrics](#input\_enabled\_metrics) | A list of metrics to collect. The allowed values are `GroupDesiredCapacity`, `GroupInServiceCapacity`, `GroupPendingCapacity`, `GroupMinSize`, `GroupMaxSize`, `GroupInServiceInstances`, `GroupPendingInstances`, `GroupStandbyInstances`, `GroupStandbyCapacity`, `GroupTerminatingCapacity`, `GroupTerminatingInstances`, `GroupTotalCapacity`, `GroupTotalInstances` | `list(string)` | `[]` | no |
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ module "user_data" {
|
|||||||
################################################################################
|
################################################################################
|
||||||
|
|
||||||
data "aws_ec2_instance_type" "this" {
|
data "aws_ec2_instance_type" "this" {
|
||||||
count = local.enable_efa_support ? 1 : 0
|
count = var.create && var.enable_efa_support ? 1 : 0
|
||||||
|
|
||||||
instance_type = var.instance_type
|
instance_type = var.instance_type
|
||||||
}
|
}
|
||||||
@@ -101,13 +101,14 @@ locals {
|
|||||||
instance_type_provided = var.instance_type != ""
|
instance_type_provided = var.instance_type != ""
|
||||||
num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0)
|
num_network_cards = try(data.aws_ec2_instance_type.this[0].maximum_network_cards, 0)
|
||||||
|
|
||||||
|
# Primary network interface must be EFA, remaining can be EFA or EFA-only
|
||||||
efa_network_interfaces = [
|
efa_network_interfaces = [
|
||||||
for i in range(local.num_network_cards) : {
|
for i in range(local.num_network_cards) : {
|
||||||
associate_public_ip_address = false
|
associate_public_ip_address = false
|
||||||
delete_on_termination = true
|
delete_on_termination = true
|
||||||
device_index = i == 0 ? 0 : 1
|
device_index = i == 0 ? 0 : 1
|
||||||
network_card_index = i
|
network_card_index = i
|
||||||
interface_type = "efa"
|
interface_type = var.enable_efa_only ? contains(concat([0], var.efa_indices), i) ? "efa" : "efa-only" : "efa"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@@ -334,6 +334,19 @@ variable "enable_efa_support" {
|
|||||||
default = false
|
default = false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# TODO - make this true by default at next breaking change (remove variable, only pass indices)
|
||||||
|
variable "enable_efa_only" {
|
||||||
|
description = "Determines whether to enable EFA (`false`, default) or EFA and EFA-only (`true`) network interfaces. Note: requires vpc-cni version `v1.18.4` or later"
|
||||||
|
type = bool
|
||||||
|
default = false
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "efa_indices" {
|
||||||
|
description = "The indices of the network interfaces that should be EFA-enabled. Only valid when `enable_efa_support` = `true`"
|
||||||
|
type = list(number)
|
||||||
|
default = [0]
|
||||||
|
}
|
||||||
|
|
||||||
variable "metadata_options" {
|
variable "metadata_options" {
|
||||||
description = "Customize the metadata options for the instance"
|
description = "Customize the metadata options for the instance"
|
||||||
type = map(string)
|
type = map(string)
|
||||||
|
|||||||
@@ -375,6 +375,8 @@ module "eks_managed_node_group" {
|
|||||||
metadata_options = try(each.value.metadata_options, var.eks_managed_node_group_defaults.metadata_options, local.metadata_options)
|
metadata_options = try(each.value.metadata_options, var.eks_managed_node_group_defaults.metadata_options, local.metadata_options)
|
||||||
enable_monitoring = try(each.value.enable_monitoring, var.eks_managed_node_group_defaults.enable_monitoring, true)
|
enable_monitoring = try(each.value.enable_monitoring, var.eks_managed_node_group_defaults.enable_monitoring, true)
|
||||||
enable_efa_support = try(each.value.enable_efa_support, var.eks_managed_node_group_defaults.enable_efa_support, false)
|
enable_efa_support = try(each.value.enable_efa_support, var.eks_managed_node_group_defaults.enable_efa_support, false)
|
||||||
|
enable_efa_only = try(each.value.enable_efa_only, var.eks_managed_node_group_defaults.enable_efa_only, false)
|
||||||
|
efa_indices = try(each.value.efa_indices, var.eks_managed_node_group_defaults.efa_indices, [0])
|
||||||
create_placement_group = try(each.value.create_placement_group, var.eks_managed_node_group_defaults.create_placement_group, false)
|
create_placement_group = try(each.value.create_placement_group, var.eks_managed_node_group_defaults.create_placement_group, false)
|
||||||
placement = try(each.value.placement, var.eks_managed_node_group_defaults.placement, {})
|
placement = try(each.value.placement, var.eks_managed_node_group_defaults.placement, {})
|
||||||
placement_group_az = try(each.value.placement_group_az, var.eks_managed_node_group_defaults.placement_group_az, null)
|
placement_group_az = try(each.value.placement_group_az, var.eks_managed_node_group_defaults.placement_group_az, null)
|
||||||
@@ -526,6 +528,8 @@ module "self_managed_node_group" {
|
|||||||
metadata_options = try(each.value.metadata_options, var.self_managed_node_group_defaults.metadata_options, local.metadata_options)
|
metadata_options = try(each.value.metadata_options, var.self_managed_node_group_defaults.metadata_options, local.metadata_options)
|
||||||
enable_monitoring = try(each.value.enable_monitoring, var.self_managed_node_group_defaults.enable_monitoring, true)
|
enable_monitoring = try(each.value.enable_monitoring, var.self_managed_node_group_defaults.enable_monitoring, true)
|
||||||
enable_efa_support = try(each.value.enable_efa_support, var.self_managed_node_group_defaults.enable_efa_support, false)
|
enable_efa_support = try(each.value.enable_efa_support, var.self_managed_node_group_defaults.enable_efa_support, false)
|
||||||
|
enable_efa_only = try(each.value.enable_efa_only, var.self_managed_node_group_defaults.enable_efa_only, false)
|
||||||
|
efa_indices = try(each.value.efa_indices, var.self_managed_node_group_defaults.efa_indices, [0])
|
||||||
network_interfaces = try(each.value.network_interfaces, var.self_managed_node_group_defaults.network_interfaces, [])
|
network_interfaces = try(each.value.network_interfaces, var.self_managed_node_group_defaults.network_interfaces, [])
|
||||||
placement = try(each.value.placement, var.self_managed_node_group_defaults.placement, {})
|
placement = try(each.value.placement, var.self_managed_node_group_defaults.placement, {})
|
||||||
maintenance_options = try(each.value.maintenance_options, var.self_managed_node_group_defaults.maintenance_options, {})
|
maintenance_options = try(each.value.maintenance_options, var.self_managed_node_group_defaults.maintenance_options, {})
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ module "eks" {
|
|||||||
control_plane_subnet_ids = module.vpc.intra_subnets
|
control_plane_subnet_ids = module.vpc.intra_subnets
|
||||||
|
|
||||||
eks_managed_node_group_defaults = {
|
eks_managed_node_group_defaults = {
|
||||||
ami_type = "AL2_x86_64"
|
ami_type = "AL2023_x86_64_STANDARD"
|
||||||
instance_types = ["m6i.large", "m5.large", "m5n.large", "m5zn.large"]
|
instance_types = ["m6i.large", "m5.large", "m5n.large", "m5zn.large"]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -184,7 +184,7 @@ module "eks" {
|
|||||||
|
|
||||||
# Use a custom AMI
|
# Use a custom AMI
|
||||||
custom_ami = {
|
custom_ami = {
|
||||||
ami_type = "AL2_ARM_64"
|
ami_type = "AL2023_ARM_64_STANDARD"
|
||||||
# Current default AMI used by managed node groups - pseudo "custom"
|
# Current default AMI used by managed node groups - pseudo "custom"
|
||||||
ami_id = data.aws_ami.eks_default_arm.image_id
|
ami_id = data.aws_ami.eks_default_arm.image_id
|
||||||
|
|
||||||
@@ -211,13 +211,28 @@ module "eks" {
|
|||||||
ami_id = data.aws_ami.eks_default.image_id
|
ami_id = data.aws_ami.eks_default.image_id
|
||||||
enable_bootstrap_user_data = true
|
enable_bootstrap_user_data = true
|
||||||
|
|
||||||
pre_bootstrap_user_data = <<-EOT
|
cloudinit_pre_nodeadm = [{
|
||||||
export FOO=bar
|
content = <<-EOT
|
||||||
EOT
|
---
|
||||||
|
apiVersion: node.eks.aws/v1alpha1
|
||||||
|
kind: NodeConfig
|
||||||
|
spec:
|
||||||
|
kubelet:
|
||||||
|
config:
|
||||||
|
shutdownGracePeriod: 30s
|
||||||
|
featureGates:
|
||||||
|
DisableKubeletCloudCredentialProviders: true
|
||||||
|
EOT
|
||||||
|
content_type = "application/node.eks.aws"
|
||||||
|
}]
|
||||||
|
|
||||||
post_bootstrap_user_data = <<-EOT
|
# This is only possible with a custom AMI or self-managed node group
|
||||||
echo "you are free little kubelet!"
|
cloudinit_post_nodeadm = [{
|
||||||
EOT
|
content = <<-EOT
|
||||||
|
echo "All done"
|
||||||
|
EOT
|
||||||
|
content_type = "text/x-shellscript; charset=\"us-ascii\""
|
||||||
|
}]
|
||||||
|
|
||||||
capacity_type = "SPOT"
|
capacity_type = "SPOT"
|
||||||
force_update_version = true
|
force_update_version = true
|
||||||
@@ -227,14 +242,6 @@ module "eks" {
|
|||||||
GithubOrg = "terraform-aws-modules"
|
GithubOrg = "terraform-aws-modules"
|
||||||
}
|
}
|
||||||
|
|
||||||
taints = [
|
|
||||||
{
|
|
||||||
key = "dedicated"
|
|
||||||
value = "gpuGroup"
|
|
||||||
effect = "NO_SCHEDULE"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
update_config = {
|
update_config = {
|
||||||
max_unavailable_percentage = 33 # or set `max_unavailable`
|
max_unavailable_percentage = 33 # or set `max_unavailable`
|
||||||
}
|
}
|
||||||
@@ -306,19 +313,53 @@ module "eks" {
|
|||||||
# Can be enabled when appropriate for testing/validation
|
# Can be enabled when appropriate for testing/validation
|
||||||
create = false
|
create = false
|
||||||
|
|
||||||
ami_type = "AL2_x86_64_GPU"
|
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
|
||||||
instance_types = ["trn1n.32xlarge"]
|
# for accelerated workloads w/ EFA
|
||||||
|
ami_type = "AL2023_x86_64_NVIDIA"
|
||||||
|
instance_types = ["p5e.48xlarge"]
|
||||||
|
|
||||||
enable_efa_support = true
|
# Mount instance store volumes in RAID-0 for kubelet and containerd
|
||||||
pre_bootstrap_user_data = <<-EOT
|
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
|
||||||
# Mount NVME instance store volumes since they are typically
|
cloudinit_pre_nodeadm = [
|
||||||
# available on instances that support EFA
|
{
|
||||||
setup-local-disks raid0
|
content_type = "application/node.eks.aws"
|
||||||
EOT
|
content = <<-EOT
|
||||||
|
---
|
||||||
|
apiVersion: node.eks.aws/v1alpha1
|
||||||
|
kind: NodeConfig
|
||||||
|
spec:
|
||||||
|
instance:
|
||||||
|
localStorage:
|
||||||
|
strategy: RAID0
|
||||||
|
EOT
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
min_size = 2
|
# This will:
|
||||||
max_size = 2
|
# 1. Create a placement group to place the instances close to one another
|
||||||
desired_size = 2
|
# 2. Ignore subnets that reside in AZs that do not support the instance type
|
||||||
|
# 3. Expose all of the available EFA interfaces on the launch template
|
||||||
|
enable_efa_support = true
|
||||||
|
enable_efa_only = true
|
||||||
|
efa_indices = [0, 4, 8, 12]
|
||||||
|
|
||||||
|
min_size = 1
|
||||||
|
max_size = 1
|
||||||
|
desired_size = 1
|
||||||
|
|
||||||
|
labels = {
|
||||||
|
"vpc.amazonaws.com/efa.present" = "true"
|
||||||
|
"nvidia.com/gpu.present" = "true"
|
||||||
|
}
|
||||||
|
|
||||||
|
taints = {
|
||||||
|
# Ensure only GPU workloads are scheduled on this node group
|
||||||
|
gpu = {
|
||||||
|
key = "nvidia.com/gpu"
|
||||||
|
value = "true"
|
||||||
|
effect = "NO_SCHEDULE"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -532,7 +573,7 @@ data "aws_ami" "eks_default" {
|
|||||||
|
|
||||||
filter {
|
filter {
|
||||||
name = "name"
|
name = "name"
|
||||||
values = ["amazon-eks-node-${local.cluster_version}-v*"]
|
values = ["amazon-eks-node-al2023-x86_64-standard-${local.cluster_version}-v*"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -542,7 +583,7 @@ data "aws_ami" "eks_default_arm" {
|
|||||||
|
|
||||||
filter {
|
filter {
|
||||||
name = "name"
|
name = "name"
|
||||||
values = ["amazon-eks-arm64-node-${local.cluster_version}-v*"]
|
values = ["amazon-eks-node-al2023-arm64-standard-${local.cluster_version}-v*"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -61,6 +61,9 @@ module "eks" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
self_managed_node_group_defaults = {
|
self_managed_node_group_defaults = {
|
||||||
|
ami_type = "AL2023_x86_64_STANDARD"
|
||||||
|
ami_id = data.aws_ami.eks_default.image_id
|
||||||
|
|
||||||
# enable discovery of autoscaling groups by cluster-autoscaler
|
# enable discovery of autoscaling groups by cluster-autoscaler
|
||||||
autoscaling_group_tags = {
|
autoscaling_group_tags = {
|
||||||
"k8s.io/cluster-autoscaler/enabled" : true,
|
"k8s.io/cluster-autoscaler/enabled" : true,
|
||||||
@@ -72,29 +75,6 @@ module "eks" {
|
|||||||
# Default node group - as provisioned by the module defaults
|
# Default node group - as provisioned by the module defaults
|
||||||
default_node_group = {}
|
default_node_group = {}
|
||||||
|
|
||||||
# AL2023 node group utilizing new user data format which utilizes nodeadm
|
|
||||||
# to join nodes to the cluster (instead of /etc/eks/bootstrap.sh)
|
|
||||||
al2023_nodeadm = {
|
|
||||||
ami_type = "AL2023_x86_64_STANDARD"
|
|
||||||
|
|
||||||
cloudinit_pre_nodeadm = [
|
|
||||||
{
|
|
||||||
content_type = "application/node.eks.aws"
|
|
||||||
content = <<-EOT
|
|
||||||
---
|
|
||||||
apiVersion: node.eks.aws/v1alpha1
|
|
||||||
kind: NodeConfig
|
|
||||||
spec:
|
|
||||||
kubelet:
|
|
||||||
config:
|
|
||||||
shutdownGracePeriod: 30s
|
|
||||||
featureGates:
|
|
||||||
DisableKubeletCloudCredentialProviders: true
|
|
||||||
EOT
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
# Bottlerocket node group
|
# Bottlerocket node group
|
||||||
bottlerocket = {
|
bottlerocket = {
|
||||||
name = "bottlerocket-self-mng"
|
name = "bottlerocket-self-mng"
|
||||||
@@ -138,8 +118,18 @@ module "eks" {
|
|||||||
max_size = 5
|
max_size = 5
|
||||||
desired_size = 2
|
desired_size = 2
|
||||||
|
|
||||||
ami_type = "AL2_x86_64"
|
cloudinit_pre_nodeadm = [{
|
||||||
bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'"
|
content = <<-EOT
|
||||||
|
---
|
||||||
|
apiVersion: node.eks.aws/v1alpha1
|
||||||
|
kind: NodeConfig
|
||||||
|
spec:
|
||||||
|
kubelet:
|
||||||
|
flags:
|
||||||
|
- --node-labels=node.kubernetes.io/lifecycle=spot
|
||||||
|
EOT
|
||||||
|
content_type = "application/node.eks.aws"
|
||||||
|
}]
|
||||||
|
|
||||||
use_mixed_instances_policy = true
|
use_mixed_instances_policy = true
|
||||||
mixed_instances_policy = {
|
mixed_instances_policy = {
|
||||||
@@ -173,16 +163,18 @@ module "eks" {
|
|||||||
max_size = 7
|
max_size = 7
|
||||||
desired_size = 1
|
desired_size = 1
|
||||||
|
|
||||||
ami_id = data.aws_ami.eks_default.id
|
cloudinit_pre_nodeadm = [{
|
||||||
ami_type = "AL2_x86_64"
|
content = <<-EOT
|
||||||
|
---
|
||||||
pre_bootstrap_user_data = <<-EOT
|
apiVersion: node.eks.aws/v1alpha1
|
||||||
export FOO=bar
|
kind: NodeConfig
|
||||||
EOT
|
spec:
|
||||||
|
kubelet:
|
||||||
post_bootstrap_user_data = <<-EOT
|
flags:
|
||||||
echo "you are free little kubelet!"
|
- --node-labels=node.kubernetes.io/lifecycle=spot
|
||||||
EOT
|
EOT
|
||||||
|
content_type = "application/node.eks.aws"
|
||||||
|
}]
|
||||||
|
|
||||||
instance_type = "m6i.large"
|
instance_type = "m6i.large"
|
||||||
|
|
||||||
@@ -215,9 +207,23 @@ module "eks" {
|
|||||||
max_size = 2
|
max_size = 2
|
||||||
desired_size = 1
|
desired_size = 1
|
||||||
|
|
||||||
ami_type = "AL2_x86_64"
|
|
||||||
bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'"
|
bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'"
|
||||||
|
|
||||||
|
cloudinit_pre_nodeadm = [{
|
||||||
|
content = <<-EOT
|
||||||
|
---
|
||||||
|
apiVersion: node.eks.aws/v1alpha1
|
||||||
|
kind: NodeConfig
|
||||||
|
spec:
|
||||||
|
kubelet:
|
||||||
|
config:
|
||||||
|
shutdownGracePeriod: 30s
|
||||||
|
featureGates:
|
||||||
|
DisableKubeletCloudCredentialProviders: true
|
||||||
|
EOT
|
||||||
|
content_type = "application/node.eks.aws"
|
||||||
|
}]
|
||||||
|
|
||||||
instance_type = null
|
instance_type = null
|
||||||
|
|
||||||
# launch template configuration
|
# launch template configuration
|
||||||
@@ -290,19 +296,53 @@ module "eks" {
|
|||||||
# Can be enabled when appropriate for testing/validation
|
# Can be enabled when appropriate for testing/validation
|
||||||
create = false
|
create = false
|
||||||
|
|
||||||
ami_type = "AL2_x86_64_GPU"
|
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
|
||||||
instance_type = "trn1n.32xlarge"
|
# for accelerated workloads w/ EFA
|
||||||
|
ami_type = "AL2023_x86_64_NVIDIA"
|
||||||
|
instance_type = "p5e.48xlarge"
|
||||||
|
|
||||||
enable_efa_support = true
|
# Mount instance store volumes in RAID-0 for kubelet and containerd
|
||||||
pre_bootstrap_user_data = <<-EOT
|
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
|
||||||
# Mount NVME instance store volumes since they are typically
|
cloudinit_pre_nodeadm = [
|
||||||
# available on instances that support EFA
|
{
|
||||||
setup-local-disks raid0
|
content_type = "application/node.eks.aws"
|
||||||
EOT
|
content = <<-EOT
|
||||||
|
---
|
||||||
|
apiVersion: node.eks.aws/v1alpha1
|
||||||
|
kind: NodeConfig
|
||||||
|
spec:
|
||||||
|
instance:
|
||||||
|
localStorage:
|
||||||
|
strategy: RAID0
|
||||||
|
EOT
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# This will:
|
||||||
|
# 1. Create a placement group to place the instances close to one another
|
||||||
|
# 2. Ignore subnets that reside in AZs that do not support the instance type
|
||||||
|
# 3. Expose all of the available EFA interfaces on the launch template
|
||||||
|
enable_efa_support = true
|
||||||
|
enable_efa_only = true
|
||||||
|
efa_indices = [0, 4, 8, 12]
|
||||||
|
|
||||||
min_size = 2
|
min_size = 2
|
||||||
max_size = 2
|
max_size = 2
|
||||||
desired_size = 2
|
desired_size = 2
|
||||||
|
|
||||||
|
labels = {
|
||||||
|
"vpc.amazonaws.com/efa.present" = "true"
|
||||||
|
"nvidia.com/gpu.present" = "true"
|
||||||
|
}
|
||||||
|
|
||||||
|
taints = {
|
||||||
|
# Ensure only GPU workloads are scheduled on this node group
|
||||||
|
gpu = {
|
||||||
|
key = "nvidia.com/gpu"
|
||||||
|
value = "true"
|
||||||
|
effect = "NO_SCHEDULE"
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -354,7 +394,7 @@ data "aws_ami" "eks_default" {
|
|||||||
|
|
||||||
filter {
|
filter {
|
||||||
name = "name"
|
name = "name"
|
||||||
values = ["amazon-eks-node-${local.cluster_version}-v*"]
|
values = ["amazon-eks-node-al2023-x86_64-standard-${local.cluster_version}-v*"]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user