Mirror of https://github.com/ysoftdevs/terraform-aws-eks.git, synced 2026-01-11 22:41:43 +01:00
feat: Add support for Auto Scaling Group Instance Refresh for self-managed worker groups (#1224)
Co-authored-by: Thierno IB. BARRY <ibrahima.br@gmail.com>
.gitignore (vendored, 1 line added)

@@ -2,6 +2,7 @@ eks-admin-cluster-role-binding.yaml
 eks-admin-service-account.yaml
 config-map-aws-auth*.yaml
 kubeconfig_*
+.idea

 #################################################################
 # Default .gitignore content for all terraform-aws-modules below
README.md

@@ -27,6 +27,8 @@ An example of harming update was the removal of several commonly used, but depre

By default, this module manages the `aws-auth` configmap for you (`manage_aws_auth=true`). To work around the following [issue](https://github.com/aws/containers-roadmap/issues/654), where the EKS cluster reports `ACTIVE` before it is actually ready, we implemented "retry" logic in a fork of the http provider, https://github.com/terraform-aws-modules/terraform-provider-http. This fork also adds support for a self-signed CA certificate. The original PR can be found at https://github.com/hashicorp/terraform-provider-http/pull/29.
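As a rough sketch of that default behaviour (not part of this diff; the module source string and cluster values below are illustrative assumptions), configmap management is controlled by a single module input:

```hcl
module "eks" {
  source          = "terraform-aws-modules/eks/aws"
  cluster_name    = "example" # illustrative value
  cluster_version = "1.19"
  subnets         = module.vpc.public_subnets
  vpc_id          = module.vpc.vpc_id

  # Keep the default so the module writes the aws-auth configmap itself,
  # waiting (via the forked http provider) until the API server responds.
  manage_aws_auth = true
}
```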
Setting `instance_refresh_enabled` to `true` will recreate your worker nodes without draining them first. It is therefore recommended to install [aws-node-termination-handler](https://github.com/aws/aws-node-termination-handler) for proper node draining. A complete example can be found in [instance_refresh](examples/instance_refresh).
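A minimal sketch of enabling instance refresh on a self-managed worker group, abridged from the example added by this commit (the cluster inputs are placeholders):

```hcl
module "eks" {
  source          = "terraform-aws-modules/eks/aws"
  cluster_name    = "example" # illustrative value
  cluster_version = "1.19"
  subnets         = module.vpc.public_subnets
  vpc_id          = module.vpc.vpc_id

  worker_groups_launch_template = [
    {
      name                     = "refresh"
      asg_max_size             = 2
      asg_desired_capacity     = 2
      instance_refresh_enabled = true
      # Also refresh when instance tags change, not only on launch template changes.
      instance_refresh_triggers = ["tag"]
    },
  ]
}
```

Pair this with aws-node-termination-handler (see the new example below) so that instances replaced by the refresh are cordoned and drained first.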
## Usage example

A full example leveraging other community modules is contained in the [examples/basic directory](https://github.com/terraform-aws-modules/terraform-aws-eks/tree/master/examples/basic).
@@ -155,7 +157,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a

| <a name="provider_http"></a> [http](#provider\_http) | >= 2.3.0 |
| <a name="provider_kubernetes"></a> [kubernetes](#provider\_kubernetes) | >= 1.11.1 |
| <a name="provider_local"></a> [local](#provider\_local) | >= 1.4 |
| <a name="provider_random"></a> [random](#provider\_random) | >= 2.1 |

## Modules

@@ -202,8 +203,6 @@ MIT Licensed. See [LICENSE](https://github.com/terraform-aws-modules/terraform-a

| [aws_security_group_rule.workers_ingress_self](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/security_group_rule) | resource |
| [kubernetes_config_map.aws_auth](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/config_map) | resource |
| [local_file.kubeconfig](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource |
| [random_pet.workers](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
| [random_pet.workers_launch_template](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource |
| [aws_ami.eks_worker](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
| [aws_ami.eks_worker_windows](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
examples/instance_refresh/main.tf (new file, 234 lines)

@@ -0,0 +1,234 @@
provider "aws" {
  region = var.region
}

data "aws_caller_identity" "current" {}

data "aws_eks_cluster" "cluster" {
  name = module.eks.cluster_id
}

data "aws_eks_cluster_auth" "cluster" {
  name = module.eks.cluster_id
}

provider "kubernetes" {
  host                   = data.aws_eks_cluster.cluster.endpoint
  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
  token                  = data.aws_eks_cluster_auth.cluster.token
  load_config_file       = false
}

provider "helm" {
  kubernetes {
    host                   = data.aws_eks_cluster.cluster.endpoint
    cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data)
    token                  = data.aws_eks_cluster_auth.cluster.token
  }
}

data "aws_availability_zones" "available" {
}

locals {
  cluster_name = "test-refresh-${random_string.suffix.result}"
}

resource "random_string" "suffix" {
  length  = 8
  special = false
}

module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 3.0.0"

  name                 = local.cluster_name
  cidr                 = "10.0.0.0/16"
  azs                  = data.aws_availability_zones.available.names
  public_subnets       = ["10.0.4.0/24", "10.0.5.0/24", "10.0.6.0/24"]
  enable_dns_hostnames = true
}

# IAM policy allowing aws-node-termination-handler to inspect instances and ASGs,
# complete lifecycle actions, and consume messages from the notification queue.
data "aws_iam_policy_document" "node_term" {
  statement {
    effect = "Allow"
    actions = [
      "ec2:DescribeInstances",
      "autoscaling:DescribeAutoScalingInstances",
      "autoscaling:DescribeTags",
    ]
    resources = [
      "*",
    ]
  }
  statement {
    effect = "Allow"
    actions = [
      "autoscaling:CompleteLifecycleAction",
    ]
    resources = module.eks.workers_asg_arns
  }
  statement {
    effect = "Allow"
    actions = [
      "sqs:DeleteMessage",
      "sqs:ReceiveMessage",
    ]
    resources = [
      module.node_term_sqs.sqs_queue_arn
    ]
  }
}

resource "aws_iam_policy" "node_term" {
  name   = "node-term-${local.cluster_name}"
  policy = data.aws_iam_policy_document.node_term.json
}

resource "aws_iam_role_policy_attachment" "node_term_policy" {
  policy_arn = aws_iam_policy.node_term.arn
  role       = module.eks.worker_iam_role_name
}

data "aws_iam_policy_document" "node_term_events" {
  statement {
    effect = "Allow"
    principals {
      type = "Service"
      identifiers = [
        "events.amazonaws.com",
        "sqs.amazonaws.com",
      ]
    }
    actions = [
      "sqs:SendMessage",
    ]
    resources = [
      "arn:aws:sqs:${var.region}:${data.aws_caller_identity.current.account_id}:${local.cluster_name}",
    ]
  }
}

# SQS queue that receives ASG lifecycle notifications for aws-node-termination-handler.
module "node_term_sqs" {
  source                    = "terraform-aws-modules/sqs/aws"
  version                   = "~> 3.0.0"
  name                      = local.cluster_name
  message_retention_seconds = 300
  policy                    = data.aws_iam_policy_document.node_term_events.json
}

# Forward ASG instance-terminate lifecycle events to the SQS queue.
resource "aws_cloudwatch_event_rule" "node_term_event_rule" {
  name        = "${local.cluster_name}-nth-rule"
  description = "Node termination event rule"
  event_pattern = jsonencode(
    {
      "source" : [
        "aws.autoscaling"
      ],
      "detail-type" : [
        "EC2 Instance-terminate Lifecycle Action"
      ],
      "resources" : module.eks.workers_asg_arns
    }
  )
}

resource "aws_cloudwatch_event_target" "node_term_event_target" {
  rule      = aws_cloudwatch_event_rule.node_term_event_rule.name
  target_id = "ANTHandler"
  arn       = module.node_term_sqs.sqs_queue_arn
}

# IRSA role assumed by the aws-node-termination-handler service account.
module "node_term_role" {
  source                        = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc"
  version                       = "4.1.0"
  create_role                   = true
  role_description              = "IRSA role for ANTH, cluster ${local.cluster_name}"
  role_name_prefix              = local.cluster_name
  provider_url                  = replace(module.eks.cluster_oidc_issuer_url, "https://", "")
  role_policy_arns              = [aws_iam_policy.node_term.arn]
  oidc_fully_qualified_subjects = ["system:serviceaccount:${var.namespace}:${var.serviceaccount}"]
}

# Install aws-node-termination-handler in queue-processor (SQS) mode.
resource "helm_release" "anth" {
  depends_on = [
    module.eks
  ]

  name             = "aws-node-termination-handler"
  namespace        = var.namespace
  repository       = "https://aws.github.io/eks-charts"
  chart            = "aws-node-termination-handler"
  version          = var.aws_node_termination_handler_chart_version
  create_namespace = true

  set {
    name  = "awsRegion"
    value = var.region
  }
  set {
    name  = "serviceAccount.name"
    value = var.serviceaccount
  }
  set {
    name  = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
    value = module.node_term_role.iam_role_arn
    type  = "string"
  }
  set {
    name  = "enableSqsTerminationDraining"
    value = "true"
  }
  set {
    name  = "queueURL"
    value = module.node_term_sqs.sqs_queue_id
  }
  set {
    name  = "logLevel"
    value = "DEBUG"
  }
}

# Creating the lifecycle-hook outside of the ASG resource's `initial_lifecycle_hook`
# ensures that node termination does not require the lifecycle action to be completed,
# and thus allows the ASG to be destroyed cleanly.
resource "aws_autoscaling_lifecycle_hook" "node_term" {
  name                   = "node_term-${local.cluster_name}"
  autoscaling_group_name = module.eks.workers_asg_names[0]
  lifecycle_transition   = "autoscaling:EC2_INSTANCE_TERMINATING"
  heartbeat_timeout      = 300
  default_result         = "CONTINUE"
}

# EKS cluster with a single self-managed worker group that has instance refresh enabled.
module "eks" {
  source          = "../.."
  cluster_name    = local.cluster_name
  cluster_version = "1.19"
  subnets         = module.vpc.public_subnets
  vpc_id          = module.vpc.vpc_id
  enable_irsa     = true
  worker_groups_launch_template = [
    {
      name                                 = "refresh"
      asg_max_size                         = 2
      asg_desired_capacity                 = 2
      instance_refresh_enabled             = true
      instance_refresh_triggers            = ["tag"]
      public_ip                            = true
      metadata_http_put_response_hop_limit = 3
      tags = [
        {
          key                 = "aws-node-termination-handler/managed"
          value               = ""
          propagate_at_launch = true
        },
        {
          key                 = "foo"
          value               = "buzz"
          propagate_at_launch = true
        },
      ]
    },
  ]
}
examples/instance_refresh/outputs.tf (new file, 34 lines)

@@ -0,0 +1,34 @@
output "cluster_endpoint" {
  description = "Endpoint for EKS control plane."
  value       = module.eks.cluster_endpoint
}

output "cluster_security_group_id" {
  description = "Security group ids attached to the cluster control plane."
  value       = module.eks.cluster_security_group_id
}

output "kubectl_config" {
  description = "kubectl config as generated by the module."
  value       = module.eks.kubeconfig
}

output "config_map_aws_auth" {
  description = "A kubernetes configuration to authenticate to this EKS cluster."
  value       = module.eks.config_map_aws_auth
}

output "region" {
  description = "AWS region."
  value       = var.region
}

output "sqs_queue_asg_notification_arn" {
  description = "SQS queue ASG notification ARN"
  value       = module.node_term_sqs.sqs_queue_arn
}

output "sqs_queue_asg_notification_url" {
  description = "SQS queue ASG notification URL"
  value       = module.node_term_sqs.sqs_queue_id
}
examples/instance_refresh/variables.tf (new file, 18 lines)

@@ -0,0 +1,18 @@
variable "region" {
  default = "us-west-2"
}

variable "aws_node_termination_handler_chart_version" {
  description = "Version of the aws-node-termination-handler Helm chart to install."
  default     = "0.15.0"
}

variable "namespace" {
  description = "Namespace for the aws-node-termination-handler."
  default     = "kube-system"
}

variable "serviceaccount" {
  description = "Service account for the aws-node-termination-handler."
  default     = "aws-node-termination-handler"
}
examples/instance_refresh/versions.tf (new file, 11 lines)

@@ -0,0 +1,11 @@
terraform {
  required_version = ">= 0.13.1"

  required_providers {
    aws        = ">= 3.22.0"
    local      = ">= 1.4"
    random     = ">= 2.1"
    kubernetes = "~> 1.11"
    helm       = "~> 2.1.2"
  }
}
local.tf (7 lines changed)

@@ -34,7 +34,7 @@ locals {
     asg_max_size     = "3"   # Maximum worker capacity in the autoscaling group.
     asg_min_size     = "1"   # Minimum worker capacity in the autoscaling group. NOTE: Changing this parameter affects asg_desired_capacity: raising it to 2 raises asg_desired_capacity to 2, but lowering it back to 1 does not change asg_desired_capacity.
     asg_force_delete = false # Enable forced deletion for the autoscaling group.
-    asg_initial_lifecycle_hooks = [] # Initital lifecycle hook for the autoscaling group.
+    asg_initial_lifecycle_hooks = [] # Initial lifecycle hook for the autoscaling group.
     default_cooldown          = null # The amount of time, in seconds, after a scaling activity completes before another scaling activity can start.
     health_check_type         = null # Controls how health checking is done. Valid values are "EC2" or "ELB".
     health_check_grace_period = null # Time in seconds after instance comes into service before checking health.
@@ -95,6 +95,11 @@ locals {
     spot_max_price                = ""   # Maximum price per unit hour that the user is willing to pay for the Spot instances. Default is the on-demand price.
     max_instance_lifetime         = 0    # Maximum number of seconds instances can run in the ASG. 0 is unlimited.
     elastic_inference_accelerator = null # Type of elastic inference accelerator to be attached. Example values are eia1.medium, eia2.large, etc.
+    instance_refresh_enabled                = false     # Enable instance refresh for the worker autoscaling group.
+    instance_refresh_strategy               = "Rolling" # Strategy to use for instance refresh. Default is "Rolling", which is the only valid value.
+    instance_refresh_min_healthy_percentage = 90        # The amount of capacity in the ASG that must remain healthy during an instance refresh, as a percentage of the ASG's desired capacity.
+    instance_refresh_instance_warmup        = null      # The number of seconds until a newly launched instance is configured and ready to use. Defaults to the ASG's health check grace period.
+    instance_refresh_triggers               = []        # Set of additional property names that will trigger an instance refresh. A refresh is always triggered by a change in any of launch_configuration, launch_template, or mixed_instances_policy.
   }

   workers_group_defaults = merge(
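The hunk above adds five `instance_refresh_*` defaults to the worker group defaults. They can be overridden per worker group; a hedged sketch of such an override (values are illustrative, not part of this diff):

```hcl
worker_groups_launch_template = [
  {
    name                                    = "example"
    instance_refresh_enabled                = true
    instance_refresh_strategy               = "Rolling" # currently the only valid strategy
    instance_refresh_min_healthy_percentage = 50        # allow up to half the group to be replaced at once
    instance_refresh_instance_warmup        = 120       # seconds before a new instance counts as healthy
    instance_refresh_triggers               = ["tag"]   # refresh on tag changes too
  },
]
```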
workers.tf (27 lines changed)

@@ -162,6 +162,33 @@ resource "aws_autoscaling_group" "workers" {
    }
  }

  # logic duplicated in workers_launch_template.tf
  dynamic "instance_refresh" {
    for_each = lookup(var.worker_groups[count.index],
      "instance_refresh_enabled",
    local.workers_group_defaults["instance_refresh_enabled"]) ? [1] : []
    content {
      strategy = lookup(
        var.worker_groups[count.index], "instance_refresh_strategy",
        local.workers_group_defaults["instance_refresh_strategy"]
      )
      preferences {
        instance_warmup = lookup(
          var.worker_groups[count.index], "instance_refresh_instance_warmup",
          local.workers_group_defaults["instance_refresh_instance_warmup"]
        )
        min_healthy_percentage = lookup(
          var.worker_groups[count.index], "instance_refresh_min_healthy_percentage",
          local.workers_group_defaults["instance_refresh_min_healthy_percentage"]
        )
      }
      triggers = lookup(
        var.worker_groups[count.index], "instance_refresh_triggers",
        local.workers_group_defaults["instance_refresh_triggers"]
      )
    }
  }

  lifecycle {
    create_before_destroy = true
    ignore_changes        = [desired_capacity]
workers_launch_template.tf

@@ -156,7 +156,6 @@ resource "aws_autoscaling_group" "workers_launch_template" {
            instance_type = override.value
          }
        }

      }
    }
  }

@@ -238,6 +237,33 @@ resource "aws_autoscaling_group" "workers_launch_template" {
    }
  }

  # logic duplicated in workers.tf
  dynamic "instance_refresh" {
    for_each = lookup(var.worker_groups_launch_template[count.index],
      "instance_refresh_enabled",
    local.workers_group_defaults["instance_refresh_enabled"]) ? [1] : []
    content {
      strategy = lookup(
        var.worker_groups_launch_template[count.index], "instance_refresh_strategy",
        local.workers_group_defaults["instance_refresh_strategy"]
      )
      preferences {
        instance_warmup = lookup(
          var.worker_groups_launch_template[count.index], "instance_refresh_instance_warmup",
          local.workers_group_defaults["instance_refresh_instance_warmup"]
        )
        min_healthy_percentage = lookup(
          var.worker_groups_launch_template[count.index], "instance_refresh_min_healthy_percentage",
          local.workers_group_defaults["instance_refresh_min_healthy_percentage"]
        )
      }
      triggers = lookup(
        var.worker_groups_launch_template[count.index], "instance_refresh_triggers",
        local.workers_group_defaults["instance_refresh_triggers"]
      )
    }
  }

  lifecycle {
    create_before_destroy = true
    ignore_changes        = [desired_capacity]