Safe Application Scaling With Graceful Shutdown

ขั้นตอนการพัฒนา feature ของ software ต่างๆในปัจจุบันนั้น ส่วนใหญ่ก็จะมี process คล้ายๆกันหมดเกือบทุกที่ ซึ่งที่กล่าวมาจะเกี่ยวกับ CI/CD process เราเคยคิดกันบ้างไหมว่าวิธีการ Rolling Update ใน Kubernetes นั้นข้างในมันจัดการ event ต่างๆยังไง ถ้าเกิดเรามี CR (Change Request) แล้ว upgrade feature ผ่าน CI/CD pipeline แล้วเราจะรู้ได้ยังไงว่า requests ต่างๆที่ user ส่งเข้ามายัง application ในระหว่างนั้นจะไม่สูญหายไป

มารู้จักกับวิธีการทำ Graceful Shutdown ใน application ของเรา เพื่อจัดการ Application Lifecycle ก่อนที่จะถูก Kubernetes terminate

โดยปกติตาม document หรือจากการค้นหาใน Google เราจะพบว่า Grace Period (terminationGracePeriodSeconds) ของ Kubernetes มีค่า default อยู่ที่ 30 วินาที ซึ่งตัวเลขนี้คือระยะเวลาที่ Kubernetes จะรอหลังจากส่งสัญญาณ SIGTERM (UNIX terminate signal) มายัง Pods ใน Workload ของ application เรา ก่อนที่จะบังคับ kill pod ด้วยสัญญาณ SIGKILL

เพราะฉะนั้น สิ่งที่เราควรรับมือจากเหตุการณ์นี้ก็จะเกี่ยวข้องกับ code ที่เราเขียนไปนั่นแหละ เช่น ถ้าเกิดเราทำ HTTP server เชื่อมต่อกับ database และ Messaging Queue เพื่อให้คนอื่นมาเรียก API ที่เราทำไว้ ถ้าเกิดว่าเราไม่หยุดรับ request ที่มีการเรียกเข้ามายัง application หรือ flush messages ในส่วนของ PubSub client หรือ release database connections ออกจาก database connection pool ก็อาจจะทำให้เกิด error alert ขึ้นในฝั่งคนเรียกใช้งาน API หรือ messages ใน PubSub หาย ซึ่งทำให้เกิดปัญหาที่ตามมา เราที่เป็น developer คงไม่อยากมาตามแก้ มาตามตอบ issue กันอยู่แล้วหรอกเนอะ ถถถถ

ทีนี้มาดู code ตัวอย่างกัน

ผมจะใช้ภาษา Go เป็นตัวอย่างนะครับ (หรือจะดู code บน GitHub ของผมก็ได้ครับ)

โดยจะแยกไว้ 2 case คือ เคสแรกจะเป็นแบบไม่ใช้ graceful shutdown โดย set environment variable ชื่อ NO_SIGNALS ส่วนเคสที่สองจะเป็นการทำ graceful shutdown โดยเราสามารถกำหนด grace period duration (หน่วยเป็นวินาที) ผ่าน environment variable ชื่อ GRACE_PERIOD_DURATION ได้ด้วยเพื่อความสะดวกในการเทส

package main

import (
	"cmp"
	"context"
	"errors"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"syscall"
	"time"
)

// start is captured once during package initialization, before main runs;
// main subtracts it from the current time to log how long startup took.
var start = time.Now()

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("GET /", func(w http.ResponseWriter, r *http.Request) {
		vars := r.URL.Query()
		sleep := vars.Get("sleep")

		fmt.Printf("sleep val=%v\n", cmp.Or(sleep, "0"))
		if len(sleep) > 0 {
			t, err := strconv.Atoi(sleep)
			if err != nil {
				http.Error(w, err.Error(), http.StatusBadRequest)
				return
			}
			time.Sleep(time.Duration(t) * time.Second)
		}
		w.Write([]byte("Hello World"))
	})

	server := &http.Server{
		Addr:    fmt.Sprintf(":%s", cmp.Or(os.Getenv("PORT"), "8080")),
		Handler: mux,
		// Enforce server timeout
		WriteTimeout: 15 * time.Second,
		ReadTimeout:  15 * time.Second,
		IdleTimeout:  15 * time.Second,
	}

	if len(os.Getenv("NO_SIGNALS")) > 0 {
		log.Printf("Started server in %v", time.Since(start))
		if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
			log.Fatalf("HTTP server error: %v\n", err)
		}
	} else {
		done := make(chan bool, 1)
		go gracefulShutdown(done, server)

		log.Printf("Started server in %v", time.Since(start))
		if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
			log.Fatalf("HTTP server error: %v\n", err)
		}

		// Wait for the shutdown process to complete
		<-done
		log.Println("Shutdown completed")
	}
}

func gracefulShutdown(done chan bool, server *http.Server) {
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	// Listen for the interrupt signal
	<-ctx.Done()

	gracePeriodDuration, _ := strconv.Atoi(cmp.Or(os.Getenv("GRACE_PERIOD_DURATION"), "30"))
	shutdownCtx, cancel := context.WithTimeout(
		context.Background(),
		time.Duration(gracePeriodDuration)*time.Second, /* Kubernetes termination grace period time is 30 seconds by default */
	)
	defer cancel()

	log.Println("Server is shutting down...")

	if err := server.Shutdown(shutdownCtx); err != nil {
		log.Printf("Server is forced to shutdown with error: %v\n", err)
	} else {
		log.Printf("Server has been shutdown\n")
	}

	// Notify the main goroutine that shutdown process is completed
	done <- true
}

หลังจากลองเล่นดูครับ โดยด้านล่างจะเป็น gif แสดงผลลัพธ์ในแต่ละเคสต่างกัน โดยจะอธิบายเรียงตามเคสประมาณนี้คือ

เคสที่ 1 เราจะ start application โดยไม่รับ OS signals ใดๆเลย ผลลัพธ์ที่ได้คือ application จะทำการ force shutdown ไปเลย

เคสที่ 2 เราจะ start application แบบ graceful shutdown โดยใส่ค่า grace period ให้น้อยกว่าระยะเวลาที่ API request จะทำงานสำเร็จ ผลลัพธ์ที่ได้คือ client ที่ยิง request มาไม่ได้รับ response ทำให้เกิด error ที่ฝั่งคนเรียกใช้ API ตัวนี้

เคสที่ 3 เราจะ start application แบบ graceful shutdown เหมือนเดิม แต่ระยะเวลาการทำงานของ API ที่ client ยิง request เข้ามานั้นเสร็จสิ้นภายในระยะเวลา grace period ผลลัพธ์ที่ได้คือ client ที่ยิงมาได้รับ response กลับไปถูกต้องและ application ก็ shutdown ได้สมบูรณ์

เพิ่มเติม

หากยังไม่เข้าใจหรือไม่เห็นภาพในหัวว่า graceful shutdown มีความสำคัญยังไงกับระบบต่างๆ ที่ deploy อยู่บน Kubernetes ผมขอแนะนำให้ดูคลิปของคุณ Viktor (YT: DevOps Toolkit) ด้านล่างเพื่อทำความเข้าใจเพิ่มเติมครับ
โดยเขาจะแบ่ง timestamp ไว้ 3 ช่วงหลัก คือ run บน local machine, run บน Docker และ run บน Kubernetes Pods ครับ