mirror of
https://github.com/yusing/godoxy.git
synced 2026-04-23 00:38:33 +02:00
refactor(metrics): reorganize system info collection into separate functions
Split the monolithic AllSystemInfo handler into smaller, focused functions:

- Extract streamSystemInfo for channel consumption
- Add queueSystemInfo for safe non-blocking queue operations
- Create collectSystemInfoRound for parallel agent data collection
- Implement handleRoundResult for consistent round result processing
- Replace custom exponential backoff with cenkalti/backoff/v5 library

This improves code maintainability and separates concerns within the metrics API endpoint.
This commit is contained in:
@@ -4,10 +4,12 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
"net/url"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/bytedance/sonic"
|
"github.com/bytedance/sonic"
|
||||||
|
"github.com/cenkalti/backoff/v5"
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
"github.com/yusing/godoxy/agent/pkg/agent"
|
"github.com/yusing/godoxy/agent/pkg/agent"
|
||||||
@@ -35,6 +37,11 @@ type bytesFromPool struct {
|
|||||||
release func([]byte)
|
release func([]byte)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// systemInfoData is one system info sample travelling from the collectors to
// the goroutine that writes it out to the client.
type systemInfoData struct {
	// agentName identifies the source of the sample: "GoDoxy" for the main
	// server, otherwise the Name of the agent it was fetched from.
	agentName string
	// systemInfo is the payload handed to marshalSystemInfo; its concrete
	// type depends on the source that produced it.
	systemInfo any
}
|
||||||
|
|
||||||
// @x-id "all_system_info"
|
// @x-id "all_system_info"
|
||||||
// @BasePath /api/v1
|
// @BasePath /api/v1
|
||||||
// @Summary Get system info
|
// @Summary Get system info
|
||||||
@@ -72,91 +79,19 @@ func AllSystemInfo(c *gin.Context) {
|
|||||||
defer manager.Close()
|
defer manager.Close()
|
||||||
|
|
||||||
query := c.Request.URL.Query()
|
query := c.Request.URL.Query()
|
||||||
queryEncoded := c.Request.URL.Query().Encode()
|
queryEncoded := query.Encode()
|
||||||
|
|
||||||
type SystemInfoData struct {
|
|
||||||
AgentName string
|
|
||||||
SystemInfo any
|
|
||||||
}
|
|
||||||
|
|
||||||
// leave 5 extra slots for buffering in case new agents are added.
|
// leave 5 extra slots for buffering in case new agents are added.
|
||||||
dataCh := make(chan SystemInfoData, 1+agentpool.Num()+5)
|
dataCh := make(chan systemInfoData, 1+agentpool.Num()+5)
|
||||||
defer close(dataCh)
|
|
||||||
|
|
||||||
ticker := time.NewTicker(req.Interval)
|
ticker := time.NewTicker(req.Interval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
go func() {
|
go streamSystemInfo(manager, dataCh)
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-manager.Done():
|
|
||||||
return
|
|
||||||
case data := <-dataCh:
|
|
||||||
err := marshalSystemInfo(manager, data.AgentName, data.SystemInfo)
|
|
||||||
if err != nil {
|
|
||||||
manager.Close()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
// processing function for one round.
|
|
||||||
doRound := func() (bool, error) {
|
|
||||||
var numErrs atomic.Int32
|
|
||||||
|
|
||||||
totalAgents := int32(1) // myself
|
|
||||||
|
|
||||||
var errs gperr.Group
|
|
||||||
// get system info for me and all agents in parallel.
|
|
||||||
errs.Go(func() error {
|
|
||||||
data, err := systeminfo.Poller.GetRespData(req.Period, query)
|
|
||||||
if err != nil {
|
|
||||||
numErrs.Add(1)
|
|
||||||
return gperr.PrependSubject(err, "Main server")
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-manager.Done():
|
|
||||||
return nil
|
|
||||||
case dataCh <- SystemInfoData{
|
|
||||||
AgentName: "GoDoxy",
|
|
||||||
SystemInfo: data,
|
|
||||||
}:
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
|
|
||||||
for _, a := range agentpool.Iter() {
|
|
||||||
totalAgents++
|
|
||||||
|
|
||||||
errs.Go(func() error {
|
|
||||||
data, err := getAgentSystemInfoWithRetry(manager.Context(), a, queryEncoded)
|
|
||||||
if err != nil {
|
|
||||||
numErrs.Add(1)
|
|
||||||
return gperr.PrependSubject(err, "Agent "+a.Name)
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-manager.Done():
|
|
||||||
return nil
|
|
||||||
case dataCh <- SystemInfoData{
|
|
||||||
AgentName: a.Name,
|
|
||||||
SystemInfo: data,
|
|
||||||
}:
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
err := errs.Wait().Error()
|
|
||||||
return numErrs.Load() == totalAgents, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// write system info immediately once.
|
// write system info immediately once.
|
||||||
if shouldContinue, err := doRound(); err != nil {
|
if hasSuccess, err := collectSystemInfoRound(manager, req, query, queryEncoded, dataCh); handleRoundResult(c, hasSuccess, err, false) {
|
||||||
if !shouldContinue {
|
return
|
||||||
c.Error(apitypes.InternalServerError(err, "failed to get all system info"))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// then continue on the ticker.
|
// then continue on the ticker.
|
||||||
@@ -165,17 +100,95 @@ func AllSystemInfo(c *gin.Context) {
|
|||||||
case <-manager.Done():
|
case <-manager.Done():
|
||||||
return
|
return
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
if shouldContinue, err := doRound(); err != nil {
|
if hasSuccess, err := collectSystemInfoRound(manager, req, query, queryEncoded, dataCh); handleRoundResult(c, hasSuccess, err, true) {
|
||||||
if !shouldContinue {
|
return
|
||||||
c.Error(apitypes.InternalServerError(err, "failed to get all system info"))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
log.Warn().Err(err).Msg("failed to get some system info")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func streamSystemInfo(manager *websocket.Manager, dataCh <-chan systemInfoData) {
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-manager.Done():
|
||||||
|
return
|
||||||
|
case data := <-dataCh:
|
||||||
|
err := marshalSystemInfo(manager, data.agentName, data.systemInfo)
|
||||||
|
if err != nil {
|
||||||
|
manager.Close()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func queueSystemInfo(manager *websocket.Manager, dataCh chan<- systemInfoData, data systemInfoData) {
|
||||||
|
select {
|
||||||
|
case <-manager.Done():
|
||||||
|
case dataCh <- data:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectSystemInfoRound(
|
||||||
|
manager *websocket.Manager,
|
||||||
|
req AllSystemInfoRequest,
|
||||||
|
query url.Values,
|
||||||
|
queryEncoded string,
|
||||||
|
dataCh chan<- systemInfoData,
|
||||||
|
) (hasSuccess bool, err error) {
|
||||||
|
var numErrs atomic.Int32
|
||||||
|
totalAgents := int32(1) // myself
|
||||||
|
|
||||||
|
var errs gperr.Group
|
||||||
|
// get system info for me and all agents in parallel.
|
||||||
|
errs.Go(func() error {
|
||||||
|
data, err := systeminfo.Poller.GetRespData(req.Period, query)
|
||||||
|
if err != nil {
|
||||||
|
numErrs.Add(1)
|
||||||
|
return gperr.PrependSubject(err, "Main server")
|
||||||
|
}
|
||||||
|
queueSystemInfo(manager, dataCh, systemInfoData{
|
||||||
|
agentName: "GoDoxy",
|
||||||
|
systemInfo: data,
|
||||||
|
})
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, a := range agentpool.Iter() {
|
||||||
|
totalAgents++
|
||||||
|
|
||||||
|
errs.Go(func() error {
|
||||||
|
data, err := getAgentSystemInfoWithRetry(manager.Context(), a, queryEncoded)
|
||||||
|
if err != nil {
|
||||||
|
numErrs.Add(1)
|
||||||
|
return gperr.PrependSubject(err, "Agent "+a.Name)
|
||||||
|
}
|
||||||
|
queueSystemInfo(manager, dataCh, systemInfoData{
|
||||||
|
agentName: a.Name,
|
||||||
|
systemInfo: data,
|
||||||
|
})
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
err = errs.Wait().Error()
|
||||||
|
return numErrs.Load() < totalAgents, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func handleRoundResult(c *gin.Context, hasSuccess bool, err error, logPartial bool) (stop bool) {
|
||||||
|
if err == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if !hasSuccess {
|
||||||
|
c.Error(apitypes.InternalServerError(err, "failed to get all system info"))
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if logPartial {
|
||||||
|
log.Warn().Err(err).Msg("failed to get some system info")
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
func getAgentSystemInfo(ctx context.Context, a *agentpool.Agent, query string) (bytesFromPool, error) {
|
func getAgentSystemInfo(ctx context.Context, a *agentpool.Agent, query string) (bytesFromPool, error) {
|
||||||
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
@@ -197,35 +210,26 @@ func getAgentSystemInfo(ctx context.Context, a *agentpool.Agent, query string) (
|
|||||||
|
|
||||||
func getAgentSystemInfoWithRetry(ctx context.Context, a *agentpool.Agent, query string) (bytesFromPool, error) {
|
func getAgentSystemInfoWithRetry(ctx context.Context, a *agentpool.Agent, query string) (bytesFromPool, error) {
|
||||||
const maxRetries = 3
|
const maxRetries = 3
|
||||||
var lastErr error
|
const retryDelay = 5 * time.Second
|
||||||
|
var attempt int
|
||||||
for attempt := range maxRetries {
|
data, err := backoff.Retry(ctx, func() (bytesFromPool, error) {
|
||||||
// Apply backoff delay for retries (not for first attempt)
|
attempt++
|
||||||
if attempt > 0 {
|
|
||||||
delay := max((1<<attempt)*time.Second, 5*time.Second)
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return bytesFromPool{}, ctx.Err()
|
|
||||||
case <-time.After(delay):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
data, err := getAgentSystemInfo(ctx, a, query)
|
data, err := getAgentSystemInfo(ctx, a, query)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return data, nil
|
return data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
lastErr = err
|
log.Err(err).Str("agent", a.Name).Int("attempt", attempt).Msg("Agent request attempt failed")
|
||||||
|
return bytesFromPool{}, err
|
||||||
log.Debug().Str("agent", a.Name).Int("attempt", attempt+1).Str("error", err.Error()).Msg("Agent request attempt failed")
|
},
|
||||||
|
backoff.WithBackOff(backoff.NewConstantBackOff(retryDelay)),
|
||||||
// Don't retry on context cancellation
|
backoff.WithMaxTries(maxRetries),
|
||||||
if ctx.Err() != nil {
|
)
|
||||||
return bytesFromPool{}, ctx.Err()
|
if err != nil {
|
||||||
}
|
return bytesFromPool{}, err
|
||||||
}
|
}
|
||||||
|
return data, nil
|
||||||
return bytesFromPool{}, lastErr
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func marshalSystemInfo(ws *websocket.Manager, agentName string, systemInfo any) error {
|
func marshalSystemInfo(ws *websocket.Manager, agentName string, systemInfo any) error {
|
||||||
|
|||||||
Reference in New Issue
Block a user