diff --git a/base/gfspvgmgr/virtual_group_manager.go b/base/gfspvgmgr/virtual_group_manager.go index c0fa9d157..65151b919 100644 --- a/base/gfspvgmgr/virtual_group_manager.go +++ b/base/gfspvgmgr/virtual_group_manager.go @@ -19,6 +19,7 @@ import ( "github.com/bnb-chain/greenfield-storage-provider/core/consensus" "github.com/bnb-chain/greenfield-storage-provider/core/vgmgr" "github.com/bnb-chain/greenfield-storage-provider/pkg/log" + "github.com/bnb-chain/greenfield-storage-provider/pkg/metrics" "github.com/bnb-chain/greenfield-storage-provider/util" sptypes "github.com/bnb-chain/greenfield/x/sp/types" virtualgrouptypes "github.com/bnb-chain/greenfield/x/virtualgroup/types" @@ -33,9 +34,14 @@ const ( DefaultInitialGVGStakingStorageSize = uint64(2) * 1024 * 1024 * 1024 * 1024 // 2TB per GVG, chain side DefaultMaxStoreSizePerFamily is 64 TB additionalGVGStakingStorageSize = uint64(1) * 1024 * 1024 * 1024 * 1024 // 1TB - defaultSPCheckTimeout = 3 * time.Second - defaultSPHealthCheckerInterval = 10 * time.Second - httpStatusPath = "/status" + defaultSPCheckTimeout = 1 * time.Minute + defaultSPHealthCheckerInterval = 10 * time.Second + defaultSPHealthCheckerRetryInterval = 1 * time.Second + defaultSPHealthCheckerMaxRetries = 5 + httpStatusPath = "/status" + + SPHealthCheckerDuration = "check_sp_health_duration" + SPHealthCheckerFailure = "check_sp_health_total_failure" emptyGVGSafeDeletePeriod = int64(60) * 60 * 24 ) @@ -785,30 +791,41 @@ func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool { Transport: &http.Transport{ TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12}, }, - Timeout: defaultSPCheckTimeout * time.Second, + Timeout: defaultSPCheckTimeout, } // Create an HTTP request to test the validity of the endpoint urlToCheck := fmt.Sprintf("%s%s", endpoint, httpStatusPath) - req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil) - if err != nil { - return false - } + for attempt := 0; attempt < defaultSPHealthCheckerMaxRetries; attempt++ { + start := time.Now() + req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil) + if err != nil { + log.CtxErrorw(context.Background(), "failed to create request", "sp", sp, "error", err) + return false + } - resp, err := client.Do(req) - if err != nil { - log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err) - return false - } - defer resp.Body.Close() + resp, err := client.Do(req) + duration := time.Since(start) + metrics.ReqTime.WithLabelValues(SPHealthCheckerDuration).Observe(duration.Seconds()) + if err != nil { + log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err, "duration", duration) + time.Sleep(defaultSPHealthCheckerRetryInterval) + continue + } + defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "resp_body", resp.Body) - return false + if resp.StatusCode == http.StatusOK { + log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp, "duration", duration) + return true + } else { + metrics.ReqCounter.WithLabelValues(SPHealthCheckerFailure).Inc() + log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "resp_body", resp.Body, "duration", duration) + time.Sleep(defaultSPHealthCheckerRetryInterval) + } } - log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp) - return true + log.CtxErrorw(context.Background(), "failed to check sp healthy after retries", "sp", sp) + return false } func (checker *HealthChecker) Start() {