Skip to content

Commit

Permalink
fix: check sp health retry
Browse files Browse the repository at this point in the history
  • Loading branch information
BarryTong65 committed Jun 3, 2024
1 parent 6090452 commit 8c58fa2
Showing 1 changed file with 36 additions and 19 deletions.
55 changes: 36 additions & 19 deletions base/gfspvgmgr/virtual_group_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/bnb-chain/greenfield-storage-provider/core/consensus"
"github.com/bnb-chain/greenfield-storage-provider/core/vgmgr"
"github.com/bnb-chain/greenfield-storage-provider/pkg/log"
"github.com/bnb-chain/greenfield-storage-provider/pkg/metrics"
"github.com/bnb-chain/greenfield-storage-provider/util"
sptypes "github.com/bnb-chain/greenfield/x/sp/types"
virtualgrouptypes "github.com/bnb-chain/greenfield/x/virtualgroup/types"
Expand All @@ -33,9 +34,14 @@ const (
DefaultInitialGVGStakingStorageSize = uint64(2) * 1024 * 1024 * 1024 * 1024 // 2TB per GVG, chain side DefaultMaxStoreSizePerFamily is 64 TB
additionalGVGStakingStorageSize = uint64(1) * 1024 * 1024 * 1024 * 1024 // 1TB

defaultSPCheckTimeout = 3 * time.Second
defaultSPHealthCheckerInterval = 10 * time.Second
httpStatusPath = "/status"
defaultSPCheckTimeout = 1 * time.Minute
defaultSPHealthCheckerInterval = 10 * time.Second
defaultSPHealthCheckerRetryInterval = 1 * time.Second
defaultSPHealthCheckerMaxRetries = 5
httpStatusPath = "/status"

SPHealthCheckerDuration = "check_sp_health_duration"
SPHealthCheckerFailure = "check_sp_health_total_failure"

emptyGVGSafeDeletePeriod = int64(60) * 60 * 24
)
Expand Down Expand Up @@ -785,30 +791,41 @@ func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
Transport: &http.Transport{
TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
},
Timeout: defaultSPCheckTimeout * time.Second,
Timeout: defaultSPCheckTimeout,
}

// Create an HTTP request to test the validity of the endpoint
urlToCheck := fmt.Sprintf("%s%s", endpoint, httpStatusPath)
req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil)
if err != nil {
return false
}
for attempt := 0; attempt < defaultSPHealthCheckerMaxRetries; attempt++ {
start := time.Now()
req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil)
if err != nil {
log.CtxErrorw(context.Background(), "failed to create request", "sp", sp, "error", err)
return false
}

resp, err := client.Do(req)
if err != nil {
log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err)
return false
}
defer resp.Body.Close()
resp, err := client.Do(req)
duration := time.Since(start)
metrics.ReqTime.WithLabelValues(SPHealthCheckerDuration).Observe(duration.Seconds())
if err != nil {
log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err, "duration", duration)
time.Sleep(defaultSPHealthCheckerRetryInterval)
continue
}
defer resp.Body.Close()

if resp.StatusCode != http.StatusOK {
log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "resp_body", resp.Body)
return false
if resp.StatusCode == http.StatusOK {
log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp, "duration", duration)
return true
} else {
metrics.ReqCounter.WithLabelValues(SPHealthCheckerFailure).Inc()
log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "resp_body", resp.Body, "duration", duration)
time.Sleep(defaultSPHealthCheckerRetryInterval)
}
}

log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp)
return true
log.CtxErrorw(context.Background(), "failed to check sp healthy after retries", "sp", sp)
return false
}

func (checker *HealthChecker) Start() {
Expand Down

0 comments on commit 8c58fa2

Please sign in to comment.