Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(coordinator): fix all chunks ready bug #862

Merged
merged 7 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/version/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"strings"
)

var tag = "v4.1.95"
var tag = "v4.1.96"

var commit = func() string {
if info, ok := debug.ReadBuildInfo(); ok {
Expand Down
72 changes: 67 additions & 5 deletions coordinator/internal/controller/cron/collect_proof.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ type Collector struct {
chunkOrm *orm.Chunk
batchOrm *orm.Batch

timeoutBatchCheckerRunTotal prometheus.Counter
batchProverTaskTimeoutTotal prometheus.Counter
timeoutChunkCheckerRunTotal prometheus.Counter
chunkProverTaskTimeoutTotal prometheus.Counter
timeoutBatchCheckerRunTotal prometheus.Counter
batchProverTaskTimeoutTotal prometheus.Counter
timeoutChunkCheckerRunTotal prometheus.Counter
chunkProverTaskTimeoutTotal prometheus.Counter
checkBatchAllChunkReadyRunTotal prometheus.Counter
}

// NewCollector create a collector to cron collect the data to send to prover
Expand Down Expand Up @@ -62,10 +63,15 @@ func NewCollector(ctx context.Context, db *gorm.DB, cfg *config.Config, reg prom
Name: "coordinator_chunk_prover_task_timeout_total",
Help: "Total number of chunk timeout prover task.",
}),
checkBatchAllChunkReadyRunTotal: promauto.With(reg).NewCounter(prometheus.CounterOpts{
Name: "coordinator_check_batch_all_chunk_ready_run_total",
Help: "Total number of check batch all chunks ready total",
}),
}

go c.timeoutBatchProofTask()
go c.timeoutChunkProofTask()
go c.checkBatchAllChunkReady()

log.Info("Start coordinator successfully.")

Expand All @@ -79,7 +85,6 @@ func (c *Collector) Stop() {

// timeoutTask cron check the send task is timeout. if timeout reached, restore the
// chunk/batch task to unassigned. then the batch/chunk collector can retry it.

func (c *Collector) timeoutBatchProofTask() {
defer func() {
if err := recover(); err != nil {
Expand Down Expand Up @@ -189,3 +194,60 @@ func (c *Collector) check(assignedProverTasks []orm.ProverTask, timeout promethe
}
}
}

func (c *Collector) checkBatchAllChunkReady() {
defer func() {
if err := recover(); err != nil {
nerr := fmt.Errorf("check batch all chunk ready panic error:%v", err)
log.Warn(nerr.Error())
}
}()

ticker := time.NewTicker(time.Second * 10)
for {
select {
case <-ticker.C:
c.checkBatchAllChunkReadyRunTotal.Inc()
page := 1
pageSize := 50
for {
offset := (page - 1) * 50
georgehao marked this conversation as resolved.
Show resolved Hide resolved
batches, err := c.batchOrm.GetUnassignedAndChunksUnreadyBatches(c.ctx, offset, pageSize)
if err != nil {
log.Warn("checkBatchAllChunkReady GetUnassignedAndChunksUnreadyBatches", "error", err)
break
}

for _, batch := range batches {
allReady, checkErr := c.chunkOrm.CheckIfBatchChunkProofsAreReady(c.ctx, batch.Hash)
if checkErr != nil {
log.Warn("checkBatchAllChunkReady CheckIfBatchChunkProofsAreReady failure", "error", checkErr, "hash", batch.Hash)
continue
}

if !allReady {
continue
}

if updateErr := c.batchOrm.UpdateChunkProofsStatusByBatchHash(c.ctx, batch.Hash, types.ChunkProofsStatusReady); updateErr != nil {
log.Warn("checkBatchAllChunkReady UpdateChunkProofsStatusByBatchHash failure", "error", checkErr, "hash", batch.Hash)
}
}

if len(batches) < pageSize {
break
}
page++
}

case <-c.ctx.Done():
if c.ctx.Err() != nil {
log.Error("manager context canceled with error", "error", c.ctx.Err())
}
return
case <-c.stopTimeoutChan:
log.Info("the coordinator run loop exit")
return
}
}
}
32 changes: 1 addition & 31 deletions coordinator/internal/logic/submitproof/proof_receiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,25 +186,6 @@ func (m *ProofReceiverLogic) HandleZkProof(ctx *gin.Context, proofMsg *message.P
return nil
}

func (m *ProofReceiverLogic) checkAreAllChunkProofsReady(ctx context.Context, chunkHash string) error {
batchHash, err := m.chunkOrm.GetChunkBatchHash(ctx, chunkHash)
if err != nil {
return err
}

allReady, err := m.chunkOrm.CheckIfBatchChunkProofsAreReady(ctx, batchHash)
if err != nil {
return err
}
if allReady {
err := m.batchOrm.UpdateChunkProofsStatusByBatchHash(ctx, batchHash, types.ChunkProofsStatusReady)
if err != nil {
return err
}
}
return nil
}

func (m *ProofReceiverLogic) validator(ctx context.Context, proverTask *orm.ProverTask, pk string, proofMsg *message.ProofMsg, proofParameter coordinatorType.SubmitProofParameter) (err error) {
defer func() {
if err != nil {
Expand Down Expand Up @@ -342,18 +323,7 @@ func (m *ProofReceiverLogic) updateProofStatus(ctx context.Context, hash string,
return nil
})

if err != nil {
return err
}

if status == types.ProvingTaskVerified && proofMsg.Type == message.ProofTypeChunk {
if checkReadyErr := m.checkAreAllChunkProofsReady(ctx, hash); checkReadyErr != nil {
log.Error("failed to check are all chunk proofs ready", "error", checkReadyErr)
return checkReadyErr
}
}

return nil
return err
}

func (m *ProofReceiverLogic) checkIsTaskSuccess(ctx context.Context, hash string, proofType message.ProofType) bool {
Expand Down
20 changes: 20 additions & 0 deletions coordinator/internal/orm/batch.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,26 @@ func (o *Batch) GetUnassignedBatches(ctx context.Context, limit int) ([]*Batch,
return batches, nil
}

// GetUnassignedAndChunksUnreadyBatches get the batches which is unassigned and chunks is not ready
func (o *Batch) GetUnassignedAndChunksUnreadyBatches(ctx context.Context, offset, limit int) ([]*Batch, error) {
if offset < 0 || limit < 0 {
return nil, errors.New("limit and offset must not be smaller than 0")
}

db := o.db.WithContext(ctx)
db = db.Where("proving_status = ?", types.ProvingTaskUnassigned)
db = db.Where("chunk_proofs_status = ?", types.ChunkProofsStatusPending)
db = db.Order("index ASC")
db = db.Offset(offset)
db = db.Limit(limit)

var batches []*Batch
if err := db.Find(&batches).Error; err != nil {
return nil, fmt.Errorf("Batch.GetUnassignedAndChunksUnreadyBatches error: %w", err)
}
return batches, nil
}

// GetAssignedBatches retrieves all batches whose proving_status is either types.ProvingTaskAssigned.
func (o *Batch) GetAssignedBatches(ctx context.Context) ([]*Batch, error) {
db := o.db.WithContext(ctx)
Expand Down