diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bc1f24..34e9615 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- `IsFailure` config option is added to determine whether error returned from Handler counts as a failure. + ## [0.18.4] - 2020-08-17 ### Fixed diff --git a/internal/base/base.go b/internal/base/base.go index f77bb31..3dc7113 100644 --- a/internal/base/base.go +++ b/internal/base/base.go @@ -645,7 +645,7 @@ type Broker interface { Requeue(msg *TaskMessage) error Schedule(msg *TaskMessage, processAt time.Time) error ScheduleUnique(msg *TaskMessage, processAt time.Time, ttl time.Duration) error - Retry(msg *TaskMessage, processAt time.Time, errMsg string) error + Retry(msg *TaskMessage, processAt time.Time, errMsg string, isFailure bool) error Archive(msg *TaskMessage, errMsg string) error ForwardIfReady(qnames ...string) error ListDeadlineExceeded(deadline time.Time, qnames ...string) ([]*TaskMessage, error) diff --git a/internal/rdb/benchmark_test.go b/internal/rdb/benchmark_test.go index 576b111..8880b3b 100644 --- a/internal/rdb/benchmark_test.go +++ b/internal/rdb/benchmark_test.go @@ -184,7 +184,7 @@ func BenchmarkRetry(b *testing.B) { asynqtest.SeedDeadlines(b, r.client, zs, base.DefaultQueueName) b.StartTimer() - if err := r.Retry(msgs[0], time.Now().Add(1*time.Minute), "error"); err != nil { + if err := r.Retry(msgs[0], time.Now().Add(1*time.Minute), "error", true /*isFailure*/); err != nil { b.Fatalf("Retry failed: %v", err) } } diff --git a/internal/rdb/rdb.go b/internal/rdb/rdb.go index f65f3c9..6bd6a27 100644 --- a/internal/rdb/rdb.go +++ b/internal/rdb/rdb.go @@ -467,6 +467,7 @@ func (r *RDB) ScheduleUnique(msg *base.TaskMessage, processAt time.Time, ttl tim // ARGV[2] -> updated base.TaskMessage value // ARGV[3] -> retry_at UNIX timestamp // ARGV[4] -> stats expiration timestamp +// ARGV[5] -> is_failure (bool) var retryCmd = redis.NewScript(` if redis.call("LREM", KEYS[2], 0, ARGV[1]) == 0 then return redis.error_reply("NOT FOUND") @@ -476,23 +477,28 @@ if redis.call("ZREM", KEYS[3], ARGV[1]) == 0 then end redis.call("ZADD", KEYS[4], ARGV[3], ARGV[1]) redis.call("HSET", KEYS[1], "msg", ARGV[2], "state", "retry") -local n = redis.call("INCR", KEYS[5]) -if tonumber(n) == 1 then - redis.call("EXPIREAT", KEYS[5], ARGV[4]) -end -local m = redis.call("INCR", KEYS[6]) -if tonumber(m) == 1 then - redis.call("EXPIREAT", KEYS[6], ARGV[4]) +if tonumber(ARGV[5]) == 1 then + local n = redis.call("INCR", KEYS[5]) + if tonumber(n) == 1 then + redis.call("EXPIREAT", KEYS[5], ARGV[4]) + end + local m = redis.call("INCR", KEYS[6]) + if tonumber(m) == 1 then + redis.call("EXPIREAT", KEYS[6], ARGV[4]) + end end return redis.status_reply("OK")`) -// Retry moves the task from active to retry queue, incrementing retry count -// and assigning error message to the task message. -func (r *RDB) Retry(msg *base.TaskMessage, processAt time.Time, errMsg string) error { +// Retry moves the task from active to retry queue. +// It also annotates the message with the given error message and +// if isFailure is true increments the retried counter. +func (r *RDB) Retry(msg *base.TaskMessage, processAt time.Time, errMsg string, isFailure bool) error { var op errors.Op = "rdb.Retry" now := time.Now() modified := *msg - modified.Retried++ + if isFailure { + modified.Retried++ + } modified.ErrorMsg = errMsg modified.LastFailedAt = now.Unix() encoded, err := base.EncodeMessage(&modified) @@ -513,6 +519,7 @@ func (r *RDB) Retry(msg *base.TaskMessage, processAt time.Time, errMsg string) e encoded, processAt.Unix(), expireAt.Unix(), + isFailure, } return r.runScript(op, retryCmd, keys, argv...) } diff --git a/internal/rdb/rdb_test.go b/internal/rdb/rdb_test.go index 4e2b7d7..b83c720 100644 --- a/internal/rdb/rdb_test.go +++ b/internal/rdb/rdb_test.go @@ -1158,7 +1158,7 @@ func TestRetry(t *testing.T) { h.SeedAllRetryQueues(t, r.client, tc.retry) callTime := time.Now() // time when method was called - err := r.Retry(tc.msg, tc.processAt, tc.errMsg) + err := r.Retry(tc.msg, tc.processAt, tc.errMsg, true /*isFailure*/) if err != nil { t.Errorf("(*RDB).Retry = %v, want nil", err) continue @@ -1210,6 +1210,173 @@ func TestRetry(t *testing.T) { } } +func TestRetryWithNonFailureError(t *testing.T) { + r := setup(t) + defer r.Close() + now := time.Now() + t1 := &base.TaskMessage{ + ID: uuid.New(), + Type: "send_email", + Payload: h.JSON(map[string]interface{}{"subject": "Hola!"}), + Retried: 10, + Timeout: 1800, + Queue: "default", + } + t2 := &base.TaskMessage{ + ID: uuid.New(), + Type: "gen_thumbnail", + Payload: h.JSON(map[string]interface{}{"path": "some/path/to/image.jpg"}), + Timeout: 3000, + Queue: "default", + } + t3 := &base.TaskMessage{ + ID: uuid.New(), + Type: "reindex", + Payload: nil, + Timeout: 60, + Queue: "default", + } + t4 := &base.TaskMessage{ + ID: uuid.New(), + Type: "send_notification", + Payload: nil, + Timeout: 1800, + Queue: "custom", + } + t1Deadline := now.Unix() + t1.Timeout + t2Deadline := now.Unix() + t2.Timeout + t4Deadline := now.Unix() + t4.Timeout + errMsg := "SMTP server is not responding" + + tests := []struct { + active map[string][]*base.TaskMessage + deadlines map[string][]base.Z + retry map[string][]base.Z + msg *base.TaskMessage + processAt time.Time + errMsg string + wantActive map[string][]*base.TaskMessage + wantDeadlines map[string][]base.Z + getWantRetry func(failedAt time.Time) map[string][]base.Z + }{ + { + active: map[string][]*base.TaskMessage{ + "default": {t1, t2}, + }, + deadlines: map[string][]base.Z{ + "default": {{Message: t1, Score: t1Deadline}, {Message: t2, Score: t2Deadline}}, + }, + retry: map[string][]base.Z{ + "default": {{Message: t3, Score: now.Add(time.Minute).Unix()}}, + }, + msg: t1, + processAt: now.Add(5 * time.Minute), + errMsg: errMsg, + wantActive: map[string][]*base.TaskMessage{ + "default": {t2}, + }, + wantDeadlines: map[string][]base.Z{ + "default": {{Message: t2, Score: t2Deadline}}, + }, + getWantRetry: func(failedAt time.Time) map[string][]base.Z { + return map[string][]base.Z{ + "default": { + // Task message should include the error message but without incrementing the retry count. + {Message: h.TaskMessageWithError(*t1, errMsg, failedAt), Score: now.Add(5 * time.Minute).Unix()}, + {Message: t3, Score: now.Add(time.Minute).Unix()}, + }, + } + }, + }, + { + active: map[string][]*base.TaskMessage{ + "default": {t1, t2}, + "custom": {t4}, + }, + deadlines: map[string][]base.Z{ + "default": {{Message: t1, Score: t1Deadline}, {Message: t2, Score: t2Deadline}}, + "custom": {{Message: t4, Score: t4Deadline}}, + }, + retry: map[string][]base.Z{ + "default": {}, + "custom": {}, + }, + msg: t4, + processAt: now.Add(5 * time.Minute), + errMsg: errMsg, + wantActive: map[string][]*base.TaskMessage{ + "default": {t1, t2}, + "custom": {}, + }, + wantDeadlines: map[string][]base.Z{ + "default": {{Message: t1, Score: t1Deadline}, {Message: t2, Score: t2Deadline}}, + "custom": {}, + }, + getWantRetry: func(failedAt time.Time) map[string][]base.Z { + return map[string][]base.Z{ + "default": {}, + "custom": { + // Task message should include the error message but without incrementing the retry count. + {Message: h.TaskMessageWithError(*t4, errMsg, failedAt), Score: now.Add(5 * time.Minute).Unix()}, + }, + } + }, + }, + } + + for _, tc := range tests { + h.FlushDB(t, r.client) + h.SeedAllActiveQueues(t, r.client, tc.active) + h.SeedAllDeadlines(t, r.client, tc.deadlines) + h.SeedAllRetryQueues(t, r.client, tc.retry) + + callTime := time.Now() // time when method was called + err := r.Retry(tc.msg, tc.processAt, tc.errMsg, false /*isFailure*/) + if err != nil { + t.Errorf("(*RDB).Retry = %v, want nil", err) + continue + } + + for queue, want := range tc.wantActive { + gotActive := h.GetActiveMessages(t, r.client, queue) + if diff := cmp.Diff(want, gotActive, h.SortMsgOpt); diff != "" { + t.Errorf("mismatch found in %q; (-want, +got)\n%s", base.ActiveKey(queue), diff) + } + } + for queue, want := range tc.wantDeadlines { + gotDeadlines := h.GetDeadlinesEntries(t, r.client, queue) + if diff := cmp.Diff(want, gotDeadlines, h.SortZSetEntryOpt); diff != "" { + t.Errorf("mismatch found in %q; (-want, +got)\n%s", base.DeadlinesKey(queue), diff) + } + } + cmpOpts := []cmp.Option{ + h.SortZSetEntryOpt, + cmpopts.EquateApproxTime(5 * time.Second), // for LastFailedAt field + } + wantRetry := tc.getWantRetry(callTime) + for queue, want := range wantRetry { + gotRetry := h.GetRetryEntries(t, r.client, queue) + if diff := cmp.Diff(want, gotRetry, cmpOpts...); diff != "" { + t.Errorf("mismatch found in %q; (-want, +got)\n%s", base.RetryKey(queue), diff) + } + } + + // If isFailure is set to false, no stats should be recorded to avoid skewing the error rate. + processedKey := base.ProcessedKey(tc.msg.Queue, time.Now()) + gotProcessed := r.client.Get(processedKey).Val() + if gotProcessed != "" { + t.Errorf("GET %q = %q, want empty", processedKey, gotProcessed) + } + + // If isFailure is set to false, no stats should be recorded to avoid skewing the error rate. + failedKey := base.FailedKey(tc.msg.Queue, time.Now()) + gotFailed := r.client.Get(failedKey).Val() + if gotFailed != "" { + t.Errorf("GET %q = %q, want empty", failedKey, gotFailed) + } + } +} + func TestArchive(t *testing.T) { r := setup(t) defer r.Close() diff --git a/internal/testbroker/testbroker.go b/internal/testbroker/testbroker.go index 6ab74d3..d6c4738 100644 --- a/internal/testbroker/testbroker.go +++ b/internal/testbroker/testbroker.go @@ -108,13 +108,13 @@ func (tb *TestBroker) ScheduleUnique(msg *base.TaskMessage, processAt time.Time, return tb.real.ScheduleUnique(msg, processAt, ttl) } -func (tb *TestBroker) Retry(msg *base.TaskMessage, processAt time.Time, errMsg string) error { +func (tb *TestBroker) Retry(msg *base.TaskMessage, processAt time.Time, errMsg string, isFailure bool) error { tb.mu.Lock() defer tb.mu.Unlock() if tb.sleeping { return errRedisDown } - return tb.real.Retry(msg, processAt, errMsg) + return tb.real.Retry(msg, processAt, errMsg, isFailure) } func (tb *TestBroker) Archive(msg *base.TaskMessage, errMsg string) error { diff --git a/processor.go b/processor.go index c1857bf..5ede2d2 100644 --- a/processor.go +++ b/processor.go @@ -33,6 +33,7 @@ type processor struct { orderedQueues []string retryDelayFunc RetryDelayFunc + isFailureFunc func(error) bool errHandler ErrorHandler @@ -70,6 +71,7 @@ type processorParams struct { logger *log.Logger broker base.Broker retryDelayFunc RetryDelayFunc + isFailureFunc func(error) bool syncCh chan<- *syncRequest cancelations *base.Cancelations concurrency int @@ -94,6 +96,7 @@ func newProcessor(params processorParams) *processor { queueConfig: queues, orderedQueues: orderedQueues, retryDelayFunc: params.retryDelayFunc, + isFailureFunc: params.isFailureFunc, syncRequestCh: params.syncCh, cancelations: params.cancelations, errLogLimiter: rate.NewLimiter(rate.Every(3*time.Second), 1), @@ -197,7 +200,7 @@ func (p *processor) exec() { select { case <-ctx.Done(): // already canceled (e.g. deadline exceeded). - p.retryOrKill(ctx, msg, ctx.Err()) + p.retryOrArchive(ctx, msg, ctx.Err()) return default: } @@ -214,7 +217,7 @@ func (p *processor) exec() { p.requeue(msg) return case <-ctx.Done(): - p.retryOrKill(ctx, msg, ctx.Err()) + p.retryOrArchive(ctx, msg, ctx.Err()) return case resErr := <-resCh: // Note: One of three things should happen. @@ -222,7 +225,7 @@ func (p *processor) exec() { // 2) Retry -> Removes the message from Active & Adds the message to Retry // 3) Archive -> Removes the message from Active & Adds the message to archive if resErr != nil { - p.retryOrKill(ctx, msg, resErr) + p.retryOrArchive(ctx, msg, resErr) return } p.markAsDone(ctx, msg) @@ -263,22 +266,27 @@ func (p *processor) markAsDone(ctx context.Context, msg *base.TaskMessage) { // the task should not be retried and should be archived instead. var SkipRetry = errors.New("skip retry for the task") -func (p *processor) retryOrKill(ctx context.Context, msg *base.TaskMessage, err error) { +func (p *processor) retryOrArchive(ctx context.Context, msg *base.TaskMessage, err error) { if p.errHandler != nil { p.errHandler.HandleError(ctx, NewTask(msg.Type, msg.Payload), err) } + if !p.isFailureFunc(err) { + // retry the task without marking it as failed + p.retry(ctx, msg, err, false /*isFailure*/) + return + } if msg.Retried >= msg.Retry || errors.Is(err, SkipRetry) { p.logger.Warnf("Retry exhausted for task id=%s", msg.ID) p.archive(ctx, msg, err) } else { - p.retry(ctx, msg, err) + p.retry(ctx, msg, err, true /*isFailure*/) } } -func (p *processor) retry(ctx context.Context, msg *base.TaskMessage, e error) { +func (p *processor) retry(ctx context.Context, msg *base.TaskMessage, e error, isFailure bool) { d := p.retryDelayFunc(msg.Retried, e, NewTask(msg.Type, msg.Payload)) retryAt := time.Now().Add(d) - err := p.broker.Retry(msg, retryAt, e.Error()) + err := p.broker.Retry(msg, retryAt, e.Error(), isFailure) if err != nil { errMsg := fmt.Sprintf("Could not move task id=%s from %q to %q", msg.ID, base.ActiveKey(msg.Queue), base.RetryKey(msg.Queue)) deadline, ok := ctx.Deadline() @@ -288,7 +296,7 @@ func (p *processor) retry(ctx context.Context, msg *base.TaskMessage, e error) { p.logger.Warnf("%s; Will retry syncing", errMsg) p.syncRequestCh <- &syncRequest{ fn: func() error { - return p.broker.Retry(msg, retryAt, e.Error()) + return p.broker.Retry(msg, retryAt, e.Error(), isFailure) }, errMsg: errMsg, deadline: deadline, diff --git a/processor_test.go b/processor_test.go index 008eecf..d1f4944 100644 --- a/processor_test.go +++ b/processor_test.go @@ -98,6 +98,7 @@ func TestProcessorSuccessWithSingleQueue(t *testing.T) { logger: testLogger, broker: rdbClient, retryDelayFunc: DefaultRetryDelayFunc, + isFailureFunc: defaultIsFailureFunc, syncCh: syncCh, cancelations: base.NewCancelations(), concurrency: 10, @@ -190,6 +191,7 @@ func TestProcessorSuccessWithMultipleQueues(t *testing.T) { logger: testLogger, broker: rdbClient, retryDelayFunc: DefaultRetryDelayFunc, + isFailureFunc: defaultIsFailureFunc, syncCh: syncCh, cancelations: base.NewCancelations(), concurrency: 10, @@ -276,6 +278,7 @@ func TestProcessTasksWithLargeNumberInPayload(t *testing.T) { logger: testLogger, broker: rdbClient, retryDelayFunc: DefaultRetryDelayFunc, + isFailureFunc: defaultIsFailureFunc, syncCh: syncCh, cancelations: base.NewCancelations(), concurrency: 10, @@ -395,6 +398,7 @@ func TestProcessorRetry(t *testing.T) { logger: testLogger, broker: rdbClient, retryDelayFunc: delayFunc, + isFailureFunc: defaultIsFailureFunc, syncCh: nil, cancelations: base.NewCancelations(), concurrency: 10, @@ -486,6 +490,7 @@ func TestProcessorQueues(t *testing.T) { logger: testLogger, broker: nil, retryDelayFunc: DefaultRetryDelayFunc, + isFailureFunc: defaultIsFailureFunc, syncCh: nil, cancelations: base.NewCancelations(), concurrency: 10, @@ -577,6 +582,7 @@ func TestProcessorWithStrictPriority(t *testing.T) { logger: testLogger, broker: rdbClient, retryDelayFunc: DefaultRetryDelayFunc, + isFailureFunc: defaultIsFailureFunc, syncCh: syncCh, cancelations: base.NewCancelations(), concurrency: 1, // Set concurrency to 1 to make sure tasks are processed one at a time. diff --git a/recoverer.go b/recoverer.go index 7aae69c..4165e43 100644 --- a/recoverer.go +++ b/recoverer.go @@ -5,7 +5,7 @@ package asynq import ( - "fmt" + "context" "sync" "time" @@ -17,6 +17,7 @@ type recoverer struct { logger *log.Logger broker base.Broker retryDelayFunc RetryDelayFunc + isFailureFunc func(error) bool // channel to communicate back to the long running "recoverer" goroutine. done chan struct{} @@ -34,6 +35,7 @@ type recovererParams struct { queues []string interval time.Duration retryDelayFunc RetryDelayFunc + isFailureFunc func(error) bool } func newRecoverer(params recovererParams) *recoverer { @@ -44,6 +46,7 @@ func newRecoverer(params recovererParams) *recoverer { queues: params.queues, interval: params.interval, retryDelayFunc: params.retryDelayFunc, + isFailureFunc: params.isFailureFunc, } } @@ -81,26 +84,25 @@ func (r *recoverer) recover() { r.logger.Warn("recoverer: could not list deadline exceeded tasks") return } - const errMsg = "deadline exceeded" for _, msg := range msgs { if msg.Retried >= msg.Retry { - r.archive(msg, errMsg) + r.archive(msg, context.DeadlineExceeded) } else { - r.retry(msg, errMsg) + r.retry(msg, context.DeadlineExceeded) } } } -func (r *recoverer) retry(msg *base.TaskMessage, errMsg string) { - delay := r.retryDelayFunc(msg.Retried, fmt.Errorf(errMsg), NewTask(msg.Type, msg.Payload)) +func (r *recoverer) retry(msg *base.TaskMessage, err error) { + delay := r.retryDelayFunc(msg.Retried, err, NewTask(msg.Type, msg.Payload)) retryAt := time.Now().Add(delay) - if err := r.broker.Retry(msg, retryAt, errMsg); err != nil { + if err := r.broker.Retry(msg, retryAt, err.Error(), r.isFailureFunc(err)); err != nil { r.logger.Warnf("recoverer: could not retry deadline exceeded task: %v", err) } } -func (r *recoverer) archive(msg *base.TaskMessage, errMsg string) { - if err := r.broker.Archive(msg, errMsg); err != nil { +func (r *recoverer) archive(msg *base.TaskMessage, err error) { + if err := r.broker.Archive(msg, err.Error()); err != nil { r.logger.Warnf("recoverer: could not move task to archive: %v", err) } } diff --git a/recoverer_test.go b/recoverer_test.go index 0a9fdb6..a62f7b7 100644 --- a/recoverer_test.go +++ b/recoverer_test.go @@ -234,6 +234,7 @@ func TestRecoverer(t *testing.T) { queues: []string{"default", "critical"}, interval: 1 * time.Second, retryDelayFunc: func(n int, err error, task *Task) time.Duration { return 30 * time.Second }, + isFailureFunc: defaultIsFailureFunc, }) var wg sync.WaitGroup @@ -259,7 +260,7 @@ func TestRecoverer(t *testing.T) { gotRetry := h.GetRetryMessages(t, r, qname) var wantRetry []*base.TaskMessage // Note: construct message here since `LastFailedAt` is relative to each test run for _, msg := range msgs { - wantRetry = append(wantRetry, h.TaskMessageAfterRetry(*msg, "deadline exceeded", runTime)) + wantRetry = append(wantRetry, h.TaskMessageAfterRetry(*msg, "context deadline exceeded", runTime)) } if diff := cmp.Diff(wantRetry, gotRetry, h.SortMsgOpt, cmpOpt); diff != "" { t.Errorf("%s; mismatch found in %q: (-want, +got)\n%s", tc.desc, base.RetryKey(qname), diff) @@ -269,7 +270,7 @@ func TestRecoverer(t *testing.T) { gotArchived := h.GetArchivedMessages(t, r, qname) var wantArchived []*base.TaskMessage for _, msg := range msgs { - wantArchived = append(wantArchived, h.TaskMessageWithError(*msg, "deadline exceeded", runTime)) + wantArchived = append(wantArchived, h.TaskMessageWithError(*msg, "context deadline exceeded", runTime)) } if diff := cmp.Diff(wantArchived, gotArchived, h.SortMsgOpt, cmpOpt); diff != "" { t.Errorf("%s; mismatch found in %q: (-want, +got)\n%s", tc.desc, base.ArchivedKey(qname), diff) diff --git a/server.go b/server.go index 93d3c6a..36b9dd5 100644 --- a/server.go +++ b/server.go @@ -64,6 +64,14 @@ type Config struct { // By default, it uses exponential backoff algorithm to calculate the delay. RetryDelayFunc RetryDelayFunc + // Predicate function to determine whether the error returned from Handler is a failure. + // If the function returns false, Server will not increment the retried counter for the task, + // and Server won't record the queue stats (processed and failed stats) to avoid skewing the error + // rate of the queue. + // + // By default, if the given error is non-nil the function returns true. + IsFailure func(error) bool + // List of queues to process with given priority value. Keys are the names of the // queues and values are associated priority value. // @@ -268,6 +276,8 @@ func DefaultRetryDelayFunc(n int, e error, t *Task) time.Duration { return time.Duration(s) * time.Second } +func defaultIsFailureFunc(err error) bool { return err != nil } + var defaultQueueConfig = map[string]int{ base.DefaultQueueName: 1, } @@ -293,6 +303,10 @@ func NewServer(r RedisConnOpt, cfg Config) *Server { if delayFunc == nil { delayFunc = DefaultRetryDelayFunc } + isFailureFunc := cfg.IsFailure + if isFailureFunc == nil { + isFailureFunc = defaultIsFailureFunc + } queues := make(map[string]int) for qname, p := range cfg.Queues { if err := base.ValidateQueueName(qname); err != nil { @@ -362,6 +376,7 @@ func NewServer(r RedisConnOpt, cfg Config) *Server { logger: logger, broker: rdb, retryDelayFunc: delayFunc, + isFailureFunc: isFailureFunc, syncCh: syncCh, cancelations: cancels, concurrency: n, @@ -376,6 +391,7 @@ func NewServer(r RedisConnOpt, cfg Config) *Server { logger: logger, broker: rdb, retryDelayFunc: delayFunc, + isFailureFunc: isFailureFunc, queues: qnames, interval: 1 * time.Minute, })