2
0
mirror of https://github.com/hibiken/asynq.git synced 2024-12-25 23:32:17 +08:00
asynq/server.go

631 lines
18 KiB
Go
Raw Normal View History

2020-01-03 10:13:16 +08:00
// Copyright 2020 Kentaro Hibino. All rights reserved.
// Use of this source code is governed by a MIT license
// that can be found in the LICENSE file.
2019-11-24 07:22:43 +08:00
package asynq
import (
"context"
"errors"
2019-11-27 22:33:04 +08:00
"fmt"
"math"
"math/rand"
"runtime"
"strings"
2019-11-24 07:22:43 +08:00
"sync"
"time"
2019-12-04 13:01:26 +08:00
2021-09-02 20:56:02 +08:00
"github.com/go-redis/redis/v8"
2020-01-06 23:15:59 +08:00
"github.com/hibiken/asynq/internal/base"
2020-03-09 22:11:16 +08:00
"github.com/hibiken/asynq/internal/log"
2019-12-04 13:01:26 +08:00
"github.com/hibiken/asynq/internal/rdb"
2019-11-24 07:22:43 +08:00
)
// Server is responsible for task processing and task lifecycle management.
2019-12-07 14:00:09 +08:00
//
2020-04-13 23:14:55 +08:00
// Server pulls tasks off queues and processes them.
// If the processing of a task is unsuccessful, server will schedule it for a retry.
//
2020-04-13 23:14:55 +08:00
// A task will be retried until either the task gets processed successfully
// or until it reaches its max retry count.
2019-12-07 14:00:09 +08:00
//
// If a task exhausts its retries, it will be moved to the archive and
// will be kept in the archive set.
// Note that the archive size is finite and once it reaches its max size,
// oldest tasks in the archive will be deleted.
2020-04-12 23:16:42 +08:00
type Server struct {
2020-05-06 13:10:11 +08:00
logger *log.Logger
2020-03-09 22:11:16 +08:00
2020-04-18 22:55:10 +08:00
broker base.Broker
2020-04-13 02:41:50 +08:00
state *serverState
2020-05-19 11:47:35 +08:00
2020-04-13 02:41:50 +08:00
// wait group to wait for all goroutines to finish.
wg sync.WaitGroup
forwarder *forwarder
processor *processor
syncer *syncer
heartbeater *heartbeater
subscriber *subscriber
recoverer *recoverer
healthchecker *healthchecker
janitor *janitor
2019-11-24 07:22:43 +08:00
}
// serverState pairs the server's lifecycle state with the mutex guarding it.
// Server.start, Server.Shutdown and Server.Stop all lock mu before reading
// or changing value.
type serverState struct {
	mu    sync.Mutex       // guards value
	value serverStateValue // current lifecycle state
}
// serverStateValue enumerates the lifecycle states of a Server.
type serverStateValue int

const (
	// srvStateNew represents a new server. A server begins in this state
	// and transitions to srvStateActive when Start or Run is called.
	srvStateNew serverStateValue = iota

	// srvStateActive indicates the server is up and active.
	srvStateActive

	// srvStateStopped indicates the server is up but no longer processing new tasks.
	srvStateStopped

	// srvStateClosed indicates the server has been shutdown.
	srvStateClosed
)

// serverStates holds the display name of every state, indexed by its
// serverStateValue.
var serverStates = []string{
	"new",
	"active",
	"stopped",
	"closed",
}

// String returns the display name of the state, or "unknown status" for a
// value outside the declared range.
func (s serverStateValue) String() string {
	if s < srvStateNew || s > srvStateClosed {
		return "unknown status"
	}
	return serverStates[s]
}
2020-04-12 23:16:42 +08:00
// Config specifies the server's background-task processing behavior.
type Config struct {
2020-01-17 11:50:45 +08:00
// Maximum number of concurrent processing of tasks.
//
// If set to a zero or negative value, NewServer will overwrite the value
2021-12-14 02:39:42 +08:00
// to the number of CPUs usable by the current process.
Concurrency int
2022-01-28 23:51:34 +08:00
// BaseContext optionally specifies a function that returns the base context for Handler invocations on this server.
//
2022-01-25 17:12:02 +08:00
// If BaseContext is nil, the default is context.Background().
2022-01-28 23:51:34 +08:00
// If this is defined, then it MUST return a non-nil context
2022-02-10 17:56:34 +08:00
BaseContext func() context.Context
2022-01-25 17:12:02 +08:00
// Function to calculate retry delay for a failed task.
//
// By default, it uses exponential backoff algorithm to calculate the delay.
2021-01-13 03:40:26 +08:00
RetryDelayFunc RetryDelayFunc
2020-01-06 23:15:59 +08:00
// Predicate function to determine whether the error returned from Handler is a failure.
// If the function returns false, Server will not increment the retried counter for the task,
// and Server won't record the queue stats (processed and failed stats) to avoid skewing the error
// rate of the queue.
//
// By default, if the given error is non-nil the function returns true.
IsFailure func(error) bool
// List of queues to process with given priority value. Keys are the names of the
// queues and values are associated priority value.
2020-01-06 23:15:59 +08:00
//
2020-04-12 23:16:42 +08:00
// If set to nil or not specified, the server will process only the "default" queue.
2020-01-06 23:15:59 +08:00
//
2020-01-08 13:53:38 +08:00
// Priority is treated as follows to avoid starving low priority queues.
//
2020-01-06 23:15:59 +08:00
// Example:
2020-09-15 12:45:30 +08:00
//
// Queues: map[string]int{
// "critical": 6,
// "default": 3,
// "low": 1,
// }
//
2020-01-08 13:53:38 +08:00
// With the above config and given that all queues are not empty, the tasks
2020-01-06 23:15:59 +08:00
// in "critical", "default", "low" should be processed 60%, 30%, 10% of
// the time respectively.
//
// If a queue has a zero or negative priority value, the queue will be ignored.
Queues map[string]int
2020-01-12 23:46:51 +08:00
// StrictPriority indicates whether the queue priority should be treated strictly.
//
// If set to true, tasks in the queue with the highest priority is processed first.
// The tasks in lower priority queues are processed only when those queues with
// higher priorities are empty.
StrictPriority bool
// ErrorHandler handles errors returned by the task handler.
//
// HandleError is invoked only if the task handler returns a non-nil error.
//
// Example:
2020-09-15 12:45:30 +08:00
//
// func reportError(ctx context, task *asynq.Task, err error) {
// retried, _ := asynq.GetRetryCount(ctx)
// maxRetry, _ := asynq.GetMaxRetry(ctx)
// if retried >= maxRetry {
// err = fmt.Errorf("retry exhausted for task %s: %w", task.Type, err)
// }
// errorReportingService.Notify(err)
// })
//
// ErrorHandler: asynq.ErrorHandlerFunc(reportError)
ErrorHandler ErrorHandler
2020-04-12 23:16:42 +08:00
// Logger specifies the logger used by the server instance.
//
// If unset, default logger is used.
Logger Logger
// LogLevel specifies the minimum log level to enable.
//
2020-05-11 21:55:04 +08:00
// If unset, InfoLevel is used by default.
LogLevel LogLevel
// ShutdownTimeout specifies the duration to wait to let workers finish their tasks
// before forcing them to abort when stopping the server.
//
// If unset or zero, default timeout of 8 seconds is used.
ShutdownTimeout time.Duration
// HealthCheckFunc is called periodically with any errors encountered during ping to the
// connected redis server.
HealthCheckFunc func(error)
// HealthCheckInterval specifies the interval between healthchecks.
//
// If unset or zero, the interval is set to 15 seconds.
HealthCheckInterval time.Duration
// DelayedTaskCheckInterval specifies the interval between checks run on 'scheduled' and 'retry'
// tasks, and forwarding them to 'pending' state if they are ready to be processed.
//
// If unset or zero, the interval is set to 5 seconds.
DelayedTaskCheckInterval time.Duration
}
// An ErrorHandler handles an error occured during task processing.
type ErrorHandler interface {
2020-07-04 20:24:47 +08:00
HandleError(ctx context.Context, task *Task, err error)
}
// The ErrorHandlerFunc type is an adapter to allow the use of ordinary functions as a ErrorHandler.
// If f is a function with the appropriate signature, ErrorHandlerFunc(f) is a ErrorHandler that calls f.
2020-07-04 20:24:47 +08:00
type ErrorHandlerFunc func(ctx context.Context, task *Task, err error)
2020-07-04 20:24:47 +08:00
// HandleError calls fn(ctx, task, err)
func (fn ErrorHandlerFunc) HandleError(ctx context.Context, task *Task, err error) {
fn(ctx, task, err)
}
2021-01-13 03:40:26 +08:00
// RetryDelayFunc calculates the retry delay duration for a failed task given
// the retry count, error, and the task.
//
// n is the number of times the task has been retried.
// e is the error returned by the task handler.
// t is the task in question.
//
// A function of this type can be supplied through Config.RetryDelayFunc;
// NewServer falls back to DefaultRetryDelayFunc when that field is nil.
type RetryDelayFunc func(n int, e error, t *Task) time.Duration
// Logger supports logging at various log levels.
//
// A value implementing Logger can be supplied through Config.Logger.
type Logger interface {
	// Debug logs a message at Debug level.
	Debug(args ...interface{})

	// Info logs a message at Info level.
	Info(args ...interface{})

	// Warn logs a message at Warning level.
	Warn(args ...interface{})

	// Error logs a message at Error level.
	Error(args ...interface{})

	// Fatal logs a message at Fatal level
	// and process will exit with status set to 1.
	Fatal(args ...interface{})
}
// LogLevel represents logging level.
//
// It satisfies flag.Value interface.
type LogLevel int32

const (
	// Note: reserving value zero to differentiate unspecified case.
	level_unspecified LogLevel = iota

	// DebugLevel is the lowest level of logging.
	// Debug logs are intended for debugging and development purposes.
	DebugLevel

	// InfoLevel is used for general informational log messages.
	InfoLevel

	// WarnLevel is used for undesired but relatively expected events,
	// which may indicate a problem.
	WarnLevel

	// ErrorLevel is used for undesired and unexpected events that
	// the program can recover from.
	ErrorLevel

	// FatalLevel is used for undesired and unexpected events that
	// the program cannot recover from.
	FatalLevel
)
// String is part of the flag.Value interface.
//
// It returns the lower-case name of the level. A nil receiver and the
// unspecified (zero) level both yield "unspecified": the flag package may
// call String on a zero-valued receiver (for example when registering the
// flag or printing defaults), so those two cases must not panic.
//
// Any other value outside the declared levels is a programming error and
// still panics.
func (l *LogLevel) String() string {
	if l == nil {
		return "unspecified"
	}
	switch *l {
	case level_unspecified:
		return "unspecified"
	case DebugLevel:
		return "debug"
	case InfoLevel:
		return "info"
	case WarnLevel:
		return "warn"
	case ErrorLevel:
		return "error"
	case FatalLevel:
		return "fatal"
	}
	panic(fmt.Sprintf("asynq: unexpected log level: %v", *l))
}
// Set is part of the flag.Value interface.
//
// It matches val case-insensitively against the level names ("warning" is
// accepted as an alias of "warn") and stores the matching level in *l.
// An unrecognised name leaves *l untouched and returns an error.
func (l *LogLevel) Set(val string) error {
	var parsed LogLevel
	switch name := strings.ToLower(val); name {
	case "debug":
		parsed = DebugLevel
	case "info":
		parsed = InfoLevel
	case "warn", "warning":
		parsed = WarnLevel
	case "error":
		parsed = ErrorLevel
	case "fatal":
		parsed = FatalLevel
	default:
		return fmt.Errorf("asynq: unsupported log level %q", val)
	}
	*l = parsed
	return nil
}
2020-05-11 21:55:04 +08:00
// toInternalLogLevel maps a public LogLevel onto the corresponding level of
// the internal log package. It panics for any value that is not one of the
// five named levels, including the unspecified zero value.
func toInternalLogLevel(l LogLevel) log.Level {
	var internal log.Level
	switch l {
	case DebugLevel:
		internal = log.DebugLevel
	case InfoLevel:
		internal = log.InfoLevel
	case WarnLevel:
		internal = log.WarnLevel
	case ErrorLevel:
		internal = log.ErrorLevel
	case FatalLevel:
		internal = log.FatalLevel
	default:
		panic(fmt.Sprintf("asynq: unexpected log level: %v", l))
	}
	return internal
}
2021-01-13 03:40:26 +08:00
// DefaultRetryDelayFunc is the default RetryDelayFunc used if one is not specified in Config.
// It uses exponential back-off strategy to calculate the retry delay.
func DefaultRetryDelayFunc(n int, e error, t *Task) time.Duration {
r := rand.New(rand.NewSource(time.Now().UnixNano()))
2021-01-13 03:40:26 +08:00
// Formula taken from https://github.com/mperham/sidekiq.
s := int(math.Pow(float64(n), 4)) + 15 + (r.Intn(30) * (n + 1))
return time.Duration(s) * time.Second
}
// defaultIsFailureFunc is the predicate NewServer uses when Config.IsFailure
// is nil: every non-nil error counts as a failure.
func defaultIsFailureFunc(err error) bool {
	failed := err != nil
	return failed
}
var defaultQueueConfig = map[string]int{
2020-01-06 23:15:59 +08:00
base.DefaultQueueName: 1,
}
// Defaults that NewServer substitutes when the matching Config field is unset.
const (
	// defaultShutdownTimeout replaces an unset Config.ShutdownTimeout.
	defaultShutdownTimeout = 8 * time.Second
	// defaultHealthCheckInterval replaces an unset Config.HealthCheckInterval.
	defaultHealthCheckInterval = 15 * time.Second
	// defaultDelayedTaskCheckInterval replaces an unset Config.DelayedTaskCheckInterval.
	defaultDelayedTaskCheckInterval = 5 * time.Second
)
2020-04-12 23:16:42 +08:00
// NewServer returns a new Server given a redis connection option
// and server configuration.
2020-04-12 23:16:42 +08:00
func NewServer(r RedisConnOpt, cfg Config) *Server {
2021-01-29 22:37:35 +08:00
c, ok := r.MakeRedisClient().(redis.UniversalClient)
if !ok {
panic(fmt.Sprintf("asynq: unsupported RedisConnOpt type %T", r))
}
2022-01-25 17:12:02 +08:00
baseCtxFn := cfg.BaseContext
if baseCtxFn == nil {
baseCtxFn = context.Background
}
n := cfg.Concurrency
if n < 1 {
n = runtime.NumCPU()
}
delayFunc := cfg.RetryDelayFunc
if delayFunc == nil {
2021-01-13 03:40:26 +08:00
delayFunc = DefaultRetryDelayFunc
}
isFailureFunc := cfg.IsFailure
if isFailureFunc == nil {
isFailureFunc = defaultIsFailureFunc
}
queues := make(map[string]int)
for qname, p := range cfg.Queues {
if err := base.ValidateQueueName(qname); err != nil {
continue // ignore invalid queue names
}
if p > 0 {
queues[qname] = p
}
}
if len(queues) == 0 {
2020-01-06 23:15:59 +08:00
queues = defaultQueueConfig
}
var qnames []string
2021-01-13 03:40:26 +08:00
for q := range queues {
qnames = append(qnames, q)
}
shutdownTimeout := cfg.ShutdownTimeout
if shutdownTimeout == 0 {
shutdownTimeout = defaultShutdownTimeout
}
healthcheckInterval := cfg.HealthCheckInterval
if healthcheckInterval == 0 {
healthcheckInterval = defaultHealthCheckInterval
}
logger := log.NewLogger(cfg.Logger)
2020-05-11 21:55:04 +08:00
loglevel := cfg.LogLevel
if loglevel == level_unspecified {
loglevel = InfoLevel
}
logger.SetLevel(toInternalLogLevel(loglevel))
2021-01-29 22:37:35 +08:00
rdb := rdb.NewRDB(c)
2021-01-28 07:55:43 +08:00
starting := make(chan *workerInfo)
finished := make(chan *base.TaskMessage)
2020-02-18 22:57:39 +08:00
syncCh := make(chan *syncRequest)
srvState := &serverState{value: srvStateNew}
2020-02-18 22:57:39 +08:00
cancels := base.NewCancelations()
syncer := newSyncer(syncerParams{
logger: logger,
requestsCh: syncCh,
interval: 5 * time.Second,
})
heartbeater := newHeartbeater(heartbeaterParams{
2020-05-19 11:47:35 +08:00
logger: logger,
broker: rdb,
interval: 5 * time.Second,
concurrency: n,
queues: queues,
strictPriority: cfg.StrictPriority,
state: srvState,
2020-05-19 11:47:35 +08:00
starting: starting,
finished: finished,
})
delayedTaskCheckInterval := cfg.DelayedTaskCheckInterval
if delayedTaskCheckInterval == 0 {
delayedTaskCheckInterval = defaultDelayedTaskCheckInterval
}
forwarder := newForwarder(forwarderParams{
logger: logger,
broker: rdb,
queues: qnames,
interval: delayedTaskCheckInterval,
})
subscriber := newSubscriber(subscriberParams{
logger: logger,
broker: rdb,
cancelations: cancels,
})
processor := newProcessor(processorParams{
logger: logger,
2020-04-17 21:56:44 +08:00
broker: rdb,
retryDelayFunc: delayFunc,
2022-01-25 17:12:02 +08:00
baseCtxFn: baseCtxFn,
isFailureFunc: isFailureFunc,
syncCh: syncCh,
cancelations: cancels,
2020-05-19 11:47:35 +08:00
concurrency: n,
queues: queues,
strictPriority: cfg.StrictPriority,
errHandler: cfg.ErrorHandler,
shutdownTimeout: shutdownTimeout,
2020-05-19 11:47:35 +08:00
starting: starting,
finished: finished,
})
2020-06-21 22:05:57 +08:00
recoverer := newRecoverer(recovererParams{
logger: logger,
broker: rdb,
retryDelayFunc: delayFunc,
isFailureFunc: isFailureFunc,
2020-08-10 21:10:14 +08:00
queues: qnames,
2020-06-21 22:05:57 +08:00
interval: 1 * time.Minute,
})
healthchecker := newHealthChecker(healthcheckerParams{
logger: logger,
broker: rdb,
interval: healthcheckInterval,
healthcheckFunc: cfg.HealthCheckFunc,
})
janitor := newJanitor(janitorParams{
logger: logger,
broker: rdb,
queues: qnames,
interval: 8 * time.Second,
})
2020-04-12 23:16:42 +08:00
return &Server{
logger: logger,
broker: rdb,
state: srvState,
forwarder: forwarder,
processor: processor,
syncer: syncer,
heartbeater: heartbeater,
subscriber: subscriber,
recoverer: recoverer,
healthchecker: healthchecker,
janitor: janitor,
2019-11-24 07:22:43 +08:00
}
}
// A Handler processes tasks.
//
// ProcessTask reports success by returning nil.
//
// When ProcessTask returns a non-nil error or panics, the task is retried
// after a delay as long as retry-count remains; otherwise the task is
// archived.
//
// The exception is the SkipRetry error: if the returned error is SkipRetry,
// or wraps it, no retry is attempted and the task is archived immediately.
type Handler interface {
	ProcessTask(ctx context.Context, task *Task) error
}

// HandlerFunc adapts an ordinary function to the Handler interface.
// For a function f with the appropriate signature, HandlerFunc(f) is a
// Handler whose ProcessTask calls f.
type HandlerFunc func(context.Context, *Task) error

// ProcessTask calls fn(ctx, task) and returns its result.
func (fn HandlerFunc) ProcessTask(ctx context.Context, task *Task) error {
	err := fn(ctx, task)
	return err
}
2019-11-24 07:22:43 +08:00
// ErrServerClosed indicates that the operation is now illegal because the
// server has been shut down. Start and Run return it when called on a server
// whose state is closed.
var ErrServerClosed = errors.New("asynq: Server closed")
// Run starts the task processing and blocks until
2019-11-24 07:44:42 +08:00
// an os signal to exit the program is received. Once it receives
2020-04-13 23:14:55 +08:00
// a signal, it gracefully shuts down all active workers and other
2019-11-24 07:44:42 +08:00
// goroutines to process the tasks.
2020-04-13 23:14:55 +08:00
//
// Run returns any error encountered at server startup time.
// If the server has already been shutdown, ErrServerClosed is returned.
func (srv *Server) Run(handler Handler) error {
if err := srv.Start(handler); err != nil {
return err
}
2020-04-12 23:16:42 +08:00
srv.waitForSignals()
srv.Shutdown()
return nil
2019-11-24 07:44:42 +08:00
}
2020-04-13 23:14:55 +08:00
// Start starts the worker server. Once the server has started,
// it pulls tasks off queues and starts a worker goroutine for each task
// and then call Handler to process it.
// Tasks are processed concurrently by the workers up to the number of
// concurrency specified in Config.Concurrency.
2020-04-13 23:14:55 +08:00
//
// Start returns any error encountered at server startup time.
// If the server has already been shutdown, ErrServerClosed is returned.
func (srv *Server) Start(handler Handler) error {
2020-04-15 00:01:22 +08:00
if handler == nil {
return fmt.Errorf("asynq: server cannot run with nil handler")
}
2020-04-12 23:16:42 +08:00
srv.processor.handler = handler
2019-11-24 07:22:43 +08:00
if err := srv.start(); err != nil {
return err
}
srv.logger.Info("Starting processing")
2020-04-12 23:16:42 +08:00
srv.heartbeater.start(&srv.wg)
srv.healthchecker.start(&srv.wg)
2020-04-12 23:16:42 +08:00
srv.subscriber.start(&srv.wg)
srv.syncer.start(&srv.wg)
2020-06-21 22:05:57 +08:00
srv.recoverer.start(&srv.wg)
srv.forwarder.start(&srv.wg)
2020-04-12 23:16:42 +08:00
srv.processor.start(&srv.wg)
srv.janitor.start(&srv.wg)
return nil
2019-11-24 07:22:43 +08:00
}
// start moves the server from the new state to the active state.
// It returns an error, and leaves the state unchanged, when the server is
// already active, stopped or closed.
func (srv *Server) start() error {
	st := srv.state
	st.mu.Lock()
	defer st.mu.Unlock()

	if st.value == srvStateActive {
		return fmt.Errorf("asynq: the server is already running")
	}
	if st.value == srvStateStopped {
		return fmt.Errorf("asynq: the server is in the stopped state. Waiting for shutdown.")
	}
	if st.value == srvStateClosed {
		return ErrServerClosed
	}
	st.value = srvStateActive
	return nil
}
// Shutdown gracefully shuts down the server.
2020-04-13 23:14:55 +08:00
// It gracefully closes all active workers. The server will wait for
// active workers to finish processing tasks for duration specified in Config.ShutdownTimeout.
// If worker didn't finish processing a task during the timeout, the task will be pushed back to Redis.
func (srv *Server) Shutdown() {
srv.state.mu.Lock()
if srv.state.value == srvStateNew || srv.state.value == srvStateClosed {
srv.state.mu.Unlock()
// server is not running, do nothing and return.
2019-11-24 07:22:43 +08:00
return
}
srv.state.value = srvStateClosed
srv.state.mu.Unlock()
2019-11-24 07:22:43 +08:00
srv.logger.Info("Starting graceful shutdown")
// Note: The order of shutdown is important.
2020-02-17 06:42:21 +08:00
// Sender goroutines should be terminated before the receiver goroutines.
2020-02-18 22:57:39 +08:00
// processor -> syncer (via syncCh)
2020-05-19 11:47:35 +08:00
// processor -> heartbeater (via starting, finished channels)
srv.forwarder.shutdown()
srv.processor.shutdown()
srv.recoverer.shutdown()
srv.syncer.shutdown()
srv.subscriber.shutdown()
srv.janitor.shutdown()
srv.healthchecker.shutdown()
srv.heartbeater.shutdown()
2020-04-12 23:16:42 +08:00
srv.wg.Wait()
2020-02-16 15:14:30 +08:00
2020-04-17 21:56:44 +08:00
srv.broker.Close()
srv.logger.Info("Exiting")
2019-11-24 07:22:43 +08:00
}
// Stop signals the server to stop pulling new tasks off queues.
// Stop can be used before shutting down the server to ensure that all
// currently active tasks are processed before server shutdown.
//
// Stop does not shutdown the server, make sure to call Shutdown before exit.
func (srv *Server) Stop() {
srv.state.mu.Lock()
if srv.state.value != srvStateActive {
// Invalid calll to Stop, server can only go from Active state to Stopped state.
srv.state.mu.Unlock()
return
}
srv.state.value = srvStateStopped
srv.state.mu.Unlock()
2020-05-16 22:12:08 +08:00
srv.logger.Info("Stopping processor")
srv.processor.stop()
2020-05-16 22:12:08 +08:00
srv.logger.Info("Processor stopped")
}