diff --git a/docker/Dockerfile.local b/docker/Dockerfile.local new file mode 100644 index 00000000000..0c2961a04ef --- /dev/null +++ b/docker/Dockerfile.local @@ -0,0 +1,44 @@ +FROM golang:1.22-alpine AS builder + +ARG VERSION="nightly" + +RUN apk add --update git + +WORKDIR /src + +# Copy go.mod and go.sum first (if they exist) to cache dependencies +COPY go.mod go.sum* ./ +RUN go mod download + +# Clone natscli separately to cache it +RUN mkdir -p src/github.com/nats-io && \ + cd src/github.com/nats-io/ && \ + git clone https://github.com/nats-io/natscli.git && \ + cd natscli/nats && \ + go mod download + +# Install nsc separately to cache it +RUN go install github.com/nats-io/nsc/v2@latest + +# Now copy the rest of the source code and build +COPY . . + +RUN go build -ldflags "-w -X main.version=${VERSION}" -o /nats-server + +# Build nats cli +RUN cd src/github.com/nats-io/natscli/nats && \ + go build -ldflags "-w -X main.version=${VERSION}" -o /nats + +FROM alpine:latest + +RUN apk add --update ca-certificates && mkdir -p /nats/bin && mkdir /nats/conf + +COPY docker/nats-server.conf /nats/conf/nats-server.conf +COPY --from=builder /nats-server /bin/nats-server +COPY --from=builder /nats /bin/nats +COPY --from=builder /go/bin/nsc /bin/nsc + +EXPOSE 4222 8222 6222 5222 + +ENTRYPOINT ["/bin/nats-server"] +CMD ["-c", "/nats/conf/nats-server.conf"] diff --git a/go.mod b/go.mod index 75c7f2e9473..7d77d8117cd 100644 --- a/go.mod +++ b/go.mod @@ -17,3 +17,9 @@ require ( golang.org/x/sys v0.28.0 golang.org/x/time v0.8.0 ) + +require ( + github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect + github.com/redis/go-redis/v9 v9.7.0 // indirect +) diff --git a/go.sum b/go.sum index 3f2a76d9cc0..dc0f2dce272 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,9 @@ +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/google/go-tpm v0.9.3 h1:+yx0/anQuGzi+ssRqeD6WpXjW2L/V0dItUayO0i9sRc= github.com/google/go-tpm v0.9.3/go.mod h1:h9jEsEECg7gtLis0upRBQU+GhYVH6jMjrFxI8u6bVUY= github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= @@ -18,6 +22,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= +github.com/redis/go-redis/v9 v9.7.0 h1:HhLSs+B6O021gwzl+locl0zEDnyNkxMtf/Z3NNBMa9E= +github.com/redis/go-redis/v9 v9.7.0/go.mod h1:f6zhXITC7JUJIlPEiBOTXxJgPLdZcA93GewI7inzyWw= github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= diff --git a/server/accounts.go b/server/accounts.go index 
a58e5b595d0..6d3becf072f 100644 --- a/server/accounts.go +++ b/server/accounts.go @@ -52,62 +52,56 @@ var maxSubLimitReportThreshold = defaultMaxSubLimitReportThreshold type Account struct { stats gwReplyMapping - Name string - LogicalName string - Nkey string - Issuer string - claimJWT string - updated time.Time - mu sync.RWMutex - sqmu sync.Mutex - sl *Sublist - ic *client - sq *sendq - isid uint64 - etmr *time.Timer - ctmr *time.Timer - strack map[string]sconns - nrclients int32 - sysclients int32 - nleafs int32 - nrleafs int32 - clients map[*client]struct{} - rm map[string]int32 - lqws map[string]int32 - usersRevoked map[string]int64 - mappings []*mapping - hasMapped atomic.Bool - lmu sync.RWMutex - lleafs []*client - leafClusters map[string]uint64 - imports importMap - exports exportMap - js *jsAccount - jsLimits map[string]JetStreamAccountLimits + Name string `json:"name"` + LogicalName string `json:"logical_name"` + Nkey string `json:"nkey"` + Issuer string `json:"issuer"` + claimJWT string `json:"claim_jwt"` + updated time.Time `json:"updated"` + mu sync.RWMutex `json:"-"` + sqmu sync.Mutex `json:"-"` + sl *Sublist `json:"-"` + ic *client `json:"-"` + sq *sendq `json:"-"` + isid uint64 `json:"isid"` + etmr *time.Timer `json:"-"` + ctmr *time.Timer `json:"-"` + strack map[string]sconns `json:"-"` + nrclients int32 `json:"nr_clients"` + sysclients int32 `json:"sys_clients"` + nleafs int32 `json:"nleafs"` + nrleafs int32 `json:"nr_leafs"` + clients map[*client]struct{} `json:"-"` + rm map[string]int32 `json:"-"` + lqws map[string]int32 `json:"-"` + usersRevoked map[string]int64 `json:"-"` + mappings []*mapping `json:"-"` + hasMapped atomic.Bool `json:"-"` + lmu sync.RWMutex `json:"-"` + lleafs []*client `json:"-"` + leafClusters map[string]uint64 `json:"-"` + imports importMap `json:"-"` + exports exportMap `json:"-"` + js *jsAccount `json:"-"` + jsLimits map[string]JetStreamAccountLimits `json:"-"` limits - expired atomic.Bool - incomplete bool - signingKeys map[string]jwt.Scope - extAuth *jwt.ExternalAuthorization - srv *Server // server this account is registered with (possibly nil) - lds string // loop detection subject for leaf nodes - siReply []byte // service reply prefix, will form wildcard subscription. - eventIds *nuid.NUID - eventIdsMu sync.Mutex - defaultPerms *Permissions - tags jwt.TagList - nameTag string - lastLimErr int64 - routePoolIdx int - // If the trace destination is specified and a message with a traceParentHdr - // is received, and has the least significant bit of the last token set to 1, - // then if traceDestSampling is > 0 and < 100, a random value will be selected - // and if it falls between 0 and that value, message tracing will be triggered. - traceDest string - traceDestSampling int - // Guarantee that only one goroutine can be running either checkJetStreamMigrate - // or clearObserverState at a given time for this account to prevent interleaving. 
- jscmMu sync.Mutex + expired atomic.Bool `json:"-"` + incomplete bool `json:"incomplete"` + signingKeys map[string]jwt.Scope `json:"-"` + extAuth *jwt.ExternalAuthorization `json:"-"` + srv *Server `json:"-"` + lds string `json:"lds"` + siReply []byte `json:"-"` + eventIds *nuid.NUID `json:"-"` + eventIdsMu sync.Mutex `json:"-"` + defaultPerms *Permissions `json:"-"` + tags jwt.TagList `json:"-"` + nameTag string `json:"name_tag"` + lastLimErr int64 `json:"last_lim_err"` + routePoolIdx int `json:"route_pool_idx"` + traceDest string `json:"trace_dest"` + traceDestSampling int `json:"trace_dest_sampling"` + jscmMu sync.Mutex `json:"-"` } const ( diff --git a/server/consume_og b/server/consume_og new file mode 100644 index 00000000000..6ce4678b126 --- /dev/null +++ b/server/consume_og @@ -0,0 +1,6099 @@ +// Copyright 2019-2024 The NATS Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package server + +import ( + "bytes" + "encoding/binary" + "encoding/json" + "errors" + "fmt" + "math/rand" + "reflect" + "regexp" + "slices" + "strconv" + "strings" + "sync" + "sync/atomic" + "time" + + "github.com/nats-io/nats-server/v2/server/avl" + "github.com/nats-io/nuid" + "golang.org/x/time/rate" +) + +// Headers sent with Request Timeout +const ( + JSPullRequestPendingMsgs = "Nats-Pending-Messages" + JSPullRequestPendingBytes = "Nats-Pending-Bytes" + JSPullRequestWrongPinID = "NATS/1.0 423 Nats-Wrong-Pin-Id\r\n\r\n" + JSPullRequestNatsPinId = "Nats-Pin-Id" +) + +var ( + validGroupName = regexp.MustCompile(`^[a-zA-Z0-9/_=-]{1,16}$`) +) + +// Headers sent when batch size was completed, but there were remaining bytes. +const JsPullRequestRemainingBytesT = "NATS/1.0 409 Batch Completed\r\n%s: %d\r\n%s: %d\r\n\r\n" + +type ConsumerInfo struct { + Stream string `json:"stream_name"` + Name string `json:"name"` + Created time.Time `json:"created"` + Config *ConsumerConfig `json:"config,omitempty"` + Delivered SequenceInfo `json:"delivered"` + AckFloor SequenceInfo `json:"ack_floor"` + NumAckPending int `json:"num_ack_pending"` + NumRedelivered int `json:"num_redelivered"` + NumWaiting int `json:"num_waiting"` + NumPending uint64 `json:"num_pending"` + Cluster *ClusterInfo `json:"cluster,omitempty"` + PushBound bool `json:"push_bound,omitempty"` + Paused bool `json:"paused,omitempty"` + PauseRemaining time.Duration `json:"pause_remaining,omitempty"` + // TimeStamp indicates when the info was gathered + TimeStamp time.Time `json:"ts"` + PriorityGroups []PriorityGroupState `json:"priority_groups,omitempty"` +} + +type PriorityGroupState struct { + Group string `json:"group"` + PinnedClientID string `json:"pinned_client_id,omitempty"` + PinnedTS time.Time `json:"pinned_ts,omitempty"` +} + +type ConsumerConfig struct { + // Durable is deprecated. All consumers should have names, picked by clients. 
+ Durable string `json:"durable_name,omitempty"` + Name string `json:"name,omitempty"` + Description string `json:"description,omitempty"` + DeliverPolicy DeliverPolicy `json:"deliver_policy"` + OptStartSeq uint64 `json:"opt_start_seq,omitempty"` + OptStartTime *time.Time `json:"opt_start_time,omitempty"` + AckPolicy AckPolicy `json:"ack_policy"` + AckWait time.Duration `json:"ack_wait,omitempty"` + MaxDeliver int `json:"max_deliver,omitempty"` + BackOff []time.Duration `json:"backoff,omitempty"` + FilterSubject string `json:"filter_subject,omitempty"` + FilterSubjects []string `json:"filter_subjects,omitempty"` + ReplayPolicy ReplayPolicy `json:"replay_policy"` + RateLimit uint64 `json:"rate_limit_bps,omitempty"` // Bits per sec + SampleFrequency string `json:"sample_freq,omitempty"` + MaxWaiting int `json:"max_waiting,omitempty"` + MaxAckPending int `json:"max_ack_pending,omitempty"` + Heartbeat time.Duration `json:"idle_heartbeat,omitempty"` + FlowControl bool `json:"flow_control,omitempty"` + HeadersOnly bool `json:"headers_only,omitempty"` + + // Pull based options. + MaxRequestBatch int `json:"max_batch,omitempty"` + MaxRequestExpires time.Duration `json:"max_expires,omitempty"` + MaxRequestMaxBytes int `json:"max_bytes,omitempty"` + + // Push based consumers. + DeliverSubject string `json:"deliver_subject,omitempty"` + DeliverGroup string `json:"deliver_group,omitempty"` + + // Ephemeral inactivity threshold. + InactiveThreshold time.Duration `json:"inactive_threshold,omitempty"` + + // Generally inherited by parent stream and other markers, now can be configured directly. + Replicas int `json:"num_replicas"` + // Force memory storage. + MemoryStorage bool `json:"mem_storage,omitempty"` + + // Don't add to general clients. + Direct bool `json:"direct,omitempty"` + + // Metadata is additional metadata for the Consumer. + Metadata map[string]string `json:"metadata,omitempty"` + + // PauseUntil is for suspending the consumer until the deadline. + PauseUntil *time.Time `json:"pause_until,omitempty"` + + // Priority groups + PriorityGroups []string `json:"priority_groups,omitempty"` + PriorityPolicy PriorityPolicy `json:"priority_policy,omitempty"` + PinnedTTL time.Duration `json:"priority_timeout,omitempty"` +} + +// SequenceInfo has both the consumer and the stream sequence and last activity. 
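// For illustration (values hypothetical), a SequenceInfo marshals on the wire as
//
//	{"consumer_seq": 22, "stream_seq": 156, "last_active": "2024-01-02T15:04:05Z"}
//
// meaning the consumer has delivered 22 messages, the last of which was stream
// sequence 156.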
+type SequenceInfo struct { + Consumer uint64 `json:"consumer_seq"` + Stream uint64 `json:"stream_seq"` + Last *time.Time `json:"last_active,omitempty"` +} + +type CreateConsumerRequest struct { + Stream string `json:"stream_name"` + Config ConsumerConfig `json:"config"` + Action ConsumerAction `json:"action"` + Pedantic bool `json:"pedantic,omitempty"` +} + +type ConsumerAction int + +const ( + ActionCreateOrUpdate ConsumerAction = iota + ActionUpdate + ActionCreate +) + +const ( + actionUpdateJSONString = `"update"` + actionCreateJSONString = `"create"` + actionCreateOrUpdateJSONString = `""` +) + +var ( + actionUpdateJSONBytes = []byte(actionUpdateJSONString) + actionCreateJSONBytes = []byte(actionCreateJSONString) + actionCreateOrUpdateJSONBytes = []byte(actionCreateOrUpdateJSONString) +) + +func (a ConsumerAction) String() string { + switch a { + case ActionCreateOrUpdate: + return actionCreateOrUpdateJSONString + case ActionCreate: + return actionCreateJSONString + case ActionUpdate: + return actionUpdateJSONString + } + return actionCreateOrUpdateJSONString +} + +func (a ConsumerAction) MarshalJSON() ([]byte, error) { + switch a { + case ActionCreate: + return actionCreateJSONBytes, nil + case ActionUpdate: + return actionUpdateJSONBytes, nil + case ActionCreateOrUpdate: + return actionCreateOrUpdateJSONBytes, nil + default: + return nil, fmt.Errorf("can not marshal %v", a) + } +} + +func (a *ConsumerAction) UnmarshalJSON(data []byte) error { + switch string(data) { + case actionCreateJSONString: + *a = ActionCreate + case actionUpdateJSONString: + *a = ActionUpdate + case actionCreateOrUpdateJSONString: + *a = ActionCreateOrUpdate + default: + return fmt.Errorf("unknown consumer action: %v", string(data)) + } + return nil +} + +// ConsumerNakOptions is for optional NAK values, e.g. delay. +type ConsumerNakOptions struct { + Delay time.Duration `json:"delay"` +} + +// PriorityPolicy determines policy for selecting messages based on priority. +type PriorityPolicy int + +const ( + // No priority policy. + PriorityNone PriorityPolicy = iota + // Clients will get the messages only if certain criteria are specified. + PriorityOverflow + // Single client takes over handling of the messages, while others are on standby. 
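	// Illustrative wire form, assuming the JSON helpers defined below: a config
	// fragment such as
	//
	//	{"priority_policy": "pinned_client", "priority_groups": ["jobs"]}
	//
	// selects this mode for the "jobs" group.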
+ PriorityPinnedClient +) + +const ( + PriorityNoneJSONString = `"none"` + PriorityOverflowJSONString = `"overflow"` + PriorityPinnedClientJSONString = `"pinned_client"` +) + +var ( + PriorityNoneJSONBytes = []byte(PriorityNoneJSONString) + PriorityOverflowJSONBytes = []byte(PriorityOverflowJSONString) + PriorityPinnedClientJSONBytes = []byte(PriorityPinnedClientJSONString) +) + +func (pp PriorityPolicy) String() string { + switch pp { + case PriorityOverflow: + return PriorityOverflowJSONString + case PriorityPinnedClient: + return PriorityPinnedClientJSONString + default: + return PriorityNoneJSONString + } +} + +func (pp PriorityPolicy) MarshalJSON() ([]byte, error) { + switch pp { + case PriorityOverflow: + return PriorityOverflowJSONBytes, nil + case PriorityPinnedClient: + return PriorityPinnedClientJSONBytes, nil + default: + return nil, fmt.Errorf("unknown priority policy: %v", pp) + } +} + +func (pp *PriorityPolicy) UnmarshalJSON(data []byte) error { + switch string(data) { + case PriorityOverflowJSONString: + *pp = PriorityOverflow + case PriorityPinnedClientJSONString: + *pp = PriorityPinnedClient + case PriorityNoneJSONString: + *pp = PriorityNone + default: + return fmt.Errorf("unknown priority policy: %v", string(data)) + } + return nil +} + +// DeliverPolicy determines how the consumer should select the first message to deliver. +type DeliverPolicy int + +const ( + // DeliverAll will be the default so can be omitted from the request. + DeliverAll DeliverPolicy = iota + // DeliverLast will start the consumer with the last sequence received. + DeliverLast + // DeliverNew will only deliver new messages that are sent after the consumer is created. + DeliverNew + // DeliverByStartSequence will look for a defined starting sequence to start. + DeliverByStartSequence + // DeliverByStartTime will select the first messsage with a timestamp >= to StartTime. + DeliverByStartTime + // DeliverLastPerSubject will start the consumer with the last message for all subjects received. + DeliverLastPerSubject +) + +func (dp DeliverPolicy) String() string { + switch dp { + case DeliverAll: + return "all" + case DeliverLast: + return "last" + case DeliverNew: + return "new" + case DeliverByStartSequence: + return "by_start_sequence" + case DeliverByStartTime: + return "by_start_time" + case DeliverLastPerSubject: + return "last_per_subject" + default: + return "undefined" + } +} + +// AckPolicy determines how the consumer should acknowledge delivered messages. +type AckPolicy int + +const ( + // AckNone requires no acks for delivered messages. + AckNone AckPolicy = iota + // AckAll when acking a sequence number, this implicitly acks all sequences below this one as well. + AckAll + // AckExplicit requires ack or nack for all messages. + AckExplicit +) + +func (a AckPolicy) String() string { + switch a { + case AckNone: + return "none" + case AckAll: + return "all" + default: + return "explicit" + } +} + +// ReplayPolicy determines how the consumer should replay messages it already has queued in the stream. +type ReplayPolicy int + +const ( + // ReplayInstant will replay messages as fast as possible. + ReplayInstant ReplayPolicy = iota + // ReplayOriginal will maintain the same timing as the messages were received. + ReplayOriginal +) + +func (r ReplayPolicy) String() string { + switch r { + case ReplayInstant: + return replayInstantPolicyJSONString + default: + return replayOriginalPolicyJSONString + } +} + +// OK +const OK = "+OK" + +// Ack responses. 
Note that a nil or no payload is same as AckAck +var ( + // Ack + AckAck = []byte("+ACK") // nil or no payload to ack subject also means ACK + AckOK = []byte(OK) // deprecated but +OK meant ack as well. + + // Nack + AckNak = []byte("-NAK") + // Progress indicator + AckProgress = []byte("+WPI") + // Ack + Deliver the next message(s). + AckNext = []byte("+NXT") + // Terminate delivery of the message. + AckTerm = []byte("+TERM") +) + +const ( + // reasons to supply when terminating messages using limits + ackTermLimitsReason = "Message deleted by stream limits" + ackTermUnackedLimitsReason = "Unacknowledged message was deleted" +) + +// Calculate accurate replicas for the consumer config with the parent stream config. +func (consCfg ConsumerConfig) replicas(strCfg *StreamConfig) int { + if consCfg.Replicas == 0 || consCfg.Replicas > strCfg.Replicas { + if !isDurableConsumer(&consCfg) && strCfg.Retention == LimitsPolicy && consCfg.Replicas == 0 { + // Matches old-school ephemerals only, where the replica count is 0. + return 1 + } + return strCfg.Replicas + } + return consCfg.Replicas +} + +// Consumer is a jetstream consumer. +type consumer struct { + // Atomic used to notify that we want to process an ack. + // This will be checked in checkPending to abort processing + // and let ack be processed in priority. + awl int64 + leader atomic.Bool + mu sync.RWMutex + js *jetStream + mset *stream + acc *Account + srv *Server + client *client + sysc *client + sid int + name string + stream string + sseq uint64 // next stream sequence + subjf subjectFilters // subject filters and their sequences + filters *Sublist // When we have multiple filters we will use LoadNextMsgMulti and pass this in. + dseq uint64 // delivered consumer sequence + adflr uint64 // ack delivery floor + asflr uint64 // ack store floor + chkflr uint64 // our check floor, interest streams only. + npc int64 // Num Pending Count + npf uint64 // Num Pending Floor Sequence + dsubj string + qgroup string + lss *lastSeqSkipList + rlimit *rate.Limiter + reqSub *subscription + ackSub *subscription + ackReplyT string + ackSubj string + nextMsgSubj string + nextMsgReqs *ipQueue[*nextMsgReq] + maxp int + pblimit int + maxpb int + pbytes int + fcsz int + fcid string + fcSub *subscription + outq *jsOutQ + pending map[uint64]*Pending + ptmr *time.Timer + ptmrEnd time.Time + rdq []uint64 + rdqi avl.SequenceSet + rdc map[uint64]uint64 + replies map[uint64]string + maxdc uint64 + waiting *waitQueue + cfg ConsumerConfig + ici *ConsumerInfo + store ConsumerStore + active bool + replay bool + dtmr *time.Timer + uptmr *time.Timer // Unpause timer + gwdtmr *time.Timer + dthresh time.Duration + mch chan struct{} // Message channel + qch chan struct{} // Quit channel + inch chan bool // Interest change channel + sfreq int32 + ackEventT string + nakEventT string + deliveryExcEventT string + created time.Time + ldt time.Time + lat time.Time + lwqic time.Time + closed bool + + // Clustered. + ca *consumerAssignment + node RaftNode + infoSub *subscription + lqsent time.Time + prm map[string]struct{} + prOk bool + uch chan struct{} + retention RetentionPolicy + + monitorWg sync.WaitGroup + inMonitor bool + + // R>1 proposals + pch chan struct{} + phead *proposal + ptail *proposal + + // Ack queue + ackMsgs *ipQueue[*jsAckMsg] + + // for stream signaling when multiple filters are set. + sigSubs []*subscription + + // Priority groups + // Details described in ADR-42. + + // currentPinId is the current nuid for the pinned consumer. 
+ // If the Consumer is running in `PriorityPinnedClient` mode, server will + // pick up a new nuid and assign it to first pending pull request. + currentPinId string + /// pinnedTtl is the remaining time before the current PinId expires. + pinnedTtl *time.Timer + pinnedTS time.Time +} + +// A single subject filter. +type subjectFilter struct { + subject string + tokenizedSubject []string + hasWildcard bool +} + +type subjectFilters []*subjectFilter + +// subjects is a helper function used for updating consumers. +// It is not used and should not be used in hotpath. +func (s subjectFilters) subjects() []string { + subjects := make([]string, 0, len(s)) + for _, filter := range s { + subjects = append(subjects, filter.subject) + } + return subjects +} + +type proposal struct { + data []byte + next *proposal +} + +const ( + // JsAckWaitDefault is the default AckWait, only applicable on explicit ack policy consumers. + JsAckWaitDefault = 30 * time.Second + // JsDeleteWaitTimeDefault is the default amount of time we will wait for non-durable + // consumers to be in an inactive state before deleting them. + JsDeleteWaitTimeDefault = 5 * time.Second + // JsFlowControlMaxPending specifies default pending bytes during flow control that can be outstanding. + JsFlowControlMaxPending = 32 * 1024 * 1024 + // JsDefaultMaxAckPending is set for consumers with explicit ack that do not set the max ack pending. + JsDefaultMaxAckPending = 1000 + // JsDefaultPinnedTTL is the default grace period for the pinned consumer to send a new request before a new pin + // is picked by a server. + JsDefaultPinnedTTL = 2 * time.Minute +) + +// Helper function to set consumer config defaults from above. +func setConsumerConfigDefaults(config *ConsumerConfig, streamCfg *StreamConfig, lim *JSLimitOpts, accLim *JetStreamAccountLimits, pedantic bool) *ApiError { + // Set to default if not specified. + if config.DeliverSubject == _EMPTY_ && config.MaxWaiting == 0 { + config.MaxWaiting = JSWaitQueueDefaultMax + } + // Setup proper default for ack wait if we are in explicit ack mode. + if config.AckWait == 0 && (config.AckPolicy == AckExplicit || config.AckPolicy == AckAll) { + config.AckWait = JsAckWaitDefault + } + // Setup default of -1, meaning no limit for MaxDeliver. + if config.MaxDeliver == 0 { + config.MaxDeliver = -1 + } + // If BackOff was specified that will override the AckWait and the MaxDeliver. + if len(config.BackOff) > 0 { + if pedantic && config.AckWait != config.BackOff[0] { + return NewJSPedanticError(errors.New("first backoff value has to equal batch AckWait")) + } + config.AckWait = config.BackOff[0] + } + if config.MaxAckPending == 0 { + if pedantic && streamCfg.ConsumerLimits.MaxAckPending > 0 { + return NewJSPedanticError(errors.New("max_ack_pending must be set if it's configured in stream limits")) + } + config.MaxAckPending = streamCfg.ConsumerLimits.MaxAckPending + } + if config.InactiveThreshold == 0 { + if pedantic && streamCfg.ConsumerLimits.InactiveThreshold > 0 { + return NewJSPedanticError(errors.New("inactive_threshold must be set if it's configured in stream limits")) + } + config.InactiveThreshold = streamCfg.ConsumerLimits.InactiveThreshold + } + // Set proper default for max ack pending if we are ack explicit and none has been set. 
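	// Worked example with assumed limits: if the server allows MaxAckPending
	// of 20000 and the account caps it at 500, an explicit-ack consumer that
	// leaves max_ack_pending unset ends up with
	// min(JsDefaultMaxAckPending=1000, 20000, 500) = 500.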
+ if (config.AckPolicy == AckExplicit || config.AckPolicy == AckAll) && config.MaxAckPending == 0 { + accPending := JsDefaultMaxAckPending + if lim.MaxAckPending > 0 && lim.MaxAckPending < accPending { + accPending = lim.MaxAckPending + } + if accLim.MaxAckPending > 0 && accLim.MaxAckPending < accPending { + accPending = accLim.MaxAckPending + } + config.MaxAckPending = accPending + } + // if applicable set max request batch size + if config.DeliverSubject == _EMPTY_ && config.MaxRequestBatch == 0 && lim.MaxRequestBatch > 0 { + if pedantic { + return NewJSPedanticError(errors.New("max_request_batch must be set if it's JetStream limits are set")) + } + config.MaxRequestBatch = lim.MaxRequestBatch + } + + // set the default value only if pinned policy is used. + if config.PriorityPolicy == PriorityPinnedClient && config.PinnedTTL == 0 { + config.PinnedTTL = JsDefaultPinnedTTL + } + return nil +} + +// Check the consumer config. If we are recovering don't check filter subjects. +func checkConsumerCfg( + config *ConsumerConfig, + srvLim *JSLimitOpts, + cfg *StreamConfig, + _ *Account, + accLim *JetStreamAccountLimits, + isRecovering bool, +) *ApiError { + + // Check if replicas is defined but exceeds parent stream. + if config.Replicas > 0 && config.Replicas > cfg.Replicas { + return NewJSConsumerReplicasExceedsStreamError() + } + // Check that it is not negative + if config.Replicas < 0 { + return NewJSReplicasCountCannotBeNegativeError() + } + // If the stream is interest or workqueue retention make sure the replicas + // match that of the stream. This is REQUIRED for now. + if cfg.Retention == InterestPolicy || cfg.Retention == WorkQueuePolicy { + // Only error here if not recovering. + // We handle recovering in a different spot to allow consumer to come up + // if previous version allowed it to be created. We do not want it to not come up. + if !isRecovering && config.Replicas != 0 && config.Replicas != cfg.Replicas { + return NewJSConsumerReplicasShouldMatchStreamError() + } + } + + // Check if we have a BackOff defined that MaxDeliver is within range etc. + if lbo := len(config.BackOff); lbo > 0 && config.MaxDeliver != -1 && lbo > config.MaxDeliver { + return NewJSConsumerMaxDeliverBackoffError() + } + + if len(config.Description) > JSMaxDescriptionLen { + return NewJSConsumerDescriptionTooLongError(JSMaxDescriptionLen) + } + + // For now expect a literal subject if its not empty. Empty means work queue mode (pull mode). + if config.DeliverSubject != _EMPTY_ { + if !subjectIsLiteral(config.DeliverSubject) { + return NewJSConsumerDeliverToWildcardsError() + } + if !IsValidSubject(config.DeliverSubject) { + return NewJSConsumerInvalidDeliverSubjectError() + } + if deliveryFormsCycle(cfg, config.DeliverSubject) { + return NewJSConsumerDeliverCycleError() + } + if config.MaxWaiting != 0 { + return NewJSConsumerPushMaxWaitingError() + } + if config.MaxAckPending > 0 && config.AckPolicy == AckNone { + return NewJSConsumerMaxPendingAckPolicyRequiredError() + } + if config.Heartbeat > 0 && config.Heartbeat < 100*time.Millisecond { + return NewJSConsumerSmallHeartbeatError() + } + } else { + // Pull mode with work queue retention from the stream requires an explicit ack. 
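	// For illustration, a minimal pull consumer that passes the checks below
	// on a work-queue stream (field names per the ConsumerConfig JSON tags,
	// values hypothetical):
	//
	//	{"durable_name": "WORKERS", "ack_policy": "explicit", "max_batch": 100}
	//
	// Push-only knobs such as idle_heartbeat, flow_control or rate_limit_bps
	// are rejected for pull consumers in this branch.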
+ if config.AckPolicy == AckNone && cfg.Retention == WorkQueuePolicy { + return NewJSConsumerPullRequiresAckError() + } + if config.RateLimit > 0 { + return NewJSConsumerPullWithRateLimitError() + } + if config.MaxWaiting < 0 { + return NewJSConsumerMaxWaitingNegativeError() + } + if config.Heartbeat > 0 { + return NewJSConsumerHBRequiresPushError() + } + if config.FlowControl { + return NewJSConsumerFCRequiresPushError() + } + if config.MaxRequestBatch < 0 { + return NewJSConsumerMaxRequestBatchNegativeError() + } + if config.MaxRequestExpires != 0 && config.MaxRequestExpires < time.Millisecond { + return NewJSConsumerMaxRequestExpiresToSmallError() + } + if srvLim.MaxRequestBatch > 0 && config.MaxRequestBatch > srvLim.MaxRequestBatch { + return NewJSConsumerMaxRequestBatchExceededError(srvLim.MaxRequestBatch) + } + } + if srvLim.MaxAckPending > 0 && config.MaxAckPending > srvLim.MaxAckPending { + return NewJSConsumerMaxPendingAckExcessError(srvLim.MaxAckPending) + } + if accLim.MaxAckPending > 0 && config.MaxAckPending > accLim.MaxAckPending { + return NewJSConsumerMaxPendingAckExcessError(accLim.MaxAckPending) + } + if cfg.ConsumerLimits.MaxAckPending > 0 && config.MaxAckPending > cfg.ConsumerLimits.MaxAckPending { + return NewJSConsumerMaxPendingAckExcessError(cfg.ConsumerLimits.MaxAckPending) + } + if cfg.ConsumerLimits.InactiveThreshold > 0 && config.InactiveThreshold > cfg.ConsumerLimits.InactiveThreshold { + return NewJSConsumerInactiveThresholdExcessError(cfg.ConsumerLimits.InactiveThreshold) + } + + // Direct need to be non-mapped ephemerals. + if config.Direct { + if config.DeliverSubject == _EMPTY_ { + return NewJSConsumerDirectRequiresPushError() + } + if isDurableConsumer(config) { + return NewJSConsumerDirectRequiresEphemeralError() + } + } + + // Do not allow specifying both FilterSubject and FilterSubjects, + // as that's probably unintentional without any difference from passing + // all filters in FilterSubjects. + if config.FilterSubject != _EMPTY_ && len(config.FilterSubjects) > 0 { + return NewJSConsumerDuplicateFilterSubjectsError() + } + + if config.FilterSubject != _EMPTY_ && !IsValidSubject(config.FilterSubject) { + return NewJSStreamInvalidConfigError(ErrBadSubject) + } + + // We treat FilterSubjects: []string{""} as a misconfig, so we validate against it. + for _, filter := range config.FilterSubjects { + if filter == _EMPTY_ { + return NewJSConsumerEmptyFilterError() + } + } + subjectFilters := gatherSubjectFilters(config.FilterSubject, config.FilterSubjects) + + // Check subject filters do not overlap. + for outer, subject := range subjectFilters { + if !IsValidSubject(subject) { + return NewJSStreamInvalidConfigError(ErrBadSubject) + } + for inner, ssubject := range subjectFilters { + if inner != outer && SubjectsCollide(subject, ssubject) { + return NewJSConsumerOverlappingSubjectFiltersError() + } + } + } + + // Helper function to formulate similar errors. + badStart := func(dp, start string) error { + return fmt.Errorf("consumer delivery policy is deliver %s, but optional start %s is also set", dp, start) + } + notSet := func(dp, notSet string) error { + return fmt.Errorf("consumer delivery policy is deliver %s, but optional %s is not set", dp, notSet) + } + + // Check on start position conflicts. 
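	// For example, the switch below rejects {"deliver_policy": "all",
	// "opt_start_seq": 42}, since an explicit start sequence only makes sense
	// with "by_start_sequence", and likewise rejects "by_start_sequence" when
	// opt_start_seq is left at zero.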
+ switch config.DeliverPolicy { + case DeliverAll: + if config.OptStartSeq > 0 { + return NewJSConsumerInvalidPolicyError(badStart("all", "sequence")) + } + if config.OptStartTime != nil { + return NewJSConsumerInvalidPolicyError(badStart("all", "time")) + } + case DeliverLast: + if config.OptStartSeq > 0 { + return NewJSConsumerInvalidPolicyError(badStart("last", "sequence")) + } + if config.OptStartTime != nil { + return NewJSConsumerInvalidPolicyError(badStart("last", "time")) + } + case DeliverLastPerSubject: + if config.OptStartSeq > 0 { + return NewJSConsumerInvalidPolicyError(badStart("last per subject", "sequence")) + } + if config.OptStartTime != nil { + return NewJSConsumerInvalidPolicyError(badStart("last per subject", "time")) + } + if config.FilterSubject == _EMPTY_ && len(config.FilterSubjects) == 0 { + return NewJSConsumerInvalidPolicyError(notSet("last per subject", "filter subject")) + } + case DeliverNew: + if config.OptStartSeq > 0 { + return NewJSConsumerInvalidPolicyError(badStart("new", "sequence")) + } + if config.OptStartTime != nil { + return NewJSConsumerInvalidPolicyError(badStart("new", "time")) + } + case DeliverByStartSequence: + if config.OptStartSeq == 0 { + return NewJSConsumerInvalidPolicyError(notSet("by start sequence", "start sequence")) + } + if config.OptStartTime != nil { + return NewJSConsumerInvalidPolicyError(badStart("by start sequence", "time")) + } + case DeliverByStartTime: + if config.OptStartTime == nil { + return NewJSConsumerInvalidPolicyError(notSet("by start time", "start time")) + } + if config.OptStartSeq != 0 { + return NewJSConsumerInvalidPolicyError(badStart("by start time", "start sequence")) + } + } + + if config.SampleFrequency != _EMPTY_ { + s := strings.TrimSuffix(config.SampleFrequency, "%") + if sampleFreq, err := strconv.Atoi(s); err != nil || sampleFreq < 0 { + return NewJSConsumerInvalidSamplingError(err) + } + } + + // We reject if flow control is set without heartbeats. + if config.FlowControl && config.Heartbeat == 0 { + return NewJSConsumerWithFlowControlNeedsHeartbeatsError() + } + + if config.Durable != _EMPTY_ && config.Name != _EMPTY_ { + if config.Name != config.Durable { + return NewJSConsumerCreateDurableAndNameMismatchError() + } + } + + var metadataLen int + for k, v := range config.Metadata { + metadataLen += len(k) + len(v) + } + if metadataLen > JSMaxMetadataLen { + return NewJSConsumerMetadataLengthError(fmt.Sprintf("%dKB", JSMaxMetadataLen/1024)) + } + + if config.PriorityPolicy != PriorityNone { + if len(config.PriorityGroups) == 0 { + return NewJSConsumerPriorityPolicyWithoutGroupError() + } + + for _, group := range config.PriorityGroups { + if group == _EMPTY_ { + return NewJSConsumerEmptyGroupNameError() + } + if !validGroupName.MatchString(group) { + return NewJSConsumerInvalidGroupNameError() + } + } + } + + // For now don't allow preferred server in placement. 
+ if cfg.Placement != nil && cfg.Placement.Preferred != _EMPTY_ { + return NewJSStreamInvalidConfigError(fmt.Errorf("preferred server not permitted in placement")) + } + + return nil +} + +func (mset *stream) addConsumerWithAction(config *ConsumerConfig, action ConsumerAction, pedantic bool) (*consumer, error) { + return mset.addConsumerWithAssignment(config, _EMPTY_, nil, false, action, pedantic) +} + +func (mset *stream) addConsumer(config *ConsumerConfig) (*consumer, error) { + return mset.addConsumerWithAction(config, ActionCreateOrUpdate, false) +} + +func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname string, ca *consumerAssignment, isRecovering bool, action ConsumerAction, pedantic bool) (*consumer, error) { + // Check if this stream has closed. + if mset.closed.Load() { + return nil, NewJSStreamInvalidError() + } + + mset.mu.RLock() + s, jsa, cfg, acc := mset.srv, mset.jsa, mset.cfg, mset.acc + mset.mu.RUnlock() + + // If we do not have the consumer currently assigned to us in cluster mode we will proceed but warn. + // This can happen on startup with restored state where on meta replay we still do not have + // the assignment. Running in single server mode this always returns true. + if oname != _EMPTY_ && !jsa.consumerAssigned(mset.name(), oname) { + s.Debugf("Consumer %q > %q does not seem to be assigned to this server", mset.name(), oname) + } + + if config == nil { + return nil, NewJSConsumerConfigRequiredError() + } + + selectedLimits, _, _, _ := acc.selectLimits(config.replicas(&cfg)) + if selectedLimits == nil { + return nil, NewJSNoLimitsError() + } + + srvLim := &s.getOpts().JetStreamLimits + // Make sure we have sane defaults. Do so with the JS lock, otherwise a + // badly timed meta snapshot can result in a race condition. + mset.js.mu.Lock() + err := setConsumerConfigDefaults(config, &cfg, srvLim, selectedLimits, pedantic) + mset.js.mu.Unlock() + if err != nil { + return nil, err + } + + if err := checkConsumerCfg(config, srvLim, &cfg, acc, selectedLimits, isRecovering); err != nil { + return nil, err + } + sampleFreq := 0 + if config.SampleFrequency != _EMPTY_ { + // Can't fail as checkConsumerCfg checks correct format + sampleFreq, _ = strconv.Atoi(strings.TrimSuffix(config.SampleFrequency, "%")) + } + + // Grab the client, account and server reference. + c := mset.client + if c == nil { + return nil, NewJSStreamInvalidError() + } + var accName string + c.mu.Lock() + s, a := c.srv, c.acc + if a != nil { + accName = a.Name + } + c.mu.Unlock() + + // Hold mset lock here. + mset.mu.Lock() + if mset.client == nil || mset.store == nil || mset.consumers == nil { + mset.mu.Unlock() + return nil, NewJSStreamInvalidError() + } + + // If this one is durable and already exists, we let that be ok as long as only updating what should be allowed. 
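	// For example (names hypothetical): re-sending an identical create request
	// for durable "ORDERS_PROC" just returns the existing consumer, while an
	// ActionCreate request whose config differs (say a changed filter_subject)
	// fails with a consumer-already-exists error; permitted changes are applied
	// via updateConfig below.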
+ var cName string + if isDurableConsumer(config) { + cName = config.Durable + } else if config.Name != _EMPTY_ { + cName = config.Name + } + if cName != _EMPTY_ { + if eo, ok := mset.consumers[cName]; ok { + mset.mu.Unlock() + if action == ActionCreate && !reflect.DeepEqual(*config, eo.config()) { + return nil, NewJSConsumerAlreadyExistsError() + } + // Check for overlapping subjects if we are a workqueue + if cfg.Retention == WorkQueuePolicy { + subjects := gatherSubjectFilters(config.FilterSubject, config.FilterSubjects) + if !mset.partitionUnique(cName, subjects) { + return nil, NewJSConsumerWQConsumerNotUniqueError() + } + } + err := eo.updateConfig(config) + if err == nil { + return eo, nil + } + return nil, NewJSConsumerCreateError(err, Unless(err)) + } + } + if action == ActionUpdate { + mset.mu.Unlock() + return nil, NewJSConsumerDoesNotExistError() + } + + // Check for any limits, if the config for the consumer sets a limit we check against that + // but if not we use the value from account limits, if account limits is more restrictive + // than stream config we prefer the account limits to handle cases where account limits are + // updated during the lifecycle of the stream + maxc := cfg.MaxConsumers + if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) { + maxc = selectedLimits.MaxConsumers + } + if maxc > 0 && mset.numPublicConsumers() >= maxc { + mset.mu.Unlock() + return nil, NewJSMaximumConsumersLimitError() + } + + // Check on stream type conflicts with WorkQueues. + if cfg.Retention == WorkQueuePolicy && !config.Direct { + // Force explicit acks here. + if config.AckPolicy != AckExplicit { + mset.mu.Unlock() + return nil, NewJSConsumerWQRequiresExplicitAckError() + } + + if len(mset.consumers) > 0 { + subjects := gatherSubjectFilters(config.FilterSubject, config.FilterSubjects) + if len(subjects) == 0 { + mset.mu.Unlock() + return nil, NewJSConsumerWQMultipleUnfilteredError() + } else if !mset.partitionUnique(cName, subjects) { + // Prior to v2.9.7, on a stream with WorkQueue policy, the servers + // were not catching the error of having multiple consumers with + // overlapping filter subjects depending on the scope, for instance + // creating "foo.*.bar" and then "foo.>" was not detected, while + // "foo.>" and then "foo.*.bar" would have been. Failing here + // in recovery mode would leave the rejected consumer in a bad state, + // so we will simply warn here, asking the user to remove this + // consumer administratively. Otherwise, if this is the creation + // of a new consumer, we will return the error. + if isRecovering { + s.Warnf("Consumer %q > %q has a filter subject that overlaps "+ + "with other consumers, which is not allowed for a stream "+ + "with WorkQueue policy, it should be administratively deleted", + cfg.Name, cName) + } else { + // We have a partition but it is not unique amongst the others. + mset.mu.Unlock() + return nil, NewJSConsumerWQConsumerNotUniqueError() + } + } + } + if config.DeliverPolicy != DeliverAll { + mset.mu.Unlock() + return nil, NewJSConsumerWQConsumerNotDeliverAllError() + } + } + + // Set name, which will be durable name if set, otherwise we create one at random. 
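	// Naming precedence, as implemented a little further down: an explicit
	// Durable wins, then a clustered assignment name (oname), then config.Name,
	// and only a legacy ephemeral with none of these gets a random
	// createConsumerName() value.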
+ o := &consumer{ + mset: mset, + js: s.getJetStream(), + acc: a, + srv: s, + client: s.createInternalJetStreamClient(), + sysc: s.createInternalJetStreamClient(), + cfg: *config, + dsubj: config.DeliverSubject, + outq: mset.outq, + active: true, + qch: make(chan struct{}), + uch: make(chan struct{}, 1), + mch: make(chan struct{}, 1), + sfreq: int32(sampleFreq), + maxdc: uint64(config.MaxDeliver), + maxp: config.MaxAckPending, + retention: cfg.Retention, + created: time.Now().UTC(), + } + + // Bind internal client to the user account. + o.client.registerWithAccount(a) + // Bind to the system account. + o.sysc.registerWithAccount(s.SystemAccount()) + + if isDurableConsumer(config) { + if len(config.Durable) > JSMaxNameLen { + mset.mu.Unlock() + o.deleteWithoutAdvisory() + return nil, NewJSConsumerNameTooLongError(JSMaxNameLen) + } + o.name = config.Durable + } else if oname != _EMPTY_ { + o.name = oname + } else { + if config.Name != _EMPTY_ { + o.name = config.Name + } else { + // Legacy ephemeral auto-generated. + for { + o.name = createConsumerName() + if _, ok := mset.consumers[o.name]; !ok { + break + } + } + config.Name = o.name + } + } + // Create ackMsgs queue now that we have a consumer name + o.ackMsgs = newIPQueue[*jsAckMsg](s, fmt.Sprintf("[ACC:%s] consumer '%s' on stream '%s' ackMsgs", accName, o.name, cfg.Name)) + + // Create our request waiting queue. + if o.isPullMode() { + o.waiting = newWaitQueue(config.MaxWaiting) + // Create our internal queue for next msg requests. + o.nextMsgReqs = newIPQueue[*nextMsgReq](s, fmt.Sprintf("[ACC:%s] consumer '%s' on stream '%s' pull requests", accName, o.name, cfg.Name)) + } + + // already under lock, mset.Name() would deadlock + o.stream = cfg.Name + o.ackEventT = JSMetricConsumerAckPre + "." + o.stream + "." + o.name + o.nakEventT = JSAdvisoryConsumerMsgNakPre + "." + o.stream + "." + o.name + o.deliveryExcEventT = JSAdvisoryConsumerMaxDeliveryExceedPre + "." + o.stream + "." + o.name + + if !isValidName(o.name) { + mset.mu.Unlock() + o.deleteWithoutAdvisory() + return nil, NewJSConsumerBadDurableNameError() + } + + // Setup our storage if not a direct consumer. + if !config.Direct { + store, err := mset.store.ConsumerStore(o.name, config) + if err != nil { + mset.mu.Unlock() + o.deleteWithoutAdvisory() + return nil, NewJSConsumerStoreFailedError(err) + } + o.store = store + } + + for _, filter := range gatherSubjectFilters(o.cfg.FilterSubject, o.cfg.FilterSubjects) { + sub := &subjectFilter{ + subject: filter, + hasWildcard: subjectHasWildcard(filter), + tokenizedSubject: tokenizeSubjectIntoSlice(nil, filter), + } + o.subjf = append(o.subjf, sub) + } + + // If we have multiple filter subjects, create a sublist which we will use + // in calling store.LoadNextMsgMulti. + if len(o.cfg.FilterSubjects) > 0 { + o.filters = NewSublistNoCache() + for _, filter := range o.cfg.FilterSubjects { + o.filters.Insert(&subscription{subject: []byte(filter)}) + } + } else { + // Make sure this is nil otherwise. + o.filters = nil + } + + if o.store != nil && o.store.HasState() { + // Restore our saved state. + o.mu.Lock() + o.readStoredState(0) + o.mu.Unlock() + } else { + // Select starting sequence number + o.selectStartingSeqNo() + } + + // Now register with mset and create the ack subscription. + // Check if we already have this one registered. + if eo, ok := mset.consumers[o.name]; ok { + mset.mu.Unlock() + if !o.isDurable() || !o.isPushMode() { + o.name = _EMPTY_ // Prevent removal since same name. 
+ o.deleteWithoutAdvisory() + return nil, NewJSConsumerNameExistError() + } + // If we are here we have already registered this durable. If it is still active that is an error. + if eo.isActive() { + o.name = _EMPTY_ // Prevent removal since same name. + o.deleteWithoutAdvisory() + return nil, NewJSConsumerExistingActiveError() + } + // Since we are here this means we have a potentially new durable so we should update here. + // Check that configs are the same. + if !configsEqualSansDelivery(o.cfg, eo.cfg) { + o.name = _EMPTY_ // Prevent removal since same name. + o.deleteWithoutAdvisory() + return nil, NewJSConsumerReplacementWithDifferentNameError() + } + // Once we are here we have a replacement push-based durable. + eo.updateDeliverSubject(o.cfg.DeliverSubject) + return eo, nil + } + + // Set up the ack subscription for this consumer. Will use wildcard for all acks. + // We will remember the template to generate replies with sequence numbers and use + // that to scanf them back in. + // Escape '%' in consumer and stream names, as `pre` is used as a template later + // in consumer.ackReply(), resulting in erroneous formatting of the ack subject. + mn := strings.ReplaceAll(cfg.Name, "%", "%%") + pre := fmt.Sprintf(jsAckT, mn, strings.ReplaceAll(o.name, "%", "%%")) + o.ackReplyT = fmt.Sprintf("%s.%%d.%%d.%%d.%%d.%%d", pre) + o.ackSubj = fmt.Sprintf("%s.*.*.*.*.*", pre) + o.nextMsgSubj = fmt.Sprintf(JSApiRequestNextT, mn, o.name) + + // Check/update the inactive threshold + o.updateInactiveThreshold(&o.cfg) + + if o.isPushMode() { + // Check if we are running only 1 replica and that the delivery subject has interest. + // Check in place here for interest. Will setup properly in setLeader. + if config.replicas(&cfg) == 1 { + interest := o.acc.sl.HasInterest(o.cfg.DeliverSubject) + if !o.hasDeliveryInterest(interest) { + // Let the interest come to us eventually, but setup delete timer. + o.updateDeliveryInterest(false) + } + } + } + + // Set our ca. + if ca != nil { + o.setConsumerAssignment(ca) + } + + // Check if we have a rate limit set. + if config.RateLimit != 0 { + o.setRateLimit(config.RateLimit) + } + + mset.setConsumer(o) + mset.mu.Unlock() + + if config.Direct || (!s.JetStreamIsClustered() && s.standAloneMode()) { + o.setLeader(true) + } + + // This is always true in single server mode. + if o.IsLeader() { + // Send advisory. + var suppress bool + if !s.standAloneMode() && ca == nil { + suppress = true + } else if ca != nil { + suppress = ca.responded + } + if !suppress { + o.sendCreateAdvisory() + } + } + + return o, nil +} + +// Updates the consumer `dthresh` delete timer duration and set +// cfg.InactiveThreshold to JsDeleteWaitTimeDefault for ephemerals +// if not explicitly already specified by the user. +// Lock should be held. +func (o *consumer) updateInactiveThreshold(cfg *ConsumerConfig) { + // Ephemerals will always have inactive thresholds. + if !o.isDurable() && cfg.InactiveThreshold <= 0 { + // Add in 1 sec of jitter above and beyond the default of 5s. + o.dthresh = JsDeleteWaitTimeDefault + 100*time.Millisecond + time.Duration(rand.Int63n(900))*time.Millisecond + // Only stamp config with default sans jitter. + cfg.InactiveThreshold = JsDeleteWaitTimeDefault + } else if cfg.InactiveThreshold > 0 { + // Add in up to 1 sec of jitter if pull mode. 
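	// Worked example: a pull consumer with InactiveThreshold=30s gets a dthresh
	// of 30s + 100ms + [0,900)ms, i.e. between 30.1s and just under 31s, so a
	// fleet of consumers does not expire in lockstep.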
+ if o.isPullMode() { + o.dthresh = cfg.InactiveThreshold + 100*time.Millisecond + time.Duration(rand.Int63n(900))*time.Millisecond + } else { + o.dthresh = cfg.InactiveThreshold + } + } else if cfg.InactiveThreshold <= 0 { + // We accept InactiveThreshold be set to 0 (for durables) + o.dthresh = 0 + } +} + +// Updates the paused state. If we are the leader and the pause deadline +// hasn't passed yet then we will start a timer to kick the consumer once +// that deadline is reached. Lock should be held. +func (o *consumer) updatePauseState(cfg *ConsumerConfig) { + if o.uptmr != nil { + stopAndClearTimer(&o.uptmr) + } + if !o.isLeader() { + // Only the leader will run the timer as only the leader will run + // loopAndGatherMsgs. + return + } + if cfg.PauseUntil == nil || cfg.PauseUntil.IsZero() || cfg.PauseUntil.Before(time.Now()) { + // Either the PauseUntil is unset (is effectively zero) or the + // deadline has already passed, in which case there is nothing + // to do. + return + } + o.uptmr = time.AfterFunc(time.Until(*cfg.PauseUntil), func() { + o.mu.Lock() + defer o.mu.Unlock() + + stopAndClearTimer(&o.uptmr) + o.sendPauseAdvisoryLocked(&o.cfg) + o.signalNewMessages() + }) +} + +func (o *consumer) consumerAssignment() *consumerAssignment { + o.mu.RLock() + defer o.mu.RUnlock() + return o.ca +} + +func (o *consumer) setConsumerAssignment(ca *consumerAssignment) { + o.mu.Lock() + defer o.mu.Unlock() + + o.ca = ca + if ca == nil { + return + } + // Set our node. + o.node = ca.Group.node + + // Trigger update chan. + select { + case o.uch <- struct{}{}: + default: + } +} + +func (o *consumer) updateC() <-chan struct{} { + o.mu.RLock() + defer o.mu.RUnlock() + return o.uch +} + +// checkQueueInterest will check on our interest's queue group status. +// Lock should be held. +func (o *consumer) checkQueueInterest() { + if !o.active || o.cfg.DeliverSubject == _EMPTY_ { + return + } + subj := o.dsubj + if subj == _EMPTY_ { + subj = o.cfg.DeliverSubject + } + + if rr := o.acc.sl.Match(subj); len(rr.qsubs) > 0 { + // Just grab first + if qsubs := rr.qsubs[0]; len(qsubs) > 0 { + if sub := rr.qsubs[0][0]; len(sub.queue) > 0 { + o.qgroup = string(sub.queue) + } + } + } +} + +// clears our node if we have one. When we scale down to 1. +func (o *consumer) clearNode() { + o.mu.Lock() + defer o.mu.Unlock() + if o.node != nil { + o.node.Delete() + o.node = nil + } +} + +// IsLeader will return if we are the current leader. +func (o *consumer) IsLeader() bool { + return o.isLeader() +} + +// Lock should be held. +func (o *consumer) isLeader() bool { + return o.leader.Load() +} + +func (o *consumer) setLeader(isLeader bool) { + o.mu.RLock() + mset, closed := o.mset, o.closed + movingToClustered := o.node != nil && o.pch == nil + movingToNonClustered := o.node == nil && o.pch != nil + wasLeader := o.leader.Swap(isLeader) + o.mu.RUnlock() + + // If we are here we have a change in leader status. + if isLeader { + if closed || mset == nil { + return + } + + if wasLeader { + // If we detect we are scaling up, make sure to create clustered routines and channels. + if movingToClustered { + o.mu.Lock() + // We are moving from R1 to clustered. + o.pch = make(chan struct{}, 1) + go o.loopAndForwardProposals(o.qch) + if o.phead != nil { + select { + case o.pch <- struct{}{}: + default: + } + } + o.mu.Unlock() + } else if movingToNonClustered { + // We are moving from clustered to non-clustered now. + // Set pch to nil so if we scale back up we will recreate the loopAndForward from above. 
+ o.mu.Lock() + pch := o.pch + o.pch = nil + select { + case pch <- struct{}{}: + default: + } + o.mu.Unlock() + } + return + } + + mset.mu.RLock() + s, jsa, stream, lseq := mset.srv, mset.jsa, mset.getCfgName(), mset.lseq + mset.mu.RUnlock() + + o.mu.Lock() + o.rdq = nil + o.rdqi.Empty() + + // Restore our saved state. During non-leader status we just update our underlying store. + o.readStoredState(lseq) + + // Setup initial num pending. + o.streamNumPending() + + // Cleanup lss when we take over in clustered mode. + if o.hasSkipListPending() && o.sseq >= o.lss.resume { + o.lss = nil + } + + // Update the group on the our starting sequence if we are starting but we skipped some in the stream. + if o.dseq == 1 && o.sseq > 1 { + o.updateSkipped(o.sseq) + } + + // Do info sub. + if o.infoSub == nil && jsa != nil { + isubj := fmt.Sprintf(clusterConsumerInfoT, jsa.acc(), stream, o.name) + // Note below the way we subscribe here is so that we can send requests to ourselves. + o.infoSub, _ = s.systemSubscribe(isubj, _EMPTY_, false, o.sysc, o.handleClusterConsumerInfoRequest) + } + + var err error + if o.cfg.AckPolicy != AckNone { + if o.ackSub, err = o.subscribeInternal(o.ackSubj, o.pushAck); err != nil { + o.mu.Unlock() + o.deleteWithoutAdvisory() + return + } + } + + // Setup the internal sub for next message requests regardless. + // Will error if wrong mode to provide feedback to users. + if o.reqSub, err = o.subscribeInternal(o.nextMsgSubj, o.processNextMsgReq); err != nil { + o.mu.Unlock() + o.deleteWithoutAdvisory() + return + } + + // Check on flow control settings. + if o.cfg.FlowControl { + o.setMaxPendingBytes(JsFlowControlMaxPending) + fcsubj := fmt.Sprintf(jsFlowControl, stream, o.name) + if o.fcSub, err = o.subscribeInternal(fcsubj, o.processFlowControl); err != nil { + o.mu.Unlock() + o.deleteWithoutAdvisory() + return + } + } + + // If push mode, register for notifications on interest. + if o.isPushMode() { + o.inch = make(chan bool, 8) + o.acc.sl.registerNotification(o.cfg.DeliverSubject, o.cfg.DeliverGroup, o.inch) + if o.active = <-o.inch; o.active { + o.checkQueueInterest() + } + + // Check gateways in case they are enabled. + if s.gateway.enabled { + if !o.active { + o.active = s.hasGatewayInterest(o.acc.Name, o.cfg.DeliverSubject) + } + stopAndClearTimer(&o.gwdtmr) + o.gwdtmr = time.AfterFunc(time.Second, func() { o.watchGWinterest() }) + } + } + + if o.dthresh > 0 && (o.isPullMode() || !o.active) { + // Pull consumer. We run the dtmr all the time for this one. + stopAndClearTimer(&o.dtmr) + o.dtmr = time.AfterFunc(o.dthresh, o.deleteNotActive) + } + + // Update the consumer pause tracking. + o.updatePauseState(&o.cfg) + + // If we are not in ReplayInstant mode mark us as in replay state until resolved. + if o.cfg.ReplayPolicy != ReplayInstant { + o.replay = true + } + + // Recreate quit channel. + o.qch = make(chan struct{}) + qch := o.qch + node := o.node + if node != nil && o.pch == nil { + o.pch = make(chan struct{}, 1) + } + pullMode := o.isPullMode() + o.mu.Unlock() + + // Check if there are any pending we might need to clean up etc. + o.checkPending() + + // Snapshot initial info. + o.infoWithSnap(true) + + // These are the labels we will use to annotate our goroutines. + labels := pprofLabels{ + "type": "consumer", + "account": mset.accName(), + "stream": mset.name(), + "consumer": o.name, + } + + // Now start up Go routine to deliver msgs. + go func() { + setGoRoutineLabels(labels) + o.loopAndGatherMsgs(qch) + }() + + // Now start up Go routine to process acks. 
+ go func() { + setGoRoutineLabels(labels) + o.processInboundAcks(qch) + }() + + if pullMode { + // Now start up Go routine to process inbound next message requests. + go func() { + setGoRoutineLabels(labels) + o.processInboundNextMsgReqs(qch) + }() + } + + // If we are R>1 spin up our proposal loop. + if node != nil { + // Determine if we can send pending requests info to the group. + // They must be on server versions >= 2.7.1 + o.checkAndSetPendingRequestsOk() + o.checkPendingRequests() + go func() { + setGoRoutineLabels(labels) + o.loopAndForwardProposals(qch) + }() + } + + } else { + // Shutdown the go routines and the subscriptions. + o.mu.Lock() + if o.qch != nil { + close(o.qch) + o.qch = nil + } + // Stop any inactivity timers. Should only be running on leaders. + stopAndClearTimer(&o.dtmr) + // Stop any unpause timers. Should only be running on leaders. + stopAndClearTimer(&o.uptmr) + // Make sure to clear out any re-deliver queues + o.stopAndClearPtmr() + o.rdq = nil + o.rdqi.Empty() + o.pending = nil + // ok if they are nil, we protect inside unsubscribe() + o.unsubscribe(o.ackSub) + o.unsubscribe(o.reqSub) + o.unsubscribe(o.fcSub) + o.ackSub, o.reqSub, o.fcSub = nil, nil, nil + if o.infoSub != nil { + o.srv.sysUnsubscribe(o.infoSub) + o.infoSub = nil + } + // Reset waiting if we are in pull mode. + if o.isPullMode() { + o.waiting = newWaitQueue(o.cfg.MaxWaiting) + o.nextMsgReqs.drain() + } else if o.srv.gateway.enabled { + stopAndClearTimer(&o.gwdtmr) + } + // If we were the leader make sure to drain queued up acks. + if wasLeader { + o.ackMsgs.drain() + // Also remove any pending replies since we should not be the one to respond at this point. + o.replies = nil + } + o.mu.Unlock() + } +} + +// This is coming on the wire so do not block here. +func (o *consumer) handleClusterConsumerInfoRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { + go o.infoWithSnapAndReply(false, reply) +} + +// Lock should be held. +func (o *consumer) subscribeInternal(subject string, cb msgHandler) (*subscription, error) { + c := o.client + if c == nil { + return nil, fmt.Errorf("invalid consumer") + } + if !c.srv.EventsEnabled() { + return nil, ErrNoSysAccount + } + if cb == nil { + return nil, fmt.Errorf("undefined message handler") + } + + o.sid++ + + // Now create the subscription + return c.processSub([]byte(subject), nil, []byte(strconv.Itoa(o.sid)), cb, false) +} + +// Unsubscribe from our subscription. +// Lock should be held. +func (o *consumer) unsubscribe(sub *subscription) { + if sub == nil || o.client == nil { + return + } + o.client.processUnsub(sub.sid) +} + +// We need to make sure we protect access to the outq. +// Do all advisory sends here. +func (o *consumer) sendAdvisory(subj string, msg []byte) { + o.outq.sendMsg(subj, msg) +} + +func (o *consumer) sendDeleteAdvisoryLocked() { + e := JSConsumerActionAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerActionAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Stream: o.stream, + Consumer: o.name, + Action: DeleteEvent, + Domain: o.srv.getOpts().JetStreamDomain, + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + subj := JSAdvisoryConsumerDeletedPre + "." + o.stream + "." 
+ o.name + o.sendAdvisory(subj, j) +} + +func (o *consumer) sendPinnedAdvisoryLocked(group string) { + e := JSConsumerGroupPinnedAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerGroupPinnedAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Account: o.acc.Name, + Stream: o.stream, + Consumer: o.name, + Domain: o.srv.getOpts().JetStreamDomain, + PinnedClientId: o.currentPinId, + Group: group, + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + subj := JSAdvisoryConsumerPinnedPre + "." + o.stream + "." + o.name + o.sendAdvisory(subj, j) + +} +func (o *consumer) sendUnpinnedAdvisoryLocked(group string, reason string) { + e := JSConsumerGroupUnpinnedAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerGroupUnpinnedAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Account: o.acc.Name, + Stream: o.stream, + Consumer: o.name, + Domain: o.srv.getOpts().JetStreamDomain, + Group: group, + Reason: reason, + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + subj := JSAdvisoryConsumerUnpinnedPre + "." + o.stream + "." + o.name + o.sendAdvisory(subj, j) + +} + +func (o *consumer) sendCreateAdvisory() { + o.mu.Lock() + defer o.mu.Unlock() + + e := JSConsumerActionAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerActionAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Stream: o.stream, + Consumer: o.name, + Action: CreateEvent, + Domain: o.srv.getOpts().JetStreamDomain, + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + subj := JSAdvisoryConsumerCreatedPre + "." + o.stream + "." + o.name + o.sendAdvisory(subj, j) +} + +func (o *consumer) sendPauseAdvisoryLocked(cfg *ConsumerConfig) { + e := JSConsumerPauseAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerPauseAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Stream: o.stream, + Consumer: o.name, + Domain: o.srv.getOpts().JetStreamDomain, + } + + if cfg.PauseUntil != nil { + e.PauseUntil = *cfg.PauseUntil + e.Paused = time.Now().Before(e.PauseUntil) + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + subj := JSAdvisoryConsumerPausePre + "." + o.stream + "." + o.name + o.sendAdvisory(subj, j) +} + +// Created returns created time. +func (o *consumer) createdTime() time.Time { + o.mu.Lock() + created := o.created + o.mu.Unlock() + return created +} + +// Internal to allow creation time to be restored. +func (o *consumer) setCreatedTime(created time.Time) { + o.mu.Lock() + o.created = created + o.mu.Unlock() +} + +// This will check for extended interest in a subject. If we have local interest we just return +// that, but in the absence of local interest and presence of gateways or service imports we need +// to check those as well. +func (o *consumer) hasDeliveryInterest(localInterest bool) bool { + o.mu.RLock() + mset := o.mset + if mset == nil { + o.mu.RUnlock() + return false + } + acc := o.acc + deliver := o.cfg.DeliverSubject + o.mu.RUnlock() + + if localInterest { + return true + } + + // If we are here check gateways. 
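+ // With no local interest, a push consumer can still be considered active if
+ // any outbound gateway reports interest on the deliver subject.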
+ if s := acc.srv; s != nil && s.hasGatewayInterest(acc.Name, deliver) { + return true + } + return false +} + +func (s *Server) hasGatewayInterest(account, subject string) bool { + gw := s.gateway + if !gw.enabled { + return false + } + gw.RLock() + defer gw.RUnlock() + for _, gwc := range gw.outo { + psi, qr := gwc.gatewayInterest(account, stringToBytes(subject)) + if psi || qr != nil { + return true + } + } + return false +} + +// This processes an update to the local interest for a deliver subject. +func (o *consumer) updateDeliveryInterest(localInterest bool) bool { + interest := o.hasDeliveryInterest(localInterest) + + o.mu.Lock() + defer o.mu.Unlock() + + mset := o.mset + if mset == nil || o.isPullMode() { + return false + } + + if interest && !o.active { + o.signalNewMessages() + } + // Update active status, if not active clear any queue group we captured. + if o.active = interest; !o.active { + o.qgroup = _EMPTY_ + } else { + o.checkQueueInterest() + } + + // If the delete timer has already been set do not clear here and return. + // Note that durable can now have an inactive threshold, so don't check + // for durable status, instead check for dthresh > 0. + if o.dtmr != nil && o.dthresh > 0 && !interest { + return true + } + + // Stop and clear the delete timer always. + stopAndClearTimer(&o.dtmr) + + // If we do not have interest anymore and have a delete threshold set, then set + // a timer to delete us. We wait for a bit in case of server reconnect. + if !interest && o.dthresh > 0 { + o.dtmr = time.AfterFunc(o.dthresh, o.deleteNotActive) + return true + } + return false +} + +const ( + defaultConsumerNotActiveStartInterval = 30 * time.Second + defaultConsumerNotActiveMaxInterval = 5 * time.Minute +) + +var ( + consumerNotActiveStartInterval = defaultConsumerNotActiveStartInterval + consumerNotActiveMaxInterval = defaultConsumerNotActiveMaxInterval +) + +func (o *consumer) deleteNotActive() { + o.mu.Lock() + if o.mset == nil { + o.mu.Unlock() + return + } + // Push mode just look at active. + if o.isPushMode() { + // If we are active simply return. + if o.active { + o.mu.Unlock() + return + } + } else { + // Pull mode. + elapsed := time.Since(o.waiting.last) + if elapsed <= o.cfg.InactiveThreshold { + // These need to keep firing so reset but use delta. + if o.dtmr != nil { + o.dtmr.Reset(o.dthresh - elapsed) + } else { + o.dtmr = time.AfterFunc(o.dthresh-elapsed, o.deleteNotActive) + } + o.mu.Unlock() + return + } + // Check if we still have valid requests waiting. + if o.checkWaitingForInterest() { + if o.dtmr != nil { + o.dtmr.Reset(o.dthresh) + } else { + o.dtmr = time.AfterFunc(o.dthresh, o.deleteNotActive) + } + o.mu.Unlock() + return + } + } + + s, js := o.mset.srv, o.srv.js.Load() + acc, stream, name, isDirect := o.acc.Name, o.stream, o.name, o.cfg.Direct + o.mu.Unlock() + + // If we are clustered, check if we still have this consumer assigned. + // If we do forward a proposal to delete ourselves to the metacontroller leader. + if !isDirect && s.JetStreamIsClustered() { + js.mu.RLock() + var ( + cca consumerAssignment + meta RaftNode + removeEntry []byte + ) + ca, cc := js.consumerAssignment(acc, stream, name), js.cluster + if ca != nil && cc != nil { + meta = cc.meta + cca = *ca + cca.Reply = _EMPTY_ + removeEntry = encodeDeleteConsumerAssignment(&cca) + meta.ForwardProposal(removeEntry) + } + js.mu.RUnlock() + + if ca != nil && cc != nil { + // Check to make sure we went away. + // Don't think this needs to be a monitored go routine. 
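+ // The goroutine below re-forwards the delete proposal on a jittered,
+ // doubling interval (capped at consumerNotActiveMaxInterval) until the
+ // assignment is gone or JetStream is shutting down.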
+ go func() { + jitter := time.Duration(rand.Int63n(int64(consumerNotActiveStartInterval))) + interval := consumerNotActiveStartInterval + jitter + ticker := time.NewTicker(interval) + defer ticker.Stop() + for range ticker.C { + js.mu.RLock() + if js.shuttingDown { + js.mu.RUnlock() + return + } + nca := js.consumerAssignment(acc, stream, name) + js.mu.RUnlock() + // Make sure this is not a new consumer with the same name. + if nca != nil && nca == ca { + s.Warnf("Consumer assignment for '%s > %s > %s' not cleaned up, retrying", acc, stream, name) + meta.ForwardProposal(removeEntry) + if interval < consumerNotActiveMaxInterval { + interval *= 2 + ticker.Reset(interval) + } + continue + } + // We saw that consumer has been removed, all done. + return + } + }() + } + } + + // We will delete here regardless. + o.delete() +} + +func (o *consumer) watchGWinterest() { + pa := o.isActive() + // If there is no local interest... + if o.hasNoLocalInterest() { + o.updateDeliveryInterest(false) + if !pa && o.isActive() { + o.signalNewMessages() + } + } + + // We want this to always be running so we can also pick up on interest returning. + o.mu.Lock() + if o.gwdtmr != nil { + o.gwdtmr.Reset(time.Second) + } else { + stopAndClearTimer(&o.gwdtmr) + o.gwdtmr = time.AfterFunc(time.Second, func() { o.watchGWinterest() }) + } + o.mu.Unlock() +} + +// Config returns the consumer's configuration. +func (o *consumer) config() ConsumerConfig { + o.mu.Lock() + defer o.mu.Unlock() + return o.cfg +} + +// Check if we have hit max deliveries. If so do notification and cleanup. +// Return whether or not the max was hit. +// Lock should be held. +func (o *consumer) hasMaxDeliveries(seq uint64) bool { + if o.maxdc == 0 { + return false + } + if dc := o.deliveryCount(seq); dc >= o.maxdc { + // We have hit our max deliveries for this sequence. + // Only send the advisory once. + if dc == o.maxdc { + o.notifyDeliveryExceeded(seq, dc) + } + // Determine if we signal to start flow of messages again. + if o.maxp > 0 && len(o.pending) >= o.maxp { + o.signalNewMessages() + } + // Cleanup our tracking. + delete(o.pending, seq) + if o.rdc != nil { + delete(o.rdc, seq) + } + return true + } + return false +} + +// Force expiration of all pending. +// Lock should be held. +func (o *consumer) forceExpirePending() { + var expired []uint64 + for seq := range o.pending { + if !o.onRedeliverQueue(seq) && !o.hasMaxDeliveries(seq) { + expired = append(expired, seq) + } + } + if len(expired) > 0 { + slices.Sort(expired) + o.addToRedeliverQueue(expired...) + // Now we should update the timestamp here since we are redelivering. + // We will use an incrementing time to preserve order for any other redelivery. + off := time.Now().UnixNano() - o.pending[expired[0]].Timestamp + for _, seq := range expired { + if p, ok := o.pending[seq]; ok && p != nil { + p.Timestamp += off + } + } + o.resetPtmr(o.ackWait(0)) + } + o.signalNewMessages() +} + +// Acquire proper locks and update rate limit. +// Will use what is in config. +func (o *consumer) setRateLimitNeedsLocks() { + o.mu.RLock() + mset := o.mset + o.mu.RUnlock() + + if mset == nil { + return + } + + mset.mu.RLock() + o.mu.Lock() + o.setRateLimit(o.cfg.RateLimit) + o.mu.Unlock() + mset.mu.RUnlock() +} + +// Set the rate limiter +// Both mset and consumer lock should be held. +func (o *consumer) setRateLimit(bps uint64) { + if bps == 0 { + o.rlimit = nil + return + } + + // TODO(dlc) - Make sane values or error if not sane? + // We are configured in bits per sec so adjust to bytes. 
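+ // e.g. a RateLimit of 8_000_000 (8 Mbit/s) yields a limiter of 1_000_000 bytes/sec.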
+ rl := rate.Limit(bps / 8) + mset := o.mset + + // Burst should be set to maximum msg size for this account, etc. + var burst int + // We don't need to get cfgMu's rlock here since this function + // is already invoked under mset.mu.RLock(), which superseeds cfgMu. + if mset.cfg.MaxMsgSize > 0 { + burst = int(mset.cfg.MaxMsgSize) + } else if mset.jsa.account.limits.mpay > 0 { + burst = int(mset.jsa.account.limits.mpay) + } else { + s := mset.jsa.account.srv + burst = int(s.getOpts().MaxPayload) + } + + o.rlimit = rate.NewLimiter(rl, burst) +} + +// Check if new consumer config allowed vs old. +func (acc *Account) checkNewConsumerConfig(cfg, ncfg *ConsumerConfig) error { + if reflect.DeepEqual(cfg, ncfg) { + return nil + } + // Something different, so check since we only allow certain things to be updated. + if cfg.DeliverPolicy != ncfg.DeliverPolicy { + return errors.New("deliver policy can not be updated") + } + if cfg.OptStartSeq != ncfg.OptStartSeq { + return errors.New("start sequence can not be updated") + } + if cfg.OptStartTime != nil && ncfg.OptStartTime != nil { + // Both have start times set, compare them directly: + if !cfg.OptStartTime.Equal(*ncfg.OptStartTime) { + return errors.New("start time can not be updated") + } + } else if cfg.OptStartTime != nil || ncfg.OptStartTime != nil { + // At least one start time is set and the other is not + return errors.New("start time can not be updated") + } + if cfg.AckPolicy != ncfg.AckPolicy { + return errors.New("ack policy can not be updated") + } + if cfg.ReplayPolicy != ncfg.ReplayPolicy { + return errors.New("replay policy can not be updated") + } + if cfg.Heartbeat != ncfg.Heartbeat { + return errors.New("heart beats can not be updated") + } + if cfg.FlowControl != ncfg.FlowControl { + return errors.New("flow control can not be updated") + } + if cfg.MaxWaiting != ncfg.MaxWaiting { + return errors.New("max waiting can not be updated") + } + + // Deliver Subject is conditional on if its bound. + if cfg.DeliverSubject != ncfg.DeliverSubject { + if cfg.DeliverSubject == _EMPTY_ { + return errors.New("can not update pull consumer to push based") + } + if ncfg.DeliverSubject == _EMPTY_ { + return errors.New("can not update push consumer to pull based") + } + if acc.sl.HasInterest(cfg.DeliverSubject) { + return NewJSConsumerNameExistError() + } + } + + // Check if BackOff is defined, MaxDeliver is within range. + if lbo := len(ncfg.BackOff); lbo > 0 && ncfg.MaxDeliver != -1 && lbo > ncfg.MaxDeliver { + return NewJSConsumerMaxDeliverBackoffError() + } + + return nil +} + +// Update the config based on the new config, or error if update not allowed. +func (o *consumer) updateConfig(cfg *ConsumerConfig) error { + o.mu.Lock() + defer o.mu.Unlock() + + if o.closed || o.mset == nil { + return NewJSConsumerDoesNotExistError() + } + + if err := o.acc.checkNewConsumerConfig(&o.cfg, cfg); err != nil { + return err + } + + // Make sure we always store PauseUntil in UTC. + if cfg.PauseUntil != nil { + utc := (*cfg.PauseUntil).UTC() + cfg.PauseUntil = &utc + } + + if o.store != nil { + // Update local state always. 
+ if err := o.store.UpdateConfig(cfg); err != nil { + return err + } + } + + // DeliverSubject + if cfg.DeliverSubject != o.cfg.DeliverSubject { + o.updateDeliverSubjectLocked(cfg.DeliverSubject) + } + + // MaxAckPending + if cfg.MaxAckPending != o.cfg.MaxAckPending { + o.maxp = cfg.MaxAckPending + o.signalNewMessages() + } + // AckWait + if cfg.AckWait != o.cfg.AckWait { + if o.ptmr != nil { + o.resetPtmr(100 * time.Millisecond) + } + } + // Rate Limit + if cfg.RateLimit != o.cfg.RateLimit { + // We need both locks here so do in Go routine. + go o.setRateLimitNeedsLocks() + } + if cfg.SampleFrequency != o.cfg.SampleFrequency { + s := strings.TrimSuffix(cfg.SampleFrequency, "%") + // String has been already verified for validity up in the stack, so no + // need to check for error here. + sampleFreq, _ := strconv.Atoi(s) + o.sfreq = int32(sampleFreq) + } + // Set MaxDeliver if changed + if cfg.MaxDeliver != o.cfg.MaxDeliver { + o.maxdc = uint64(cfg.MaxDeliver) + } + // Set InactiveThreshold if changed. + if val := cfg.InactiveThreshold; val != o.cfg.InactiveThreshold { + o.updateInactiveThreshold(cfg) + stopAndClearTimer(&o.dtmr) + // Restart timer only if we are the leader. + if o.isLeader() && o.dthresh > 0 { + o.dtmr = time.AfterFunc(o.dthresh, o.deleteNotActive) + } + } + // Check whether the pause has changed + { + var old, new time.Time + if o.cfg.PauseUntil != nil { + old = *o.cfg.PauseUntil + } + if cfg.PauseUntil != nil { + new = *cfg.PauseUntil + } + if !old.Equal(new) { + o.updatePauseState(cfg) + if o.isLeader() { + o.sendPauseAdvisoryLocked(cfg) + } + } + } + + // Check for Subject Filters update. + newSubjects := gatherSubjectFilters(cfg.FilterSubject, cfg.FilterSubjects) + if !subjectSliceEqual(newSubjects, o.subjf.subjects()) { + newSubjf := make(subjectFilters, 0, len(newSubjects)) + for _, newFilter := range newSubjects { + fs := &subjectFilter{ + subject: newFilter, + hasWildcard: subjectHasWildcard(newFilter), + tokenizedSubject: tokenizeSubjectIntoSlice(nil, newFilter), + } + newSubjf = append(newSubjf, fs) + } + // Make sure we have correct signaling setup. + // Consumer lock can not be held. + mset := o.mset + o.mu.Unlock() + mset.swapSigSubs(o, newSubjf.subjects()) + o.mu.Lock() + + // When we're done with signaling, we can replace the subjects. + // If filters were removed, set `o.subjf` to nil. + if len(newSubjf) == 0 { + o.subjf = nil + o.filters = nil + } else { + o.subjf = newSubjf + if len(o.subjf) == 1 { + o.filters = nil + } else { + o.filters = NewSublistNoCache() + for _, filter := range o.subjf { + o.filters.Insert(&subscription{subject: []byte(filter.subject)}) + } + } + } + } + + // Record new config for others that do not need special handling. + // Allowed but considered no-op, [Description, SampleFrequency, MaxWaiting, HeadersOnly] + o.cfg = *cfg + + // Cleanup messages that lost interest. + if o.retention == InterestPolicy { + o.mu.Unlock() + o.cleanupNoInterestMessages(o.mset, false) + o.mu.Lock() + } + + // Re-calculate num pending on update. + o.streamNumPending() + + return nil +} + +// This is a config change for the delivery subject for a +// push based consumer. +func (o *consumer) updateDeliverSubject(newDeliver string) { + // Update the config and the dsubj + o.mu.Lock() + defer o.mu.Unlock() + o.updateDeliverSubjectLocked(newDeliver) +} + +// This is a config change for the delivery subject for a +// push based consumer. 
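+// Any pending messages are force-expired for redelivery and interest
+// notifications are moved from the old deliver subject to the new one.
+// Lock should be held.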
+func (o *consumer) updateDeliverSubjectLocked(newDeliver string) { + if o.closed || o.isPullMode() || o.cfg.DeliverSubject == newDeliver { + return + } + + // Force redeliver of all pending on change of delivery subject. + if len(o.pending) > 0 { + o.forceExpirePending() + } + + o.acc.sl.clearNotification(o.dsubj, o.cfg.DeliverGroup, o.inch) + o.dsubj, o.cfg.DeliverSubject = newDeliver, newDeliver + // When we register new one it will deliver to update state loop. + o.acc.sl.registerNotification(newDeliver, o.cfg.DeliverGroup, o.inch) +} + +// Check that configs are equal but allow delivery subjects to be different. +func configsEqualSansDelivery(a, b ConsumerConfig) bool { + // These were copied in so can set Delivery here. + a.DeliverSubject, b.DeliverSubject = _EMPTY_, _EMPTY_ + return reflect.DeepEqual(a, b) +} + +// Helper to send a reply to an ack. +func (o *consumer) sendAckReply(subj string) { + o.mu.RLock() + defer o.mu.RUnlock() + o.outq.sendMsg(subj, nil) +} + +type jsAckMsg struct { + subject string + reply string + hdr int + msg []byte +} + +var jsAckMsgPool sync.Pool + +func newJSAckMsg(subj, reply string, hdr int, msg []byte) *jsAckMsg { + var m *jsAckMsg + am := jsAckMsgPool.Get() + if am != nil { + m = am.(*jsAckMsg) + } else { + m = &jsAckMsg{} + } + // When getting something from a pool it is critical that all fields are + // initialized. Doing this way guarantees that if someone adds a field to + // the structure, the compiler will fail the build if this line is not updated. + (*m) = jsAckMsg{subj, reply, hdr, msg} + return m +} + +func (am *jsAckMsg) returnToPool() { + if am == nil { + return + } + am.subject, am.reply, am.hdr, am.msg = _EMPTY_, _EMPTY_, -1, nil + jsAckMsgPool.Put(am) +} + +// Push the ack message to the consumer's ackMsgs queue +func (o *consumer) pushAck(_ *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) { + atomic.AddInt64(&o.awl, 1) + o.ackMsgs.push(newJSAckMsg(subject, reply, c.pa.hdr, copyBytes(rmsg))) +} + +// Processes a message for the ack reply subject delivered with a message. +func (o *consumer) processAck(subject, reply string, hdr int, rmsg []byte) { + defer atomic.AddInt64(&o.awl, -1) + + var msg []byte + if hdr > 0 { + msg = rmsg[hdr:] + } else { + msg = rmsg + } + + sseq, dseq, dc := ackReplyInfo(subject) + + skipAckReply := sseq == 0 + + switch { + case len(msg) == 0, bytes.Equal(msg, AckAck), bytes.Equal(msg, AckOK): + if !o.processAckMsg(sseq, dseq, dc, reply, true) { + // We handle replies for acks in updateAcks + skipAckReply = true + } + case bytes.HasPrefix(msg, AckNext): + o.processAckMsg(sseq, dseq, dc, _EMPTY_, true) + o.processNextMsgRequest(reply, msg[len(AckNext):]) + skipAckReply = true + case bytes.HasPrefix(msg, AckNak): + o.processNak(sseq, dseq, dc, msg) + case bytes.Equal(msg, AckProgress): + o.progressUpdate(sseq) + case bytes.HasPrefix(msg, AckTerm): + var reason string + if buf := msg[len(AckTerm):]; len(buf) > 0 { + reason = string(bytes.TrimSpace(buf)) + } + if !o.processTerm(sseq, dseq, dc, reason, reply) { + // We handle replies for acks in updateAcks + skipAckReply = true + } + } + + // Ack the ack if requested. + if len(reply) > 0 && !skipAckReply { + o.sendAckReply(reply) + } +} + +// Used to process a working update to delay redelivery. +func (o *consumer) progressUpdate(seq uint64) { + o.mu.Lock() + defer o.mu.Unlock() + + if p, ok := o.pending[seq]; ok { + p.Timestamp = time.Now().UnixNano() + // Update store system. 
+ o.updateDelivered(p.Sequence, seq, 1, p.Timestamp) + } +} + +// Lock should be held. +func (o *consumer) updateSkipped(seq uint64) { + // Clustered mode and R>1 only. + if o.node == nil || !o.isLeader() { + return + } + var b [1 + 8]byte + b[0] = byte(updateSkipOp) + var le = binary.LittleEndian + le.PutUint64(b[1:], seq) + o.propose(b[:]) +} + +func (o *consumer) loopAndForwardProposals(qch chan struct{}) { + // On exit make sure we nil out pch. + defer func() { + o.mu.Lock() + o.pch = nil + o.mu.Unlock() + }() + + o.mu.RLock() + node, pch := o.node, o.pch + o.mu.RUnlock() + + if node == nil || pch == nil { + return + } + + forwardProposals := func() error { + o.mu.Lock() + if o.node == nil || o.node.State() != Leader { + o.mu.Unlock() + return errors.New("no longer leader") + } + proposal := o.phead + o.phead, o.ptail = nil, nil + o.mu.Unlock() + // 256k max for now per batch. + const maxBatch = 256 * 1024 + var entries []*Entry + for sz := 0; proposal != nil; proposal = proposal.next { + entries = append(entries, newEntry(EntryNormal, proposal.data)) + sz += len(proposal.data) + if sz > maxBatch { + node.ProposeMulti(entries) + // We need to re-create `entries` because there is a reference + // to it in the node's pae map. + sz, entries = 0, nil + } + } + if len(entries) > 0 { + node.ProposeMulti(entries) + } + return nil + } + + // In case we have anything pending on entry. + forwardProposals() + + for { + select { + case <-qch: + forwardProposals() + return + case <-pch: + if err := forwardProposals(); err != nil { + return + } + } + } +} + +// Lock should be held. +func (o *consumer) propose(entry []byte) { + p := &proposal{data: entry} + if o.phead == nil { + o.phead = p + } else { + o.ptail.next = p + } + o.ptail = p + + // Kick our looper routine. + select { + case o.pch <- struct{}{}: + default: + } +} + +// Lock should be held. +func (o *consumer) updateDelivered(dseq, sseq, dc uint64, ts int64) { + // Clustered mode and R>1. + if o.node != nil { + // Inline for now, use variable compression. + var b [4*binary.MaxVarintLen64 + 1]byte + b[0] = byte(updateDeliveredOp) + n := 1 + n += binary.PutUvarint(b[n:], dseq) + n += binary.PutUvarint(b[n:], sseq) + n += binary.PutUvarint(b[n:], dc) + n += binary.PutVarint(b[n:], ts) + o.propose(b[:n]) + } else if o.store != nil { + o.store.UpdateDelivered(dseq, sseq, dc, ts) + } + // Update activity. + o.ldt = time.Now() +} + +// Used to remember a pending ack reply in a replicated consumer. +// Lock should be held. +func (o *consumer) addAckReply(sseq uint64, reply string) { + if o.replies == nil { + o.replies = make(map[uint64]string) + } + o.replies[sseq] = reply +} + +// Lock should be held. +func (o *consumer) updateAcks(dseq, sseq uint64, reply string) { + if o.node != nil { + // Inline for now, use variable compression. + var b [2*binary.MaxVarintLen64 + 1]byte + b[0] = byte(updateAcksOp) + n := 1 + n += binary.PutUvarint(b[n:], dseq) + n += binary.PutUvarint(b[n:], sseq) + o.propose(b[:n]) + if reply != _EMPTY_ { + o.addAckReply(sseq, reply) + } + } else if o.store != nil { + o.store.UpdateAcks(dseq, sseq) + if reply != _EMPTY_ { + // Already locked so send direct. + o.outq.sendMsg(reply, nil) + } + } + // Update activity. + o.lat = time.Now() +} + +// Communicate to the cluster an addition of a pending request. +// Lock should be held. 
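+// The proposal is a single opcode byte followed by the raw reply subject so
+// followers can track the set of outstanding pull requests.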
+func (o *consumer) addClusterPendingRequest(reply string) { + if o.node == nil || !o.pendingRequestsOk() { + return + } + b := make([]byte, len(reply)+1) + b[0] = byte(addPendingRequest) + copy(b[1:], reply) + o.propose(b) +} + +// Communicate to the cluster a removal of a pending request. +// Lock should be held. +func (o *consumer) removeClusterPendingRequest(reply string) { + if o.node == nil || !o.pendingRequestsOk() { + return + } + b := make([]byte, len(reply)+1) + b[0] = byte(removePendingRequest) + copy(b[1:], reply) + o.propose(b) +} + +// Set whether or not we can send pending requests to followers. +func (o *consumer) setPendingRequestsOk(ok bool) { + o.mu.Lock() + o.prOk = ok + o.mu.Unlock() +} + +// Lock should be held. +func (o *consumer) pendingRequestsOk() bool { + return o.prOk +} + +// Set whether or not we can send info about pending pull requests to our group. +// Will require all peers have a minimum version. +func (o *consumer) checkAndSetPendingRequestsOk() { + o.mu.RLock() + s, isValid := o.srv, o.mset != nil + o.mu.RUnlock() + if !isValid { + return + } + + if ca := o.consumerAssignment(); ca != nil && len(ca.Group.Peers) > 1 { + for _, pn := range ca.Group.Peers { + if si, ok := s.nodeToInfo.Load(pn); ok { + if !versionAtLeast(si.(nodeInfo).version, 2, 7, 1) { + // We expect all of our peers to eventually be up to date. + // So check again in awhile. + time.AfterFunc(eventsHBInterval, func() { o.checkAndSetPendingRequestsOk() }) + o.setPendingRequestsOk(false) + return + } + } + } + } + o.setPendingRequestsOk(true) +} + +// On leadership change make sure we alert the pending requests that they are no longer valid. +func (o *consumer) checkPendingRequests() { + o.mu.Lock() + defer o.mu.Unlock() + if o.mset == nil || o.outq == nil { + return + } + hdr := []byte("NATS/1.0 409 Leadership Change\r\n\r\n") + for reply := range o.prm { + o.outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + } + o.prm = nil +} + +// This will release any pending pull requests if applicable. +// Should be called only by the leader being deleted or stopped. +// Lock should be held. +func (o *consumer) releaseAnyPendingRequests(isAssigned bool) { + if o.mset == nil || o.outq == nil || o.waiting.len() == 0 { + return + } + var hdr []byte + if !isAssigned { + hdr = []byte("NATS/1.0 409 Consumer Deleted\r\n\r\n") + } + + wq := o.waiting + for wr := wq.head; wr != nil; { + if hdr != nil { + o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + } + next := wr.next + wr.recycle() + wr = next + } + // Nil out old queue. + o.waiting = nil +} + +// Process a NAK. +func (o *consumer) processNak(sseq, dseq, dc uint64, nak []byte) { + o.mu.Lock() + defer o.mu.Unlock() + + // Check for out of range. + if dseq <= o.adflr || dseq > o.dseq { + return + } + // If we are explicit ack make sure this is still on our pending list. + if _, ok := o.pending[sseq]; !ok { + return + } + + // Deliver an advisory + e := JSConsumerDeliveryNakAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerDeliveryNakAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Stream: o.stream, + Consumer: o.name, + ConsumerSeq: dseq, + StreamSeq: sseq, + Deliveries: dc, + Domain: o.srv.getOpts().JetStreamDomain, + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + o.sendAdvisory(o.nakEventT, j) + + // Check to see if we have delays attached. 
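+ // A NAK payload may carry a delay, either as a bare duration string or as a
+ // JSON ConsumerNakOptions object. When present, we defer redelivery by
+ // adjusting the pending timestamp instead of redelivering immediately.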
+ if len(nak) > len(AckNak) { + arg := bytes.TrimSpace(nak[len(AckNak):]) + if len(arg) > 0 { + var d time.Duration + var err error + if arg[0] == '{' { + var nd ConsumerNakOptions + if err = json.Unmarshal(arg, &nd); err == nil { + d = nd.Delay + } + } else { + d, err = time.ParseDuration(string(arg)) + } + if err != nil { + // Treat this as normal NAK. + o.srv.Warnf("JetStream consumer '%s > %s > %s' bad NAK delay value: %q", o.acc.Name, o.stream, o.name, arg) + } else { + // We have a parsed duration that the user wants us to wait before retrying. + // Make sure we are not on the rdq. + o.removeFromRedeliverQueue(sseq) + if p, ok := o.pending[sseq]; ok { + // now - ackWait is expired now, so offset from there. + p.Timestamp = time.Now().Add(-o.cfg.AckWait).Add(d).UnixNano() + // Update store system which will update followers as well. + o.updateDelivered(p.Sequence, sseq, dc, p.Timestamp) + if o.ptmr != nil { + // Want checkPending to run and figure out the next timer ttl. + // TODO(dlc) - We could optimize this maybe a bit more and track when we expect the timer to fire. + o.resetPtmr(10 * time.Millisecond) + } + } + // Nothing else for use to do now so return. + return + } + } + } + + // If already queued up also ignore. + if !o.onRedeliverQueue(sseq) { + o.addToRedeliverQueue(sseq) + } + + o.signalNewMessages() +} + +// Process a TERM +// Returns `true` if the ack was processed in place and the sender can now respond +// to the client, or `false` if there was an error or the ack is replicated (in which +// case the reply will be sent later). +func (o *consumer) processTerm(sseq, dseq, dc uint64, reason, reply string) bool { + // Treat like an ack to suppress redelivery. + ackedInPlace := o.processAckMsg(sseq, dseq, dc, reply, false) + + o.mu.Lock() + defer o.mu.Unlock() + + // Deliver an advisory + e := JSConsumerDeliveryTerminatedAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerDeliveryTerminatedAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Stream: o.stream, + Consumer: o.name, + ConsumerSeq: dseq, + StreamSeq: sseq, + Deliveries: dc, + Reason: reason, + Domain: o.srv.getOpts().JetStreamDomain, + } + + j, err := json.Marshal(e) + if err != nil { + // We had an error during the marshal, so we can't send the advisory, + // but we still need to tell the caller that the ack was processed. + return ackedInPlace + } + + subj := JSAdvisoryConsumerMsgTerminatedPre + "." + o.stream + "." + o.name + o.sendAdvisory(subj, j) + return ackedInPlace +} + +// Introduce a small delay in when timer fires to check pending. +// Allows bursts to be treated in same time frame. +const ackWaitDelay = time.Millisecond + +// ackWait returns how long to wait to fire the pending timer. +func (o *consumer) ackWait(next time.Duration) time.Duration { + if next > 0 { + return next + ackWaitDelay + } + return o.cfg.AckWait + ackWaitDelay +} + +// Due to bug in calculation of sequences on restoring redelivered let's do quick sanity check. +// Lock should be held. 
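+// Entries at or below the ack floor, or past the stream's last sequence, are
+// dropped from redelivery tracking and the store state is rewritten.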
+func (o *consumer) checkRedelivered(slseq uint64) { + var lseq uint64 + if mset := o.mset; mset != nil { + lseq = slseq + } + var shouldUpdateState bool + for sseq := range o.rdc { + if sseq <= o.asflr || (lseq > 0 && sseq > lseq) { + delete(o.rdc, sseq) + o.removeFromRedeliverQueue(sseq) + shouldUpdateState = true + } + } + if shouldUpdateState { + if err := o.writeStoreStateUnlocked(); err != nil && o.srv != nil && o.mset != nil && !o.closed { + s, acc, mset, name := o.srv, o.acc, o.mset, o.name + s.Warnf("Consumer '%s > %s > %s' error on write store state from check redelivered: %v", acc, mset.getCfgName(), name, err) + } + } +} + +// This will restore the state from disk. +// Lock should be held. +func (o *consumer) readStoredState(slseq uint64) error { + if o.store == nil { + return nil + } + state, err := o.store.State() + if err == nil { + o.applyState(state) + if len(o.rdc) > 0 { + o.checkRedelivered(slseq) + } + } + return err +} + +// Apply the consumer stored state. +// Lock should be held. +func (o *consumer) applyState(state *ConsumerState) { + if state == nil { + return + } + + // If o.sseq is greater don't update. Don't go backwards on o.sseq if leader. + if !o.isLeader() || o.sseq <= state.Delivered.Stream { + o.sseq = state.Delivered.Stream + 1 + } + o.dseq = state.Delivered.Consumer + 1 + o.adflr = state.AckFloor.Consumer + o.asflr = state.AckFloor.Stream + o.pending = state.Pending + o.rdc = state.Redelivered + + // Setup tracking timer if we have restored pending. + if o.isLeader() && len(o.pending) > 0 { + // This is on startup or leader change. We want to check pending + // sooner in case there are inconsistencies etc. Pick between 500ms - 1.5s + delay := 500*time.Millisecond + time.Duration(rand.Int63n(1000))*time.Millisecond + + // If normal is lower than this just use that. + if o.cfg.AckWait < delay { + delay = o.ackWait(0) + } + o.resetPtmr(delay) + } +} + +// Sets our store state from another source. Used in clustered mode on snapshot restore. +// Lock should be held. +func (o *consumer) setStoreState(state *ConsumerState) error { + if state == nil || o.store == nil { + return nil + } + err := o.store.Update(state) + if err == nil { + o.applyState(state) + } + return err +} + +// Update our state to the store. +func (o *consumer) writeStoreState() error { + o.mu.Lock() + defer o.mu.Unlock() + return o.writeStoreStateUnlocked() +} + +// Update our state to the store. +// Lock should be held. +func (o *consumer) writeStoreStateUnlocked() error { + if o.store == nil { + return nil + } + state := ConsumerState{ + Delivered: SequencePair{ + Consumer: o.dseq - 1, + Stream: o.sseq - 1, + }, + AckFloor: SequencePair{ + Consumer: o.adflr, + Stream: o.asflr, + }, + Pending: o.pending, + Redelivered: o.rdc, + } + return o.store.Update(&state) +} + +// Returns an initial info. Only applicable for non-clustered consumers. +// We will clear after we return it, so one shot. +func (o *consumer) initialInfo() *ConsumerInfo { + o.mu.Lock() + ici := o.ici + o.ici = nil // gc friendly + o.mu.Unlock() + if ici == nil { + ici = o.info() + } + return ici +} + +// Clears our initial info. +// Used when we have a leader change in cluster mode but do not send a response. +func (o *consumer) clearInitialInfo() { + o.mu.Lock() + o.ici = nil // gc friendly + o.mu.Unlock() +} + +// Info returns our current consumer state. 
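+// It neither snapshots the info nor sends a reply.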
+func (o *consumer) info() *ConsumerInfo { + return o.infoWithSnap(false) +} + +func (o *consumer) infoWithSnap(snap bool) *ConsumerInfo { + return o.infoWithSnapAndReply(snap, _EMPTY_) +} + +func (o *consumer) infoWithSnapAndReply(snap bool, reply string) *ConsumerInfo { + o.mu.Lock() + mset := o.mset + if o.closed || mset == nil || mset.srv == nil { + o.mu.Unlock() + return nil + } + js := o.js + if js == nil { + o.mu.Unlock() + return nil + } + + // Capture raftGroup. + var rg *raftGroup + if o.ca != nil { + rg = o.ca.Group + } + + priorityGroups := []PriorityGroupState{} + // TODO(jrm): when we introduce supporting many priority groups, we need to update assigning `o.currentNuid` for each group. + if len(o.cfg.PriorityGroups) > 0 { + priorityGroups = append(priorityGroups, PriorityGroupState{ + Group: o.cfg.PriorityGroups[0], + PinnedClientID: o.currentPinId, + PinnedTS: o.pinnedTS, + }) + } + + cfg := o.cfg + info := &ConsumerInfo{ + Stream: o.stream, + Name: o.name, + Created: o.created, + Config: &cfg, + Delivered: SequenceInfo{ + Consumer: o.dseq - 1, + Stream: o.sseq - 1, + }, + AckFloor: SequenceInfo{ + Consumer: o.adflr, + Stream: o.asflr, + }, + NumAckPending: len(o.pending), + NumRedelivered: len(o.rdc), + NumPending: o.checkNumPending(), + PushBound: o.isPushMode() && o.active, + TimeStamp: time.Now().UTC(), + PriorityGroups: priorityGroups, + } + if o.cfg.PauseUntil != nil { + p := *o.cfg.PauseUntil + if info.Paused = time.Now().Before(p); info.Paused { + info.PauseRemaining = time.Until(p) + } + } + + // If we are replicated, we need to pull certain data from our store. + if rg != nil && rg.node != nil && o.store != nil { + state, err := o.store.BorrowState() + if err != nil { + o.mu.Unlock() + return nil + } + // If we are the leader we could have o.sseq that is skipped ahead. + // To maintain consistency in reporting (e.g. jsz) we always take the state for our delivered/ackfloor stream sequence. + info.Delivered.Consumer, info.Delivered.Stream = state.Delivered.Consumer, state.Delivered.Stream + info.AckFloor.Consumer, info.AckFloor.Stream = state.AckFloor.Consumer, state.AckFloor.Stream + if !o.isLeader() { + info.NumAckPending = len(state.Pending) + info.NumRedelivered = len(state.Redelivered) + } + } + + // Adjust active based on non-zero etc. Also make UTC here. + if !o.ldt.IsZero() { + ldt := o.ldt.UTC() // This copies as well. + info.Delivered.Last = &ldt + } + if !o.lat.IsZero() { + lat := o.lat.UTC() // This copies as well. + info.AckFloor.Last = &lat + } + + // If we are a pull mode consumer, report on number of waiting requests. + if o.isPullMode() { + o.processWaiting(false) + info.NumWaiting = o.waiting.len() + } + // If we were asked to snapshot do so here. + if snap { + o.ici = info + } + sysc := o.sysc + o.mu.Unlock() + + // Do cluster. + if rg != nil { + info.Cluster = js.clusterInfo(rg) + } + + // If we have a reply subject send the response here. + if reply != _EMPTY_ && sysc != nil { + sysc.sendInternalMsg(reply, _EMPTY_, nil, info) + } + + return info +} + +// Will signal us that new messages are available. Will break out of waiting. +func (o *consumer) signalNewMessages() { + // Kick our new message channel + select { + case o.mch <- struct{}{}: + default: + } +} + +// shouldSample lets us know if we are sampling metrics on acks. 
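+// The configured SampleFrequency is treated as a percentage: 0 disables
+// sampling, 100 samples every ack, anything in between samples randomly.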
+func (o *consumer) shouldSample() bool { + switch { + case o.sfreq <= 0: + return false + case o.sfreq >= 100: + return true + } + + // TODO(ripienaar) this is a tad slow so we need to rethink here, however this will only + // hit for those with sampling enabled and its not the default + return rand.Int31n(100) <= o.sfreq +} + +func (o *consumer) sampleAck(sseq, dseq, dc uint64) { + if !o.shouldSample() { + return + } + + now := time.Now().UTC() + unow := now.UnixNano() + + e := JSConsumerAckMetric{ + TypedEvent: TypedEvent{ + Type: JSConsumerAckMetricType, + ID: nuid.Next(), + Time: now, + }, + Stream: o.stream, + Consumer: o.name, + ConsumerSeq: dseq, + StreamSeq: sseq, + Delay: unow - o.pending[sseq].Timestamp, + Deliveries: dc, + Domain: o.srv.getOpts().JetStreamDomain, + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + o.sendAdvisory(o.ackEventT, j) +} + +// Process an ACK. +// Returns `true` if the ack was processed in place and the sender can now respond +// to the client, or `false` if there was an error or the ack is replicated (in which +// case the reply will be sent later). +func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample bool) bool { + o.mu.Lock() + if o.closed { + o.mu.Unlock() + return false + } + + mset := o.mset + if mset == nil || mset.closed.Load() { + o.mu.Unlock() + return false + } + + // Check if this ack is above the current pointer to our next to deliver. + // This could happen on a cooperative takeover with high speed deliveries. + if sseq >= o.sseq { + // Let's make sure this is valid. + // This is only received on the consumer leader, so should never be higher + // than the last stream sequence. + var ss StreamState + mset.store.FastState(&ss) + if sseq > ss.LastSeq { + o.srv.Warnf("JetStream consumer '%s > %s > %s' ACK sequence %d past last stream sequence of %d", + o.acc.Name, o.stream, o.name, sseq, ss.LastSeq) + // FIXME(dlc) - For 2.11 onwards should we return an error here to the caller? + o.mu.Unlock() + return false + } + o.sseq = sseq + 1 + } + + // Let the owning stream know if we are interest or workqueue retention based. + // If this consumer is clustered (o.node != nil) this will be handled by + // processReplicatedAck after the ack has propagated. + ackInPlace := o.node == nil && o.retention != LimitsPolicy + + var sgap, floor uint64 + var needSignal bool + + switch o.cfg.AckPolicy { + case AckExplicit: + if p, ok := o.pending[sseq]; ok { + if doSample { + o.sampleAck(sseq, dseq, dc) + } + if o.maxp > 0 && len(o.pending) >= o.maxp { + needSignal = true + } + delete(o.pending, sseq) + // Use the original deliver sequence from our pending record. + dseq = p.Sequence + + // Only move floors if we matched an existing pending. + if len(o.pending) == 0 { + o.adflr = o.dseq - 1 + o.asflr = o.sseq - 1 + } else if dseq == o.adflr+1 { + o.adflr, o.asflr = dseq, sseq + for ss := sseq + 1; ss < o.sseq; ss++ { + if p, ok := o.pending[ss]; ok { + if p.Sequence > 0 { + o.adflr, o.asflr = p.Sequence-1, ss-1 + } + break + } + } + } + } + delete(o.rdc, sseq) + o.removeFromRedeliverQueue(sseq) + case AckAll: + // no-op + if dseq <= o.adflr || sseq <= o.asflr { + o.mu.Unlock() + return ackInPlace + } + if o.maxp > 0 && len(o.pending) >= o.maxp { + needSignal = true + } + sgap = sseq - o.asflr + floor = sgap // start at same and set lower as we go. 
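+ // AckAll acknowledges everything up to and including this sequence, so
+ // advance both floors now and clear any matching pending entries below.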
+ o.adflr, o.asflr = dseq, sseq + + remove := func(seq uint64) { + delete(o.pending, seq) + delete(o.rdc, seq) + o.removeFromRedeliverQueue(seq) + if seq < floor { + floor = seq + } + } + // Determine if smarter to walk all of pending vs the sequence range. + if sgap > uint64(len(o.pending)) { + for seq := range o.pending { + if seq <= sseq { + remove(seq) + } + } + } else { + for seq := sseq; seq > sseq-sgap && len(o.pending) > 0; seq-- { + remove(seq) + } + } + case AckNone: + // FIXME(dlc) - This is error but do we care? + o.mu.Unlock() + return ackInPlace + } + + // No ack replication, so we set reply to "" so that updateAcks does not + // send the reply. The caller will. + if ackInPlace { + reply = _EMPTY_ + } + // Update underlying store. + o.updateAcks(dseq, sseq, reply) + o.mu.Unlock() + + if ackInPlace { + if sgap > 1 { + // FIXME(dlc) - This can very inefficient, will need to fix. + for seq := sseq; seq >= floor; seq-- { + mset.ackMsg(o, seq) + } + } else { + mset.ackMsg(o, sseq) + } + } + + // If we had max ack pending set and were at limit we need to unblock ourselves. + if needSignal { + o.signalNewMessages() + } + return ackInPlace +} + +// Determine if this is a truly filtered consumer. Modern clients will place filtered subjects +// even if the stream only has a single non-wildcard subject designation. +// Read lock should be held. +func (o *consumer) isFiltered() bool { + if o.subjf == nil { + return false + } + // If we are here we want to check if the filtered subject is + // a direct match for our only listed subject. + mset := o.mset + if mset == nil { + return true + } + + // Protect access to mset.cfg with the cfgMu mutex. + mset.cfgMu.RLock() + msetSubjects := mset.cfg.Subjects + mset.cfgMu.RUnlock() + + // `isFiltered` need to be performant, so we do + // as any checks as possible to avoid unnecessary work. + // Here we avoid iteration over slices if there is only one subject in stream + // and one filter for the consumer. + if len(msetSubjects) == 1 && len(o.subjf) == 1 { + return msetSubjects[0] != o.subjf[0].subject + } + + // if the list is not equal length, we can return early, as this is filtered. + if len(msetSubjects) != len(o.subjf) { + return true + } + + // if in rare case scenario that user passed all stream subjects as consumer filters, + // we need to do a more expensive operation. + // reflect.DeepEqual would return false if the filters are the same, but in different order + // so it can't be used here. + cfilters := make(map[string]struct{}, len(o.subjf)) + for _, val := range o.subjf { + cfilters[val.subject] = struct{}{} + } + for _, val := range msetSubjects { + if _, ok := cfilters[val]; !ok { + return true + } + } + return false +} + +// Check if we need an ack for this store seq. +// This is called for interest based retention streams to remove messages. +func (o *consumer) needAck(sseq uint64, subj string) bool { + var needAck bool + var asflr, osseq uint64 + var pending map[uint64]*Pending + + o.mu.RLock() + defer o.mu.RUnlock() + + isFiltered := o.isFiltered() + if isFiltered && o.mset == nil { + return false + } + + // Check if we are filtered, and if so check if this is even applicable to us. 
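+ // If the caller did not supply the subject, load it from the store first.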
+ if isFiltered { + if subj == _EMPTY_ { + var svp StoreMsg + if _, err := o.mset.store.LoadMsg(sseq, &svp); err != nil { + return false + } + subj = svp.subj + } + if !o.isFilteredMatch(subj) { + return false + } + } + if o.isLeader() { + asflr, osseq = o.asflr, o.sseq + pending = o.pending + } else { + if o.store == nil { + return false + } + state, err := o.store.BorrowState() + if err != nil || state == nil { + // Fall back to what we track internally for now. + return sseq > o.asflr && !o.isFiltered() + } + // If loading state as here, the osseq is +1. + asflr, osseq, pending = state.AckFloor.Stream, state.Delivered.Stream+1, state.Pending + } + + switch o.cfg.AckPolicy { + case AckNone, AckAll: + needAck = sseq > asflr + case AckExplicit: + if sseq > asflr { + if sseq >= osseq { + needAck = true + } else { + _, needAck = pending[sseq] + } + } + } + + return needAck +} + +type PriorityGroup struct { + Group string `json:"group,omitempty"` + MinPending int64 `json:"min_pending,omitempty"` + MinAckPending int64 `json:"min_ack_pending,omitempty"` + Id string `json:"id,omitempty"` +} + +// Used in nextReqFromMsg, since the json.Unmarshal causes the request +// struct to escape to the heap always. This should reduce GC pressure. +var jsGetNextPool = sync.Pool{ + New: func() any { + return &JSApiConsumerGetNextRequest{} + }, +} + +// Helper for the next message requests. +func nextReqFromMsg(msg []byte) (time.Time, int, int, bool, time.Duration, time.Time, *PriorityGroup, error) { + req := bytes.TrimSpace(msg) + + switch { + case len(req) == 0: + return time.Time{}, 1, 0, false, 0, time.Time{}, nil, nil + + case req[0] == '{': + cr := jsGetNextPool.Get().(*JSApiConsumerGetNextRequest) + defer func() { + *cr = JSApiConsumerGetNextRequest{} + jsGetNextPool.Put(cr) + }() + if err := json.Unmarshal(req, &cr); err != nil { + return time.Time{}, -1, 0, false, 0, time.Time{}, nil, err + } + var hbt time.Time + if cr.Heartbeat > 0 { + if cr.Heartbeat*2 > cr.Expires { + return time.Time{}, 1, 0, false, 0, time.Time{}, nil, errors.New("heartbeat value too large") + } + hbt = time.Now().Add(cr.Heartbeat) + } + priorityGroup := cr.PriorityGroup + if cr.Expires == time.Duration(0) { + return time.Time{}, cr.Batch, cr.MaxBytes, cr.NoWait, cr.Heartbeat, hbt, &priorityGroup, nil + } + return time.Now().Add(cr.Expires), cr.Batch, cr.MaxBytes, cr.NoWait, cr.Heartbeat, hbt, &priorityGroup, nil + default: + if n, err := strconv.Atoi(string(req)); err == nil { + return time.Time{}, n, 0, false, 0, time.Time{}, nil, nil + } + } + + return time.Time{}, 1, 0, false, 0, time.Time{}, nil, nil +} + +// Represents a request that is on the internal waiting queue +type waitingRequest struct { + next *waitingRequest + acc *Account + interest string + reply string + n int // For batching + d int // num delivered + b int // For max bytes tracking + expires time.Time + received time.Time + hb time.Duration + hbt time.Time + noWait bool + priorityGroup *PriorityGroup +} + +// sync.Pool for waiting requests. +var wrPool = sync.Pool{ + New: func() any { + return new(waitingRequest) + }, +} + +// Recycle this request. This request can not be accessed after this call. +func (wr *waitingRequest) recycleIfDone() bool { + if wr != nil && wr.n <= 0 { + wr.recycle() + return true + } + return false +} + +// Force a recycle. 
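+// Pointers are cleared before returning to the pool so a recycled request
+// does not pin its account or neighboring requests in memory.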
+func (wr *waitingRequest) recycle() { + if wr != nil { + wr.next, wr.acc, wr.interest, wr.reply = nil, nil, _EMPTY_, _EMPTY_ + wrPool.Put(wr) + } +} + +// waiting queue for requests that are waiting for new messages to arrive. +type waitQueue struct { + n, max int + last time.Time + head *waitingRequest + tail *waitingRequest +} + +// Create a new ring buffer with at most max items. +func newWaitQueue(max int) *waitQueue { + return &waitQueue{max: max} +} + +var ( + errWaitQueueFull = errors.New("wait queue is full") + errWaitQueueNil = errors.New("wait queue is nil") +) + +// Adds in a new request. +func (wq *waitQueue) add(wr *waitingRequest) error { + if wq == nil { + return errWaitQueueNil + } + if wq.isFull() { + return errWaitQueueFull + } + if wq.head == nil { + wq.head = wr + } else { + wq.tail.next = wr + } + // Always set tail. + wq.tail = wr + // Make sure nil + wr.next = nil + + // Track last active via when we receive a request. + wq.last = wr.received + wq.n++ + return nil +} + +func (wq *waitQueue) isFull() bool { + if wq == nil { + return false + } + return wq.n == wq.max +} + +func (wq *waitQueue) isEmpty() bool { + if wq == nil { + return true + } + return wq.n == 0 +} + +func (wq *waitQueue) len() int { + if wq == nil { + return 0 + } + return wq.n +} + +// Peek will return the next request waiting or nil if empty. +func (wq *waitQueue) peek() *waitingRequest { + if wq == nil { + return nil + } + return wq.head +} + +func (wq *waitQueue) cycle() { + wr := wq.peek() + if wr != nil { + // Always remove current now on a pop, and move to end if still valid. + // If we were the only one don't need to remove since this can be a no-op. + wq.removeCurrent() + wq.add(wr) + } +} + +// pop will return the next request and move the read cursor. +// This will now place a request that still has pending items at the ends of the list. +func (wq *waitQueue) pop() *waitingRequest { + wr := wq.peek() + if wr != nil { + wr.d++ + wr.n-- + // Always remove current now on a pop, and move to end if still valid. + // If we were the only one don't need to remove since this can be a no-op. + if wr.n > 0 && wq.n > 1 { + wq.removeCurrent() + wq.add(wr) + } else if wr.n <= 0 { + wq.removeCurrent() + } + } + return wr +} + +// Removes the current read pointer (head FIFO) entry. +func (wq *waitQueue) removeCurrent() { + wq.remove(nil, wq.head) +} + +// Remove the wr element from the wait queue. +func (wq *waitQueue) remove(pre, wr *waitingRequest) { + if wr == nil { + return + } + if pre != nil { + pre.next = wr.next + } else if wr == wq.head { + // We are removing head here. + wq.head = wr.next + } + // Check if wr was our tail. + if wr == wq.tail { + // Check if we need to assign to pre. + if wr.next == nil { + wq.tail = pre + } else { + wq.tail = wr.next + } + } + wq.n-- +} + +// Return the map of pending requests keyed by the reply subject. +// No-op if push consumer or invalid etc. 
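+// The wait queue is walked head to tail, so the returned map includes every
+// waiting request regardless of expiration state.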
+func (o *consumer) pendingRequests() map[string]*waitingRequest { + if o.waiting == nil { + return nil + } + wq, m := o.waiting, make(map[string]*waitingRequest) + for wr := wq.head; wr != nil; wr = wr.next { + m[wr.reply] = wr + } + + return m +} + +func (o *consumer) setPinnedTimer(priorityGroup string) { + if o.pinnedTtl != nil { + o.pinnedTtl.Reset(o.cfg.PinnedTTL) + } else { + o.pinnedTtl = time.AfterFunc(o.cfg.PinnedTTL, func() { + o.mu.Lock() + o.pinnedTS = time.Now() + o.currentPinId = _EMPTY_ + o.sendUnpinnedAdvisoryLocked(priorityGroup, "timeout") + o.mu.Unlock() + o.signalNewMessages() + }) + } +} + +// Return next waiting request. This will check for expirations but not noWait or interest. +// That will be handled by processWaiting. +// Lock should be held. +func (o *consumer) nextWaiting(sz int) *waitingRequest { + if o.waiting == nil || o.waiting.isEmpty() { + return nil + } + + // Check if server needs to assign a new pin id. + needNewPin := o.currentPinId == _EMPTY_ && o.cfg.PriorityPolicy == PriorityPinnedClient + // As long as we support only one priority group, we can capture that group here and reuse it. + var priorityGroup string + if len(o.cfg.PriorityGroups) > 0 { + priorityGroup = o.cfg.PriorityGroups[0] + } + + lastRequest := o.waiting.tail + for wr := o.waiting.peek(); !o.waiting.isEmpty(); wr = o.waiting.peek() { + if wr == nil { + break + } + // Check if we have max bytes set. + if wr.b > 0 { + if sz <= wr.b { + wr.b -= sz + // If we are right now at zero, set batch to 1 to deliver this one but stop after. + if wr.b == 0 { + wr.n = 1 + } + } else { + // Since we can't send that message to the requestor, we need to + // notify that we are closing the request. + const maxBytesT = "NATS/1.0 409 Message Size Exceeds MaxBytes\r\n%s: %d\r\n%s: %d\r\n\r\n" + hdr := fmt.Appendf(nil, maxBytesT, JSPullRequestPendingMsgs, wr.n, JSPullRequestPendingBytes, wr.b) + o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + // Remove the current one, no longer valid due to max bytes limit. + o.waiting.removeCurrent() + if o.node != nil { + o.removeClusterPendingRequest(wr.reply) + } + wr.recycle() + continue + } + } + + if wr.expires.IsZero() || time.Now().Before(wr.expires) { + if needNewPin { + if wr.priorityGroup.Id == _EMPTY_ { + o.currentPinId = nuid.Next() + wr.priorityGroup.Id = o.currentPinId + o.setPinnedTimer(priorityGroup) + + } else { + // There is pin id set, but not a matching one. Send a notification to the client and remove the request. + // Probably this is the old pin id. + o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, []byte(JSPullRequestWrongPinID), nil, nil, 0)) + o.waiting.removeCurrent() + if o.node != nil { + o.removeClusterPendingRequest(wr.reply) + } + wr.recycle() + continue + } + } else if o.currentPinId != _EMPTY_ { + // Check if we have a match on the currentNuid + if wr.priorityGroup != nil && wr.priorityGroup.Id == o.currentPinId { + // If we have a match, we do nothing here and will deliver the message later down the code path. + } else if wr.priorityGroup.Id == _EMPTY_ { + o.waiting.cycle() + if wr == lastRequest { + return nil + } + continue + } else { + // There is pin id set, but not a matching one. Send a notification to the client and remove the request. 
+ o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, []byte(JSPullRequestWrongPinID), nil, nil, 0)) + o.waiting.removeCurrent() + if o.node != nil { + o.removeClusterPendingRequest(wr.reply) + } + wr.recycle() + continue + } + } + + if o.cfg.PriorityPolicy == PriorityOverflow { + if wr.priorityGroup != nil && + // We need to check o.npc+1, because before calling nextWaiting, we do o.npc-- + (wr.priorityGroup.MinPending > 0 && wr.priorityGroup.MinPending > o.npc+1 || + wr.priorityGroup.MinAckPending > 0 && wr.priorityGroup.MinAckPending > int64(len(o.pending))) { + o.waiting.cycle() + // We're done cycling through the requests. + if wr == lastRequest { + return nil + } + continue + } + } + if wr.acc.sl.HasInterest(wr.interest) { + if needNewPin { + o.sendPinnedAdvisoryLocked(priorityGroup) + } + return o.waiting.pop() + } else if time.Since(wr.received) < defaultGatewayRecentSubExpiration && (o.srv.leafNodeEnabled || o.srv.gateway.enabled) { + if needNewPin { + o.sendPinnedAdvisoryLocked(priorityGroup) + } + return o.waiting.pop() + } else if o.srv.gateway.enabled && o.srv.hasGatewayInterest(wr.acc.Name, wr.interest) { + if needNewPin { + o.sendPinnedAdvisoryLocked(priorityGroup) + } + return o.waiting.pop() + } + } else { + // We do check for expiration in `processWaiting`, but it is possible to hit the expiry here, and not there. + hdr := fmt.Appendf(nil, "NATS/1.0 408 Request Timeout\r\n%s: %d\r\n%s: %d\r\n\r\n", JSPullRequestPendingMsgs, wr.n, JSPullRequestPendingBytes, wr.b) + o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + o.waiting.removeCurrent() + if o.node != nil { + o.removeClusterPendingRequest(wr.reply) + } + wr.recycle() + continue + + } + if wr.interest != wr.reply { + const intExpT = "NATS/1.0 408 Interest Expired\r\n%s: %d\r\n%s: %d\r\n\r\n" + hdr := fmt.Appendf(nil, intExpT, JSPullRequestPendingMsgs, wr.n, JSPullRequestPendingBytes, wr.b) + o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + } + // Remove the current one, no longer valid. + o.waiting.removeCurrent() + if o.node != nil { + o.removeClusterPendingRequest(wr.reply) + } + wr.recycle() + } + + return nil +} + +// Next message request. +type nextMsgReq struct { + reply string + msg []byte +} + +var nextMsgReqPool sync.Pool + +func newNextMsgReq(reply string, msg []byte) *nextMsgReq { + var nmr *nextMsgReq + m := nextMsgReqPool.Get() + if m != nil { + nmr = m.(*nextMsgReq) + } else { + nmr = &nextMsgReq{} + } + // When getting something from a pool it is critical that all fields are + // initialized. Doing this way guarantees that if someone adds a field to + // the structure, the compiler will fail the build if this line is not updated. + (*nmr) = nextMsgReq{reply, msg} + return nmr +} + +func (nmr *nextMsgReq) returnToPool() { + if nmr == nil { + return + } + nmr.reply, nmr.msg = _EMPTY_, nil + nextMsgReqPool.Put(nmr) +} + +// processNextMsgReq will process a request for the next message available. A nil message payload means deliver +// a single message. If the payload is a formal request or a number parseable with Atoi(), then we will send a +// batch of messages without requiring another request to this endpoint, or an ACK. +func (o *consumer) processNextMsgReq(_ *subscription, c *client, _ *Account, _, reply string, msg []byte) { + if reply == _EMPTY_ { + return + } + + // Short circuit error here. 
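+ // A nil nextMsgReqs queue means this consumer was never set up for pull
+ // requests, so respond with 409 Consumer is push based.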
+ if o.nextMsgReqs == nil { + hdr := []byte("NATS/1.0 409 Consumer is push based\r\n\r\n") + o.outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + return + } + + _, msg = c.msgParts(msg) + o.nextMsgReqs.push(newNextMsgReq(reply, copyBytes(msg))) +} + +func (o *consumer) processNextMsgRequest(reply string, msg []byte) { + o.mu.Lock() + defer o.mu.Unlock() + + mset := o.mset + if mset == nil { + return + } + + sendErr := func(status int, description string) { + hdr := fmt.Appendf(nil, "NATS/1.0 %d %s\r\n\r\n", status, description) + o.outq.send(newJSPubMsg(reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + } + + if o.isPushMode() || o.waiting == nil { + sendErr(409, "Consumer is push based") + return + } + + // Check payload here to see if they sent in batch size or a formal request. + expires, batchSize, maxBytes, noWait, hb, hbt, priorityGroup, err := nextReqFromMsg(msg) + if err != nil { + sendErr(400, fmt.Sprintf("Bad Request - %v", err)) + return + } + + // Check for request limits + if o.cfg.MaxRequestBatch > 0 && batchSize > o.cfg.MaxRequestBatch { + sendErr(409, fmt.Sprintf("Exceeded MaxRequestBatch of %d", o.cfg.MaxRequestBatch)) + return + } + + if !expires.IsZero() && o.cfg.MaxRequestExpires > 0 && expires.After(time.Now().Add(o.cfg.MaxRequestExpires)) { + sendErr(409, fmt.Sprintf("Exceeded MaxRequestExpires of %v", o.cfg.MaxRequestExpires)) + return + } + + if maxBytes > 0 && o.cfg.MaxRequestMaxBytes > 0 && maxBytes > o.cfg.MaxRequestMaxBytes { + sendErr(409, fmt.Sprintf("Exceeded MaxRequestMaxBytes of %v", o.cfg.MaxRequestMaxBytes)) + return + } + + if priorityGroup != nil { + if (priorityGroup.MinPending != 0 || priorityGroup.MinAckPending != 0) && o.cfg.PriorityPolicy != PriorityOverflow { + sendErr(400, "Bad Request - Not a Overflow Priority consumer") + } + + if priorityGroup.Id != _EMPTY_ && o.cfg.PriorityPolicy != PriorityPinnedClient { + sendErr(400, "Bad Request - Not a Pinned Client Priority consumer") + } + } + + if priorityGroup != nil && o.cfg.PriorityPolicy != PriorityNone { + if priorityGroup.Group == _EMPTY_ { + sendErr(400, "Bad Request - Priority Group missing") + return + } + + found := false + for _, group := range o.cfg.PriorityGroups { + if group == priorityGroup.Group { + found = true + break + } + } + if !found { + sendErr(400, "Bad Request - Invalid Priority Group") + return + } + + if o.currentPinId != _EMPTY_ { + if priorityGroup.Id == o.currentPinId { + o.setPinnedTimer(priorityGroup.Group) + } else if priorityGroup.Id != _EMPTY_ { + sendErr(423, "Nats-Pin-Id mismatch") + return + } + } + } + + // If we have the max number of requests already pending try to expire. + if o.waiting.isFull() { + // Try to expire some of the requests. + // We do not want to push too hard here so at maximum process once per sec. + if time.Since(o.lwqic) > time.Second { + o.processWaiting(false) + } + } + + // If the request is for noWait and we have pending requests already, check if we have room. + if noWait { + msgsPending := o.numPending() + uint64(len(o.rdq)) + // If no pending at all, decide what to do with request. + // If no expires was set then fail. + if msgsPending == 0 && expires.IsZero() { + o.waiting.last = time.Now() + sendErr(404, "No Messages") + return + } + if msgsPending > 0 { + _, _, batchPending, _ := o.processWaiting(false) + if msgsPending < uint64(batchPending) { + o.waiting.last = time.Now() + sendErr(408, "Requests Pending") + return + } + } + // If we are here this should be considered a one-shot situation. 
+ // We will wait for expires but will return as soon as we have any messages. + } + + // If we receive this request through an account export, we need to track that interest subject and account. + acc, interest := trackDownAccountAndInterest(o.acc, reply) + + // Create a waiting request. + wr := wrPool.Get().(*waitingRequest) + wr.acc, wr.interest, wr.reply, wr.n, wr.d, wr.noWait, wr.expires, wr.hb, wr.hbt, wr.priorityGroup = acc, interest, reply, batchSize, 0, noWait, expires, hb, hbt, priorityGroup + wr.b = maxBytes + wr.received = time.Now() + + if err := o.waiting.add(wr); err != nil { + sendErr(409, "Exceeded MaxWaiting") + wr.recycle() + return + } + o.signalNewMessages() + // If we are clustered update our followers about this request. + if o.node != nil { + o.addClusterPendingRequest(wr.reply) + } +} + +func trackDownAccountAndInterest(acc *Account, interest string) (*Account, string) { + for strings.HasPrefix(interest, replyPrefix) { + oa := acc + oa.mu.RLock() + if oa.exports.responses == nil { + oa.mu.RUnlock() + break + } + si := oa.exports.responses[interest] + if si == nil { + oa.mu.RUnlock() + break + } + acc, interest = si.acc, si.to + oa.mu.RUnlock() + } + return acc, interest +} + +// Return current delivery count for a given sequence. +func (o *consumer) deliveryCount(seq uint64) uint64 { + if o.rdc == nil { + return 1 + } + return o.rdc[seq] +} + +// Increase the delivery count for this message. +// ONLY used on redelivery semantics. +// Lock should be held. +func (o *consumer) incDeliveryCount(sseq uint64) uint64 { + if o.rdc == nil { + o.rdc = make(map[uint64]uint64) + } + o.rdc[sseq] += 1 + return o.rdc[sseq] + 1 +} + +// Used if we have to adjust on failed delivery or bad lookups. +// Those failed attempts should not increase deliver count. +// Lock should be held. +func (o *consumer) decDeliveryCount(sseq uint64) { + if o.rdc == nil { + o.rdc = make(map[uint64]uint64) + } + o.rdc[sseq] -= 1 +} + +// send a delivery exceeded advisory. +func (o *consumer) notifyDeliveryExceeded(sseq, dc uint64) { + e := JSConsumerDeliveryExceededAdvisory{ + TypedEvent: TypedEvent{ + Type: JSConsumerDeliveryExceededAdvisoryType, + ID: nuid.Next(), + Time: time.Now().UTC(), + }, + Stream: o.stream, + Consumer: o.name, + StreamSeq: sseq, + Deliveries: dc, + Domain: o.srv.getOpts().JetStreamDomain, + } + + j, err := json.Marshal(e) + if err != nil { + return + } + + o.sendAdvisory(o.deliveryExcEventT, j) +} + +// Check if the candidate subject matches a filter if it's present. +// Lock should be held. +func (o *consumer) isFilteredMatch(subj string) bool { + // No filter is automatic match. + if o.subjf == nil { + return true + } + for _, filter := range o.subjf { + if !filter.hasWildcard && subj == filter.subject { + return true + } + } + // It's quicker to first check for non-wildcard filters, then + // iterate again to check for subset match. + tsa := [32]string{} + tts := tokenizeSubjectIntoSlice(tsa[:0], subj) + for _, filter := range o.subjf { + if isSubsetMatchTokenized(tts, filter.tokenizedSubject) { + return true + } + } + return false +} + +// Check if the candidate filter subject is equal to or a subset match +// of one of the filter subjects. +// Lock should be held. 
+func (o *consumer) isEqualOrSubsetMatch(subj string) bool { + for _, filter := range o.subjf { + if !filter.hasWildcard && subj == filter.subject { + return true + } + } + tsa := [32]string{} + tts := tokenizeSubjectIntoSlice(tsa[:0], subj) + for _, filter := range o.subjf { + if isSubsetMatchTokenized(filter.tokenizedSubject, tts) { + return true + } + } + return false +} + +var ( + errMaxAckPending = errors.New("max ack pending reached") + errBadConsumer = errors.New("consumer not valid") + errNoInterest = errors.New("consumer requires interest for delivery subject when ephemeral") +) + +// Get next available message from underlying store. +// Is partition aware and redeliver aware. +// Lock should be held. +func (o *consumer) getNextMsg() (*jsPubMsg, uint64, error) { + if o.mset == nil || o.mset.store == nil { + return nil, 0, errBadConsumer + } + // Process redelivered messages before looking at possibly "skip list" (deliver last per subject) + if o.hasRedeliveries() { + var seq, dc uint64 + for seq = o.getNextToRedeliver(); seq > 0; seq = o.getNextToRedeliver() { + dc = o.incDeliveryCount(seq) + if o.maxdc > 0 && dc > o.maxdc { + // Only send once + if dc == o.maxdc+1 { + o.notifyDeliveryExceeded(seq, dc-1) + } + // Make sure to remove from pending. + if p, ok := o.pending[seq]; ok && p != nil { + delete(o.pending, seq) + o.updateDelivered(p.Sequence, seq, dc, p.Timestamp) + } + continue + } + if seq > 0 { + pmsg := getJSPubMsgFromPool() + sm, err := o.mset.store.LoadMsg(seq, &pmsg.StoreMsg) + if sm == nil || err != nil { + pmsg.returnToPool() + pmsg, dc = nil, 0 + // Adjust back deliver count. + o.decDeliveryCount(seq) + } + return pmsg, dc, err + } + } + } + + // Check if we have max pending. + if o.maxp > 0 && len(o.pending) >= o.maxp { + // maxp only set when ack policy != AckNone and user set MaxAckPending + // Stall if we have hit max pending. + return nil, 0, errMaxAckPending + } + + if o.hasSkipListPending() { + seq := o.lss.seqs[0] + if len(o.lss.seqs) == 1 { + o.sseq = o.lss.resume + o.lss = nil + o.updateSkipped(o.sseq) + } else { + o.lss.seqs = o.lss.seqs[1:] + } + pmsg := getJSPubMsgFromPool() + sm, err := o.mset.store.LoadMsg(seq, &pmsg.StoreMsg) + if sm == nil || err != nil { + pmsg.returnToPool() + } + o.sseq++ + return pmsg, 1, err + } + + // Hold onto this since we release the lock. + store := o.mset.store + + var sseq uint64 + var err error + var sm *StoreMsg + var pmsg = getJSPubMsgFromPool() + + // Grab next message applicable to us. + filters, subjf, fseq := o.filters, o.subjf, o.sseq + // Check if we are multi-filtered or not. + if filters != nil { + sm, sseq, err = store.LoadNextMsgMulti(filters, fseq, &pmsg.StoreMsg) + } else if len(subjf) > 0 { // Means single filtered subject since o.filters means > 1. + filter, wc := subjf[0].subject, subjf[0].hasWildcard + sm, sseq, err = store.LoadNextMsg(filter, wc, fseq, &pmsg.StoreMsg) + } else { + // No filter here. + sm, sseq, err = store.LoadNextMsg(_EMPTY_, false, fseq, &pmsg.StoreMsg) + } + if sm == nil { + pmsg.returnToPool() + pmsg = nil + } + // Check if we should move our o.sseq. + if sseq >= o.sseq { + // If we are moving step by step then sseq == o.sseq. + // If we have jumped we should update skipped for other replicas. + if sseq != o.sseq && err == ErrStoreEOF { + o.updateSkipped(sseq + 1) + } + o.sseq = sseq + 1 + } + return pmsg, 1, err +} + +// Will check for expiration and lack of interest on waiting requests. +// Will also do any heartbeats and return the next expiration or HB interval. 
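+// Returns the number of requests removed (expired or without interest), the number still waiting, +// the sum of their remaining batch sizes, and the earliest deadline (expiry or heartbeat) to fire next.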
+func (o *consumer) processWaiting(eos bool) (int, int, int, time.Time) { + var fexp time.Time + if o.srv == nil || o.waiting.isEmpty() { + return 0, 0, 0, fexp + } + // Mark our last check time. + o.lwqic = time.Now() + + var expired, brp int + s, now := o.srv, time.Now() + + wq := o.waiting + remove := func(pre, wr *waitingRequest) *waitingRequest { + expired++ + if o.node != nil { + o.removeClusterPendingRequest(wr.reply) + } + next := wr.next + wq.remove(pre, wr) + wr.recycle() + return next + } + + var pre *waitingRequest + for wr := wq.head; wr != nil; { + // Check expiration. + if (eos && wr.noWait && wr.d > 0) || (!wr.expires.IsZero() && now.After(wr.expires)) { + hdr := fmt.Appendf(nil, "NATS/1.0 408 Request Timeout\r\n%s: %d\r\n%s: %d\r\n\r\n", JSPullRequestPendingMsgs, wr.n, JSPullRequestPendingBytes, wr.b) + o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) + wr = remove(pre, wr) + continue + } + // Now check interest. + interest := wr.acc.sl.HasInterest(wr.interest) + if !interest && (s.leafNodeEnabled || s.gateway.enabled) { + // If we are here check on gateways and leaf nodes (as they can mask gateways on the other end). + // If we have interest or the request is too young break and do not expire. + if time.Since(wr.received) < defaultGatewayRecentSubExpiration { + interest = true + } else if s.gateway.enabled && s.hasGatewayInterest(wr.acc.Name, wr.interest) { + interest = true + } + } + // Check if we have interest. + if !interest { + // No more interest here so go ahead and remove this one from our list. + wr = remove(pre, wr) + continue + } + + // If interest, update batch pending requests counter and update fexp timer. + brp += wr.n + if !wr.hbt.IsZero() { + if now.After(wr.hbt) { + // Fire off a heartbeat here. + o.sendIdleHeartbeat(wr.reply) + // Update next HB. + wr.hbt = now.Add(wr.hb) + } + if fexp.IsZero() || wr.hbt.Before(fexp) { + fexp = wr.hbt + } + } + if !wr.expires.IsZero() && (fexp.IsZero() || wr.expires.Before(fexp)) { + fexp = wr.expires + } + // Update pre and wr here. + pre = wr + wr = wr.next + } + + return expired, wq.len(), brp, fexp +} + +// Will check to make sure those waiting still have registered interest. +func (o *consumer) checkWaitingForInterest() bool { + o.processWaiting(true) + return o.waiting.len() > 0 +} + +// Lock should be held. +func (o *consumer) hbTimer() (time.Duration, *time.Timer) { + if o.cfg.Heartbeat == 0 { + return 0, nil + } + return o.cfg.Heartbeat, time.NewTimer(o.cfg.Heartbeat) +} + +// Check here for conditions when our ack floor may have drifted below the streams first sequence. +// In general this is accounted for in normal operations, but if the consumer misses the signal from +// the stream it will not clear the message and move the ack state. +// Should only be called from consumer leader. +func (o *consumer) checkAckFloor() { + o.mu.RLock() + mset, closed, asflr, numPending := o.mset, o.closed, o.asflr, len(o.pending) + o.mu.RUnlock() + + if asflr == 0 || closed || mset == nil { + return + } + + var ss StreamState + mset.store.FastState(&ss) + + // If our floor is equal or greater that is normal and nothing for us to do. + if ss.FirstSeq == 0 || asflr >= ss.FirstSeq-1 { + return + } + + // Check which linear space is less to walk. + if ss.FirstSeq-asflr-1 < uint64(numPending) { + // Process all messages that no longer exist. + for seq := asflr + 1; seq < ss.FirstSeq; seq++ { + // Check if this message was pending. 
+ o.mu.RLock() + p, isPending := o.pending[seq] + var rdc uint64 = 1 + if o.rdc != nil { + rdc = o.rdc[seq] + } + o.mu.RUnlock() + // If it was pending for us, get rid of it. + if isPending { + o.processTerm(seq, p.Sequence, rdc, ackTermLimitsReason, _EMPTY_) + } + } + } else if numPending > 0 { + // here it is shorter to walk pending. + // toTerm is seq, dseq, rcd for each entry. + toTerm := make([]uint64, 0, numPending*3) + o.mu.RLock() + for seq, p := range o.pending { + if seq < ss.FirstSeq { + var dseq uint64 = 1 + if p != nil { + dseq = p.Sequence + } + var rdc uint64 = 1 + if o.rdc != nil { + rdc = o.rdc[seq] + } + toTerm = append(toTerm, seq, dseq, rdc) + } + } + o.mu.RUnlock() + + for i := 0; i < len(toTerm); i += 3 { + seq, dseq, rdc := toTerm[i], toTerm[i+1], toTerm[i+2] + o.processTerm(seq, dseq, rdc, ackTermLimitsReason, _EMPTY_) + } + } + + // Do one final check here. + o.mu.Lock() + defer o.mu.Unlock() + + // If we are closed do not change anything and simply return. + if o.closed { + return + } + + // If we are here, and this should be rare, we still are off with our ack floor. + // We will make sure we are not doing un-necessary work here if only off by a bit + // since this could be normal for a high activity wq or stream. + // We will set it explicitly to 1 behind our current lowest in pending, or if + // pending is empty, to our current delivered -1. + const minOffThreshold = 50 + if ss.FirstSeq >= minOffThreshold && o.asflr < ss.FirstSeq-minOffThreshold { + var psseq, pdseq uint64 + for seq, p := range o.pending { + if psseq == 0 || seq < psseq { + psseq, pdseq = seq, p.Sequence + } + } + // If we still have none, set to current delivered -1. + if psseq == 0 { + psseq, pdseq = o.sseq-1, o.dseq-1 + // If still not adjusted. + if psseq < ss.FirstSeq-1 { + psseq = ss.FirstSeq - 1 + } + } else { + // Since this was set via the pending, we should not include + // it directly but set floors to -1. + psseq, pdseq = psseq-1, pdseq-1 + } + o.asflr, o.adflr = psseq, pdseq + } +} + +func (o *consumer) processInboundAcks(qch chan struct{}) { + // Grab the server lock to watch for server quit. + o.mu.RLock() + s, mset := o.srv, o.mset + hasInactiveThresh := o.cfg.InactiveThreshold > 0 + + o.mu.RUnlock() + + if s == nil || mset == nil { + return + } + + // We will check this on entry and periodically. + o.checkAckFloor() + + // How often we will check for ack floor drift. + // Spread these out for large numbers on a server restart. + delta := time.Duration(rand.Int63n(int64(time.Minute))) + ticker := time.NewTicker(time.Minute + delta) + defer ticker.Stop() + + for { + select { + case <-o.ackMsgs.ch: + acks := o.ackMsgs.pop() + for _, ack := range acks { + o.processAck(ack.subject, ack.reply, ack.hdr, ack.msg) + ack.returnToPool() + } + o.ackMsgs.recycle(&acks) + // If we have an inactiveThreshold set, mark our activity. + if hasInactiveThresh { + o.suppressDeletion() + } + case <-ticker.C: + o.checkAckFloor() + case <-qch: + return + case <-s.quitCh: + return + } + } +} + +// Process inbound next message requests. +func (o *consumer) processInboundNextMsgReqs(qch chan struct{}) { + // Grab the server lock to watch for server quit. 
+ o.mu.RLock() + s := o.srv + o.mu.RUnlock() + + for { + select { + case <-o.nextMsgReqs.ch: + reqs := o.nextMsgReqs.pop() + for _, req := range reqs { + o.processNextMsgRequest(req.reply, req.msg) + req.returnToPool() + } + o.nextMsgReqs.recycle(&reqs) + case <-qch: + return + case <-s.quitCh: + return + } + } +} + +// Suppress auto cleanup on ack activity of any kind. +func (o *consumer) suppressDeletion() { + o.mu.Lock() + defer o.mu.Unlock() + + if o.closed { + return + } + + if o.isPushMode() && o.dtmr != nil { + // if dtmr is not nil we have started the countdown, simply reset to threshold. + o.dtmr.Reset(o.dthresh) + } else if o.isPullMode() && o.waiting != nil { + // Pull mode always has timer running, just update last on waiting queue. + o.waiting.last = time.Now() + } +} + +// loopAndGatherMsgs waits for messages for the consumer. qch is the quit channel, +// upch is the unpause channel which fires when the PauseUntil deadline is reached. +func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { + // On startup check to see if we are in a reply situation where replay policy is not instant. + var ( + lts int64 // last time stamp seen, used for replay. + lseq uint64 + ) + + o.mu.RLock() + mset := o.mset + getLSeq := o.replay + o.mu.RUnlock() + // consumer is closed when mset is set to nil. + if mset == nil { + return + } + if getLSeq { + lseq = mset.state().LastSeq + } + + o.mu.Lock() + s := o.srv + // need to check again if consumer is closed + if o.mset == nil { + o.mu.Unlock() + return + } + // For idle heartbeat support. + var hbc <-chan time.Time + hbd, hb := o.hbTimer() + if hb != nil { + hbc = hb.C + } + // Interest changes. + inch := o.inch + o.mu.Unlock() + + // Grab the stream's retention policy and name + mset.cfgMu.RLock() + stream, rp := mset.cfg.Name, mset.cfg.Retention + mset.cfgMu.RUnlock() + + var err error + + // Deliver all the msgs we have now, once done or on a condition, we wait for new ones. + for { + var ( + pmsg *jsPubMsg + dc uint64 + dsubj string + ackReply string + delay time.Duration + sz int + wrn, wrb int + ) + + o.mu.Lock() + + // consumer is closed when mset is set to nil. + if o.closed || o.mset == nil { + o.mu.Unlock() + return + } + + // Clear last error. + err = nil + + // If the consumer is paused then stop sending. + if o.cfg.PauseUntil != nil && !o.cfg.PauseUntil.IsZero() && time.Now().Before(*o.cfg.PauseUntil) { + // If the consumer is paused and we haven't reached the deadline yet then + // go back to waiting. + goto waitForMsgs + } + + // If we are in push mode and not active or under flowcontrol let's stop sending. + if o.isPushMode() { + if !o.active || (o.maxpb > 0 && o.pbytes > o.maxpb) { + goto waitForMsgs + } + } else if o.waiting.isEmpty() { + // If we are in pull mode and no one is waiting already break and wait. + goto waitForMsgs + } + + // Grab our next msg. + pmsg, dc, err = o.getNextMsg() + + // We can release the lock now under getNextMsg so need to check this condition again here. + if o.closed || o.mset == nil { + o.mu.Unlock() + return + } + + // On error either wait or return. + if err != nil || pmsg == nil { + // On EOF we can optionally fast sync num pending state. 
+ if err == ErrStoreEOF { + o.checkNumPendingOnEOF() + } + if err == ErrStoreMsgNotFound || err == errDeletedMsg || err == ErrStoreEOF || err == errMaxAckPending { + goto waitForMsgs + } else if err == errPartialCache { + s.Warnf("Unexpected partial cache error looking up message for consumer '%s > %s > %s'", + o.mset.acc, stream, o.cfg.Name) + goto waitForMsgs + + } else { + s.Errorf("Received an error looking up message for consumer '%s > %s > %s': %v", + o.mset.acc, stream, o.cfg.Name, err) + goto waitForMsgs + } + } + + // Update our cached num pending here first. + if dc == 1 { + o.npc-- + } + // Pre-calculate ackReply + ackReply = o.ackReply(pmsg.seq, o.dseq, dc, pmsg.ts, o.numPending()) + + // If headers only do not send msg payload. + // Add in msg size itself as header. + if o.cfg.HeadersOnly { + convertToHeadersOnly(pmsg) + } + // Calculate payload size. This can be calculated on client side. + // We do not include transport subject here since not generally known on client. + sz = len(pmsg.subj) + len(ackReply) + len(pmsg.hdr) + len(pmsg.msg) + + if o.isPushMode() { + dsubj = o.dsubj + } else if wr := o.nextWaiting(sz); wr != nil { + wrn, wrb = wr.n, wr.b + dsubj = wr.reply + if o.cfg.PriorityPolicy == PriorityPinnedClient { + // FIXME(jrm): Can we make this prettier? + if len(pmsg.hdr) == 0 { + pmsg.hdr = genHeader(pmsg.hdr, JSPullRequestNatsPinId, o.currentPinId) + pmsg.buf = append(pmsg.hdr, pmsg.msg...) + } else { + pmsg.hdr = genHeader(pmsg.hdr, JSPullRequestNatsPinId, o.currentPinId) + bufLen := len(pmsg.hdr) + len(pmsg.msg) + pmsg.buf = make([]byte, bufLen) + pmsg.buf = append(pmsg.hdr, pmsg.msg...) + } + + sz = len(pmsg.subj) + len(ackReply) + len(pmsg.hdr) + len(pmsg.msg) + + } + if done := wr.recycleIfDone(); done && o.node != nil { + o.removeClusterPendingRequest(dsubj) + } else if !done && wr.hb > 0 { + wr.hbt = time.Now().Add(wr.hb) + } + } else { + // We will redo this one as long as this is not a redelivery. + // Need to also test that this is not going backwards since if + // we fail to deliver we can end up here from rdq but we do not + // want to decrement o.sseq if that is the case. + if dc == 1 && pmsg.seq == o.sseq-1 { + o.sseq-- + o.npc++ + } else if !o.onRedeliverQueue(pmsg.seq) { + // We are not on the rdq so decrement the delivery count + // and add it back. + o.decDeliveryCount(pmsg.seq) + o.addToRedeliverQueue(pmsg.seq) + } + pmsg.returnToPool() + goto waitForMsgs + } + + // If we are in a replay scenario and have not caught up check if we need to delay here. + if o.replay && lts > 0 { + if delay = time.Duration(pmsg.ts - lts); delay > time.Millisecond { + o.mu.Unlock() + select { + case <-qch: + pmsg.returnToPool() + return + case <-time.After(delay): + } + o.mu.Lock() + } + } + + // Track this regardless. + lts = pmsg.ts + + // If we have a rate limit set make sure we check that here. + if o.rlimit != nil { + now := time.Now() + r := o.rlimit.ReserveN(now, sz) + delay := r.DelayFrom(now) + if delay > 0 { + o.mu.Unlock() + select { + case <-qch: + pmsg.returnToPool() + return + case <-time.After(delay): + } + o.mu.Lock() + } + } + + // Do actual delivery. + o.deliverMsg(dsubj, ackReply, pmsg, dc, rp) + + // If given request fulfilled batch size, but there are still pending bytes, send information about it. 
+ if wrn <= 0 && wrb > 0 { + msg := fmt.Appendf(nil, JsPullRequestRemainingBytesT, JSPullRequestPendingMsgs, wrn, JSPullRequestPendingBytes, wrb) + o.outq.send(newJSPubMsg(dsubj, _EMPTY_, _EMPTY_, msg, nil, nil, 0)) + } + // Reset our idle heartbeat timer if set. + if hb != nil { + hb.Reset(hbd) + } + + o.mu.Unlock() + continue + + waitForMsgs: + // If we were in a replay state check to see if we are caught up. If so clear. + if o.replay && o.sseq > lseq { + o.replay = false + } + + // Make sure to process any expired requests that are pending. + var wrExp <-chan time.Time + if o.isPullMode() { + // Dont expire oneshots if we are here because of max ack pending limit. + _, _, _, fexp := o.processWaiting(err != errMaxAckPending) + if !fexp.IsZero() { + expires := time.Until(fexp) + if expires <= 0 { + expires = time.Millisecond + } + wrExp = time.NewTimer(expires).C + } + } + + // We will wait here for new messages to arrive. + mch, odsubj := o.mch, o.cfg.DeliverSubject + o.mu.Unlock() + + select { + case <-mch: + // Messages are waiting. + case interest := <-inch: + // inch can be nil on pull-based, but then this will + // just block and not fire. + o.updateDeliveryInterest(interest) + case <-qch: + return + case <-wrExp: + o.mu.Lock() + o.processWaiting(true) + o.mu.Unlock() + case <-hbc: + if o.isActive() { + o.mu.RLock() + o.sendIdleHeartbeat(odsubj) + o.mu.RUnlock() + } + // Reset our idle heartbeat timer. + hb.Reset(hbd) + } + } +} + +// Lock should be held. +func (o *consumer) sendIdleHeartbeat(subj string) { + const t = "NATS/1.0 100 Idle Heartbeat\r\n%s: %d\r\n%s: %d\r\n\r\n" + sseq, dseq := o.sseq-1, o.dseq-1 + hdr := fmt.Appendf(nil, t, JSLastConsumerSeq, dseq, JSLastStreamSeq, sseq) + if fcp := o.fcid; fcp != _EMPTY_ { + // Add in that we are stalled on flow control here. + addOn := fmt.Appendf(nil, "%s: %s\r\n\r\n", JSConsumerStalled, fcp) + hdr = append(hdr[:len(hdr)-LEN_CR_LF], []byte(addOn)...) + } + o.outq.send(newJSPubMsg(subj, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) +} + +func (o *consumer) ackReply(sseq, dseq, dc uint64, ts int64, pending uint64) string { + return fmt.Sprintf(o.ackReplyT, dc, sseq, dseq, ts, pending) +} + +// Used mostly for testing. Sets max pending bytes for flow control setups. +func (o *consumer) setMaxPendingBytes(limit int) { + o.pblimit = limit + o.maxpb = limit / 16 + if o.maxpb == 0 { + o.maxpb = 1 + } +} + +// Does some sanity checks to see if we should re-calculate. +// Since there is a race when decrementing when there is contention at the beginning of the stream. +// The race is a getNextMsg skips a deleted msg, and then the decStreamPending call fires. +// This does some quick sanity checks to see if we should re-calculate num pending. +// Lock should be held. +func (o *consumer) checkNumPending() uint64 { + if o.mset != nil { + var state StreamState + o.mset.store.FastState(&state) + npc := o.numPending() + if o.sseq > state.LastSeq && npc > 0 || npc > state.Msgs { + // Re-calculate. + o.streamNumPending() + } + } + return o.numPending() +} + +// Lock should be held. +func (o *consumer) numPending() uint64 { + if o.npc < 0 { + return 0 + } + return uint64(o.npc) +} + +// This will do a quick sanity check on num pending when we encounter +// and EOF in the loop and gather. +// Lock should be held. +func (o *consumer) checkNumPendingOnEOF() { + if o.mset == nil { + return + } + var state StreamState + o.mset.store.FastState(&state) + if o.sseq > state.LastSeq && o.npc != 0 { + // We know here we can reset our running state for num pending. 
+ o.npc, o.npf = 0, state.LastSeq + } +} + +// Call into streamNumPending after acquiring the consumer lock. +func (o *consumer) streamNumPendingLocked() uint64 { + o.mu.Lock() + defer o.mu.Unlock() + return o.streamNumPending() +} + +// Will force a set from the stream store of num pending. +// Depends on delivery policy, for last per subject we calculate differently. +// Lock should be held. +func (o *consumer) streamNumPending() uint64 { + if o.mset == nil || o.mset.store == nil { + o.npc, o.npf = 0, 0 + return 0 + } + npc, npf := o.calculateNumPending() + o.npc, o.npf = int64(npc), npf + return o.numPending() +} + +// Will calculate num pending but only requires a read lock. +// Depends on delivery policy, for last per subject we calculate differently. +// At least RLock should be held. +func (o *consumer) calculateNumPending() (npc, npf uint64) { + if o.mset == nil || o.mset.store == nil { + return 0, 0 + } + + isLastPerSubject := o.cfg.DeliverPolicy == DeliverLastPerSubject + filters, subjf := o.filters, o.subjf + + if filters != nil { + return o.mset.store.NumPendingMulti(o.sseq, filters, isLastPerSubject) + } else if len(subjf) > 0 { + filter := subjf[0].subject + return o.mset.store.NumPending(o.sseq, filter, isLastPerSubject) + } + return o.mset.store.NumPending(o.sseq, _EMPTY_, isLastPerSubject) +} + +func convertToHeadersOnly(pmsg *jsPubMsg) { + // If headers only do not send msg payload. + // Add in msg size itself as header. + hdr, msg := pmsg.hdr, pmsg.msg + var bb bytes.Buffer + if len(hdr) == 0 { + bb.WriteString(hdrLine) + } else { + bb.Write(hdr) + bb.Truncate(len(hdr) - LEN_CR_LF) + } + bb.WriteString(JSMsgSize) + bb.WriteString(": ") + bb.WriteString(strconv.FormatInt(int64(len(msg)), 10)) + bb.WriteString(CR_LF) + bb.WriteString(CR_LF) + // Replace underlying buf which we can use directly when we send. + // TODO(dlc) - Probably just use directly when forming bytes.Buffer? + pmsg.buf = pmsg.buf[:0] + pmsg.buf = append(pmsg.buf, bb.Bytes()...) + // Replace with new header. + pmsg.hdr = pmsg.buf + // Cancel msg payload + pmsg.msg = nil +} + +// Deliver a msg to the consumer. +// Lock should be held and o.mset validated to be non-nil. +func (o *consumer) deliverMsg(dsubj, ackReply string, pmsg *jsPubMsg, dc uint64, rp RetentionPolicy) { + if o.mset == nil { + pmsg.returnToPool() + return + } + + dseq := o.dseq + o.dseq++ + + pmsg.dsubj, pmsg.reply, pmsg.o = dsubj, ackReply, o + psz := pmsg.size() + + if o.maxpb > 0 { + o.pbytes += psz + } + + mset := o.mset + ap := o.cfg.AckPolicy + + // Cant touch pmsg after this sending so capture what we need. + seq, ts := pmsg.seq, pmsg.ts + + // Update delivered first. + o.updateDelivered(dseq, seq, dc, ts) + + // Send message. + o.outq.send(pmsg) + + if ap == AckExplicit || ap == AckAll { + o.trackPending(seq, dseq) + } else if ap == AckNone { + o.adflr = dseq + o.asflr = seq + } + + // Flow control. + if o.maxpb > 0 && o.needFlowControl(psz) { + o.sendFlowControl() + } + + // If pull mode and we have inactivity threshold, signaled by dthresh, update last activity. + if o.isPullMode() && o.dthresh > 0 { + o.waiting.last = time.Now() + } + + // If we are ack none and mset is interest only we should make sure stream removes interest. 
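+ // With AckNone no client ack will ever arrive, so for interest or workqueue retention we either push the + // sequence onto the stream's ack queue or record the ack through updateAcks so the message can be removed.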
+ if ap == AckNone && rp != LimitsPolicy { + if mset != nil && mset.ackq != nil && (o.node == nil || o.cfg.Direct) { + mset.ackq.push(seq) + } else { + o.updateAcks(dseq, seq, _EMPTY_) + } + } +} + +func (o *consumer) needFlowControl(sz int) bool { + if o.maxpb == 0 { + return false + } + // Decide whether to send a flow control message which we will need the user to respond. + // We send when we are over 50% of our current window limit. + if o.fcid == _EMPTY_ && o.pbytes > o.maxpb/2 { + return true + } + // If we have an existing outstanding FC, check to see if we need to expand the o.fcsz + if o.fcid != _EMPTY_ && (o.pbytes-o.fcsz) >= o.maxpb { + o.fcsz += sz + } + return false +} + +func (o *consumer) processFlowControl(_ *subscription, c *client, _ *Account, subj, _ string, _ []byte) { + o.mu.Lock() + defer o.mu.Unlock() + + // Ignore if not the latest we have sent out. + if subj != o.fcid { + return + } + + // For slow starts and ramping up. + if o.maxpb < o.pblimit { + o.maxpb *= 2 + if o.maxpb > o.pblimit { + o.maxpb = o.pblimit + } + } + + // Update accounting. + o.pbytes -= o.fcsz + if o.pbytes < 0 { + o.pbytes = 0 + } + o.fcid, o.fcsz = _EMPTY_, 0 + + o.signalNewMessages() +} + +// Lock should be held. +func (o *consumer) fcReply() string { + var sb strings.Builder + sb.WriteString(jsFlowControlPre) + sb.WriteString(o.stream) + sb.WriteByte(btsep) + sb.WriteString(o.name) + sb.WriteByte(btsep) + var b [4]byte + rn := rand.Int63() + for i, l := 0, rn; i < len(b); i++ { + b[i] = digits[l%base] + l /= base + } + sb.Write(b[:]) + return sb.String() +} + +// sendFlowControl will send a flow control packet to the consumer. +// Lock should be held. +func (o *consumer) sendFlowControl() { + if !o.isPushMode() { + return + } + subj, rply := o.cfg.DeliverSubject, o.fcReply() + o.fcsz, o.fcid = o.pbytes, rply + hdr := []byte("NATS/1.0 100 FlowControl Request\r\n\r\n") + o.outq.send(newJSPubMsg(subj, _EMPTY_, rply, hdr, nil, nil, 0)) +} + +// Tracks our outstanding pending acks. Only applicable to AckExplicit mode. +// Lock should be held. +func (o *consumer) trackPending(sseq, dseq uint64) { + if o.pending == nil { + o.pending = make(map[uint64]*Pending) + } + + // We could have a backoff that set a timer higher than what we need for this message. + // In that case, reset to lowest backoff required for a message redelivery. + minDelay := o.ackWait(0) + if l := len(o.cfg.BackOff); l > 0 { + bi := int(o.rdc[sseq]) + if bi < 0 { + bi = 0 + } else if bi >= l { + bi = l - 1 + } + minDelay = o.ackWait(o.cfg.BackOff[bi]) + } + minDeadline := time.Now().Add(minDelay) + if o.ptmr == nil || o.ptmrEnd.After(minDeadline) { + o.resetPtmr(minDelay) + } + + if p, ok := o.pending[sseq]; ok { + // Update timestamp but keep original consumer delivery sequence. + // So do not update p.Sequence. + p.Timestamp = time.Now().UnixNano() + } else { + o.pending[sseq] = &Pending{dseq, time.Now().UnixNano()} + } +} + +// Credit back a failed delivery. +// lock should be held. +func (o *consumer) creditWaitingRequest(reply string) { + wq := o.waiting + for wr := wq.head; wr != nil; wr = wr.next { + if wr.reply == reply { + wr.n++ + wr.d-- + return + } + } +} + +// didNotDeliver is called when a delivery for a consumer message failed. +// Depending on our state, we will process the failure. +func (o *consumer) didNotDeliver(seq uint64, subj string) { + o.mu.Lock() + mset := o.mset + if mset == nil { + o.mu.Unlock() + return + } + // Adjust back deliver count. 
+ o.decDeliveryCount(seq) + + var checkDeliveryInterest bool + if o.isPushMode() { + o.active = false + checkDeliveryInterest = true + } else if o.pending != nil { + // Good chance we did not deliver because no interest so force a check. + o.processWaiting(false) + // If it is still there credit it back. + o.creditWaitingRequest(subj) + // pull mode and we have pending. + if _, ok := o.pending[seq]; ok { + // We found this message on pending, we need + // to queue it up for immediate redelivery since + // we know it was not delivered + if !o.onRedeliverQueue(seq) { + o.addToRedeliverQueue(seq) + if !o.waiting.isEmpty() { + o.signalNewMessages() + } + } + } + } + o.mu.Unlock() + + // If we do not have interest update that here. + if checkDeliveryInterest && o.hasNoLocalInterest() { + o.updateDeliveryInterest(false) + } +} + +// Lock should be held. +func (o *consumer) addToRedeliverQueue(seqs ...uint64) { + o.rdq = append(o.rdq, seqs...) + for _, seq := range seqs { + o.rdqi.Insert(seq) + } +} + +// Lock should be held. +func (o *consumer) hasRedeliveries() bool { + return len(o.rdq) > 0 +} + +func (o *consumer) getNextToRedeliver() uint64 { + if len(o.rdq) == 0 { + return 0 + } + seq := o.rdq[0] + if len(o.rdq) == 1 { + o.rdq = nil + o.rdqi.Empty() + } else { + o.rdq = append(o.rdq[:0], o.rdq[1:]...) + o.rdqi.Delete(seq) + } + return seq +} + +// This checks if we already have this sequence queued for redelivery. +// FIXME(dlc) - This is O(n) but should be fast with small redeliver size. +// Lock should be held. +func (o *consumer) onRedeliverQueue(seq uint64) bool { + return o.rdqi.Exists(seq) +} + +// Remove a sequence from the redelivery queue. +// Lock should be held. +func (o *consumer) removeFromRedeliverQueue(seq uint64) bool { + if !o.onRedeliverQueue(seq) { + return false + } + for i, rseq := range o.rdq { + if rseq == seq { + if len(o.rdq) == 1 { + o.rdq = nil + o.rdqi.Empty() + } else { + o.rdq = append(o.rdq[:i], o.rdq[i+1:]...) + o.rdqi.Delete(seq) + } + return true + } + } + return false +} + +// Checks the pending messages. +func (o *consumer) checkPending() { + o.mu.Lock() + defer o.mu.Unlock() + + mset := o.mset + // On stop, mset and timer will be nil. + if o.closed || mset == nil || o.ptmr == nil { + o.stopAndClearPtmr() + return + } + + var shouldUpdateState bool + var state StreamState + mset.store.FastState(&state) + fseq := state.FirstSeq + + now := time.Now().UnixNano() + ttl := int64(o.cfg.AckWait) + next := int64(o.ackWait(0)) + // However, if there is backoff, initialize with the largest backoff. + // It will be adjusted as needed. + if l := len(o.cfg.BackOff); l > 0 { + next = int64(o.cfg.BackOff[l-1]) + } + + // Since we can update timestamps, we have to review all pending. + // We will now bail if we see an ack pending inbound to us via o.awl. + var expired []uint64 + check := len(o.pending) > 1024 + for seq, p := range o.pending { + if check && atomic.LoadInt64(&o.awl) > 0 { + o.resetPtmr(100 * time.Millisecond) + return + } + // Check if these are no longer valid. + if seq < fseq || seq <= o.asflr { + delete(o.pending, seq) + delete(o.rdc, seq) + o.removeFromRedeliverQueue(seq) + shouldUpdateState = true + // Check if we need to move ack floors. + if seq > o.asflr { + o.asflr = seq + } + if p.Sequence > o.adflr { + o.adflr = p.Sequence + } + continue + } + elapsed, deadline := now-p.Timestamp, ttl + if len(o.cfg.BackOff) > 0 { + // This is ok even if o.rdc is nil, we would get dc == 0, which is what we want. 
+ dc := int(o.rdc[seq]) + if dc < 0 { + // Prevent consumer backoff from going backwards. + dc = 0 + } + // This will be the index for the next backoff, will set to last element if needed. + nbi := dc + 1 + if dc+1 >= len(o.cfg.BackOff) { + dc = len(o.cfg.BackOff) - 1 + nbi = dc + } + deadline = int64(o.cfg.BackOff[dc]) + // Set `next` to the next backoff (if smaller than current `next` value). + if nextBackoff := int64(o.cfg.BackOff[nbi]); nextBackoff < next { + next = nextBackoff + } + } + if elapsed >= deadline { + // We will check if we have hit our max deliveries. Previously we would do this on getNextMsg() which + // worked well for push consumers, but with pull based consumers would require a new pull request to be + // present to process and redelivered could be reported incorrectly. + if !o.onRedeliverQueue(seq) && !o.hasMaxDeliveries(seq) { + expired = append(expired, seq) + } + } else if deadline-elapsed < next { + // Update when we should fire next. + next = deadline - elapsed + } + } + + if len(expired) > 0 { + // We need to sort. + slices.Sort(expired) + o.addToRedeliverQueue(expired...) + // Now we should update the timestamp here since we are redelivering. + // We will use an incrementing time to preserve order for any other redelivery. + off := now - o.pending[expired[0]].Timestamp + for _, seq := range expired { + if p, ok := o.pending[seq]; ok { + p.Timestamp += off + } + } + o.signalNewMessages() + } + + if len(o.pending) > 0 { + o.resetPtmr(time.Duration(next)) + } else { + // Make sure to stop timer and clear out any re delivery queues + o.stopAndClearPtmr() + o.rdq = nil + o.rdqi.Empty() + o.pending = nil + // Mimic behavior in processAckMsg when pending is empty. + o.adflr, o.asflr = o.dseq-1, o.sseq-1 + } + + // Update our state if needed. + if shouldUpdateState { + if err := o.writeStoreStateUnlocked(); err != nil && o.srv != nil && o.mset != nil && !o.closed { + s, acc, mset, name := o.srv, o.acc, o.mset, o.name + s.Warnf("Consumer '%s > %s > %s' error on write store state from check pending: %v", acc, mset.getCfgName(), name, err) + } + } +} + +// SeqFromReply will extract a sequence number from a reply subject. +func (o *consumer) seqFromReply(reply string) uint64 { + _, dseq, _ := ackReplyInfo(reply) + return dseq +} + +// StreamSeqFromReply will extract the stream sequence from the reply subject. +func (o *consumer) streamSeqFromReply(reply string) uint64 { + sseq, _, _ := ackReplyInfo(reply) + return sseq +} + +// Quick parser for positive numbers in ack reply encoding. +func parseAckReplyNum(d string) (n int64) { + if len(d) == 0 { + return -1 + } + for _, dec := range d { + if dec < asciiZero || dec > asciiNine { + return -1 + } + n = n*10 + (int64(dec) - asciiZero) + } + return n +} + +const expectedNumReplyTokens = 9 + +// Grab encoded information in the reply subject for a delivered message. +func replyInfo(subject string) (sseq, dseq, dc uint64, ts int64, pending uint64) { + tsa := [expectedNumReplyTokens]string{} + start, tokens := 0, tsa[:0] + for i := 0; i < len(subject); i++ { + if subject[i] == btsep { + tokens = append(tokens, subject[start:i]) + start = i + 1 + } + } + tokens = append(tokens, subject[start:]) + if len(tokens) != expectedNumReplyTokens || tokens[0] != "$JS" || tokens[1] != "ACK" { + return 0, 0, 0, 0, 0 + } + // TODO(dlc) - Should we error if we do not match consumer name? + // stream is tokens[2], consumer is 3. 
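+ // Full layout as parsed below: $JS.ACK.<stream>.<consumer>.<delivery count>.<stream seq>.<consumer seq>.<timestamp>.<pending>, + // e.g. $JS.ACK.ORDERS.worker.1.100.50.1738000000000000000.25 (illustrative values only).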
+ dc = uint64(parseAckReplyNum(tokens[4])) + sseq, dseq = uint64(parseAckReplyNum(tokens[5])), uint64(parseAckReplyNum(tokens[6])) + ts = parseAckReplyNum(tokens[7]) + pending = uint64(parseAckReplyNum(tokens[8])) + + return sseq, dseq, dc, ts, pending +} + +func ackReplyInfo(subject string) (sseq, dseq, dc uint64) { + tsa := [expectedNumReplyTokens]string{} + start, tokens := 0, tsa[:0] + for i := 0; i < len(subject); i++ { + if subject[i] == btsep { + tokens = append(tokens, subject[start:i]) + start = i + 1 + } + } + tokens = append(tokens, subject[start:]) + if len(tokens) != expectedNumReplyTokens || tokens[0] != "$JS" || tokens[1] != "ACK" { + return 0, 0, 0 + } + dc = uint64(parseAckReplyNum(tokens[4])) + sseq, dseq = uint64(parseAckReplyNum(tokens[5])), uint64(parseAckReplyNum(tokens[6])) + + return sseq, dseq, dc +} + +// NextSeq returns the next delivered sequence number for this consumer. +func (o *consumer) nextSeq() uint64 { + o.mu.RLock() + dseq := o.dseq + o.mu.RUnlock() + return dseq +} + +// Used to hold skip list when deliver policy is last per subject. +type lastSeqSkipList struct { + resume uint64 + seqs []uint64 +} + +// Let's us know we have a skip list, which is for deliver last per subject and we are just starting. +// Lock should be held. +func (o *consumer) hasSkipListPending() bool { + return o.lss != nil && len(o.lss.seqs) > 0 +} + +// Will select the starting sequence. +func (o *consumer) selectStartingSeqNo() { + if o.mset == nil || o.mset.store == nil { + o.sseq = 1 + } else { + var state StreamState + o.mset.store.FastState(&state) + if o.cfg.OptStartSeq == 0 { + if o.cfg.DeliverPolicy == DeliverAll { + o.sseq = state.FirstSeq + } else if o.cfg.DeliverPolicy == DeliverLast { + if o.subjf == nil { + o.sseq = state.LastSeq + return + } + // If we are partitioned here this will be properly set when we become leader. + for _, filter := range o.subjf { + ss := o.mset.store.FilteredState(1, filter.subject) + if ss.Last > o.sseq { + o.sseq = ss.Last + } + } + } else if o.cfg.DeliverPolicy == DeliverLastPerSubject { + // If our parent stream is set to max msgs per subject of 1 this is just + // a normal consumer at this point. We can avoid any heavy lifting. + o.mset.cfgMu.RLock() + mmp := o.mset.cfg.MaxMsgsPer + o.mset.cfgMu.RUnlock() + if mmp == 1 { + o.sseq = state.FirstSeq + } else { + // A threshold for when we switch from get last msg to subjects state. + const numSubjectsThresh = 256 + lss := &lastSeqSkipList{resume: state.LastSeq} + var filters []string + if o.subjf == nil { + filters = append(filters, o.cfg.FilterSubject) + } else { + for _, filter := range o.subjf { + filters = append(filters, filter.subject) + } + } + for _, filter := range filters { + if st := o.mset.store.SubjectsTotals(filter); len(st) < numSubjectsThresh { + var smv StoreMsg + for subj := range st { + if sm, err := o.mset.store.LoadLastMsg(subj, &smv); err == nil { + lss.seqs = append(lss.seqs, sm.seq) + } + } + } else if mss := o.mset.store.SubjectsState(filter); len(mss) > 0 { + for _, ss := range mss { + lss.seqs = append(lss.seqs, ss.Last) + } + } + } + // Sort the skip list if needed. + if len(lss.seqs) > 1 { + slices.Sort(lss.seqs) + } + if len(lss.seqs) == 0 { + o.sseq = state.LastSeq + } else { + o.sseq = lss.seqs[0] + } + // Assign skip list. + o.lss = lss + } + } else if o.cfg.OptStartTime != nil { + // If we are here we are time based. + // TODO(dlc) - Once clustered can't rely on this. 
+ o.sseq = o.mset.store.GetSeqFromTime(*o.cfg.OptStartTime) + // Here we want to see if we are filtered, and if so possibly close the gap + // to the nearest first given our starting sequence from time. This is so we do + // not force the system to do a linear walk between o.sseq and the real first. + if len(o.subjf) > 0 { + nseq := state.LastSeq + for _, filter := range o.subjf { + // Use first sequence since this is more optimized atm. + ss := o.mset.store.FilteredState(state.FirstSeq, filter.subject) + if ss.First >= o.sseq && ss.First < nseq { + nseq = ss.First + } + } + // Skip ahead if possible. + if nseq > o.sseq && nseq < state.LastSeq { + o.sseq = nseq + } + } + } else { + // DeliverNew + o.sseq = state.LastSeq + 1 + } + } else { + o.sseq = o.cfg.OptStartSeq + } + + if state.FirstSeq == 0 { + o.sseq = 1 + } else if o.sseq < state.FirstSeq { + o.sseq = state.FirstSeq + } else if o.sseq > state.LastSeq { + o.sseq = state.LastSeq + 1 + } + } + + // Always set delivery sequence to 1. + o.dseq = 1 + // Set ack delivery floor to delivery-1 + o.adflr = o.dseq - 1 + // Set ack store floor to store-1 + o.asflr = o.sseq - 1 + // Set our starting sequence state. + if o.store != nil && o.sseq > 0 { + o.store.SetStarting(o.sseq - 1) + } +} + +// Test whether a config represents a durable subscriber. +func isDurableConsumer(config *ConsumerConfig) bool { + return config != nil && config.Durable != _EMPTY_ +} + +func (o *consumer) isDurable() bool { + return o.cfg.Durable != _EMPTY_ +} + +// Are we in push mode, delivery subject, etc. +func (o *consumer) isPushMode() bool { + return o.cfg.DeliverSubject != _EMPTY_ +} + +func (o *consumer) isPullMode() bool { + return o.cfg.DeliverSubject == _EMPTY_ +} + +// Name returns the name of this consumer. +func (o *consumer) String() string { + o.mu.RLock() + n := o.name + o.mu.RUnlock() + return n +} + +func createConsumerName() string { + return getHash(nuid.Next()) +} + +// deleteConsumer will delete the consumer from this stream. +func (mset *stream) deleteConsumer(o *consumer) error { + return o.delete() +} + +func (o *consumer) getStream() *stream { + o.mu.RLock() + mset := o.mset + o.mu.RUnlock() + return mset +} + +func (o *consumer) streamName() string { + o.mu.RLock() + mset := o.mset + o.mu.RUnlock() + if mset != nil { + return mset.name() + } + return _EMPTY_ +} + +// Active indicates if this consumer is still active. +func (o *consumer) isActive() bool { + o.mu.RLock() + active := o.active && o.mset != nil + o.mu.RUnlock() + return active +} + +// hasNoLocalInterest return true if we have no local interest. +func (o *consumer) hasNoLocalInterest() bool { + o.mu.RLock() + interest := o.acc.sl.HasInterest(o.cfg.DeliverSubject) + o.mu.RUnlock() + return !interest +} + +// This is when the underlying stream has been purged. +// sseq is the new first seq for the stream after purge. +// Lock should NOT be held. +func (o *consumer) purge(sseq uint64, slseq uint64, isWider bool) { + // Do not update our state unless we know we are the leader. + if !o.isLeader() { + return + } + // Signals all have been purged for this consumer. + if sseq == 0 && !isWider { + sseq = slseq + 1 + } + + var store StreamStore + if isWider { + o.mu.RLock() + if o.mset != nil { + store = o.mset.store + } + o.mu.RUnlock() + } + + o.mu.Lock() + // Do not go backwards + if o.sseq < sseq { + o.sseq = sseq + } + + if o.asflr < sseq { + o.asflr = sseq - 1 + // We need to remove those no longer relevant from pending. 
+ for seq, p := range o.pending { + if seq <= o.asflr { + if p.Sequence > o.adflr { + o.adflr = p.Sequence + if o.adflr > o.dseq { + o.dseq = o.adflr + } + } + delete(o.pending, seq) + delete(o.rdc, seq) + // rdq handled below. + } + if isWider && store != nil { + // Our filtered subject, which could be all, is wider than the underlying purge. + // We need to check if the pending items left are still valid. + var smv StoreMsg + if _, err := store.LoadMsg(seq, &smv); err == errDeletedMsg || err == ErrStoreMsgNotFound { + if p.Sequence > o.adflr { + o.adflr = p.Sequence + if o.adflr > o.dseq { + o.dseq = o.adflr + } + } + delete(o.pending, seq) + delete(o.rdc, seq) + } + } + } + } + + // This means we can reset everything at this point. + if len(o.pending) == 0 { + o.pending, o.rdc = nil, nil + o.adflr, o.asflr = o.dseq-1, o.sseq-1 + } + + // We need to remove all those being queued for redelivery under o.rdq + if len(o.rdq) > 0 { + rdq := o.rdq + o.rdq = nil + o.rdqi.Empty() + for _, sseq := range rdq { + if sseq >= o.sseq { + o.addToRedeliverQueue(sseq) + } + } + } + // Grab some info in case of error below. + s, acc, mset, name := o.srv, o.acc, o.mset, o.name + o.mu.Unlock() + + if err := o.writeStoreState(); err != nil && s != nil && mset != nil { + s.Warnf("Consumer '%s > %s > %s' error on write store state from purge: %v", acc, mset.name(), name, err) + } +} + +func stopAndClearTimer(tp **time.Timer) { + if *tp == nil { + return + } + // Will get drained in normal course, do not try to + // drain here. + (*tp).Stop() + *tp = nil +} + +// Stop will shutdown the consumer for the associated stream. +func (o *consumer) stop() error { + return o.stopWithFlags(false, false, true, false) +} + +func (o *consumer) deleteWithoutAdvisory() error { + return o.stopWithFlags(true, false, true, false) +} + +// Delete will delete the consumer for the associated stream and send advisories. +func (o *consumer) delete() error { + return o.stopWithFlags(true, false, true, true) +} + +// To test for closed state. +func (o *consumer) isClosed() bool { + o.mu.RLock() + defer o.mu.RUnlock() + return o.closed +} + +func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error { + // If dflag is true determine if we are still assigned. + var isAssigned bool + if dflag { + o.mu.RLock() + acc, stream, consumer := o.acc, o.stream, o.name + isClustered := o.js != nil && o.js.isClustered() + o.mu.RUnlock() + if isClustered { + // Grab jsa to check assignment. + var jsa *jsAccount + if acc != nil { + // Need lock here to avoid data race. + acc.mu.RLock() + jsa = acc.js + acc.mu.RUnlock() + } + if jsa != nil { + isAssigned = jsa.consumerAssigned(stream, consumer) + } + } + } + + o.mu.Lock() + if o.closed { + o.mu.Unlock() + return nil + } + o.closed = true + + // Check if we are the leader and are being deleted (as a node). + if dflag && o.isLeader() { + // If we are clustered and node leader (probable from above), stepdown. + if node := o.node; node != nil && node.Leader() { + node.StepDown() + } + + // dflag does not necessarily mean that the consumer is being deleted, + // just that the consumer node is being removed from this peer, so we + // send delete advisories only if we are no longer assigned at the meta layer, + // or we are not clustered. + if !isAssigned && advisory { + o.sendDeleteAdvisoryLocked() + } + if o.isPullMode() { + // Release any pending. 
+ o.releaseAnyPendingRequests(isAssigned) + } + } + + if o.qch != nil { + close(o.qch) + o.qch = nil + } + + a := o.acc + store := o.store + mset := o.mset + o.mset = nil + o.active = false + o.unsubscribe(o.ackSub) + o.unsubscribe(o.reqSub) + o.unsubscribe(o.fcSub) + o.ackSub = nil + o.reqSub = nil + o.fcSub = nil + if o.infoSub != nil { + o.srv.sysUnsubscribe(o.infoSub) + o.infoSub = nil + } + c := o.client + o.client = nil + sysc := o.sysc + o.sysc = nil + o.stopAndClearPtmr() + stopAndClearTimer(&o.dtmr) + stopAndClearTimer(&o.gwdtmr) + delivery := o.cfg.DeliverSubject + o.waiting = nil + // Break us out of the readLoop. + if doSignal { + o.signalNewMessages() + } + n := o.node + qgroup := o.cfg.DeliverGroup + o.ackMsgs.unregister() + if o.nextMsgReqs != nil { + o.nextMsgReqs.unregister() + } + + // For cleaning up the node assignment. + var ca *consumerAssignment + if dflag { + ca = o.ca + } + js := o.js + o.mu.Unlock() + + if c != nil { + c.closeConnection(ClientClosed) + } + if sysc != nil { + sysc.closeConnection(ClientClosed) + } + + if delivery != _EMPTY_ { + a.sl.clearNotification(delivery, qgroup, o.inch) + } + + var rp RetentionPolicy + if mset != nil { + mset.mu.Lock() + mset.removeConsumer(o) + // No need for cfgMu's lock since mset.mu.Lock supersedes it. + rp = mset.cfg.Retention + mset.mu.Unlock() + } + + // Cleanup messages that lost interest. + if dflag && rp == InterestPolicy { + o.cleanupNoInterestMessages(mset, true) + } + + // Cluster cleanup. + if n != nil { + if dflag { + n.Delete() + } else { + n.Stop() + } + } + + if ca != nil { + js.mu.Lock() + if ca.Group != nil { + ca.Group.node = nil + } + js.mu.Unlock() + } + + // Clean up our store. + var err error + if store != nil { + if dflag { + if sdflag { + err = store.StreamDelete() + } else { + err = store.Delete() + } + } else { + err = store.Stop() + } + } + + return err +} + +// We need to optionally remove all messages since we use interest-based retention. +// We will do this consistently on all replicas. Note that if in clustered mode the non-leader +// consumers will need to restore state first. +// ignoreInterest marks whether the consumer should be ignored when determining interest. +// No lock held on entry. +func (o *consumer) cleanupNoInterestMessages(mset *stream, ignoreInterest bool) { + o.mu.Lock() + if !o.isLeader() { + o.readStoredState(0) + } + start := o.asflr + o.mu.Unlock() + + // Make sure we start at worst with first sequence in the stream. + state := mset.state() + if start < state.FirstSeq { + start = state.FirstSeq + } + stop := state.LastSeq + + // Consumer's interests are ignored by default. If we should not ignore interest, unset. + co := o + if !ignoreInterest { + co = nil + } + + var rmseqs []uint64 + mset.mu.RLock() + + // If over this amount of messages to check, defer to checkInterestState() which + // will do the right thing since we are now removed. + // TODO(dlc) - Better way? + const bailThresh = 100_000 + + // Check if we would be spending too much time here and defer to a separate go routine. + if len(mset.consumers) == 0 { + mset.mu.RUnlock() + mset.mu.Lock() + defer mset.mu.Unlock() + mset.store.Purge() + var state StreamState + mset.store.FastState(&state) + mset.lseq = state.LastSeq + // Also make sure we clear any pending acks. 
+ mset.clearAllPreAcksBelowFloor(state.FirstSeq) + return + } else if stop-start > bailThresh { + mset.mu.RUnlock() + go mset.checkInterestState() + return + } + + mset.mu.RUnlock() + mset.mu.Lock() + for seq := start; seq <= stop; seq++ { + if mset.noInterest(seq, co) { + rmseqs = append(rmseqs, seq) + } + } + mset.mu.Unlock() + + // These can be removed. + for _, seq := range rmseqs { + mset.store.RemoveMsg(seq) + } +} + +// Check that we do not form a cycle by delivering to a delivery subject +// that is part of the interest group. +func deliveryFormsCycle(cfg *StreamConfig, deliverySubject string) bool { + for _, subject := range cfg.Subjects { + if subjectIsSubsetMatch(deliverySubject, subject) { + return true + } + } + return false +} + +// switchToEphemeral is called on startup when recovering ephemerals. +func (o *consumer) switchToEphemeral() { + o.mu.Lock() + o.cfg.Durable = _EMPTY_ + store, ok := o.store.(*consumerFileStore) + interest := o.acc.sl.HasInterest(o.cfg.DeliverSubject) + // Setup dthresh. + o.updateInactiveThreshold(&o.cfg) + o.updatePauseState(&o.cfg) + o.mu.Unlock() + + // Update interest + o.updateDeliveryInterest(interest) + // Write out new config + if ok { + store.updateConfig(o.cfg) + } +} + +// RequestNextMsgSubject returns the subject to request the next message when in pull or worker mode. +// Returns empty otherwise. +func (o *consumer) requestNextMsgSubject() string { + return o.nextMsgSubj +} + +func (o *consumer) decStreamPending(sseq uint64, subj string) { + o.mu.Lock() + + // Check if this message was pending. + p, wasPending := o.pending[sseq] + var rdc uint64 = 1 + if o.rdc != nil { + rdc = o.rdc[sseq] + } + + // Update our cached num pending only if we think deliverMsg has not done so. + // Either we have not reached the message yet, or we've hit the race condition + // when there is contention at the beginning of the stream. In which case we can + // only decrement if the ack floor is still low enough to be able to detect it. + if sseq > o.asflr && (sseq >= o.sseq || !wasPending) && o.isFilteredMatch(subj) { + o.npc-- + } + + o.mu.Unlock() + + // If it was pending process it like an ack. + if wasPending { + // We could have the lock for the stream so do this in a go routine. + // TODO(dlc) - We should do this with ipq vs naked go routines. + go o.processTerm(sseq, p.Sequence, rdc, ackTermUnackedLimitsReason, _EMPTY_) + } +} + +func (o *consumer) account() *Account { + o.mu.RLock() + a := o.acc + o.mu.RUnlock() + return a +} + +// Creates a sublist for consumer. +// All subjects share the same callback. +func (o *consumer) signalSubs() []*subscription { + o.mu.Lock() + defer o.mu.Unlock() + + if o.sigSubs != nil { + return o.sigSubs + } + + subs := []*subscription{} + if o.subjf == nil { + subs = append(subs, &subscription{subject: []byte(fwcs), icb: o.processStreamSignal}) + o.sigSubs = subs + return subs + } + + for _, filter := range o.subjf { + subs = append(subs, &subscription{subject: []byte(filter.subject), icb: o.processStreamSignal}) + } + o.sigSubs = subs + return subs +} + +// This is what will be called when our parent stream wants to kick us regarding a new message. +// We know that this subject matches us by how the parent handles registering us with the signaling sublist, +// but we must check if we are leader. +// We do need the sequence of the message however and we use the msg as the encoded seq. 
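+// The sequence is expected to be encoded as an 8-byte little-endian value in the message payload, decoded below with binary.LittleEndian.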
+func (o *consumer) processStreamSignal(_ *subscription, _ *client, _ *Account, subject, _ string, seqb []byte) { + // We can get called here now when not leader, so bail fast + // and without acquiring any locks. + if !o.leader.Load() { + return + } + o.mu.Lock() + defer o.mu.Unlock() + if o.mset == nil { + return + } + + var le = binary.LittleEndian + seq := le.Uint64(seqb) + + if seq > o.npf { + o.npc++ + } + if seq < o.sseq { + return + } + if o.isPushMode() && o.active || o.isPullMode() && !o.waiting.isEmpty() { + o.signalNewMessages() + } +} + +// Used to compare if two multiple filtered subject lists are equal. +func subjectSliceEqual(slice1 []string, slice2 []string) bool { + if len(slice1) != len(slice2) { + return false + } + set2 := make(map[string]struct{}, len(slice2)) + for _, val := range slice2 { + set2[val] = struct{}{} + } + for _, val := range slice1 { + if _, ok := set2[val]; !ok { + return false + } + } + return true +} + +// Utility for simpler if conditions in Consumer config checks. +// In future iteration, we can immediately create `o.subjf` and +// use it to validate things. +func gatherSubjectFilters(filter string, filters []string) []string { + if filter != _EMPTY_ { + filters = append(filters, filter) + } + // list of filters should never contain non-empty filter. + return filters +} + +// shouldStartMonitor will return true if we should start a monitor +// goroutine or will return false if one is already running. +func (o *consumer) shouldStartMonitor() bool { + o.mu.Lock() + defer o.mu.Unlock() + + if o.inMonitor { + return false + } + o.monitorWg.Add(1) + o.inMonitor = true + return true +} + +// Clear the monitor running state. The monitor goroutine should +// call this in a defer to clean up on exit. +func (o *consumer) clearMonitorRunning() { + o.mu.Lock() + defer o.mu.Unlock() + + if o.inMonitor { + o.monitorWg.Done() + o.inMonitor = false + } +} + +// Test whether we are in the monitor routine. +func (o *consumer) isMonitorRunning() bool { + o.mu.RLock() + defer o.mu.RUnlock() + return o.inMonitor +} + +// If we detect that our ackfloor is higher than the stream's last sequence, return this error. +var errAckFloorHigherThanLastSeq = errors.New("consumer ack floor is higher than streams last sequence") +var errAckFloorInvalid = errors.New("consumer ack floor is invalid") + +// If we are a consumer of an interest or workqueue policy stream, process that state and make sure consistent. +func (o *consumer) checkStateForInterestStream(ss *StreamState) error { + o.mu.RLock() + // See if we need to process this update if our parent stream is not a limits policy stream. + mset := o.mset + shouldProcessState := mset != nil && o.retention != LimitsPolicy + if o.closed || !shouldProcessState || o.store == nil || ss == nil { + o.mu.RUnlock() + return nil + } + store := mset.store + state, err := o.store.State() + + filters, subjf, filter := o.filters, o.subjf, _EMPTY_ + var wc bool + if filters == nil && subjf != nil { + filter, wc = subjf[0].subject, subjf[0].hasWildcard + } + chkfloor := o.chkflr + o.mu.RUnlock() + + if err != nil { + return err + } + + asflr := state.AckFloor.Stream + // Protect ourselves against rolling backwards. + if asflr&(1<<63) != 0 { + return errAckFloorInvalid + } + + // Check if the underlying stream's last sequence is less than our floor. + // This can happen if the stream has been reset and has not caught up yet. 
+ if asflr > ss.LastSeq { + return errAckFloorHigherThanLastSeq + } + + var smv StoreMsg + var seq, nseq uint64 + // Start at first stream seq or a previous check floor, whichever is higher. + // Note this will really help for interest retention, with WQ the loadNextMsg + // gets us a long way already since it will skip deleted msgs not for our filter. + fseq := ss.FirstSeq + if chkfloor > fseq { + fseq = chkfloor + } + + var retryAsflr uint64 + for seq = fseq; asflr > 0 && seq <= asflr; seq++ { + if filters != nil { + _, nseq, err = store.LoadNextMsgMulti(filters, seq, &smv) + } else { + _, nseq, err = store.LoadNextMsg(filter, wc, seq, &smv) + } + // if we advanced sequence update our seq. This can be on no error and EOF. + if nseq > seq { + seq = nseq + } + // Only ack though if no error and seq <= ack floor. + if err == nil && seq <= asflr { + didRemove := mset.ackMsg(o, seq) + // Removing the message could fail. For example if clustered since we need to propose it. + // Overwrite retry floor (only the first time) to allow us to check next time if the removal was successful. + if didRemove && retryAsflr == 0 { + retryAsflr = seq + } + } + } + // If retry floor was not overwritten, set to ack floor+1, we don't need to account for any retries below it. + if retryAsflr == 0 { + retryAsflr = asflr + 1 + } + + o.mu.Lock() + // Update our check floor. + // Check floor must never be greater than ack floor+1, otherwise subsequent calls to this function would skip work. + if retryAsflr > o.chkflr { + o.chkflr = retryAsflr + } + // See if we need to process this update if our parent stream is not a limits policy stream. + state, _ = o.store.State() + o.mu.Unlock() + + // If we have pending, we will need to walk through to delivered in case we missed any of those acks as well. + if state != nil && len(state.Pending) > 0 && state.AckFloor.Stream > 0 { + for seq := state.AckFloor.Stream + 1; seq <= state.Delivered.Stream; seq++ { + if _, ok := state.Pending[seq]; !ok { + // Want to call needAck since it is filter aware. + if o.needAck(seq, _EMPTY_) { + mset.ackMsg(o, seq) + } + } + } + } + return nil +} + +func (o *consumer) resetPtmr(delay time.Duration) { + if o.ptmr == nil { + o.ptmr = time.AfterFunc(delay, o.checkPending) + } else { + o.ptmr.Reset(delay) + } + o.ptmrEnd = time.Now().Add(delay) +} + +func (o *consumer) stopAndClearPtmr() { + stopAndClearTimer(&o.ptmr) + o.ptmrEnd = time.Time{} +} diff --git a/server/consumer.go b/server/consumer.go index 6ce4678b126..0e2e5baa698 100644 --- a/server/consumer.go +++ b/server/consumer.go @@ -436,7 +436,7 @@ type consumer struct { rdc map[uint64]uint64 replies map[uint64]string maxdc uint64 - waiting *waitQueue + waiting WaitQueue cfg ConsumerConfig ici *ConsumerInfo store ConsumerStore @@ -1052,7 +1052,7 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri // Create our request waiting queue. if o.isPullMode() { - o.waiting = newWaitQueue(config.MaxWaiting) + o.waiting = NewWaitQueue(config.MaxWaiting, o.name) // Create our internal queue for next msg requests. o.nextMsgReqs = newIPQueue[*nextMsgReq](s, fmt.Sprintf("[ACC:%s] consumer '%s' on stream '%s' pull requests", accName, o.name, cfg.Name)) } @@ -1538,7 +1538,8 @@ func (o *consumer) setLeader(isLeader bool) { } // Reset waiting if we are in pull mode. 
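+		// The queue now comes from the NewWaitQueue factory (see
+		// consumer_wait_queue.go), which returns a DRR-based queue when the name
+		// contains "fast" or "slow" (inference streams) and a FIFO queue otherwise.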
if o.isPullMode() { - o.waiting = newWaitQueue(o.cfg.MaxWaiting) + o.waiting = NewWaitQueue(o.cfg.MaxWaiting, o.name) + o.nextMsgReqs.drain() } else if o.srv.gateway.enabled { stopAndClearTimer(&o.gwdtmr) @@ -1836,7 +1837,7 @@ func (o *consumer) deleteNotActive() { } } else { // Pull mode. - elapsed := time.Since(o.waiting.last) + elapsed := time.Since(o.waiting.Last()) if elapsed <= o.cfg.InactiveThreshold { // These need to keep firing so reset but use delta. if o.dtmr != nil { @@ -2602,7 +2603,7 @@ func (o *consumer) checkPendingRequests() { // Should be called only by the leader being deleted or stopped. // Lock should be held. func (o *consumer) releaseAnyPendingRequests(isAssigned bool) { - if o.mset == nil || o.outq == nil || o.waiting.len() == 0 { + if o.mset == nil || o.outq == nil || o.waiting.Len() == 0 { return } var hdr []byte @@ -2611,7 +2612,7 @@ func (o *consumer) releaseAnyPendingRequests(isAssigned bool) { } wq := o.waiting - for wr := wq.head; wr != nil; { + for wr := wq.Peek(); wr != nil; { if hdr != nil { o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) } @@ -2984,7 +2985,7 @@ func (o *consumer) infoWithSnapAndReply(snap bool, reply string) *ConsumerInfo { // If we are a pull mode consumer, report on number of waiting requests. if o.isPullMode() { o.processWaiting(false) - info.NumWaiting = o.waiting.len() + info.NumWaiting = o.waiting.Len() } // If we were asked to snapshot do so here. if snap { @@ -3361,183 +3362,21 @@ func nextReqFromMsg(msg []byte) (time.Time, int, int, bool, time.Duration, time. return time.Time{}, 1, 0, false, 0, time.Time{}, nil, nil } -// Represents a request that is on the internal waiting queue -type waitingRequest struct { - next *waitingRequest - acc *Account - interest string - reply string - n int // For batching - d int // num delivered - b int // For max bytes tracking - expires time.Time - received time.Time - hb time.Duration - hbt time.Time - noWait bool - priorityGroup *PriorityGroup -} - // sync.Pool for waiting requests. var wrPool = sync.Pool{ New: func() any { - return new(waitingRequest) + return new(WaitingRequest) }, } -// Recycle this request. This request can not be accessed after this call. -func (wr *waitingRequest) recycleIfDone() bool { - if wr != nil && wr.n <= 0 { - wr.recycle() - return true - } - return false -} - -// Force a recycle. -func (wr *waitingRequest) recycle() { - if wr != nil { - wr.next, wr.acc, wr.interest, wr.reply = nil, nil, _EMPTY_, _EMPTY_ - wrPool.Put(wr) - } -} - -// waiting queue for requests that are waiting for new messages to arrive. -type waitQueue struct { - n, max int - last time.Time - head *waitingRequest - tail *waitingRequest -} - -// Create a new ring buffer with at most max items. -func newWaitQueue(max int) *waitQueue { - return &waitQueue{max: max} -} - -var ( - errWaitQueueFull = errors.New("wait queue is full") - errWaitQueueNil = errors.New("wait queue is nil") -) - -// Adds in a new request. -func (wq *waitQueue) add(wr *waitingRequest) error { - if wq == nil { - return errWaitQueueNil - } - if wq.isFull() { - return errWaitQueueFull - } - if wq.head == nil { - wq.head = wr - } else { - wq.tail.next = wr - } - // Always set tail. - wq.tail = wr - // Make sure nil - wr.next = nil - - // Track last active via when we receive a request. 
- wq.last = wr.received - wq.n++ - return nil -} - -func (wq *waitQueue) isFull() bool { - if wq == nil { - return false - } - return wq.n == wq.max -} - -func (wq *waitQueue) isEmpty() bool { - if wq == nil { - return true - } - return wq.n == 0 -} - -func (wq *waitQueue) len() int { - if wq == nil { - return 0 - } - return wq.n -} - -// Peek will return the next request waiting or nil if empty. -func (wq *waitQueue) peek() *waitingRequest { - if wq == nil { - return nil - } - return wq.head -} - -func (wq *waitQueue) cycle() { - wr := wq.peek() - if wr != nil { - // Always remove current now on a pop, and move to end if still valid. - // If we were the only one don't need to remove since this can be a no-op. - wq.removeCurrent() - wq.add(wr) - } -} - -// pop will return the next request and move the read cursor. -// This will now place a request that still has pending items at the ends of the list. -func (wq *waitQueue) pop() *waitingRequest { - wr := wq.peek() - if wr != nil { - wr.d++ - wr.n-- - // Always remove current now on a pop, and move to end if still valid. - // If we were the only one don't need to remove since this can be a no-op. - if wr.n > 0 && wq.n > 1 { - wq.removeCurrent() - wq.add(wr) - } else if wr.n <= 0 { - wq.removeCurrent() - } - } - return wr -} - -// Removes the current read pointer (head FIFO) entry. -func (wq *waitQueue) removeCurrent() { - wq.remove(nil, wq.head) -} - -// Remove the wr element from the wait queue. -func (wq *waitQueue) remove(pre, wr *waitingRequest) { - if wr == nil { - return - } - if pre != nil { - pre.next = wr.next - } else if wr == wq.head { - // We are removing head here. - wq.head = wr.next - } - // Check if wr was our tail. - if wr == wq.tail { - // Check if we need to assign to pre. - if wr.next == nil { - wq.tail = pre - } else { - wq.tail = wr.next - } - } - wq.n-- -} - // Return the map of pending requests keyed by the reply subject. // No-op if push consumer or invalid etc. -func (o *consumer) pendingRequests() map[string]*waitingRequest { +func (o *consumer) pendingRequests() map[string]*WaitingRequest { if o.waiting == nil { return nil } - wq, m := o.waiting, make(map[string]*waitingRequest) - for wr := wq.head; wr != nil; wr = wr.next { + wq, m := o.waiting, make(map[string]*WaitingRequest) + for wr := wq.Peek(); wr != nil; wr = wr.next { m[wr.reply] = wr } @@ -3562,8 +3401,8 @@ func (o *consumer) setPinnedTimer(priorityGroup string) { // Return next waiting request. This will check for expirations but not noWait or interest. // That will be handled by processWaiting. // Lock should be held. -func (o *consumer) nextWaiting(sz int) *waitingRequest { - if o.waiting == nil || o.waiting.isEmpty() { +func (o *consumer) nextWaiting(sz int) *WaitingRequest { + if o.waiting == nil || o.waiting.IsEmpty() { return nil } @@ -3575,8 +3414,8 @@ func (o *consumer) nextWaiting(sz int) *waitingRequest { priorityGroup = o.cfg.PriorityGroups[0] } - lastRequest := o.waiting.tail - for wr := o.waiting.peek(); !o.waiting.isEmpty(); wr = o.waiting.peek() { + lastRequest := o.waiting.Tail() + for wr := o.waiting.Peek(); !o.waiting.IsEmpty(); wr = o.waiting.Peek() { if wr == nil { break } @@ -3595,7 +3434,7 @@ func (o *consumer) nextWaiting(sz int) *waitingRequest { hdr := fmt.Appendf(nil, maxBytesT, JSPullRequestPendingMsgs, wr.n, JSPullRequestPendingBytes, wr.b) o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) // Remove the current one, no longer valid due to max bytes limit. 
- o.waiting.removeCurrent() + o.waiting.RemoveCurrent() if o.node != nil { o.removeClusterPendingRequest(wr.reply) } @@ -3615,7 +3454,7 @@ func (o *consumer) nextWaiting(sz int) *waitingRequest { // There is pin id set, but not a matching one. Send a notification to the client and remove the request. // Probably this is the old pin id. o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, []byte(JSPullRequestWrongPinID), nil, nil, 0)) - o.waiting.removeCurrent() + o.waiting.RemoveCurrent() if o.node != nil { o.removeClusterPendingRequest(wr.reply) } @@ -3627,7 +3466,7 @@ func (o *consumer) nextWaiting(sz int) *waitingRequest { if wr.priorityGroup != nil && wr.priorityGroup.Id == o.currentPinId { // If we have a match, we do nothing here and will deliver the message later down the code path. } else if wr.priorityGroup.Id == _EMPTY_ { - o.waiting.cycle() + o.waiting.Cycle() if wr == lastRequest { return nil } @@ -3635,7 +3474,7 @@ func (o *consumer) nextWaiting(sz int) *waitingRequest { } else { // There is pin id set, but not a matching one. Send a notification to the client and remove the request. o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, []byte(JSPullRequestWrongPinID), nil, nil, 0)) - o.waiting.removeCurrent() + o.waiting.RemoveCurrent() if o.node != nil { o.removeClusterPendingRequest(wr.reply) } @@ -3649,7 +3488,7 @@ func (o *consumer) nextWaiting(sz int) *waitingRequest { // We need to check o.npc+1, because before calling nextWaiting, we do o.npc-- (wr.priorityGroup.MinPending > 0 && wr.priorityGroup.MinPending > o.npc+1 || wr.priorityGroup.MinAckPending > 0 && wr.priorityGroup.MinAckPending > int64(len(o.pending))) { - o.waiting.cycle() + o.waiting.Cycle() // We're done cycling through the requests. if wr == lastRequest { return nil @@ -3661,23 +3500,23 @@ func (o *consumer) nextWaiting(sz int) *waitingRequest { if needNewPin { o.sendPinnedAdvisoryLocked(priorityGroup) } - return o.waiting.pop() + return o.waiting.Pop() } else if time.Since(wr.received) < defaultGatewayRecentSubExpiration && (o.srv.leafNodeEnabled || o.srv.gateway.enabled) { if needNewPin { o.sendPinnedAdvisoryLocked(priorityGroup) } - return o.waiting.pop() + return o.waiting.Pop() } else if o.srv.gateway.enabled && o.srv.hasGatewayInterest(wr.acc.Name, wr.interest) { if needNewPin { o.sendPinnedAdvisoryLocked(priorityGroup) } - return o.waiting.pop() + return o.waiting.Pop() } } else { // We do check for expiration in `processWaiting`, but it is possible to hit the expiry here, and not there. hdr := fmt.Appendf(nil, "NATS/1.0 408 Request Timeout\r\n%s: %d\r\n%s: %d\r\n\r\n", JSPullRequestPendingMsgs, wr.n, JSPullRequestPendingBytes, wr.b) o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) - o.waiting.removeCurrent() + o.waiting.RemoveCurrent() if o.node != nil { o.removeClusterPendingRequest(wr.reply) } @@ -3691,7 +3530,7 @@ func (o *consumer) nextWaiting(sz int) *waitingRequest { o.outq.send(newJSPubMsg(wr.reply, _EMPTY_, _EMPTY_, hdr, nil, nil, 0)) } // Remove the current one, no longer valid. - o.waiting.removeCurrent() + o.waiting.RemoveCurrent() if o.node != nil { o.removeClusterPendingRequest(wr.reply) } @@ -3832,7 +3671,7 @@ func (o *consumer) processNextMsgRequest(reply string, msg []byte) { } // If we have the max number of requests already pending try to expire. - if o.waiting.isFull() { + if o.waiting.IsFull() { // Try to expire some of the requests. // We do not want to push too hard here so at maximum process once per sec. 
if time.Since(o.lwqic) > time.Second { @@ -3846,14 +3685,14 @@ func (o *consumer) processNextMsgRequest(reply string, msg []byte) { // If no pending at all, decide what to do with request. // If no expires was set then fail. if msgsPending == 0 && expires.IsZero() { - o.waiting.last = time.Now() + o.waiting.SetLast(time.Now()) sendErr(404, "No Messages") return } if msgsPending > 0 { _, _, batchPending, _ := o.processWaiting(false) if msgsPending < uint64(batchPending) { - o.waiting.last = time.Now() + o.waiting.SetLast(time.Now()) sendErr(408, "Requests Pending") return } @@ -3866,12 +3705,13 @@ func (o *consumer) processNextMsgRequest(reply string, msg []byte) { acc, interest := trackDownAccountAndInterest(o.acc, reply) // Create a waiting request. - wr := wrPool.Get().(*waitingRequest) + wr := wrPool.Get().(*WaitingRequest) wr.acc, wr.interest, wr.reply, wr.n, wr.d, wr.noWait, wr.expires, wr.hb, wr.hbt, wr.priorityGroup = acc, interest, reply, batchSize, 0, noWait, expires, hb, hbt, priorityGroup wr.b = maxBytes wr.received = time.Now() - if err := o.waiting.add(wr); err != nil { + // fmt.Println("Adding waiting request for consumer ", o.name, "on stream ", o.stream, "with reply ", reply, "and interest ", interest, "and claim ", acc.claimJWT) + if err := o.waiting.Add(wr); err != nil { sendErr(409, "Exceeded MaxWaiting") wr.recycle() return @@ -4106,7 +3946,7 @@ func (o *consumer) getNextMsg() (*jsPubMsg, uint64, error) { // Will also do any heartbeats and return the next expiration or HB interval. func (o *consumer) processWaiting(eos bool) (int, int, int, time.Time) { var fexp time.Time - if o.srv == nil || o.waiting.isEmpty() { + if o.srv == nil || o.waiting.IsEmpty() { return 0, 0, 0, fexp } // Mark our last check time. @@ -4116,19 +3956,20 @@ func (o *consumer) processWaiting(eos bool) (int, int, int, time.Time) { s, now := o.srv, time.Now() wq := o.waiting - remove := func(pre, wr *waitingRequest) *waitingRequest { + remove := func(pre, wr *WaitingRequest) *WaitingRequest { expired++ if o.node != nil { o.removeClusterPendingRequest(wr.reply) } next := wr.next - wq.remove(pre, wr) + wq.Remove(pre, wr) wr.recycle() return next } - var pre *waitingRequest - for wr := wq.head; wr != nil; { + var pre *WaitingRequest + for wr := wq.Peek(); wr != nil; { + // fmt.Println("Processing waiting request here", wr.instanceID(), wr.interest) // Check expiration. if (eos && wr.noWait && wr.d > 0) || (!wr.expires.IsZero() && now.After(wr.expires)) { hdr := fmt.Appendf(nil, "NATS/1.0 408 Request Timeout\r\n%s: %d\r\n%s: %d\r\n\r\n", JSPullRequestPendingMsgs, wr.n, JSPullRequestPendingBytes, wr.b) @@ -4175,13 +4016,13 @@ func (o *consumer) processWaiting(eos bool) (int, int, int, time.Time) { wr = wr.next } - return expired, wq.len(), brp, fexp + return expired, o.waiting.Len(), brp, fexp } // Will check to make sure those waiting still have registered interest. func (o *consumer) checkWaitingForInterest() bool { o.processWaiting(true) - return o.waiting.len() > 0 + return o.waiting.Len() > 0 } // Lock should be held. @@ -4376,7 +4217,7 @@ func (o *consumer) suppressDeletion() { o.dtmr.Reset(o.dthresh) } else if o.isPullMode() && o.waiting != nil { // Pull mode always has timer running, just update last on waiting queue. 
- o.waiting.last = time.Now() + o.waiting.SetLast(time.Now()) } } @@ -4460,7 +4301,7 @@ func (o *consumer) loopAndGatherMsgs(qch chan struct{}) { if !o.active || (o.maxpb > 0 && o.pbytes > o.maxpb) { goto waitForMsgs } - } else if o.waiting.isEmpty() { + } else if o.waiting.IsEmpty() { // If we are in pull mode and no one is waiting already break and wait. goto waitForMsgs } @@ -4829,7 +4670,7 @@ func (o *consumer) deliverMsg(dsubj, ackReply string, pmsg *jsPubMsg, dc uint64, // If pull mode and we have inactivity threshold, signaled by dthresh, update last activity. if o.isPullMode() && o.dthresh > 0 { - o.waiting.last = time.Now() + o.waiting.SetLast(time.Now()) } // If we are ack none and mset is interest only we should make sure stream removes interest. @@ -4952,7 +4793,7 @@ func (o *consumer) trackPending(sseq, dseq uint64) { // lock should be held. func (o *consumer) creditWaitingRequest(reply string) { wq := o.waiting - for wr := wq.head; wr != nil; wr = wr.next { + for wr := wq.Peek(); wr != nil; wr = wr.next { if wr.reply == reply { wr.n++ wr.d-- @@ -4989,7 +4830,7 @@ func (o *consumer) didNotDeliver(seq uint64, subj string) { // we know it was not delivered if !o.onRedeliverQueue(seq) { o.addToRedeliverQueue(seq) - if !o.waiting.isEmpty() { + if !o.waiting.IsEmpty() { o.signalNewMessages() } } @@ -5917,7 +5758,7 @@ func (o *consumer) processStreamSignal(_ *subscription, _ *client, _ *Account, s if seq < o.sseq { return } - if o.isPushMode() && o.active || o.isPullMode() && !o.waiting.isEmpty() { + if o.isPushMode() && o.active || o.isPullMode() && !o.waiting.IsEmpty() { o.signalNewMessages() } } diff --git a/server/consumer_wait_queue.go b/server/consumer_wait_queue.go new file mode 100644 index 00000000000..6f77ff5ae1d --- /dev/null +++ b/server/consumer_wait_queue.go @@ -0,0 +1,854 @@ +package server + +import ( + "errors" + "fmt" + "math" + "sort" + "strings" + "sync" + "time" +) + +// ---------------------------------------------------------- +// Shared Data Structures +// ---------------------------------------------------------- + +// WaitingRequest represents a request in the queue +type WaitingRequest struct { + next *WaitingRequest + acc *Account + interest string + reply string + n int // For batching + d int // num delivered + b int // For max bytes tracking + expires time.Time + received time.Time + hb time.Duration + hbt time.Time + noWait bool + priorityGroup *PriorityGroup +} + +// recycleIfDone recycles this request if n <= 0 +func (wr *WaitingRequest) recycleIfDone() bool { + if wr != nil && wr.n <= 0 { + wr.recycle() + return true + } + return false +} + +// recycle forces a recycle of this request +func (wr *WaitingRequest) recycle() { + if wr != nil { + wr.next, wr.acc, wr.interest, wr.reply = nil, nil, _EMPTY_, _EMPTY_ + wrPool.Put(wr) + } +} + +// instanceID extracts a worker/instance ID from wr.reply +func (wr *WaitingRequest) instanceID() string { + if wr != nil && wr.reply != "" { + parts := strings.Split(wr.reply, ".") + if len(parts) > 1 { + return parts[1] + } + } + return "" +} + +// ---------------------------------------------------------- +// WaitQueue Interface +// ---------------------------------------------------------- + +type WaitQueue interface { + // Add adds a new request to the queue + Add(wr *WaitingRequest) error + + // Peek returns the next request that would be popped without removing it + Peek() *WaitingRequest + + // Tail returns the last request in the queue + Tail() *WaitingRequest + + // Pop returns and removes the next request 
from the queue + Pop() *WaitingRequest + + // Cycle moves the current head (or flow) to the end if valid + Cycle() + + // IsFull returns true if the queue is at capacity + IsFull() bool + + // IsEmpty returns true if the queue has no items + IsEmpty() bool + + // Len returns the current number of items in the queue + Len() int + + // RemoveCurrent removes the current head request + RemoveCurrent() + + // Remove removes a specific request from the queue + Remove(pre, wr *WaitingRequest) + + // Last returns the last active time + Last() time.Time + + // SetLast sets the last active time + SetLast(t time.Time) + + // LogFlows prints information about each active flow in this DRRWaitQueue. + LogFlows() +} + +// ---------------------------------------------------------- +// BaseWaitQueue with shared fields +// ---------------------------------------------------------- + +type BaseWaitQueue struct { + mu sync.RWMutex + n int + max int + last time.Time + head *WaitingRequest + tail *WaitingRequest +} + +// Common errors +var ( + ErrWaitQueueFull = errors.New("wait queue is full") + ErrWaitQueueNil = errors.New("wait queue is nil") +) + +// ---------------------------------------------------------- +// Redis, Stake Cache, and Global Map +// ---------------------------------------------------------- + +var ( + // globalRedis *redis.Client + globalStakeCache *stakeCache + stakeCacheOnce sync.Once + waitQueueMap = make(map[string]*WaitQueueInfo) + waitQueueMutex sync.RWMutex +) + +// SetGlobalRedis sets up the global Redis client for all wait queues +// func SetGlobalRedis(redisClient *redis.Client) { +// globalRedis = redisClient +// } + +// stakeCache maintains the current account balances and related stats +type stakeCache struct { + sync.RWMutex + stakeByInstanceID map[string]float64 + totalStake float64 + lastUpdate time.Time +} + +// getStakeCache returns the singleton stake cache instance +func getStakeCache() *stakeCache { + stakeCacheOnce.Do(func() { + // if globalRedis == nil { + // panic("global Redis client not initialized") + // } + globalStakeCache = &stakeCache{ + stakeByInstanceID: map[string]float64{ + "test-worker-id": 1.0, + "test-worker-2-id": 2.0, + }, + totalStake: 3.0, + lastUpdate: time.Now(), + } + }) + return globalStakeCache +} + +// ---------------------------------------------------------- +// FIFOWaitQueue +// ---------------------------------------------------------- + +type FIFOWaitQueue struct { + BaseWaitQueue +} + +// NewFIFOWaitQueue creates a new FIFO-based wait queue +func NewFIFOWaitQueue(max int) *FIFOWaitQueue { + return &FIFOWaitQueue{ + BaseWaitQueue: BaseWaitQueue{ + max: max, + }, + } +} + +// Add implements WaitQueue.Add +func (wq *FIFOWaitQueue) Add(wr *WaitingRequest) error { + if wq == nil { + return ErrWaitQueueNil + } + wq.mu.Lock() + defer wq.mu.Unlock() + + if wq.n >= wq.max { + return ErrWaitQueueFull + } + + if wq.head == nil { + wq.head = wr + } else { + wq.tail.next = wr + } + wq.tail = wr + wr.next = nil + wq.last = wr.received + wq.n++ + return nil +} + +// Pop implements WaitQueue.Pop +func (wq *FIFOWaitQueue) Pop() *WaitingRequest { + if wq == nil { + return nil + } + wq.mu.Lock() + defer wq.mu.Unlock() + + wr := wq.head + if wr != nil { + wr.d++ + wr.n-- + // Remove from front + wq.head = wr.next + if wq.head == nil { + wq.tail = nil + } + wq.n-- + + // If it still has n>0, requeue at the tail + if wr.n > 0 { + wr.next = nil + if wq.head == nil { + wq.head = wr + wq.tail = wr + } else { + wq.tail.next = wr + wq.tail = wr + } + wq.n++ + } 
else {
+			wr.next = nil
+		}
+	}
+	return wr
+}
+
+// IsFull implements WaitQueue.IsFull
+func (wq *FIFOWaitQueue) IsFull() bool {
+	if wq == nil {
+		return false
+	}
+	wq.mu.RLock()
+	defer wq.mu.RUnlock()
+	return wq.n == wq.max
+}
+
+// IsEmpty implements WaitQueue.IsEmpty
+func (wq *FIFOWaitQueue) IsEmpty() bool {
+	if wq == nil {
+		return true
+	}
+	wq.mu.RLock()
+	defer wq.mu.RUnlock()
+	return wq.n == 0
+}
+
+// Len implements WaitQueue.Len
+func (wq *FIFOWaitQueue) Len() int {
+	if wq == nil {
+		return 0
+	}
+	wq.mu.RLock()
+	defer wq.mu.RUnlock()
+	return wq.n
+}
+
+// Peek implements WaitQueue.Peek
+func (wq *FIFOWaitQueue) Peek() *WaitingRequest {
+	if wq == nil {
+		return nil
+	}
+	wq.mu.RLock()
+	defer wq.mu.RUnlock()
+	return wq.head
+}
+
+// Tail implements WaitQueue.Tail
+func (wq *FIFOWaitQueue) Tail() *WaitingRequest {
+	if wq == nil {
+		return nil
+	}
+	wq.mu.RLock()
+	defer wq.mu.RUnlock()
+	return wq.tail
+}
+
+// Cycle implements WaitQueue.Cycle.
+// Moves the head request to the tail so other requests get a turn. We relink
+// inline rather than calling Add, which would try to take wq.mu again and
+// deadlock, and would also skew the count and the last-active time.
+func (wq *FIFOWaitQueue) Cycle() {
+	if wq == nil {
+		return
+	}
+	wq.mu.Lock()
+	defer wq.mu.Unlock()
+
+	wr := wq.head
+	// Nothing to do if the queue is empty or holds a single request.
+	if wr == nil || wr.next == nil {
+		return
+	}
+	// Unlink from the front and re-link at the tail. The count is unchanged.
+	wq.head = wr.next
+	wr.next = nil
+	wq.tail.next = wr
+	wq.tail = wr
+}
+
+// RemoveCurrent implements WaitQueue.RemoveCurrent
+func (wq *FIFOWaitQueue) RemoveCurrent() {
+	wq.Remove(nil, wq.Peek())
+}
+
+// Remove implements WaitQueue.Remove
+func (wq *FIFOWaitQueue) Remove(pre, wr *WaitingRequest) {
+	if wq == nil {
+		return
+	}
+	wq.mu.Lock()
+	defer wq.mu.Unlock()
+
+	if wr == nil {
+		return
+	}
+	if pre != nil {
+		pre.next = wr.next
+	} else if wr == wq.head {
+		wq.head = wr.next
+	}
+	if wr == wq.tail {
+		if wr.next == nil {
+			wq.tail = pre
+		} else {
+			wq.tail = wr.next
+		}
+	}
+	if wq.n > 0 {
+		wq.n--
+	}
+	wr.next = nil
+}
+
+// Last implements WaitQueue.Last
+func (wq *FIFOWaitQueue) Last() time.Time {
+	wq.mu.RLock()
+	defer wq.mu.RUnlock()
+	return wq.last
+}
+
+// SetLast implements WaitQueue.SetLast
+func (wq *FIFOWaitQueue) SetLast(t time.Time) {
+	wq.mu.Lock()
+	defer wq.mu.Unlock()
+	wq.last = t
+}
+
+// LogFlows implements WaitQueue.LogFlows
+func (wq *FIFOWaitQueue) LogFlows() {
+	fmt.Println("FIFOWaitQueue has no flows")
+}
+
+// ----------------------------------------------------------
+// DRRWaitQueue (Deficit Round Robin) with Single-Item Pop
+// ----------------------------------------------------------
+
+// costOfRequest returns the "cost" for a given request (assume 1).
+func costOfRequest(_ *WaitingRequest) int {
+	return 1
+}
+
+// flow represents per-worker DRR state
+type flow struct {
+	instanceID string
+	head       *WaitingRequest
+	tail       *WaitingRequest
+	next       *flow // for circular linked-list
+	deficit    int   // DRR deficit counter
+	quantum    int   // DRR quantum (stake-based)
+}
+
+// DRRWaitQueue implements WaitQueue using Deficit Round Robin scheduling
+type DRRWaitQueue struct {
+	BaseWaitQueue
+
+	// Protect DRR-specific fields
+	mu sync.Mutex
+
+	flows           map[string]*flow
+	activeFlows     *flow
+	activeFlowsTail *flow
+	current         *flow
+
+	scache      *stakeCache
+	baseQuantum int
+}
+
+// Ensure DRRWaitQueue implements WaitQueue
+var _ WaitQueue = (*DRRWaitQueue)(nil)
+
+// NewDRRWaitQueue creates a new DRRWaitQueue with the given maximum capacity.
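+// Each pull worker, identified by WaitingRequest.instanceID() parsed from the
+// reply subject, gets its own flow. On each visit a flow's deficit grows by its
+// quantum (derived from the stake cache via quantumForStake) and the head
+// request is served once the deficit covers costOfRequest (currently always 1).
+// Note that Pop returns a single request and then rotates to the next flow, so
+// with a uniform cost of 1 the schedule is effectively round robin across
+// flows; the stake-based quantum only changes the outcome if per-request costs
+// ever exceed 1, and flows whose quantum resolves to 0 are skipped entirely.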
+func NewDRRWaitQueue(max int) *DRRWaitQueue { + return &DRRWaitQueue{ + BaseWaitQueue: BaseWaitQueue{ + max: max, + }, + flows: make(map[string]*flow), + scache: getStakeCache(), + baseQuantum: 1, // quantum = ceil(stake) * baseQuantum + } +} + +// quantumForStake returns int(math.Ceil(stake)). +func (wq *DRRWaitQueue) quantumForStake(stake float64) int { + return int(math.Ceil(stake)) +} + +// activateFlow inserts the flow f into the circular active list +func (wq *DRRWaitQueue) activateFlow(f *flow) { + if f == nil { + return + } + if wq.activeFlows == nil { + // first active flow + wq.activeFlows = f + wq.activeFlowsTail = f + f.next = f // circular + if wq.current == nil { + wq.current = f + } + return + } + // If flow might already be in the list (f.next != nil), skip + if f.next != nil { + return + } + // Insert at tail + f.next = wq.activeFlows + wq.activeFlowsTail.next = f + wq.activeFlowsTail = f +} + +// removeFlowFromActive removes flow f from circular list +func (wq *DRRWaitQueue) removeFlowFromActive(f *flow) { + if f == nil || wq.activeFlows == nil { + return + } + // if single flow in list + if wq.activeFlows == f && wq.activeFlowsTail == f && f.next == f { + wq.activeFlows = nil + wq.activeFlowsTail = nil + if wq.current == f { + wq.current = nil + } + f.next = nil + return + } + // find predecessor + prev := f + for prev.next != f { + prev = prev.next + } + prev.next = f.next + if wq.activeFlows == f { + wq.activeFlows = f.next + } + if wq.activeFlowsTail == f { + wq.activeFlowsTail = prev + } + if wq.current == f { + wq.current = f.next + } + f.next = nil +} + +// Add enqueues wr into the DRR queue +func (wq *DRRWaitQueue) Add(wr *WaitingRequest) error { + if wr == nil { + return ErrWaitQueueNil + } + wq.mu.Lock() + defer wq.mu.Unlock() + + if wq.n >= wq.max { + return ErrWaitQueueFull + } + + instanceID := wr.instanceID() + if instanceID == "" { + instanceID = "unknown" + } + stake := wq.scache.stakeByInstanceID[instanceID] + if stake < 0 { + stake = 0 + } + + f := wq.flows[instanceID] + if f == nil { + // create new flow + f = &flow{ + instanceID: instanceID, + deficit: 0, + quantum: wq.quantumForStake(stake), + } + wq.flows[instanceID] = f + } + + // enqueue into flow + if f.head == nil { + f.head = wr + f.tail = wr + // newly active + wq.activateFlow(f) + } else { + f.tail.next = wr + f.tail = wr + } + + wq.n++ + wq.last = wr.received + return nil +} + +// Peek returns the next request that would be popped (best effort) +func (wq *DRRWaitQueue) Peek() *WaitingRequest { + wq.mu.Lock() + defer wq.mu.Unlock() + + if wq.activeFlows == nil || wq.current == nil { + return nil + } + return wq.current.head +} + +// Tail returns the last request in the queue +// We return BaseWaitQueue.tail for completeness +func (wq *DRRWaitQueue) Tail() *WaitingRequest { + wq.mu.Lock() + defer wq.mu.Unlock() + return wq.BaseWaitQueue.tail +} + +// Pop returns one request from DRR (partial dispatch) +func (wq *DRRWaitQueue) Pop() *WaitingRequest { + wq.mu.Lock() + defer wq.mu.Unlock() + + if wq.activeFlows == nil { + return nil + } + if wq.current == nil { + wq.current = wq.activeFlows + } + + count := wq.countActiveFlows() + if count == 0 { + return nil + } + + original := wq.current + for i := 0; i < count; i++ { + f := wq.current + + // Add quantum to deficit + f.deficit += f.quantum + + // If we can afford the head + if f.head != nil && costOfRequest(f.head) <= f.deficit { + wr := f.head + f.head = wr.next + if f.head == nil { + f.tail = nil + } + f.deficit -= costOfRequest(wr) + wq.n-- 
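+			// Detach the served request from the flow's list; any remaining
+			// deficit stays on the flow and carries over to its next visit.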
+ wr.next = nil + + // if flow is empty now, remove it + if f.head == nil { + wq.removeFlowFromActive(f) + } + + // Rotate to the NEXT flow so we don't keep returning the same flow + wq.current = f.next + // Return exactly one item + return wr + } + + // rotate to next flow + wq.current = f.next + if wq.current == original { + break + } + } + // none found + return nil +} + +// Cycle moves the current pointer to the next flow +func (wq *DRRWaitQueue) Cycle() { + wq.mu.Lock() + defer wq.mu.Unlock() + + if wq.activeFlows == nil || wq.current == nil { + return + } + wq.current = wq.current.next +} + +// RemoveCurrent removes the request at current flow's head +func (wq *DRRWaitQueue) RemoveCurrent() { + wq.mu.Lock() + defer wq.mu.Unlock() + + if wq.current == nil || wq.current.head == nil { + return + } + wr := wq.current.head + wq.removeRequest(wq.current, nil, wr) +} + +// Remove removes a specific request from the queue +func (wq *DRRWaitQueue) Remove(pre, wr *WaitingRequest) { + if wr == nil { + return + } + wq.mu.Lock() + defer wq.mu.Unlock() + + instanceID := wr.instanceID() + if instanceID == "" { + instanceID = "unknown" + } + f := wq.flows[instanceID] + if f == nil { + return + } + wq.removeRequest(f, pre, wr) +} + +// removeRequest unlinks wr from flow f +func (wq *DRRWaitQueue) removeRequest(f *flow, pre, wr *WaitingRequest) { + if f == nil || wr == nil { + return + } + if f.head == nil { + return + } + + if pre == nil { + // remove head + if wr == f.head { + f.head = f.head.next + if f.head == nil { + f.tail = nil + } + wq.n-- + wr.next = nil + } + } else { + // remove mid or tail + if pre.next == wr { + pre.next = wr.next + if wr == f.tail { + f.tail = pre + } + wq.n-- + wr.next = nil + } + } + + // if flow empty, remove it + if f.head == nil { + wq.removeFlowFromActive(f) + } +} + +// IsFull returns true if DRR queue is at capacity +func (wq *DRRWaitQueue) IsFull() bool { + wq.mu.Lock() + defer wq.mu.Unlock() + return wq.n >= wq.max +} + +// IsEmpty returns true if DRR queue has no items +func (wq *DRRWaitQueue) IsEmpty() bool { + wq.mu.Lock() + defer wq.mu.Unlock() + return wq.n == 0 +} + +// Len returns the current number of items in DRR +func (wq *DRRWaitQueue) Len() int { + wq.mu.Lock() + defer wq.mu.Unlock() + return wq.n +} + +// Last returns the last active time +func (wq *DRRWaitQueue) Last() time.Time { + wq.mu.Lock() + defer wq.mu.Unlock() + return wq.last +} + +// SetLast sets the last active time +func (wq *DRRWaitQueue) SetLast(t time.Time) { + wq.mu.Lock() + defer wq.mu.Unlock() + wq.last = t +} + +// countActiveFlows returns how many flows are in the active list (circular) +func (wq *DRRWaitQueue) countActiveFlows() int { + if wq.activeFlows == nil { + return 0 + } + count := 0 + start := wq.activeFlows + f := start + for { + count++ + f = f.next + if f == start { + break + } + } + return count +} + +// LogFlows prints information about each active flow in this DRRWaitQueue. +func (wq *DRRWaitQueue) LogFlows() { + wq.mu.Lock() + defer wq.mu.Unlock() + + // Optionally, log overall queue info: + fmt.Printf("==== DRRWaitQueue Debug ====\n") + fmt.Printf("Total Requests: %d / Capacity: %d\n", wq.n, wq.max) + if wq.activeFlows == nil { + fmt.Println("No active flows. 
(Either empty or flows have no requests.)") + return + } + + // Walk the circular list of active flows exactly once + start := wq.activeFlows + f := start + for { + // Count how many requests are currently in this flow + reqCount := 0 + for wr := f.head; wr != nil; wr = wr.next { + reqCount++ + } + + // Print relevant details + fmt.Printf("Flow instance=%q, Deficit=%d, Quantum=%d, Requests=%d\n", + f.instanceID, f.deficit, f.quantum, reqCount) + + // Move on to the next flow + f = f.next + if f == start { + break // We’ve looped around + } + } + fmt.Println("================================") +} + +// ---------------------------------------------------------- +// WaitQueueInfo & monitorWaitQueue +// ---------------------------------------------------------- + +type WaitQueueInfo struct { + wq WaitQueue + stream string + what string +} + +// NewWaitQueue picks DRR for inference streams, FIFO otherwise +func NewWaitQueue(max int, stream string) WaitQueue { + fmt.Println("NewWaitQueue:", stream) + + waitQueueMutex.Lock() + defer waitQueueMutex.Unlock() + + // Check if queue already exists for this stream + if info, exists := waitQueueMap[stream]; exists { + return info.wq + } + + isInference := strings.Contains(stream, "fast") || strings.Contains(stream, "slow") + + var what string + var wq WaitQueue + if isInference { + what = "DRR" + wq = NewDRRWaitQueue(max) + } else { + what = "FIFO" + wq = NewFIFOWaitQueue(max) + } + + info := &WaitQueueInfo{ + wq: wq, + stream: stream, + what: what, + } + waitQueueMap[stream] = info + monitorWaitQueue() + + return wq +} + +var isRunning = false + +func monitorWaitQueue() { + if isRunning { + return + } + isRunning = true + + go func() { + for range time.Tick(1 * time.Second) { + fmt.Println("--------------------------------") + // gather streams, sorted + waitQueueMutex.RLock() + streams := make([]string, 0, len(waitQueueMap)) + for s := range waitQueueMap { + streams = append(streams, s) + } + waitQueueMutex.RUnlock() + sort.Strings(streams) + + // print in sorted order + waitQueueMutex.RLock() + for _, s := range streams { + info := waitQueueMap[s] + // fmt.Printf("| %-4s | %-4d | %-40s |\n", info.what, info.wq.Len(), info.stream) + info.wq.LogFlows() + } + waitQueueMutex.RUnlock() + fmt.Println("--------------------------------") + } + }() +} diff --git a/server/server.go b/server/server.go index 030dea5f5bf..d6b1cb61688 100644 --- a/server/server.go +++ b/server/server.go @@ -2179,7 +2179,7 @@ func (s *Server) fetchAccount(name string) (*Account, error) { // WaitForShutdown can be used to block and wait for the server to shutdown properly if needed // after calling s.Shutdown() func (s *Server) Start() { - s.Noticef("Starting nats-server") + s.Noticef("Starting kuzco-nats-server") gc := gitCommit if gc == _EMPTY_ {