dd9309c15e
Initial vendor list validated with empty $GOPATH and only master checked out; followed by `make` and verified that all binaries build properly. Updates require github.com/LK4D4/vndr tool. Signed-off-by: Phil Estes <estesp@linux.vnet.ibm.com>
2749 lines
77 KiB
Go
// Copyright 2016 Apcera Inc. All rights reserved.
|
|
|
|
package stores
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"hash/crc32"
|
|
"io"
|
|
"io/ioutil"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/nats-io/go-nats-streaming/pb"
|
|
"github.com/nats-io/nats-streaming-server/spb"
|
|
"github.com/nats-io/nats-streaming-server/util"
|
|
)
|
|
|
|
const (
|
|
// Our file version.
|
|
fileVersion = 1
|
|
|
|
// Prefix for message log files
|
|
msgFilesPrefix = "msgs."
|
|
|
|
// Data files suffix
|
|
datSuffix = ".dat"
|
|
|
|
// Index files suffix
|
|
idxSuffix = ".idx"
|
|
|
|
// Backup file suffix
|
|
bakSuffix = ".bak"
|
|
|
|
// Name of the subscriptions file.
|
|
subsFileName = "subs" + datSuffix
|
|
|
|
// Name of the clients file.
|
|
clientsFileName = "clients" + datSuffix
|
|
|
|
// Name of the server file.
|
|
serverFileName = "server" + datSuffix
|
|
|
|
// Number of bytes required to store a CRC-32 checksum
|
|
crcSize = crc32.Size
|
|
|
|
// Size of a record header.
|
|
// 4 bytes: For typed records: 1 byte for type, 3 bytes for buffer size
|
|
// For non-typed records: all 4 bytes for buffer size
|
|
// +4 bytes for CRC-32
|
|
recordHeaderSize = 4 + crcSize
|
|
|
|
// defaultBufSize is used for various buffered IO operations
|
|
defaultBufSize = 10 * 1024 * 1024
|
|
|
|
// Size of a message index record
|
|
// Seq - Offset - Timestamp - Size - CRC
|
|
msgIndexRecSize = 8 + 8 + 8 + 4 + crcSize
|
|
|
|
// msgRecordOverhead is the number of bytes to count toward the size
|
|
// of a serialized message so that file slice size is closer to
|
|
// channels and/or file slice limits.
|
|
msgRecordOverhead = recordHeaderSize + msgIndexRecSize
|
|
|
|
// Percentage of buffer usage to decide if the buffer should shrink
|
|
bufShrinkThreshold = 50
|
|
|
|
// Interval at which to check/try to shrink buffer writers
|
|
defaultBufShrinkInterval = 5 * time.Second
|
|
|
|
// If FileStoreOption's BufferSize is > 0, the buffer writer is initially
// created with this size (unless this is greater than BufferSize, in which
// case BufferSize is used). When possible, the buffer will shrink but not
// lower than this value. This is for FileSubStore's buffer writer.
|
|
subBufMinShrinkSize = 128
|
|
|
|
// If FileStoreOption's BufferSize is > 0, the buffer writer is initially
// created with this size (unless this is greater than BufferSize, in which
// case BufferSize is used). When possible, the buffer will shrink but not
// lower than this value. This is for FileMsgStore's buffer writer.
|
|
msgBufMinShrinkSize = 512
|
|
|
|
// This is the sleep time in the background tasks go routine.
|
|
defaultBkgTasksSleepDuration = time.Second
|
|
|
|
// This is the default amount of time a message is cached.
|
|
defaultCacheTTL = time.Second
|
|
)
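// Worked example (added for clarity, not in the original source): with
// crcSize = crc32.Size = 4 bytes, a record header occupies
// recordHeaderSize = 4 + 4 = 8 bytes, and a message index record occupies
// msgIndexRecSize = 8 (seq) + 8 (offset) + 8 (timestamp) + 4 (size) +
// 4 (CRC) = 32 bytes. msgRecordOverhead is therefore 8 + 32 = 40 bytes
// counted toward the size of each stored message.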
|
|
|
|
// FileStoreOption is a function on the options for a File Store
|
|
type FileStoreOption func(*FileStoreOptions) error
|
|
|
|
// FileStoreOptions can be used to customize a File Store
|
|
type FileStoreOptions struct {
|
|
// BufferSize is the size of the buffer used during store operations.
|
|
BufferSize int
|
|
|
|
// CompactEnabled enables or disables file compaction.
|
|
CompactEnabled bool
|
|
|
|
// CompactInterval indicates the minimum interval (in seconds) between compactions.
|
|
CompactInterval int
|
|
|
|
// CompactFragmentation indicates the minimum ratio of fragmentation
|
|
// to trigger compaction. For instance, 50 means that compaction
|
|
// would not happen until fragmentation is more than 50%.
|
|
CompactFragmentation int
|
|
|
|
// CompactMinFileSize indicates the minimum file size before compaction
|
|
// can be performed, regardless of the current file fragmentation.
|
|
CompactMinFileSize int64
|
|
|
|
// DoCRC enables (or disables) CRC checksum verification on read operations.
|
|
DoCRC bool
|
|
|
|
// CRCPolynomial is the polynomial used to make the table used in CRC computation.
|
|
CRCPolynomial int64
|
|
|
|
// DoSync indicates if `File.Sync()` is called during a flush.
|
|
DoSync bool
|
|
|
|
// Regardless of channel limits, the options below allow a message log to
// be split into smaller file chunks. If all of these options are set to 0,
// a file slice limit will be selected automatically based on the channel
// limits.
|
|
// SliceMaxMsgs defines how many messages can fit in a file slice (0 means
|
|
// count is not checked).
|
|
SliceMaxMsgs int
|
|
// SliceMaxBytes defines how many bytes can fit in a file slice, including
|
|
// the corresponding index file (0 means size is not checked).
|
|
SliceMaxBytes int64
|
|
// SliceMaxAge defines the period of time covered by a slice starting when
|
|
// the first message is stored (0 means time is not checked).
|
|
SliceMaxAge time.Duration
|
|
// SliceArchiveScript is the path to a script to be invoked when a file
|
|
// slice (and the corresponding index file) is going to be removed.
|
|
// The script will be invoked with the channel name and names of data and
|
|
// index files (which both have been previously renamed with a '.bak'
|
|
// extension). It is the responsibility of the script to move/remove
|
|
// those files.
|
|
SliceArchiveScript string
|
|
}
|
|
|
|
// DefaultFileStoreOptions defines the default options for a File Store.
|
|
var DefaultFileStoreOptions = FileStoreOptions{
|
|
BufferSize: 2 * 1024 * 1024, // 2MB
|
|
CompactEnabled: true,
|
|
CompactInterval: 5 * 60, // 5 minutes
|
|
CompactFragmentation: 50,
|
|
CompactMinFileSize: 1024 * 1024,
|
|
DoCRC: true,
|
|
CRCPolynomial: int64(crc32.IEEE),
|
|
DoSync: true,
|
|
SliceMaxBytes: 64 * 1024 * 1024, // 64MB
|
|
}
|
|
|
|
// BufferSize is a FileStore option that sets the size of the buffer used
|
|
// during store writes. This can help improve write performance.
|
|
func BufferSize(size int) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.BufferSize = size
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// CompactEnabled is a FileStore option that enables or disables file compaction.
|
|
// The value false will disable compaction.
|
|
func CompactEnabled(enabled bool) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.CompactEnabled = enabled
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// CompactInterval is a FileStore option that defines the minimum compaction interval.
|
|
// Compaction is not timer based; it is triggered when records get "deleted". This value
// prevents compaction from happening too often.
|
|
func CompactInterval(seconds int) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.CompactInterval = seconds
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// CompactFragmentation is a FileStore option that defines the fragmentation ratio
|
|
// below which compaction would not occur. For instance, specifying 50 means that
|
|
// if other variables would allow for compaction, the compaction would occur only
|
|
// after 50% of the file has data that is no longer valid.
|
|
func CompactFragmentation(fragmentation int) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.CompactFragmentation = fragmentation
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// CompactMinFileSize is a FileStore option that defines the minimum file size below
|
|
// which compaction would not occur. Specify `-1` if you don't want any minimum.
|
|
func CompactMinFileSize(fileSize int64) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.CompactMinFileSize = fileSize
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// DoCRC is a FileStore option that defines if a CRC checksum verification should
|
|
// be performed when records are read from disk.
|
|
func DoCRC(enableCRC bool) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.DoCRC = enableCRC
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// CRCPolynomial is a FileStore option that defines the polynomial to use to create
|
|
// the table used for CRC-32 Checksum.
|
|
// See https://golang.org/pkg/hash/crc32/#MakeTable
|
|
func CRCPolynomial(polynomial int64) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.CRCPolynomial = polynomial
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// DoSync is a FileStore option that defines if `File.Sync()` should be called
|
|
// during a `Flush()` call.
|
|
func DoSync(enableFileSync bool) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.DoSync = enableFileSync
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// SliceConfig is a FileStore option that allows the configuration of
|
|
// file slice limits and optional archive script file name.
|
|
func SliceConfig(maxMsgs int, maxBytes int64, maxAge time.Duration, script string) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
o.SliceMaxMsgs = maxMsgs
|
|
o.SliceMaxBytes = maxBytes
|
|
o.SliceMaxAge = maxAge
|
|
o.SliceArchiveScript = script
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// AllOptions is a convenient option to pass all options from a FileStoreOptions
|
|
// structure to the constructor.
|
|
func AllOptions(opts *FileStoreOptions) FileStoreOption {
|
|
return func(o *FileStoreOptions) error {
|
|
// Make a copy
|
|
*o = *opts
|
|
return nil
|
|
}
|
|
}
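// Illustrative sketch (not part of the original file): these functional
// options are intended to be passed to NewFileStore (defined below). The
// directory name and option values used here are hypothetical.
//
//   fs, recoveredState, err := NewFileStore("datastore", nil,
//       BufferSize(4*1024*1024),
//       CompactEnabled(true),
//       SliceConfig(0, 32*1024*1024, time.Hour, ""))
//   if err != nil {
//       // handle error
//   }
//   defer fs.Close()
//   _ = recoveredState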
|
|
|
|
// Type for the records in the subscriptions file
|
|
type recordType byte
|
|
|
|
// Protobufs do not share a common interface, yet, when saving a
|
|
// record on disk, we have to get the size and marshal the record in
|
|
// a buffer. These methods are available in all the protobuf.
|
|
// So we create this interface with those two methods to be used by the
|
|
// writeRecord method.
|
|
type record interface {
|
|
Size() int
|
|
MarshalTo([]byte) (int, error)
|
|
}
|
|
|
|
// This is used for cases when the record is not typed
|
|
const recNoType = recordType(0)
|
|
|
|
// Record types for subscription file
|
|
const (
|
|
subRecNew = recordType(iota) + 1
|
|
subRecUpdate
|
|
subRecDel
|
|
subRecAck
|
|
subRecMsg
|
|
)
|
|
|
|
// Record types for client store
|
|
const (
|
|
addClient = recordType(iota) + 1
|
|
delClient
|
|
)
|
|
|
|
// FileStore is the storage interface for STAN servers, backed by files.
|
|
type FileStore struct {
|
|
genericStore
|
|
rootDir string
|
|
serverFile *os.File
|
|
clientsFile *os.File
|
|
opts FileStoreOptions
|
|
compactItvl time.Duration
|
|
addClientRec spb.ClientInfo
|
|
delClientRec spb.ClientDelete
|
|
cliFileSize int64
|
|
cliDeleteRecs int // Number of deleted client records
|
|
cliCompactTS time.Time
|
|
crcTable *crc32.Table
|
|
}
|
|
|
|
type subscription struct {
|
|
sub *spb.SubState
|
|
seqnos map[uint64]struct{}
|
|
}
|
|
|
|
type bufferedWriter struct {
|
|
buf *bufio.Writer
|
|
bufSize int // current buffer size
|
|
minShrinkSize int // minimum shrink size. Note that this can be bigger than maxSize (see newBufferWriter)
|
|
maxSize int // maximum size the buffer can grow
|
|
shrinkReq bool // used to decide if buffer should shrink
|
|
}
|
|
|
|
// FileSubStore is a subscription store in files.
|
|
type FileSubStore struct {
|
|
genericSubStore
|
|
tmpSubBuf []byte
|
|
file *os.File
|
|
bw *bufferedWriter
|
|
delSub spb.SubStateDelete
|
|
updateSub spb.SubStateUpdate
|
|
subs map[uint64]*subscription
|
|
opts *FileStoreOptions // points to options from FileStore
|
|
compactItvl time.Duration
|
|
fileSize int64
|
|
numRecs int // Number of records (sub and msgs)
|
|
delRecs int // Number of delete (or ack) records
|
|
rootDir string
|
|
compactTS time.Time
|
|
crcTable *crc32.Table // reference to the one from FileStore
|
|
activity bool // was there any write between two flush calls
|
|
writer io.Writer // this is either `bw` or `file` depending on whether the buffer writer is used
|
|
shrinkTimer *time.Timer // timer associated with callback shrinking buffer when possible
|
|
allDone sync.WaitGroup
|
|
}
|
|
|
|
// fileSlice represents one of the message store files (there are a number
|
|
// of files for a MsgStore on a given channel).
|
|
type fileSlice struct {
|
|
fileName string
|
|
idxFName string
|
|
firstSeq uint64
|
|
lastSeq uint64
|
|
rmCount int // Count of messages "removed" from the slice due to limits.
|
|
msgsCount int
|
|
msgsSize uint64
|
|
firstWrite int64 // Time the first message was added to this slice (used for slice age limit)
|
|
file *os.File // Used during lookups.
|
|
lastUsed int64
|
|
}
|
|
|
|
// msgRecord contains data regarding a message that the FileMsgStore needs to
|
|
// keep in memory for performance reasons.
|
|
type msgRecord struct {
|
|
offset int64
|
|
timestamp int64
|
|
msgSize uint32
|
|
}
|
|
|
|
// bufferedMsg is required to keep track of a message and msgRecord when
|
|
// file buffering is used. It is possible that a message and index is
|
|
// not flushed on disk while the message gets removed from the store
|
|
// due to limit. We need a map that keeps a reference to message and
|
|
// record until the file is flushed.
|
|
type bufferedMsg struct {
|
|
msg *pb.MsgProto
|
|
rec *msgRecord
|
|
}
|
|
|
|
// cachedMsg is a structure that contains a reference to a message
|
|
// and cache expiration value. The cache has a map and list so
|
|
// that cached messages can be ordered by expiration time.
|
|
type cachedMsg struct {
|
|
expiration int64
|
|
msg *pb.MsgProto
|
|
prev *cachedMsg
|
|
next *cachedMsg
|
|
}
|
|
|
|
// msgsCache is the file store cache.
|
|
type msgsCache struct {
|
|
tryEvict int32
|
|
seqMaps map[uint64]*cachedMsg
|
|
head *cachedMsg
|
|
tail *cachedMsg
|
|
}
|
|
|
|
// FileMsgStore is a per channel message file store.
|
|
type FileMsgStore struct {
|
|
genericMsgStore
|
|
// Atomic operations require 64-bit aligned fields to be able
// to run with 32-bit processes.
|
|
checkSlices int64 // used with atomic operations
|
|
timeTick int64 // time captured in background tasks go routine
|
|
|
|
tmpMsgBuf []byte
|
|
file *os.File
|
|
idxFile *os.File
|
|
bw *bufferedWriter
|
|
writer io.Writer // this is `bw.buf` or `file` depending on whether the buffer writer is used
|
|
files map[int]*fileSlice
|
|
currSlice *fileSlice
|
|
rootDir string
|
|
firstFSlSeq int // First file slice sequence number
|
|
lastFSlSeq int // Last file slice sequence number
|
|
slCountLim int
|
|
slSizeLim uint64
|
|
slAgeLim int64
|
|
slHasLimits bool
|
|
fstore *FileStore // pointer to the file store object
|
|
cache *msgsCache
|
|
msgs map[uint64]*msgRecord
|
|
wOffset int64
|
|
firstMsg *pb.MsgProto
|
|
lastMsg *pb.MsgProto
|
|
expiration int64
|
|
bufferedSeqs []uint64
|
|
bufferedMsgs map[uint64]*bufferedMsg
|
|
bkgTasksDone chan bool // signal the background tasks go routine to stop
|
|
bkgTasksWake chan bool // signal the background tasks go routine to get out of a sleep
|
|
allDone sync.WaitGroup
|
|
}
|
|
|
|
// Some variables are based on constants but can be changed
// for test purposes.
|
|
var (
|
|
bufShrinkInterval = defaultBufShrinkInterval
|
|
bkgTasksSleepDuration = defaultBkgTasksSleepDuration
|
|
cacheTTL = int64(defaultCacheTTL)
|
|
)
|
|
|
|
// openFile opens the file specified by `fileName`.
|
|
// If the file exists, it checks that the version is supported.
|
|
// If no file mode is provided, the file is created if not present,
|
|
// and opened in read/write and append mode.
|
|
func openFile(fileName string, modes ...int) (*os.File, error) {
|
|
checkVersion := false
|
|
|
|
mode := os.O_RDWR | os.O_CREATE | os.O_APPEND
|
|
if len(modes) > 0 {
|
|
// Use the provided modes instead
|
|
mode = 0
|
|
for _, m := range modes {
|
|
mode |= m
|
|
}
|
|
}
|
|
|
|
// Check if file already exists
|
|
if s, err := os.Stat(fileName); s != nil && err == nil {
|
|
checkVersion = true
|
|
}
|
|
file, err := os.OpenFile(fileName, mode, 0666)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if checkVersion {
|
|
err = checkFileVersion(file)
|
|
} else {
|
|
// This is a new file, write our file version
|
|
err = util.WriteInt(file, fileVersion)
|
|
}
|
|
if err != nil {
|
|
file.Close()
|
|
file = nil
|
|
}
|
|
return file, err
|
|
}
|
|
|
|
// check that the version of the file is understood by this interface
|
|
func checkFileVersion(r io.Reader) error {
|
|
fv, err := util.ReadInt(r)
|
|
if err != nil {
|
|
return fmt.Errorf("unable to verify file version: %v", err)
|
|
}
|
|
if fv == 0 || fv > fileVersion {
|
|
return fmt.Errorf("unsupported file version: %v (supports [1..%v])", fv, fileVersion)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// writeRecord writes a record to `w`.
|
|
// The record layout is as follows:
|
|
// 8 bytes: 4 bytes for type and/or size combined
|
|
// 4 bytes for CRC-32
|
|
// variable bytes: payload.
|
|
// If a buffer is provided, this function uses it and expands it if necessary.
|
|
// The function returns the buffer (possibly changed due to expansion) and the
|
|
// number of bytes written into that buffer.
|
|
func writeRecord(w io.Writer, buf []byte, recType recordType, rec record, recSize int, crcTable *crc32.Table) ([]byte, int, error) {
|
|
// This is the header + payload size
|
|
totalSize := recordHeaderSize + recSize
|
|
// Alloc or realloc as needed
|
|
buf = util.EnsureBufBigEnough(buf, totalSize)
|
|
// If there is a record type, encode it
|
|
headerFirstInt := 0
|
|
if recType != recNoType {
|
|
if recSize > 0xFFFFFF {
|
|
panic("record size too big")
|
|
}
|
|
// Encode the type in the high byte of the header
|
|
headerFirstInt = int(recType)<<24 | recSize
|
|
} else {
|
|
// The header is the size of the record
|
|
headerFirstInt = recSize
|
|
}
|
|
// Write the first part of the header at the beginning of the buffer
|
|
util.ByteOrder.PutUint32(buf[:4], uint32(headerFirstInt))
|
|
// Marshal the record into the given buffer, after the header offset
|
|
if _, err := rec.MarshalTo(buf[recordHeaderSize:totalSize]); err != nil {
|
|
// Return the buffer because the caller may have provided one
|
|
return buf, 0, err
|
|
}
|
|
// Compute CRC
|
|
crc := crc32.Checksum(buf[recordHeaderSize:totalSize], crcTable)
|
|
// Write it in the buffer
|
|
util.ByteOrder.PutUint32(buf[4:recordHeaderSize], crc)
|
|
// Are we dealing with a buffered writer?
|
|
bw, isBuffered := w.(*bufio.Writer)
|
|
// if so, make sure that if what we are about to "write" is more
|
|
// than what's available, then first flush the buffer.
|
|
// This is to reduce the risk of partial writes.
|
|
if isBuffered && (bw.Buffered() > 0) && (bw.Available() < totalSize) {
|
|
if err := bw.Flush(); err != nil {
|
|
return buf, 0, err
|
|
}
|
|
}
|
|
// Write the content of our slice into the writer `w`
|
|
if _, err := w.Write(buf[:totalSize]); err != nil {
|
|
// Return the tmpBuf because the caller may have provided one
|
|
return buf, 0, err
|
|
}
|
|
return buf, totalSize, nil
|
|
}
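// Layout sketch (added for illustration): for a typed record, the first 4
// bytes pack the record type in the high byte and the payload size in the
// low 3 bytes; for example a subRecNew record with a 100-byte payload is
// encoded as headerFirstInt = 1<<24 | 100. The next 4 bytes hold the
// CRC-32 of the payload, followed by the payload itself. For non-typed
// records the first 4 bytes simply hold the payload size.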
|
|
|
|
// readRecord reads a record from `r`, possibly checking the CRC-32 checksum.
|
|
// When `buf` is not nil, this function ensures the buffer is big enough to
// hold the payload (expanding if necessary). Therefore, this call always
// returns `buf`, regardless of whether there is an error or not.
|
|
// The caller is indicating if the record is supposed to be typed or not.
|
|
func readRecord(r io.Reader, buf []byte, recTyped bool, crcTable *crc32.Table, checkCRC bool) ([]byte, int, recordType, error) {
|
|
_header := [recordHeaderSize]byte{}
|
|
header := _header[:]
|
|
if _, err := io.ReadFull(r, header); err != nil {
|
|
return buf, 0, recNoType, err
|
|
}
|
|
recType := recNoType
|
|
recSize := 0
|
|
firstInt := int(util.ByteOrder.Uint32(header[:4]))
|
|
if recTyped {
|
|
recType = recordType(firstInt >> 24 & 0xFF)
|
|
recSize = firstInt & 0xFFFFFF
|
|
} else {
|
|
recSize = firstInt
|
|
}
|
|
crc := util.ByteOrder.Uint32(header[4:recordHeaderSize])
|
|
// Now we are going to read the payload
|
|
buf = util.EnsureBufBigEnough(buf, recSize)
|
|
if _, err := io.ReadFull(r, buf[:recSize]); err != nil {
|
|
return buf, 0, recNoType, err
|
|
}
|
|
if checkCRC {
|
|
// check CRC against what was stored
|
|
if c := crc32.Checksum(buf[:recSize], crcTable); c != crc {
|
|
return buf, 0, recNoType, fmt.Errorf("corrupted data, expected crc to be 0x%08x, got 0x%08x", crc, c)
|
|
}
|
|
}
|
|
return buf, recSize, recType, nil
|
|
}
|
|
|
|
// newBufferWriter creates a bufferedWriter, setting the initial buffer size and keeping track of min/max allowed sizes
|
|
func newBufferWriter(minShrinkSize, maxSize int) *bufferedWriter {
|
|
w := &bufferedWriter{minShrinkSize: minShrinkSize, maxSize: maxSize}
|
|
w.bufSize = minShrinkSize
|
|
// The minShrinkSize is the minimum size the buffer can shrink to.
|
|
// However, if the given max size is smaller than the min
|
|
// shrink size, use that instead.
|
|
if maxSize < minShrinkSize {
|
|
w.bufSize = maxSize
|
|
}
|
|
return w
|
|
}
|
|
|
|
// createNewWriter creates a new buffer writer for `file` with
|
|
// the bufferedWriter's current buffer size.
|
|
func (w *bufferedWriter) createNewWriter(file *os.File) io.Writer {
|
|
w.buf = bufio.NewWriterSize(file, w.bufSize)
|
|
return w.buf
|
|
}
|
|
|
|
// expand the buffer (first flushing the buffer if not empty)
|
|
func (w *bufferedWriter) expand(file *os.File, required int) (io.Writer, error) {
|
|
// If there was a request to shrink the buffer, cancel that.
|
|
w.shrinkReq = false
|
|
// If there was something, flush first
|
|
if w.buf.Buffered() > 0 {
|
|
if err := w.buf.Flush(); err != nil {
|
|
return w.buf, err
|
|
}
|
|
}
|
|
// Double the size
|
|
w.bufSize *= 2
|
|
// If still smaller than what is required, adjust
|
|
if w.bufSize < required {
|
|
w.bufSize = required
|
|
}
|
|
// But cap it.
|
|
if w.bufSize > w.maxSize {
|
|
w.bufSize = w.maxSize
|
|
}
|
|
w.buf = bufio.NewWriterSize(file, w.bufSize)
|
|
return w.buf, nil
|
|
}
|
|
|
|
// tryShrinkBuffer checks and possibly shrinks the buffer
|
|
func (w *bufferedWriter) tryShrinkBuffer(file *os.File) (io.Writer, error) {
|
|
// Nothing to do if we are already at the lowest
|
|
// or file not set/opened.
|
|
if w.bufSize == w.minShrinkSize || file == nil {
|
|
return w.buf, nil
|
|
}
|
|
|
|
if !w.shrinkReq {
|
|
percentFilled := w.buf.Buffered() * 100 / w.bufSize
|
|
if percentFilled <= bufShrinkThreshold {
|
|
w.shrinkReq = true
|
|
}
|
|
// Wait for next tick to see if we can shrink
|
|
return w.buf, nil
|
|
}
|
|
if err := w.buf.Flush(); err != nil {
|
|
return w.buf, err
|
|
}
|
|
// Reduce size, but ensure it does not go below the limit
|
|
w.bufSize /= 2
|
|
if w.bufSize < w.minShrinkSize {
|
|
w.bufSize = w.minShrinkSize
|
|
}
|
|
w.buf = bufio.NewWriterSize(file, w.bufSize)
|
|
// Don't reset shrinkReq unless we are down to the limit
|
|
if w.bufSize == w.minShrinkSize {
|
|
w.shrinkReq = true
|
|
}
|
|
return w.buf, nil
|
|
}
|
|
|
|
// checkShrinkRequest checks how full the buffer is, and if it is above a certain
|
|
// threshold, cancels the shrink request
|
|
func (w *bufferedWriter) checkShrinkRequest() {
|
|
percentFilled := w.buf.Buffered() * 100 / w.bufSize
|
|
// If above the threshold, cancel the request.
|
|
if percentFilled > bufShrinkThreshold {
|
|
w.shrinkReq = false
|
|
}
|
|
}
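// Illustrative walk-through (not in the original source): shrinking is a
// two-step process. On one shrink interval, tryShrinkBuffer only sets
// shrinkReq to true if the buffer is at most bufShrinkThreshold (50%)
// full; on the next interval, provided no intervening write pushed usage
// back above the threshold (checkShrinkRequest would then cancel the
// request), the buffer size is halved, never going below minShrinkSize.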
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
// FileStore methods
|
|
////////////////////////////////////////////////////////////////////////////
|
|
|
|
// NewFileStore returns a factory for stores backed by files, and recovers
|
|
// any state present.
|
|
// If no limits are provided, the store will be created with
|
|
// DefaultStoreLimits.
|
|
func NewFileStore(rootDir string, limits *StoreLimits, options ...FileStoreOption) (*FileStore, *RecoveredState, error) {
|
|
fs := &FileStore{
|
|
rootDir: rootDir,
|
|
opts: DefaultFileStoreOptions,
|
|
}
|
|
fs.init(TypeFile, limits)
|
|
|
|
for _, opt := range options {
|
|
if err := opt(&fs.opts); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
}
|
|
// Convert the compact interval in time.Duration
|
|
fs.compactItvl = time.Duration(fs.opts.CompactInterval) * time.Second
|
|
// Create the table using polynomial in options
|
|
if fs.opts.CRCPolynomial == int64(crc32.IEEE) {
|
|
fs.crcTable = crc32.IEEETable
|
|
} else {
|
|
fs.crcTable = crc32.MakeTable(uint32(fs.opts.CRCPolynomial))
|
|
}
|
|
|
|
if err := os.MkdirAll(rootDir, os.ModeDir+os.ModePerm); err != nil && !os.IsExist(err) {
|
|
return nil, nil, fmt.Errorf("unable to create the root directory [%s]: %v", rootDir, err)
|
|
}
|
|
|
|
var err error
|
|
var recoveredState *RecoveredState
|
|
var serverInfo *spb.ServerInfo
|
|
var recoveredClients []*Client
|
|
var recoveredSubs = make(RecoveredSubscriptions)
|
|
var channels []os.FileInfo
|
|
var msgStore *FileMsgStore
|
|
var subStore *FileSubStore
|
|
|
|
// Ensure store is closed in case of return with error
|
|
defer func() {
|
|
if err != nil {
|
|
fs.Close()
|
|
}
|
|
}()
|
|
|
|
// Open/Create the server file (note that this file must not be opened
|
|
// in APPEND mode to allow truncate to work).
|
|
fileName := filepath.Join(fs.rootDir, serverFileName)
|
|
fs.serverFile, err = openFile(fileName, os.O_RDWR, os.O_CREATE)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
// Open/Create the client file.
|
|
fileName = filepath.Join(fs.rootDir, clientsFileName)
|
|
fs.clientsFile, err = openFile(fileName)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
// Recover the server file.
|
|
serverInfo, err = fs.recoverServerInfo()
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
// If the server file is empty, then we are done
|
|
if serverInfo == nil {
|
|
// We return the file store instance, but no recovered state.
|
|
return fs, nil, nil
|
|
}
|
|
|
|
// Recover the clients file
|
|
recoveredClients, err = fs.recoverClients()
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
// Get the channels (these are subdirectories of rootDir)
|
|
channels, err = ioutil.ReadDir(rootDir)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
// Go through the list
|
|
for _, c := range channels {
|
|
// Channels are directories. Ignore simple files
|
|
if !c.IsDir() {
|
|
continue
|
|
}
|
|
|
|
channel := c.Name()
|
|
channelDirName := filepath.Join(rootDir, channel)
|
|
|
|
// Recover messages for this channel
|
|
msgStore, err = fs.newFileMsgStore(channelDirName, channel, true)
|
|
if err != nil {
|
|
break
|
|
}
|
|
subStore, err = fs.newFileSubStore(channelDirName, channel, true)
|
|
if err != nil {
|
|
msgStore.Close()
|
|
break
|
|
}
|
|
|
|
// For this channel, construct an array of RecoveredSubState
|
|
rssArray := make([]*RecoveredSubState, 0, len(subStore.subs))
|
|
|
|
// Fill that array with what we got from newFileSubStore.
|
|
for _, sub := range subStore.subs {
|
|
// The server is making a copy of rss.Sub, still it is not
|
|
// a good idea to return a pointer to an object that belongs
|
|
// to the store. So make a copy and return the pointer to
|
|
// that copy.
|
|
csub := *sub.sub
|
|
rss := &RecoveredSubState{
|
|
Sub: &csub,
|
|
Pending: make(PendingAcks),
|
|
}
|
|
// If we recovered any seqno...
|
|
if len(sub.seqnos) > 0 {
|
|
// Lookup messages, and if we find those, update the
|
|
// Pending map.
|
|
for seq := range sub.seqnos {
|
|
rss.Pending[seq] = struct{}{}
|
|
}
|
|
}
|
|
// Add to the array of recovered subscriptions
|
|
rssArray = append(rssArray, rss)
|
|
}
|
|
|
|
// This is the recovered subscription state for this channel
|
|
recoveredSubs[channel] = rssArray
|
|
|
|
fs.channels[channel] = &ChannelStore{
|
|
Subs: subStore,
|
|
Msgs: msgStore,
|
|
}
|
|
}
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
// Create the recovered state to return
|
|
recoveredState = &RecoveredState{
|
|
Info: serverInfo,
|
|
Clients: recoveredClients,
|
|
Subs: recoveredSubs,
|
|
}
|
|
return fs, recoveredState, nil
|
|
}
|
|
|
|
// Init is used to persist server's information after the first start
|
|
func (fs *FileStore) Init(info *spb.ServerInfo) error {
|
|
fs.Lock()
|
|
defer fs.Unlock()
|
|
|
|
f := fs.serverFile
|
|
// Truncate the file (4 is the size of the fileVersion record)
|
|
if err := f.Truncate(4); err != nil {
|
|
return err
|
|
}
|
|
// Move offset to 4 (truncate does not do that)
|
|
if _, err := f.Seek(4, 0); err != nil {
|
|
return err
|
|
}
|
|
// ServerInfo record is not typed. We also don't pass a reusable buffer.
|
|
if _, _, err := writeRecord(f, nil, recNoType, info, info.Size(), fs.crcTable); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
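// Usage sketch (added for illustration; the ServerInfo literal shown is
// hypothetical): on a first start NewFileStore returns a nil recovered
// state, in which case the caller is expected to persist its info with
// Init:
//
//   fs, state, err := NewFileStore("datastore", nil)
//   if err != nil {
//       // handle error
//   }
//   if state == nil {
//       err = fs.Init(&spb.ServerInfo{ClusterID: "test-cluster"})
//   }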
|
|
|
|
// recoverClients reads the client files and returns an array of RecoveredClient
|
|
func (fs *FileStore) recoverClients() ([]*Client, error) {
|
|
var err error
|
|
var recType recordType
|
|
var recSize int
|
|
|
|
_buf := [256]byte{}
|
|
buf := _buf[:]
|
|
|
|
// Create a buffered reader to speed-up recovery
|
|
br := bufio.NewReaderSize(fs.clientsFile, defaultBufSize)
|
|
|
|
for {
|
|
buf, recSize, recType, err = readRecord(br, buf, true, fs.crcTable, fs.opts.DoCRC)
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
err = nil
|
|
break
|
|
}
|
|
return nil, err
|
|
}
|
|
fs.cliFileSize += int64(recSize + recordHeaderSize)
|
|
switch recType {
|
|
case addClient:
|
|
c := &Client{}
|
|
if err := c.ClientInfo.Unmarshal(buf[:recSize]); err != nil {
|
|
return nil, err
|
|
}
|
|
// Add to the map. Note that if one already exists (which should
// not happen), just replace with this most recent one.
|
|
fs.clients[c.ID] = c
|
|
case delClient:
|
|
c := spb.ClientDelete{}
|
|
if err := c.Unmarshal(buf[:recSize]); err != nil {
|
|
return nil, err
|
|
}
|
|
delete(fs.clients, c.ID)
|
|
fs.cliDeleteRecs++
|
|
default:
|
|
return nil, fmt.Errorf("invalid client record type: %v", recType)
|
|
}
|
|
}
|
|
clients := make([]*Client, len(fs.clients))
|
|
i := 0
|
|
// Convert the map into an array
|
|
for _, c := range fs.clients {
|
|
clients[i] = c
|
|
i++
|
|
}
|
|
return clients, nil
|
|
}
|
|
|
|
// recoverServerInfo reads the server file and returns a ServerInfo structure
|
|
func (fs *FileStore) recoverServerInfo() (*spb.ServerInfo, error) {
|
|
file := fs.serverFile
|
|
info := &spb.ServerInfo{}
|
|
buf, size, _, err := readRecord(file, nil, false, fs.crcTable, fs.opts.DoCRC)
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
// We are done, no state recovered
|
|
return nil, nil
|
|
}
|
|
return nil, err
|
|
}
|
|
// Check that the size of the file is consistent with the size
|
|
// of the record we are supposed to recover. Account for the
|
|
// 12 bytes (4 + recordHeaderSize) corresponding to the fileVersion and
|
|
// record header.
|
|
fstat, err := file.Stat()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
expectedSize := int64(size + 4 + recordHeaderSize)
|
|
if fstat.Size() != expectedSize {
|
|
return nil, fmt.Errorf("incorrect file size, expected %v bytes, got %v bytes",
|
|
expectedSize, fstat.Size())
|
|
}
|
|
// Reconstruct now
|
|
if err := info.Unmarshal(buf[:size]); err != nil {
|
|
return nil, err
|
|
}
|
|
return info, nil
|
|
}
|
|
|
|
// CreateChannel creates a ChannelStore for the given channel, and returns
|
|
// `true` to indicate that the channel is new, false if it already exists.
|
|
func (fs *FileStore) CreateChannel(channel string, userData interface{}) (*ChannelStore, bool, error) {
|
|
fs.Lock()
|
|
defer fs.Unlock()
|
|
channelStore := fs.channels[channel]
|
|
if channelStore != nil {
|
|
return channelStore, false, nil
|
|
}
|
|
|
|
// Check for limits
|
|
if err := fs.canAddChannel(); err != nil {
|
|
return nil, false, err
|
|
}
|
|
|
|
// We create the channel here...
|
|
|
|
channelDirName := filepath.Join(fs.rootDir, channel)
|
|
if err := os.MkdirAll(channelDirName, os.ModeDir+os.ModePerm); err != nil {
|
|
return nil, false, err
|
|
}
|
|
|
|
var err error
|
|
var msgStore MsgStore
|
|
var subStore SubStore
|
|
|
|
msgStore, err = fs.newFileMsgStore(channelDirName, channel, false)
|
|
if err != nil {
|
|
return nil, false, err
|
|
}
|
|
subStore, err = fs.newFileSubStore(channelDirName, channel, false)
|
|
if err != nil {
|
|
msgStore.Close()
|
|
return nil, false, err
|
|
}
|
|
|
|
channelStore = &ChannelStore{
|
|
Subs: subStore,
|
|
Msgs: msgStore,
|
|
UserData: userData,
|
|
}
|
|
|
|
fs.channels[channel] = channelStore
|
|
|
|
return channelStore, true, nil
|
|
}
|
|
|
|
// AddClient stores information about the client identified by `clientID`.
|
|
func (fs *FileStore) AddClient(clientID, hbInbox string, userData interface{}) (*Client, bool, error) {
|
|
sc, isNew, err := fs.genericStore.AddClient(clientID, hbInbox, userData)
|
|
if err != nil {
|
|
return nil, false, err
|
|
}
|
|
if !isNew {
|
|
return sc, false, nil
|
|
}
|
|
fs.Lock()
|
|
fs.addClientRec = spb.ClientInfo{ID: clientID, HbInbox: hbInbox}
|
|
_, size, err := writeRecord(fs.clientsFile, nil, addClient, &fs.addClientRec, fs.addClientRec.Size(), fs.crcTable)
|
|
if err != nil {
|
|
delete(fs.clients, clientID)
|
|
fs.Unlock()
|
|
return nil, false, err
|
|
}
|
|
fs.cliFileSize += int64(size)
|
|
fs.Unlock()
|
|
return sc, true, nil
|
|
}
|
|
|
|
// DeleteClient invalidates the client identified by `clientID`.
|
|
func (fs *FileStore) DeleteClient(clientID string) *Client {
|
|
sc := fs.genericStore.DeleteClient(clientID)
|
|
if sc != nil {
|
|
fs.Lock()
|
|
fs.delClientRec = spb.ClientDelete{ID: clientID}
|
|
_, size, _ := writeRecord(fs.clientsFile, nil, delClient, &fs.delClientRec, fs.delClientRec.Size(), fs.crcTable)
|
|
fs.cliDeleteRecs++
|
|
fs.cliFileSize += int64(size)
|
|
// Check if this triggers a need for compaction
|
|
if fs.shouldCompactClientFile() {
|
|
fs.compactClientFile()
|
|
}
|
|
fs.Unlock()
|
|
}
|
|
return sc
|
|
}
|
|
|
|
// shouldCompactClientFile returns true if the client file should be compacted
|
|
// Lock is held by caller
|
|
func (fs *FileStore) shouldCompactClientFile() bool {
|
|
// Global switch
|
|
if !fs.opts.CompactEnabled {
|
|
return false
|
|
}
|
|
// Check that if minimum file size is set, the client file
|
|
// is at least at the minimum.
|
|
if fs.opts.CompactMinFileSize > 0 && fs.cliFileSize < fs.opts.CompactMinFileSize {
|
|
return false
|
|
}
|
|
// Check fragmentation
|
|
frag := fs.cliDeleteRecs * 100 / (fs.cliDeleteRecs + len(fs.clients))
|
|
if frag < fs.opts.CompactFragmentation {
|
|
return false
|
|
}
|
|
// Check that we don't compact too often
|
|
if time.Now().Sub(fs.cliCompactTS) < fs.compactItvl {
|
|
return false
|
|
}
|
|
return true
|
|
}
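// Worked example (added for clarity): with the default options
// (CompactFragmentation=50, CompactMinFileSize=1MB, CompactInterval=5min),
// a clients file holding 40 active clients and 60 delete records has a
// fragmentation of 60*100/(60+40) = 60%, so compaction is triggered as
// long as the file is at least 1MB and the previous compaction happened
// more than 5 minutes ago.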
|
|
|
|
// Rewrite the content of the clients map into a temporary file,
|
|
// then swap back to active file.
|
|
// Store lock held on entry
|
|
func (fs *FileStore) compactClientFile() error {
|
|
// Open a temporary file
|
|
tmpFile, err := getTempFile(fs.rootDir, clientsFileName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() {
|
|
if tmpFile != nil {
|
|
tmpFile.Close()
|
|
os.Remove(tmpFile.Name())
|
|
}
|
|
}()
|
|
bw := bufio.NewWriterSize(tmpFile, defaultBufSize)
|
|
fileSize := int64(0)
|
|
size := 0
|
|
_buf := [256]byte{}
|
|
buf := _buf[:]
|
|
// Dump the content of active clients into the temporary file.
|
|
for _, c := range fs.clients {
|
|
fs.addClientRec = spb.ClientInfo{ID: c.ID, HbInbox: c.HbInbox}
|
|
buf, size, err = writeRecord(bw, buf, addClient, &fs.addClientRec, fs.addClientRec.Size(), fs.crcTable)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
fileSize += int64(size)
|
|
}
|
|
// Flush the buffer on disk
|
|
if err := bw.Flush(); err != nil {
|
|
return err
|
|
}
|
|
// Switch the temporary file with the original one.
|
|
fs.clientsFile, err = swapFiles(tmpFile, fs.clientsFile)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Avoid unnecessary attempt to clean up
|
|
tmpFile = nil
|
|
|
|
fs.cliDeleteRecs = 0
|
|
fs.cliFileSize = fileSize
|
|
fs.cliCompactTS = time.Now()
|
|
return nil
|
|
}
|
|
|
|
// getTempFile returns a temporary file (with the file version already written)
|
|
func getTempFile(rootDir, prefix string) (*os.File, error) {
|
|
tmpFile, err := ioutil.TempFile(rootDir, prefix)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if err := util.WriteInt(tmpFile, fileVersion); err != nil {
|
|
return nil, err
|
|
}
|
|
return tmpFile, nil
|
|
}
|
|
|
|
// When a store file is compacted, the content is rewritten into a
|
|
// temporary file. When this is done, the temporary file replaces
|
|
// the original file.
|
|
func swapFiles(tempFile *os.File, activeFile *os.File) (*os.File, error) {
|
|
activeFileName := activeFile.Name()
|
|
tempFileName := tempFile.Name()
|
|
|
|
// Much of what we do here is because Windows will not accept working
// on files that are currently open.
|
|
|
|
// On exit, ensure temporary file is removed.
|
|
defer func() {
|
|
os.Remove(tempFileName)
|
|
}()
|
|
// Start by closing the temporary file.
|
|
if err := tempFile.Close(); err != nil {
|
|
return activeFile, err
|
|
}
|
|
// Close original file before trying to rename it.
|
|
if err := activeFile.Close(); err != nil {
|
|
return activeFile, err
|
|
}
|
|
// Rename the tmp file to original file name
|
|
err := os.Rename(tempFileName, activeFileName)
|
|
// Need to re-open the active file anyway
|
|
file, lerr := openFile(activeFileName)
|
|
if lerr != nil && err == nil {
|
|
err = lerr
|
|
}
|
|
return file, err
|
|
}
|
|
|
|
// Close closes all stores.
|
|
func (fs *FileStore) Close() error {
|
|
fs.Lock()
|
|
defer fs.Unlock()
|
|
if fs.closed {
|
|
return nil
|
|
}
|
|
fs.closed = true
|
|
|
|
var err error
|
|
closeFile := func(f *os.File) {
|
|
if f == nil {
|
|
return
|
|
}
|
|
if lerr := f.Close(); lerr != nil && err == nil {
|
|
err = lerr
|
|
}
|
|
}
|
|
err = fs.genericStore.close()
|
|
closeFile(fs.serverFile)
|
|
closeFile(fs.clientsFile)
|
|
return err
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
// FileMsgStore methods
|
|
////////////////////////////////////////////////////////////////////////////
|
|
|
|
// newFileMsgStore returns a new instance of a file MsgStore.
|
|
func (fs *FileStore) newFileMsgStore(channelDirName, channel string, doRecover bool) (*FileMsgStore, error) {
|
|
// Create an instance and initialize
|
|
ms := &FileMsgStore{
|
|
fstore: fs,
|
|
msgs: make(map[uint64]*msgRecord, 64),
|
|
wOffset: int64(4), // The very first record starts after the file version record
|
|
files: make(map[int]*fileSlice),
|
|
rootDir: channelDirName,
|
|
bkgTasksDone: make(chan bool, 1),
|
|
bkgTasksWake: make(chan bool, 1),
|
|
}
|
|
// Defaults to the global limits
|
|
msgStoreLimits := fs.limits.MsgStoreLimits
|
|
// See if there is an override
|
|
thisChannelLimits, exists := fs.limits.PerChannel[channel]
|
|
if exists {
|
|
// Use the channel-specific limits
|
|
msgStoreLimits = thisChannelLimits.MsgStoreLimits
|
|
}
|
|
ms.init(channel, &msgStoreLimits)
|
|
|
|
ms.setSliceLimits()
|
|
ms.initCache()
|
|
|
|
maxBufSize := fs.opts.BufferSize
|
|
if maxBufSize > 0 {
|
|
ms.bw = newBufferWriter(msgBufMinShrinkSize, maxBufSize)
|
|
ms.bufferedSeqs = make([]uint64, 0, 1)
|
|
ms.bufferedMsgs = make(map[uint64]*bufferedMsg)
|
|
}
|
|
|
|
// Use this variable for all errors below so we can do the cleanup
|
|
var err error
|
|
|
|
// Recovery case
|
|
if doRecover {
|
|
var dirFiles []os.FileInfo
|
|
var fseq int64
|
|
|
|
dirFiles, err = ioutil.ReadDir(channelDirName)
|
|
for _, file := range dirFiles {
|
|
if file.IsDir() {
|
|
continue
|
|
}
|
|
fileName := file.Name()
|
|
if !strings.HasPrefix(fileName, msgFilesPrefix) || !strings.HasSuffix(fileName, datSuffix) {
|
|
continue
|
|
}
|
|
// Remove suffix
|
|
fileNameWithoutSuffix := strings.TrimSuffix(fileName, datSuffix)
|
|
// Remove prefix
|
|
fileNameWithoutPrefixAndSuffix := strings.TrimPrefix(fileNameWithoutSuffix, msgFilesPrefix)
|
|
// Get the file sequence number
|
|
fseq, err = strconv.ParseInt(fileNameWithoutPrefixAndSuffix, 10, 64)
|
|
if err != nil {
|
|
err = fmt.Errorf("message log has an invalid name: %v", fileName)
|
|
break
|
|
}
|
|
// Need fully qualified names
|
|
fileName = filepath.Join(channelDirName, fileName)
|
|
idxFName := filepath.Join(channelDirName, fmt.Sprintf("%s%v%s", msgFilesPrefix, fseq, idxSuffix))
|
|
// Create the slice
|
|
fslice := &fileSlice{fileName: fileName, idxFName: idxFName}
|
|
// Recover the file slice
|
|
err = ms.recoverOneMsgFile(fslice, int(fseq))
|
|
if err != nil {
|
|
break
|
|
}
|
|
}
|
|
if err == nil && ms.lastFSlSeq > 0 {
|
|
// Now that all file slices have been recovered, we know which
|
|
// one is the last, so open the corresponding data and index files.
|
|
ms.currSlice = ms.files[ms.lastFSlSeq]
|
|
err = ms.openDataAndIndexFiles(ms.currSlice.fileName, ms.currSlice.idxFName)
|
|
if err == nil {
|
|
ms.wOffset, err = ms.file.Seek(0, 2)
|
|
}
|
|
}
|
|
if err == nil {
|
|
// Apply message limits (no need to check if there are limits
|
|
// defined, the call won't do anything if they aren't).
|
|
err = ms.enforceLimits(false)
|
|
}
|
|
}
|
|
if err == nil {
|
|
ms.Lock()
|
|
ms.allDone.Add(1)
|
|
// Capture the time here first, it will then be captured
|
|
// in the go routine we are about to start.
|
|
ms.timeTick = time.Now().UnixNano()
|
|
// On recovery, if there is age limit set and at least one message...
|
|
if doRecover && ms.limits.MaxAge > 0 && ms.totalCount > 0 {
|
|
// Force the execution of the expireMsgs method.
|
|
// This will take care of expiring messages that should have
|
|
// expired while the server was stopped.
|
|
ms.expireMsgs(ms.timeTick, int64(ms.limits.MaxAge))
|
|
}
|
|
// Start the background tasks go routine
|
|
go ms.backgroundTasks()
|
|
ms.Unlock()
|
|
}
|
|
// Cleanup on error
|
|
if err != nil {
|
|
// The buffer writer may not be fully set yet
|
|
if ms.bw != nil && ms.bw.buf == nil {
|
|
ms.bw = nil
|
|
}
|
|
ms.Close()
|
|
ms = nil
|
|
action := "create"
|
|
if doRecover {
|
|
action = "recover"
|
|
}
|
|
err = fmt.Errorf("unable to %s message store for [%s]: %v", action, channel, err)
|
|
return nil, err
|
|
}
|
|
|
|
return ms, nil
|
|
}
|
|
|
|
// openDataAndIndexFiles opens/creates the data and index file with the given
|
|
// file names.
|
|
func (ms *FileMsgStore) openDataAndIndexFiles(dataFileName, idxFileName string) error {
|
|
file, err := openFile(dataFileName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
idxFile, err := openFile(idxFileName)
|
|
if err != nil {
|
|
file.Close()
|
|
return err
|
|
}
|
|
ms.setFile(file, idxFile)
|
|
return nil
|
|
}
|
|
|
|
// closeDataAndIndexFiles closes both current data and index files.
|
|
func (ms *FileMsgStore) closeDataAndIndexFiles() error {
|
|
err := ms.flush()
|
|
if cerr := ms.file.Close(); cerr != nil && err == nil {
|
|
err = cerr
|
|
}
|
|
if cerr := ms.idxFile.Close(); cerr != nil && err == nil {
|
|
err = cerr
|
|
}
|
|
return err
|
|
}
|
|
|
|
// setFile sets the current data and index file.
|
|
// The buffered writer is recreated.
|
|
func (ms *FileMsgStore) setFile(dataFile, idxFile *os.File) {
|
|
ms.file = dataFile
|
|
ms.writer = ms.file
|
|
if ms.file != nil && ms.bw != nil {
|
|
ms.writer = ms.bw.createNewWriter(ms.file)
|
|
}
|
|
ms.idxFile = idxFile
|
|
}
|
|
|
|
// recoverOneMsgFile recovers one of the message store file slices
|
|
func (ms *FileMsgStore) recoverOneMsgFile(fslice *fileSlice, fseq int) error {
|
|
var err error
|
|
|
|
msgSize := 0
|
|
var msg *pb.MsgProto
|
|
var mrec *msgRecord
|
|
var seq uint64
|
|
|
|
// Check if index file exists
|
|
useIdxFile := false
|
|
if s, statErr := os.Stat(fslice.idxFName); s != nil && statErr == nil {
|
|
useIdxFile = true
|
|
}
|
|
|
|
// Open the files (the idx file will be created if it does not exist)
|
|
err = ms.openDataAndIndexFiles(fslice.fileName, fslice.idxFName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Select which file to recover based on presence of index file
|
|
file := ms.file
|
|
if useIdxFile {
|
|
file = ms.idxFile
|
|
}
|
|
|
|
// Create a buffered reader to speed-up recovery
|
|
br := bufio.NewReaderSize(file, defaultBufSize)
|
|
|
|
// The first record starts after the file version record
|
|
offset := int64(4)
|
|
|
|
if useIdxFile {
|
|
for {
|
|
seq, mrec, err = ms.readIndex(br)
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
// We are done, reset err
|
|
err = nil
|
|
}
|
|
break
|
|
}
|
|
|
|
// Update file slice
|
|
if fslice.firstSeq == 0 {
|
|
fslice.firstSeq = seq
|
|
}
|
|
fslice.lastSeq = seq
|
|
fslice.msgsCount++
|
|
// For size, add the message record size, the record header and the size
|
|
// required for the corresponding index record.
|
|
fslice.msgsSize += uint64(mrec.msgSize + msgRecordOverhead)
|
|
if fslice.firstWrite == 0 {
|
|
fslice.firstWrite = mrec.timestamp
|
|
}
|
|
}
|
|
} else {
|
|
// Get these from the file store object
|
|
crcTable := ms.fstore.crcTable
|
|
doCRC := ms.fstore.opts.DoCRC
|
|
|
|
// We are going to write the index file while recovering the data file
|
|
bw := bufio.NewWriterSize(ms.idxFile, msgIndexRecSize*1000)
|
|
|
|
for {
|
|
ms.tmpMsgBuf, msgSize, _, err = readRecord(br, ms.tmpMsgBuf, false, crcTable, doCRC)
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
// We are done, reset err
|
|
err = nil
|
|
}
|
|
break
|
|
}
|
|
|
|
// Recover this message
|
|
msg = &pb.MsgProto{}
|
|
err = msg.Unmarshal(ms.tmpMsgBuf[:msgSize])
|
|
if err != nil {
|
|
break
|
|
}
|
|
|
|
if fslice.firstSeq == 0 {
|
|
fslice.firstSeq = msg.Sequence
|
|
}
|
|
fslice.lastSeq = msg.Sequence
|
|
fslice.msgsCount++
|
|
// For size, add the message record size, the record header and the size
|
|
// required for the corresponding index record.
|
|
fslice.msgsSize += uint64(msgSize + msgRecordOverhead)
|
|
if fslice.firstWrite == 0 {
|
|
fslice.firstWrite = msg.Timestamp
|
|
}
|
|
|
|
mrec := &msgRecord{offset: offset, timestamp: msg.Timestamp, msgSize: uint32(msgSize)}
|
|
ms.msgs[msg.Sequence] = mrec
|
|
// There was no index file, update it
|
|
err = ms.writeIndex(bw, msg.Sequence, offset, msg.Timestamp, msgSize)
|
|
if err != nil {
|
|
break
|
|
}
|
|
// Move offset
|
|
offset += int64(recordHeaderSize + msgSize)
|
|
}
|
|
if err == nil {
|
|
err = bw.Flush()
|
|
if err == nil {
|
|
err = ms.idxFile.Sync()
|
|
}
|
|
}
|
|
// Since there was no index and there was an error, remove the index
|
|
// file so when server restarts, it recovers again from the data file.
|
|
if err != nil {
|
|
// Close the index file
|
|
ms.idxFile.Close()
|
|
// Remove it, and panic if we can't
|
|
if rmErr := os.Remove(fslice.idxFName); rmErr != nil {
|
|
panic(fmt.Errorf("Error during recovery of file %q: %v, you need "+
|
|
"to manually remove index file %q (remove failed with err: %v)",
|
|
fslice.fileName, err, fslice.idxFName, rmErr))
|
|
}
|
|
}
|
|
}
|
|
|
|
// If no error and slice is not empty...
|
|
if err == nil && fslice.msgsCount > 0 {
|
|
if ms.first == 0 || ms.first > fslice.firstSeq {
|
|
ms.first = fslice.firstSeq
|
|
}
|
|
if ms.last < fslice.lastSeq {
|
|
ms.last = fslice.lastSeq
|
|
}
|
|
ms.totalCount += fslice.msgsCount
|
|
ms.totalBytes += fslice.msgsSize
|
|
|
|
// File slices may be recovered in any order. When all slices
|
|
// are recovered the caller will open the last file slice. So
|
|
// close the files here since we don't know if this is going
|
|
// to be the last.
|
|
if err == nil {
|
|
err = ms.closeDataAndIndexFiles()
|
|
}
|
|
if err == nil {
|
|
// On success, add to the map of file slices and
|
|
// update first/last file slice sequence.
|
|
ms.files[fseq] = fslice
|
|
if ms.firstFSlSeq == 0 || ms.firstFSlSeq > fseq {
|
|
ms.firstFSlSeq = fseq
|
|
}
|
|
if ms.lastFSlSeq < fseq {
|
|
ms.lastFSlSeq = fseq
|
|
}
|
|
}
|
|
} else {
|
|
// We got an error, or this is an empty file slice which we
|
|
// didn't add to the map.
|
|
if cerr := ms.closeDataAndIndexFiles(); cerr != nil && err == nil {
|
|
err = cerr
|
|
}
|
|
}
|
|
return err
|
|
}
|
|
|
|
// setSliceLimits sets the limits of a file slice based on options and/or
|
|
// channel limits.
|
|
func (ms *FileMsgStore) setSliceLimits() {
|
|
// First set slice limits based on slice configuration.
|
|
ms.slCountLim = ms.fstore.opts.SliceMaxMsgs
|
|
ms.slSizeLim = uint64(ms.fstore.opts.SliceMaxBytes)
|
|
ms.slAgeLim = int64(ms.fstore.opts.SliceMaxAge)
|
|
// Did we configure any of the "dimension"?
|
|
ms.slHasLimits = ms.slCountLim > 0 || ms.slSizeLim > 0 || ms.slAgeLim > 0
|
|
|
|
// If so, we are done. We will use those limits to decide
|
|
// when to move to a new slice.
|
|
if ms.slHasLimits {
|
|
return
|
|
}
|
|
|
|
// Slice limits were not configured. We will set a limit based on channel limits.
|
|
if ms.limits.MaxMsgs > 0 {
|
|
limit := ms.limits.MaxMsgs / 4
|
|
if limit == 0 {
|
|
limit = 1
|
|
}
|
|
ms.slCountLim = limit
|
|
}
|
|
if ms.limits.MaxBytes > 0 {
|
|
limit := uint64(ms.limits.MaxBytes) / 4
|
|
if limit == 0 {
|
|
limit = 1
|
|
}
|
|
ms.slSizeLim = limit
|
|
}
|
|
if ms.limits.MaxAge > 0 {
|
|
limit := time.Duration(int64(ms.limits.MaxAge) / 4)
|
|
if limit < time.Second {
|
|
limit = time.Second
|
|
}
|
|
ms.slAgeLim = int64(limit)
|
|
}
|
|
// Refresh our view of slices having limits.
|
|
ms.slHasLimits = ms.slCountLim > 0 || ms.slSizeLim > 0 || ms.slAgeLim > 0
|
|
}
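// Worked example (added for clarity): with no slice options configured and
// channel limits of MaxMsgs=1000, MaxBytes=64MB and MaxAge=1h, the slice
// limits become slCountLim=250 messages, slSizeLim=16MB and slAgeLim=15min
// (a quarter of each channel limit, with floors of 1 message/byte and
// time.Second respectively).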
|
|
|
|
// writeIndex writes a message index record to the writer `w`
|
|
func (ms *FileMsgStore) writeIndex(w io.Writer, seq uint64, offset, timestamp int64, msgSize int) error {
|
|
_buf := [msgIndexRecSize]byte{}
|
|
buf := _buf[:]
|
|
ms.addIndex(buf, seq, offset, timestamp, msgSize)
|
|
_, err := w.Write(buf[:msgIndexRecSize])
|
|
return err
|
|
}
|
|
|
|
// addIndex adds a message index record in the given buffer
|
|
func (ms *FileMsgStore) addIndex(buf []byte, seq uint64, offset, timestamp int64, msgSize int) {
|
|
util.ByteOrder.PutUint64(buf, seq)
|
|
util.ByteOrder.PutUint64(buf[8:], uint64(offset))
|
|
util.ByteOrder.PutUint64(buf[16:], uint64(timestamp))
|
|
util.ByteOrder.PutUint32(buf[24:], uint32(msgSize))
|
|
crc := crc32.Checksum(buf[:msgIndexRecSize-crcSize], ms.fstore.crcTable)
|
|
util.ByteOrder.PutUint32(buf[msgIndexRecSize-crcSize:], crc)
|
|
}
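// Layout sketch (added for illustration): an index record is a fixed
// 32-byte entry written in util.ByteOrder:
//
//   bytes  0..7  : message sequence (uint64)
//   bytes  8..15 : offset of the message record in the data file (uint64)
//   bytes 16..23 : message timestamp in nanoseconds (uint64)
//   bytes 24..27 : size of the marshalled message (uint32)
//   bytes 28..31 : CRC-32 of the preceding 28 bytes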
|
|
|
|
// readIndex reads a message index record from the given reader
|
|
// and returns an allocated msgRecord object.
|
|
func (ms *FileMsgStore) readIndex(r io.Reader) (uint64, *msgRecord, error) {
|
|
_buf := [msgIndexRecSize]byte{}
|
|
buf := _buf[:]
|
|
if _, err := io.ReadFull(r, buf); err != nil {
|
|
return 0, nil, err
|
|
}
|
|
mrec := &msgRecord{}
|
|
seq := util.ByteOrder.Uint64(buf)
|
|
mrec.offset = int64(util.ByteOrder.Uint64(buf[8:]))
|
|
mrec.timestamp = int64(util.ByteOrder.Uint64(buf[16:]))
|
|
mrec.msgSize = util.ByteOrder.Uint32(buf[24:])
|
|
if ms.fstore.opts.DoCRC {
|
|
storedCRC := util.ByteOrder.Uint32(buf[msgIndexRecSize-crcSize:])
|
|
crc := crc32.Checksum(buf[:msgIndexRecSize-crcSize], ms.fstore.crcTable)
|
|
if storedCRC != crc {
|
|
return 0, nil, fmt.Errorf("corrupted data, expected crc to be 0x%08x, got 0x%08x", storedCRC, crc)
|
|
}
|
|
}
|
|
ms.msgs[seq] = mrec
|
|
return seq, mrec, nil
|
|
}
|
|
|
|
// Store a given message.
|
|
func (ms *FileMsgStore) Store(data []byte) (uint64, error) {
|
|
ms.Lock()
|
|
defer ms.Unlock()
|
|
|
|
fslice := ms.currSlice
|
|
|
|
// Check if we need to move to next file slice
|
|
if fslice == nil || ms.slHasLimits {
|
|
if fslice == nil ||
|
|
(ms.slSizeLim > 0 && fslice.msgsSize >= ms.slSizeLim) ||
|
|
(ms.slCountLim > 0 && fslice.msgsCount >= ms.slCountLim) ||
|
|
(ms.slAgeLim > 0 && atomic.LoadInt64(&ms.timeTick)-fslice.firstWrite >= ms.slAgeLim) {
|
|
|
|
// Don't change store variable until success...
|
|
newSliceSeq := ms.lastFSlSeq + 1
|
|
|
|
// Close the current file slice (if applicable) and open the next slice
|
|
if fslice != nil {
|
|
if err := ms.closeDataAndIndexFiles(); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
// Create new slice
|
|
datFName := filepath.Join(ms.rootDir, fmt.Sprintf("%s%v%s", msgFilesPrefix, newSliceSeq, datSuffix))
|
|
idxFName := filepath.Join(ms.rootDir, fmt.Sprintf("%s%v%s", msgFilesPrefix, newSliceSeq, idxSuffix))
|
|
// Open the new slice
|
|
if err := ms.openDataAndIndexFiles(datFName, idxFName); err != nil {
|
|
return 0, err
|
|
}
|
|
// Success, update the store's variables
|
|
newSlice := &fileSlice{fileName: datFName, idxFName: idxFName}
|
|
ms.files[newSliceSeq] = newSlice
|
|
ms.currSlice = newSlice
|
|
if ms.firstFSlSeq == 0 {
|
|
ms.firstFSlSeq = newSliceSeq
|
|
}
|
|
ms.lastFSlSeq = newSliceSeq
|
|
ms.wOffset = int64(4)
|
|
|
|
// If we added a second slice and the first slice was empty but not removed
|
|
// because it was the only one, we remove it now.
|
|
if len(ms.files) == 2 && fslice.msgsCount == fslice.rmCount {
|
|
ms.removeFirstSlice()
|
|
}
|
|
// Update the fslice reference to new slice for rest of function
|
|
fslice = ms.currSlice
|
|
}
|
|
}
|
|
|
|
seq := ms.last + 1
|
|
m := &pb.MsgProto{
|
|
Sequence: seq,
|
|
Subject: ms.subject,
|
|
Data: data,
|
|
Timestamp: time.Now().UnixNano(),
|
|
}
|
|
|
|
msgInBuffer := false
|
|
|
|
var recSize int
|
|
var err error
|
|
|
|
var bwBuf *bufio.Writer
|
|
if ms.bw != nil {
|
|
bwBuf = ms.bw.buf
|
|
}
|
|
msgSize := m.Size()
|
|
if bwBuf != nil {
|
|
required := msgSize + recordHeaderSize
|
|
if required > bwBuf.Available() {
|
|
ms.writer, err = ms.bw.expand(ms.file, required)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if err := ms.processBufferedMsgs(); err != nil {
|
|
return 0, err
|
|
}
|
|
// Refresh this since it has changed.
|
|
bwBuf = ms.bw.buf
|
|
}
|
|
}
|
|
ms.tmpMsgBuf, recSize, err = writeRecord(ms.writer, ms.tmpMsgBuf, recNoType, m, msgSize, ms.fstore.crcTable)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
mrec := &msgRecord{offset: ms.wOffset, timestamp: m.Timestamp, msgSize: uint32(msgSize)}
|
|
if bwBuf != nil {
|
|
// Check to see if we should cancel a buffer shrink request
|
|
if ms.bw.shrinkReq {
|
|
ms.bw.checkShrinkRequest()
|
|
}
|
|
// If message was added to the buffer we need to also save a reference
|
|
// to that message outside of the cache, until the buffer is flushed.
|
|
if bwBuf.Buffered() >= recSize {
|
|
ms.bufferedSeqs = append(ms.bufferedSeqs, seq)
|
|
ms.bufferedMsgs[seq] = &bufferedMsg{msg: m, rec: mrec}
|
|
msgInBuffer = true
|
|
}
|
|
}
|
|
// Message was flushed to disk, write corresponding index
|
|
if !msgInBuffer {
|
|
if err := ms.writeIndex(ms.idxFile, seq, ms.wOffset, m.Timestamp, msgSize); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
|
|
if ms.first == 0 || ms.first == seq {
|
|
// First ever message or after all messages expired and this is the
|
|
// first new message.
|
|
ms.first = seq
|
|
ms.firstMsg = m
|
|
if maxAge := ms.limits.MaxAge; maxAge > 0 {
|
|
ms.expiration = mrec.timestamp + int64(maxAge)
|
|
if len(ms.bkgTasksWake) == 0 {
|
|
ms.bkgTasksWake <- true
|
|
}
|
|
}
|
|
}
|
|
ms.last = seq
|
|
ms.lastMsg = m
|
|
ms.msgs[ms.last] = mrec
|
|
ms.addToCache(seq, m, true)
|
|
ms.wOffset += int64(recSize)
|
|
|
|
// For size, add the message record size, the record header and the size
|
|
// required for the corresponding index record.
|
|
size := uint64(msgSize + msgRecordOverhead)
|
|
|
|
// Total stats
|
|
ms.totalCount++
|
|
ms.totalBytes += size
|
|
|
|
// Stats per file slice
|
|
fslice.msgsCount++
|
|
fslice.msgsSize += size
|
|
if fslice.firstWrite == 0 {
|
|
fslice.firstWrite = m.Timestamp
|
|
}
|
|
|
|
// Save references to first and last sequences for this slice
|
|
if fslice.firstSeq == 0 {
|
|
fslice.firstSeq = seq
|
|
}
|
|
fslice.lastSeq = seq
|
|
|
|
if ms.limits.MaxMsgs > 0 || ms.limits.MaxBytes > 0 {
|
|
// Enforce limits and update file slice if needed.
|
|
if err := ms.enforceLimits(true); err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
return seq, nil
|
|
}
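// Usage sketch (not part of the original file): Store is normally reached
// through the ChannelStore returned by CreateChannel; the channel name and
// payload below are hypothetical.
//
//   cs, _, err := fs.CreateChannel("foo", nil)
//   if err != nil {
//       // handle error
//   }
//   seq, err := cs.Msgs.Store([]byte("hello"))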
|
|
|
|
// processBufferedMsgs adds message index records in the given buffer
|
|
// for every pending buffered message.
|
|
func (ms *FileMsgStore) processBufferedMsgs() error {
|
|
if len(ms.bufferedMsgs) == 0 {
|
|
return nil
|
|
}
|
|
idxBufferSize := len(ms.bufferedMsgs) * msgIndexRecSize
|
|
ms.tmpMsgBuf = util.EnsureBufBigEnough(ms.tmpMsgBuf, idxBufferSize)
|
|
bufOffset := 0
|
|
for _, pseq := range ms.bufferedSeqs {
|
|
bm := ms.bufferedMsgs[pseq]
|
|
if bm != nil {
|
|
mrec := bm.rec
|
|
// We add the index info for this flushed message
|
|
ms.addIndex(ms.tmpMsgBuf[bufOffset:], pseq, mrec.offset, mrec.timestamp, int(mrec.msgSize))
|
|
bufOffset += msgIndexRecSize
|
|
delete(ms.bufferedMsgs, pseq)
|
|
}
|
|
}
|
|
if bufOffset > 0 {
|
|
if _, err := ms.idxFile.Write(ms.tmpMsgBuf[:bufOffset]); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
ms.bufferedSeqs = ms.bufferedSeqs[:0]
|
|
return nil
|
|
}
|
|
|
|
// expireMsgs ensures that messages don't stay in the log longer than the
// limit's MaxAge.
// Returns the time of the next expiration (possibly 0 if no message left).
// The store's lock is assumed to be held on entry.
func (ms *FileMsgStore) expireMsgs(now, maxAge int64) int64 {
	for {
		m, hasMore := ms.msgs[ms.first]
		if !hasMore {
			ms.expiration = 0
			break
		}
		elapsed := now - m.timestamp
		if elapsed >= maxAge {
			ms.removeFirstMsg()
		} else {
			ms.expiration = now + (maxAge - elapsed)
			break
		}
	}
	return ms.expiration
}

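// Worked example (illustrative): with MaxAge of one hour and a first message
// stored twenty minutes ago, expireMsgs removes nothing and returns
//
//	ms.expiration = now + (maxAge - elapsed) // i.e. forty minutes from now
//
// The background tasks goroutine uses that value to decide when to call
// expireMsgs again.
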
// enforceLimits checks total counts with current msg store's limits,
// removing a file slice and/or updating slices' count as necessary.
func (ms *FileMsgStore) enforceLimits(reportHitLimit bool) error {
	// Check if we need to remove any (but leave at least the last added).
	// Note that we may have to remove more than one msg if we are here
	// after a restart with smaller limits than originally set, or if
	// message is quite big, etc...
	maxMsgs := ms.limits.MaxMsgs
	maxBytes := ms.limits.MaxBytes
	for ms.totalCount > 1 &&
		((maxMsgs > 0 && ms.totalCount > maxMsgs) ||
			(maxBytes > 0 && ms.totalBytes > uint64(maxBytes))) {

		// Remove first message from first slice, potentially removing
		// the slice, etc...
		ms.removeFirstMsg()
		if reportHitLimit && !ms.hitLimit {
			ms.hitLimit = true
			Noticef(droppingMsgsFmt, ms.subject, ms.totalCount, ms.limits.MaxMsgs, ms.totalBytes, ms.limits.MaxBytes)
		}
	}
	return nil
}

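// Worked example (illustrative): with MaxMsgs = 3 and sequences 1..3 already
// stored, storing sequence 4 makes totalCount exceed the limit, so the loop
// above calls removeFirstMsg() once (dropping sequence 1) and, the first time
// a limit is hit, logs the "dropping messages" notice. With MaxBytes set and
// a very large incoming message, several of the oldest messages may be
// removed in a single call.
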
// removeFirstMsg "removes" the first message of the first slice.
// If the slice is "empty" the file slice is removed.
func (ms *FileMsgStore) removeFirstMsg() {
	// Work with the first slice
	slice := ms.files[ms.firstFSlSeq]
	// Size of the first message in this slice
	firstMsgSize := ms.msgs[slice.firstSeq].msgSize
	// For size, we count the size of serialized message + record header +
	// the corresponding index record
	size := uint64(firstMsgSize + msgRecordOverhead)
	// Keep track of number of "removed" messages in this slice
	slice.rmCount++
	// Update total counts
	ms.totalCount--
	ms.totalBytes -= size
	// Remove the first message from the records map
	delete(ms.msgs, ms.first)
	// Messages sequence is incremental with no gap on a given msgstore.
	ms.first++
	// Invalidate ms.firstMsg, it will be looked-up on demand.
	ms.firstMsg = nil
	// Invalidate ms.lastMsg if it was the last message being removed.
	if ms.first > ms.last {
		ms.lastMsg = nil
	}
	// If the file slice is "empty" and is not the last one, remove it.
	if slice.msgsCount == slice.rmCount && len(ms.files) > 1 {
		ms.removeFirstSlice()
	} else {
		// This is the new first message in this slice.
		slice.firstSeq = ms.first
	}
}

// removeFirstSlice removes the first file slice.
// Should not be called if first slice is also last!
func (ms *FileMsgStore) removeFirstSlice() {
	sl := ms.files[ms.firstFSlSeq]
	// Close file that may have been opened due to lookups
	if sl.file != nil {
		sl.file.Close()
		sl.file = nil
	}
	// Assume we will remove the files
	remove := true
	// If there is an archive script invoke it first
	script := ms.fstore.opts.SliceArchiveScript
	if script != "" {
		datBak := sl.fileName + bakSuffix
		idxBak := sl.idxFName + bakSuffix

		var err error
		if err = os.Rename(sl.fileName, datBak); err == nil {
			if err = os.Rename(sl.idxFName, idxBak); err != nil {
				// Remove first backup file
				os.Remove(datBak)
			}
		}
		if err == nil {
			// Files have been successfully renamed, so don't attempt
			// to remove the original files.
			remove = false

			// We run the script in a go routine to not block the server.
			ms.allDone.Add(1)
			go func(subj, dat, idx string) {
				defer ms.allDone.Done()
				cmd := exec.Command(script, subj, dat, idx)
				output, err := cmd.CombinedOutput()
				if err != nil {
					Noticef("STAN: Error invoking archive script %q: %v (output=%v)", script, err, string(output))
				} else {
					Noticef("STAN: Output of archive script for %s (%s and %s): %v", subj, dat, idx, string(output))
				}
			}(ms.subject, datBak, idxBak)
		}
	}
	// Remove files
	if remove {
		os.Remove(sl.fileName)
		os.Remove(sl.idxFName)
	}
	// Remove slice from map
	delete(ms.files, ms.firstFSlSeq)
	// Normally, file slices have an incremental sequence number with
	// no gap. However, we want to support the fact that a user could
	// copy back some old file slice to be recovered, and so there
	// may be a gap. So find out what is the new first file sequence.
	for ms.firstFSlSeq < ms.lastFSlSeq {
		ms.firstFSlSeq++
		if _, ok := ms.files[ms.firstFSlSeq]; ok {
			break
		}
	}
	// This should not happen!
	if ms.firstFSlSeq > ms.lastFSlSeq {
		panic("Removed last slice!")
	}
}

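// Usage note (illustrative): when SliceArchiveScript is set, the slice's data
// and index files are first renamed with the ".bak" suffix and the script is
// then invoked asynchronously with three arguments, roughly:
//
//	/path/to/archive.sh <channel subject> <data file>.bak <index file>.bak
//
// (the script path above is a hypothetical example). The script is
// responsible for moving or deleting the ".bak" files; after a successful
// rename the server does not remove them itself.
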
// getFileForSeq returns the file where the message of the given sequence
// is stored. If the file is opened, a task is triggered to close this
// file when no longer used after a period of time.
func (ms *FileMsgStore) getFileForSeq(seq uint64) (*os.File, error) {
	if len(ms.files) == 0 {
		return nil, fmt.Errorf("no file slice for store %q, message seq: %v", ms.subject, seq)
	}
	// Start with current slice
	slice := ms.currSlice
	if (slice.firstSeq <= seq) && (seq <= slice.lastSeq) {
		return ms.file, nil
	}
	// We want to support possible gaps in the file slice sequence, so no
	// binary search here, just a simple iteration of the map (whose order
	// in Go is random).
	for _, slice := range ms.files {
		if (slice.firstSeq <= seq) && (seq <= slice.lastSeq) {
			file := slice.file
			if file == nil {
				var err error
				file, err = openFile(slice.fileName)
				if err != nil {
					return nil, fmt.Errorf("unable to open file %q: %v", slice.fileName, err)
				}
				slice.file = file
				// Let the background task know that we have opened a slice
				atomic.StoreInt64(&ms.checkSlices, 1)
			}
			slice.lastUsed = atomic.LoadInt64(&ms.timeTick)
			return file, nil
		}
	}
	return nil, fmt.Errorf("could not find file slice for store %q, message seq: %v", ms.subject, seq)
}

// backgroundTasks performs some background tasks related to this
// messages store.
func (ms *FileMsgStore) backgroundTasks() {
	defer ms.allDone.Done()

	ms.RLock()
	hasBuffer := ms.bw != nil
	maxAge := int64(ms.limits.MaxAge)
	nextExpiration := ms.expiration
	lastCacheCheck := ms.timeTick
	lastBufShrink := ms.timeTick
	ms.RUnlock()

	for {
		// Update time
		timeTick := time.Now().UnixNano()
		atomic.StoreInt64(&ms.timeTick, timeTick)

		// Close unused file slices
		if atomic.LoadInt64(&ms.checkSlices) == 1 {
			ms.Lock()
			opened := 0
			for _, slice := range ms.files {
				if slice.file != nil {
					opened++
					if slice.lastUsed < timeTick && time.Duration(timeTick-slice.lastUsed) >= time.Second {
						slice.file.Close()
						slice.file = nil
						opened--
					}
				}
			}
			if opened == 0 {
				// We can update this without atomic since we are under store lock
				// and this go routine is the only place where we check the value.
				ms.checkSlices = 0
			}
			ms.Unlock()
		}

		// Shrink the buffer if applicable
		if hasBuffer && time.Duration(timeTick-lastBufShrink) >= bufShrinkInterval {
			ms.Lock()
			ms.writer, _ = ms.bw.tryShrinkBuffer(ms.file)
			ms.Unlock()
			lastBufShrink = timeTick
		}

		// Check for expiration
		if maxAge > 0 && nextExpiration > 0 && timeTick >= nextExpiration {
			ms.Lock()
			// Expire messages
			nextExpiration = ms.expireMsgs(timeTick, maxAge)
			ms.Unlock()
		}

		// Check for message caching
		if timeTick >= lastCacheCheck+cacheTTL {
			tryEvict := atomic.LoadInt32(&ms.cache.tryEvict)
			if tryEvict == 1 {
				ms.Lock()
				// Possibly remove some/all cached messages
				ms.evictFromCache(timeTick)
				ms.Unlock()
			}
			lastCacheCheck = timeTick
		}

		select {
		case <-ms.bkgTasksDone:
			return
		case <-ms.bkgTasksWake:
			// Wake up from a possible sleep to run the loop.
			ms.RLock()
			nextExpiration = ms.expiration
			ms.RUnlock()
		case <-time.After(bkgTasksSleepDuration):
			// Go back to top of for loop.
		}
	}
}

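// Example (illustrative): a file slice opened by a lookup sets checkSlices to
// 1 and records its lastUsed tick. On a later pass of the loop above, a slice
// that has been idle for at least one second is closed again, and once no
// slice remains open checkSlices is reset to 0 so the scan is skipped until
// the next lookup opens one.
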
// lookup returns the message for the given sequence number, possibly
// reading the message from disk.
// Store write lock is assumed to be held on entry.
func (ms *FileMsgStore) lookup(seq uint64) *pb.MsgProto {
	var msg *pb.MsgProto
	m := ms.msgs[seq]
	if m != nil {
		msg = ms.getFromCache(seq)
		if msg == nil && ms.bufferedMsgs != nil {
			// Possibly in bufferedMsgs
			bm := ms.bufferedMsgs[seq]
			if bm != nil {
				msg = bm.msg
				ms.addToCache(seq, msg, false)
			}
		}
		if msg == nil {
			var msgSize int
			// Look in which file slice the message is located.
			file, err := ms.getFileForSeq(seq)
			if err != nil {
				return nil
			}
			// Position file to message's offset. 0 means from start.
			if _, err := file.Seek(m.offset, 0); err != nil {
				return nil
			}
			ms.tmpMsgBuf, msgSize, _, err = readRecord(file, ms.tmpMsgBuf, false, ms.fstore.crcTable, ms.fstore.opts.DoCRC)
			if err != nil {
				return nil
			}
			// Recover this message
			msg = &pb.MsgProto{}
			err = msg.Unmarshal(ms.tmpMsgBuf[:msgSize])
			if err != nil {
				return nil
			}
			ms.addToCache(seq, msg, false)
		}
	}
	return msg
}

// Lookup returns the stored message with given sequence number.
func (ms *FileMsgStore) Lookup(seq uint64) *pb.MsgProto {
	ms.Lock()
	msg := ms.lookup(seq)
	ms.Unlock()
	return msg
}

// FirstMsg returns the first message stored.
func (ms *FileMsgStore) FirstMsg() *pb.MsgProto {
	ms.RLock()
	if ms.firstMsg == nil {
		ms.firstMsg = ms.lookup(ms.first)
	}
	m := ms.firstMsg
	ms.RUnlock()
	return m
}

// LastMsg returns the last message stored.
func (ms *FileMsgStore) LastMsg() *pb.MsgProto {
	ms.RLock()
	if ms.lastMsg == nil {
		ms.lastMsg = ms.lookup(ms.last)
	}
	m := ms.lastMsg
	ms.RUnlock()
	return m
}

// GetSequenceFromTimestamp returns the sequence of the first message whose
// timestamp is greater or equal to given timestamp.
func (ms *FileMsgStore) GetSequenceFromTimestamp(timestamp int64) uint64 {
	ms.RLock()
	defer ms.RUnlock()

	index := sort.Search(len(ms.msgs), func(i int) bool {
		if ms.msgs[uint64(i)+ms.first].timestamp >= timestamp {
			return true
		}
		return false
	})

	return uint64(index) + ms.first
}

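// Note (illustrative): the binary search above relies on two properties of
// this store: sequences are contiguous starting at ms.first (no gaps), and
// timestamps are non-decreasing with the sequence. For example, with
// sequences 10..14 stored, a timestamp between those of 11 and 12 returns 12,
// and a timestamp newer than that of 14 returns ms.first + len(ms.msgs),
// i.e. last+1.
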
// initCache initializes the message cache
func (ms *FileMsgStore) initCache() {
	ms.cache = &msgsCache{
		seqMaps: make(map[uint64]*cachedMsg),
	}
}

// addToCache adds a message to the cache.
// Store write lock is assumed held on entry.
func (ms *FileMsgStore) addToCache(seq uint64, msg *pb.MsgProto, isNew bool) {
	c := ms.cache
	exp := cacheTTL
	if isNew {
		exp += msg.Timestamp
	} else {
		exp += time.Now().UnixNano()
	}
	cMsg := &cachedMsg{
		expiration: exp,
		msg:        msg,
	}
	if c.tail == nil {
		c.head = cMsg
	} else {
		c.tail.next = cMsg
	}
	cMsg.prev = c.tail
	c.tail = cMsg
	c.seqMaps[seq] = cMsg
	if len(c.seqMaps) == 1 {
		atomic.StoreInt32(&c.tryEvict, 1)
	}
}

// getFromCache returns a message if available in the cache.
// Store write lock is assumed held on entry.
func (ms *FileMsgStore) getFromCache(seq uint64) *pb.MsgProto {
	c := ms.cache
	cMsg := c.seqMaps[seq]
	if cMsg == nil {
		return nil
	}
	if cMsg != c.tail {
		if cMsg.prev != nil {
			cMsg.prev.next = cMsg.next
		}
		if cMsg.next != nil {
			cMsg.next.prev = cMsg.prev
		}
		if cMsg == c.head {
			c.head = cMsg.next
		}
		cMsg.prev = c.tail
		cMsg.next = nil
		c.tail = cMsg
	}
	cMsg.expiration = time.Now().UnixNano() + cacheTTL
	return cMsg.msg
}

// evictFromCache walks the cache from the head, evicting expired messages.
// Store write lock is assumed held on entry.
func (ms *FileMsgStore) evictFromCache(now int64) {
	c := ms.cache
	if now >= c.tail.expiration {
		// Bulk remove
		c.seqMaps = make(map[uint64]*cachedMsg)
		c.head, c.tail, c.tryEvict = nil, nil, 0
		return
	}
	cMsg := c.head
	for cMsg != nil && cMsg.expiration <= now {
		delete(c.seqMaps, cMsg.msg.Sequence)
		cMsg = cMsg.next
	}
	if cMsg != c.head {
		// There should be at least one left, otherwise, they
		// would all have been bulk removed at top of this function.
		cMsg.prev = nil
		c.head = cMsg
	}
}

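// Design note (illustrative): the cache is a doubly linked list ordered by
// last access (head is the oldest entry, tail the most recently used), plus a
// map from sequence to node. addToCache appends at the tail, getFromCache
// moves a hit to the tail and extends its TTL, and evictFromCache trims
// expired entries from the head, or clears everything at once when even the
// tail has expired.
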
// Close closes the store.
func (ms *FileMsgStore) Close() error {
	ms.Lock()
	if ms.closed {
		ms.Unlock()
		return nil
	}

	ms.closed = true

	var err error
	// Close file slices that may have been opened due to
	// message lookups.
	for _, slice := range ms.files {
		if slice.file != nil {
			if lerr := slice.file.Close(); lerr != nil && err == nil {
				err = lerr
			}
		}
	}
	// Flush and close current files
	if ms.currSlice != nil {
		if lerr := ms.closeDataAndIndexFiles(); lerr != nil && err == nil {
			err = lerr
		}
	}
	// Signal the background tasks go-routine to exit
	ms.bkgTasksDone <- true

	ms.Unlock()

	// Wait on go routines/timers to finish
	ms.allDone.Wait()

	return err
}

func (ms *FileMsgStore) flush() error {
	if ms.bw != nil && ms.bw.buf != nil && ms.bw.buf.Buffered() > 0 {
		if err := ms.bw.buf.Flush(); err != nil {
			return err
		}
		if err := ms.processBufferedMsgs(); err != nil {
			return err
		}
	}
	if ms.fstore.opts.DoSync {
		if err := ms.file.Sync(); err != nil {
			return err
		}
		if err := ms.idxFile.Sync(); err != nil {
			return err
		}
	}
	return nil
}

// Flush flushes outstanding data into the store.
func (ms *FileMsgStore) Flush() error {
	ms.Lock()
	err := ms.flush()
	ms.Unlock()
	return err
}

////////////////////////////////////////////////////////////////////////////
// FileSubStore methods
////////////////////////////////////////////////////////////////////////////

// newFileSubStore returns a new instance of a file SubStore.
func (fs *FileStore) newFileSubStore(channelDirName, channel string, doRecover bool) (*FileSubStore, error) {
	ss := &FileSubStore{
		rootDir:  channelDirName,
		subs:     make(map[uint64]*subscription),
		opts:     &fs.opts,
		crcTable: fs.crcTable,
	}
	// Defaults to the global limits
	subStoreLimits := fs.limits.SubStoreLimits
	// See if there is an override
	thisChannelLimits, exists := fs.limits.PerChannel[channel]
	if exists {
		// Use this channel specific limits
		subStoreLimits = thisChannelLimits.SubStoreLimits
	}
	ss.init(channel, &subStoreLimits)
	// Convert the CompactInterval in time.Duration
	ss.compactItvl = time.Duration(ss.opts.CompactInterval) * time.Second

	var err error

	fileName := filepath.Join(channelDirName, subsFileName)
	ss.file, err = openFile(fileName)
	if err != nil {
		return nil, err
	}
	maxBufSize := ss.opts.BufferSize
	// This needs to be done before the call to ss.setWriter()
	if maxBufSize > 0 {
		ss.bw = newBufferWriter(subBufMinShrinkSize, maxBufSize)
	}
	ss.setWriter()
	if doRecover {
		if err := ss.recoverSubscriptions(); err != nil {
			ss.Close()
			return nil, fmt.Errorf("unable to create subscription store for [%s]: %v", channel, err)
		}
	}
	// Do not attempt to shrink unless the option is greater than the
	// minimum shrinkable size.
	if maxBufSize > subBufMinShrinkSize {
		// Use lock to avoid RACE report between setting shrinkTimer and
		// execution of the callback itself.
		ss.Lock()
		ss.allDone.Add(1)
		ss.shrinkTimer = time.AfterFunc(bufShrinkInterval, ss.shrinkBuffer)
		ss.Unlock()
	}
	return ss, nil
}

// setWriter sets the writer to either the file or the buffered writer
// (creating the latter if needed), based on the store options.
func (ss *FileSubStore) setWriter() {
	ss.writer = ss.file
	if ss.bw != nil {
		ss.writer = ss.bw.createNewWriter(ss.file)
	}
}

// shrinkBuffer is a timer callback that shrinks the buffer writer when possible.
func (ss *FileSubStore) shrinkBuffer() {
	ss.Lock()
	defer ss.Unlock()

	if ss.closed {
		ss.allDone.Done()
		return
	}

	// If error, the buffer (in bufio) memorizes the error
	// so any other write/flush on that buffer will fail. We will get the
	// error at the next "synchronous" operation where we can report back
	// to the user.
	ss.writer, _ = ss.bw.tryShrinkBuffer(ss.file)

	// Fire again
	ss.shrinkTimer.Reset(bufShrinkInterval)
}

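// Note (illustrative): when BufferSize > 0 the subscriptions writer goes
// through a buffered writer that starts small and is expanded on demand (see
// writeRecord below); shrinkBuffer runs on a timer and tries to shrink it
// back when possible. With BufferSize == 0 records are written directly to
// the file.
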
// recoverSubscriptions recovers subscriptions state for this store.
func (ss *FileSubStore) recoverSubscriptions() error {
	var err error
	var recType recordType

	recSize := 0
	// Create a buffered reader to speed-up recovery
	br := bufio.NewReaderSize(ss.file, defaultBufSize)

	for {
		ss.tmpSubBuf, recSize, recType, err = readRecord(br, ss.tmpSubBuf, true, ss.crcTable, ss.opts.DoCRC)
		if err != nil {
			if err == io.EOF {
				// We are done, reset err
				err = nil
				break
			} else {
				return err
			}
		}
		ss.fileSize += int64(recSize + recordHeaderSize)
		// Based on record type...
		switch recType {
		case subRecNew:
			newSub := &spb.SubState{}
			if err := newSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
				return err
			}
			sub := &subscription{
				sub:    newSub,
				seqnos: make(map[uint64]struct{}),
			}
			ss.subs[newSub.ID] = sub
			// Keep track of the subscriptions count
			ss.subsCount++
			// Keep track of max subscription ID found.
			if newSub.ID > ss.maxSubID {
				ss.maxSubID = newSub.ID
			}
			ss.numRecs++
		case subRecUpdate:
			modifiedSub := &spb.SubState{}
			if err := modifiedSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
				return err
			}
			// Search if the create has been recovered.
			sub, exists := ss.subs[modifiedSub.ID]
			if exists {
				sub.sub = modifiedSub
				// An update means that the previous version is free space.
				ss.delRecs++
			} else {
				sub := &subscription{
					sub:    modifiedSub,
					seqnos: make(map[uint64]struct{}),
				}
				ss.subs[modifiedSub.ID] = sub
			}
			// Keep track of max subscription ID found.
			if modifiedSub.ID > ss.maxSubID {
				ss.maxSubID = modifiedSub.ID
			}
			ss.numRecs++
		case subRecDel:
			delSub := spb.SubStateDelete{}
			if err := delSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
				return err
			}
			if s, exists := ss.subs[delSub.ID]; exists {
				delete(ss.subs, delSub.ID)
				// Keep track of the subscriptions count
				ss.subsCount--
				// Delete and count all non-ack'ed messages free space.
				ss.delRecs++
				ss.delRecs += len(s.seqnos)
			}
			// Keep track of max subscription ID found.
			if delSub.ID > ss.maxSubID {
				ss.maxSubID = delSub.ID
			}
		case subRecMsg:
			updateSub := spb.SubStateUpdate{}
			if err := updateSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
				return err
			}
			if sub, exists := ss.subs[updateSub.ID]; exists {
				seqno := updateSub.Seqno
				// Same seqno/ack can appear several times for the same sub.
				// See queue subscribers redelivery.
				if seqno > sub.sub.LastSent {
					sub.sub.LastSent = seqno
				}
				sub.seqnos[seqno] = struct{}{}
				ss.numRecs++
			}
		case subRecAck:
			updateSub := spb.SubStateUpdate{}
			if err := updateSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
				return err
			}
			if sub, exists := ss.subs[updateSub.ID]; exists {
				delete(sub.seqnos, updateSub.Seqno)
				// A message is ack'ed
				ss.delRecs++
			}
		default:
			return fmt.Errorf("unexpected record type: %v", recType)
		}
	}
	return nil
}

// CreateSub records a new subscription represented by SubState. On success,
// it returns an id that is used by the other methods.
func (ss *FileSubStore) CreateSub(sub *spb.SubState) error {
	// Check if we can create the subscription (check limits and update
	// subscription count)
	ss.Lock()
	defer ss.Unlock()
	if err := ss.createSub(sub); err != nil {
		return err
	}
	if err := ss.writeRecord(ss.writer, subRecNew, sub); err != nil {
		return err
	}
	// We need to get a copy of the passed sub, we can't hold a reference
	// to it.
	csub := *sub
	s := &subscription{sub: &csub, seqnos: make(map[uint64]struct{})}
	ss.subs[sub.ID] = s
	return nil
}

// UpdateSub updates a given subscription represented by SubState.
func (ss *FileSubStore) UpdateSub(sub *spb.SubState) error {
	ss.Lock()
	defer ss.Unlock()
	if err := ss.writeRecord(ss.writer, subRecUpdate, sub); err != nil {
		return err
	}
	// We need to get a copy of the passed sub, we can't hold a reference
	// to it.
	csub := *sub
	s := ss.subs[sub.ID]
	if s != nil {
		s.sub = &csub
	} else {
		s := &subscription{sub: &csub, seqnos: make(map[uint64]struct{})}
		ss.subs[sub.ID] = s
	}
	return nil
}

// DeleteSub invalidates this subscription.
func (ss *FileSubStore) DeleteSub(subid uint64) {
	ss.Lock()
	ss.delSub.ID = subid
	ss.writeRecord(ss.writer, subRecDel, &ss.delSub)
	if s, exists := ss.subs[subid]; exists {
		delete(ss.subs, subid)
		// writeRecord has already accounted for the count of the
		// delete record. We add to this the number of pending messages
		ss.delRecs += len(s.seqnos)
		// Check if this triggers a need for compaction
		if ss.shouldCompact() {
			ss.compact()
		}
	}
	ss.Unlock()
}

// shouldCompact returns a boolean indicating if we should compact.
// Lock is held by caller.
func (ss *FileSubStore) shouldCompact() bool {
	// Global switch
	if !ss.opts.CompactEnabled {
		return false
	}
	// Check that if a minimum file size is set, the subscriptions file
	// is at least at the minimum.
	if ss.opts.CompactMinFileSize > 0 && ss.fileSize < ss.opts.CompactMinFileSize {
		return false
	}
	// Check fragmentation
	frag := 0
	if ss.numRecs == 0 {
		frag = 100
	} else {
		frag = ss.delRecs * 100 / ss.numRecs
	}
	if frag < ss.opts.CompactFragmentation {
		return false
	}
	// Check that we don't compact too often
	if time.Now().Sub(ss.compactTS) < ss.compactItvl {
		return false
	}
	return true
}

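// Worked example (illustrative): with numRecs = 200 records written and
// delRecs = 120 records known to be free space, frag = 120*100/200 = 60.
// With CompactFragmentation = 50 and CompactMinFileSize satisfied,
// compaction is allowed provided at least CompactInterval has elapsed since
// the previous successful compact.
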
// AddSeqPending adds the given message seqno to the given subscription.
func (ss *FileSubStore) AddSeqPending(subid, seqno uint64) error {
	ss.Lock()
	ss.updateSub.ID, ss.updateSub.Seqno = subid, seqno
	if err := ss.writeRecord(ss.writer, subRecMsg, &ss.updateSub); err != nil {
		ss.Unlock()
		return err
	}
	s := ss.subs[subid]
	if s != nil {
		if seqno > s.sub.LastSent {
			s.sub.LastSent = seqno
		}
		s.seqnos[seqno] = struct{}{}
	}
	ss.Unlock()
	return nil
}

// AckSeqPending records that the given message seqno has been acknowledged
// by the given subscription.
func (ss *FileSubStore) AckSeqPending(subid, seqno uint64) error {
	ss.Lock()
	ss.updateSub.ID, ss.updateSub.Seqno = subid, seqno
	if err := ss.writeRecord(ss.writer, subRecAck, &ss.updateSub); err != nil {
		ss.Unlock()
		return err
	}
	s := ss.subs[subid]
	if s != nil {
		delete(s.seqnos, seqno)
		// Test if we should compact
		if ss.shouldCompact() {
			ss.compact()
		}
	}
	ss.Unlock()
	return nil
}

// compact rewrites all subscriptions on a temporary file, reducing the size
// since we get rid of deleted subscriptions and message sequences that have
// been acknowledged. On success, the subscriptions file is replaced by this
// temporary file.
// Lock is held by caller.
func (ss *FileSubStore) compact() error {
	tmpFile, err := getTempFile(ss.rootDir, "subs")
	if err != nil {
		return err
	}
	tmpBW := bufio.NewWriterSize(tmpFile, defaultBufSize)
	// Save values in case of failed compaction
	savedNumRecs := ss.numRecs
	savedDelRecs := ss.delRecs
	savedFileSize := ss.fileSize
	// Cleanup in case of error during compact
	defer func() {
		if tmpFile != nil {
			tmpFile.Close()
			os.Remove(tmpFile.Name())
			// Since we failed compaction, restore values
			ss.numRecs = savedNumRecs
			ss.delRecs = savedDelRecs
			ss.fileSize = savedFileSize
		}
	}()
	// Reset to 0 since writeRecord() is updating the values.
	ss.numRecs = 0
	ss.delRecs = 0
	ss.fileSize = 0
	for _, sub := range ss.subs {
		err = ss.writeRecord(tmpBW, subRecNew, sub.sub)
		if err != nil {
			return err
		}
		ss.updateSub.ID = sub.sub.ID
		for seqno := range sub.seqnos {
			ss.updateSub.Seqno = seqno
			err = ss.writeRecord(tmpBW, subRecMsg, &ss.updateSub)
			if err != nil {
				return err
			}
		}
	}
	// Flush and sync the temporary file
	err = tmpBW.Flush()
	if err != nil {
		return err
	}
	err = tmpFile.Sync()
	if err != nil {
		return err
	}
	// Switch the temporary file with the original one.
	ss.file, err = swapFiles(tmpFile, ss.file)
	if err != nil {
		return err
	}
	// Prevent cleanup on success
	tmpFile = nil

	// Set the file and create buffered writer if applicable
	ss.setWriter()
	// Update the timestamp of this last successful compact
	ss.compactTS = time.Now()
	return nil
}

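// Note (illustrative): the compacted file contains exactly one subRecNew
// record per live subscription plus one subRecMsg record per still-pending
// sequence; delete, update and ack records are not carried over, which is
// what reclaims the space previously counted in delRecs.
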
// writeRecord writes a record to the subscriptions file.
// Store's lock is held on entry.
func (ss *FileSubStore) writeRecord(w io.Writer, recType recordType, rec record) error {
	var err error
	totalSize := 0
	recSize := rec.Size()

	var bwBuf *bufio.Writer
	if ss.bw != nil && w == ss.bw.buf {
		bwBuf = ss.bw.buf
	}
	// If we are using the buffer writer on this call, and the buffer is
	// not already at the max size...
	if bwBuf != nil && ss.bw.bufSize != ss.opts.BufferSize {
		// Check if record fits
		required := recSize + recordHeaderSize
		if required > bwBuf.Available() {
			ss.writer, err = ss.bw.expand(ss.file, required)
			if err != nil {
				return err
			}
			// `w` is used in this function, so point it to the new buffer
			bwBuf = ss.bw.buf
			w = bwBuf
		}
	}
	ss.tmpSubBuf, totalSize, err = writeRecord(w, ss.tmpSubBuf, recType, rec, recSize, ss.crcTable)
	if err != nil {
		return err
	}
	if bwBuf != nil && ss.bw.shrinkReq {
		ss.bw.checkShrinkRequest()
	}
	// Indicate that we wrote something to the buffer/file
	ss.activity = true
	switch recType {
	case subRecNew:
		ss.numRecs++
	case subRecMsg:
		ss.numRecs++
	case subRecAck:
		// An ack makes the message record free space
		ss.delRecs++
	case subRecUpdate:
		ss.numRecs++
		// An update makes the old record free space
		ss.delRecs++
	case subRecDel:
		ss.delRecs++
	default:
		panic(fmt.Errorf("Record type %v unknown", recType))
	}
	ss.fileSize += int64(totalSize)
	return nil
}

func (ss *FileSubStore) flush() error {
	// Skip this if nothing was written since the last flush
	if !ss.activity {
		return nil
	}
	// Reset this now
	ss.activity = false
	if ss.bw != nil && ss.bw.buf.Buffered() > 0 {
		if err := ss.bw.buf.Flush(); err != nil {
			return err
		}
	}
	if ss.opts.DoSync {
		return ss.file.Sync()
	}
	return nil
}

// Flush persists buffered operations to disk.
func (ss *FileSubStore) Flush() error {
	ss.Lock()
	err := ss.flush()
	ss.Unlock()
	return err
}

// Close closes this store.
func (ss *FileSubStore) Close() error {
	ss.Lock()
	if ss.closed {
		ss.Unlock()
		return nil
	}

	ss.closed = true

	var err error
	if ss.file != nil {
		err = ss.flush()
		if lerr := ss.file.Close(); lerr != nil && err == nil {
			err = lerr
		}
	}
	if ss.shrinkTimer != nil {
		if ss.shrinkTimer.Stop() {
			// If we can stop, timer callback won't fire,
			// so we need to decrement the wait group.
			ss.allDone.Done()
		}
	}
	ss.Unlock()

	// Wait on timers/callbacks
	ss.allDone.Wait()

	return err
}