containerd/vendor/github.com/nats-io/nats-streaming-server/stores/filestore.go

// Copyright 2016 Apcera Inc. All rights reserved.
package stores
import (
"bufio"
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/nats-io/go-nats-streaming/pb"
"github.com/nats-io/nats-streaming-server/spb"
"github.com/nats-io/nats-streaming-server/util"
)
const (
// Our file version.
fileVersion = 1
// Prefix for message log files
msgFilesPrefix = "msgs."
// Data files suffix
datSuffix = ".dat"
// Index files suffix
idxSuffix = ".idx"
// Backup file suffix
bakSuffix = ".bak"
// Name of the subscriptions file.
subsFileName = "subs" + datSuffix
// Name of the clients file.
clientsFileName = "clients" + datSuffix
// Name of the server file.
serverFileName = "server" + datSuffix
// Number of bytes required to store a CRC-32 checksum
crcSize = crc32.Size
// Size of a record header.
// 4 bytes: for typed records, 1 byte for the type and 3 bytes for the buffer size;
// for non-typed records, all 4 bytes hold the buffer size
// +4 bytes for CRC-32
recordHeaderSize = 4 + crcSize
// defaultBufSize is used for various buffered IO operations
defaultBufSize = 10 * 1024 * 1024
// Size of a message index record
// Seq - Offset - Timestamp - Size - CRC
msgIndexRecSize = 8 + 8 + 8 + 4 + crcSize
// msgRecordOverhead is the number of bytes to count toward the size
// of a serialized message so that file slice size is closer to
// channels and/or file slice limits.
msgRecordOverhead = recordHeaderSize + msgIndexRecSize
// Percentage of buffer usage to decide if the buffer should shrink
bufShrinkThreshold = 50
// Interval at which to check/try to shrink buffer writers
defaultBufShrinkInterval = 5 * time.Second
// If FileStoreOption's BufferSize is > 0, the buffer writer is initially
// created with this size (unless this is greater than BufferSize, in which
// case BufferSize is used). When possible, the buffer will shrink but not
// lower than this value. This is for FileSubStore's buffer.
subBufMinShrinkSize = 128
// If FileStoreOption's BufferSize is > 0, the buffer writer is initially
// created with this size (unless this is greater than BufferSize, in which
// case BufferSize is used). When possible, the buffer will shrink but not
// lower than this value. This is for FileMsgStore's buffer.
msgBufMinShrinkSize = 512
// This is the sleep time in the background tasks go routine.
defaultBkgTasksSleepDuration = time.Second
// This is the default amount of time a message is cached.
defaultCacheTTL = time.Second
)
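// For reference, with crcSize being 4 bytes (crc32.Size), the constants above
// work out to: recordHeaderSize = 4 + 4 = 8 bytes, msgIndexRecSize =
// 8 + 8 + 8 + 4 + 4 = 32 bytes, and msgRecordOverhead = 8 + 32 = 40 bytes
// counted per stored message on top of its serialized size.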
// FileStoreOption is a function on the options for a File Store
type FileStoreOption func(*FileStoreOptions) error
// FileStoreOptions can be used to customize a File Store
type FileStoreOptions struct {
// BufferSize is the size of the buffer used during store operations.
BufferSize int
// CompactEnabled allows enabling/disabling file compaction.
CompactEnabled bool
// CompactInterval indicates the minimum interval (in seconds) between compactions.
CompactInterval int
// CompactFragmentation indicates the minimum ratio of fragmentation
// to trigger compaction. For instance, 50 means that compaction
// would not happen until fragmentation is more than 50%.
CompactFragmentation int
// CompactMinFileSize indicates the minimum file size before compaction
// can be performed, regardless of the current file fragmentation.
CompactMinFileSize int64
// DoCRC enables (or disables) CRC checksum verification on read operations.
DoCRC bool
// CRCPolynomial is the polynomial used to make the table used in the CRC computation.
CRCPolynomial int64
// DoSync indicates if `File.Sync()` is called during a flush.
DoSync bool
// Regardless of channel limits, the options below allow splitting a message
// log into smaller file chunks. If all these options are set to 0, a file
// slice limit will be selected automatically based on the channel limits.
// SliceMaxMsgs defines how many messages can fit in a file slice (0 means
// count is not checked).
SliceMaxMsgs int
// SliceMaxBytes defines how many bytes can fit in a file slice, including
// the corresponding index file (0 means size is not checked).
SliceMaxBytes int64
// SliceMaxAge defines the period of time covered by a slice starting when
// the first message is stored (0 means time is not checked).
SliceMaxAge time.Duration
// SliceArchiveScript is the path to a script to be invoked when a file
// slice (and the corresponding index file) is going to be removed.
// The script will be invoked with the channel name and names of data and
// index files (which both have been previously renamed with a '.bak'
// extension). It is the responsibility of the script to move/remove
// those files.
SliceArchiveScript string
}
// DefaultFileStoreOptions defines the default options for a File Store.
var DefaultFileStoreOptions = FileStoreOptions{
BufferSize: 2 * 1024 * 1024, // 2MB
CompactEnabled: true,
CompactInterval: 5 * 60, // 5 minutes
CompactFragmentation: 50,
CompactMinFileSize: 1024 * 1024,
DoCRC: true,
CRCPolynomial: int64(crc32.IEEE),
DoSync: true,
SliceMaxBytes: 64 * 1024 * 1024, // 64MB
}
// BufferSize is a FileStore option that sets the size of the buffer used
// during store writes. This can help improve write performance.
func BufferSize(size int) FileStoreOption {
return func(o *FileStoreOptions) error {
o.BufferSize = size
return nil
}
}
// CompactEnabled is a FileStore option that enables or disables file compaction.
// The value false will disable compaction.
func CompactEnabled(enabled bool) FileStoreOption {
return func(o *FileStoreOptions) error {
o.CompactEnabled = enabled
return nil
}
}
// CompactInterval is a FileStore option that defines the minimum compaction interval.
// Compaction is not timer based; it is triggered when records get "deleted". This value
// prevents compaction from happening too often.
func CompactInterval(seconds int) FileStoreOption {
return func(o *FileStoreOptions) error {
o.CompactInterval = seconds
return nil
}
}
// CompactFragmentation is a FileStore option that defines the fragmentation ratio
// below which compaction would not occur. For instance, specifying 50 means that
// if other variables would allow for compaction, the compaction would occur only
// after 50% of the file has data that is no longer valid.
func CompactFragmentation(fragmentation int) FileStoreOption {
return func(o *FileStoreOptions) error {
o.CompactFragmentation = fragmentation
return nil
}
}
// CompactMinFileSize is a FileStore option that defines the minimum file size below
// which compaction would not occur. Specify `-1` if you don't want any minimum.
func CompactMinFileSize(fileSize int64) FileStoreOption {
return func(o *FileStoreOptions) error {
o.CompactMinFileSize = fileSize
return nil
}
}
// DoCRC is a FileStore option that defines if a CRC checksum verification should
// be performed when records are read from disk.
func DoCRC(enableCRC bool) FileStoreOption {
return func(o *FileStoreOptions) error {
o.DoCRC = enableCRC
return nil
}
}
// CRCPolynomial is a FileStore option that defines the polynomial to use to create
// the table used for CRC-32 Checksum.
// See https://golang.org/pkg/hash/crc32/#MakeTable
func CRCPolynomial(polynomial int64) FileStoreOption {
return func(o *FileStoreOptions) error {
o.CRCPolynomial = polynomial
return nil
}
}
// DoSync is a FileStore option that defines if `File.Sync()` should be called
// during a `Flush()` call.
func DoSync(enableFileSync bool) FileStoreOption {
return func(o *FileStoreOptions) error {
o.DoSync = enableFileSync
return nil
}
}
// SliceConfig is a FileStore option that allows the configuration of
// file slice limits and optional archive script file name.
func SliceConfig(maxMsgs int, maxBytes int64, maxAge time.Duration, script string) FileStoreOption {
return func(o *FileStoreOptions) error {
o.SliceMaxMsgs = maxMsgs
o.SliceMaxBytes = maxBytes
o.SliceMaxAge = maxAge
o.SliceArchiveScript = script
return nil
}
}
// AllOptions is a convenient option to pass all options from a FileStoreOptions
// structure to the constructor.
func AllOptions(opts *FileStoreOptions) FileStoreOption {
return func(o *FileStoreOptions) error {
// Make a copy
*o = *opts
return nil
}
}
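// For illustration only, a caller might combine the options above when
// creating the store (see NewFileStore below). The values here are arbitrary,
// and DefaultStoreLimits is assumed to be the package-level default limits
// mentioned in NewFileStore's documentation:
//
//	limits := &DefaultStoreLimits
//	fs, recoveredState, err := NewFileStore("datastore", limits,
//		BufferSize(4*1024*1024),
//		SliceConfig(0, 32*1024*1024, time.Hour, ""),
//		DoSync(true))
//	if err != nil {
//		// handle error
//	}
//	defer fs.Close()
//	// recoveredState is nil when nothing was previously persisted.
//	_ = recoveredState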
// Type for the records in the subscriptions file
type recordType byte
// Protobufs do not share a common interface, yet, when saving a
// record on disk, we have to get the size and marshal the record in
// a buffer. These methods are available in all the protobufs.
// So we create this interface with those two methods to be used by the
// writeRecord method.
type record interface {
Size() int
MarshalTo([]byte) (int, error)
}
// This is used for cases when the record is not typed
const recNoType = recordType(0)
// Record types for subscription file
const (
subRecNew = recordType(iota) + 1
subRecUpdate
subRecDel
subRecAck
subRecMsg
)
// Record types for client store
const (
addClient = recordType(iota) + 1
delClient
)
// FileStore is the storage interface for STAN servers, backed by files.
type FileStore struct {
genericStore
rootDir string
serverFile *os.File
clientsFile *os.File
opts FileStoreOptions
compactItvl time.Duration
addClientRec spb.ClientInfo
delClientRec spb.ClientDelete
cliFileSize int64
cliDeleteRecs int // Number of deleted client records
cliCompactTS time.Time
crcTable *crc32.Table
}
type subscription struct {
sub *spb.SubState
seqnos map[uint64]struct{}
}
type bufferedWriter struct {
buf *bufio.Writer
bufSize int // current buffer size
minShrinkSize int // minimum shrink size. Note that this can be bigger than maxSize (see setSizes)
maxSize int // maximum size the buffer can grow
shrinkReq bool // used to decide if buffer should shrink
}
// FileSubStore is a subscription store in files.
type FileSubStore struct {
genericSubStore
tmpSubBuf []byte
file *os.File
bw *bufferedWriter
delSub spb.SubStateDelete
updateSub spb.SubStateUpdate
subs map[uint64]*subscription
opts *FileStoreOptions // points to options from FileStore
compactItvl time.Duration
fileSize int64
numRecs int // Number of records (sub and msgs)
delRecs int // Number of delete (or ack) records
rootDir string
compactTS time.Time
crcTable *crc32.Table // reference to the one from FileStore
activity bool // was there any write between two flush calls
writer io.Writer // this is either `bw` or `file` depending if buffer writer is used or not
shrinkTimer *time.Timer // timer associated with callback shrinking buffer when possible
allDone sync.WaitGroup
}
// fileSlice represents one of the message store files (there are a number
// of files for a MsgStore on a given channel).
type fileSlice struct {
fileName string
idxFName string
firstSeq uint64
lastSeq uint64
rmCount int // Count of messages "removed" from the slice due to limits.
msgsCount int
msgsSize uint64
firstWrite int64 // Time the first message was added to this slice (used for slice age limit)
file *os.File // Used during lookups.
lastUsed int64
}
// msgRecord contains data regarding a message that the FileMsgStore needs to
// keep in memory for performance reasons.
type msgRecord struct {
offset int64
timestamp int64
msgSize uint32
}
// bufferedMsg is required to keep track of a message and msgRecord when
// file buffering is used. It is possible that a message and its index are
// not yet flushed to disk when the message gets removed from the store
// due to limits. We need a map that keeps a reference to the message and
// record until the file is flushed.
type bufferedMsg struct {
msg *pb.MsgProto
rec *msgRecord
}
// cachedMsg is a structure that contains a reference to a message
// and cache expiration value. The cache has a map and list so
// that cached messages can be ordered by expiration time.
type cachedMsg struct {
expiration int64
msg *pb.MsgProto
prev *cachedMsg
next *cachedMsg
}
// msgsCache is the file store cache.
type msgsCache struct {
tryEvict int32
seqMaps map[uint64]*cachedMsg
head *cachedMsg
tail *cachedMsg
}
// FileMsgStore is a per channel message file store.
type FileMsgStore struct {
genericMsgStore
// Atomic operations require 64bit aligned fields to be able
// to run with 32bit processes.
checkSlices int64 // used with atomic operations
timeTick int64 // time captured in background tasks go routine
tmpMsgBuf []byte
file *os.File
idxFile *os.File
bw *bufferedWriter
writer io.Writer // this is `bw.buf` or `file` depending if buffer writer is used or not
files map[int]*fileSlice
currSlice *fileSlice
rootDir string
firstFSlSeq int // First file slice sequence number
lastFSlSeq int // Last file slice sequence number
slCountLim int
slSizeLim uint64
slAgeLim int64
slHasLimits bool
fstore *FileStore // pointer to the parent file store object
cache *msgsCache
msgs map[uint64]*msgRecord
wOffset int64
firstMsg *pb.MsgProto
lastMsg *pb.MsgProto
expiration int64
bufferedSeqs []uint64
bufferedMsgs map[uint64]*bufferedMsg
bkgTasksDone chan bool // signal the background tasks go routine to stop
bkgTasksWake chan bool // signal the background tasks go routine to get out of a sleep
allDone sync.WaitGroup
}
// Some variables based on constants, but that we can change
// for test purposes.
var (
bufShrinkInterval = defaultBufShrinkInterval
bkgTasksSleepDuration = defaultBkgTasksSleepDuration
cacheTTL = int64(defaultCacheTTL)
)
// openFile opens the file specified by `filename`.
// If the file exists, it checks that the version is supported.
// If no file mode is provided, the file is created if not present,
// opened in Read/Write and Append mode.
func openFile(fileName string, modes ...int) (*os.File, error) {
checkVersion := false
mode := os.O_RDWR | os.O_CREATE | os.O_APPEND
if len(modes) > 0 {
// Use the provided modes instead
mode = 0
for _, m := range modes {
mode |= m
}
}
// Check if file already exists
if s, err := os.Stat(fileName); s != nil && err == nil {
checkVersion = true
}
file, err := os.OpenFile(fileName, mode, 0666)
if err != nil {
return nil, err
}
if checkVersion {
err = checkFileVersion(file)
} else {
// This is a new file, write our file version
err = util.WriteInt(file, fileVersion)
}
if err != nil {
file.Close()
file = nil
}
return file, err
}
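// For example, NewFileStore below opens the server file with
// openFile(fileName, os.O_RDWR, os.O_CREATE), deliberately omitting
// os.O_APPEND so that the Truncate and Seek calls in Init() behave as
// expected, whereas files opened with no explicit mode also get os.O_APPEND.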
// check that the version of the file is understood by this interface
func checkFileVersion(r io.Reader) error {
fv, err := util.ReadInt(r)
if err != nil {
return fmt.Errorf("unable to verify file version: %v", err)
}
if fv == 0 || fv > fileVersion {
return fmt.Errorf("unsupported file version: %v (supports [1..%v])", fv, fileVersion)
}
return nil
}
// writeRecord writes a record to `w`.
// The record layout is as follows:
// 8 bytes header: 4 bytes for type and/or size combined,
//                 4 bytes for CRC-32
// variable bytes: payload.
// If a buffer is provided, this function uses it and expands it if necessary.
// The function returns the buffer (possibly changed due to expansion) and the
// number of bytes written into that buffer.
func writeRecord(w io.Writer, buf []byte, recType recordType, rec record, recSize int, crcTable *crc32.Table) ([]byte, int, error) {
// This is the header + payload size
totalSize := recordHeaderSize + recSize
// Alloc or realloc as needed
buf = util.EnsureBufBigEnough(buf, totalSize)
// If there is a record type, encode it
headerFirstInt := 0
if recType != recNoType {
if recSize > 0xFFFFFF {
panic("record size too big")
}
// Encode the type in the high byte of the header
headerFirstInt = int(recType)<<24 | recSize
} else {
// The header is the size of the record
headerFirstInt = recSize
}
// Write the first part of the header at the beginning of the buffer
util.ByteOrder.PutUint32(buf[:4], uint32(headerFirstInt))
// Marshal the record into the given buffer, after the header offset
if _, err := rec.MarshalTo(buf[recordHeaderSize:totalSize]); err != nil {
// Return the buffer because the caller may have provided one
return buf, 0, err
}
// Compute CRC
crc := crc32.Checksum(buf[recordHeaderSize:totalSize], crcTable)
// Write it in the buffer
util.ByteOrder.PutUint32(buf[4:recordHeaderSize], crc)
// Are we dealing with a buffered writer?
bw, isBuffered := w.(*bufio.Writer)
// if so, make sure that if what we are about to "write" is more
// than what's available, then first flush the buffer.
// This is to reduce the risk of partial writes.
if isBuffered && (bw.Buffered() > 0) && (bw.Available() < totalSize) {
if err := bw.Flush(); err != nil {
return buf, 0, err
}
}
// Write the content of our slice into the writer `w`
if _, err := w.Write(buf[:totalSize]); err != nil {
// Return the tmpBuf because the caller may have provided one
return buf, 0, err
}
return buf, totalSize, nil
}
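// Worked example of the header encoding above: for a typed record with
// recType = subRecNew (1) and recSize = 200 bytes, the first 4 header bytes
// hold 1<<24 | 200 = 0x010000C8 and the next 4 bytes hold the CRC-32 of the
// marshaled payload. For a non-typed record, the first 4 bytes are simply the
// payload size, which is why typed payloads are limited to 0xFFFFFF bytes.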
// readRecord reads a record from `r`, possibly checking the CRC-32 checksum.
// When `buf` is not nil, this function ensures the buffer is big enough to
// hold the payload (expanding if necessary). Therefore, this call always
// returns `buf`, regardless of whether there is an error or not.
// The caller is indicating if the record is supposed to be typed or not.
func readRecord(r io.Reader, buf []byte, recTyped bool, crcTable *crc32.Table, checkCRC bool) ([]byte, int, recordType, error) {
_header := [recordHeaderSize]byte{}
header := _header[:]
if _, err := io.ReadFull(r, header); err != nil {
return buf, 0, recNoType, err
}
recType := recNoType
recSize := 0
firstInt := int(util.ByteOrder.Uint32(header[:4]))
if recTyped {
recType = recordType(firstInt >> 24 & 0xFF)
recSize = firstInt & 0xFFFFFF
} else {
recSize = firstInt
}
crc := util.ByteOrder.Uint32(header[4:recordHeaderSize])
// Now we are going to read the payload
buf = util.EnsureBufBigEnough(buf, recSize)
if _, err := io.ReadFull(r, buf[:recSize]); err != nil {
return buf, 0, recNoType, err
}
if checkCRC {
// check CRC against what was stored
if c := crc32.Checksum(buf[:recSize], crcTable); c != crc {
return buf, 0, recNoType, fmt.Errorf("corrupted data, expected crc to be 0x%08x, got 0x%08x", crc, c)
}
}
return buf, recSize, recType, nil
}
// newBufferWriter creates a bufferedWriter, sets the initial buffer size and keeps track of the min/max allowed sizes
func newBufferWriter(minShrinkSize, maxSize int) *bufferedWriter {
w := &bufferedWriter{minShrinkSize: minShrinkSize, maxSize: maxSize}
w.bufSize = minShrinkSize
// The minShrinkSize is the minimum size the buffer can shrink to.
// However, if the given max size is smaller than the min
// shrink size, use the max size instead.
if maxSize < minShrinkSize {
w.bufSize = maxSize
}
return w
}
// createNewWriter creates a new buffer writer for `file` with
// the bufferedWriter's current buffer size.
func (w *bufferedWriter) createNewWriter(file *os.File) io.Writer {
w.buf = bufio.NewWriterSize(file, w.bufSize)
return w.buf
}
// expand the buffer (first flushing the buffer if not empty)
func (w *bufferedWriter) expand(file *os.File, required int) (io.Writer, error) {
// If there was a request to shrink the buffer, cancel that.
w.shrinkReq = false
// If there was something, flush first
if w.buf.Buffered() > 0 {
if err := w.buf.Flush(); err != nil {
return w.buf, err
}
}
// Double the size
w.bufSize *= 2
// If still smaller than what is required, adjust
if w.bufSize < required {
w.bufSize = required
}
// But cap it.
if w.bufSize > w.maxSize {
w.bufSize = w.maxSize
}
w.buf = bufio.NewWriterSize(file, w.bufSize)
return w.buf, nil
}
// tryShrinkBuffer checks and possibly shrinks the buffer
func (w *bufferedWriter) tryShrinkBuffer(file *os.File) (io.Writer, error) {
// Nothing to do if we are already at the lowest
// or file not set/opened.
if w.bufSize == w.minShrinkSize || file == nil {
return w.buf, nil
}
if !w.shrinkReq {
percentFilled := w.buf.Buffered() * 100 / w.bufSize
if percentFilled <= bufShrinkThreshold {
w.shrinkReq = true
}
// Wait for next tick to see if we can shrink
return w.buf, nil
}
if err := w.buf.Flush(); err != nil {
return w.buf, err
}
// Reduce size, but ensure it does not go below the limit
w.bufSize /= 2
if w.bufSize < w.minShrinkSize {
w.bufSize = w.minShrinkSize
}
w.buf = bufio.NewWriterSize(file, w.bufSize)
// Don't reset shrinkReq unless we are down to the limit
if w.bufSize == w.minShrinkSize {
w.shrinkReq = true
}
return w.buf, nil
}
// checkShrinkRequest checks how full the buffer is, and if it is above a certain
// threshold, cancels the shrink request.
func (w *bufferedWriter) checkShrinkRequest() {
percentFilled := w.buf.Buffered() * 100 / w.bufSize
// If above the threshold, cancel the request.
if percentFilled > bufShrinkThreshold {
w.shrinkReq = false
}
}
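// Putting the bufferedWriter pieces together: with msgBufMinShrinkSize (512)
// and a BufferSize of, say, 2MB, the writer starts at 512 bytes, doubles on
// demand via expand() up to 2MB, and when usage is at or below
// bufShrinkThreshold (50%) on one shrink check it is halved on the next one,
// unless intervening writes filled it past the threshold again. It never
// shrinks below 512 bytes. This is an illustrative trace, not a guarantee of
// the exact size at any given tick.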
////////////////////////////////////////////////////////////////////////////
// FileStore methods
////////////////////////////////////////////////////////////////////////////
// NewFileStore returns a factory for stores backed by files, and recovers
// any state present.
// If no limits are provided, the store will be created with
// DefaultStoreLimits.
func NewFileStore(rootDir string, limits *StoreLimits, options ...FileStoreOption) (*FileStore, *RecoveredState, error) {
fs := &FileStore{
rootDir: rootDir,
opts: DefaultFileStoreOptions,
}
fs.init(TypeFile, limits)
for _, opt := range options {
if err := opt(&fs.opts); err != nil {
return nil, nil, err
}
}
// Convert the compact interval into a time.Duration
fs.compactItvl = time.Duration(fs.opts.CompactInterval) * time.Second
// Create the table using polynomial in options
if fs.opts.CRCPolynomial == int64(crc32.IEEE) {
fs.crcTable = crc32.IEEETable
} else {
fs.crcTable = crc32.MakeTable(uint32(fs.opts.CRCPolynomial))
}
if err := os.MkdirAll(rootDir, os.ModeDir+os.ModePerm); err != nil && !os.IsExist(err) {
return nil, nil, fmt.Errorf("unable to create the root directory [%s]: %v", rootDir, err)
}
var err error
var recoveredState *RecoveredState
var serverInfo *spb.ServerInfo
var recoveredClients []*Client
var recoveredSubs = make(RecoveredSubscriptions)
var channels []os.FileInfo
var msgStore *FileMsgStore
var subStore *FileSubStore
// Ensure store is closed in case of return with error
defer func() {
if err != nil {
fs.Close()
}
}()
// Open/Create the server file (note that this file must not be opened
// in APPEND mode, to allow truncate to work).
fileName := filepath.Join(fs.rootDir, serverFileName)
fs.serverFile, err = openFile(fileName, os.O_RDWR, os.O_CREATE)
if err != nil {
return nil, nil, err
}
// Open/Create the client file.
fileName = filepath.Join(fs.rootDir, clientsFileName)
fs.clientsFile, err = openFile(fileName)
if err != nil {
return nil, nil, err
}
// Recover the server file.
serverInfo, err = fs.recoverServerInfo()
if err != nil {
return nil, nil, err
}
// If the server file is empty, then we are done
if serverInfo == nil {
// We return the file store instance, but no recovered state.
return fs, nil, nil
}
// Recover the clients file
recoveredClients, err = fs.recoverClients()
if err != nil {
return nil, nil, err
}
// Get the channels (they are subdirectories of rootDir)
channels, err = ioutil.ReadDir(rootDir)
if err != nil {
return nil, nil, err
}
// Go through the list
for _, c := range channels {
// Channels are directories. Ignore simple files
if !c.IsDir() {
continue
}
channel := c.Name()
channelDirName := filepath.Join(rootDir, channel)
// Recover messages for this channel
msgStore, err = fs.newFileMsgStore(channelDirName, channel, true)
if err != nil {
break
}
subStore, err = fs.newFileSubStore(channelDirName, channel, true)
if err != nil {
msgStore.Close()
break
}
// For this channel, construct an array of RecoveredSubState
rssArray := make([]*RecoveredSubState, 0, len(subStore.subs))
// Fill that array with what we got from newFileSubStore.
for _, sub := range subStore.subs {
// The server is making a copy of rss.Sub, but it is still not
// a good idea to return a pointer to an object that belongs
// to the store. So make a copy and return the pointer to
// that copy.
csub := *sub.sub
rss := &RecoveredSubState{
Sub: &csub,
Pending: make(PendingAcks),
}
// If we recovered any seqno...
if len(sub.seqnos) > 0 {
// Lookup messages, and if we find those, update the
// Pending map.
for seq := range sub.seqnos {
rss.Pending[seq] = struct{}{}
}
}
// Add to the array of recovered subscriptions
rssArray = append(rssArray, rss)
}
// This is the recovered subscription state for this channel
recoveredSubs[channel] = rssArray
fs.channels[channel] = &ChannelStore{
Subs: subStore,
Msgs: msgStore,
}
}
if err != nil {
return nil, nil, err
}
// Create the recovered state to return
recoveredState = &RecoveredState{
Info: serverInfo,
Clients: recoveredClients,
Subs: recoveredSubs,
}
return fs, recoveredState, nil
}
// Init is used to persist server's information after the first start
func (fs *FileStore) Init(info *spb.ServerInfo) error {
fs.Lock()
defer fs.Unlock()
f := fs.serverFile
// Truncate the file (4 is the size of the fileVersion record)
if err := f.Truncate(4); err != nil {
return err
}
// Move offset to 4 (truncate does not do that)
if _, err := f.Seek(4, 0); err != nil {
return err
}
// ServerInfo record is not typed. We also don't pass a reusable buffer.
if _, _, err := writeRecord(f, nil, recNoType, info, info.Size(), fs.crcTable); err != nil {
return err
}
return nil
}
// recoverClients reads the clients file and returns an array of recovered clients
func (fs *FileStore) recoverClients() ([]*Client, error) {
var err error
var recType recordType
var recSize int
_buf := [256]byte{}
buf := _buf[:]
// Create a buffered reader to speed-up recovery
br := bufio.NewReaderSize(fs.clientsFile, defaultBufSize)
for {
buf, recSize, recType, err = readRecord(br, buf, true, fs.crcTable, fs.opts.DoCRC)
if err != nil {
if err == io.EOF {
err = nil
break
}
return nil, err
}
fs.cliFileSize += int64(recSize + recordHeaderSize)
switch recType {
case addClient:
c := &Client{}
if err := c.ClientInfo.Unmarshal(buf[:recSize]); err != nil {
return nil, err
}
// Add to the map. Note that if one already exists, which should
// not happen, we just replace it with this most recent one.
fs.clients[c.ID] = c
case delClient:
c := spb.ClientDelete{}
if err := c.Unmarshal(buf[:recSize]); err != nil {
return nil, err
}
delete(fs.clients, c.ID)
fs.cliDeleteRecs++
default:
return nil, fmt.Errorf("invalid client record type: %v", recType)
}
}
clients := make([]*Client, len(fs.clients))
i := 0
// Convert the map into an array
for _, c := range fs.clients {
clients[i] = c
i++
}
return clients, nil
}
// recoverServerInfo reads the server file and returns a ServerInfo structure
func (fs *FileStore) recoverServerInfo() (*spb.ServerInfo, error) {
file := fs.serverFile
info := &spb.ServerInfo{}
buf, size, _, err := readRecord(file, nil, false, fs.crcTable, fs.opts.DoCRC)
if err != nil {
if err == io.EOF {
// We are done, no state recovered
return nil, nil
}
return nil, err
}
// Check that the size of the file is consistent with the size
// of the record we are supposed to recover. Account for the
// 12 bytes (4 + recordHeaderSize) corresponding to the fileVersion and
// record header.
fstat, err := file.Stat()
if err != nil {
return nil, err
}
expectedSize := int64(size + 4 + recordHeaderSize)
if fstat.Size() != expectedSize {
return nil, fmt.Errorf("incorrect file size, expected %v bytes, got %v bytes",
expectedSize, fstat.Size())
}
// Reconstruct now
if err := info.Unmarshal(buf[:size]); err != nil {
return nil, err
}
return info, nil
}
// CreateChannel creates a ChannelStore for the given channel, and returns
// `true` to indicate that the channel is new, false if it already exists.
func (fs *FileStore) CreateChannel(channel string, userData interface{}) (*ChannelStore, bool, error) {
fs.Lock()
defer fs.Unlock()
channelStore := fs.channels[channel]
if channelStore != nil {
return channelStore, false, nil
}
// Check for limits
if err := fs.canAddChannel(); err != nil {
return nil, false, err
}
// We create the channel here...
channelDirName := filepath.Join(fs.rootDir, channel)
if err := os.MkdirAll(channelDirName, os.ModeDir+os.ModePerm); err != nil {
return nil, false, err
}
var err error
var msgStore MsgStore
var subStore SubStore
msgStore, err = fs.newFileMsgStore(channelDirName, channel, false)
if err != nil {
return nil, false, err
}
subStore, err = fs.newFileSubStore(channelDirName, channel, false)
if err != nil {
msgStore.Close()
return nil, false, err
}
channelStore = &ChannelStore{
Subs: subStore,
Msgs: msgStore,
UserData: userData,
}
fs.channels[channel] = channelStore
return channelStore, true, nil
}
// AddClient stores information about the client identified by `clientID`.
func (fs *FileStore) AddClient(clientID, hbInbox string, userData interface{}) (*Client, bool, error) {
sc, isNew, err := fs.genericStore.AddClient(clientID, hbInbox, userData)
if err != nil {
return nil, false, err
}
if !isNew {
return sc, false, nil
}
fs.Lock()
fs.addClientRec = spb.ClientInfo{ID: clientID, HbInbox: hbInbox}
_, size, err := writeRecord(fs.clientsFile, nil, addClient, &fs.addClientRec, fs.addClientRec.Size(), fs.crcTable)
if err != nil {
delete(fs.clients, clientID)
fs.Unlock()
return nil, false, err
}
fs.cliFileSize += int64(size)
fs.Unlock()
return sc, true, nil
}
// DeleteClient invalidates the client identified by `clientID`.
func (fs *FileStore) DeleteClient(clientID string) *Client {
sc := fs.genericStore.DeleteClient(clientID)
if sc != nil {
fs.Lock()
fs.delClientRec = spb.ClientDelete{ID: clientID}
_, size, _ := writeRecord(fs.clientsFile, nil, delClient, &fs.delClientRec, fs.delClientRec.Size(), fs.crcTable)
fs.cliDeleteRecs++
fs.cliFileSize += int64(size)
// Check if this triggers a need for compaction
if fs.shouldCompactClientFile() {
fs.compactClientFile()
}
fs.Unlock()
}
return sc
}
// shouldCompactClientFile returns true if the client file should be compacted
// Lock is held by caller
func (fs *FileStore) shouldCompactClientFile() bool {
// Global switch
if !fs.opts.CompactEnabled {
return false
}
// Check that if minimum file size is set, the client file
// is at least at the minimum.
if fs.opts.CompactMinFileSize > 0 && fs.cliFileSize < fs.opts.CompactMinFileSize {
return false
}
// Check fragmentation
frag := fs.cliDeleteRecs * 100 / (fs.cliDeleteRecs + len(fs.clients))
if frag < fs.opts.CompactFragmentation {
return false
}
// Check that we don't compact too often
if time.Now().Sub(fs.cliCompactTS) < fs.compactItvl {
return false
}
return true
}
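// Worked example of the checks above: with 150 delete records and 50 clients
// still active, fragmentation is 150*100/(150+50) = 75%, so with the default
// CompactFragmentation of 50 compaction is allowed, provided the file is at
// least CompactMinFileSize (1MB by default) and the previous compaction
// happened more than CompactInterval (5 minutes by default) ago.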
// Rewrite the content of the clients map into a temporary file,
// then swap back to active file.
// Store lock held on entry
func (fs *FileStore) compactClientFile() error {
// Open a temporary file
tmpFile, err := getTempFile(fs.rootDir, clientsFileName)
if err != nil {
return err
}
defer func() {
if tmpFile != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
}
}()
bw := bufio.NewWriterSize(tmpFile, defaultBufSize)
fileSize := int64(0)
size := 0
_buf := [256]byte{}
buf := _buf[:]
// Dump the content of active clients into the temporary file.
for _, c := range fs.clients {
fs.addClientRec = spb.ClientInfo{ID: c.ID, HbInbox: c.HbInbox}
buf, size, err = writeRecord(bw, buf, addClient, &fs.addClientRec, fs.addClientRec.Size(), fs.crcTable)
if err != nil {
return err
}
fileSize += int64(size)
}
// Flush the buffer on disk
if err := bw.Flush(); err != nil {
return err
}
// Switch the temporary file with the original one.
fs.clientsFile, err = swapFiles(tmpFile, fs.clientsFile)
if err != nil {
return err
}
// Avoid an unnecessary cleanup attempt
tmpFile = nil
fs.cliDeleteRecs = 0
fs.cliFileSize = fileSize
fs.cliCompactTS = time.Now()
return nil
}
// Return a temporary file (including file version)
func getTempFile(rootDir, prefix string) (*os.File, error) {
tmpFile, err := ioutil.TempFile(rootDir, prefix)
if err != nil {
return nil, err
}
if err := util.WriteInt(tmpFile, fileVersion); err != nil {
return nil, err
}
return tmpFile, nil
}
// When a store file is compacted, the content is rewritten into a
// temporary file. When this is done, the temporary file replaces
// the original file.
func swapFiles(tempFile *os.File, activeFile *os.File) (*os.File, error) {
activeFileName := activeFile.Name()
tempFileName := tempFile.Name()
// A lot of what we do here is because Windows does not accept working
// on files that are currently opened.
// On exit, ensure temporary file is removed.
defer func() {
os.Remove(tempFileName)
}()
// Start by closing the temporary file.
if err := tempFile.Close(); err != nil {
return activeFile, err
}
// Close original file before trying to rename it.
if err := activeFile.Close(); err != nil {
return activeFile, err
}
// Rename the tmp file to original file name
err := os.Rename(tempFileName, activeFileName)
// Need to re-open the active file anyway
file, lerr := openFile(activeFileName)
if lerr != nil && err == nil {
err = lerr
}
return file, err
}
// Close closes all stores.
func (fs *FileStore) Close() error {
fs.Lock()
defer fs.Unlock()
if fs.closed {
return nil
}
fs.closed = true
var err error
closeFile := func(f *os.File) {
if f == nil {
return
}
if lerr := f.Close(); lerr != nil && err == nil {
err = lerr
}
}
err = fs.genericStore.close()
closeFile(fs.serverFile)
closeFile(fs.clientsFile)
return err
}
////////////////////////////////////////////////////////////////////////////
// FileMsgStore methods
////////////////////////////////////////////////////////////////////////////
// newFileMsgStore returns a new instance of a file MsgStore.
func (fs *FileStore) newFileMsgStore(channelDirName, channel string, doRecover bool) (*FileMsgStore, error) {
// Create an instance and initialize
ms := &FileMsgStore{
fstore: fs,
msgs: make(map[uint64]*msgRecord, 64),
wOffset: int64(4), // The very first record starts after the file version record
files: make(map[int]*fileSlice),
rootDir: channelDirName,
bkgTasksDone: make(chan bool, 1),
bkgTasksWake: make(chan bool, 1),
}
// Defaults to the global limits
msgStoreLimits := fs.limits.MsgStoreLimits
// See if there is an override
thisChannelLimits, exists := fs.limits.PerChannel[channel]
if exists {
// Use this channel specific limits
msgStoreLimits = thisChannelLimits.MsgStoreLimits
}
ms.init(channel, &msgStoreLimits)
ms.setSliceLimits()
ms.initCache()
maxBufSize := fs.opts.BufferSize
if maxBufSize > 0 {
ms.bw = newBufferWriter(msgBufMinShrinkSize, maxBufSize)
ms.bufferedSeqs = make([]uint64, 0, 1)
ms.bufferedMsgs = make(map[uint64]*bufferedMsg)
}
// Use this variable for all errors below so we can do the cleanup
var err error
// Recovery case
if doRecover {
var dirFiles []os.FileInfo
var fseq int64
dirFiles, err = ioutil.ReadDir(channelDirName)
for _, file := range dirFiles {
if file.IsDir() {
continue
}
fileName := file.Name()
if !strings.HasPrefix(fileName, msgFilesPrefix) || !strings.HasSuffix(fileName, datSuffix) {
continue
}
// Remove suffix
fileNameWithoutSuffix := strings.TrimSuffix(fileName, datSuffix)
// Remove prefix
fileNameWithoutPrefixAndSuffix := strings.TrimPrefix(fileNameWithoutSuffix, msgFilesPrefix)
// Get the file sequence number
fseq, err = strconv.ParseInt(fileNameWithoutPrefixAndSuffix, 10, 64)
if err != nil {
err = fmt.Errorf("message log has an invalid name: %v", fileName)
break
}
// Need fully qualified names
fileName = filepath.Join(channelDirName, fileName)
idxFName := filepath.Join(channelDirName, fmt.Sprintf("%s%v%s", msgFilesPrefix, fseq, idxSuffix))
// Create the slice
fslice := &fileSlice{fileName: fileName, idxFName: idxFName}
// Recover the file slice
err = ms.recoverOneMsgFile(fslice, int(fseq))
if err != nil {
break
}
}
if err == nil && ms.lastFSlSeq > 0 {
// Now that all file slices have been recovered, we know which
// one is the last, so open the corresponding data and index files.
ms.currSlice = ms.files[ms.lastFSlSeq]
err = ms.openDataAndIndexFiles(ms.currSlice.fileName, ms.currSlice.idxFName)
if err == nil {
ms.wOffset, err = ms.file.Seek(0, 2)
}
}
if err == nil {
// Apply message limits (no need to check if there are limits
// defined, the call won't do anything if they aren't).
err = ms.enforceLimits(false)
}
}
if err == nil {
ms.Lock()
ms.allDone.Add(1)
// Capture the time here first, it will then be captured
// in the go routine we are about to start.
ms.timeTick = time.Now().UnixNano()
// On recovery, if there is age limit set and at least one message...
if doRecover && ms.limits.MaxAge > 0 && ms.totalCount > 0 {
// Force the execution of the expireMsgs method.
// This will take care of expiring messages that should have
// expired while the server was stopped.
ms.expireMsgs(ms.timeTick, int64(ms.limits.MaxAge))
}
// Start the background tasks go routine
go ms.backgroundTasks()
ms.Unlock()
}
// Cleanup on error
if err != nil {
// The buffer writer may not be fully set yet
if ms.bw != nil && ms.bw.buf == nil {
ms.bw = nil
}
ms.Close()
ms = nil
action := "create"
if doRecover {
action = "recover"
}
err = fmt.Errorf("unable to %s message store for [%s]: %v", action, channel, err)
return nil, err
}
return ms, nil
}
// openDataAndIndexFiles opens/creates the data and index file with the given
// file names.
func (ms *FileMsgStore) openDataAndIndexFiles(dataFileName, idxFileName string) error {
file, err := openFile(dataFileName)
if err != nil {
return err
}
idxFile, err := openFile(idxFileName)
if err != nil {
file.Close()
return err
}
ms.setFile(file, idxFile)
return nil
}
// closeDataAndIndexFiles closes both current data and index files.
func (ms *FileMsgStore) closeDataAndIndexFiles() error {
err := ms.flush()
if cerr := ms.file.Close(); cerr != nil && err == nil {
err = cerr
}
if cerr := ms.idxFile.Close(); cerr != nil && err == nil {
err = cerr
}
return err
}
// setFile sets the current data and index file.
// The buffered writer is recreated.
func (ms *FileMsgStore) setFile(dataFile, idxFile *os.File) {
ms.file = dataFile
ms.writer = ms.file
if ms.file != nil && ms.bw != nil {
ms.writer = ms.bw.createNewWriter(ms.file)
}
ms.idxFile = idxFile
}
// recoverOneMsgFile recovers one of the message files
func (ms *FileMsgStore) recoverOneMsgFile(fslice *fileSlice, fseq int) error {
var err error
msgSize := 0
var msg *pb.MsgProto
var mrec *msgRecord
var seq uint64
// Check if index file exists
useIdxFile := false
if s, statErr := os.Stat(fslice.idxFName); s != nil && statErr == nil {
useIdxFile = true
}
// Open the files (the idx file will be created if it does not exist)
err = ms.openDataAndIndexFiles(fslice.fileName, fslice.idxFName)
if err != nil {
return err
}
// Select which file to recover based on presence of index file
file := ms.file
if useIdxFile {
file = ms.idxFile
}
// Create a buffered reader to speed-up recovery
br := bufio.NewReaderSize(file, defaultBufSize)
// The first record starts after the file version record
offset := int64(4)
if useIdxFile {
for {
seq, mrec, err = ms.readIndex(br)
if err != nil {
if err == io.EOF {
// We are done, reset err
err = nil
}
break
}
// Update file slice
if fslice.firstSeq == 0 {
fslice.firstSeq = seq
}
fslice.lastSeq = seq
fslice.msgsCount++
// For size, add the message record size, the record header and the size
// required for the corresponding index record.
fslice.msgsSize += uint64(mrec.msgSize + msgRecordOverhead)
if fslice.firstWrite == 0 {
fslice.firstWrite = mrec.timestamp
}
}
} else {
// Get these from the file store object
crcTable := ms.fstore.crcTable
doCRC := ms.fstore.opts.DoCRC
// We are going to write the index file while recovering the data file
bw := bufio.NewWriterSize(ms.idxFile, msgIndexRecSize*1000)
for {
ms.tmpMsgBuf, msgSize, _, err = readRecord(br, ms.tmpMsgBuf, false, crcTable, doCRC)
if err != nil {
if err == io.EOF {
// We are done, reset err
err = nil
}
break
}
// Recover this message
msg = &pb.MsgProto{}
err = msg.Unmarshal(ms.tmpMsgBuf[:msgSize])
if err != nil {
break
}
if fslice.firstSeq == 0 {
fslice.firstSeq = msg.Sequence
}
fslice.lastSeq = msg.Sequence
fslice.msgsCount++
// For size, add the message record size, the record header and the size
// required for the corresponding index record.
fslice.msgsSize += uint64(msgSize + msgRecordOverhead)
if fslice.firstWrite == 0 {
fslice.firstWrite = msg.Timestamp
}
mrec := &msgRecord{offset: offset, timestamp: msg.Timestamp, msgSize: uint32(msgSize)}
ms.msgs[msg.Sequence] = mrec
// There was no index file, update it
err = ms.writeIndex(bw, msg.Sequence, offset, msg.Timestamp, msgSize)
if err != nil {
break
}
// Move offset
offset += int64(recordHeaderSize + msgSize)
}
if err == nil {
err = bw.Flush()
if err == nil {
err = ms.idxFile.Sync()
}
}
// Since there was no index and there was an error, remove the index
// file so when server restarts, it recovers again from the data file.
if err != nil {
// Close the index file
ms.idxFile.Close()
// Remove it, and panic if we can't
if rmErr := os.Remove(fslice.idxFName); rmErr != nil {
panic(fmt.Errorf("Error during recovery of file %q: %v, you need "+
"to manually remove index file %q (remove failed with err: %v)",
fslice.fileName, err, fslice.idxFName, rmErr))
}
}
}
// If no error and slice is not empty...
if err == nil && fslice.msgsCount > 0 {
if ms.first == 0 || ms.first > fslice.firstSeq {
ms.first = fslice.firstSeq
}
if ms.last < fslice.lastSeq {
ms.last = fslice.lastSeq
}
ms.totalCount += fslice.msgsCount
ms.totalBytes += fslice.msgsSize
// File slices may be recovered in any order. When all slices
// are recovered the caller will open the last file slice. So
// close the files here since we don't know if this is going
// to be the last.
if err == nil {
err = ms.closeDataAndIndexFiles()
}
if err == nil {
// On success, add to the map of file slices and
// update first/last file slice sequence.
ms.files[fseq] = fslice
if ms.firstFSlSeq == 0 || ms.firstFSlSeq > fseq {
ms.firstFSlSeq = fseq
}
if ms.lastFSlSeq < fseq {
ms.lastFSlSeq = fseq
}
}
} else {
// We got an error, or this is an empty file slice which we
// didn't add to the map.
if cerr := ms.closeDataAndIndexFiles(); cerr != nil && err == nil {
err = cerr
}
}
return err
}
// setSliceLimits sets the limits of a file slice based on options and/or
// channel limits.
func (ms *FileMsgStore) setSliceLimits() {
// First set slice limits based on slice configuration.
ms.slCountLim = ms.fstore.opts.SliceMaxMsgs
ms.slSizeLim = uint64(ms.fstore.opts.SliceMaxBytes)
ms.slAgeLim = int64(ms.fstore.opts.SliceMaxAge)
// Did we configure any of the "dimensions"?
ms.slHasLimits = ms.slCountLim > 0 || ms.slSizeLim > 0 || ms.slAgeLim > 0
// If so, we are done. We will use those limits to decide
// when to move to a new slice.
if ms.slHasLimits {
return
}
// Slices limits were not configured. We will set a limit based on channel limits.
if ms.limits.MaxMsgs > 0 {
limit := ms.limits.MaxMsgs / 4
if limit == 0 {
limit = 1
}
ms.slCountLim = limit
}
if ms.limits.MaxBytes > 0 {
limit := uint64(ms.limits.MaxBytes) / 4
if limit == 0 {
limit = 1
}
ms.slSizeLim = limit
}
if ms.limits.MaxAge > 0 {
limit := time.Duration(int64(ms.limits.MaxAge) / 4)
if limit < time.Second {
limit = time.Second
}
ms.slAgeLim = int64(limit)
}
// Refresh our view of slices having limits.
ms.slHasLimits = ms.slCountLim > 0 || ms.slSizeLim > 0 || ms.slAgeLim > 0
}
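// Worked example of the fallback above: with all slice options set to 0 and
// channel limits of MaxMsgs=100000, MaxBytes=64MB and MaxAge=1h, a slice is
// capped at 25000 messages, 16MB or 15 minutes, whichever is reached first
// (each being a quarter of the channel limit, floored at 1 message, 1 byte
// and 1 second respectively).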
// writeIndex writes a message index record to the writer `w`
func (ms *FileMsgStore) writeIndex(w io.Writer, seq uint64, offset, timestamp int64, msgSize int) error {
_buf := [msgIndexRecSize]byte{}
buf := _buf[:]
ms.addIndex(buf, seq, offset, timestamp, msgSize)
_, err := w.Write(buf[:msgIndexRecSize])
return err
}
// addIndex adds a message index record in the given buffer
func (ms *FileMsgStore) addIndex(buf []byte, seq uint64, offset, timestamp int64, msgSize int) {
util.ByteOrder.PutUint64(buf, seq)
util.ByteOrder.PutUint64(buf[8:], uint64(offset))
util.ByteOrder.PutUint64(buf[16:], uint64(timestamp))
util.ByteOrder.PutUint32(buf[24:], uint32(msgSize))
crc := crc32.Checksum(buf[:msgIndexRecSize-crcSize], ms.fstore.crcTable)
util.ByteOrder.PutUint32(buf[msgIndexRecSize-crcSize:], crc)
}
// readIndex reads a message index record from the given reader
// and returns an allocated msgRecord object.
func (ms *FileMsgStore) readIndex(r io.Reader) (uint64, *msgRecord, error) {
_buf := [msgIndexRecSize]byte{}
buf := _buf[:]
if _, err := io.ReadFull(r, buf); err != nil {
return 0, nil, err
}
mrec := &msgRecord{}
seq := util.ByteOrder.Uint64(buf)
mrec.offset = int64(util.ByteOrder.Uint64(buf[8:]))
mrec.timestamp = int64(util.ByteOrder.Uint64(buf[16:]))
mrec.msgSize = util.ByteOrder.Uint32(buf[24:])
if ms.fstore.opts.DoCRC {
storedCRC := util.ByteOrder.Uint32(buf[msgIndexRecSize-crcSize:])
crc := crc32.Checksum(buf[:msgIndexRecSize-crcSize], ms.fstore.crcTable)
if storedCRC != crc {
return 0, nil, fmt.Errorf("corrupted data, expected crc to be 0x%08x, got 0x%08x", storedCRC, crc)
}
}
ms.msgs[seq] = mrec
return seq, mrec, nil
}
// Store a given message.
func (ms *FileMsgStore) Store(data []byte) (uint64, error) {
ms.Lock()
defer ms.Unlock()
fslice := ms.currSlice
// Check if we need to move to next file slice
if fslice == nil || ms.slHasLimits {
if fslice == nil ||
(ms.slSizeLim > 0 && fslice.msgsSize >= ms.slSizeLim) ||
(ms.slCountLim > 0 && fslice.msgsCount >= ms.slCountLim) ||
(ms.slAgeLim > 0 && atomic.LoadInt64(&ms.timeTick)-fslice.firstWrite >= ms.slAgeLim) {
// Don't change store variable until success...
newSliceSeq := ms.lastFSlSeq + 1
// Close the current file slice (if applicable) and open the next slice
if fslice != nil {
if err := ms.closeDataAndIndexFiles(); err != nil {
return 0, err
}
}
// Create new slice
datFName := filepath.Join(ms.rootDir, fmt.Sprintf("%s%v%s", msgFilesPrefix, newSliceSeq, datSuffix))
idxFName := filepath.Join(ms.rootDir, fmt.Sprintf("%s%v%s", msgFilesPrefix, newSliceSeq, idxSuffix))
// Open the new slice
if err := ms.openDataAndIndexFiles(datFName, idxFName); err != nil {
return 0, err
}
// Success, update the store's variables
newSlice := &fileSlice{fileName: datFName, idxFName: idxFName}
ms.files[newSliceSeq] = newSlice
ms.currSlice = newSlice
if ms.firstFSlSeq == 0 {
ms.firstFSlSeq = newSliceSeq
}
ms.lastFSlSeq = newSliceSeq
ms.wOffset = int64(4)
// If we added a second slice and the first slice was empty but not removed
// because it was the only one, we remove it now.
if len(ms.files) == 2 && fslice.msgsCount == fslice.rmCount {
ms.removeFirstSlice()
}
// Update the fslice reference to new slice for rest of function
fslice = ms.currSlice
}
}
seq := ms.last + 1
m := &pb.MsgProto{
Sequence: seq,
Subject: ms.subject,
Data: data,
Timestamp: time.Now().UnixNano(),
}
msgInBuffer := false
var recSize int
var err error
var bwBuf *bufio.Writer
if ms.bw != nil {
bwBuf = ms.bw.buf
}
msgSize := m.Size()
if bwBuf != nil {
required := msgSize + recordHeaderSize
if required > bwBuf.Available() {
ms.writer, err = ms.bw.expand(ms.file, required)
if err != nil {
return 0, err
}
if err := ms.processBufferedMsgs(); err != nil {
return 0, err
}
// Refresh this since it has changed.
bwBuf = ms.bw.buf
}
}
ms.tmpMsgBuf, recSize, err = writeRecord(ms.writer, ms.tmpMsgBuf, recNoType, m, msgSize, ms.fstore.crcTable)
if err != nil {
return 0, err
}
mrec := &msgRecord{offset: ms.wOffset, timestamp: m.Timestamp, msgSize: uint32(msgSize)}
if bwBuf != nil {
// Check to see if we should cancel a buffer shrink request
if ms.bw.shrinkReq {
ms.bw.checkShrinkRequest()
}
// If message was added to the buffer we need to also save a reference
// to that message outside of the cache, until the buffer is flushed.
if bwBuf.Buffered() >= recSize {
ms.bufferedSeqs = append(ms.bufferedSeqs, seq)
ms.bufferedMsgs[seq] = &bufferedMsg{msg: m, rec: mrec}
msgInBuffer = true
}
}
// Message was flushed to disk, write corresponding index
if !msgInBuffer {
if err := ms.writeIndex(ms.idxFile, seq, ms.wOffset, m.Timestamp, msgSize); err != nil {
return 0, err
}
}
if ms.first == 0 || ms.first == seq {
// First ever message or after all messages expired and this is the
// first new message.
ms.first = seq
ms.firstMsg = m
if maxAge := ms.limits.MaxAge; maxAge > 0 {
ms.expiration = mrec.timestamp + int64(maxAge)
if len(ms.bkgTasksWake) == 0 {
ms.bkgTasksWake <- true
}
}
}
ms.last = seq
ms.lastMsg = m
ms.msgs[ms.last] = mrec
ms.addToCache(seq, m, true)
ms.wOffset += int64(recSize)
// For size, add the message record size, the record header and the size
// required for the corresponding index record.
size := uint64(msgSize + msgRecordOverhead)
// Total stats
ms.totalCount++
ms.totalBytes += size
// Stats per file slice
fslice.msgsCount++
fslice.msgsSize += size
if fslice.firstWrite == 0 {
fslice.firstWrite = m.Timestamp
}
// Save references to first and last sequences for this slice
if fslice.firstSeq == 0 {
fslice.firstSeq = seq
}
fslice.lastSeq = seq
if ms.limits.MaxMsgs > 0 || ms.limits.MaxBytes > 0 {
// Enforce limits and update file slice if needed.
if err := ms.enforceLimits(true); err != nil {
return 0, err
}
}
return seq, nil
}
// processBufferedMsgs adds message index records in the given buffer
// for every pending buffered message.
func (ms *FileMsgStore) processBufferedMsgs() error {
if len(ms.bufferedMsgs) == 0 {
return nil
}
idxBufferSize := len(ms.bufferedMsgs) * msgIndexRecSize
ms.tmpMsgBuf = util.EnsureBufBigEnough(ms.tmpMsgBuf, idxBufferSize)
bufOffset := 0
for _, pseq := range ms.bufferedSeqs {
bm := ms.bufferedMsgs[pseq]
if bm != nil {
mrec := bm.rec
// We add the index info for this flushed message
ms.addIndex(ms.tmpMsgBuf[bufOffset:], pseq, mrec.offset, mrec.timestamp, int(mrec.msgSize))
bufOffset += msgIndexRecSize
delete(ms.bufferedMsgs, pseq)
}
}
if bufOffset > 0 {
if _, err := ms.idxFile.Write(ms.tmpMsgBuf[:bufOffset]); err != nil {
return err
}
}
ms.bufferedSeqs = ms.bufferedSeqs[:0]
return nil
}
// expireMsgs ensures that messages don't stay in the log longer than the
// limit's MaxAge.
// Returns the time of the next expiration (possibly 0 if no message left)
// The store's lock is assumed to be held on entry
func (ms *FileMsgStore) expireMsgs(now, maxAge int64) int64 {
for {
m, hasMore := ms.msgs[ms.first]
if !hasMore {
ms.expiration = 0
break
}
elapsed := now - m.timestamp
if elapsed >= maxAge {
ms.removeFirstMsg()
} else {
ms.expiration = now + (maxAge - elapsed)
break
}
}
return ms.expiration
}
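// For example, with MaxAge of 5s and the oldest remaining message stored 2s
// ago, the loop above stops and returns an expiration 3s from now; any
// message older than 5s has already been removed by removeFirstMsg().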
// enforceLimits checks total counts with current msg store's limits,
// removing a file slice and/or updating slices' count as necessary.
func (ms *FileMsgStore) enforceLimits(reportHitLimit bool) error {
// Check if we need to remove any (but leave at least the last added).
// Note that we may have to remove more than one msg if we are here
// after a restart with smaller limits than originally set, or if
// message is quite big, etc...
maxMsgs := ms.limits.MaxMsgs
maxBytes := ms.limits.MaxBytes
for ms.totalCount > 1 &&
((maxMsgs > 0 && ms.totalCount > maxMsgs) ||
(maxBytes > 0 && ms.totalBytes > uint64(maxBytes))) {
// Remove first message from first slice, potentially removing
// the slice, etc...
ms.removeFirstMsg()
if reportHitLimit && !ms.hitLimit {
ms.hitLimit = true
Noticef(droppingMsgsFmt, ms.subject, ms.totalCount, ms.limits.MaxMsgs, ms.totalBytes, ms.limits.MaxBytes)
}
}
return nil
}
// removeFirstMsg "removes" the first message of the first slice.
// If the slice is "empty" the file slice is removed.
func (ms *FileMsgStore) removeFirstMsg() {
// Work with the first slice
slice := ms.files[ms.firstFSlSeq]
// Size of the first message in this slice
firstMsgSize := ms.msgs[slice.firstSeq].msgSize
// For size, we count the size of serialized message + record header +
// the corresponding index record
size := uint64(firstMsgSize + msgRecordOverhead)
// Keep track of number of "removed" messages in this slice
slice.rmCount++
// Update total counts
ms.totalCount--
ms.totalBytes -= size
// Remove the first message from the records map
delete(ms.msgs, ms.first)
// Message sequences are incremental with no gap in a given msgstore.
ms.first++
// Invalidate ms.firstMsg, it will be looked-up on demand.
ms.firstMsg = nil
// Invalidate ms.lastMsg if it was the last message being removed.
if ms.first > ms.last {
ms.lastMsg = nil
}
// If the file slice is "empty" and not the last one
if slice.msgsCount == slice.rmCount && len(ms.files) > 1 {
ms.removeFirstSlice()
} else {
// This is the new first message in this slice.
slice.firstSeq = ms.first
}
}
// removeFirstSlice removes the first file slice.
// Should not be called if first slice is also last!
func (ms *FileMsgStore) removeFirstSlice() {
sl := ms.files[ms.firstFSlSeq]
// Close file that may have been opened due to lookups
if sl.file != nil {
sl.file.Close()
sl.file = nil
}
// Assume we will remove the files
remove := true
// If there is an archive script invoke it first
script := ms.fstore.opts.SliceArchiveScript
if script != "" {
datBak := sl.fileName + bakSuffix
idxBak := sl.idxFName + bakSuffix
var err error
if err = os.Rename(sl.fileName, datBak); err == nil {
if err = os.Rename(sl.idxFName, idxBak); err != nil {
// Remove first backup file
os.Remove(datBak)
}
}
if err == nil {
// Files have been successfully renamed, so don't attempt
// to remove the original files.
remove = false
// We run the script in a go routine to not block the server.
ms.allDone.Add(1)
go func(subj, dat, idx string) {
defer ms.allDone.Done()
cmd := exec.Command(script, subj, dat, idx)
output, err := cmd.CombinedOutput()
if err != nil {
Noticef("STAN: Error invoking archive script %q: %v (output=%v)", script, err, string(output))
} else {
Noticef("STAN: Output of archive script for %s (%s and %s): %v", subj, dat, idx, string(output))
}
}(ms.subject, datBak, idxBak)
}
}
// Remove files
if remove {
os.Remove(sl.fileName)
os.Remove(sl.idxFName)
}
// Remove slice from map
delete(ms.files, ms.firstFSlSeq)
// Normally, file slices have an incremental sequence number with
// no gap. However, we want to support the fact that a user could
// copy back some old file slice to be recovered, and so there
// may be a gap. So find out what is the new first file sequence.
for ms.firstFSlSeq < ms.lastFSlSeq {
ms.firstFSlSeq++
if _, ok := ms.files[ms.firstFSlSeq]; ok {
break
}
}
// This should not happen!
if ms.firstFSlSeq > ms.lastFSlSeq {
panic("Removed last slice!")
}
}
// getFileForSeq returns the file where the message of the given sequence
// is stored. If the file is opened, a task is triggered to close this
// file when no longer used after a period of time.
func (ms *FileMsgStore) getFileForSeq(seq uint64) (*os.File, error) {
if len(ms.files) == 0 {
return nil, fmt.Errorf("no file slice for store %q, message seq: %v", ms.subject, seq)
}
// Start with current slice
slice := ms.currSlice
if (slice.firstSeq <= seq) && (seq <= slice.lastSeq) {
return ms.file, nil
}
// We want to support possible gaps in the file slice sequence, so
// we don't do a binary search, just a simple iteration of the map,
// whose iteration order in Go is random.
for _, slice := range ms.files {
if (slice.firstSeq <= seq) && (seq <= slice.lastSeq) {
file := slice.file
if file == nil {
var err error
file, err = openFile(slice.fileName)
if err != nil {
return nil, fmt.Errorf("unable to open file %q: %v", slice.fileName, err)
}
slice.file = file
// Let the background task know that we have opened a slice
atomic.StoreInt64(&ms.checkSlices, 1)
}
slice.lastUsed = atomic.LoadInt64(&ms.timeTick)
return file, nil
}
}
return nil, fmt.Errorf("could not find file slice for store %q, message seq: %v", ms.subject, seq)
}
// backgroundTasks performs some background tasks related to this
// messages store.
func (ms *FileMsgStore) backgroundTasks() {
defer ms.allDone.Done()
ms.RLock()
hasBuffer := ms.bw != nil
maxAge := int64(ms.limits.MaxAge)
nextExpiration := ms.expiration
lastCacheCheck := ms.timeTick
lastBufShrink := ms.timeTick
ms.RUnlock()
for {
// Update time
timeTick := time.Now().UnixNano()
atomic.StoreInt64(&ms.timeTick, timeTick)
// Close unused file slices
if atomic.LoadInt64(&ms.checkSlices) == 1 {
ms.Lock()
opened := 0
for _, slice := range ms.files {
if slice.file != nil {
opened++
if slice.lastUsed < timeTick && time.Duration(timeTick-slice.lastUsed) >= time.Second {
slice.file.Close()
slice.file = nil
opened--
}
}
}
if opened == 0 {
// We can update this without atomic since we are under store lock
// and this go routine is the only place where we check the value.
ms.checkSlices = 0
}
ms.Unlock()
}
// Shrink the buffer if applicable
if hasBuffer && time.Duration(timeTick-lastBufShrink) >= bufShrinkInterval {
ms.Lock()
ms.writer, _ = ms.bw.tryShrinkBuffer(ms.file)
ms.Unlock()
lastBufShrink = timeTick
}
// Check for expiration
if maxAge > 0 && nextExpiration > 0 && timeTick >= nextExpiration {
ms.Lock()
// Expire messages
nextExpiration = ms.expireMsgs(timeTick, maxAge)
ms.Unlock()
}
// Check for message caching
if timeTick >= lastCacheCheck+cacheTTL {
tryEvict := atomic.LoadInt32(&ms.cache.tryEvict)
if tryEvict == 1 {
ms.Lock()
// Possibly remove some/all cached messages
ms.evictFromCache(timeTick)
ms.Unlock()
}
lastCacheCheck = timeTick
}
select {
case <-ms.bkgTasksDone:
return
case <-ms.bkgTasksWake:
// wake up from a possible sleep to run the loop
ms.RLock()
nextExpiration = ms.expiration
ms.RUnlock()
case <-time.After(bkgTasksSleepDuration):
// go back to top of for loop.
}
}
}
// lookup returns the message for the given sequence number, possibly
// reading the message from disk.
// Store write lock is assumed to be held on entry
func (ms *FileMsgStore) lookup(seq uint64) *pb.MsgProto {
var msg *pb.MsgProto
m := ms.msgs[seq]
if m != nil {
msg = ms.getFromCache(seq)
if msg == nil && ms.bufferedMsgs != nil {
// Possibly in bufferedMsgs
bm := ms.bufferedMsgs[seq]
if bm != nil {
msg = bm.msg
ms.addToCache(seq, msg, false)
}
}
if msg == nil {
var msgSize int
// Look in which file slice the message is located.
file, err := ms.getFileForSeq(seq)
if err != nil {
return nil
}
			// Position the file at the message's offset (the whence value 0
			// means relative to the start of the file).
if _, err := file.Seek(m.offset, 0); err != nil {
return nil
}
ms.tmpMsgBuf, msgSize, _, err = readRecord(file, ms.tmpMsgBuf, false, ms.fstore.crcTable, ms.fstore.opts.DoCRC)
if err != nil {
return nil
}
// Recover this message
msg = &pb.MsgProto{}
err = msg.Unmarshal(ms.tmpMsgBuf[:msgSize])
if err != nil {
return nil
}
ms.addToCache(seq, msg, false)
}
}
return msg
}
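// Note that lookup swallows I/O, CRC and unmarshalling errors: any failure
// while reading the message back from disk simply results in a nil (message
// not found) return for the caller.
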
// Lookup returns the stored message with given sequence number.
func (ms *FileMsgStore) Lookup(seq uint64) *pb.MsgProto {
ms.Lock()
msg := ms.lookup(seq)
ms.Unlock()
return msg
}
// FirstMsg returns the first message stored.
func (ms *FileMsgStore) FirstMsg() *pb.MsgProto {
	// lookup() may read from disk and update the cache, so the write
	// lock is required here.
	ms.Lock()
	if ms.firstMsg == nil {
		ms.firstMsg = ms.lookup(ms.first)
	}
	m := ms.firstMsg
	ms.Unlock()
	return m
}
// LastMsg returns the last message stored.
func (ms *FileMsgStore) LastMsg() *pb.MsgProto {
	// As in FirstMsg, lookup() requires the write lock.
	ms.Lock()
	if ms.lastMsg == nil {
		ms.lastMsg = ms.lookup(ms.last)
	}
	m := ms.lastMsg
	ms.Unlock()
	return m
}
// GetSequenceFromTimestamp returns the sequence of the first message whose
// timestamp is greater than or equal to the given timestamp.
func (ms *FileMsgStore) GetSequenceFromTimestamp(timestamp int64) uint64 {
ms.RLock()
defer ms.RUnlock()
	index := sort.Search(len(ms.msgs), func(i int) bool {
		return ms.msgs[uint64(i)+ms.first].timestamp >= timestamp
	})
return uint64(index) + ms.first
}
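// Illustrative example: with ms.first = 5 and stored timestamps
// {5: 10, 6: 20, 7: 30}, GetSequenceFromTimestamp(20) returns 6, and
// GetSequenceFromTimestamp(31) returns 8 (one past ms.last) since
// sort.Search returns len(ms.msgs) when no element satisfies the predicate.
// The binary search assumes sequences are contiguous starting at ms.first
// and that timestamps are non-decreasing.
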
// initCache initializes the message cache
func (ms *FileMsgStore) initCache() {
ms.cache = &msgsCache{
seqMaps: make(map[uint64]*cachedMsg),
}
}
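// The cache is a doubly-linked list ordered from oldest (head) to most
// recently added or accessed (tail), plus a map from sequence number to node
// for O(1) lookups: addToCache appends at the tail, getFromCache moves a hit
// back to the tail, and evictFromCache trims expired entries from the head.
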
// addToCache adds a message to the cache.
// Store write lock is assumed held on entry
func (ms *FileMsgStore) addToCache(seq uint64, msg *pb.MsgProto, isNew bool) {
c := ms.cache
exp := cacheTTL
if isNew {
exp += msg.Timestamp
} else {
exp += time.Now().UnixNano()
}
cMsg := &cachedMsg{
expiration: exp,
msg: msg,
}
if c.tail == nil {
c.head = cMsg
} else {
c.tail.next = cMsg
}
cMsg.prev = c.tail
c.tail = cMsg
c.seqMaps[seq] = cMsg
if len(c.seqMaps) == 1 {
atomic.StoreInt32(&c.tryEvict, 1)
}
}
// getFromCache returns a message if available in the cache.
// Store write lock is assumed held on entry
func (ms *FileMsgStore) getFromCache(seq uint64) *pb.MsgProto {
c := ms.cache
cMsg := c.seqMaps[seq]
if cMsg == nil {
return nil
}
if cMsg != c.tail {
if cMsg.prev != nil {
cMsg.prev.next = cMsg.next
}
if cMsg.next != nil {
cMsg.next.prev = cMsg.prev
}
if cMsg == c.head {
c.head = cMsg.next
}
cMsg.prev = c.tail
cMsg.next = nil
c.tail = cMsg
}
cMsg.expiration = time.Now().UnixNano() + cacheTTL
return cMsg.msg
}
// evictFromCache walks the cache from oldest to newest and evicts expired
// messages. If even the most recently added message has expired, the whole
// cache is cleared at once.
// Store write lock is assumed held on entry
func (ms *FileMsgStore) evictFromCache(now int64) {
c := ms.cache
if now >= c.tail.expiration {
// Bulk remove
c.seqMaps = make(map[uint64]*cachedMsg)
c.head, c.tail, c.tryEvict = nil, nil, 0
return
}
cMsg := c.head
for cMsg != nil && cMsg.expiration <= now {
delete(c.seqMaps, cMsg.msg.Sequence)
cMsg = cMsg.next
}
if cMsg != c.head {
// There should be at least one left, otherwise, they
// would all have been bulk removed at top of this function.
cMsg.prev = nil
c.head = cMsg
}
}
// Close closes the store.
func (ms *FileMsgStore) Close() error {
ms.Lock()
if ms.closed {
ms.Unlock()
return nil
}
ms.closed = true
var err error
// Close file slices that may have been opened due to
// message lookups.
for _, slice := range ms.files {
if slice.file != nil {
if lerr := slice.file.Close(); lerr != nil && err == nil {
err = lerr
}
}
}
// Flush and close current files
if ms.currSlice != nil {
if lerr := ms.closeDataAndIndexFiles(); lerr != nil && err == nil {
err = lerr
}
}
// Signal the background tasks go-routine to exit
ms.bkgTasksDone <- true
ms.Unlock()
// Wait on go routines/timers to finish
ms.allDone.Wait()
return err
}
func (ms *FileMsgStore) flush() error {
if ms.bw != nil && ms.bw.buf != nil && ms.bw.buf.Buffered() > 0 {
if err := ms.bw.buf.Flush(); err != nil {
return err
}
if err := ms.processBufferedMsgs(); err != nil {
return err
}
}
if ms.fstore.opts.DoSync {
if err := ms.file.Sync(); err != nil {
return err
}
if err := ms.idxFile.Sync(); err != nil {
return err
}
}
return nil
}
// Flush flushes outstanding data into the store.
func (ms *FileMsgStore) Flush() error {
ms.Lock()
err := ms.flush()
ms.Unlock()
return err
}
////////////////////////////////////////////////////////////////////////////
// FileSubStore methods
////////////////////////////////////////////////////////////////////////////
// newFileSubStore returns a new instance of a file SubStore.
func (fs *FileStore) newFileSubStore(channelDirName, channel string, doRecover bool) (*FileSubStore, error) {
ss := &FileSubStore{
rootDir: channelDirName,
subs: make(map[uint64]*subscription),
opts: &fs.opts,
crcTable: fs.crcTable,
}
// Defaults to the global limits
subStoreLimits := fs.limits.SubStoreLimits
// See if there is an override
thisChannelLimits, exists := fs.limits.PerChannel[channel]
if exists {
		// Use these channel-specific limits
subStoreLimits = thisChannelLimits.SubStoreLimits
}
ss.init(channel, &subStoreLimits)
// Convert the CompactInterval in time.Duration
ss.compactItvl = time.Duration(ss.opts.CompactInterval) * time.Second
var err error
fileName := filepath.Join(channelDirName, subsFileName)
ss.file, err = openFile(fileName)
if err != nil {
return nil, err
}
maxBufSize := ss.opts.BufferSize
// This needs to be done before the call to ss.setWriter()
if maxBufSize > 0 {
ss.bw = newBufferWriter(subBufMinShrinkSize, maxBufSize)
}
ss.setWriter()
if doRecover {
if err := ss.recoverSubscriptions(); err != nil {
ss.Close()
return nil, fmt.Errorf("unable to create subscription store for [%s]: %v", channel, err)
}
}
// Do not attempt to shrink unless the option is greater than the
// minimum shrinkable size.
if maxBufSize > subBufMinShrinkSize {
// Use lock to avoid RACE report between setting shrinkTimer and
// execution of the callback itself.
ss.Lock()
ss.allDone.Add(1)
ss.shrinkTimer = time.AfterFunc(bufShrinkInterval, ss.shrinkBuffer)
ss.Unlock()
}
return ss, nil
}
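// When a BufferSize is configured, writes go through a buffer writer that is
// expanded on demand (see writeRecord) and periodically shrunk by the
// shrinkBuffer timer callback; that timer is only armed when BufferSize is
// larger than subBufMinShrinkSize, so small buffers are simply left alone.
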
// setWriter sets the writer to either the file or the buffered writer
// (creating it if needed), based on the store options.
func (ss *FileSubStore) setWriter() {
ss.writer = ss.file
if ss.bw != nil {
ss.writer = ss.bw.createNewWriter(ss.file)
}
}
// shrinkBuffer is a timer callback that shrinks the buffer writer when possible
func (ss *FileSubStore) shrinkBuffer() {
ss.Lock()
defer ss.Unlock()
if ss.closed {
ss.allDone.Done()
return
}
	// If an error occurs, bufio remembers it and any subsequent write or
	// flush on that buffer will fail. We will get the error back at the
	// next "synchronous" operation, where we can report it to the user.
ss.writer, _ = ss.bw.tryShrinkBuffer(ss.file)
// Fire again
ss.shrinkTimer.Reset(bufShrinkInterval)
}
// recoverSubscriptions recovers subscriptions state for this store.
func (ss *FileSubStore) recoverSubscriptions() error {
var err error
var recType recordType
recSize := 0
// Create a buffered reader to speed-up recovery
br := bufio.NewReaderSize(ss.file, defaultBufSize)
for {
ss.tmpSubBuf, recSize, recType, err = readRecord(br, ss.tmpSubBuf, true, ss.crcTable, ss.opts.DoCRC)
if err != nil {
if err == io.EOF {
// We are done, reset err
err = nil
break
} else {
return err
}
}
ss.fileSize += int64(recSize + recordHeaderSize)
// Based on record type...
switch recType {
case subRecNew:
newSub := &spb.SubState{}
if err := newSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
return err
}
sub := &subscription{
sub: newSub,
seqnos: make(map[uint64]struct{}),
}
ss.subs[newSub.ID] = sub
// Keep track of the subscriptions count
ss.subsCount++
// Keep track of max subscription ID found.
if newSub.ID > ss.maxSubID {
ss.maxSubID = newSub.ID
}
ss.numRecs++
case subRecUpdate:
modifiedSub := &spb.SubState{}
if err := modifiedSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
return err
}
			// Check if the corresponding create record has been recovered.
sub, exists := ss.subs[modifiedSub.ID]
if exists {
sub.sub = modifiedSub
// An update means that the previous version is free space.
ss.delRecs++
} else {
sub := &subscription{
sub: modifiedSub,
seqnos: make(map[uint64]struct{}),
}
ss.subs[modifiedSub.ID] = sub
}
// Keep track of max subscription ID found.
if modifiedSub.ID > ss.maxSubID {
ss.maxSubID = modifiedSub.ID
}
ss.numRecs++
case subRecDel:
delSub := spb.SubStateDelete{}
if err := delSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
return err
}
if s, exists := ss.subs[delSub.ID]; exists {
delete(ss.subs, delSub.ID)
// Keep track of the subscriptions count
ss.subsCount--
				// Count the delete record, plus all non-ack'ed messages, as free space.
ss.delRecs++
ss.delRecs += len(s.seqnos)
}
// Keep track of max subscription ID found.
if delSub.ID > ss.maxSubID {
ss.maxSubID = delSub.ID
}
case subRecMsg:
updateSub := spb.SubStateUpdate{}
if err := updateSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
return err
}
if sub, exists := ss.subs[updateSub.ID]; exists {
seqno := updateSub.Seqno
				// The same seqno/ack can appear several times for the same sub
				// (see queue subscribers redelivery).
if seqno > sub.sub.LastSent {
sub.sub.LastSent = seqno
}
sub.seqnos[seqno] = struct{}{}
ss.numRecs++
}
case subRecAck:
updateSub := spb.SubStateUpdate{}
if err := updateSub.Unmarshal(ss.tmpSubBuf[:recSize]); err != nil {
return err
}
if sub, exists := ss.subs[updateSub.ID]; exists {
delete(sub.seqnos, updateSub.Seqno)
// A message is ack'ed
ss.delRecs++
}
default:
return fmt.Errorf("unexpected record type: %v", recType)
}
}
return nil
}
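// After recovery, numRecs counts the live "new", "update" and "msg" records,
// while delRecs counts records that are now free space (superseded updates,
// deletions and acks); these two counters feed the fragmentation computation
// in shouldCompact().
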
// CreateSub records a new subscription represented by SubState. On success,
// it returns an id that is used by the other methods.
func (ss *FileSubStore) CreateSub(sub *spb.SubState) error {
// Check if we can create the subscription (check limits and update
// subscription count)
ss.Lock()
defer ss.Unlock()
if err := ss.createSub(sub); err != nil {
return err
}
if err := ss.writeRecord(ss.writer, subRecNew, sub); err != nil {
return err
}
	// We need to store a copy of the passed sub since we can't hold a
	// reference to it.
csub := *sub
s := &subscription{sub: &csub, seqnos: make(map[uint64]struct{})}
ss.subs[sub.ID] = s
return nil
}
// UpdateSub updates a given subscription represented by SubState.
func (ss *FileSubStore) UpdateSub(sub *spb.SubState) error {
ss.Lock()
defer ss.Unlock()
if err := ss.writeRecord(ss.writer, subRecUpdate, sub); err != nil {
return err
}
	// We need to store a copy of the passed sub since we can't hold a
	// reference to it.
csub := *sub
s := ss.subs[sub.ID]
if s != nil {
s.sub = &csub
} else {
s := &subscription{sub: &csub, seqnos: make(map[uint64]struct{})}
ss.subs[sub.ID] = s
}
return nil
}
// DeleteSub invalidates this subscription.
func (ss *FileSubStore) DeleteSub(subid uint64) {
ss.Lock()
ss.delSub.ID = subid
ss.writeRecord(ss.writer, subRecDel, &ss.delSub)
if s, exists := ss.subs[subid]; exists {
delete(ss.subs, subid)
// writeRecord has already accounted for the count of the
// delete record. We add to this the number of pending messages
ss.delRecs += len(s.seqnos)
// Check if this triggers a need for compaction
if ss.shouldCompact() {
ss.compact()
}
}
ss.Unlock()
}
// shouldCompact returns a boolean indicating if we should compact.
// Lock is held by caller
func (ss *FileSubStore) shouldCompact() bool {
	// Global switch
if !ss.opts.CompactEnabled {
return false
}
	// If a minimum file size is configured, do not compact until the
	// subscriptions file is at least that size.
if ss.opts.CompactMinFileSize > 0 && ss.fileSize < ss.opts.CompactMinFileSize {
return false
}
// Check fragmentation
frag := 0
if ss.numRecs == 0 {
frag = 100
} else {
frag = ss.delRecs * 100 / ss.numRecs
}
if frag < ss.opts.CompactFragmentation {
return false
}
// Check that we don't compact too often
if time.Now().Sub(ss.compactTS) < ss.compactItvl {
return false
}
return true
}
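// Illustrative example: with numRecs = 200 and delRecs = 120, frag is 60%;
// with CompactEnabled set, the file at least CompactMinFileSize bytes, a
// CompactFragmentation of 50 and more than CompactInterval seconds since the
// previous compaction, the next DeleteSub or AckSeqPending that calls
// shouldCompact would then trigger compact().
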
// AddSeqPending adds the given message seqno to the given subscription.
func (ss *FileSubStore) AddSeqPending(subid, seqno uint64) error {
ss.Lock()
ss.updateSub.ID, ss.updateSub.Seqno = subid, seqno
if err := ss.writeRecord(ss.writer, subRecMsg, &ss.updateSub); err != nil {
ss.Unlock()
return err
}
s := ss.subs[subid]
if s != nil {
if seqno > s.sub.LastSent {
s.sub.LastSent = seqno
}
s.seqnos[seqno] = struct{}{}
}
ss.Unlock()
return nil
}
// AckSeqPending records that the given message seqno has been acknowledged
// by the given subscription.
func (ss *FileSubStore) AckSeqPending(subid, seqno uint64) error {
ss.Lock()
ss.updateSub.ID, ss.updateSub.Seqno = subid, seqno
if err := ss.writeRecord(ss.writer, subRecAck, &ss.updateSub); err != nil {
ss.Unlock()
return err
}
s := ss.subs[subid]
if s != nil {
delete(s.seqnos, seqno)
// Test if we should compact
if ss.shouldCompact() {
ss.compact()
}
}
ss.Unlock()
return nil
}
// compact rewrites all subscriptions to a temporary file, reducing the size
// since we get rid of deleted subscriptions and message sequences that have
// been acknowledged. On success, the subscriptions file is replaced by this
// temporary file.
// Lock is held by caller
func (ss *FileSubStore) compact() error {
tmpFile, err := getTempFile(ss.rootDir, "subs")
if err != nil {
return err
}
tmpBW := bufio.NewWriterSize(tmpFile, defaultBufSize)
// Save values in case of failed compaction
savedNumRecs := ss.numRecs
savedDelRecs := ss.delRecs
savedFileSize := ss.fileSize
// Cleanup in case of error during compact
defer func() {
if tmpFile != nil {
tmpFile.Close()
os.Remove(tmpFile.Name())
// Since we failed compaction, restore values
ss.numRecs = savedNumRecs
ss.delRecs = savedDelRecs
ss.fileSize = savedFileSize
}
}()
// Reset to 0 since writeRecord() is updating the values.
ss.numRecs = 0
ss.delRecs = 0
ss.fileSize = 0
for _, sub := range ss.subs {
err = ss.writeRecord(tmpBW, subRecNew, sub.sub)
if err != nil {
return err
}
ss.updateSub.ID = sub.sub.ID
for seqno := range sub.seqnos {
ss.updateSub.Seqno = seqno
err = ss.writeRecord(tmpBW, subRecMsg, &ss.updateSub)
if err != nil {
return err
}
}
}
// Flush and sync the temporary file
err = tmpBW.Flush()
if err != nil {
return err
}
err = tmpFile.Sync()
if err != nil {
return err
}
// Switch the temporary file with the original one.
ss.file, err = swapFiles(tmpFile, ss.file)
if err != nil {
return err
}
// Prevent cleanup on success
tmpFile = nil
// Set the file and create buffered writer if applicable
ss.setWriter()
// Update the timestamp of this last successful compact
ss.compactTS = time.Now()
return nil
}
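// The compaction strategy is copy-and-swap: live records are rewritten to a
// temporary file which is then flushed, synced and swapped with the
// subscriptions file via swapFiles, so a failure at any earlier point leaves
// the original file untouched and the deferred cleanup restores the record
// counters.
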
// writeRecord writes a record to the subscriptions file.
// Store's lock is held on entry.
func (ss *FileSubStore) writeRecord(w io.Writer, recType recordType, rec record) error {
var err error
totalSize := 0
recSize := rec.Size()
var bwBuf *bufio.Writer
if ss.bw != nil && w == ss.bw.buf {
bwBuf = ss.bw.buf
}
// If we are using the buffer writer on this call, and the buffer is
// not already at the max size...
if bwBuf != nil && ss.bw.bufSize != ss.opts.BufferSize {
// Check if record fits
required := recSize + recordHeaderSize
if required > bwBuf.Available() {
ss.writer, err = ss.bw.expand(ss.file, required)
if err != nil {
return err
}
// `w` is used in this function, so point it to the new buffer
bwBuf = ss.bw.buf
w = bwBuf
}
}
ss.tmpSubBuf, totalSize, err = writeRecord(w, ss.tmpSubBuf, recType, rec, recSize, ss.crcTable)
if err != nil {
return err
}
if bwBuf != nil && ss.bw.shrinkReq {
ss.bw.checkShrinkRequest()
}
// Indicate that we wrote something to the buffer/file
ss.activity = true
switch recType {
case subRecNew:
ss.numRecs++
case subRecMsg:
ss.numRecs++
case subRecAck:
// An ack makes the message record free space
ss.delRecs++
case subRecUpdate:
ss.numRecs++
// An update makes the old record free space
ss.delRecs++
case subRecDel:
ss.delRecs++
default:
panic(fmt.Errorf("Record type %v unknown", recType))
}
ss.fileSize += int64(totalSize)
return nil
}
func (ss *FileSubStore) flush() error {
// Skip this if nothing was written since the last flush
if !ss.activity {
return nil
}
// Reset this now
ss.activity = false
if ss.bw != nil && ss.bw.buf.Buffered() > 0 {
if err := ss.bw.buf.Flush(); err != nil {
return err
}
}
if ss.opts.DoSync {
return ss.file.Sync()
}
return nil
}
// Flush persists buffered operations to disk.
func (ss *FileSubStore) Flush() error {
ss.Lock()
err := ss.flush()
ss.Unlock()
return err
}
// Close closes this store
func (ss *FileSubStore) Close() error {
ss.Lock()
if ss.closed {
ss.Unlock()
return nil
}
ss.closed = true
var err error
if ss.file != nil {
err = ss.flush()
if lerr := ss.file.Close(); lerr != nil && err == nil {
err = lerr
}
}
if ss.shrinkTimer != nil {
if ss.shrinkTimer.Stop() {
// If we can stop, timer callback won't fire,
// so we need to decrement the wait group.
ss.allDone.Done()
}
}
ss.Unlock()
// Wait on timers/callbacks
ss.allDone.Wait()
return err
}
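// Note on the shrink timer: if Stop() reports that the timer could not be
// stopped, the shrinkBuffer callback is running (or about to run); it will
// observe ss.closed and call ss.allDone.Done() itself, so the WaitGroup that
// Close waits on stays balanced either way.
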