mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
wal: preallocate WAL files with initial size equal to segment size
Avoids having to update file size metadata during fdatasync on common path. Fixes #4755
This commit is contained in:
parent
aafe717f2f
commit
24b806d2ee
@ -28,30 +28,58 @@ import (
|
||||
)
|
||||
|
||||
type decoder struct {
|
||||
mu sync.Mutex
|
||||
br *bufio.Reader
|
||||
mu sync.Mutex
|
||||
brs []*bufio.Reader
|
||||
|
||||
crc hash.Hash32
|
||||
// lastValidOff file offset following the last valid decoded record
|
||||
lastValidOff int64
|
||||
crc hash.Hash32
|
||||
}
|
||||
|
||||
func newDecoder(r io.Reader) *decoder {
|
||||
func newDecoder(r ...io.Reader) *decoder {
|
||||
readers := make([]*bufio.Reader, len(r))
|
||||
for i := range r {
|
||||
readers[i] = bufio.NewReader(r[i])
|
||||
}
|
||||
return &decoder{
|
||||
br: bufio.NewReader(r),
|
||||
brs: readers,
|
||||
crc: crc.New(0, crcTable),
|
||||
}
|
||||
}
|
||||
|
||||
func (d *decoder) decode(rec *walpb.Record) error {
|
||||
rec.Reset()
|
||||
d.mu.Lock()
|
||||
defer d.mu.Unlock()
|
||||
return d.decodeRecord(rec)
|
||||
}
|
||||
|
||||
func (d *decoder) decodeRecord(rec *walpb.Record) error {
|
||||
if len(d.brs) == 0 {
|
||||
return io.EOF
|
||||
}
|
||||
|
||||
l, err := readInt64(d.brs[0])
|
||||
if err == io.EOF {
|
||||
d.brs = d.brs[1:]
|
||||
d.lastValidOff = 0
|
||||
return d.decodeRecord(rec)
|
||||
}
|
||||
|
||||
rec.Reset()
|
||||
l, err := readInt64(d.br)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if l == 0 {
|
||||
// hit preallocated space
|
||||
d.brs = d.brs[1:]
|
||||
if len(d.brs) == 0 {
|
||||
return io.EOF
|
||||
}
|
||||
d.lastValidOff = 0
|
||||
return d.decodeRecord(rec)
|
||||
}
|
||||
data := make([]byte, l)
|
||||
if _, err = io.ReadFull(d.br, data); err != nil {
|
||||
if _, err = io.ReadFull(d.brs[0], data); err != nil {
|
||||
// ReadFull returns io.EOF only if no bytes were read
|
||||
// the decoder should treat this as an ErrUnexpectedEOF instead.
|
||||
if err == io.EOF {
|
||||
@ -62,12 +90,17 @@ func (d *decoder) decode(rec *walpb.Record) error {
|
||||
if err := rec.Unmarshal(data); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// skip crc checking if the record type is crcType
|
||||
if rec.Type == crcType {
|
||||
return nil
|
||||
if rec.Type != crcType {
|
||||
d.crc.Write(rec.Data)
|
||||
if err := rec.Validate(d.crc.Sum32()); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
d.crc.Write(rec.Data)
|
||||
return rec.Validate(d.crc.Sum32())
|
||||
// record decoded as valid; point last valid offset to end of record
|
||||
d.lastValidOff += l + 8
|
||||
return nil
|
||||
}
|
||||
|
||||
func (d *decoder) updateCRC(prevCrc uint32) {
|
||||
@ -78,6 +111,8 @@ func (d *decoder) lastCRC() uint32 {
|
||||
return d.crc.Sum32()
|
||||
}
|
||||
|
||||
func (d *decoder) lastOffset() int64 { return d.lastValidOff }
|
||||
|
||||
func mustUnmarshalEntry(d []byte) raftpb.Entry {
|
||||
var e raftpb.Entry
|
||||
pbutil.MustUnmarshal(&e, d)
|
||||
|
@ -1,45 +0,0 @@
|
||||
// Copyright 2015 CoreOS, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package wal
|
||||
|
||||
import "io"
|
||||
|
||||
type multiReadCloser struct {
|
||||
closers []io.Closer
|
||||
reader io.Reader
|
||||
}
|
||||
|
||||
func (mc *multiReadCloser) Close() error {
|
||||
var err error
|
||||
for i := range mc.closers {
|
||||
err = mc.closers[i].Close()
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func (mc *multiReadCloser) Read(p []byte) (int, error) {
|
||||
return mc.reader.Read(p)
|
||||
}
|
||||
|
||||
func MultiReadCloser(readClosers ...io.ReadCloser) io.ReadCloser {
|
||||
cs := make([]io.Closer, len(readClosers))
|
||||
rs := make([]io.Reader, len(readClosers))
|
||||
for i := range readClosers {
|
||||
cs[i] = readClosers[i]
|
||||
rs[i] = readClosers[i]
|
||||
}
|
||||
r := io.MultiReader(rs...)
|
||||
return &multiReadCloser{cs, r}
|
||||
}
|
@ -55,6 +55,9 @@ func Repair(dirpath string) bool {
|
||||
continue
|
||||
case io.EOF:
|
||||
return true
|
||||
case ErrZeroTrailer:
|
||||
plog.Noticef("found zero trailer in %v", f.Name())
|
||||
fallthrough
|
||||
case io.ErrUnexpectedEOF:
|
||||
plog.Noticef("repairing %v", f.Name())
|
||||
bf, bferr := os.Create(f.Name() + ".broken")
|
||||
|
@ -44,6 +44,10 @@ func TestRepair(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
offset, err := w.tail().Seek(0, os.SEEK_CUR)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
w.Close()
|
||||
|
||||
// break the wal.
|
||||
@ -51,11 +55,7 @@ func TestRepair(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
offset, err := f.Seek(-4, os.SEEK_END)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
err = f.Truncate(offset)
|
||||
err = f.Truncate(offset - 4)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@ -67,7 +67,7 @@ func TestRepair(t *testing.T) {
|
||||
}
|
||||
_, _, _, err = w.ReadAll()
|
||||
if err != io.ErrUnexpectedEOF {
|
||||
t.Fatalf("err = %v, want %v", err, io.ErrUnexpectedEOF)
|
||||
t.Fatalf("err = %v, want error %v", err, io.ErrUnexpectedEOF)
|
||||
}
|
||||
w.Close()
|
||||
|
||||
|
73
wal/wal.go
73
wal/wal.go
@ -57,6 +57,7 @@ var (
|
||||
ErrCRCMismatch = errors.New("wal: crc mismatch")
|
||||
ErrSnapshotMismatch = errors.New("wal: snapshot mismatch")
|
||||
ErrSnapshotNotFound = errors.New("wal: snapshot not found")
|
||||
ErrZeroTrailer = errors.New("wal: zero trailer")
|
||||
crcTable = crc32.MakeTable(crc32.Castagnoli)
|
||||
)
|
||||
|
||||
@ -72,7 +73,7 @@ type WAL struct {
|
||||
|
||||
start walpb.Snapshot // snapshot to start reading
|
||||
decoder *decoder // decoder to decode records
|
||||
readClose io.Closer // closer for decode reader
|
||||
readClose func() error // closer for decode reader
|
||||
|
||||
mu sync.Mutex
|
||||
enti uint64 // index of the last entry saved to the wal
|
||||
@ -93,10 +94,16 @@ func Create(dirpath string, metadata []byte) (*WAL, error) {
|
||||
}
|
||||
|
||||
p := path.Join(dirpath, walName(0, 0))
|
||||
f, err := fileutil.LockFile(p, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0600)
|
||||
f, err := fileutil.LockFile(p, os.O_WRONLY|os.O_CREATE, 0600)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if _, err := f.Seek(0, os.SEEK_END); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := fileutil.Preallocate(f.File, segmentSizeBytes, true); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
w := &WAL{
|
||||
dir: dirpath,
|
||||
@ -149,13 +156,14 @@ func openAtIndex(dirpath string, snap walpb.Snapshot, write bool) (*WAL, error)
|
||||
|
||||
// open the wal files
|
||||
rcs := make([]io.ReadCloser, 0)
|
||||
rs := make([]io.Reader, 0)
|
||||
ls := make([]*fileutil.LockedFile, 0)
|
||||
for _, name := range names[nameIndex:] {
|
||||
p := path.Join(dirpath, name)
|
||||
if write {
|
||||
l, err := fileutil.TryLockFile(p, os.O_RDWR, 0600)
|
||||
if err != nil {
|
||||
MultiReadCloser(rcs...).Close()
|
||||
closeAll(rcs...)
|
||||
return nil, err
|
||||
}
|
||||
ls = append(ls, l)
|
||||
@ -163,37 +171,38 @@ func openAtIndex(dirpath string, snap walpb.Snapshot, write bool) (*WAL, error)
|
||||
} else {
|
||||
rf, err := os.OpenFile(p, os.O_RDONLY, 0600)
|
||||
if err != nil {
|
||||
closeAll(rcs...)
|
||||
return nil, err
|
||||
}
|
||||
ls = append(ls, nil)
|
||||
rcs = append(rcs, rf)
|
||||
}
|
||||
rs = append(rs, rcs[len(rcs)-1])
|
||||
}
|
||||
|
||||
rc := MultiReadCloser(rcs...)
|
||||
c := rc
|
||||
if write {
|
||||
// write reuses the file descriptors from read; don't close so
|
||||
// WAL can append without dropping the file lock
|
||||
c = nil
|
||||
}
|
||||
closer := func() error { return closeAll(rcs...) }
|
||||
|
||||
// create a WAL ready for reading
|
||||
w := &WAL{
|
||||
dir: dirpath,
|
||||
start: snap,
|
||||
decoder: newDecoder(rc),
|
||||
readClose: c,
|
||||
decoder: newDecoder(rs...),
|
||||
readClose: closer,
|
||||
locks: ls,
|
||||
}
|
||||
|
||||
if write {
|
||||
// write reuses the file descriptors from read; don't close so
|
||||
// WAL can append without dropping the file lock
|
||||
w.readClose = nil
|
||||
|
||||
if _, _, err := parseWalName(path.Base(w.tail().Name())); err != nil {
|
||||
rc.Close()
|
||||
closer()
|
||||
return nil, err
|
||||
}
|
||||
// don't resize file for preallocation in case tail is corrupted
|
||||
if err := fileutil.Preallocate(w.tail().File, segmentSizeBytes, false); err != nil {
|
||||
rc.Close()
|
||||
closer()
|
||||
plog.Errorf("failed to allocate space when creating new wal file (%v)", err)
|
||||
return nil, err
|
||||
}
|
||||
@ -261,6 +270,9 @@ func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.
|
||||
}
|
||||
}
|
||||
|
||||
if err == ErrZeroTrailer {
|
||||
err = io.EOF
|
||||
}
|
||||
switch w.tail() {
|
||||
case nil:
|
||||
// We do not have to read out all entries in read mode.
|
||||
@ -285,7 +297,7 @@ func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.
|
||||
|
||||
// close decoder, disable reading
|
||||
if w.readClose != nil {
|
||||
w.readClose.Close()
|
||||
w.readClose()
|
||||
w.readClose = nil
|
||||
}
|
||||
w.start = walpb.Snapshot{}
|
||||
@ -294,6 +306,7 @@ func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.
|
||||
|
||||
if w.tail() != nil {
|
||||
// create encoder (chain crc with the decoder), enable appending
|
||||
_, err = w.tail().Seek(w.decoder.lastOffset(), os.SEEK_SET)
|
||||
w.encoder = newEncoder(w.tail(), w.decoder.lastCRC())
|
||||
lastIndexSaved.Set(float64(w.enti))
|
||||
}
|
||||
@ -306,7 +319,14 @@ func (w *WAL) ReadAll() (metadata []byte, state raftpb.HardState, ents []raftpb.
|
||||
// cut first creates a temp wal file and writes necessary headers into it.
|
||||
// Then cut atomically rename temp wal file to a wal file.
|
||||
func (w *WAL) cut() error {
|
||||
// close old wal file
|
||||
// close old wal file; truncate to avoid wasting space if an early cut
|
||||
off, serr := w.tail().Seek(0, os.SEEK_CUR)
|
||||
if serr != nil {
|
||||
return serr
|
||||
}
|
||||
if err := w.tail().Truncate(off); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := w.sync(); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -342,15 +362,19 @@ func (w *WAL) cut() error {
|
||||
}
|
||||
newTail.Close()
|
||||
|
||||
if newTail, err = fileutil.LockFile(fpath, os.O_WRONLY|os.O_APPEND, 0600); err != nil {
|
||||
if newTail, err = fileutil.LockFile(fpath, os.O_WRONLY, 0600); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err = newTail.Seek(0, os.SEEK_END); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
w.locks[len(w.locks)-1] = newTail
|
||||
|
||||
prevCrc = w.encoder.crc.Sum32()
|
||||
w.encoder = newEncoder(w.tail(), prevCrc)
|
||||
|
||||
if err = fileutil.Preallocate(w.tail().File, segmentSizeBytes, false); err != nil {
|
||||
if err = fileutil.Preallocate(w.tail().File, segmentSizeBytes, true); err != nil {
|
||||
plog.Errorf("failed to allocate space when creating new wal file (%v)", err)
|
||||
return err
|
||||
}
|
||||
@ -478,11 +502,11 @@ func (w *WAL) Save(st raftpb.HardState, ents []raftpb.Entry) error {
|
||||
return err
|
||||
}
|
||||
|
||||
fstat, err := w.tail().Stat()
|
||||
curOff, err := w.tail().Seek(0, os.SEEK_CUR)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if fstat.Size() < segmentSizeBytes {
|
||||
if curOff < segmentSizeBytes {
|
||||
if mustSync {
|
||||
return w.sync()
|
||||
}
|
||||
@ -544,3 +568,12 @@ func mustSync(st, prevst raftpb.HardState, entsnum int) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func closeAll(rcs ...io.ReadCloser) error {
|
||||
for _, f := range rcs {
|
||||
if err := f.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ package wal
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path"
|
||||
@ -42,8 +43,19 @@ func TestNew(t *testing.T) {
|
||||
t.Errorf("name = %+v, want %+v", g, walName(0, 0))
|
||||
}
|
||||
defer w.Close()
|
||||
gd, err := ioutil.ReadFile(w.tail().Name())
|
||||
|
||||
// file is preallocated to segment size; only read data written by wal
|
||||
off, err := w.tail().Seek(0, os.SEEK_CUR)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
gd := make([]byte, off)
|
||||
f, err := os.Open(w.tail().Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
if _, err = io.ReadFull(f, gd); err != nil {
|
||||
t.Fatalf("err = %v, want nil", err)
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user