Merge pull request #420 from benbjohnson/logging

Logging
This commit is contained in:
Xiang Li
2014-01-08 21:36:52 -08:00
118 changed files with 6917 additions and 2682 deletions

View File

@@ -37,11 +37,11 @@ func (c *JoinCommand) CommandName() string {
}
// Join a server to the cluster
func (c *JoinCommand) Apply(server raft.Server) (interface{}, error) {
ps, _ := server.Context().(*PeerServer)
func (c *JoinCommand) Apply(context raft.Context) (interface{}, error) {
ps, _ := context.Server().Context().(*PeerServer)
b := make([]byte, 8)
binary.PutUvarint(b, server.CommitIndex())
binary.PutUvarint(b, context.CommitIndex())
// Make sure we're not getting a cached value from the registry.
ps.registry.Invalidate(c.Name)
@@ -54,14 +54,14 @@ func (c *JoinCommand) Apply(server raft.Server) (interface{}, error) {
// Check peer number in the cluster
if ps.registry.Count() == ps.MaxClusterSize {
log.Debug("Reject join request from ", c.Name)
return []byte{0}, etcdErr.NewError(etcdErr.EcodeNoMorePeer, "", server.CommitIndex())
return []byte{0}, etcdErr.NewError(etcdErr.EcodeNoMorePeer, "", context.CommitIndex())
}
// Add to shared peer registry.
ps.registry.Register(c.Name, c.RaftURL, c.EtcdURL)
// Add peer in raft
err := server.AddPeer(c.Name, "")
err := context.Server().AddPeer(c.Name, "")
// Add peer stats
if c.Name != ps.RaftServer().Name() {

View File

@@ -22,6 +22,8 @@ import (
const retryInterval = 10
const ThresholdMonitorTimeout = 5 * time.Second
type PeerServer struct {
raftServer raft.Server
server *Server
@@ -42,6 +44,9 @@ type PeerServer struct {
RetryTimes int
HeartbeatTimeout time.Duration
ElectionTimeout time.Duration
closeChan chan bool
timeoutThresholdChan chan interface{}
}
// TODO: find a good policy to do snapshot
@@ -83,6 +88,8 @@ func NewPeerServer(name string, path string, url string, bindAddr string, tlsCon
},
HeartbeatTimeout: defaultHeartbeatTimeout,
ElectionTimeout: defaultElectionTimeout,
timeoutThresholdChan: make(chan interface{}, 1),
}
// Create transporter for raft
@@ -95,6 +102,13 @@ func NewPeerServer(name string, path string, url string, bindAddr string, tlsCon
}
s.raftServer = raftServer
s.raftServer.AddEventListener(raft.StateChangeEventType, s.raftEventLogger)
s.raftServer.AddEventListener(raft.LeaderChangeEventType, s.raftEventLogger)
s.raftServer.AddEventListener(raft.TermChangeEventType, s.raftEventLogger)
s.raftServer.AddEventListener(raft.AddPeerEventType, s.raftEventLogger)
s.raftServer.AddEventListener(raft.RemovePeerEventType, s.raftEventLogger)
s.raftServer.AddEventListener(raft.HeartbeatTimeoutEventType, s.raftEventLogger)
s.raftServer.AddEventListener(raft.ElectionTimeoutThresholdEventType, s.raftEventLogger)
return s
}
@@ -143,7 +157,10 @@ func (s *PeerServer) ListenAndServe(snapshot bool, cluster []string) error {
log.Debugf("%s restart as a follower", s.name)
}
s.closeChan = make(chan bool)
go s.monitorSync()
go s.monitorTimeoutThreshold(s.closeChan)
// open the snapshot
if snapshot {
@@ -201,6 +218,10 @@ func (s *PeerServer) listenAndServeTLS(certFile, keyFile string) error {
// Stops the server.
func (s *PeerServer) Close() {
if s.closeChan != nil {
close(s.closeChan)
s.closeChan = nil
}
if s.listener != nil {
s.listener.Close()
s.listener = nil
@@ -429,6 +450,43 @@ func (s *PeerServer) PeerStats() []byte {
return nil
}
// raftEventLogger is the event listener that turns Raft server events into
// log output. Each recognised event type is written as an Info-level line
// prefixed with this server's name. Election timeout threshold events are not
// logged here; their value is handed to timeoutThresholdChan with a
// non-blocking send, so the event is dropped when the channel is full.
// Event types not listed in the switch are ignored.
func (s *PeerServer) raftEventLogger(event raft.Event) {
	// Replace missing values with a printable placeholder.
	cur, prev := event.Value(), event.PrevValue()
	if cur == nil {
		cur = "<nil>"
	}
	if prev == nil {
		prev = "<nil>"
	}

	switch kind := event.Type(); kind {
	case raft.StateChangeEventType:
		log.Infof("%s: state changed from '%v' to '%v'.", s.name, prev, cur)

	case raft.TermChangeEventType:
		log.Infof("%s: term #%v started.", s.name, cur)

	case raft.LeaderChangeEventType:
		log.Infof("%s: leader changed from '%v' to '%v'.", s.name, prev, cur)

	case raft.AddPeerEventType:
		log.Infof("%s: peer added: '%v'", s.name, cur)

	case raft.RemovePeerEventType:
		log.Infof("%s: peer removed: '%v'", s.name, cur)

	case raft.HeartbeatTimeoutEventType:
		// The event value is only usable when it is a *raft.Peer; fall
		// back to a placeholder name otherwise.
		peerName := "<unknown>"
		if p, isPeer := cur.(*raft.Peer); isPeer {
			peerName = p.Name
		}
		log.Infof("%s: warning: heartbeat timed out: '%v'", s.name, peerName)

	case raft.ElectionTimeoutThresholdEventType:
		// Never block the Raft event dispatcher: drop the event if the
		// channel cannot take it right now.
		select {
		case s.timeoutThresholdChan <- cur:
		default:
		}
	}
}
func (s *PeerServer) monitorSnapshot() {
for {
time.Sleep(s.snapConf.checkingInterval)
@@ -451,3 +509,18 @@ func (s *PeerServer) monitorSync() {
}
}
}
// monitorTimeoutThreshold rate-limits election timeout threshold warnings.
// It logs one value received from s.timeoutThresholdChan and then pauses for
// ThresholdMonitorTimeout before reading the next one, so a burst of events
// produces a single log line per interval.
//
// The loop exits as soon as closeChan is closed, including while it is
// pausing between log lines, so the goroutine does not outlive Close() by up
// to a full ThresholdMonitorTimeout.
func (s *PeerServer) monitorTimeoutThreshold(closeChan chan bool) {
	for {
		select {
		case value := <-s.timeoutThresholdChan:
			log.Infof("%s: warning: heartbeat near election timeout: %v", s.name, value)
		case <-closeChan:
			return
		}

		// Pause before handling the next event, but stay responsive to
		// shutdown instead of sleeping unconditionally.
		select {
		case <-time.After(ThresholdMonitorTimeout):
		case <-closeChan:
			return
		}
	}
}

View File

@@ -23,8 +23,8 @@ func (c *RemoveCommand) CommandName() string {
}
// Remove a server from the cluster
func (c *RemoveCommand) Apply(server raft.Server) (interface{}, error) {
ps, _ := server.Context().(*PeerServer)
func (c *RemoveCommand) Apply(context raft.Context) (interface{}, error) {
ps, _ := context.Server().Context().(*PeerServer)
// Remove node from the shared registry.
err := ps.registry.Unregister(c.Name)
@@ -38,21 +38,21 @@ func (c *RemoveCommand) Apply(server raft.Server) (interface{}, error) {
}
// Remove peer in raft
err = server.RemovePeer(c.Name)
err = context.Server().RemovePeer(c.Name)
if err != nil {
log.Debugf("Unable to remove peer: %s (%v)", c.Name, err)
return []byte{0}, err
}
if c.Name == server.Name() {
if c.Name == context.Server().Name() {
// The removed node is this node.
// If the node is not replaying previous log entries and it has sent out
// a join request during this start, then it has definitely received a
// new remove command and needs to be removed.
if server.CommitIndex() > ps.joinIndex && ps.joinIndex != 0 {
log.Debugf("server [%s] is removed", server.Name())
if context.CommitIndex() > ps.joinIndex && ps.joinIndex != 0 {
log.Debugf("server [%s] is removed", context.Server().Name())
os.Exit(0)
} else {
// else ignore remove
@@ -61,7 +61,7 @@ func (c *RemoveCommand) Apply(server raft.Server) (interface{}, error) {
}
b := make([]byte, 8)
binary.PutUvarint(b, server.CommitIndex())
binary.PutUvarint(b, context.CommitIndex())
return b, err
}