From ac2e3e43bfaf6727bf2c1cba3062f6e19bf93b9f Mon Sep 17 00:00:00 2001
From: Anthony Romano
Date: Mon, 16 May 2016 08:59:44 -0700
Subject: [PATCH 1/2] v3rpc: add sha trailer to snapshot

---
 etcdserver/api/v3rpc/maintenance.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/etcdserver/api/v3rpc/maintenance.go b/etcdserver/api/v3rpc/maintenance.go
index 993cb4193..20af20fc3 100644
--- a/etcdserver/api/v3rpc/maintenance.go
+++ b/etcdserver/api/v3rpc/maintenance.go
@@ -15,6 +15,7 @@
 package v3rpc
 
 import (
+	"crypto/sha256"
 	"io"
 
 	"github.com/coreos/etcd/etcdserver"
@@ -81,6 +82,8 @@ func (ms *maintenanceServer) Snapshot(sr *pb.SnapshotRequest, srv pb.Maintenance
 		pw.Close()
 	}()
 
+	// send file data
+	h := sha256.New()
 	br := int64(0)
 	buf := make([]byte, 32*1024)
 	sz := snap.Size()
@@ -97,6 +100,14 @@ func (ms *maintenanceServer) Snapshot(sr *pb.SnapshotRequest, srv pb.Maintenance
 		if err = srv.Send(resp); err != nil {
 			return togRPCError(err)
 		}
+		h.Write(buf[:n])
 	}
+
+	// send sha
+	sha := h.Sum(nil)
+	hresp := &pb.SnapshotResponse{RemainingBytes: 0, Blob: sha}
+	if err := srv.Send(hresp); err != nil {
+		return togRPCError(err)
+	}
 
 	return nil

From 798718c49ba86e8f03eb586456e01909a8969653 Mon Sep 17 00:00:00 2001
From: Anthony Romano
Date: Mon, 16 May 2016 09:41:54 -0700
Subject: [PATCH 2/2] etcdctl: verify snapshot hash on restore

Fixes #4097
---
 Documentation/op-guide/recovery.md        |  2 +
 e2e/ctl_v3_snapshot_test.go               | 32 +++++++++++++
 etcdctl/ctlv3/command/snapshot_command.go | 56 ++++++++++++++++++++++-
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/Documentation/op-guide/recovery.md b/Documentation/op-guide/recovery.md
index 278863060..25cb4b2d5 100644
--- a/Documentation/op-guide/recovery.md
+++ b/Documentation/op-guide/recovery.md
@@ -18,6 +18,8 @@ $ etcdctl --endpoints $ENDPOINT snapshot save snapshot.db
 
 To restore a cluster, all that is needed is a single snapshot "db" file. A cluster restore with `etcdctl snapshot restore` creates new etcd data directories; all members should restore using the same snapshot. Restoring overwrites some snapshot metadata (specifically, the member ID and cluster ID); the member loses its former identity. This metadata overwrite prevents the new member from inadvertently joining an existing cluster. Therefore, in order to start a cluster from a snapshot, the restore must start a new logical cluster.
 
+Snapshot integrity may be optionally verified at restore time. If the snapshot is taken with `etcdctl snapshot save`, it will have an integrity hash that is checked by `etcdctl snapshot restore`. If the snapshot is copied from the data directory, there is no integrity hash and the restore will only proceed when given `--skip-hash-check`.
+
 A restore initializes a new member of a new cluster, with a fresh cluster configuration using `etcd`'s cluster configuration flags, but preserves the contents of the etcd keyspace. Continuing from the previous example, the following creates new etcd data directories (`m1.etcd`, `m2.etcd`, `m3.etcd`) for a three member cluster:
 
 ```sh
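Taken together with the documentation change above, the server-side change in PATCH 1/2 means the integrity hash is not delivered out of band: it arrives as one final `SnapshotResponse` whose `Blob` is the sha256 sum, so a client that simply appends every received `Blob` to disk ends up with a snapshot file carrying a 32-byte trailer. Below is a minimal sketch of such a consumer, assuming the generated gRPC client in `etcdserverpb`; the `saveSnapshot` helper is hypothetical and not part of either patch.

```go
package main

import (
	"io"
	"os"

	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

// saveSnapshot drains the Snapshot stream into path. It deliberately writes
// every Blob it receives, including the final sha256 message, so the hash
// ends up as a 32-byte trailer that "snapshot restore" can verify later.
func saveSnapshot(conn *grpc.ClientConn, path string) error {
	stream, err := pb.NewMaintenanceClient(conn).Snapshot(context.Background(), &pb.SnapshotRequest{})
	if err != nil {
		return err
	}
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	for {
		resp, rerr := stream.Recv()
		if rerr == io.EOF {
			break // all data blobs and the trailing hash blob received
		}
		if rerr != nil {
			f.Close()
			return rerr
		}
		if _, werr := f.Write(resp.Blob); werr != nil {
			f.Close()
			return werr
		}
	}
	return f.Close()
}
```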
diff --git a/e2e/ctl_v3_snapshot_test.go b/e2e/ctl_v3_snapshot_test.go
index 986076e17..6b2bb379b 100644
--- a/e2e/ctl_v3_snapshot_test.go
+++ b/e2e/ctl_v3_snapshot_test.go
@@ -52,6 +52,38 @@ func snapshotTest(cx ctlCtx) {
 	}
 }
 
+func TestCtlV3SnapshotCorrupt(t *testing.T) { testCtl(t, snapshotCorruptTest) }
+
+func snapshotCorruptTest(cx ctlCtx) {
+	fpath := "test.snapshot"
+	defer os.RemoveAll(fpath)
+
+	if err := ctlV3SnapshotSave(cx, fpath); err != nil {
+		cx.t.Fatalf("snapshotCorruptTest ctlV3SnapshotSave error (%v)", err)
+	}
+
+	// corrupt file
+	f, oerr := os.OpenFile(fpath, os.O_WRONLY, 0)
+	if oerr != nil {
+		cx.t.Fatal(oerr)
+	}
+	if _, err := f.Write(make([]byte, 512)); err != nil {
+		cx.t.Fatal(err)
+	}
+	f.Close()
+
+	defer os.RemoveAll("snap.etcd")
+	serr := spawnWithExpect(
+		append(cx.PrefixArgs(), "snapshot", "restore",
+			"--data-dir", "snap.etcd",
+			fpath),
+		"expected sha256")
+
+	if serr != nil {
+		cx.t.Fatal(serr)
+	}
+}
+
 func ctlV3SnapshotSave(cx ctlCtx, fpath string) error {
 	cmdArgs := append(cx.PrefixArgs(), "snapshot", "save", fpath)
 	return spawnWithExpect(cmdArgs, fmt.Sprintf("Snapshot saved at %s", fpath))
diff --git a/etcdctl/ctlv3/command/snapshot_command.go b/etcdctl/ctlv3/command/snapshot_command.go
index 87713dfa6..3dc73189b 100644
--- a/etcdctl/ctlv3/command/snapshot_command.go
+++ b/etcdctl/ctlv3/command/snapshot_command.go
@@ -15,6 +15,7 @@
 package command
 
 import (
+	"crypto/sha256"
 	"encoding/binary"
 	"encoding/json"
 	"fmt"
@@ -22,6 +23,7 @@ import (
 	"io"
 	"os"
 	"path"
+	"reflect"
 	"strings"
 
 	"github.com/boltdb/bolt"
@@ -50,6 +52,7 @@ var (
 	restoreDataDir      string
 	restorePeerURLs     string
 	restoreName         string
+	skipHashCheck       bool
 )
 
 // NewSnapshotCommand returns the cobra command for "snapshot".
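The corruption test above zeroes the first 512 bytes of the snapshot body, which preserves the file size (and therefore the trailer detection in `makeDB` below) but changes the computed digest, so the restore must fail with "expected sha256". The check the test exercises can be mirrored by a small standalone verifier; the sketch below makes the same assumption as the patch (boltdb files are 512-byte aligned, so a hashed snapshot has `size % 512 == sha256.Size`), and `verifySnapshot` is a hypothetical helper, not part of the patch.

```go
package main

import (
	"bytes"
	"crypto/sha256"
	"fmt"
	"io"
	"os"
)

// verifySnapshot recomputes the sha256 of everything before the 32-byte
// trailer and compares it against the trailer itself, mirroring the
// restore-time check.
func verifySnapshot(path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		return err
	}
	if fi.Size()%512 != sha256.Size {
		return fmt.Errorf("snapshot has no integrity hash (copied from the data directory?)")
	}

	h := sha256.New()
	if _, err := io.CopyN(h, f, fi.Size()-sha256.Size); err != nil {
		return err
	}
	sha := make([]byte, sha256.Size)
	if _, err := io.ReadFull(f, sha); err != nil {
		return err
	}
	if !bytes.Equal(h.Sum(nil), sha) {
		return fmt.Errorf("expected sha256 %x, got %x", sha, h.Sum(nil))
	}
	return nil
}
```

One deliberate departure: the patch compares digests with `reflect.DeepEqual`, which works but is heavier than needed; `bytes.Equal` is the idiomatic comparison for raw digests and is what the sketch uses.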
@@ -94,6 +97,7 @@ func NewSnapshotRestoreCommand() *cobra.Command {
 	cmd.Flags().StringVar(&restoreClusterToken, "initial-cluster-token", "etcd-cluster", "Initial cluster token for the etcd cluster during restore bootstrap.")
 	cmd.Flags().StringVar(&restorePeerURLs, "initial-advertise-peer-urls", defaultInitialAdvertisePeerURLs, "List of this member's peer URLs to advertise to the rest of the cluster.")
 	cmd.Flags().StringVar(&restoreName, "name", defaultName, "Human-readable name for this member.")
+	cmd.Flags().BoolVar(&skipHashCheck, "skip-hash-check", false, "Ignore snapshot integrity hash value (required if copied from data directory).")
 
 	return cmd
 }
@@ -191,7 +195,7 @@ func initialClusterFromName(name string) string {
 	if name == "" {
 		n = defaultName
 	}
-	return fmt.Sprintf("%s=http://localhost:2380", n, n)
+	return fmt.Sprintf("%s=http://localhost:2380", n)
 }
 
 // makeWAL creates a WAL for the initial cluster
@@ -261,18 +265,65 @@ func makeDB(snapdir, dbfile string) {
 	}
 	defer f.Close()
 
+	// get snapshot integrity hash
+	if _, err := f.Seek(-sha256.Size, os.SEEK_END); err != nil {
+		ExitWithError(ExitIO, err)
+	}
+	sha := make([]byte, sha256.Size)
+	if _, err := f.Read(sha); err != nil {
+		ExitWithError(ExitIO, err)
+	}
+	if _, err := f.Seek(0, os.SEEK_SET); err != nil {
+		ExitWithError(ExitIO, err)
+	}
+
 	if err := os.MkdirAll(snapdir, 0755); err != nil {
 		ExitWithError(ExitIO, err)
 	}
 
 	dbpath := path.Join(snapdir, "db")
-	db, dberr := os.OpenFile(dbpath, os.O_WRONLY|os.O_CREATE, 0600)
+	db, dberr := os.OpenFile(dbpath, os.O_RDWR|os.O_CREATE, 0600)
 	if dberr != nil {
 		ExitWithError(ExitIO, dberr)
 	}
 	if _, err := io.Copy(db, f); err != nil {
 		ExitWithError(ExitIO, err)
 	}
+
+	// truncate away integrity hash, if any
+	off, serr := db.Seek(0, os.SEEK_END)
+	if serr != nil {
+		ExitWithError(ExitIO, serr)
+	}
+	hasHash := (off % 512) == sha256.Size
+	if hasHash {
+		if err := db.Truncate(off - sha256.Size); err != nil {
+			ExitWithError(ExitIO, err)
+		}
+	}
+
+	if !hasHash && !skipHashCheck {
+		err := fmt.Errorf("snapshot missing hash but --skip-hash-check=false")
+		ExitWithError(ExitBadArgs, err)
+	}
+
+	if hasHash && !skipHashCheck {
+		// check for match
+		if _, err := db.Seek(0, os.SEEK_SET); err != nil {
+			ExitWithError(ExitIO, err)
+		}
+		h := sha256.New()
+		if _, err := io.Copy(h, db); err != nil {
+			ExitWithError(ExitIO, err)
+		}
+		dbsha := h.Sum(nil)
+		if !reflect.DeepEqual(sha, dbsha) {
+			err := fmt.Errorf("expected sha256 %v, got %v", sha, dbsha)
+			ExitWithError(ExitInvalidInput, err)
+		}
+	}
+
+	// db hash is OK, can now modify DB so it can be part of a new cluster
 	db.Close()
 
 	// update consistentIndex so applies go through on etcdserver despite
@@ -285,6 +336,7 @@ func makeDB(snapdir, dbfile string) {
 		_, _, err := s.TxnDeleteRange(id, k, nil)
 		return err
 	}
+	// delete stored members from old cluster since using new members
 	btx.UnsafeForEach([]byte("members"), del)
 	btx.UnsafeForEach([]byte("members_removed"), del)
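For context on the final hunk: a restored member must not inherit the old cluster's membership, so the stored members are deleted before the new cluster configuration is written. The sketch below shows what clearing those buckets amounts to in plain `bolt` terms, assuming the same `members` and `members_removed` bucket names; `clearMembers` is a hypothetical illustration, and the patch itself instead routes each key through the mvcc store's `TxnDeleteRange` via the backend's `UnsafeForEach`.

```go
package main

import (
	"github.com/boltdb/bolt"
)

// clearMembers drops every key in the membership buckets so the restored
// member starts with a fresh cluster configuration.
func clearMembers(path string) error {
	db, err := bolt.Open(path, 0600, nil)
	if err != nil {
		return err
	}
	defer db.Close()

	return db.Update(func(tx *bolt.Tx) error {
		for _, name := range [][]byte{[]byte("members"), []byte("members_removed")} {
			b := tx.Bucket(name)
			if b == nil {
				continue
			}
			// collect keys first; deleting while iterating a cursor is fragile
			var keys [][]byte
			if err := b.ForEach(func(k, _ []byte) error {
				keys = append(keys, append([]byte(nil), k...))
				return nil
			}); err != nil {
				return err
			}
			for _, k := range keys {
				if err := b.Delete(k); err != nil {
					return err
				}
			}
		}
		return nil
	})
}
```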