bolt/node.go

605 lines
16 KiB
Go
Raw Normal View History

2014-01-31 18:18:51 +00:00
package bolt
import (
"bytes"
"fmt"
2014-01-31 18:18:51 +00:00
"sort"
"unsafe"
)
// node represents an in-memory, deserialized page.
type node struct {
bucket *Bucket
isLeaf bool
unbalanced bool
spilled bool
key []byte
pgid pgid
parent *node
2014-06-03 19:21:28 +00:00
children nodes
inodes inodes
2014-01-31 18:18:51 +00:00
}
2014-05-02 19:59:23 +00:00
// root follows the parent links upward and returns the top-level node.
// A node without a parent is its own root.
func (n *node) root() *node {
	top := n
	for top.parent != nil {
		top = top.parent
	}
	return top
}
2014-02-08 05:55:25 +00:00
// minKeys returns the minimum number of inodes this node should have:
// two for a branch node, one for a leaf node.
func (n *node) minKeys() int {
	if !n.isLeaf {
		return 2
	}
	return 1
}
2014-01-31 18:18:51 +00:00
// size returns the size of the node after serialization: the page header
// plus, for every inode, one page element and the bytes of its key and value.
func (n *node) size() int {
	perElem := n.pageElementSize()
	total := pageHeaderSize
	for i := range n.inodes {
		total += perElem + len(n.inodes[i].key) + len(n.inodes[i].value)
	}
	return total
}
2014-01-31 18:18:51 +00:00
// sizeLessThan returns true if the node is less than a given size.
// This is an optimization to avoid calculating a large node when we only need
// to know if it fits inside a certain page size.
func (n *node) sizeLessThan(v int) bool {
sz, elsz := pageHeaderSize, n.pageElementSize()
for i := 0; i < len(n.inodes); i++ {
item := &n.inodes[i]
sz += elsz + len(item.key) + len(item.value)
if sz >= v {
return false
}
2014-01-31 18:18:51 +00:00
}
return true
2014-01-31 18:18:51 +00:00
}
// pageElementSize returns the per-element header size for this node:
// leafPageElementSize for leaf nodes, branchPageElementSize otherwise.
func (n *node) pageElementSize() int {
	if !n.isLeaf {
		return branchPageElementSize
	}
	return leafPageElementSize
}
2014-02-07 22:03:29 +00:00
// childAt returns the child node at a given index.
2014-02-08 05:55:25 +00:00
func (n *node) childAt(index int) *node {
if n.isLeaf {
panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
}
return n.bucket.node(n.inodes[index].pgid, n)
2014-02-07 22:03:29 +00:00
}
2014-02-08 05:55:25 +00:00
// childIndex returns the index of a given child node.
// It binary-searches the inodes for the first key that is not smaller than
// child.key, so the result equals len(n.inodes) when every key is smaller.
func (n *node) childIndex(child *node) int {
	atOrAfter := func(i int) bool {
		return bytes.Compare(n.inodes[i].key, child.key) >= 0
	}
	return sort.Search(len(n.inodes), atOrAfter)
}
// numChildren returns the number of children, i.e. the number of inodes
// currently held by the node.
func (n *node) numChildren() int {
	count := len(n.inodes)
	return count
}
// nextSibling returns the next node with the same parent.
func (n *node) nextSibling() *node {
if n.parent == nil {
return nil
}
index := n.parent.childIndex(n)
2014-02-09 22:52:19 +00:00
if index >= n.parent.numChildren()-1 {
2014-02-08 05:55:25 +00:00
return nil
}
return n.parent.childAt(index + 1)
}
// prevSibling returns the previous node with the same parent.
// It returns nil when n has no parent or is the parent's first child.
func (n *node) prevSibling() *node {
	parent := n.parent
	if parent == nil {
		return nil
	}
	if index := parent.childIndex(n); index != 0 {
		return parent.childAt(index - 1)
	}
	return nil
}
2014-01-31 18:18:51 +00:00
// put inserts a key/value.
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
if pgid >= n.bucket.tx.meta.pgid {
panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
} else if len(oldKey) <= 0 {
panic("put: zero-length old key")
} else if len(newKey) <= 0 {
panic("put: zero-length new key")
}
2014-05-08 14:10:14 +00:00
2014-01-31 18:18:51 +00:00
// Find insertion index.
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
// Add capacity and shift nodes if we don't have an exact match and need to insert.
exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
if !exact {
n.inodes = append(n.inodes, inode{})
copy(n.inodes[index+1:], n.inodes[index:])
}
inode := &n.inodes[index]
inode.flags = flags
2014-01-31 18:18:51 +00:00
inode.key = newKey
inode.value = value
inode.pgid = pgid
2014-05-08 14:10:14 +00:00
_assert(len(inode.key) > 0, "put: zero-length inode key")
2014-01-31 18:18:51 +00:00
}
2014-02-03 21:33:51 +00:00
// del removes a key from the node.
func (n *node) del(key []byte) {
// Find index of key.
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
// Exit if the key isn't found.
2014-02-16 22:43:35 +00:00
if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
2014-02-03 21:33:51 +00:00
return
}
// Delete inode from the node.
n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
2014-02-08 05:55:25 +00:00
// Mark the node as needing rebalancing.
n.unbalanced = true
2014-02-03 21:33:51 +00:00
}
2014-01-31 18:18:51 +00:00
// read initializes the node from a page.
func (n *node) read(p *page) {
n.pgid = p.id
2014-02-12 21:57:27 +00:00
n.isLeaf = ((p.flags & leafPageFlag) != 0)
2014-01-31 18:18:51 +00:00
n.inodes = make(inodes, int(p.count))
for i := 0; i < int(p.count); i++ {
inode := &n.inodes[i]
if n.isLeaf {
elem := p.leafPageElement(uint16(i))
inode.flags = elem.flags
2014-01-31 18:18:51 +00:00
inode.key = elem.key()
inode.value = elem.value()
} else {
elem := p.branchPageElement(uint16(i))
inode.pgid = elem.pgid
inode.key = elem.key()
}
2014-05-08 14:10:14 +00:00
_assert(len(inode.key) > 0, "read: zero-length inode key")
2014-01-31 18:18:51 +00:00
}
// Save first key so we can find the node in the parent when we spill.
if len(n.inodes) > 0 {
n.key = n.inodes[0].key
2014-05-08 14:10:14 +00:00
_assert(len(n.key) > 0, "read: zero-length node key")
2014-01-31 18:18:51 +00:00
} else {
n.key = nil
}
}
// write writes the items onto one or more pages.
func (n *node) write(p *page) {
// Initialize page.
if n.isLeaf {
2014-02-12 21:57:27 +00:00
p.flags |= leafPageFlag
2014-01-31 18:18:51 +00:00
} else {
2014-02-12 21:57:27 +00:00
p.flags |= branchPageFlag
2014-01-31 18:18:51 +00:00
}
if len(n.inodes) >= 0xFFFF {
panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
}
2014-01-31 18:18:51 +00:00
p.count = uint16(len(n.inodes))
// Stop here if there are no items to write.
if p.count == 0 {
return
}
2014-01-31 18:18:51 +00:00
// Loop over each item and write it to the page.
b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
for i, item := range n.inodes {
2014-05-08 14:10:14 +00:00
_assert(len(item.key) > 0, "write: zero-length inode key")
2014-01-31 18:18:51 +00:00
// Write the page element.
if n.isLeaf {
elem := p.leafPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
elem.flags = item.flags
2014-01-31 18:18:51 +00:00
elem.ksize = uint32(len(item.key))
elem.vsize = uint32(len(item.value))
} else {
elem := p.branchPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
elem.ksize = uint32(len(item.key))
elem.pgid = item.pgid
_assert(elem.pgid != p.id, "write: circular dependency occurred")
2014-01-31 18:18:51 +00:00
}
// If the length of key+value is larger than the max allocation size
// then we need to reallocate the byte array pointer.
//
// See: https://github.com/boltdb/bolt/pull/335
klen, vlen := len(item.key), len(item.value)
if len(b) < klen+vlen {
fix `slice bounds out of range`/maxAllocSize bugs when accessing the node data we used to use cast to *[maxAllocSize]byte, which breaks if we try to go across maxAllocSize boundary. This leads to occasional panics. Sample stacktrace: ``` panic: runtime error: slice bounds out of range goroutine 1 [running]: github.com/boltdb/bolt.(*node).write(0xc208010f50, 0xc27452a000) $GOPATH/src/github.com/boltdb/bolt/node.go:228 +0x5a5 github.com/boltdb/bolt.(*node).spill(0xc208010f50, 0x0, 0x0) $GOPATH/src/github.com/boltdb/bolt/node.go:364 +0x506 github.com/boltdb/bolt.(*node).spill(0xc208010700, 0x0, 0x0) $GOPATH/src/github.com/boltdb/bolt/node.go:336 +0x12d github.com/boltdb/bolt.(*node).spill(0xc208010620, 0x0, 0x0) $GOPATH/src/github.com/boltdb/bolt/node.go:336 +0x12d github.com/boltdb/bolt.(*Bucket).spill(0xc22b6ae880, 0x0, 0x0) $GOPATH/src/github.com/boltdb/bolt/bucket.go:535 +0x1c4 github.com/boltdb/bolt.(*Bucket).spill(0xc22b6ae840, 0x0, 0x0) $GOPATH/src/github.com/boltdb/bolt/bucket.go:502 +0xac2 github.com/boltdb/bolt.(*Bucket).spill(0xc22f4e2018, 0x0, 0x0) $GOPATH/src/github.com/boltdb/bolt/bucket.go:502 +0xac2 github.com/boltdb/bolt.(*Tx).Commit(0xc22f4e2000, 0x0, 0x0) $GOPATH/src/github.com/boltdb/bolt/tx.go:150 +0x1ee github.com/boltdb/bolt.(*DB).Update(0xc2080e4000, 0xc24d077508, 0x0, 0x0) $GOPATH/src/github.com/boltdb/bolt/db.go:483 +0x169 ``` It usually happens when working with large (50M/100M) values. One way to reproduce it is to change maxAllocSize in bolt_amd64.go to 70000 and run the tests. TestBucket_Put_Large crashes.
2015-03-26 23:47:24 +00:00
b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
}
2014-01-31 18:18:51 +00:00
// Write data for the element to the end of the page.
copy(b[0:], item.key)
b = b[klen:]
2014-01-31 18:18:51 +00:00
copy(b[0:], item.value)
b = b[vlen:]
2014-01-31 18:18:51 +00:00
}
2014-05-08 14:10:14 +00:00
// DEBUG ONLY: n.dump()
2014-01-31 18:18:51 +00:00
}
// split breaks up a node into multiple smaller nodes, if appropriate.
2014-05-02 19:59:23 +00:00
// This should only be called from the spill() function.
2014-01-31 18:18:51 +00:00
func (n *node) split(pageSize int) []*node {
var nodes []*node
node := n
for {
// Split node into two.
a, b := node.splitTwo(pageSize)
nodes = append(nodes, a)
// If we can't split then exit the loop.
if b == nil {
break
}
// Set node to b so it gets split on the next iteration.
node = b
}
return nodes
}
// splitTwo breaks up a node into two smaller nodes, if appropriate.
// This should only be called from the split() function.
func (n *node) splitTwo(pageSize int) (*node, *node) {
2014-01-31 18:18:51 +00:00
// Ignore the split if the page doesn't have at least enough nodes for
// two pages or if the nodes can fit in a single page.
if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
return n, nil
2014-01-31 18:18:51 +00:00
}
// Determine the threshold before starting a new node.
var fillPercent = n.bucket.FillPercent
if fillPercent < minFillPercent {
fillPercent = minFillPercent
} else if fillPercent > maxFillPercent {
fillPercent = maxFillPercent
}
threshold := int(float64(pageSize) * fillPercent)
2014-01-31 18:18:51 +00:00
// Determine split position and sizes of the two pages.
splitIndex, _ := n.splitIndex(threshold)
// Split node into two separate nodes.
// If there's no parent then we'll need to create one.
if n.parent == nil {
n.parent = &node{bucket: n.bucket, children: []*node{n}}
}
// Create a new node and add it to the parent.
next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
n.parent.children = append(n.parent.children, next)
// Split inodes across two nodes.
next.inodes = n.inodes[splitIndex:]
n.inodes = n.inodes[:splitIndex]
// Update the statistics.
n.bucket.tx.stats.Split++
return n, next
}
// splitIndex finds the position where a page will fill a given threshold.
// It returns the index as well as the size of the first page.
// This is only be called from split().
func (n *node) splitIndex(threshold int) (index, sz int) {
sz = pageHeaderSize
// Loop until we only have the minimum number of keys required for the second page.
for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
index = i
inode := n.inodes[i]
elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
// If we have at least the minimum number of keys and adding another
// node would put us over the threshold then exit and return.
if i >= minKeysPerPage && sz+elsize > threshold {
break
2014-01-31 18:18:51 +00:00
}
// Add the element size to the total size.
sz += elsize
2014-01-31 18:18:51 +00:00
}
return
2014-01-31 18:18:51 +00:00
}
2014-05-02 19:59:23 +00:00
// spill writes the nodes to dirty pages and splits nodes as it goes.
// Returns an error if dirty pages cannot be allocated.
func (n *node) spill() error {
var tx = n.bucket.tx
if n.spilled {
return nil
}
2014-05-02 19:59:23 +00:00
2014-06-03 19:21:28 +00:00
// Spill child nodes first. Child nodes can materialize sibling nodes in
// the case of split-merge so we cannot use a range loop. We have to check
// the children size on every loop iteration.
sort.Sort(n.children)
for i := 0; i < len(n.children); i++ {
if err := n.children[i].spill(); err != nil {
2014-05-02 19:59:23 +00:00
return err
}
}
2014-06-03 19:21:28 +00:00
// We no longer need the child list because it's only used for spill tracking.
n.children = nil
2014-05-02 19:59:23 +00:00
// Split nodes into appropriate sizes. The first node will always be n.
2014-05-02 19:59:23 +00:00
var nodes = n.split(tx.db.pageSize)
for _, node := range nodes {
2014-06-03 19:21:28 +00:00
// Add node's page to the freelist if it's not new.
if node.pgid > 0 {
2014-07-26 21:11:47 +00:00
tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
2014-06-03 19:21:28 +00:00
node.pgid = 0
}
2014-05-02 19:59:23 +00:00
// Allocate contiguous space for the node.
p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
2014-05-02 19:59:23 +00:00
if err != nil {
return err
}
// Write the node.
if p.id >= tx.meta.pgid {
panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
}
2014-05-02 19:59:23 +00:00
node.pgid = p.id
node.write(p)
node.spilled = true
2014-05-02 19:59:23 +00:00
// Insert into parent inodes.
if node.parent != nil {
var key = node.key
if key == nil {
key = node.inodes[0].key
}
node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
node.key = node.inodes[0].key
_assert(len(node.key) > 0, "spill: zero-length node key")
2014-05-02 19:59:23 +00:00
}
// Update the statistics.
tx.stats.Spill++
}
// If the root node split and created a new root then we need to spill that
// as well. We'll clear out the children to make sure it doesn't try to respill.
if n.parent != nil && n.parent.pgid == 0 {
n.children = nil
return n.parent.spill()
2014-05-02 19:59:23 +00:00
}
return nil
}
2014-02-08 05:55:25 +00:00
// rebalance attempts to combine the node with sibling nodes if the node fill
// size is below a threshold or if there are not enough keys.
func (n *node) rebalance() {
if !n.unbalanced {
return
}
n.unbalanced = false
// Update statistics.
n.bucket.tx.stats.Rebalance++
2014-02-08 05:55:25 +00:00
// Ignore if node is above threshold (25%) and has enough keys.
var threshold = n.bucket.tx.db.pageSize / 4
2014-02-08 05:55:25 +00:00
if n.size() > threshold && len(n.inodes) > n.minKeys() {
return
}
// Root node has special handling.
if n.parent == nil {
// If root node is a branch and only has one node then collapse it.
if !n.isLeaf && len(n.inodes) == 1 {
2014-05-02 19:59:23 +00:00
// Move root's child up.
2014-05-07 16:37:50 +00:00
child := n.bucket.node(n.inodes[0].pgid, n)
2014-02-08 05:55:25 +00:00
n.isLeaf = child.isLeaf
n.inodes = child.inodes[:]
2014-05-02 19:59:23 +00:00
n.children = child.children
2014-02-08 05:55:25 +00:00
// Reparent all child nodes being moved.
for _, inode := range n.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
2014-02-08 05:55:25 +00:00
child.parent = n
}
}
// Remove old child.
child.parent = nil
delete(n.bucket.nodes, child.pgid)
2014-03-28 06:07:05 +00:00
child.free()
2014-02-08 05:55:25 +00:00
}
return
}
2014-05-07 16:37:50 +00:00
// If node has no keys then just remove it.
if n.numChildren() == 0 {
n.parent.del(n.key)
n.parent.removeChild(n)
delete(n.bucket.nodes, n.pgid)
n.free()
n.parent.rebalance()
return
}
2014-02-08 05:55:25 +00:00
_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
// Destination node is right sibling if idx == 0, otherwise left sibling.
var target *node
var useNextSibling = (n.parent.childIndex(n) == 0)
if useNextSibling {
target = n.nextSibling()
} else {
target = n.prevSibling()
}
// If both this node and the target node are too small then merge them.
if useNextSibling {
// Reparent all child nodes being moved.
for _, inode := range target.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
2014-05-02 19:59:23 +00:00
child.parent.removeChild(child)
2014-02-08 05:55:25 +00:00
child.parent = n
2014-05-02 19:59:23 +00:00
child.parent.children = append(child.parent.children, child)
2014-02-08 05:55:25 +00:00
}
}
// Copy over inodes from target and remove target.
n.inodes = append(n.inodes, target.inodes...)
n.parent.del(target.key)
2014-05-02 19:59:23 +00:00
n.parent.removeChild(target)
delete(n.bucket.nodes, target.pgid)
2014-03-28 06:07:05 +00:00
target.free()
2014-02-08 05:55:25 +00:00
} else {
// Reparent all child nodes being moved.
for _, inode := range n.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
2014-05-02 19:59:23 +00:00
child.parent.removeChild(child)
2014-02-08 05:55:25 +00:00
child.parent = target
2014-05-02 19:59:23 +00:00
child.parent.children = append(child.parent.children, child)
2014-02-08 05:55:25 +00:00
}
}
// Copy over inodes to target and remove node.
target.inodes = append(target.inodes, n.inodes...)
n.parent.del(n.key)
2014-05-02 19:59:23 +00:00
n.parent.removeChild(n)
delete(n.bucket.nodes, n.pgid)
2014-03-28 06:07:05 +00:00
n.free()
2014-02-08 05:55:25 +00:00
}
// Either this node or the target node was deleted from the parent so rebalance it.
n.parent.rebalance()
}
2014-05-02 19:59:23 +00:00
// removeChild removes a node from the list of in-memory children.
// Only the first entry identical to target is dropped, and nothing happens
// when target is not in the list. This does not affect the inodes.
func (n *node) removeChild(target *node) {
	for i := 0; i < len(n.children); i++ {
		if n.children[i] != target {
			continue
		}
		n.children = append(n.children[:i], n.children[i+1:]...)
		return
	}
}
2014-02-11 19:16:12 +00:00
// dereference causes the node to copy all its inode key/value references to heap memory.
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
func (n *node) dereference() {
2014-05-08 14:10:14 +00:00
if n.key != nil {
key := make([]byte, len(n.key))
copy(key, n.key)
n.key = key
_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
}
2014-02-11 19:16:12 +00:00
2014-02-12 21:57:27 +00:00
for i := range n.inodes {
2014-02-11 19:16:12 +00:00
inode := &n.inodes[i]
key := make([]byte, len(inode.key))
copy(key, inode.key)
inode.key = key
2014-05-08 14:10:14 +00:00
_assert(len(inode.key) > 0, "dereference: zero-length inode key")
2014-02-11 19:16:12 +00:00
value := make([]byte, len(inode.value))
copy(value, inode.value)
inode.value = value
}
2014-05-08 14:10:14 +00:00
// Recursively dereference children.
for _, child := range n.children {
child.dereference()
}
// Update statistics.
n.bucket.tx.stats.NodeDeref++
2014-02-11 19:16:12 +00:00
}
2014-03-28 06:07:05 +00:00
// free adds the node's underlying page to the freelist.
func (n *node) free() {
if n.pgid != 0 {
2014-07-26 21:11:47 +00:00
n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
2014-05-02 19:59:23 +00:00
n.pgid = 0
2014-03-28 06:07:05 +00:00
}
}
2014-05-08 14:10:14 +00:00
// dump writes the contents of the node to STDERR for debugging purposes.
/*
func (n *node) dump() {
// Write node header.
var typ = "branch"
if n.isLeaf {
typ = "leaf"
}
2014-05-08 14:10:14 +00:00
warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
// Write out abbreviated version of each item.
for _, item := range n.inodes {
if n.isLeaf {
if item.flags&bucketLeafFlag != 0 {
bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
2014-05-08 14:10:14 +00:00
warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
} else {
2014-05-08 14:10:14 +00:00
warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
}
} else {
2014-05-08 14:10:14 +00:00
warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
}
}
2014-05-08 14:10:14 +00:00
warn("")
}
2014-05-08 14:10:14 +00:00
*/
2014-06-03 19:21:28 +00:00
// nodes is a list of nodes that implements sort.Interface, ordering the
// nodes by the key of their first inode. Every node in the list must
// therefore hold at least one inode when Less is called.
type nodes []*node

// Len returns the number of nodes in the list.
func (s nodes) Len() int {
	return len(s)
}

// Swap exchanges the nodes at positions i and j.
func (s nodes) Swap(i, j int) {
	tmp := s[i]
	s[i] = s[j]
	s[j] = tmp
}

// Less reports whether the first key of node i sorts before that of node j.
func (s nodes) Less(i, j int) bool {
	left, right := s[i].inodes[0].key, s[j].inodes[0].key
	return bytes.Compare(left, right) < 0
}
2014-01-31 18:18:51 +00:00
// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
type inode struct {
flags uint32
2014-01-31 18:18:51 +00:00
pgid pgid
key []byte
value []byte
}
type inodes []inode