package bolt import ( "bytes" "sort" "unsafe" ) // node represents an in-memory, deserialized page. type node struct { bucket *Bucket isLeaf bool unbalanced bool key []byte pgid pgid parent *node children []*node inodes inodes } // root returns the top-level node this node is attached to. func (n *node) root() *node { if n.parent == nil { return n } return n.parent.root() } // minKeys returns the minimum number of inodes this node should have. func (n *node) minKeys() int { if n.isLeaf { return 1 } return 2 } // size returns the size of the node after serialization. func (n *node) size() int { var elementSize = n.pageElementSize() var size = pageHeaderSize for _, item := range n.inodes { size += elementSize + len(item.key) + len(item.value) } return size } // pageElementSize returns the size of each page element based on the type of node. func (n *node) pageElementSize() int { if n.isLeaf { return leafPageElementSize } return branchPageElementSize } // childAt returns the child node at a given index. func (n *node) childAt(index int) *node { _assert(!n.isLeaf, "invalid childAt(%d) on a leaf node", index) return n.bucket.node(n.inodes[index].pgid, n) } // childIndex returns the index of a given child node. func (n *node) childIndex(child *node) int { index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) return index } // numChildren returns the number of children. func (n *node) numChildren() int { return len(n.inodes) } // nextSibling returns the next node with the same parent. func (n *node) nextSibling() *node { if n.parent == nil { return nil } index := n.parent.childIndex(n) if index >= n.parent.numChildren()-1 { return nil } return n.parent.childAt(index + 1) } // prevSibling returns the previous node with the same parent. func (n *node) prevSibling() *node { if n.parent == nil { return nil } index := n.parent.childIndex(n) if index == 0 { return nil } return n.parent.childAt(index - 1) } // put inserts a key/value. func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) { _assert(pgid < n.bucket.tx.meta.pgid, "pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid) _assert(len(oldKey) > 0, "put: zero-length old key") _assert(len(newKey) > 0, "put: zero-length new key") // Find insertion index. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) // Add capacity and shift nodes if we don't have an exact match and need to insert. exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) if !exact { n.inodes = append(n.inodes, inode{}) copy(n.inodes[index+1:], n.inodes[index:]) } inode := &n.inodes[index] inode.flags = flags inode.key = newKey inode.value = value inode.pgid = pgid _assert(len(inode.key) > 0, "put: zero-length inode key") } // del removes a key from the node. func (n *node) del(key []byte) { // Find index of key. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) // Exit if the key isn't found. if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { return } // Delete inode from the node. n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) // Mark the node as needing rebalancing. n.unbalanced = true } // read initializes the node from a page. func (n *node) read(p *page) { n.pgid = p.id n.isLeaf = ((p.flags & leafPageFlag) != 0) n.inodes = make(inodes, int(p.count)) for i := 0; i < int(p.count); i++ { inode := &n.inodes[i] if n.isLeaf { elem := p.leafPageElement(uint16(i)) inode.flags = elem.flags inode.key = elem.key() inode.value = elem.value() } else { elem := p.branchPageElement(uint16(i)) inode.pgid = elem.pgid inode.key = elem.key() } _assert(len(inode.key) > 0, "read: zero-length inode key") } // Save first key so we can find the node in the parent when we spill. if len(n.inodes) > 0 { n.key = n.inodes[0].key _assert(len(n.key) > 0, "read: zero-length node key") } else { n.key = nil } } // write writes the items onto one or more pages. func (n *node) write(p *page) { // Initialize page. if n.isLeaf { p.flags |= leafPageFlag } else { p.flags |= branchPageFlag } p.count = uint16(len(n.inodes)) // Loop over each item and write it to the page. b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):] for i, item := range n.inodes { _assert(len(item.key) > 0, "write: zero-length inode key") // Write the page element. if n.isLeaf { elem := p.leafPageElement(uint16(i)) elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) elem.flags = item.flags elem.ksize = uint32(len(item.key)) elem.vsize = uint32(len(item.value)) } else { elem := p.branchPageElement(uint16(i)) elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) elem.ksize = uint32(len(item.key)) elem.pgid = item.pgid } // Write data for the element to the end of the page. copy(b[0:], item.key) b = b[len(item.key):] copy(b[0:], item.value) b = b[len(item.value):] } // DEBUG ONLY: n.dump() } // split breaks up a node into smaller nodes, if appropriate. // This should only be called from the spill() function. func (n *node) split(pageSize int) []*node { var nodes = []*node{n} // Ignore the split if the page doesn't have at least enough nodes for // multiple pages or if the data can fit on a single page. if len(n.inodes) <= (minKeysPerPage*2) || n.size() < pageSize { return nodes } // Determine the threshold before starting a new node. var fillPercent = n.bucket.tx.db.FillPercent if fillPercent < minFillPercent { fillPercent = minFillPercent } else if fillPercent > maxFillPercent { fillPercent = maxFillPercent } threshold := int(float64(pageSize) * fillPercent) // Group into smaller pages and target a given fill size. size := pageHeaderSize internalNodes := n.inodes current := n current.inodes = nil // Loop over every inode and split once we reach our threshold. for i, inode := range internalNodes { elemSize := n.pageElementSize() + len(inode.key) + len(inode.value) // Split once we reach our threshold split size. However, this should // only be done if we have enough keys for this node and we will have // enough keys for the next node. if len(current.inodes) >= minKeysPerPage && i < len(internalNodes)-minKeysPerPage && size+elemSize > threshold { // If there's no parent then we need to create one. if n.parent == nil { n.parent = &node{bucket: n.bucket, children: []*node{n}} } // Create a new node and add it to the parent. current = &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} n.parent.children = append(n.parent.children, current) nodes = append(nodes, current) // Reset our running total back to zero (plus header size). size = pageHeaderSize // Update the statistics. n.bucket.tx.stats.Split++ } // Increase our running total of the size and append the inode. size += elemSize current.inodes = append(current.inodes, inode) } return nodes } // spill writes the nodes to dirty pages and splits nodes as it goes. // Returns an error if dirty pages cannot be allocated. func (n *node) spill() error { var tx = n.bucket.tx // Spill child nodes first. for _, child := range n.children { if err := child.spill(); err != nil { return err } } // Add node's page to the freelist if it's not new. if n.pgid > 0 { tx.db.freelist.free(tx.id(), tx.page(n.pgid)) n.pgid = 0 } // Spill nodes by deepest first. var nodes = n.split(tx.db.pageSize) for _, node := range nodes { // Allocate contiguous space for the node. p, err := tx.allocate((node.size() / tx.db.pageSize) + 1) if err != nil { return err } // Write the node. _assert(p.id < tx.meta.pgid, "pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid) node.pgid = p.id node.write(p) // Insert into parent inodes. if node.parent != nil { var key = node.key if key == nil { key = node.inodes[0].key } node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) node.key = node.inodes[0].key _assert(len(n.key) > 0, "spill: zero-length node key") } // Update the statistics. tx.stats.Spill++ } // This is a special case where we need to write the parent if it is new // and caused by a split in the root. var parent = n.parent if parent != nil && parent.pgid == 0 { // Allocate contiguous space for the node. p, err := tx.allocate((parent.size() / tx.db.pageSize) + 1) if err != nil { return err } // Write the new root. _assert(p.id < tx.meta.pgid, "pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid) parent.pgid = p.id parent.write(p) } return nil } // rebalance attempts to combine the node with sibling nodes if the node fill // size is below a threshold or if there are not enough keys. func (n *node) rebalance() { if !n.unbalanced { return } n.unbalanced = false // Update statistics. n.bucket.tx.stats.Rebalance++ // Ignore if node is above threshold (25%) and has enough keys. var threshold = n.bucket.tx.db.pageSize / 4 if n.size() > threshold && len(n.inodes) > n.minKeys() { return } // Root node has special handling. if n.parent == nil { // If root node is a branch and only has one node then collapse it. if !n.isLeaf && len(n.inodes) == 1 { // Move root's child up. child := n.bucket.node(n.inodes[0].pgid, n) n.isLeaf = child.isLeaf n.inodes = child.inodes[:] n.children = child.children // Reparent all child nodes being moved. for _, inode := range n.inodes { if child, ok := n.bucket.nodes[inode.pgid]; ok { child.parent = n } } // Remove old child. child.parent = nil delete(n.bucket.nodes, child.pgid) child.free() } return } // If node has no keys then just remove it. if n.numChildren() == 0 { n.parent.del(n.key) n.parent.removeChild(n) delete(n.bucket.nodes, n.pgid) n.free() n.parent.rebalance() return } _assert(n.parent.numChildren() > 1, "parent must have at least 2 children") // Destination node is right sibling if idx == 0, otherwise left sibling. var target *node var useNextSibling = (n.parent.childIndex(n) == 0) if useNextSibling { target = n.nextSibling() } else { target = n.prevSibling() } // If target node has extra nodes then just move one over. if target.numChildren() > target.minKeys() { if useNextSibling { // Reparent and move node. if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok { child.parent.removeChild(child) child.parent = n child.parent.children = append(child.parent.children, child) } n.inodes = append(n.inodes, target.inodes[0]) target.inodes = target.inodes[1:] // Update target key on parent. target.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0) target.key = target.inodes[0].key _assert(len(target.key) > 0, "rebalance(1): zero-length node key") } else { // Reparent and move node. if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok { child.parent.removeChild(child) child.parent = n child.parent.children = append(child.parent.children, child) } n.inodes = append(n.inodes, inode{}) copy(n.inodes[1:], n.inodes) n.inodes[0] = target.inodes[len(target.inodes)-1] target.inodes = target.inodes[:len(target.inodes)-1] } // Update parent key for node. n.parent.put(n.key, n.inodes[0].key, nil, n.pgid, 0) n.key = n.inodes[0].key _assert(len(n.key) > 0, "rebalance(2): zero-length node key") return } // If both this node and the target node are too small then merge them. if useNextSibling { // Reparent all child nodes being moved. for _, inode := range target.inodes { if child, ok := n.bucket.nodes[inode.pgid]; ok { child.parent.removeChild(child) child.parent = n child.parent.children = append(child.parent.children, child) } } // Copy over inodes from target and remove target. n.inodes = append(n.inodes, target.inodes...) n.parent.del(target.key) n.parent.removeChild(target) delete(n.bucket.nodes, target.pgid) target.free() } else { // Reparent all child nodes being moved. for _, inode := range n.inodes { if child, ok := n.bucket.nodes[inode.pgid]; ok { child.parent.removeChild(child) child.parent = target child.parent.children = append(child.parent.children, child) } } // Copy over inodes to target and remove node. target.inodes = append(target.inodes, n.inodes...) n.parent.del(n.key) n.parent.removeChild(n) n.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0) delete(n.bucket.nodes, n.pgid) n.free() } // Either this node or the target node was deleted from the parent so rebalance it. n.parent.rebalance() } // removes a node from the list of in-memory children. // This does not affect the inodes. func (n *node) removeChild(target *node) { for i, child := range n.children { if child == target { n.children = append(n.children[:i], n.children[i+1:]...) return } } } // dereference causes the node to copy all its inode key/value references to heap memory. // This is required when the mmap is reallocated so inodes are not pointing to stale data. func (n *node) dereference() { if n.key != nil { key := make([]byte, len(n.key)) copy(key, n.key) n.key = key _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") } for i := range n.inodes { inode := &n.inodes[i] key := make([]byte, len(inode.key)) copy(key, inode.key) inode.key = key _assert(len(inode.key) > 0, "dereference: zero-length inode key") value := make([]byte, len(inode.value)) copy(value, inode.value) inode.value = value } // Recursively dereference children. for _, child := range n.children { child.dereference() } // Update statistics. n.bucket.tx.stats.NodeDeref++ } // free adds the node's underlying page to the freelist. func (n *node) free() { if n.pgid != 0 { n.bucket.tx.db.freelist.free(n.bucket.tx.id(), n.bucket.tx.page(n.pgid)) n.pgid = 0 } } // dump writes the contents of the node to STDERR for debugging purposes. /* func (n *node) dump() { // Write node header. var typ = "branch" if n.isLeaf { typ = "leaf" } warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes)) // Write out abbreviated version of each item. for _, item := range n.inodes { if n.isLeaf { if item.flags&bucketLeafFlag != 0 { bucket := (*bucket)(unsafe.Pointer(&item.value[0])) warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root) } else { warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4)) } } else { warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid) } } warn("") } */ // inode represents an internal node inside of a node. // It can be used to point to elements in a page or point // to an element which hasn't been added to a page yet. type inode struct { flags uint32 pgid pgid key []byte value []byte } type inodes []inode