bolt/page.go

package bolt

import (
	"bytes"
	"fmt"
	"os"
	"sort"
	"unsafe"
)

// pageHeaderSize is the size of the fixed page header: the offset of the ptr
// field, where page-type-specific data begins.
const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr))

// minKeysPerPage is the minimum number of keys allowed on a page.
const minKeysPerPage = 2

// branchPageElementSize and leafPageElementSize are the sizes of the
// per-element headers stored on branch and leaf pages.
const branchPageElementSize = int(unsafe.Sizeof(branchPageElement{}))
const leafPageElementSize = int(unsafe.Sizeof(leafPageElement{}))

// Page type flags stored in page.flags.
const (
	branchPageFlag   = 0x01
	leafPageFlag     = 0x02
	metaPageFlag     = 0x04
	freelistPageFlag = 0x10
)

// bucketLeafFlag marks a leaf element whose value is a nested bucket.
const (
	bucketLeafFlag = 0x01
)

// pgid is a page identifier.
type pgid uint64

// page is the in-memory representation of an on-disk page: a fixed header
// followed by page-type-specific data, which begins at ptr.
type page struct {
	id       pgid
	flags    uint16
	count    uint16
	overflow uint32
	ptr      uintptr
}

// typ returns a human-readable page type string used for debugging.
func (p *page) typ() string {
	if (p.flags & branchPageFlag) != 0 {
		return "branch"
	} else if (p.flags & leafPageFlag) != 0 {
		return "leaf"
	} else if (p.flags & metaPageFlag) != 0 {
		return "meta"
	} else if (p.flags & freelistPageFlag) != 0 {
		return "freelist"
	}
	return fmt.Sprintf("unknown<%02x>", p.flags)
}

// meta returns a pointer to the metadata section of the page.
func (p *page) meta() *meta {
	return (*meta)(unsafe.Pointer(&p.ptr))
}

// leafPageElement retrieves the leaf node by index.
func (p *page) leafPageElement(index uint16) *leafPageElement {
	return &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
}

// leafPageElements retrieves a list of leaf nodes.
func (p *page) leafPageElements() []leafPageElement {
	if p.count == 0 {
		return nil
	}
	// Bound the slice by count; slicing the full array type would yield a
	// slice far longer than the elements actually present on the page.
	return ((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[:p.count]
}

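// A minimal sketch of how leaf elements are consumed, in the spirit of
// node.read; walkLeaf is a hypothetical helper, not part of the package API.
// Elements must be read in place via leafPageElement: key and value compute
// their offsets from the element's own address, so calling them on a copied
// element would produce slices pointing at the copy rather than the page.
func walkLeaf(p *page, fn func(key, value []byte)) {
	for i := uint16(0); i < p.count; i++ {
		e := p.leafPageElement(i)
		fn(e.key(), e.value())
	}
}
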
// branchPageElement retrieves the branch node by index.
func (p *page) branchPageElement(index uint16) *branchPageElement {
	return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index]
}

// branchPageElements retrieves a list of branch nodes.
func (p *page) branchPageElements() []branchPageElement {
	if p.count == 0 {
		return nil
	}
	// Bound the slice by count, as in leafPageElements.
	return ((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[:p.count]
}

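// A minimal sketch of how a branch page routes a key lookup to a child page,
// in the spirit of the cursor's search; childFor is a hypothetical helper,
// not part of the package API, and assumes a non-empty branch page.
func childFor(p *page, key []byte) pgid {
	els := p.branchPageElements()
	// Index of the first element whose key sorts strictly after the target.
	i := sort.Search(len(els), func(i int) bool {
		return bytes.Compare(els[i].key(), key) > 0
	})
	// Step back to the last element whose key is <= the target.
	if i > 0 {
		i--
	}
	return els[i].pgid
}
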
// hexdump writes n bytes of the page to STDERR as hex output.
func (p *page) hexdump(n int) {
	buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n]
	fmt.Fprintf(os.Stderr, "%x\n", buf)
}

// pages implements sort.Interface, ordering pages by id.
type pages []*page

func (s pages) Len() int           { return len(s) }
func (s pages) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s pages) Less(i, j int) bool { return s[i].id < s[j].id }

// branchPageElement represents a node on a branch page.
type branchPageElement struct {
	pos   uint32
	ksize uint32
	pgid  pgid
}

// key returns a byte slice of the node key.
func (n *branchPageElement) key() []byte {
	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
	// Re-base the pointer at the key's own offset before slicing; slicing buf
	// directly panics when the key lies more than maxAllocSize bytes past n.
	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize]
}

// leafPageElement represents a node on a leaf page.
type leafPageElement struct {
	flags uint32
	pos   uint32
	ksize uint32
	vsize uint32
}

// key returns a byte slice of the node key.
func (n *leafPageElement) key() []byte {
	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize:n.ksize]
}

// value returns a byte slice of the node value.
func (n *leafPageElement) value() []byte {
	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos+n.ksize]))[:n.vsize:n.vsize]
}

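// On both element types, pos is the offset from the start of the element
// header itself to the key bytes, and the value (leaf pages only) follows the
// key immediately:
//
//	| element header | ... | key (ksize bytes) | value (vsize bytes) |
//	^ n                    ^ n + pos           ^ n + pos + ksize
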
// PageInfo represents human-readable information about a page.
type PageInfo struct {
	ID            int
	Type          string
	Count         int
	OverflowCount int
}

type pgids []pgid

func (s pgids) Len() int           { return len(s) }
func (s pgids) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s pgids) Less(i, j int) bool { return s[i] < s[j] }

// merge returns the sorted union of a and b.
func (a pgids) merge(b pgids) pgids {
	// Return the opposite slice if one is empty.
	if len(a) == 0 {
		return b
	}
	if len(b) == 0 {
		return a
	}
	merged := make(pgids, len(a)+len(b))
	mergepgids(merged, a, b)
	return merged
}

// mergepgids copies the sorted union of a and b into dst.
// If dst is too small, it panics.
func mergepgids(dst, a, b pgids) {
	if len(dst) < len(a)+len(b) {
		panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b)))
	}
	// Copy in the opposite slice if one is empty.
	if len(a) == 0 {
		copy(dst, b)
		return
	}
	if len(b) == 0 {
		copy(dst, a)
		return
	}

	// Merged will hold all elements from both lists.
	merged := dst[:0]

	// Assign lead to the slice with the lower starting value, follow to the higher value.
	lead, follow := a, b
	if b[0] < a[0] {
		lead, follow = b, a
	}

	// Continue while there are elements in the lead.
	for len(lead) > 0 {
		// Merge the largest prefix of lead that is ahead of follow[0].
		n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] })
		merged = append(merged, lead[:n]...)
		if n >= len(lead) {
			break
		}
		// Swap lead and follow.
		lead, follow = follow, lead[n:]
	}

	// Append what's left in follow. The result is discarded because dst's
	// backing array receives every element: cap(dst) >= len(a)+len(b), so
	// append never reallocates.
	_ = append(merged, follow...)
}

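// A minimal usage sketch; exampleMergepgids is a hypothetical helper, not
// part of the package API. freelist.write merges into space the transaction
// has already allocated, avoiding a throwaway slice; the caller only needs to
// provide a dst of length len(a)+len(b).
func exampleMergepgids() pgids {
	a := pgids{3, 5, 9}
	b := pgids{1, 4, 7, 12}
	dst := make(pgids, len(a)+len(b))
	mergepgids(dst, a, b)
	return dst // [1 3 4 5 7 9 12]
}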