author     Russ Cox <rsc@golang.org>    2019-01-25 12:01:53 -0500
committer  Russ Cox <rsc@golang.org>    2019-05-01 15:19:07 +0000
commit     45be3530a32dfe759d992f488a42e7495fcebd19 (patch)
tree       a3fc34ea5f01fe2d32155b6081fccf8fee440908 /src/index/suffixarray/suffixarray.go
parent     b098c0f467e5ce70b936381c439a0cbafc3316d3 (diff)
index/suffixarray: add 32-bit implementation
The original index/suffixarray used 32-bit ints on 64-bit machines, because
that's what 'int' meant in Go at the time. When we changed the meaning of int,
that doubled the space overhead of suffix arrays for all uses, even though the
vast majority of them describe less than 2 GB of text. The space overhead of a
suffix array compared to the text is not insignificant: for many uses there is
a big difference between 4X and 8X.

This CL adjusts names in qsufsort.go so that a global search and replace
s/32/64/g produces a working 64-bit implementation, and then it modifies
suffixarray.go to choose between the 32-bit and 64-bit implementations as
appropriate for the input size. The 64-bit implementation is generated by
'go generate'.

This CL also restructures the benchmarks to test different input sizes,
different input texts, and 32-bit vs 64-bit.

The serialized form uses varint-encoded numbers and is unchanged, so on-disk
suffix arrays written by older versions of Go will be readable by this
version, and vice versa.

The 32-bit version runs up to 17% faster than the 64-bit version on real
inputs, but more importantly it uses 50% less memory. I have a followup CL
that also implements a faster algorithm on top of these improvements, but
these are a good first step.

name                                    64-bit speed   32-bit speed    delta
New/text=opticks/size=100K/bits=*-12    4.44MB/s ± 0%  4.64MB/s ± 0%   +4.41%  (p=0.008 n=5+5)
New/text=opticks/size=500K/bits=*-12    3.70MB/s ± 1%  3.82MB/s ± 0%   +3.30%  (p=0.008 n=5+5)
New/text=go/size=100K/bits=*-12         4.40MB/s ± 0%  4.61MB/s ± 0%   +4.82%  (p=0.008 n=5+5)
New/text=go/size=500K/bits=*-12         3.66MB/s ± 0%  3.77MB/s ± 0%   +3.01%  (p=0.016 n=4+5)
New/text=go/size=1M/bits=*-12           3.29MB/s ± 0%  3.55MB/s ± 0%   +7.90%  (p=0.016 n=5+4)
New/text=go/size=5M/bits=*-12           2.25MB/s ± 1%  2.65MB/s ± 0%  +17.81%  (p=0.008 n=5+5)
New/text=go/size=10M/bits=*-12          1.82MB/s ± 0%  2.09MB/s ± 1%  +14.36%  (p=0.008 n=5+5)
New/text=go/size=50M/bits=*-12          1.35MB/s ± 0%  1.51MB/s ± 1%  +12.33%  (p=0.008 n=5+5)
New/text=zero/size=100K/bits=*-12       3.42MB/s ± 0%  3.32MB/s ± 0%   -2.74%  (p=0.000 n=5+4)
New/text=zero/size=500K/bits=*-12       3.00MB/s ± 1%  2.97MB/s ± 0%   -1.13%  (p=0.016 n=5+4)
New/text=zero/size=1M/bits=*-12         2.81MB/s ± 0%  2.78MB/s ± 2%     ~     (p=0.167 n=5+5)
New/text=zero/size=5M/bits=*-12         2.46MB/s ± 0%  2.53MB/s ± 0%   +3.18%  (p=0.008 n=5+5)
New/text=zero/size=10M/bits=*-12        2.35MB/s ± 0%  2.42MB/s ± 0%   +2.98%  (p=0.016 n=4+5)
New/text=zero/size=50M/bits=*-12        2.12MB/s ± 0%  2.18MB/s ± 0%   +3.02%  (p=0.008 n=5+5)
New/text=rand/size=100K/bits=*-12       6.98MB/s ± 0%  7.22MB/s ± 0%   +3.38%  (p=0.016 n=4+5)
New/text=rand/size=500K/bits=*-12       5.53MB/s ± 0%  5.64MB/s ± 0%   +1.92%  (p=0.008 n=5+5)
New/text=rand/size=1M/bits=*-12         4.62MB/s ± 1%  5.06MB/s ± 0%   +9.61%  (p=0.008 n=5+5)
New/text=rand/size=5M/bits=*-12         3.09MB/s ± 0%  3.43MB/s ± 0%  +10.94%  (p=0.016 n=4+5)
New/text=rand/size=10M/bits=*-12        2.68MB/s ± 0%  2.95MB/s ± 0%  +10.39%  (p=0.008 n=5+5)
New/text=rand/size=50M/bits=*-12        1.92MB/s ± 0%  2.06MB/s ± 1%   +7.41%  (p=0.008 n=5+5)
SaveRestore/bits=*-12                    243MB/s ± 1%   259MB/s ± 0%   +6.68%  (p=0.000 n=9+10)

name                                    64-bit alloc/op  32-bit alloc/op  delta
New/text=opticks/size=100K/bits=*-12    1.62MB ± 0%      0.81MB ± 0%      -50.00%  (p=0.000 n=5+4)
New/text=opticks/size=500K/bits=*-12    8.07MB ± 0%      4.04MB ± 0%      -49.89%  (p=0.008 n=5+5)
New/text=go/size=100K/bits=*-12         1.62MB ± 0%      0.81MB ± 0%      -50.00%  (p=0.008 n=5+5)
New/text=go/size=500K/bits=*-12         8.07MB ± 0%      4.04MB ± 0%      -49.89%  (p=0.029 n=4+4)
New/text=go/size=1M/bits=*-12           16.1MB ± 0%      8.1MB ± 0%       -49.95%  (p=0.008 n=5+5)
New/text=go/size=5M/bits=*-12           80.3MB ± 0%      40.2MB ± 0%        ~      (p=0.079 n=4+5)
New/text=go/size=10M/bits=*-12          160MB ± 0%       80MB ± 0%        -50.00%  (p=0.008 n=5+5)
New/text=go/size=50M/bits=*-12          805MB ± 0%       402MB ± 0%       -50.06%  (p=0.029 n=4+4)
New/text=zero/size=100K/bits=*-12       3.02MB ± 0%      1.46MB ± 0%        ~      (p=0.079 n=4+5)
New/text=zero/size=500K/bits=*-12       19.7MB ± 0%      8.7MB ± 0%       -55.98%  (p=0.008 n=5+5)
New/text=zero/size=1M/bits=*-12         39.0MB ± 0%      19.7MB ± 0%      -49.60%  (p=0.000 n=5+4)
New/text=zero/size=5M/bits=*-12         169MB ± 0%       85MB ± 0%        -49.46%  (p=0.029 n=4+4)
New/text=zero/size=10M/bits=*-12        333MB ± 0%       169MB ± 0%       -49.43%  (p=0.000 n=5+4)
New/text=zero/size=50M/bits=*-12        1.63GB ± 0%      0.74GB ± 0%      -54.61%  (p=0.008 n=5+5)
New/text=rand/size=100K/bits=*-12       1.61MB ± 0%      0.81MB ± 0%      -50.00%  (p=0.000 n=5+4)
New/text=rand/size=500K/bits=*-12       8.07MB ± 0%      4.04MB ± 0%      -49.89%  (p=0.000 n=5+4)
New/text=rand/size=1M/bits=*-12         16.1MB ± 0%      8.1MB ± 0%       -49.95%  (p=0.029 n=4+4)
New/text=rand/size=5M/bits=*-12         80.7MB ± 0%      40.3MB ± 0%      -50.06%  (p=0.008 n=5+5)
New/text=rand/size=10M/bits=*-12        161MB ± 0%       81MB ± 0%        -50.03%  (p=0.008 n=5+5)
New/text=rand/size=50M/bits=*-12        806MB ± 0%       403MB ± 0%       -50.00%  (p=0.016 n=4+5)
SaveRestore/bits=*-12                   9.47MB ± 0%      5.28MB ± 0%      -44.29%  (p=0.000 n=9+8)

https://perf.golang.org/search?q=upload:20190126.1+|+bits:64+vs+bits:32

Fixes #6816.

Change-Id: Ied2fbea519a202ecc43719debcd233344ce38847
Reviewed-on: https://go-review.googlesource.com/c/go/+/174097
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
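To see the compatibility claim in action, an index written through the public API round-trips regardless of which internal representation produced it, since only varint-encoded numbers are serialized. A minimal usage sketch (not code from this CL):

package main

import (
	"bytes"
	"fmt"
	"index/suffixarray"
)

func main() {
	// Build an index over a small text; with len(data) below the 2 GB
	// threshold the new code uses the 32-bit representation internally.
	data := []byte("the quick brown fox jumps over the lazy dog")
	idx := suffixarray.New(data)

	// Serialize and restore. The on-disk varint format is unchanged,
	// so indexes written by older Go versions read back the same way.
	var buf bytes.Buffer
	if err := idx.Write(&buf); err != nil {
		panic(err)
	}
	restored := new(suffixarray.Index)
	if err := restored.Read(&buf); err != nil {
		panic(err)
	}

	// Lookup behaves identically on the restored index.
	fmt.Println(restored.Lookup([]byte("the"), -1)) // e.g. [31 0] (order unspecified)
}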
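The benchmark names above (e.g. New/text=go/size=1M/bits=*-12) follow Go's sub-benchmark convention, with benchstat collapsing the bits=32 and bits=64 variants into bits=*. The following is only a rough sketch of how such names can be produced with testing.B.Run; the fakeCorpus helper and the size list are hypothetical, not the CL's benchmark code:

package suffixarray_test

import (
	"fmt"
	"index/suffixarray"
	"testing"
)

// fakeCorpus stands in for the real input texts ("opticks", "go",
// "zero", "rand"); it just repeats the corpus name to the given size.
func fakeCorpus(name string, size int) []byte {
	data := make([]byte, size)
	for i := range data {
		data[i] = name[i%len(name)]
	}
	return data
}

func BenchmarkNew(b *testing.B) {
	sizes := []struct {
		label string
		n     int
	}{{"100K", 100 << 10}, {"500K", 500 << 10}, {"1M", 1 << 20}}

	for _, text := range []string{"opticks", "go", "zero", "rand"} {
		for _, size := range sizes {
			b.Run(fmt.Sprintf("text=%s/size=%s", text, size.label), func(b *testing.B) {
				data := fakeCorpus(text, size.n)
				b.SetBytes(int64(size.n))
				b.ResetTimer()
				for i := 0; i < b.N; i++ {
					suffixarray.New(data)
				}
			})
		}
	}
}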
Diffstat (limited to 'src/index/suffixarray/suffixarray.go')
-rw-r--r--  src/index/suffixarray/suffixarray.go  132
1 file changed, 104 insertions, 28 deletions
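The on-disk compatibility falls out of writeSlice in the diff below: every element is widened to uint64 before binary.PutUvarint, so an offset encodes to the same bytes whether it is held in an []int32 or an []int64. A standalone sketch of that property (example values assumed, not code from the CL):

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

func main() {
	// The same offset, held in an int32 or an int64, is widened to
	// uint64 before encoding, so the varint bytes on disk are identical.
	var off32 int32 = 123456
	var off64 int64 = 123456

	a := make([]byte, binary.MaxVarintLen64)
	b := make([]byte, binary.MaxVarintLen64)
	n := binary.PutUvarint(a, uint64(off32))
	m := binary.PutUvarint(b, uint64(off64))

	fmt.Println(bytes.Equal(a[:n], b[:m])) // true

	// Decoding recovers the same value regardless of which width wrote it.
	v, _ := binary.Uvarint(a[:n])
	fmt.Println(v) // 123456
}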
diff --git a/src/index/suffixarray/suffixarray.go b/src/index/suffixarray/suffixarray.go
index 0961ac4fb2..339643db4d 100644
--- a/src/index/suffixarray/suffixarray.go
+++ b/src/index/suffixarray/suffixarray.go
@@ -19,21 +19,68 @@ package suffixarray
import (
"bytes"
"encoding/binary"
+ "errors"
"io"
+ "math"
"regexp"
"sort"
)
+// Can change for testing
+var maxData32 int = realMaxData32
+
+const realMaxData32 = math.MaxInt32
+
// Index implements a suffix array for fast substring search.
type Index struct {
data []byte
- sa []int // suffix array for data; len(sa) == len(data)
+ sa ints // suffix array for data; sa.len() == len(data)
+}
+
+// An ints is either an []int32 or an []int64.
+// That is, one of them is empty, and one is the real data.
+// The int64 form is used when len(data) > maxData32
+type ints struct {
+ int32 []int32
+ int64 []int64
+}
+
+func (a *ints) len() int {
+ return len(a.int32) + len(a.int64)
+}
+
+func (a *ints) get(i int) int64 {
+ if a.int32 != nil {
+ return int64(a.int32[i])
+ }
+ return a.int64[i]
+}
+
+func (a *ints) set(i int, v int64) {
+ if a.int32 != nil {
+ a.int32[i] = int32(v)
+ } else {
+ a.int64[i] = v
+ }
+}
+
+func (a *ints) slice(i, j int) ints {
+ if a.int32 != nil {
+ return ints{a.int32[i:j], nil}
+ }
+ return ints{nil, a.int64[i:j]}
}
// New creates a new Index for data.
// Index creation time is O(N*log(N)) for N = len(data).
func New(data []byte) *Index {
- return &Index{data, qsufsort(data)}
+ ix := &Index{data: data}
+ if len(data) <= maxData32 {
+ ix.sa.int32 = qsufsort32(data)
+ } else {
+ ix.sa.int64 = qsufsort64(data)
+ }
+ return ix
}
// writeInt writes an int x to w using buf to buffer the write.
@@ -44,19 +91,20 @@ func writeInt(w io.Writer, buf []byte, x int) error {
}
// readInt reads an int x from r using buf to buffer the read and returns x.
-func readInt(r io.Reader, buf []byte) (int, error) {
+func readInt(r io.Reader, buf []byte) (int64, error) {
_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
x, _ := binary.Varint(buf)
- return int(x), err
+ return x, err
}
// writeSlice writes data[:n] to w and returns n.
// It uses buf to buffer the write.
-func writeSlice(w io.Writer, buf []byte, data []int) (n int, err error) {
+func writeSlice(w io.Writer, buf []byte, data ints) (n int, err error) {
// encode as many elements as fit into buf
p := binary.MaxVarintLen64
- for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ {
- p += binary.PutUvarint(buf[p:], uint64(data[n]))
+ m := data.len()
+ for ; n < m && p+binary.MaxVarintLen64 <= len(buf); n++ {
+ p += binary.PutUvarint(buf[p:], uint64(data.get(n)))
}
// update buffer size
@@ -67,15 +115,22 @@ func writeSlice(w io.Writer, buf []byte, data []int) (n int, err error) {
return
}
+var errTooBig = errors.New("suffixarray: data too large")
+
// readSlice reads data[:n] from r and returns n.
// It uses buf to buffer the read.
-func readSlice(r io.Reader, buf []byte, data []int) (n int, err error) {
+func readSlice(r io.Reader, buf []byte, data ints) (n int, err error) {
// read buffer size
- var size int
- size, err = readInt(r, buf)
+ var size64 int64
+ size64, err = readInt(r, buf)
if err != nil {
return
}
+ if int64(int(size64)) != size64 || int(size64) < 0 {
+ // We never write chunks this big anyway.
+ return 0, errTooBig
+ }
+ size := int(size64)
// read buffer w/o the size
if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
@@ -85,7 +140,7 @@ func readSlice(r io.Reader, buf []byte, data []int) (n int, err error) {
// decode as many elements as present in buf
for p := binary.MaxVarintLen64; p < size; n++ {
x, w := binary.Uvarint(buf[p:])
- data[n] = int(x)
+ data.set(n, int64(x))
p += w
}
@@ -100,21 +155,31 @@ func (x *Index) Read(r io.Reader) error {
buf := make([]byte, bufSize)
// read length
- n, err := readInt(r, buf)
+ n64, err := readInt(r, buf)
if err != nil {
return err
}
+ if int64(int(n64)) != n64 || int(n64) < 0 {
+ return errTooBig
+ }
+ n := int(n64)
// allocate space
- if 2*n < cap(x.data) || cap(x.data) < n {
+ if 2*n < cap(x.data) || cap(x.data) < n || x.sa.int32 != nil && n > maxData32 || x.sa.int64 != nil && n <= maxData32 {
// new data is significantly smaller or larger than
// existing buffers - allocate new ones
x.data = make([]byte, n)
- x.sa = make([]int, n)
+ x.sa.int32 = nil
+ x.sa.int64 = nil
+ if n <= maxData32 {
+ x.sa.int32 = make([]int32, n)
+ } else {
+ x.sa.int64 = make([]int64, n)
+ }
} else {
// re-use existing buffers
x.data = x.data[0:n]
- x.sa = x.sa[0:n]
+ x.sa = x.sa.slice(0, n)
}
// read data
@@ -123,12 +188,13 @@ func (x *Index) Read(r io.Reader) error {
}
// read index
- for sa := x.sa; len(sa) > 0; {
+ sa := x.sa
+ for sa.len() > 0 {
n, err := readSlice(r, buf, sa)
if err != nil {
return err
}
- sa = sa[n:]
+ sa = sa.slice(n, sa.len())
}
return nil
}
@@ -149,12 +215,13 @@ func (x *Index) Write(w io.Writer) error {
}
// write index
- for sa := x.sa; len(sa) > 0; {
+ sa := x.sa
+ for sa.len() > 0 {
n, err := writeSlice(w, buf, sa)
if err != nil {
return err
}
- sa = sa[n:]
+ sa = sa.slice(n, sa.len())
}
return nil
}
@@ -167,18 +234,18 @@ func (x *Index) Bytes() []byte {
}
func (x *Index) at(i int) []byte {
- return x.data[x.sa[i]:]
+ return x.data[x.sa.get(i):]
}
// lookupAll returns a slice into the matching region of the index.
// The runtime is O(log(N)*len(s)).
-func (x *Index) lookupAll(s []byte) []int {
+func (x *Index) lookupAll(s []byte) ints {
// find matching suffix index range [i:j]
// find the first index where s would be the prefix
- i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
+ i := sort.Search(x.sa.len(), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
// starting at i, find the first index at which s is not a prefix
- j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
- return x.sa[i:j]
+ j := i + sort.Search(x.sa.len()-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
+ return x.sa.slice(i, j)
}
// Lookup returns an unsorted list of at most n indices where the byte string s
@@ -190,13 +257,22 @@ func (x *Index) lookupAll(s []byte) []int {
func (x *Index) Lookup(s []byte, n int) (result []int) {
if len(s) > 0 && n != 0 {
matches := x.lookupAll(s)
- if n < 0 || len(matches) < n {
- n = len(matches)
+ count := matches.len()
+ if n < 0 || count < n {
+ n = count
}
- // 0 <= n <= len(matches)
+ // 0 <= n <= count
if n > 0 {
result = make([]int, n)
- copy(result, matches)
+ if matches.int32 != nil {
+ for i := range result {
+ result[i] = int(matches.int32[i])
+ }
+ } else {
+ for i := range result {
+ result[i] = int(matches.int64[i])
+ }
+ }
}
}
return
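Note that the ints wrapper stays internal: Lookup still returns a plain []int, capped at n when n >= 0. A small usage sketch, independent of this CL:

package main

import (
	"fmt"
	"index/suffixarray"
)

func main() {
	idx := suffixarray.New([]byte("abracadabra"))

	// All occurrences of "abra" (unsorted): indexes 0 and 7.
	fmt.Println(idx.Lookup([]byte("abra"), -1))

	// At most one occurrence when n == 1.
	fmt.Println(len(idx.Lookup([]byte("abra"), 1))) // 1

	// No match yields an empty result.
	fmt.Println(idx.Lookup([]byte("zzz"), -1)) // []
}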