OpenCores
URL https://opencores.org/ocsvn/openrisc/openrisc/trunk

Subversion Repositories openrisc

[/] [openrisc/] [trunk/] [gnu-dev/] [or1k-gcc/] [libgo/] [go/] [index/] [suffixarray/] [qsufsort.go] - Blame information for rev 801

Go to most recent revision | Details | Compare with Previous | View Log

Line No. Rev Author Line
1 747 jeremybenn
// Copyright 2011 The Go Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
 
5
// This algorithm is based on "Faster Suffix Sorting"
6
//   by N. Jesper Larsson and Kunihiko Sadakane
7
// paper: http://www.larsson.dogma.net/ssrev-tr.pdf
8
// code:  http://www.larsson.dogma.net/qsufsort.c
9
 
10
// This algorithm computes the suffix array sa by computing its inverse.
11
// Consecutive groups of suffixes in sa are labeled as sorted groups or
12
// unsorted groups. For a given pass of the sorter, all suffixes are ordered
13
// up to their first h characters, and sa is h-ordered. Suffixes in their
14
// final positions and unambiguouly sorted in h-order are in a sorted group.
15
// Consecutive groups of suffixes with identical first h characters are an
16
// unsorted group. In each pass of the algorithm, unsorted groups are sorted
17
// according to the group number of their following suffix.
18
 
19
// In the implementation, if sa[i] is negative, it indicates that i is
20
// the first element of a sorted group of length -sa[i], and can be skipped.
21
// An unsorted group sa[i:k] is given the group number of the index of its
22
// last element, k-1. The group numbers are stored in the inverse slice (inv),
23
// and when all groups are sorted, this slice is the inverse suffix array.
24
 
25
package suffixarray
26
 
27
import "sort"
28
 
29
func qsufsort(data []byte) []int {
30
        // initial sorting by first byte of suffix
31
        sa := sortedByFirstByte(data)
32
        if len(sa) < 2 {
33
                return sa
34
        }
35
        // initialize the group lookup table
36
        // this becomes the inverse of the suffix array when all groups are sorted
37
        inv := initGroups(sa, data)
38
 
39
        // the index starts 1-ordered
40
        sufSortable := &suffixSortable{sa: sa, inv: inv, h: 1}
41
 
42
        for sa[0] > -len(sa) { // until all suffixes are one big sorted group
43
                // The suffixes are h-ordered, make them 2*h-ordered
44
                pi := 0 // pi is first position of first group
45
                sl := 0 // sl is negated length of sorted groups
46
                for pi < len(sa) {
47
                        if s := sa[pi]; s < 0 { // if pi starts sorted group
48
                                pi -= s // skip over sorted group
49
                                sl += s // add negated length to sl
50
                        } else { // if pi starts unsorted group
51
                                if sl != 0 {
52
                                        sa[pi+sl] = sl // combine sorted groups before pi
53
                                        sl = 0
54
                                }
55
                                pk := inv[s] + 1 // pk-1 is last position of unsorted group
56
                                sufSortable.sa = sa[pi:pk]
57
                                sort.Sort(sufSortable)
58
                                sufSortable.updateGroups(pi)
59
                                pi = pk // next group
60
                        }
61
                }
62
                if sl != 0 { // if the array ends with a sorted group
63
                        sa[pi+sl] = sl // combine sorted groups at end of sa
64
                }
65
 
66
                sufSortable.h *= 2 // double sorted depth
67
        }
68
 
69
        for i := range sa { // reconstruct suffix array from inverse
70
                sa[inv[i]] = i
71
        }
72
        return sa
73
}
74
 
75
func sortedByFirstByte(data []byte) []int {
76
        // total byte counts
77
        var count [256]int
78
        for _, b := range data {
79
                count[b]++
80
        }
81
        // make count[b] equal index of first occurence of b in sorted array
82
        sum := 0
83
        for b := range count {
84
                count[b], sum = sum, count[b]+sum
85
        }
86
        // iterate through bytes, placing index into the correct spot in sa
87
        sa := make([]int, len(data))
88
        for i, b := range data {
89
                sa[count[b]] = i
90
                count[b]++
91
        }
92
        return sa
93
}
94
 
95
func initGroups(sa []int, data []byte) []int {
96
        // label contiguous same-letter groups with the same group number
97
        inv := make([]int, len(data))
98
        prevGroup := len(sa) - 1
99
        groupByte := data[sa[prevGroup]]
100
        for i := len(sa) - 1; i >= 0; i-- {
101
                if b := data[sa[i]]; b < groupByte {
102
                        if prevGroup == i+1 {
103
                                sa[i+1] = -1
104
                        }
105
                        groupByte = b
106
                        prevGroup = i
107
                }
108
                inv[sa[i]] = prevGroup
109
                if prevGroup == 0 {
110
                        sa[0] = -1
111
                }
112
        }
113
        // Separate out the final suffix to the start of its group.
114
        // This is necessary to ensure the suffix "a" is before "aba"
115
        // when using a potentially unstable sort.
116
        lastByte := data[len(data)-1]
117
        s := -1
118
        for i := range sa {
119
                if sa[i] >= 0 {
120
                        if data[sa[i]] == lastByte && s == -1 {
121
                                s = i
122
                        }
123
                        if sa[i] == len(sa)-1 {
124
                                sa[i], sa[s] = sa[s], sa[i]
125
                                inv[sa[s]] = s
126
                                sa[s] = -1 // mark it as an isolated sorted group
127
                                break
128
                        }
129
                }
130
        }
131
        return inv
132
}
133
 
134
type suffixSortable struct {
135
        sa  []int
136
        inv []int
137
        h   int
138
        buf []int // common scratch space
139
}
140
 
141
func (x *suffixSortable) Len() int           { return len(x.sa) }
142
func (x *suffixSortable) Less(i, j int) bool { return x.inv[x.sa[i]+x.h] < x.inv[x.sa[j]+x.h] }
143
func (x *suffixSortable) Swap(i, j int)      { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
144
 
145
func (x *suffixSortable) updateGroups(offset int) {
146
        bounds := x.buf[0:0]
147
        group := x.inv[x.sa[0]+x.h]
148
        for i := 1; i < len(x.sa); i++ {
149
                if g := x.inv[x.sa[i]+x.h]; g > group {
150
                        bounds = append(bounds, i)
151
                        group = g
152
                }
153
        }
154
        bounds = append(bounds, len(x.sa))
155
        x.buf = bounds
156
 
157
        // update the group numberings after all new groups are determined
158
        prev := 0
159
        for _, b := range bounds {
160
                for i := prev; i < b; i++ {
161
                        x.inv[x.sa[i]] = offset + b - 1
162
                }
163
                if b-prev == 1 {
164
                        x.sa[prev] = -1
165
                }
166
                prev = b
167
        }
168
}

powered by: WebSVN 2.1.0

© copyright 1999-2024 OpenCores.org, equivalent to Oliscience, all rights reserved. OpenCores®, registered trademark.