| 1 |
747 |
jeremybenn |
// Copyright 2009 The Go Authors. All rights reserved.
|
| 2 |
|
|
// Use of this source code is governed by a BSD-style
|
| 3 |
|
|
// license that can be found in the LICENSE file.
|
| 4 |
|
|
|
| 5 |
|
|
// Package ebnf is a library for EBNF grammars. The input is text ([]byte)
|
| 6 |
|
|
// satisfying the following grammar (represented itself in EBNF):
|
| 7 |
|
|
//
|
| 8 |
|
|
// Production = name "=" [ Expression ] "." .
|
| 9 |
|
|
// Expression = Alternative { "|" Alternative } .
|
| 10 |
|
|
// Alternative = Term { Term } .
|
| 11 |
|
|
// Term = name | token [ "…" token ] | Group | Option | Repetition .
|
| 12 |
|
|
// Group = "(" Expression ")" .
|
| 13 |
|
|
// Option = "[" Expression "]" .
|
| 14 |
|
|
// Repetition = "{" Expression "}" .
|
| 15 |
|
|
//
|
| 16 |
|
|
// A name is a Go identifier, a token is a Go string, and comments
|
| 17 |
|
|
// and white space follow the same rules as for the Go language.
|
| 18 |
|
|
// Production names starting with an uppercase Unicode letter denote
|
| 19 |
|
|
// non-terminal productions (i.e., productions which allow white-space
|
| 20 |
|
|
// and comments between tokens); all other production names denote
|
| 21 |
|
|
// lexical productions.
|
| 22 |
|
|
//
|
| 23 |
|
|
package ebnf
|
| 24 |
|
|
|
| 25 |
|
|
import (
|
| 26 |
|
|
"errors"
|
| 27 |
|
|
"fmt"
|
| 28 |
|
|
"text/scanner"
|
| 29 |
|
|
"unicode"
|
| 30 |
|
|
"unicode/utf8"
|
| 31 |
|
|
)
|
| 32 |
|
|
|
| 33 |
|
|
// ----------------------------------------------------------------------------
|
| 34 |
|
|
// Error handling
|
| 35 |
|
|
|
| 36 |
|
|
type errorList []error
|
| 37 |
|
|
|
| 38 |
|
|
func (list errorList) Err() error {
|
| 39 |
|
|
if len(list) == 0 {
|
| 40 |
|
|
return nil
|
| 41 |
|
|
}
|
| 42 |
|
|
return list
|
| 43 |
|
|
}
|
| 44 |
|
|
|
| 45 |
|
|
func (list errorList) Error() string {
|
| 46 |
|
|
switch len(list) {
|
| 47 |
|
|
case 0:
|
| 48 |
|
|
return "no errors"
|
| 49 |
|
|
case 1:
|
| 50 |
|
|
return list[0].Error()
|
| 51 |
|
|
}
|
| 52 |
|
|
return fmt.Sprintf("%s (and %d more errors)", list[0], len(list)-1)
|
| 53 |
|
|
}
|
| 54 |
|
|
|
| 55 |
|
|
func newError(pos scanner.Position, msg string) error {
|
| 56 |
|
|
return errors.New(fmt.Sprintf("%s: %s", pos, msg))
|
| 57 |
|
|
}
|
| 58 |
|
|
|
| 59 |
|
|
// ----------------------------------------------------------------------------
|
| 60 |
|
|
// Internal representation
|
| 61 |
|
|
|
| 62 |
|
|
type (
|
| 63 |
|
|
// An Expression node represents a production expression.
|
| 64 |
|
|
Expression interface {
|
| 65 |
|
|
// Pos is the position of the first character of the syntactic construct
|
| 66 |
|
|
Pos() scanner.Position
|
| 67 |
|
|
}
|
| 68 |
|
|
|
| 69 |
|
|
// An Alternative node represents a non-empty list of alternative expressions.
|
| 70 |
|
|
Alternative []Expression // x | y | z
|
| 71 |
|
|
|
| 72 |
|
|
// A Sequence node represents a non-empty list of sequential expressions.
|
| 73 |
|
|
Sequence []Expression // x y z
|
| 74 |
|
|
|
| 75 |
|
|
// A Name node represents a production name.
|
| 76 |
|
|
Name struct {
|
| 77 |
|
|
StringPos scanner.Position
|
| 78 |
|
|
String string
|
| 79 |
|
|
}
|
| 80 |
|
|
|
| 81 |
|
|
// A Token node represents a literal.
|
| 82 |
|
|
Token struct {
|
| 83 |
|
|
StringPos scanner.Position
|
| 84 |
|
|
String string
|
| 85 |
|
|
}
|
| 86 |
|
|
|
| 87 |
|
|
// A List node represents a range of characters.
|
| 88 |
|
|
Range struct {
|
| 89 |
|
|
Begin, End *Token // begin ... end
|
| 90 |
|
|
}
|
| 91 |
|
|
|
| 92 |
|
|
// A Group node represents a grouped expression.
|
| 93 |
|
|
Group struct {
|
| 94 |
|
|
Lparen scanner.Position
|
| 95 |
|
|
Body Expression // (body)
|
| 96 |
|
|
}
|
| 97 |
|
|
|
| 98 |
|
|
// An Option node represents an optional expression.
|
| 99 |
|
|
Option struct {
|
| 100 |
|
|
Lbrack scanner.Position
|
| 101 |
|
|
Body Expression // [body]
|
| 102 |
|
|
}
|
| 103 |
|
|
|
| 104 |
|
|
// A Repetition node represents a repeated expression.
|
| 105 |
|
|
Repetition struct {
|
| 106 |
|
|
Lbrace scanner.Position
|
| 107 |
|
|
Body Expression // {body}
|
| 108 |
|
|
}
|
| 109 |
|
|
|
| 110 |
|
|
// A Production node represents an EBNF production.
|
| 111 |
|
|
Production struct {
|
| 112 |
|
|
Name *Name
|
| 113 |
|
|
Expr Expression
|
| 114 |
|
|
}
|
| 115 |
|
|
|
| 116 |
|
|
// A Bad node stands for pieces of source code that lead to a parse error.
|
| 117 |
|
|
Bad struct {
|
| 118 |
|
|
TokPos scanner.Position
|
| 119 |
|
|
Error string // parser error message
|
| 120 |
|
|
}
|
| 121 |
|
|
|
| 122 |
|
|
// A Grammar is a set of EBNF productions. The map
|
| 123 |
|
|
// is indexed by production name.
|
| 124 |
|
|
//
|
| 125 |
|
|
Grammar map[string]*Production
|
| 126 |
|
|
)
|
| 127 |
|
|
|
| 128 |
|
|
func (x Alternative) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Alternative
|
| 129 |
|
|
func (x Sequence) Pos() scanner.Position { return x[0].Pos() } // the parser always generates non-empty Sequences
|
| 130 |
|
|
func (x *Name) Pos() scanner.Position { return x.StringPos }
|
| 131 |
|
|
func (x *Token) Pos() scanner.Position { return x.StringPos }
|
| 132 |
|
|
func (x *Range) Pos() scanner.Position { return x.Begin.Pos() }
|
| 133 |
|
|
func (x *Group) Pos() scanner.Position { return x.Lparen }
|
| 134 |
|
|
func (x *Option) Pos() scanner.Position { return x.Lbrack }
|
| 135 |
|
|
func (x *Repetition) Pos() scanner.Position { return x.Lbrace }
|
| 136 |
|
|
func (x *Production) Pos() scanner.Position { return x.Name.Pos() }
|
| 137 |
|
|
func (x *Bad) Pos() scanner.Position { return x.TokPos }
|
| 138 |
|
|
|
| 139 |
|
|
// ----------------------------------------------------------------------------
|
| 140 |
|
|
// Grammar verification
|
| 141 |
|
|
|
| 142 |
|
|
func isLexical(name string) bool {
|
| 143 |
|
|
ch, _ := utf8.DecodeRuneInString(name)
|
| 144 |
|
|
return !unicode.IsUpper(ch)
|
| 145 |
|
|
}
|
| 146 |
|
|
|
| 147 |
|
|
type verifier struct {
|
| 148 |
|
|
errors errorList
|
| 149 |
|
|
worklist []*Production
|
| 150 |
|
|
reached Grammar // set of productions reached from (and including) the root production
|
| 151 |
|
|
grammar Grammar
|
| 152 |
|
|
}
|
| 153 |
|
|
|
| 154 |
|
|
func (v *verifier) error(pos scanner.Position, msg string) {
|
| 155 |
|
|
v.errors = append(v.errors, newError(pos, msg))
|
| 156 |
|
|
}
|
| 157 |
|
|
|
| 158 |
|
|
func (v *verifier) push(prod *Production) {
|
| 159 |
|
|
name := prod.Name.String
|
| 160 |
|
|
if _, found := v.reached[name]; !found {
|
| 161 |
|
|
v.worklist = append(v.worklist, prod)
|
| 162 |
|
|
v.reached[name] = prod
|
| 163 |
|
|
}
|
| 164 |
|
|
}
|
| 165 |
|
|
|
| 166 |
|
|
func (v *verifier) verifyChar(x *Token) rune {
|
| 167 |
|
|
s := x.String
|
| 168 |
|
|
if utf8.RuneCountInString(s) != 1 {
|
| 169 |
|
|
v.error(x.Pos(), "single char expected, found "+s)
|
| 170 |
|
|
return 0
|
| 171 |
|
|
}
|
| 172 |
|
|
ch, _ := utf8.DecodeRuneInString(s)
|
| 173 |
|
|
return ch
|
| 174 |
|
|
}
|
| 175 |
|
|
|
| 176 |
|
|
func (v *verifier) verifyExpr(expr Expression, lexical bool) {
|
| 177 |
|
|
switch x := expr.(type) {
|
| 178 |
|
|
case nil:
|
| 179 |
|
|
// empty expression
|
| 180 |
|
|
case Alternative:
|
| 181 |
|
|
for _, e := range x {
|
| 182 |
|
|
v.verifyExpr(e, lexical)
|
| 183 |
|
|
}
|
| 184 |
|
|
case Sequence:
|
| 185 |
|
|
for _, e := range x {
|
| 186 |
|
|
v.verifyExpr(e, lexical)
|
| 187 |
|
|
}
|
| 188 |
|
|
case *Name:
|
| 189 |
|
|
// a production with this name must exist;
|
| 190 |
|
|
// add it to the worklist if not yet processed
|
| 191 |
|
|
if prod, found := v.grammar[x.String]; found {
|
| 192 |
|
|
v.push(prod)
|
| 193 |
|
|
} else {
|
| 194 |
|
|
v.error(x.Pos(), "missing production "+x.String)
|
| 195 |
|
|
}
|
| 196 |
|
|
// within a lexical production references
|
| 197 |
|
|
// to non-lexical productions are invalid
|
| 198 |
|
|
if lexical && !isLexical(x.String) {
|
| 199 |
|
|
v.error(x.Pos(), "reference to non-lexical production "+x.String)
|
| 200 |
|
|
}
|
| 201 |
|
|
case *Token:
|
| 202 |
|
|
// nothing to do for now
|
| 203 |
|
|
case *Range:
|
| 204 |
|
|
i := v.verifyChar(x.Begin)
|
| 205 |
|
|
j := v.verifyChar(x.End)
|
| 206 |
|
|
if i >= j {
|
| 207 |
|
|
v.error(x.Pos(), "decreasing character range")
|
| 208 |
|
|
}
|
| 209 |
|
|
case *Group:
|
| 210 |
|
|
v.verifyExpr(x.Body, lexical)
|
| 211 |
|
|
case *Option:
|
| 212 |
|
|
v.verifyExpr(x.Body, lexical)
|
| 213 |
|
|
case *Repetition:
|
| 214 |
|
|
v.verifyExpr(x.Body, lexical)
|
| 215 |
|
|
case *Bad:
|
| 216 |
|
|
v.error(x.Pos(), x.Error)
|
| 217 |
|
|
default:
|
| 218 |
|
|
panic(fmt.Sprintf("internal error: unexpected type %T", expr))
|
| 219 |
|
|
}
|
| 220 |
|
|
}
|
| 221 |
|
|
|
| 222 |
|
|
func (v *verifier) verify(grammar Grammar, start string) {
|
| 223 |
|
|
// find root production
|
| 224 |
|
|
root, found := grammar[start]
|
| 225 |
|
|
if !found {
|
| 226 |
|
|
var noPos scanner.Position
|
| 227 |
|
|
v.error(noPos, "no start production "+start)
|
| 228 |
|
|
return
|
| 229 |
|
|
}
|
| 230 |
|
|
|
| 231 |
|
|
// initialize verifier
|
| 232 |
|
|
v.worklist = v.worklist[0:0]
|
| 233 |
|
|
v.reached = make(Grammar)
|
| 234 |
|
|
v.grammar = grammar
|
| 235 |
|
|
|
| 236 |
|
|
// work through the worklist
|
| 237 |
|
|
v.push(root)
|
| 238 |
|
|
for {
|
| 239 |
|
|
n := len(v.worklist) - 1
|
| 240 |
|
|
if n < 0 {
|
| 241 |
|
|
break
|
| 242 |
|
|
}
|
| 243 |
|
|
prod := v.worklist[n]
|
| 244 |
|
|
v.worklist = v.worklist[0:n]
|
| 245 |
|
|
v.verifyExpr(prod.Expr, isLexical(prod.Name.String))
|
| 246 |
|
|
}
|
| 247 |
|
|
|
| 248 |
|
|
// check if all productions were reached
|
| 249 |
|
|
if len(v.reached) < len(v.grammar) {
|
| 250 |
|
|
for name, prod := range v.grammar {
|
| 251 |
|
|
if _, found := v.reached[name]; !found {
|
| 252 |
|
|
v.error(prod.Pos(), name+" is unreachable")
|
| 253 |
|
|
}
|
| 254 |
|
|
}
|
| 255 |
|
|
}
|
| 256 |
|
|
}
|
| 257 |
|
|
|
| 258 |
|
|
// Verify checks that:
|
| 259 |
|
|
// - all productions used are defined
|
| 260 |
|
|
// - all productions defined are used when beginning at start
|
| 261 |
|
|
// - lexical productions refer only to other lexical productions
|
| 262 |
|
|
//
|
| 263 |
|
|
// Position information is interpreted relative to the file set fset.
|
| 264 |
|
|
//
|
| 265 |
|
|
func Verify(grammar Grammar, start string) error {
|
| 266 |
|
|
var v verifier
|
| 267 |
|
|
v.verify(grammar, start)
|
| 268 |
|
|
return v.errors.Err()
|
| 269 |
|
|
}
|