#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

__author__ = 'nnorwitz@google.com (Neal Norwitz)'


try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('01234567890eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))
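# e.g. u8"text", LR"(raw)", and U"wide".  When one of these prefixes is
# scanned as a name below and a double quote follows, the whole literal
# is emitted as a single CONSTANT token.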


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source.
    end contains the index of the last char of the token in the source.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__
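
# Example (illustrative): Token(NAME, 'foo', 0, 3) represents the
# identifier 'foo' spanning source[0:3]; with utils.DEBUG unset it
# prints as Token('foo').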


def _GetString(source, start, i):
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When the number of trailing backslashes is even, they escape
        # each other.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1
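
# Worked example for the parity check above: in the fragment "a\"b" the
# candidate closing quote is preceded by one backslash (odd), so the scan
# continues to the real closing quote; in "a\\" it is preceded by two
# (even), so the backslashes escape each other and the string ends there.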


def _GetChar(source, start, i):
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1
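
# Worked example: for the char literal '\'' the first candidate quote is
# escaped, so the scan advances to the real closing quote; for '\\' the
# one-char look-back breaks out early so the escaped backslash is not
# treated as escaping the closing quote.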


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find a string token.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if end of the line and the line ends with \.
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyway, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)
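
# Illustrative use of GetTokens:
#
#   for token in GetTokens('int x = 0x1Ful;'):
#       print('%s %s' % (token.token_type, token.name))
#
# prints NAME int, NAME x, SYNTAX =, CONSTANT 0x1Ful, SYNTAX ; --
# one token per line.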


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)
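
# Usage: python tokenize.py FILE... prints one 'TOKEN_TYPE: name' line
# per token for every file that utils.ReadFile can read.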
