/*
 *  Copyright © 2003..2010 : Henk van Kampen <henk@mediatronix.com>
 *
 *  This file is part of pBlazASM.
 *
 *  pBlazASM is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  pBlazASM is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with pBlazASM.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <ctype.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "pbTypes.h"
#include "pbErrors.h"

// lexer states
// each enumerator names one state of the state machine in lex()
typedef enum {
    lsBin,      // collecting binary digits, after '%' or a "0b" prefix
    lsChar,     // inside a '...' character literal
    lsComment,  // after ';', consuming everything up to the end of the line
    lsDec,      // collecting decimal digits
    lsCopy,     // token complete: store its text and advance to the next slot
    lsError,    // mark the current token tERROR and stop lexing
    lsHex,      // collecting hexadecimal digits, after '$' or a "0x" prefix
    lsHexBin,   // seen a leading '0': choose between hex, binary and decimal
    lsIdent,    // collecting identifier characters (alphanumerics and '_')
    lsIdle,     // between tokens: skip white space, dispatch on the first character
    lsInit,     // reset the token slot that will receive the next token
    lsOperator, // single character operator
    lsDoubleOp, // "<<" or ">>"
    lsPunct,    // punctuation ':' ',' '(' ')'
    lsIndex,    // after '.': directives, indexing, local labels
    lsString    // inside a "..." string literal
} LexState ;

// global token list
|
// global token list
|
static symbol_t tokens[ 256 ] ; // global token list
|
static symbol_t tokens[ 256 ] ; // global token list
|
static symbol_t * ptok = 0 ; // pointer to current token, index in 'tokens[]'
|
static symbol_t * ptok = 0 ; // pointer to current token, index in 'tokens[]'
|
|
|
symbol_t * tok_first( void ) {
|
symbol_t * tok_first( void ) {
|
ptok = tokens ;
|
ptok = tokens ;
|
return ptok ;
|
return ptok ;
|
}
|
}
|
|
|
symbol_t * tok_current( void ) {
|
symbol_t * tok_current( void ) {
|
return ptok ;
|
return ptok ;
|
}
|
}
|
|
|
symbol_t * tok_next( void ) {
|
symbol_t * tok_next( void ) {
|
if ( ptok < &tokens[ 256 ] )
|
if ( ptok < &tokens[ 256 ] )
|
return ptok++ ;
|
return ptok++ ;
|
else {
|
else {
|
ptok->type = tNONE ;
|
ptok->type = tNONE ;
|
return ptok ;
|
return ptok ;
|
}
|
}
|
}
|
}
|
|
|
// move the token cursor to 'back'
// 'back' is stored as is, without any check against the bounds of 'tokens'
void tok_back( symbol_t * back ) {
    ptok = back ;
}

void tok_free( void ) {
|
void tok_free( void ) {
|
for ( ptok = tokens ; ptok->text != NULL ; ptok++ ) {
|
for ( ptok = tokens ; ptok->text != NULL ; ptok++ ) {
|
free( ptok->text ) ;
|
free( ptok->text ) ;
|
|
|
ptok->type = tNONE ;
|
ptok->type = tNONE ;
|
ptok->subtype = stNONE ;
|
ptok->subtype = stNONE ;
|
ptok->text = NULL ;
|
ptok->text = NULL ;
|
ptok->value = 0 ;
|
ptok->value = 0 ;
|
}
|
}
|
}
|
}
|
|
|
// state machine based lexer
|
// state machine based lexer
|
// tokens are recorded in 'tokens', ended by a NONE token
|
// tokens are recorded in 'tokens', ended by a NONE token
|
bool lex( char * line, const bool mode ) {
|
bool lex( char * line, const bool mode ) {
|
char * start = NULL, *end = NULL, *s = line ;
|
char * start = NULL, *end = NULL, *s = line ;
|
char term[ 256 ], *pterm = NULL ;
|
char term[ 256 ], *pterm = NULL ;
|
LexState state = lsInit ;
|
LexState state = lsInit ;
|
|
|
// state machine
|
// state machine
|
for ( ptok = tokens ; ptok < &tokens[ 256 ] ; ) {
|
for ( ptok = tokens ; ptok < &tokens[ 256 ] ; ) {
|
switch ( state ) {
|
switch ( state ) {
|
case lsInit :
|
case lsInit :
|
ptok->type = tNONE ;
|
ptok->type = tNONE ;
|
ptok->subtype = stNONE ;
|
ptok->subtype = stNONE ;
|
ptok->value = 0 ;
|
ptok->value = 0 ;
|
ptok->text = NULL ;
|
ptok->text = NULL ;
|
|
|
pterm = term ;
|
pterm = term ;
|
*pterm = '\0' ;
|
*pterm = '\0' ;
|
state = lsIdle ;
|
state = lsIdle ;
|
break ;
|
break ;
|
|
|
case lsIdle :
|
case lsIdle :
|
// starting characters of tokens to be
|
// starting characters of tokens to be
|
if ( *s == '\0' || *s == '\r' || *s == '\n' ) {
|
if ( *s == '\0' || *s == '\r' || *s == '\n' ) {
|
// end of line
|
// end of line
|
return true ;
|
return true ;
|
} else if ( *s == ' ' || iscntrl( *s ) ) {
|
} else if ( *s == ' ' || iscntrl( *s ) ) {
|
// white space, 'space' and all control characters, except \0, \r and \n
|
// white space, 'space' and all control characters, except \0, \r and \n
|
s++ ;
|
s++ ;
|
} else if ( mode && ( isalnum( *s ) ) ) {
|
} else if ( mode && ( isalnum( *s ) || *s == '_' ) ) {
|
// KCPSM mode, all alphanum is accepted for idents, could be hex values
|
// KCPSM mode, all alphanum is accepted for idents, could be hex values
|
// ident
|
// ident
|
start = s++ ;
|
start = s++ ;
|
state = lsIdent ;
|
state = lsIdent ;
|
} else if ( !mode && ( isalpha( *s ) || *s == '_' ) ) {
|
} else if ( !mode && ( isalpha( *s ) || *s == '_' ) ) {
|
// ident
|
// ident
|
start = s++ ;
|
start = s++ ;
|
state = lsIdent ;
|
state = lsIdent ;
|
} else if ( *s == ';' ) {
|
} else if ( *s == ';' ) {
|
// comment
|
// comment
|
start = s++ ;
|
start = s++ ;
|
state = lsComment ;
|
state = lsComment ;
|
} else if ( *s == '0' ) {
|
} else if ( *s == '0' ) {
|
// maybe hex or bin
|
// maybe hex or bin
|
start = s++ ;
|
start = s++ ;
|
state = lsHexBin ;
|
state = lsHexBin ;
|
} else if ( isdigit( *s ) ) {
|
} else if ( isdigit( *s ) ) {
|
// decimal number
|
// decimal number
|
start = s++ ;
|
start = s++ ;
|
state = lsDec ;
|
state = lsDec ;
|
} else if ( *s == '$' ) {
|
} else if ( *s == '$' ) {
|
// hexadecimal number
|
// hexadecimal number
|
start = ++s ;
|
start = ++s ;
|
state = lsHex ;
|
state = lsHex ;
|
} else if ( *s == '%' ) {
|
} else if ( *s == '%' ) {
|
// binary number
|
// binary number
|
start = ++s ;
|
start = ++s ;
|
state = lsBin ;
|
state = lsBin ;
|
} else if ( *s == '.' ) {
|
} else if ( *s == '.' ) {
|
// directives, indexing, local labels, etc
|
// directives, indexing, local labels, etc
|
start = s++ ;
|
start = s++ ;
|
state = lsIndex ;
|
state = lsIndex ;
|
} else if ( *s == ':' || *s == ',' || *s == '(' || *s == ')' ) {
|
} else if ( *s == ':' || *s == ',' || *s == '(' || *s == ')' ) {
|
// punctuation ',', ':', '(', ')', '~'
|
// punctuation ',', ':', '(', ')', '~'
|
start = s++ ;
|
start = s++ ;
|
state = lsPunct ;
|
state = lsPunct ;
|
} else if ( *s == '*' || *s == '/' || *s == '#' || *s == '+' || *s == '-' ||
|
} else if ( *s == '*' || *s == '/' || *s == '#' || *s == '+' || *s == '-' ||
|
*s == '|' || *s == '&' || *s == '^' || *s == '~' ) {
|
*s == '|' || *s == '&' || *s == '^' || *s == '~' ) {
|
// operators
|
// operators
|
start = s++ ;
|
start = s++ ;
|
state = lsOperator ;
|
state = lsOperator ;
|
} else if ( *s == '<' || *s == '>' ) {
|
} else if ( *s == '<' || *s == '>' ) {
|
// double char operators
|
// double char operators
|
start = s++ ;
|
start = s++ ;
|
state = lsDoubleOp ;
|
state = lsDoubleOp ;
|
} else if ( *s == '\'' ) {
|
} else if ( *s == '\'' ) {
|
// 'c'
|
// 'c'
|
start = ++s ;
|
start = ++s ;
|
state = lsChar ;
|
state = lsChar ;
|
} else if ( *s == '"' ) {
|
} else if ( *s == '"' ) {
|
// "string"
|
// "string"
|
start = ++s ;
|
start = ++s ;
|
state = lsString ;
|
state = lsString ;
|
} else
|
} else
|
state = lsError ;
|
state = lsError ;
|
break ;
|
break ;
|
|
|
case lsComment :
|
case lsComment :
|
if ( *s != '\0' && *s != '\r' && *s != '\n' )
|
if ( *s != '\0' && *s != '\r' && *s != '\n' )
|
// anything till end of line
|
// anything till end of line
|
s++ ;
|
s++ ;
|
else {
|
else {
|
end = s ;
|
end = s ;
|
ptok->type = tNONE ;
|
ptok->type = tNONE ;
|
ptok->subtype = stCOMMENT ;
|
ptok->subtype = stCOMMENT ;
|
state = lsCopy ;
|
state = lsCopy ;
|
}
|
}
|
break ;
|
break ;
|
|
|
case lsChar :
|
case lsChar :
|
if ( *s == '\'' ) {
|
if ( *s == '\'' ) {
|
ptok->type = tCHAR ;
|
ptok->type = tCHAR ;
|
end = s++ ;
|
end = s++ ;
|
state = lsCopy ;
|
state = lsCopy ;
|
} else if ( *s == '\\' ) {
|
} else if ( *s == '\\' ) {
|
s += 1 ;
|
s += 1 ;
|
if ( *s != '\0' )
|
if ( *s != '\0' )
|
s += 1 ;
|
s += 1 ;
|
} else if ( isgraph( *s ) || *s == ' ' ) {
|
} else if ( isgraph( *s ) || *s == ' ' ) {
|
s++ ;
|
s++ ;
|
} else
|
} else
|
state = lsError ;
|
state = lsError ;
|
break ;
|
break ;
|
|
|
case lsString :
|
case lsString :
|
if ( *s == '"' ) {
|
if ( *s == '"' ) {
|
ptok->type = tSTRING ;
|
ptok->type = tSTRING ;
|
end = s++ ;
|
end = s++ ;
|
state = lsCopy ;
|
state = lsCopy ;
|
} else if ( *s == '\\' ) {
|
} else if ( *s == '\\' ) {
|
s += 1 ;
|
s += 1 ;
|
if ( *s != '\0' )
|
if ( *s != '\0' )
|
s += 1 ;
|
s += 1 ;
|
} else if ( isgraph( *s ) || *s == ' ' )
|
} else if ( isgraph( *s ) || *s == ' ' )
|
s++ ;
|
s++ ;
|
else
|
else
|
state = lsError ;
|
state = lsError ;
|
break ;
|
break ;
|
|
|
case lsIdent :
|
case lsIdent :
|
if ( isalnum( *s ) || *s == '_' )
|
if ( isalnum( *s ) || *s == '_' )
|
s++ ;
|
s++ ;
|
else {
|
else {
|
end = s ;
|
end = s ;
|
ptok->type = tIDENT ;
|
ptok->type = tIDENT ;
|
ptok->subtype = stNONE ;
|
ptok->subtype = stNONE ;
|
state = lsCopy ;
|
state = lsCopy ;
|
}
|
}
|
break ;
|
break ;
|
|
|
case lsHexBin :
|
case lsHexBin :
|
if ( *s == 'x' ) {
|
if ( *s == 'x' ) {
|
start = ++s ;
|
start = ++s ;
|
state = lsHex ;
|
state = lsHex ;
|
} else if ( *s == 'b' ) {
|
} else if ( *s == 'b' ) {
|
start = ++s ;
|
start = ++s ;
|
state = lsBin ;
|
state = lsBin ;
|
} else
|
} else
|
// missing the first '0' doesn't hurt here
|
// missing the first '0' doesn't hurt here
|
state = lsDec ;
|
state = lsDec ;
|
break ;
|
break ;
|
|
|
case lsHex :
|
case lsHex :
|
if ( isxdigit( *s ) )
|
if ( isxdigit( *s ) )
|
s++ ;
|
s++ ;
|
else {
|
else {
|
end = s ;
|
end = s ;
|
ptok->type = tHEX ;
|
ptok->type = tHEX ;
|
state = lsCopy ;
|
state = lsCopy ;
|
}
|
}
|
break ;
|
break ;
|
|
|
case lsBin :
|
case lsBin :
|
if ( *s == '0' || *s == '1' )
|
if ( *s == '0' || *s == '1' )
|
s++ ;
|
s++ ;
|
else {
|
else {
|
end = s ;
|
end = s ;
|
ptok->type = tBIN ;
|
ptok->type = tBIN ;
|
state = lsCopy ;
|
state = lsCopy ;
|
}
|
}
|
break ;
|
break ;
|
|
|
case lsDec :
|
case lsDec :
|
if ( isdigit( *s ) )
|
if ( isdigit( *s ) )
|
s++ ;
|
s++ ;
|
else {
|
else {
|
end = s ;
|
end = s ;
|
ptok->type = tDEC ;
|
ptok->type = tDEC ;
|
state = lsCopy ;
|
state = lsCopy ;
|
}
|
}
|
break ;
|
break ;
|
|
|
case lsOperator :
|
case lsOperator :
|
ptok->type = tOPERATOR ;
|
ptok->type = tOPERATOR ;
|
switch ( *start ) {
|
switch ( *start ) {
|
case '*' :
|
case '*' :
|
ptok->subtype = stMUL ;
|
ptok->subtype = stMUL ;
|
break ;
|
break ;
|
case '/' :
|
case '/' :
|
ptok->subtype = stDIV ;
|
ptok->subtype = stDIV ;
|
break ;
|
break ;
|
case '#' :
|
case '#' :
|
ptok->subtype = stMOD ;
|
ptok->subtype = stMOD ;
|
break ;
|
break ;
|
case '+' :
|
case '+' :
|
ptok->subtype = stADD ;
|
ptok->subtype = stADD ;
|
break ;
|
break ;
|
case '-' :
|
case '-' :
|
ptok->subtype = stSUB ;
|
ptok->subtype = stSUB ;
|
break ;
|
break ;
|
case '|' :
|
case '|' :
|
ptok->subtype = stIOR ;
|
ptok->subtype = stIOR ;
|
break ;
|
break ;
|
case '&' :
|
case '&' :
|
ptok->subtype = stAND ;
|
ptok->subtype = stAND ;
|
break ;
|
break ;
|
case '^' :
|
case '^' :
|
ptok->subtype = stXOR ;
|
ptok->subtype = stXOR ;
|
break ;
|
break ;
|
case '~' :
|
case '~' :
|
ptok->subtype = stTILDA ;
|
ptok->subtype = stTILDA ;
|
break ;
|
break ;
|
}
|
}
|
end = s ;
|
end = s ;
|
state = lsCopy ;
|
state = lsCopy ;
|
break ;
|
break ;
|
|
|
case lsDoubleOp :
|
case lsDoubleOp :
|
if ( *start == *s ) { // << or >>
|
if ( *start == *s ) { // << or >>
|
ptok->type = tOPERATOR ;
|
ptok->type = tOPERATOR ;
|
switch ( *start ) {
|
switch ( *start ) {
|
case '<' :
|
case '<' :
|
ptok->subtype = stSHL ;
|
ptok->subtype = stSHL ;
|
break ;
|
break ;
|
case '>' :
|
case '>' :
|
ptok->subtype = stSHR ;
|
ptok->subtype = stSHR ;
|
break ;
|
break ;
|
}
|
}
|
end = ++s ;
|
end = ++s ;
|
state = lsCopy ;
|
state = lsCopy ;
|
} else
|
} else
|
state = lsError ;
|
state = lsError ;
|
break ;
|
break ;
|
|
|
case lsPunct :
|
case lsPunct :
|
end = s ;
|
end = s ;
|
state = lsCopy ;
|
state = lsCopy ;
|
switch ( *start ) {
|
switch ( *start ) {
|
case ':' :
|
case ':' :
|
ptok->type = tCOLON ;
|
ptok->type = tCOLON ;
|
break ;
|
break ;
|
case '(' :
|
case '(' :
|
ptok->type = tLPAREN ;
|
ptok->type = tLPAREN ;
|
break ;
|
break ;
|
case ')' :
|
case ')' :
|
ptok->type = tRPAREN ;
|
ptok->type = tRPAREN ;
|
break ;
|
break ;
|
case ',' :
|
case ',' :
|
ptok->type = tCOMMA ;
|
ptok->type = tCOMMA ;
|
break ;
|
break ;
|
default :
|
default :
|
state = lsError ;
|
state = lsError ;
|
}
|
}
|
break ;
|
break ;
|
|
|
case lsIndex :
|
case lsIndex :
|
// any of .IX, .IX++, .--IX, .-IX+
|
// any of .IX, .IX++, .--IX, .-IX+
|
if ( isalnum( *s ) || *s == '-' || *s == '+' )
|
if ( isalnum( *s ) || *s == '-' || *s == '+' )
|
s++ ;
|
s++ ;
|
else {
|
else {
|
end = s ;
|
end = s ;
|
ptok->type = tIDENT ;
|
ptok->type = tIDENT ;
|
ptok->subtype = stDOT ;
|
ptok->subtype = stDOT ;
|
state = lsCopy ;
|
state = lsCopy ;
|
}
|
}
|
break ;
|
break ;
|
|
|
// final token collector
|
// final token collector
|
case lsCopy :
|
case lsCopy :
|
while ( start < end )
|
while ( start < end )
|
*pterm++ = *start++ ;
|
*pterm++ = *start++ ;
|
*pterm = '\0' ;
|
*pterm = '\0' ;
|
ptok->text = strdup( term ) ;
|
ptok->text = strdup( term ) ;
|
ptok++ ;
|
ptok++ ;
|
state = lsInit ;
|
state = lsInit ;
|
break ;
|
break ;
|
|
|
// any errors
|
// any errors
|
case lsError :
|
case lsError :
|
*pterm = '\0' ;
|
*pterm = '\0' ;
|
ptok->type = tERROR ;
|
ptok->type = tERROR ;
|
return false ;
|
return false ;
|
}
|
}
|
}
|
}
|
return false ;
|
return false ;
|
}
|
}
|
|
|