00001 // tokenizer.cpp 00002 // 00003 // Copyright (C) 2001 Chris Laurel <claurel@shatters.net> 00004 // 00005 // This program is free software; you can redistribute it and/or 00006 // modify it under the terms of the GNU General Public License 00007 // as published by the Free Software Foundation; either version 2 00008 // of the License, or (at your option) any later version. 00009 00010 #include <cctype> 00011 #include <cmath> 00012 #include <iomanip> 00013 #include <celutil/utf8.h> 00014 #include "tokenizer.h" 00015 00016 00017 static bool issep(char c) 00018 { 00019 return !isdigit(c) && !isalpha(c) && c != '.'; 00020 } 00021 00022 00023 Tokenizer::Tokenizer(istream* _in) : 00024 in(_in), 00025 tokenType(TokenBegin), 00026 haveValidNumber(false), 00027 haveValidName(false), 00028 haveValidString(false), 00029 pushedBack(false), 00030 lineNum(1) 00031 { 00032 } 00033 00034 00035 Tokenizer::TokenType Tokenizer::nextToken() 00036 { 00037 State state = StartState; 00038 00039 if (pushedBack) 00040 { 00041 pushedBack = false; 00042 return tokenType; 00043 } 00044 00045 textToken = ""; 00046 haveValidNumber = false; 00047 haveValidName = false; 00048 haveValidString = false; 00049 00050 if (tokenType == TokenBegin) 00051 { 00052 nextChar = readChar(); 00053 if (in->eof()) 00054 return TokenEnd; 00055 } 00056 else if (tokenType == TokenEnd) 00057 { 00058 return tokenType; 00059 } 00060 00061 double integerValue = 0; 00062 double fractionValue = 0; 00063 double sign = 1; 00064 double fracExp = 1; 00065 double exponentValue = 0; 00066 double exponentSign = 1; 00067 00068 TokenType newToken = TokenBegin; 00069 while (newToken == TokenBegin) 00070 { 00071 switch (state) 00072 { 00073 case StartState: 00074 if (isspace(nextChar)) 00075 { 00076 state = StartState; 00077 } 00078 else if (isdigit(nextChar)) 00079 { 00080 state = NumberState; 00081 integerValue = (int) nextChar - (int) '0'; 00082 } 00083 else if (nextChar == '-') 00084 { 00085 state = NumberState; 00086 sign 
= -1; 00087 integerValue = 0; 00088 } 00089 else if (isalpha(nextChar) || nextChar == '_') 00090 { 00091 state = NameState; 00092 textToken += (char) nextChar; 00093 } 00094 else if (nextChar == '#') 00095 { 00096 state = CommentState; 00097 } 00098 else if (nextChar == '"') 00099 { 00100 state = StringState; 00101 } 00102 else if (nextChar == '{') 00103 { 00104 newToken = TokenBeginGroup; 00105 nextChar = readChar(); 00106 } 00107 else if (nextChar == '}') 00108 { 00109 newToken = TokenEndGroup; 00110 nextChar = readChar(); 00111 } 00112 else if (nextChar == '[') 00113 { 00114 newToken = TokenBeginArray; 00115 nextChar = readChar(); 00116 } 00117 else if (nextChar == ']') 00118 { 00119 newToken = TokenEndArray; 00120 nextChar = readChar(); 00121 } 00122 else if (nextChar == '=') 00123 { 00124 newToken = TokenEquals; 00125 nextChar = readChar(); 00126 } 00127 else if (nextChar == '|') 00128 { 00129 newToken = TokenBar; 00130 nextChar = readChar(); 00131 } 00132 else if (nextChar == -1) 00133 { 00134 newToken = TokenEnd; 00135 } 00136 else 00137 { 00138 newToken = TokenError; 00139 syntaxError("Bad character in stream"); 00140 } 00141 break; 00142 00143 case NameState: 00144 if (isalpha(nextChar) || isdigit(nextChar) || nextChar == '_') 00145 { 00146 state = NameState; 00147 textToken += (char) nextChar; 00148 } 00149 else 00150 { 00151 newToken = TokenName; 00152 haveValidName = true; 00153 } 00154 break; 00155 00156 case CommentState: 00157 if (nextChar == '\n' || nextChar == '\r') 00158 state = StartState; 00159 break; 00160 00161 case StringState: 00162 if (nextChar == '"') 00163 { 00164 newToken = TokenString; 00165 haveValidString = true; 00166 nextChar = readChar(); 00167 } 00168 else if (nextChar == '\\') 00169 { 00170 state = StringEscapeState; 00171 } 00172 else 00173 { 00174 state = StringState; 00175 textToken += (char) nextChar; 00176 } 00177 break; 00178 00179 case StringEscapeState: 00180 if (nextChar == '\\') 00181 { 00182 textToken += '\\'; 00183 
state = StringState; 00184 } 00185 else if (nextChar == 'n') 00186 { 00187 textToken += '\n'; 00188 state = StringState; 00189 } 00190 else if (nextChar == '"') 00191 { 00192 textToken += '"'; 00193 state = StringState; 00194 } 00195 else if (nextChar == 'u') 00196 { 00197 unicodeValue = 0; 00198 unicodeEscapeDigits = 0; 00199 state = UnicodeEscapeState; 00200 } 00201 else 00202 { 00203 newToken = TokenError; 00204 syntaxError("Unknown escape code in string"); 00205 state = StringState; 00206 } 00207 break; 00208 00209 case NumberState: 00210 if (isdigit(nextChar)) 00211 { 00212 state = NumberState; 00213 integerValue = integerValue * 10 + (int) nextChar - (int) '0'; 00214 } 00215 else if (nextChar == '.') 00216 { 00217 state = FractionState; 00218 } 00219 else if (nextChar == 'e' || nextChar == 'E') 00220 { 00221 state = ExponentFirstState; 00222 } 00223 else if (issep(nextChar)) 00224 { 00225 newToken = TokenNumber; 00226 haveValidNumber = true; 00227 } 00228 else 00229 { 00230 newToken = TokenError; 00231 syntaxError("Bad character in number"); 00232 } 00233 break; 00234 00235 case FractionState: 00236 if (isdigit(nextChar)) 00237 { 00238 state = FractionState; 00239 fractionValue = fractionValue * 10 + nextChar - (int) '0'; 00240 fracExp *= 10; 00241 } 00242 else if (nextChar == 'e' || nextChar == 'E') 00243 { 00244 state = ExponentFirstState; 00245 } 00246 else if (issep(nextChar)) 00247 { 00248 newToken = TokenNumber; 00249 haveValidNumber = true; 00250 } else { 00251 newToken = TokenError; 00252 syntaxError("Bad character in number"); 00253 } 00254 break; 00255 00256 case ExponentFirstState: 00257 if (isdigit(nextChar)) 00258 { 00259 state = ExponentState; 00260 exponentValue = (int) nextChar - (int) '0'; 00261 } 00262 else if (nextChar == '-') 00263 { 00264 state = ExponentState; 00265 exponentSign = -1; 00266 } 00267 else if (nextChar == '+') 00268 { 00269 state = ExponentState; 00270 } 00271 else 00272 { 00273 state = ErrorState; 00274 syntaxError("Bad 
character in number"); 00275 } 00276 break; 00277 00278 case ExponentState: 00279 if (isdigit(nextChar)) 00280 { 00281 state = ExponentState; 00282 exponentValue = exponentValue * 10 + (int) nextChar - (int) '0'; 00283 } 00284 else if (issep(nextChar)) 00285 { 00286 newToken = TokenNumber; 00287 haveValidNumber = true; 00288 } 00289 else 00290 { 00291 state = ErrorState; 00292 syntaxError("Bad character in number"); 00293 } 00294 break; 00295 00296 case DotState: 00297 if (isdigit(nextChar)) 00298 { 00299 state = FractionState; 00300 fractionValue = fractionValue * 10 + (int) nextChar - (int) '0'; 00301 fracExp = 10; 00302 } 00303 else 00304 { 00305 state = ErrorState; 00306 syntaxError("'.' in stupid place"); 00307 } 00308 break; 00309 00310 case UnicodeEscapeState: 00311 if (isxdigit(nextChar)) 00312 { 00313 unsigned int digitValue; 00314 if (nextChar >= 'a' && nextChar <= 'f') 00315 digitValue = nextChar - 'a' + 10; 00316 else if (nextChar >= 'A' && nextChar <= 'F') 00317 digitValue = nextChar - 'A' + 10; 00318 else 00319 digitValue = nextChar - '0'; 00320 unicodeValue = (unicodeValue << 4) + digitValue; 00321 unicodeEscapeDigits++; 00322 if (unicodeEscapeDigits == 4) 00323 { 00324 char utf8Encoded[7]; 00325 UTF8Encode((wchar_t) unicodeValue, utf8Encoded); 00326 textToken += utf8Encoded; 00327 state = StringState; 00328 } 00329 } 00330 else 00331 { 00332 state = ErrorState; 00333 syntaxError("Bad Unicode escape in string"); 00334 } 00335 break; 00336 } 00337 00338 if (newToken == TokenBegin) 00339 { 00340 nextChar = readChar(); 00341 } 00342 } 00343 00344 tokenType = newToken; 00345 if (haveValidNumber) 00346 { 00347 numberValue = integerValue + fractionValue / fracExp; 00348 if (exponentValue != 0) 00349 numberValue *= pow(10.0, exponentValue * exponentSign); 00350 numberValue *= sign; 00351 } 00352 00353 return tokenType; 00354 } 00355 00356 00357 Tokenizer::TokenType Tokenizer::getTokenType() 00358 { 00359 return tokenType; 00360 } 00361 00362 00363 void 
Tokenizer::pushBack() 00364 { 00365 pushedBack = true; 00366 } 00367 00368 00369 double Tokenizer::getNumberValue() 00370 { 00371 return numberValue; 00372 } 00373 00374 00375 string Tokenizer::getNameValue() 00376 { 00377 return textToken; 00378 } 00379 00380 00381 string Tokenizer::getStringValue() 00382 { 00383 return textToken; 00384 } 00385 00386 00387 int Tokenizer::readChar() 00388 { 00389 int c = (int) in->get(); 00390 if (c == '\n') 00391 lineNum++; 00392 00393 return c; 00394 } 00395 00396 void Tokenizer::syntaxError(const char* message) 00397 { 00398 cerr << message << '\n'; 00399 } 00400 00401 00402 int Tokenizer::getLineNumber() const 00403 { 00404 return lineNum; 00405 } 00406 00407 #if 0 00408 // Tokenizer test 00409 int main(int argc, char *argv[]) 00410 { 00411 Tokenizer tokenizer(&cin); 00412 Tokenizer::TokenType tok = Tokenizer::TokenBegin; 00413 00414 while (tok != Tokenizer::TokenEnd) 00415 { 00416 tok = tokenizer.nextToken(); 00417 switch (tok) 00418 { 00419 case Tokenizer::TokenBegin: 00420 cout << "Begin"; 00421 break; 00422 case Tokenizer::TokenEnd: 00423 cout << "End"; 00424 break; 00425 case Tokenizer::TokenName: 00426 cout << "Name = " << tokenizer.getNameValue(); 00427 break; 00428 case Tokenizer::TokenNumber: 00429 cout << "Number = " << tokenizer.getNumberValue(); 00430 break; 00431 case Tokenizer::TokenString: 00432 cout << "String = " << '"' << tokenizer.getStringValue() << '"'; 00433 break; 00434 case Tokenizer::TokenBeginGroup: 00435 cout << '{'; 00436 break; 00437 case Tokenizer::TokenEndGroup: 00438 cout << '}'; 00439 break; 00440 case Tokenizer::TokenEquals: 00441 cout << '='; 00442 break; 00443 default: 00444 cout << "Other"; 00445 break; 00446 } 00447 00448 cout << '\n'; 00449 } 00450 00451 return 0; 00452 } 00453 #endif
// 1.4.1