Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members

tokenizer.cpp

Go to the documentation of this file.
00001 // tokenizer.cpp
00002 //
00003 // Copyright (C) 2001 Chris Laurel <claurel@shatters.net>
00004 //
00005 // This program is free software; you can redistribute it and/or
00006 // modify it under the terms of the GNU General Public License
00007 // as published by the Free Software Foundation; either version 2
00008 // of the License, or (at your option) any later version.
00009 
00010 #include <cctype>
00011 #include <cmath>
00012 #include <iomanip>
00013 #include <celutil/utf8.h>
00014 #include "tokenizer.h"
00015 
00016 
// Return true when c may legally follow a numeric literal, i.e. when it is
// neither a decimal digit, nor a letter, nor a '.'.
//
// The <cctype> classifiers have undefined behaviour for negative arguments
// other than EOF, so the character is converted to unsigned char before it
// is classified; bytes >= 0x80 would otherwise be negative wherever plain
// char is signed.
static bool issep(char c)
{
    const unsigned char uc = static_cast<unsigned char>(c);
    return !isdigit(uc) && !isalpha(uc) && c != '.';
}
00021 
00022 
00023 Tokenizer::Tokenizer(istream* _in) :
00024     in(_in),
00025     tokenType(TokenBegin),
00026     haveValidNumber(false),
00027     haveValidName(false),
00028     haveValidString(false),
00029     pushedBack(false),
00030     lineNum(1)
00031 {
00032 }
00033 
00034 
00035 Tokenizer::TokenType Tokenizer::nextToken()
00036 {
00037     State state = StartState;
00038 
00039     if (pushedBack)
00040     {
00041         pushedBack = false;
00042         return tokenType;
00043     }
00044 
00045     textToken = "";
00046     haveValidNumber = false;
00047     haveValidName = false;
00048     haveValidString = false;
00049 
00050     if (tokenType == TokenBegin)
00051     {
00052         nextChar = readChar();
00053         if (in->eof())
00054             return TokenEnd;
00055     }
00056     else if (tokenType == TokenEnd)
00057     {
00058         return tokenType;
00059     }
00060 
00061     double integerValue = 0;
00062     double fractionValue = 0;
00063     double sign = 1;
00064     double fracExp = 1;
00065     double exponentValue = 0;
00066     double exponentSign = 1;
00067 
00068     TokenType newToken = TokenBegin;
00069     while (newToken == TokenBegin)
00070     {
00071         switch (state)
00072         {
00073         case StartState:
00074             if (isspace(nextChar))
00075             {
00076                 state = StartState;
00077             }
00078             else if (isdigit(nextChar))
00079             {
00080                 state = NumberState;
00081                 integerValue = (int) nextChar - (int) '0';
00082             }
00083             else if (nextChar == '-')
00084             {
00085                 state = NumberState;
00086                 sign = -1;
00087                 integerValue = 0;
00088             }
00089             else if (isalpha(nextChar) || nextChar == '_')
00090             {
00091                 state = NameState;
00092                 textToken += (char) nextChar;
00093             }
00094             else if (nextChar == '#')
00095             {
00096                 state = CommentState;
00097             }
00098             else if (nextChar == '"')
00099             {
00100                 state = StringState;
00101             }
00102             else if (nextChar == '{')
00103             {
00104                 newToken = TokenBeginGroup;
00105                 nextChar = readChar();
00106             }
00107             else if (nextChar == '}')
00108             {
00109                 newToken = TokenEndGroup;
00110                 nextChar = readChar();
00111             }
00112             else if (nextChar == '[')
00113             {
00114                 newToken = TokenBeginArray;
00115                 nextChar = readChar();
00116             }
00117             else if (nextChar == ']')
00118             {
00119                 newToken = TokenEndArray;
00120                 nextChar = readChar();
00121             }
00122             else if (nextChar == '=')
00123             {
00124                 newToken = TokenEquals;
00125                 nextChar = readChar();
00126             }
00127             else if (nextChar == '|')
00128             {
00129                 newToken = TokenBar;
00130                 nextChar = readChar();
00131             }
00132             else if (nextChar == -1)
00133             {
00134                 newToken = TokenEnd;
00135             }
00136             else
00137             {
00138                 newToken = TokenError;
00139                 syntaxError("Bad character in stream");
00140             }
00141             break;
00142 
00143         case NameState:
00144             if (isalpha(nextChar) || isdigit(nextChar) || nextChar == '_')
00145             {
00146                 state = NameState;
00147                 textToken += (char) nextChar;
00148             }
00149             else
00150             {
00151                 newToken = TokenName;
00152                 haveValidName = true;
00153             }
00154             break;
00155 
00156         case CommentState:
00157             if (nextChar == '\n' || nextChar == '\r')
00158                 state = StartState;
00159             break;
00160 
00161         case StringState:
00162             if (nextChar == '"')
00163             {
00164                 newToken = TokenString;
00165                 haveValidString = true;
00166                 nextChar = readChar();
00167             }
00168             else if (nextChar == '\\')
00169             {
00170                 state = StringEscapeState;
00171             }
00172             else
00173             {
00174                 state = StringState;
00175                 textToken += (char) nextChar;
00176             }
00177             break;
00178 
00179         case StringEscapeState:
00180             if (nextChar == '\\')
00181             {
00182                 textToken += '\\';
00183                 state = StringState;
00184             }
00185             else if (nextChar == 'n')
00186             {
00187                 textToken += '\n';
00188                 state = StringState;
00189             }
00190             else if (nextChar == '"')
00191             {
00192                 textToken += '"';
00193                 state = StringState;
00194             }
00195             else if (nextChar == 'u')
00196             {
00197                 unicodeValue = 0;
00198                 unicodeEscapeDigits = 0;
00199                 state = UnicodeEscapeState;
00200             }
00201             else
00202             {
00203                 newToken = TokenError;
00204                 syntaxError("Unknown escape code in string");
00205                 state = StringState;
00206             }
00207             break;
00208 
00209         case NumberState:
00210             if (isdigit(nextChar))
00211             {
00212                 state = NumberState;
00213                 integerValue = integerValue * 10 + (int) nextChar - (int) '0';
00214             }
00215             else if (nextChar == '.')
00216             {
00217                 state = FractionState;
00218             }
00219             else if (nextChar == 'e' || nextChar == 'E')
00220             {
00221                 state = ExponentFirstState;
00222             }
00223             else if (issep(nextChar))
00224             {
00225                 newToken = TokenNumber;
00226                 haveValidNumber = true;
00227             }
00228             else
00229             {
00230                 newToken = TokenError;
00231                 syntaxError("Bad character in number");
00232             }
00233             break;
00234 
00235         case FractionState:
00236             if (isdigit(nextChar))
00237             {
00238                 state = FractionState;
00239                 fractionValue = fractionValue * 10 + nextChar - (int) '0';
00240                 fracExp *= 10;
00241             } 
00242             else if (nextChar == 'e' || nextChar == 'E')
00243             {
00244                 state = ExponentFirstState;
00245             }
00246             else if (issep(nextChar))
00247             {
00248                 newToken = TokenNumber;
00249                 haveValidNumber = true;
00250             } else {
00251                 newToken = TokenError;
00252                 syntaxError("Bad character in number");
00253             }
00254             break;
00255 
00256         case ExponentFirstState:
00257             if (isdigit(nextChar))
00258             {
00259                 state = ExponentState;
00260                 exponentValue = (int) nextChar - (int) '0';
00261             }
00262             else if (nextChar == '-')
00263             {
00264                 state = ExponentState;
00265                 exponentSign = -1;
00266             }
00267             else if (nextChar == '+')
00268             {
00269                 state = ExponentState;
00270             }
00271             else
00272             {
00273                 state = ErrorState;
00274                 syntaxError("Bad character in number");
00275             }
00276             break;
00277 
00278         case ExponentState:
00279             if (isdigit(nextChar))
00280             {
00281                 state = ExponentState;
00282                 exponentValue = exponentValue * 10 + (int) nextChar - (int) '0';
00283             }
00284             else if (issep(nextChar))
00285             {
00286                 newToken = TokenNumber;
00287                 haveValidNumber = true;
00288             }
00289             else
00290             {
00291                 state = ErrorState;
00292                 syntaxError("Bad character in number");
00293             }
00294             break;
00295 
00296         case DotState:
00297             if (isdigit(nextChar))
00298             {
00299                 state = FractionState;
00300                 fractionValue = fractionValue * 10 + (int) nextChar - (int) '0';
00301                 fracExp = 10;
00302             }
00303             else
00304             {
00305                 state = ErrorState;
00306                 syntaxError("'.' in stupid place");
00307             }
00308             break;
00309 
00310         case UnicodeEscapeState:
00311             if (isxdigit(nextChar))
00312             {
00313                 unsigned int digitValue;
00314                 if (nextChar >= 'a' && nextChar <= 'f')
00315                     digitValue = nextChar - 'a' + 10;
00316                 else if (nextChar >= 'A' && nextChar <= 'F')
00317                     digitValue = nextChar - 'A' + 10;
00318                 else
00319                     digitValue = nextChar - '0';
00320                 unicodeValue = (unicodeValue << 4) + digitValue;
00321                 unicodeEscapeDigits++;
00322                 if (unicodeEscapeDigits == 4)
00323                 {
00324                     char utf8Encoded[7];
00325                     UTF8Encode((wchar_t) unicodeValue, utf8Encoded);
00326                     textToken += utf8Encoded;
00327                     state = StringState;
00328                 }
00329             }
00330             else
00331             {
00332                 state = ErrorState;
00333                 syntaxError("Bad Unicode escape in string");
00334             }
00335             break;
00336         }
00337 
00338         if (newToken == TokenBegin)
00339         {
00340             nextChar = readChar();
00341         }
00342     }
00343 
00344     tokenType = newToken;
00345     if (haveValidNumber)
00346     {
00347         numberValue = integerValue + fractionValue / fracExp;
00348         if (exponentValue != 0)
00349             numberValue *= pow(10.0, exponentValue * exponentSign);
00350         numberValue *= sign;
00351     }
00352 
00353     return tokenType;
00354 }
00355 
00356 
00357 Tokenizer::TokenType Tokenizer::getTokenType()
00358 {
00359     return tokenType;
00360 }
00361 
00362 
00363 void Tokenizer::pushBack()
00364 {
00365     pushedBack = true;
00366 }
00367 
00368 
00369 double Tokenizer::getNumberValue()
00370 {
00371     return numberValue;
00372 }
00373 
00374 
00375 string Tokenizer::getNameValue()
00376 {
00377     return textToken;
00378 }
00379 
00380 
00381 string Tokenizer::getStringValue()
00382 {
00383     return textToken;
00384 }
00385 
00386 
00387 int Tokenizer::readChar()
00388 {
00389     int c = (int) in->get();
00390     if (c == '\n')
00391         lineNum++;
00392 
00393     return c;
00394 }
00395 
00396 void Tokenizer::syntaxError(const char* message)
00397 {
00398     cerr << message << '\n';
00399 }
00400 
00401 
00402 int Tokenizer::getLineNumber() const
00403 {
00404     return lineNum;
00405 }
00406 
#if 0
// Tokenizer test
//
// Manual test driver (compiled out): tokenizes standard input and prints one
// line per token. The loop stops at the end of the input and also at the
// first error, so malformed input cannot keep it running forever.
int main(int argc, char *argv[])
{
    Tokenizer tokenizer(&cin);
    Tokenizer::TokenType tok = Tokenizer::TokenBegin;

    while (tok != Tokenizer::TokenEnd && tok != Tokenizer::TokenError)
    {
        tok = tokenizer.nextToken();
        switch (tok)
        {
        case Tokenizer::TokenBegin:
            cout << "Begin";
            break;
        case Tokenizer::TokenEnd:
            cout << "End";
            break;
        case Tokenizer::TokenName:
            cout << "Name = " << tokenizer.getNameValue();
            break;
        case Tokenizer::TokenNumber:
            cout << "Number = " << tokenizer.getNumberValue();
            break;
        case Tokenizer::TokenString:
            cout << "String = " << '"' << tokenizer.getStringValue() << '"';
            break;
        case Tokenizer::TokenBeginGroup:
            cout << '{';
            break;
        case Tokenizer::TokenEndGroup:
            cout << '}';
            break;
        case Tokenizer::TokenBeginArray:
            cout << '[';
            break;
        case Tokenizer::TokenEndArray:
            cout << ']';
            break;
        case Tokenizer::TokenEquals:
            cout << '=';
            break;
        case Tokenizer::TokenBar:
            cout << '|';
            break;
        case Tokenizer::TokenError:
            cout << "Error (line " << tokenizer.getLineNumber() << ')';
            break;
        default:
            cout << "Other";
            break;
        }

        cout << '\n';
    }

    return 0;
}
#endif

Generated on Sat Jan 14 22:30:29 2006 for Celestia by  doxygen 1.4.1