| 1 | #ifndef INC_CharScanner_hpp__ | 
| 2 | #define INC_CharScanner_hpp__ | 
| 3 |  | 
| 4 | /* ANTLR Translator Generator | 
| 5 | * Project led by Terence Parr at http://www.jGuru.com | 
| 6 | * Software rights: http://www.antlr.org/license.html | 
| 7 | * | 
| 8 | * $Id$ | 
| 9 | */ | 
| 10 |  | 
| 11 | #include <antlr/config.hpp> | 
| 12 |  | 
| 13 | #include <map> | 
| 14 |  | 
| 15 | #ifdef HAS_NOT_CCTYPE_H | 
| 16 | #include <ctype.h> | 
| 17 | #else | 
| 18 | #include <cctype> | 
| 19 | #endif | 
| 20 |  | 
| 21 | #if ( _MSC_VER == 1200 ) | 
| 22 | // VC6 seems to need this | 
| 23 | // note that this is not a standard C++ include file. | 
| 24 | # include <stdio.h> | 
| 25 | #endif | 
| 26 |  | 
| 27 | #include <antlr/TokenStream.hpp> | 
| 28 | #include <antlr/RecognitionException.hpp> | 
| 29 | #include <antlr/SemanticException.hpp> | 
| 30 | #include <antlr/MismatchedCharException.hpp> | 
| 31 | #include <antlr/InputBuffer.hpp> | 
| 32 | #include <antlr/BitSet.hpp> | 
| 33 | #include <antlr/LexerSharedInputState.hpp> | 
| 34 |  | 
| 35 | #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE | 
| 36 | namespace antlr { | 
| 37 | #endif | 
| 38 |  | 
| 39 | class ANTLR_API CharScanner; | 
| 40 |  | 
| 41 | ANTLR_C_USING(tolower) | 
| 42 |  | 
| 43 | #ifdef ANTLR_REALLY_NO_STRCASECMP | 
| 44 | // Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior | 
| 45 | // on the mac has neither... | 
/** Portable fallback for the non-standard strcasecmp()/stricmp().
 *
 * Walks both NUL-terminated strings in lock step, lower-casing each byte
 * with tolower() before comparing.
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return -1 if s1 orders before s2, 1 if it orders after s2, and 0 if both
 *         strings are equal when case is ignored
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	while (true)
	{
		// Convert through unsigned char first: passing tolower() a negative
		// value other than EOF is undefined behaviour, and bytes with the
		// high bit set have to be ordered by their unsigned value.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));
		if (c1 < c2) return -1;
		if (c1 > c2) return 1;
		if (c1 == 0) return 0;	// both strings ended at the same position
	}
}
| 57 | #else | 
| 58 | #ifdef NO_STRCASECMP | 
| 59 | ANTLR_C_USING(stricmp) | 
| 60 | #else | 
| 61 | ANTLR_C_USING(strcasecmp) | 
| 62 | #endif | 
| 63 | #endif | 
| 64 |  | 
| 65 | /** Functor for the literals map | 
| 66 | */ | 
| 67 | class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> { | 
| 68 | private: | 
| 69 | const CharScanner* scanner; | 
| 70 | public: | 
| 71 | #ifdef NO_TEMPLATE_PARTS | 
| 72 | CharScannerLiteralsLess() {} // not really used, definition to appease MSVC | 
| 73 | #endif | 
| 74 | CharScannerLiteralsLess(const CharScanner* theScanner) | 
| 75 | : scanner(theScanner) | 
| 76 | { | 
| 77 | } | 
| 78 | bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const; | 
| 79 | // defaults are good enough.. | 
| 80 | //      CharScannerLiteralsLess(const CharScannerLiteralsLess&); | 
| 81 | //      CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&); | 
| 82 | }; | 
| 83 |  | 
| 84 | /** Superclass of generated lexers | 
| 85 | */ | 
| 86 | class ANTLR_API CharScanner : public TokenStream { | 
| 87 | protected: | 
| 88 | typedef RefToken (*factory_type)(); | 
| 89 | public: | 
| 90 | CharScanner(InputBuffer& cb, bool case_sensitive ); | 
| 91 | CharScanner(InputBuffer* cb, bool case_sensitive ); | 
| 92 | CharScanner(const LexerSharedInputState& state, bool case_sensitive ); | 
| 93 |  | 
| 94 | virtual ~CharScanner() | 
| 95 | { | 
| 96 | } | 
| 97 |  | 
| 98 | virtual int LA(unsigned int i); | 
| 99 |  | 
| 100 | virtual void append(char c) | 
| 101 | { | 
| 102 | if (saveConsumedInput) | 
| 103 | { | 
| 104 | size_t l = text.length(); | 
| 105 |  | 
| 106 | if ((l%256) == 0) | 
| 107 | text.reserve(l+256); | 
| 108 |  | 
| 109 | text.replace(l,0,&c,1); | 
| 110 | } | 
| 111 | } | 
| 112 |  | 
| 113 | virtual void append(const ANTLR_USE_NAMESPACE(std)string& s) | 
| 114 | { | 
| 115 | if( saveConsumedInput ) | 
| 116 | text += s; | 
| 117 | } | 
| 118 |  | 
| 119 | virtual void commit() | 
| 120 | { | 
| 121 | inputState->getInput().commit(); | 
| 122 | } | 
| 123 |  | 
| 124 | /** called by the generated lexer to do error recovery, override to | 
| 125 | * customize the behaviour. | 
| 126 | */ | 
| 127 | virtual void recover(const RecognitionException& ex, const BitSet& tokenSet) | 
| 128 | { | 
| 129 | consume(); | 
| 130 | consumeUntil(tokenSet); | 
| 131 | } | 
| 132 |  | 
| 133 | virtual void consume() | 
| 134 | { | 
| 135 | if (inputState->guessing == 0) | 
| 136 | { | 
| 137 | int c = LA(1); | 
| 138 | if (caseSensitive) | 
| 139 | { | 
| 140 | append(c); | 
| 141 | } | 
| 142 | else | 
| 143 | { | 
| 144 | // use input.LA(), not LA(), to get original case | 
| 145 | // CharScanner.LA() would toLower it. | 
| 146 | append(inputState->getInput().LA(1)); | 
| 147 | } | 
| 148 |  | 
| 149 | // RK: in a sense I don't like this automatic handling. | 
| 150 | if (c == '\t') | 
| 151 | tab(); | 
| 152 | else | 
| 153 | inputState->column++; | 
| 154 | } | 
| 155 | inputState->getInput().consume(); | 
| 156 | } | 
| 157 |  | 
| 158 | /** Consume chars until one matches the given char */ | 
| 159 | virtual void consumeUntil(int c) | 
| 160 | { | 
| 161 | for(;;) | 
| 162 | { | 
| 163 | int la_1 = LA(1); | 
| 164 | if( la_1 == EOF_CHAR || la_1 == c ) | 
| 165 | break; | 
| 166 | consume(); | 
| 167 | } | 
| 168 | } | 
| 169 |  | 
| 170 | /** Consume chars until one matches the given set */ | 
| 171 | virtual void consumeUntil(const BitSet& set) | 
| 172 | { | 
| 173 | for(;;) | 
| 174 | { | 
| 175 | int la_1 = LA(1); | 
| 176 | if( la_1 == EOF_CHAR || set.member(la_1) ) | 
| 177 | break; | 
| 178 | consume(); | 
| 179 | } | 
| 180 | } | 
| 181 |  | 
| 182 | /// Mark the current position and return a id for it | 
| 183 | virtual unsigned int mark() | 
| 184 | { | 
| 185 | return inputState->getInput().mark(); | 
| 186 | } | 
| 187 | /// Rewind the scanner to a previously marked position | 
| 188 | virtual void rewind(unsigned int pos) | 
| 189 | { | 
| 190 | inputState->getInput().rewind(pos); | 
| 191 | } | 
| 192 |  | 
| 193 | /// See if input contains character 'c' throw MismatchedCharException if not | 
| 194 | virtual void match(int c) | 
| 195 | { | 
| 196 | int la_1 = LA(1); | 
| 197 | if ( la_1 != c ) | 
| 198 | throw MismatchedCharException(la_1, c, false, this); | 
| 199 | consume(); | 
| 200 | } | 
| 201 |  | 
| 202 | /** See if input contains element from bitset b | 
| 203 | * throw MismatchedCharException if not | 
| 204 | */ | 
| 205 | virtual void match(const BitSet& b) | 
| 206 | { | 
| 207 | int la_1 = LA(1); | 
| 208 |  | 
| 209 | if ( !b.member(la_1) ) | 
| 210 | throw MismatchedCharException( la_1, b, false, this ); | 
| 211 | consume(); | 
| 212 | } | 
| 213 |  | 
| 214 | /** See if input contains string 's' throw MismatchedCharException if not | 
| 215 | * @note the string cannot match EOF | 
| 216 | */ | 
| 217 | virtual void match( const char* s ) | 
| 218 | { | 
| 219 | while( *s != '\0' ) | 
| 220 | { | 
| 221 | // the & 0xFF is here to prevent sign extension lateron | 
| 222 | int la_1 = LA(1), c = (*s++ & 0xFF); | 
| 223 |  | 
| 224 | if ( la_1 != c ) | 
| 225 | throw MismatchedCharException(la_1, c, false, this); | 
| 226 |  | 
| 227 | consume(); | 
| 228 | } | 
| 229 | } | 
| 230 | /** See if input contains string 's' throw MismatchedCharException if not | 
| 231 | * @note the string cannot match EOF | 
| 232 | */ | 
| 233 | virtual void match(const ANTLR_USE_NAMESPACE(std)string& s) | 
| 234 | { | 
| 235 | size_t len = s.length(); | 
| 236 |  | 
| 237 | for (size_t i = 0; i < len; i++) | 
| 238 | { | 
| 239 | // the & 0xFF is here to prevent sign extension lateron | 
| 240 | int la_1 = LA(1), c = (s[i] & 0xFF); | 
| 241 |  | 
| 242 | if ( la_1 != c ) | 
| 243 | throw MismatchedCharException(la_1, c, false, this); | 
| 244 |  | 
| 245 | consume(); | 
| 246 | } | 
| 247 | } | 
| 248 | /** See if input does not contain character 'c' | 
| 249 | * throw MismatchedCharException if not | 
| 250 | */ | 
| 251 | virtual void matchNot(int c) | 
| 252 | { | 
| 253 | int la_1 = LA(1); | 
| 254 |  | 
| 255 | if ( la_1 == c ) | 
| 256 | throw MismatchedCharException(la_1, c, true, this); | 
| 257 |  | 
| 258 | consume(); | 
| 259 | } | 
| 260 | /** See if input contains character in range c1-c2 | 
| 261 | * throw MismatchedCharException if not | 
| 262 | */ | 
| 263 | virtual void matchRange(int c1, int c2) | 
| 264 | { | 
| 265 | int la_1 = LA(1); | 
| 266 |  | 
| 267 | if ( la_1 < c1 || la_1 > c2 ) | 
| 268 | throw MismatchedCharException(la_1, c1, c2, false, this); | 
| 269 |  | 
| 270 | consume(); | 
| 271 | } | 
| 272 |  | 
| 273 | virtual bool getCaseSensitive() const | 
| 274 | { | 
| 275 | return caseSensitive; | 
| 276 | } | 
| 277 |  | 
| 278 | virtual void setCaseSensitive(bool t) | 
| 279 | { | 
| 280 | caseSensitive = t; | 
| 281 | } | 
| 282 |  | 
| 283 | virtual bool getCaseSensitiveLiterals() const=0; | 
| 284 |  | 
| 285 | /// Get the line the scanner currently is in (starts at 1) | 
| 286 | virtual int getLine() const | 
| 287 | { | 
| 288 | return inputState->line; | 
| 289 | } | 
| 290 |  | 
| 291 | /// set the line number | 
| 292 | virtual void setLine(int l) | 
| 293 | { | 
| 294 | inputState->line = l; | 
| 295 | } | 
| 296 |  | 
| 297 | /// Get the column the scanner currently is in (starts at 1) | 
| 298 | virtual int getColumn() const | 
| 299 | { | 
| 300 | return inputState->column; | 
| 301 | } | 
| 302 | /// set the column number | 
| 303 | virtual void setColumn(int c) | 
| 304 | { | 
| 305 | inputState->column = c; | 
| 306 | } | 
| 307 |  | 
| 308 | /// get the filename for the file currently used | 
| 309 | virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const | 
| 310 | { | 
| 311 | return inputState->filename; | 
| 312 | } | 
| 313 | /// Set the filename the scanner is using (used in error messages) | 
| 314 | virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f) | 
| 315 | { | 
| 316 | inputState->filename = f; | 
| 317 | } | 
| 318 |  | 
| 319 | virtual bool getCommitToPath() const | 
| 320 | { | 
| 321 | return commitToPath; | 
| 322 | } | 
| 323 |  | 
| 324 | virtual void setCommitToPath(bool commit) | 
| 325 | { | 
| 326 | commitToPath = commit; | 
| 327 | } | 
| 328 |  | 
| 329 | /** return a copy of the current text buffer */ | 
| 330 | virtual const ANTLR_USE_NAMESPACE(std)string& getText() const | 
| 331 | { | 
| 332 | return text; | 
| 333 | } | 
| 334 |  | 
| 335 | virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s) | 
| 336 | { | 
| 337 | text = s; | 
| 338 | } | 
| 339 |  | 
| 340 | virtual void resetText() | 
| 341 | { | 
| 342 | text = ""; | 
| 343 | inputState->tokenStartColumn = inputState->column; | 
| 344 | inputState->tokenStartLine = inputState->line; | 
| 345 | } | 
| 346 |  | 
| 347 | virtual RefToken getTokenObject() const | 
| 348 | { | 
| 349 | return _returnToken; | 
| 350 | } | 
| 351 |  | 
| 352 | /** Used to keep track of line breaks, needs to be called from | 
| 353 | * within generated lexers when a \n \r is encountered. | 
| 354 | */ | 
| 355 | virtual void newline() | 
| 356 | { | 
| 357 | ++inputState->line; | 
| 358 | inputState->column = 1; | 
| 359 | } | 
| 360 |  | 
| 361 | /** Advance the current column number by an appropriate amount according | 
| 362 | * to the tabsize. This method needs to be explicitly called from the | 
| 363 | * lexer rules encountering tabs. | 
| 364 | */ | 
| 365 | virtual void tab() | 
| 366 | { | 
| 367 | int c = getColumn(); | 
| 368 | int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1;      // calculate tab stop | 
| 369 | setColumn( nc ); | 
| 370 | } | 
| 371 | /// set the tabsize. Returns the old tabsize | 
| 372 | int setTabsize( int size ) | 
| 373 | { | 
| 374 | int oldsize = tabsize; | 
| 375 | tabsize = size; | 
| 376 | return oldsize; | 
| 377 | } | 
| 378 | /// Return the tabsize used by the scanner | 
| 379 | int getTabSize() const | 
| 380 | { | 
| 381 | return tabsize; | 
| 382 | } | 
| 383 |  | 
| 384 | /** Report exception errors caught in nextToken() */ | 
| 385 | virtual void reportError(const RecognitionException& e); | 
| 386 |  | 
| 387 | /** Parser error-reporting function can be overridden in subclass */ | 
| 388 | virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s); | 
| 389 |  | 
| 390 | /** Parser warning-reporting function can be overridden in subclass */ | 
| 391 | virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s); | 
| 392 |  | 
| 393 | virtual InputBuffer& getInputBuffer() | 
| 394 | { | 
| 395 | return inputState->getInput(); | 
| 396 | } | 
| 397 |  | 
| 398 | virtual LexerSharedInputState getInputState() | 
| 399 | { | 
| 400 | return inputState; | 
| 401 | } | 
| 402 |  | 
| 403 | /** set the input state for the lexer. | 
| 404 | * @note state is a reference counted object, hence no reference */ | 
| 405 | virtual void setInputState(LexerSharedInputState state) | 
| 406 | { | 
| 407 | inputState = state; | 
| 408 | } | 
| 409 |  | 
| 410 | /// Set the factory for created tokens | 
| 411 | virtual void setTokenObjectFactory(factory_type factory) | 
| 412 | { | 
| 413 | tokenFactory = factory; | 
| 414 | } | 
| 415 |  | 
| 416 | /** Test the token text against the literals table | 
| 417 | * Override this method to perform a different literals test | 
| 418 | */ | 
| 419 | virtual int testLiteralsTable(int ttype) const | 
| 420 | { | 
| 421 | ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text); | 
| 422 | if (i != literals.end()) | 
| 423 | ttype = (*i).second; | 
| 424 | return ttype; | 
| 425 | } | 
| 426 |  | 
| 427 | /** Test the text passed in against the literals table | 
| 428 | * Override this method to perform a different literals test | 
| 429 | * This is used primarily when you want to test a portion of | 
| 430 | * a token | 
| 431 | */ | 
| 432 | virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const | 
| 433 | { | 
| 434 | ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt); | 
| 435 | if (i != literals.end()) | 
| 436 | ttype = (*i).second; | 
| 437 | return ttype; | 
| 438 | } | 
| 439 |  | 
| 440 | /// Override this method to get more specific case handling | 
| 441 | virtual int toLower(int c) const | 
| 442 | { | 
| 443 | // test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?) | 
| 444 | // also VC++ 6.0 does this. (see fix 422 (is reverted by this fix) | 
| 445 | // this one is more structural. Maybe make this configurable. | 
| 446 | return (c == EOF_CHAR ? EOF_CHAR : tolower(c)); | 
| 447 | } | 
| 448 |  | 
| 449 | /** This method is called by YourLexer::nextToken() when the lexer has | 
| 450 | *  hit EOF condition.  EOF is NOT a character. | 
| 451 | *  This method is not called if EOF is reached during | 
| 452 | *  syntactic predicate evaluation or during evaluation | 
| 453 | *  of normal lexical rules, which presumably would be | 
| 454 | *  an IOException.  This traps the "normal" EOF condition. | 
| 455 | * | 
| 456 | *  uponEOF() is called after the complete evaluation of | 
| 457 | *  the previous token and only if your parser asks | 
| 458 | *  for another token beyond that last non-EOF token. | 
| 459 | * | 
| 460 | *  You might want to throw token or char stream exceptions | 
| 461 | *  like: "Heh, premature eof" or a retry stream exception | 
| 462 | *  ("I found the end of this file, go back to referencing file"). | 
| 463 | */ | 
| 464 | virtual void uponEOF() | 
| 465 | { | 
| 466 | } | 
| 467 |  | 
| 468 | /// Methods used to change tracing behavior | 
| 469 | virtual void traceIndent(); | 
| 470 | virtual void traceIn(const char* rname); | 
| 471 | virtual void traceOut(const char* rname); | 
| 472 |  | 
| 473 | #ifndef NO_STATIC_CONSTS | 
| 474 | static const int EOF_CHAR = EOF; | 
| 475 | #else | 
| 476 | enum { | 
| 477 | EOF_CHAR = EOF | 
| 478 | }; | 
| 479 | #endif | 
| 480 | protected: | 
| 481 | ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token | 
| 482 | /// flag indicating wether consume saves characters | 
| 483 | bool saveConsumedInput; | 
| 484 | factory_type tokenFactory;                              ///< Factory for tokens | 
| 485 | bool caseSensitive;                                             ///< Is this lexer case sensitive | 
| 486 | ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass | 
| 487 |  | 
| 488 | RefToken _returnToken;          ///< used to return tokens w/o using return val | 
| 489 |  | 
| 490 | /// Input state, gives access to input stream, shared among different lexers | 
| 491 | LexerSharedInputState inputState; | 
| 492 |  | 
| 493 | /** Used during filter mode to indicate that path is desired. | 
| 494 | * A subsequent scan error will report an error as usual | 
| 495 | * if acceptPath=true; | 
| 496 | */ | 
| 497 | bool commitToPath; | 
| 498 |  | 
| 499 | int tabsize;    ///< tab size the scanner uses. | 
| 500 |  | 
| 501 | /// Create a new RefToken of type t | 
| 502 | virtual RefToken makeToken(int t) | 
| 503 | { | 
| 504 | RefToken tok = tokenFactory(); | 
| 505 | tok->setType(t); | 
| 506 | tok->setColumn(inputState->tokenStartColumn); | 
| 507 | tok->setLine(inputState->tokenStartLine); | 
| 508 | return tok; | 
| 509 | } | 
| 510 |  | 
| 511 | /** Tracer class, used when -traceLexer is passed to antlr | 
| 512 | */ | 
| 513 | class Tracer { | 
| 514 | private: | 
| 515 | CharScanner* parser; | 
| 516 | const char* text; | 
| 517 |  | 
| 518 | Tracer(const Tracer& other);                                    // undefined | 
| 519 | Tracer& operator=(const Tracer& other);         // undefined | 
| 520 | public: | 
| 521 | Tracer( CharScanner* p,const char* t ) | 
| 522 | : parser(p), text(t) | 
| 523 | { | 
| 524 | parser->traceIn(text); | 
| 525 | } | 
| 526 | ~Tracer() | 
| 527 | { | 
| 528 | parser->traceOut(text); | 
| 529 | } | 
| 530 | }; | 
| 531 |  | 
| 532 | int traceDepth; | 
| 533 | private: | 
| 534 | CharScanner( const CharScanner& other );                                        // undefined | 
| 535 | CharScanner& operator=( const CharScanner& other );     // undefined | 
| 536 |  | 
| 537 | #ifndef NO_STATIC_CONSTS | 
| 538 | static const int NO_CHAR = 0; | 
| 539 | #else | 
| 540 | enum { | 
| 541 | NO_CHAR = 0 | 
| 542 | }; | 
| 543 | #endif | 
| 544 | }; | 
| 545 |  | 
| 546 | inline int CharScanner::LA(unsigned int i) | 
| 547 | { | 
| 548 | int c = inputState->getInput().LA(i); | 
| 549 |  | 
| 550 | if ( caseSensitive ) | 
| 551 | return c; | 
| 552 | else | 
| 553 | return toLower(c);      // VC 6 tolower bug caught in toLower. | 
| 554 | } | 
| 555 |  | 
| 556 | inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const | 
| 557 | { | 
| 558 | if (scanner->getCaseSensitiveLiterals()) | 
| 559 | return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y); | 
| 560 | else | 
| 561 | { | 
| 562 | #ifdef NO_STRCASECMP | 
| 563 | return (stricmp(x.c_str(),y.c_str())<0); | 
| 564 | #else | 
| 565 | return (strcasecmp(x.c_str(),y.c_str())<0); | 
| 566 | #endif | 
| 567 | } | 
| 568 | } | 
| 569 |  | 
| 570 | #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE | 
| 571 | } | 
| 572 | #endif | 
| 573 |  | 
| 574 | #endif //INC_CharScanner_hpp__ |