| 1 | #ifndef INC_CharScanner_hpp__ | 
| 2 | #define INC_CharScanner_hpp__ | 
| 3 |  | 
| 4 | /* ANTLR Translator Generator | 
| 5 | * Project led by Terence Parr at http://www.jGuru.com | 
| 6 | * Software rights: http://www.antlr.org/license.html | 
| 7 | * | 
| 8 | * $Id$ | 
| 9 | */ | 
| 10 |  | 
| 11 | #include <antlr/config.hpp> | 
| 12 |  | 
| 13 | #include <map> | 
| 14 | #include <cstdio> | 
| 15 |  | 
| 16 | #ifdef HAS_NOT_CCTYPE_H | 
| 17 | #include <ctype.h> | 
| 18 | #else | 
| 19 | #include <cctype> | 
| 20 | #endif | 
| 21 |  | 
| 22 | #if ( _MSC_VER == 1200 ) | 
| 23 | // VC6 seems to need this | 
| 24 | // note that this is not a standard C++ include file. | 
| 25 | # include <stdio.h> | 
| 26 | #endif | 
| 27 |  | 
| 28 | #include <antlr/TokenStream.hpp> | 
| 29 | #include <antlr/RecognitionException.hpp> | 
| 30 | #include <antlr/SemanticException.hpp> | 
| 31 | #include <antlr/MismatchedCharException.hpp> | 
| 32 | #include <antlr/InputBuffer.hpp> | 
| 33 | #include <antlr/BitSet.hpp> | 
| 34 | #include <antlr/LexerSharedInputState.hpp> | 
| 35 |  | 
| 36 | #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE | 
| 37 | namespace antlr { | 
| 38 | #endif | 
| 39 |  | 
| 40 | class ANTLR_API CharScanner; | 
| 41 |  | 
| 42 | ANTLR_C_USING(tolower) | 
| 43 |  | 
| 44 | #if !defined(HAVE_STRCASECMP) && defined(HAVE_STRICMP) && !defined(stricmp) | 
| 45 | #define strcasecmp stricmp | 
| 46 | #endif | 
| 47 | #if !defined(HAVE_STRNCASECMP) && defined(HAVE_STRNICMP) && !defined(strnicmp) | 
| 48 | #define strncasecmp strnicmp | 
| 49 | #endif | 
| 50 |  | 
| 51 |  | 
| 52 | #if !defined(HAVE_STRCASECMP) && !defined(HAVE_STRICMP) | 
/** Fallback for platforms that provide neither strcasecmp() nor stricmp():
 * compare two NUL-terminated strings while ignoring case.
 *
 * Each byte is converted to unsigned char before it is handed to tolower()
 * (passing a negative char value to tolower() is undefined behaviour) and the
 * lower-cased values are compared as ints, so bytes above 0x7F sort after
 * ASCII, as POSIX specifies for strcasecmp().
 *
 * @param s1 first string, must not be null
 * @param s2 second string, must not be null
 * @return -1 if s1 sorts before s2, 1 if it sorts after it, 0 if both are
 *         equal when case is ignored
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	for (;;)
	{
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));

		if (c1 != c2)
			return (c1 < c2) ? -1 : 1;
		if (c1 == 0)	// both strings ended at the same position
			return 0;
	}
}
| 64 | #endif | 
| 65 |  | 
| 66 | /** Functor for the literals map | 
| 67 | */ | 
| 68 | class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> { | 
| 69 | private: | 
| 70 | const CharScanner* scanner; | 
| 71 | public: | 
| 72 | #ifdef NO_TEMPLATE_PARTS | 
| 73 | CharScannerLiteralsLess() {} // not really used, definition to appease MSVC | 
| 74 | #endif | 
| 75 | CharScannerLiteralsLess(const CharScanner* theScanner) | 
| 76 | : scanner(theScanner) | 
| 77 | { | 
| 78 | } | 
| 79 | bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const; | 
| 80 | // defaults are good enough.. | 
| 81 | //      CharScannerLiteralsLess(const CharScannerLiteralsLess&); | 
| 82 | //      CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&); | 
| 83 | }; | 
| 84 |  | 
| 85 | /** Superclass of generated lexers | 
| 86 | */ | 
| 87 | class ANTLR_API CharScanner : public TokenStream { | 
| 88 | protected: | 
| 89 | typedef RefToken (*factory_type)(); | 
| 90 | public: | 
| 91 | CharScanner(InputBuffer& cb, bool case_sensitive ); | 
| 92 | CharScanner(InputBuffer* cb, bool case_sensitive ); | 
| 93 | CharScanner(const LexerSharedInputState& state, bool case_sensitive ); | 
| 94 |  | 
| 95 | virtual ~CharScanner() | 
| 96 | { | 
| 97 | } | 
| 98 |  | 
| 99 | virtual int LA(unsigned int i); | 
| 100 |  | 
| 101 | virtual void append(char c) | 
| 102 | { | 
| 103 | if (saveConsumedInput) | 
| 104 | { | 
| 105 | size_t l = text.length(); | 
| 106 |  | 
| 107 | if ((l%256) == 0) | 
| 108 | text.reserve(l+256); | 
| 109 |  | 
| 110 | text.replace(l,0,&c,1); | 
| 111 | } | 
| 112 | } | 
| 113 |  | 
| 114 | virtual void append(const ANTLR_USE_NAMESPACE(std)string& s) | 
| 115 | { | 
| 116 | if( saveConsumedInput ) | 
| 117 | text += s; | 
| 118 | } | 
| 119 |  | 
| 120 | virtual void commit() | 
| 121 | { | 
| 122 | inputState->getInput().commit(); | 
| 123 | } | 
| 124 |  | 
| 125 | virtual void consume() | 
| 126 | { | 
| 127 | if (inputState->guessing == 0) | 
| 128 | { | 
| 129 | int c = LA(1); | 
| 130 | if (caseSensitive) | 
| 131 | { | 
| 132 | append(c); | 
| 133 | } | 
| 134 | else | 
| 135 | { | 
| 136 | // use input.LA(), not LA(), to get original case | 
| 137 | // CharScanner.LA() would toLower it. | 
| 138 | append(inputState->getInput().LA(1)); | 
| 139 | } | 
| 140 |  | 
| 141 | // RK: in a sense I don't like this automatic handling. | 
| 142 | if (c == '\t') | 
| 143 | tab(); | 
| 144 | else | 
| 145 | inputState->column++; | 
| 146 | } | 
| 147 | inputState->getInput().consume(); | 
| 148 | } | 
| 149 |  | 
| 150 | /** Consume chars until one matches the given char */ | 
| 151 | virtual void consumeUntil(int c) | 
| 152 | { | 
| 153 | for(;;) | 
| 154 | { | 
| 155 | int la_1 = LA(1); | 
| 156 | if( la_1 == EOF_CHAR || la_1 == c ) | 
| 157 | break; | 
| 158 | consume(); | 
| 159 | } | 
| 160 | } | 
| 161 |  | 
| 162 | /** Consume chars until one matches the given set */ | 
| 163 | virtual void consumeUntil(const BitSet& set) | 
| 164 | { | 
| 165 | for(;;) | 
| 166 | { | 
| 167 | int la_1 = LA(1); | 
| 168 | if( la_1 == EOF_CHAR || set.member(la_1) ) | 
| 169 | break; | 
| 170 | consume(); | 
| 171 | } | 
| 172 | } | 
| 173 |  | 
| 174 | /// Mark the current position and return a id for it | 
| 175 | virtual unsigned int mark() | 
| 176 | { | 
| 177 | return inputState->getInput().mark(); | 
| 178 | } | 
| 179 | /// Rewind the scanner to a previously marked position | 
| 180 | virtual void rewind(unsigned int pos) | 
| 181 | { | 
| 182 | inputState->getInput().rewind(pos); | 
| 183 | } | 
| 184 |  | 
| 185 | /// See if input contains character 'c' throw MismatchedCharException if not | 
| 186 | virtual void match(int c) | 
| 187 | { | 
| 188 | int la_1 = LA(1); | 
| 189 | if ( la_1 != c ) | 
| 190 | throw MismatchedCharException(la_1, c, false, this); | 
| 191 | consume(); | 
| 192 | } | 
| 193 |  | 
| 194 | /** See if input contains element from bitset b | 
| 195 | * throw MismatchedCharException if not | 
| 196 | */ | 
| 197 | virtual void match(const BitSet& b) | 
| 198 | { | 
| 199 | int la_1 = LA(1); | 
| 200 |  | 
| 201 | if ( !b.member(la_1) ) | 
| 202 | throw MismatchedCharException( la_1, b, false, this ); | 
| 203 | consume(); | 
| 204 | } | 
| 205 |  | 
| 206 | /** See if input contains string 's' throw MismatchedCharException if not | 
| 207 | * @note the string cannot match EOF | 
| 208 | */ | 
| 209 | virtual void match( const char* s ) | 
| 210 | { | 
| 211 | while( *s != '\0' ) | 
| 212 | { | 
| 213 | // the & 0xFF is here to prevent sign extension lateron | 
| 214 | int la_1 = LA(1), c = (*s++ & 0xFF); | 
| 215 |  | 
| 216 | if ( la_1 != c ) | 
| 217 | throw MismatchedCharException(la_1, c, false, this); | 
| 218 |  | 
| 219 | consume(); | 
| 220 | } | 
| 221 | } | 
| 222 | /** See if input contains string 's' throw MismatchedCharException if not | 
| 223 | * @note the string cannot match EOF | 
| 224 | */ | 
| 225 | virtual void match(const ANTLR_USE_NAMESPACE(std)string& s) | 
| 226 | { | 
| 227 | size_t len = s.length(); | 
| 228 |  | 
| 229 | for (size_t i = 0; i < len; i++) | 
| 230 | { | 
| 231 | // the & 0xFF is here to prevent sign extension lateron | 
| 232 | int la_1 = LA(1), c = (s[i] & 0xFF); | 
| 233 |  | 
| 234 | if ( la_1 != c ) | 
| 235 | throw MismatchedCharException(la_1, c, false, this); | 
| 236 |  | 
| 237 | consume(); | 
| 238 | } | 
| 239 | } | 
| 240 | /** See if input does not contain character 'c' | 
| 241 | * throw MismatchedCharException if not | 
| 242 | */ | 
| 243 | virtual void matchNot(int c) | 
| 244 | { | 
| 245 | int la_1 = LA(1); | 
| 246 |  | 
| 247 | if ( la_1 == c ) | 
| 248 | throw MismatchedCharException(la_1, c, true, this); | 
| 249 |  | 
| 250 | consume(); | 
| 251 | } | 
| 252 | /** See if input contains character in range c1-c2 | 
| 253 | * throw MismatchedCharException if not | 
| 254 | */ | 
| 255 | virtual void matchRange(int c1, int c2) | 
| 256 | { | 
| 257 | int la_1 = LA(1); | 
| 258 |  | 
| 259 | if ( la_1 < c1 || la_1 > c2 ) | 
| 260 | throw MismatchedCharException(la_1, c1, c2, false, this); | 
| 261 |  | 
| 262 | consume(); | 
| 263 | } | 
| 264 |  | 
| 265 | virtual bool getCaseSensitive() const | 
| 266 | { | 
| 267 | return caseSensitive; | 
| 268 | } | 
| 269 |  | 
| 270 | virtual void setCaseSensitive(bool t) | 
| 271 | { | 
| 272 | caseSensitive = t; | 
| 273 | } | 
| 274 |  | 
| 275 | virtual bool getCaseSensitiveLiterals() const=0; | 
| 276 |  | 
| 277 | /// Get the line the scanner currently is in (starts at 1) | 
| 278 | virtual int getLine() const | 
| 279 | { | 
| 280 | return inputState->line; | 
| 281 | } | 
| 282 |  | 
| 283 | /// set the line number | 
| 284 | virtual void setLine(int l) | 
| 285 | { | 
| 286 | inputState->line = l; | 
| 287 | } | 
| 288 |  | 
| 289 | /// Get the column the scanner currently is in (starts at 1) | 
| 290 | virtual int getColumn() const | 
| 291 | { | 
| 292 | return inputState->column; | 
| 293 | } | 
| 294 | /// set the column number | 
| 295 | virtual void setColumn(int c) | 
| 296 | { | 
| 297 | inputState->column = c; | 
| 298 | } | 
| 299 |  | 
| 300 | /// get the filename for the file currently used | 
| 301 | virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const | 
| 302 | { | 
| 303 | return inputState->filename; | 
| 304 | } | 
| 305 | /// Set the filename the scanner is using (used in error messages) | 
| 306 | virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f) | 
| 307 | { | 
| 308 | inputState->filename = f; | 
| 309 | } | 
| 310 |  | 
| 311 | virtual bool getCommitToPath() const | 
| 312 | { | 
| 313 | return commitToPath; | 
| 314 | } | 
| 315 |  | 
| 316 | virtual void setCommitToPath(bool commit) | 
| 317 | { | 
| 318 | commitToPath = commit; | 
| 319 | } | 
| 320 |  | 
| 321 | /** return a copy of the current text buffer */ | 
| 322 | virtual const ANTLR_USE_NAMESPACE(std)string& getText() const | 
| 323 | { | 
| 324 | return text; | 
| 325 | } | 
| 326 |  | 
| 327 | virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s) | 
| 328 | { | 
| 329 | text = s; | 
| 330 | } | 
| 331 |  | 
| 332 | virtual void resetText() | 
| 333 | { | 
| 334 | text = ""; | 
| 335 | inputState->tokenStartColumn = inputState->column; | 
| 336 | inputState->tokenStartLine = inputState->line; | 
| 337 | } | 
| 338 |  | 
| 339 | virtual RefToken getTokenObject() const | 
| 340 | { | 
| 341 | return _returnToken; | 
| 342 | } | 
| 343 |  | 
| 344 | /** Used to keep track of line breaks, needs to be called from | 
| 345 | * within generated lexers when a \n \r is encountered. | 
| 346 | */ | 
| 347 | virtual void newline() | 
| 348 | { | 
| 349 | ++inputState->line; | 
| 350 | inputState->column = 1; | 
| 351 | } | 
| 352 |  | 
| 353 | /** Advance the current column number by an appropriate amount according | 
| 354 | * to the tabsize. This method needs to be explicitly called from the | 
| 355 | * lexer rules encountering tabs. | 
| 356 | */ | 
| 357 | virtual void tab() | 
| 358 | { | 
| 359 | int c = getColumn(); | 
| 360 | int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1;      // calculate tab stop | 
| 361 | setColumn( nc ); | 
| 362 | } | 
| 363 | /// set the tabsize. Returns the old tabsize | 
| 364 | int setTabsize( int size ) | 
| 365 | { | 
| 366 | int oldsize = tabsize; | 
| 367 | tabsize = size; | 
| 368 | return oldsize; | 
| 369 | } | 
| 370 | /// Return the tabsize used by the scanner | 
| 371 | int getTabSize() const | 
| 372 | { | 
| 373 | return tabsize; | 
| 374 | } | 
| 375 |  | 
| 376 | /** Report exception errors caught in nextToken() */ | 
| 377 | virtual void reportError(const RecognitionException& e); | 
| 378 |  | 
| 379 | /** Parser error-reporting function can be overridden in subclass */ | 
| 380 | virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s); | 
| 381 |  | 
| 382 | /** Parser warning-reporting function can be overridden in subclass */ | 
| 383 | virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s); | 
| 384 |  | 
| 385 | virtual InputBuffer& getInputBuffer() | 
| 386 | { | 
| 387 | return inputState->getInput(); | 
| 388 | } | 
| 389 |  | 
| 390 | virtual LexerSharedInputState getInputState() | 
| 391 | { | 
| 392 | return inputState; | 
| 393 | } | 
| 394 |  | 
| 395 | /** set the input state for the lexer. | 
| 396 | * @note state is a reference counted object, hence no reference */ | 
| 397 | virtual void setInputState(LexerSharedInputState state) | 
| 398 | { | 
| 399 | inputState = state; | 
| 400 | } | 
| 401 |  | 
| 402 | /// Set the factory for created tokens | 
| 403 | virtual void setTokenObjectFactory(factory_type factory) | 
| 404 | { | 
| 405 | tokenFactory = factory; | 
| 406 | } | 
| 407 |  | 
| 408 | /** Test the token text against the literals table | 
| 409 | * Override this method to perform a different literals test | 
| 410 | */ | 
| 411 | virtual int testLiteralsTable(int ttype) const | 
| 412 | { | 
| 413 | ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text); | 
| 414 | if (i != literals.end()) | 
| 415 | ttype = (*i).second; | 
| 416 | return ttype; | 
| 417 | } | 
| 418 |  | 
| 419 | /** Test the text passed in against the literals table | 
| 420 | * Override this method to perform a different literals test | 
| 421 | * This is used primarily when you want to test a portion of | 
| 422 | * a token | 
| 423 | */ | 
| 424 | virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const | 
| 425 | { | 
| 426 | ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt); | 
| 427 | if (i != literals.end()) | 
| 428 | ttype = (*i).second; | 
| 429 | return ttype; | 
| 430 | } | 
| 431 |  | 
| 432 | /// Override this method to get more specific case handling | 
| 433 | virtual int toLower(int c) const | 
| 434 | { | 
| 435 | // test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?) | 
| 436 | // also VC++ 6.0 does this. (see fix 422 (is reverted by this fix) | 
| 437 | // this one is more structural. Maybe make this configurable. | 
| 438 | return (c == EOF_CHAR ? EOF_CHAR : tolower(c)); | 
| 439 | } | 
| 440 |  | 
| 441 | /** This method is called by YourLexer::nextToken() when the lexer has | 
| 442 | *  hit EOF condition.  EOF is NOT a character. | 
| 443 | *  This method is not called if EOF is reached during | 
| 444 | *  syntactic predicate evaluation or during evaluation | 
| 445 | *  of normal lexical rules, which presumably would be | 
| 446 | *  an IOException.  This traps the "normal" EOF condition. | 
| 447 | * | 
| 448 | *  uponEOF() is called after the complete evaluation of | 
| 449 | *  the previous token and only if your parser asks | 
| 450 | *  for another token beyond that last non-EOF token. | 
| 451 | * | 
| 452 | *  You might want to throw token or char stream exceptions | 
| 453 | *  like: "Heh, premature eof" or a retry stream exception | 
| 454 | *  ("I found the end of this file, go back to referencing file"). | 
| 455 | */ | 
| 456 | virtual void uponEOF() | 
| 457 | { | 
| 458 | } | 
| 459 |  | 
| 460 | /// Methods used to change tracing behavior | 
| 461 | virtual void traceIndent(); | 
| 462 | virtual void traceIn(const char* rname); | 
| 463 | virtual void traceOut(const char* rname); | 
| 464 |  | 
| 465 | #ifndef NO_STATIC_CONSTS | 
| 466 | static const int EOF_CHAR = EOF; | 
| 467 | #else | 
| 468 | enum { | 
| 469 | EOF_CHAR = EOF | 
| 470 | }; | 
| 471 | #endif | 
| 472 | protected: | 
| 473 | ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token | 
| 474 | /// flag indicating wether consume saves characters | 
| 475 | bool saveConsumedInput; | 
| 476 | factory_type tokenFactory;                              ///< Factory for tokens | 
| 477 | bool caseSensitive;                                             ///< Is this lexer case sensitive | 
| 478 | ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass | 
| 479 |  | 
| 480 | RefToken _returnToken;          ///< used to return tokens w/o using return val | 
| 481 |  | 
| 482 | /// Input state, gives access to input stream, shared among different lexers | 
| 483 | LexerSharedInputState inputState; | 
| 484 |  | 
| 485 | /** Used during filter mode to indicate that path is desired. | 
| 486 | * A subsequent scan error will report an error as usual | 
| 487 | * if acceptPath=true; | 
| 488 | */ | 
| 489 | bool commitToPath; | 
| 490 |  | 
| 491 | int tabsize;    ///< tab size the scanner uses. | 
| 492 |  | 
| 493 | /// Create a new RefToken of type t | 
| 494 | virtual RefToken makeToken(int t) | 
| 495 | { | 
| 496 | RefToken tok = tokenFactory(); | 
| 497 | tok->setType(t); | 
| 498 | tok->setColumn(inputState->tokenStartColumn); | 
| 499 | tok->setLine(inputState->tokenStartLine); | 
| 500 | return tok; | 
| 501 | } | 
| 502 |  | 
| 503 | /** Tracer class, used when -traceLexer is passed to antlr | 
| 504 | */ | 
| 505 | class Tracer { | 
| 506 | private: | 
| 507 | CharScanner* parser; | 
| 508 | const char* text; | 
| 509 |  | 
| 510 | Tracer(const Tracer& other);                                    // undefined | 
| 511 | Tracer& operator=(const Tracer& other);         // undefined | 
| 512 | public: | 
| 513 | Tracer( CharScanner* p,const char* t ) | 
| 514 | : parser(p), text(t) | 
| 515 | { | 
| 516 | parser->traceIn(text); | 
| 517 | } | 
| 518 | ~Tracer() | 
| 519 | { | 
| 520 | parser->traceOut(text); | 
| 521 | } | 
| 522 | }; | 
| 523 |  | 
| 524 | int traceDepth; | 
| 525 | private: | 
| 526 | CharScanner( const CharScanner& other );                                        // undefined | 
| 527 | CharScanner& operator=( const CharScanner& other );     // undefined | 
| 528 |  | 
| 529 | #ifndef NO_STATIC_CONSTS | 
| 530 | static const int NO_CHAR = 0; | 
| 531 | #else | 
| 532 | enum { | 
| 533 | NO_CHAR = 0 | 
| 534 | }; | 
| 535 | #endif | 
| 536 | }; | 
| 537 |  | 
| 538 | inline int CharScanner::LA(unsigned int i) | 
| 539 | { | 
| 540 | int c = inputState->getInput().LA(i); | 
| 541 |  | 
| 542 | if ( caseSensitive ) | 
| 543 | return c; | 
| 544 | else | 
| 545 | return toLower(c);      // VC 6 tolower bug caught in toLower. | 
| 546 | } | 
| 547 |  | 
| 548 | inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const | 
| 549 | { | 
| 550 | if (scanner->getCaseSensitiveLiterals()) | 
| 551 | return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y); | 
| 552 | else | 
| 553 | { | 
| 554 | #ifdef NO_STRCASECMP | 
| 555 | return (stricmp(x.c_str(),y.c_str())<0); | 
| 556 | #else | 
| 557 | return (strcasecmp(x.c_str(),y.c_str())<0); | 
| 558 | #endif | 
| 559 | } | 
| 560 | } | 
| 561 |  | 
| 562 | #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE | 
| 563 | } | 
| 564 | #endif | 
| 565 |  | 
| 566 | #endif //INC_CharScanner_hpp__ |