1 |
#ifndef INC_CharScanner_hpp__ |
2 |
#define INC_CharScanner_hpp__ |
3 |
|
4 |
/* ANTLR Translator Generator |
5 |
* Project led by Terence Parr at http://www.jGuru.com |
6 |
* Software rights: http://www.antlr.org/license.html |
7 |
* |
8 |
* $Id: CharScanner.hpp,v 1.2 2005-12-15 14:48:26 gezelter Exp $ |
9 |
*/ |
10 |
|
11 |
#include <antlr/config.hpp> |
12 |
|
13 |
#include <map> |
14 |
|
15 |
#ifdef HAS_NOT_CCTYPE_H |
16 |
#include <ctype.h> |
17 |
#else |
18 |
#include <cctype> |
19 |
#endif |
20 |
|
21 |
#if ( _MSC_VER == 1200 ) |
22 |
// VC6 seems to need this |
23 |
// note that this is not a standard C++ include file. |
24 |
# include <stdio.h> |
25 |
#endif |
26 |
|
27 |
#include <antlr/TokenStream.hpp> |
28 |
#include <antlr/RecognitionException.hpp> |
29 |
#include <antlr/SemanticException.hpp> |
30 |
#include <antlr/MismatchedCharException.hpp> |
31 |
#include <antlr/InputBuffer.hpp> |
32 |
#include <antlr/BitSet.hpp> |
33 |
#include <antlr/LexerSharedInputState.hpp> |
34 |
|
35 |
#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE |
36 |
namespace antlr { |
37 |
#endif |
38 |
|
39 |
class ANTLR_API CharScanner; |
40 |
|
41 |
ANTLR_C_USING(tolower) |
42 |
|
43 |
#if !defined(HAVE_STRCASECMP) && defined(HAVE_STRICMP) && !defined(stricmp) |
44 |
#define strcasecmp stricmp |
45 |
#endif |
46 |
#if !defined(HAVE_STRNCASECMP) && defined(HAVE_STRNICMP) && !defined(strnicmp) |
47 |
#define strncasecmp strnicmp |
48 |
#endif |
49 |
|
50 |
|
51 |
#if !defined(HAVE_STRCASECMP) && !defined(HAVE_STRICMP) |
52 |
/** Portable fallback for strcasecmp(), compiled only when the platform
 * provides neither strcasecmp nor stricmp.
 *
 * Compares two NUL-terminated strings after lower-casing each character
 * with tolower().
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return -1 if s1 sorts before s2, 1 if it sorts after, 0 if they are
 *         equal ignoring case
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	while (true)
	{
		// tolower() is only defined for EOF and values representable as
		// unsigned char; plain char may be signed, so convert first. The
		// results are compared as int so bytes >= 0x80 order the same way
		// on every platform.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));
		if (c1 < c2) return -1;
		if (c1 > c2) return 1;
		if (c1 == 0) return 0;	// both strings ended at the same position
	}
}
63 |
#endif |
64 |
|
65 |
/** Functor for the literals map |
66 |
*/ |
67 |
class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> { |
68 |
private: |
69 |
const CharScanner* scanner; |
70 |
public: |
71 |
#ifdef NO_TEMPLATE_PARTS |
72 |
CharScannerLiteralsLess() {} // not really used, definition to appease MSVC |
73 |
#endif |
74 |
CharScannerLiteralsLess(const CharScanner* theScanner) |
75 |
: scanner(theScanner) |
76 |
{ |
77 |
} |
78 |
bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const; |
79 |
// defaults are good enough.. |
80 |
// CharScannerLiteralsLess(const CharScannerLiteralsLess&); |
81 |
// CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&); |
82 |
}; |
83 |
|
84 |
/** Superclass of generated lexers |
85 |
*/ |
86 |
class ANTLR_API CharScanner : public TokenStream { |
87 |
protected: |
88 |
typedef RefToken (*factory_type)(); |
89 |
public: |
90 |
CharScanner(InputBuffer& cb, bool case_sensitive ); |
91 |
CharScanner(InputBuffer* cb, bool case_sensitive ); |
92 |
CharScanner(const LexerSharedInputState& state, bool case_sensitive ); |
93 |
|
94 |
virtual ~CharScanner() |
95 |
{ |
96 |
} |
97 |
|
98 |
virtual int LA(unsigned int i); |
99 |
|
100 |
virtual void append(char c) |
101 |
{ |
102 |
if (saveConsumedInput) |
103 |
{ |
104 |
size_t l = text.length(); |
105 |
|
106 |
if ((l%256) == 0) |
107 |
text.reserve(l+256); |
108 |
|
109 |
text.replace(l,0,&c,1); |
110 |
} |
111 |
} |
112 |
|
113 |
virtual void append(const ANTLR_USE_NAMESPACE(std)string& s) |
114 |
{ |
115 |
if( saveConsumedInput ) |
116 |
text += s; |
117 |
} |
118 |
|
119 |
virtual void commit() |
120 |
{ |
121 |
inputState->getInput().commit(); |
122 |
} |
123 |
|
124 |
virtual void consume() |
125 |
{ |
126 |
if (inputState->guessing == 0) |
127 |
{ |
128 |
int c = LA(1); |
129 |
if (caseSensitive) |
130 |
{ |
131 |
append(c); |
132 |
} |
133 |
else |
134 |
{ |
135 |
// use input.LA(), not LA(), to get original case |
136 |
// CharScanner.LA() would toLower it. |
137 |
append(inputState->getInput().LA(1)); |
138 |
} |
139 |
|
140 |
// RK: in a sense I don't like this automatic handling. |
141 |
if (c == '\t') |
142 |
tab(); |
143 |
else |
144 |
inputState->column++; |
145 |
} |
146 |
inputState->getInput().consume(); |
147 |
} |
148 |
|
149 |
/** Consume chars until one matches the given char */ |
150 |
virtual void consumeUntil(int c) |
151 |
{ |
152 |
for(;;) |
153 |
{ |
154 |
int la_1 = LA(1); |
155 |
if( la_1 == EOF_CHAR || la_1 == c ) |
156 |
break; |
157 |
consume(); |
158 |
} |
159 |
} |
160 |
|
161 |
/** Consume chars until one matches the given set */ |
162 |
virtual void consumeUntil(const BitSet& set) |
163 |
{ |
164 |
for(;;) |
165 |
{ |
166 |
int la_1 = LA(1); |
167 |
if( la_1 == EOF_CHAR || set.member(la_1) ) |
168 |
break; |
169 |
consume(); |
170 |
} |
171 |
} |
172 |
|
173 |
/// Mark the current position and return a id for it |
174 |
virtual unsigned int mark() |
175 |
{ |
176 |
return inputState->getInput().mark(); |
177 |
} |
178 |
/// Rewind the scanner to a previously marked position |
179 |
virtual void rewind(unsigned int pos) |
180 |
{ |
181 |
inputState->getInput().rewind(pos); |
182 |
} |
183 |
|
184 |
/// See if input contains character 'c' throw MismatchedCharException if not |
185 |
virtual void match(int c) |
186 |
{ |
187 |
int la_1 = LA(1); |
188 |
if ( la_1 != c ) |
189 |
throw MismatchedCharException(la_1, c, false, this); |
190 |
consume(); |
191 |
} |
192 |
|
193 |
/** See if input contains element from bitset b |
194 |
* throw MismatchedCharException if not |
195 |
*/ |
196 |
virtual void match(const BitSet& b) |
197 |
{ |
198 |
int la_1 = LA(1); |
199 |
|
200 |
if ( !b.member(la_1) ) |
201 |
throw MismatchedCharException( la_1, b, false, this ); |
202 |
consume(); |
203 |
} |
204 |
|
205 |
/** See if input contains string 's' throw MismatchedCharException if not |
206 |
* @note the string cannot match EOF |
207 |
*/ |
208 |
virtual void match( const char* s ) |
209 |
{ |
210 |
while( *s != '\0' ) |
211 |
{ |
212 |
// the & 0xFF is here to prevent sign extension lateron |
213 |
int la_1 = LA(1), c = (*s++ & 0xFF); |
214 |
|
215 |
if ( la_1 != c ) |
216 |
throw MismatchedCharException(la_1, c, false, this); |
217 |
|
218 |
consume(); |
219 |
} |
220 |
} |
221 |
/** See if input contains string 's' throw MismatchedCharException if not |
222 |
* @note the string cannot match EOF |
223 |
*/ |
224 |
virtual void match(const ANTLR_USE_NAMESPACE(std)string& s) |
225 |
{ |
226 |
size_t len = s.length(); |
227 |
|
228 |
for (size_t i = 0; i < len; i++) |
229 |
{ |
230 |
// the & 0xFF is here to prevent sign extension lateron |
231 |
int la_1 = LA(1), c = (s[i] & 0xFF); |
232 |
|
233 |
if ( la_1 != c ) |
234 |
throw MismatchedCharException(la_1, c, false, this); |
235 |
|
236 |
consume(); |
237 |
} |
238 |
} |
239 |
/** See if input does not contain character 'c' |
240 |
* throw MismatchedCharException if not |
241 |
*/ |
242 |
virtual void matchNot(int c) |
243 |
{ |
244 |
int la_1 = LA(1); |
245 |
|
246 |
if ( la_1 == c ) |
247 |
throw MismatchedCharException(la_1, c, true, this); |
248 |
|
249 |
consume(); |
250 |
} |
251 |
/** See if input contains character in range c1-c2 |
252 |
* throw MismatchedCharException if not |
253 |
*/ |
254 |
virtual void matchRange(int c1, int c2) |
255 |
{ |
256 |
int la_1 = LA(1); |
257 |
|
258 |
if ( la_1 < c1 || la_1 > c2 ) |
259 |
throw MismatchedCharException(la_1, c1, c2, false, this); |
260 |
|
261 |
consume(); |
262 |
} |
263 |
|
264 |
virtual bool getCaseSensitive() const |
265 |
{ |
266 |
return caseSensitive; |
267 |
} |
268 |
|
269 |
virtual void setCaseSensitive(bool t) |
270 |
{ |
271 |
caseSensitive = t; |
272 |
} |
273 |
|
274 |
virtual bool getCaseSensitiveLiterals() const=0; |
275 |
|
276 |
/// Get the line the scanner currently is in (starts at 1) |
277 |
virtual int getLine() const |
278 |
{ |
279 |
return inputState->line; |
280 |
} |
281 |
|
282 |
/// set the line number |
283 |
virtual void setLine(int l) |
284 |
{ |
285 |
inputState->line = l; |
286 |
} |
287 |
|
288 |
/// Get the column the scanner currently is in (starts at 1) |
289 |
virtual int getColumn() const |
290 |
{ |
291 |
return inputState->column; |
292 |
} |
293 |
/// set the column number |
294 |
virtual void setColumn(int c) |
295 |
{ |
296 |
inputState->column = c; |
297 |
} |
298 |
|
299 |
/// get the filename for the file currently used |
300 |
virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const |
301 |
{ |
302 |
return inputState->filename; |
303 |
} |
304 |
/// Set the filename the scanner is using (used in error messages) |
305 |
virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f) |
306 |
{ |
307 |
inputState->filename = f; |
308 |
} |
309 |
|
310 |
virtual bool getCommitToPath() const |
311 |
{ |
312 |
return commitToPath; |
313 |
} |
314 |
|
315 |
virtual void setCommitToPath(bool commit) |
316 |
{ |
317 |
commitToPath = commit; |
318 |
} |
319 |
|
320 |
/** return a copy of the current text buffer */ |
321 |
virtual const ANTLR_USE_NAMESPACE(std)string& getText() const |
322 |
{ |
323 |
return text; |
324 |
} |
325 |
|
326 |
virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s) |
327 |
{ |
328 |
text = s; |
329 |
} |
330 |
|
331 |
virtual void resetText() |
332 |
{ |
333 |
text = ""; |
334 |
inputState->tokenStartColumn = inputState->column; |
335 |
inputState->tokenStartLine = inputState->line; |
336 |
} |
337 |
|
338 |
virtual RefToken getTokenObject() const |
339 |
{ |
340 |
return _returnToken; |
341 |
} |
342 |
|
343 |
/** Used to keep track of line breaks, needs to be called from |
344 |
* within generated lexers when a \n \r is encountered. |
345 |
*/ |
346 |
virtual void newline() |
347 |
{ |
348 |
++inputState->line; |
349 |
inputState->column = 1; |
350 |
} |
351 |
|
352 |
/** Advance the current column number by an appropriate amount according |
353 |
* to the tabsize. This method needs to be explicitly called from the |
354 |
* lexer rules encountering tabs. |
355 |
*/ |
356 |
virtual void tab() |
357 |
{ |
358 |
int c = getColumn(); |
359 |
int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop |
360 |
setColumn( nc ); |
361 |
} |
362 |
/// set the tabsize. Returns the old tabsize |
363 |
int setTabsize( int size ) |
364 |
{ |
365 |
int oldsize = tabsize; |
366 |
tabsize = size; |
367 |
return oldsize; |
368 |
} |
369 |
/// Return the tabsize used by the scanner |
370 |
int getTabSize() const |
371 |
{ |
372 |
return tabsize; |
373 |
} |
374 |
|
375 |
/** Report exception errors caught in nextToken() */ |
376 |
virtual void reportError(const RecognitionException& e); |
377 |
|
378 |
/** Parser error-reporting function can be overridden in subclass */ |
379 |
virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s); |
380 |
|
381 |
/** Parser warning-reporting function can be overridden in subclass */ |
382 |
virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s); |
383 |
|
384 |
virtual InputBuffer& getInputBuffer() |
385 |
{ |
386 |
return inputState->getInput(); |
387 |
} |
388 |
|
389 |
virtual LexerSharedInputState getInputState() |
390 |
{ |
391 |
return inputState; |
392 |
} |
393 |
|
394 |
/** set the input state for the lexer. |
395 |
* @note state is a reference counted object, hence no reference */ |
396 |
virtual void setInputState(LexerSharedInputState state) |
397 |
{ |
398 |
inputState = state; |
399 |
} |
400 |
|
401 |
/// Set the factory for created tokens |
402 |
virtual void setTokenObjectFactory(factory_type factory) |
403 |
{ |
404 |
tokenFactory = factory; |
405 |
} |
406 |
|
407 |
/** Test the token text against the literals table |
408 |
* Override this method to perform a different literals test |
409 |
*/ |
410 |
virtual int testLiteralsTable(int ttype) const |
411 |
{ |
412 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text); |
413 |
if (i != literals.end()) |
414 |
ttype = (*i).second; |
415 |
return ttype; |
416 |
} |
417 |
|
418 |
/** Test the text passed in against the literals table |
419 |
* Override this method to perform a different literals test |
420 |
* This is used primarily when you want to test a portion of |
421 |
* a token |
422 |
*/ |
423 |
virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const |
424 |
{ |
425 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt); |
426 |
if (i != literals.end()) |
427 |
ttype = (*i).second; |
428 |
return ttype; |
429 |
} |
430 |
|
431 |
/// Override this method to get more specific case handling |
432 |
virtual int toLower(int c) const |
433 |
{ |
434 |
// test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?) |
435 |
// also VC++ 6.0 does this. (see fix 422 (is reverted by this fix) |
436 |
// this one is more structural. Maybe make this configurable. |
437 |
return (c == EOF_CHAR ? EOF_CHAR : tolower(c)); |
438 |
} |
439 |
|
440 |
/** This method is called by YourLexer::nextToken() when the lexer has |
441 |
* hit EOF condition. EOF is NOT a character. |
442 |
* This method is not called if EOF is reached during |
443 |
* syntactic predicate evaluation or during evaluation |
444 |
* of normal lexical rules, which presumably would be |
445 |
* an IOException. This traps the "normal" EOF condition. |
446 |
* |
447 |
* uponEOF() is called after the complete evaluation of |
448 |
* the previous token and only if your parser asks |
449 |
* for another token beyond that last non-EOF token. |
450 |
* |
451 |
* You might want to throw token or char stream exceptions |
452 |
* like: "Heh, premature eof" or a retry stream exception |
453 |
* ("I found the end of this file, go back to referencing file"). |
454 |
*/ |
455 |
virtual void uponEOF() |
456 |
{ |
457 |
} |
458 |
|
459 |
/// Methods used to change tracing behavior |
460 |
virtual void traceIndent(); |
461 |
virtual void traceIn(const char* rname); |
462 |
virtual void traceOut(const char* rname); |
463 |
|
464 |
#ifndef NO_STATIC_CONSTS |
465 |
static const int EOF_CHAR = EOF; |
466 |
#else |
467 |
enum { |
468 |
EOF_CHAR = EOF |
469 |
}; |
470 |
#endif |
471 |
protected: |
472 |
ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token |
473 |
/// flag indicating wether consume saves characters |
474 |
bool saveConsumedInput; |
475 |
factory_type tokenFactory; ///< Factory for tokens |
476 |
bool caseSensitive; ///< Is this lexer case sensitive |
477 |
ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass |
478 |
|
479 |
RefToken _returnToken; ///< used to return tokens w/o using return val |
480 |
|
481 |
/// Input state, gives access to input stream, shared among different lexers |
482 |
LexerSharedInputState inputState; |
483 |
|
484 |
/** Used during filter mode to indicate that path is desired. |
485 |
* A subsequent scan error will report an error as usual |
486 |
* if acceptPath=true; |
487 |
*/ |
488 |
bool commitToPath; |
489 |
|
490 |
int tabsize; ///< tab size the scanner uses. |
491 |
|
492 |
/// Create a new RefToken of type t |
493 |
virtual RefToken makeToken(int t) |
494 |
{ |
495 |
RefToken tok = tokenFactory(); |
496 |
tok->setType(t); |
497 |
tok->setColumn(inputState->tokenStartColumn); |
498 |
tok->setLine(inputState->tokenStartLine); |
499 |
return tok; |
500 |
} |
501 |
|
502 |
/** Tracer class, used when -traceLexer is passed to antlr |
503 |
*/ |
504 |
class Tracer { |
505 |
private: |
506 |
CharScanner* parser; |
507 |
const char* text; |
508 |
|
509 |
Tracer(const Tracer& other); // undefined |
510 |
Tracer& operator=(const Tracer& other); // undefined |
511 |
public: |
512 |
Tracer( CharScanner* p,const char* t ) |
513 |
: parser(p), text(t) |
514 |
{ |
515 |
parser->traceIn(text); |
516 |
} |
517 |
~Tracer() |
518 |
{ |
519 |
parser->traceOut(text); |
520 |
} |
521 |
}; |
522 |
|
523 |
int traceDepth; |
524 |
private: |
525 |
CharScanner( const CharScanner& other ); // undefined |
526 |
CharScanner& operator=( const CharScanner& other ); // undefined |
527 |
|
528 |
#ifndef NO_STATIC_CONSTS |
529 |
static const int NO_CHAR = 0; |
530 |
#else |
531 |
enum { |
532 |
NO_CHAR = 0 |
533 |
}; |
534 |
#endif |
535 |
}; |
536 |
|
537 |
inline int CharScanner::LA(unsigned int i) |
538 |
{ |
539 |
int c = inputState->getInput().LA(i); |
540 |
|
541 |
if ( caseSensitive ) |
542 |
return c; |
543 |
else |
544 |
return toLower(c); // VC 6 tolower bug caught in toLower. |
545 |
} |
546 |
|
547 |
inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const |
548 |
{ |
549 |
if (scanner->getCaseSensitiveLiterals()) |
550 |
return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y); |
551 |
else |
552 |
{ |
553 |
#ifdef NO_STRCASECMP |
554 |
return (stricmp(x.c_str(),y.c_str())<0); |
555 |
#else |
556 |
return (strcasecmp(x.c_str(),y.c_str())<0); |
557 |
#endif |
558 |
} |
559 |
} |
560 |
|
561 |
#ifdef ANTLR_CXX_SUPPORTS_NAMESPACE |
562 |
} |
563 |
#endif |
564 |
|
565 |
#endif //INC_CharScanner_hpp__ |