ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/group/trunk/OOPSE-2.0/src/antlr/CharScanner.hpp
Revision: 2469
Committed: Fri Dec 2 15:38:03 2005 UTC (18 years, 7 months ago) by tim
File size: 13486 byte(s)
Log Message:
End of the Link --> List
Return of the Object-Oriented
replace yacc/lex parser with antlr parser

File Contents

# Content
1 #ifndef INC_CharScanner_hpp__
2 #define INC_CharScanner_hpp__
3
4 /* ANTLR Translator Generator
5 * Project led by Terence Parr at http://www.jGuru.com
6 * Software rights: http://www.antlr.org/license.html
7 *
8 * $Id: CharScanner.hpp,v 1.1 2005-12-02 15:38:02 tim Exp $
9 */
10
11 #include <antlr/config.hpp>
12
13 #include <map>
14
15 #ifdef HAS_NOT_CCTYPE_H
16 #include <ctype.h>
17 #else
18 #include <cctype>
19 #endif
20
21 #if ( _MSC_VER == 1200 )
22 // VC6 seems to need this
23 // note that this is not a standard C++ include file.
24 # include <stdio.h>
25 #endif
26
27 #include <antlr/TokenStream.hpp>
28 #include <antlr/RecognitionException.hpp>
29 #include <antlr/SemanticException.hpp>
30 #include <antlr/MismatchedCharException.hpp>
31 #include <antlr/InputBuffer.hpp>
32 #include <antlr/BitSet.hpp>
33 #include <antlr/LexerSharedInputState.hpp>
34
35 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
36 namespace antlr {
37 #endif
38
39 class ANTLR_API CharScanner;
40
41 ANTLR_C_USING(tolower)
42
43 #ifdef ANTLR_REALLY_NO_STRCASECMP
// Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior
// on the mac has neither...

/** Fallback case-insensitive comparison of two NUL-terminated strings.
 *
 * Both strings are compared character by character after passing each
 * character through tolower().
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return -1 if s1 orders before s2, 1 if it orders after s2 and 0 if both
 *         strings are equal when case is ignored
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	while (true)
	{
		// Convert through unsigned char: handing a negative plain char to
		// tolower() is undefined, and comparing the results as int orders
		// characters >= 0x80 after the ASCII range whatever the signedness
		// of char is.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));
		if (c1 < c2) return -1;
		if (c1 > c2) return 1;
		// c1 == c2 here, so a terminator means both strings ended together
		if (c1 == 0) return 0;
	}
}
57 #else
58 #ifdef NO_STRCASECMP
59 ANTLR_C_USING(stricmp)
60 #else
61 ANTLR_C_USING(strcasecmp)
62 #endif
63 #endif
64
65 /** Functor for the literals map
66 */
67 class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> {
68 private:
69 const CharScanner* scanner;
70 public:
71 #ifdef NO_TEMPLATE_PARTS
72 CharScannerLiteralsLess() {} // not really used, definition to appease MSVC
73 #endif
74 CharScannerLiteralsLess(const CharScanner* theScanner)
75 : scanner(theScanner)
76 {
77 }
78 bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const;
79 // defaults are good enough..
80 // CharScannerLiteralsLess(const CharScannerLiteralsLess&);
81 // CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&);
82 };
83
84 /** Superclass of generated lexers
85 */
86 class ANTLR_API CharScanner : public TokenStream {
87 protected:
88 typedef RefToken (*factory_type)();
89 public:
90 CharScanner(InputBuffer& cb, bool case_sensitive );
91 CharScanner(InputBuffer* cb, bool case_sensitive );
92 CharScanner(const LexerSharedInputState& state, bool case_sensitive );
93
94 virtual ~CharScanner()
95 {
96 }
97
98 virtual int LA(unsigned int i);
99
100 virtual void append(char c)
101 {
102 if (saveConsumedInput)
103 {
104 size_t l = text.length();
105
106 if ((l%256) == 0)
107 text.reserve(l+256);
108
109 text.replace(l,0,&c,1);
110 }
111 }
112
113 virtual void append(const ANTLR_USE_NAMESPACE(std)string& s)
114 {
115 if( saveConsumedInput )
116 text += s;
117 }
118
119 virtual void commit()
120 {
121 inputState->getInput().commit();
122 }
123
124 virtual void consume()
125 {
126 if (inputState->guessing == 0)
127 {
128 int c = LA(1);
129 if (caseSensitive)
130 {
131 append(c);
132 }
133 else
134 {
135 // use input.LA(), not LA(), to get original case
136 // CharScanner.LA() would toLower it.
137 append(inputState->getInput().LA(1));
138 }
139
140 // RK: in a sense I don't like this automatic handling.
141 if (c == '\t')
142 tab();
143 else
144 inputState->column++;
145 }
146 inputState->getInput().consume();
147 }
148
149 /** Consume chars until one matches the given char */
150 virtual void consumeUntil(int c)
151 {
152 for(;;)
153 {
154 int la_1 = LA(1);
155 if( la_1 == EOF_CHAR || la_1 == c )
156 break;
157 consume();
158 }
159 }
160
161 /** Consume chars until one matches the given set */
162 virtual void consumeUntil(const BitSet& set)
163 {
164 for(;;)
165 {
166 int la_1 = LA(1);
167 if( la_1 == EOF_CHAR || set.member(la_1) )
168 break;
169 consume();
170 }
171 }
172
173 /// Mark the current position and return a id for it
174 virtual unsigned int mark()
175 {
176 return inputState->getInput().mark();
177 }
178 /// Rewind the scanner to a previously marked position
179 virtual void rewind(unsigned int pos)
180 {
181 inputState->getInput().rewind(pos);
182 }
183
184 /// See if input contains character 'c' throw MismatchedCharException if not
185 virtual void match(int c)
186 {
187 int la_1 = LA(1);
188 if ( la_1 != c )
189 throw MismatchedCharException(la_1, c, false, this);
190 consume();
191 }
192
193 /** See if input contains element from bitset b
194 * throw MismatchedCharException if not
195 */
196 virtual void match(const BitSet& b)
197 {
198 int la_1 = LA(1);
199
200 if ( !b.member(la_1) )
201 throw MismatchedCharException( la_1, b, false, this );
202 consume();
203 }
204
205 /** See if input contains string 's' throw MismatchedCharException if not
206 * @note the string cannot match EOF
207 */
208 virtual void match( const char* s )
209 {
210 while( *s != '\0' )
211 {
212 // the & 0xFF is here to prevent sign extension lateron
213 int la_1 = LA(1), c = (*s++ & 0xFF);
214
215 if ( la_1 != c )
216 throw MismatchedCharException(la_1, c, false, this);
217
218 consume();
219 }
220 }
221 /** See if input contains string 's' throw MismatchedCharException if not
222 * @note the string cannot match EOF
223 */
224 virtual void match(const ANTLR_USE_NAMESPACE(std)string& s)
225 {
226 size_t len = s.length();
227
228 for (size_t i = 0; i < len; i++)
229 {
230 // the & 0xFF is here to prevent sign extension lateron
231 int la_1 = LA(1), c = (s[i] & 0xFF);
232
233 if ( la_1 != c )
234 throw MismatchedCharException(la_1, c, false, this);
235
236 consume();
237 }
238 }
239 /** See if input does not contain character 'c'
240 * throw MismatchedCharException if not
241 */
242 virtual void matchNot(int c)
243 {
244 int la_1 = LA(1);
245
246 if ( la_1 == c )
247 throw MismatchedCharException(la_1, c, true, this);
248
249 consume();
250 }
251 /** See if input contains character in range c1-c2
252 * throw MismatchedCharException if not
253 */
254 virtual void matchRange(int c1, int c2)
255 {
256 int la_1 = LA(1);
257
258 if ( la_1 < c1 || la_1 > c2 )
259 throw MismatchedCharException(la_1, c1, c2, false, this);
260
261 consume();
262 }
263
264 virtual bool getCaseSensitive() const
265 {
266 return caseSensitive;
267 }
268
269 virtual void setCaseSensitive(bool t)
270 {
271 caseSensitive = t;
272 }
273
274 virtual bool getCaseSensitiveLiterals() const=0;
275
276 /// Get the line the scanner currently is in (starts at 1)
277 virtual int getLine() const
278 {
279 return inputState->line;
280 }
281
282 /// set the line number
283 virtual void setLine(int l)
284 {
285 inputState->line = l;
286 }
287
288 /// Get the column the scanner currently is in (starts at 1)
289 virtual int getColumn() const
290 {
291 return inputState->column;
292 }
293 /// set the column number
294 virtual void setColumn(int c)
295 {
296 inputState->column = c;
297 }
298
299 /// get the filename for the file currently used
300 virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const
301 {
302 return inputState->filename;
303 }
304 /// Set the filename the scanner is using (used in error messages)
305 virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f)
306 {
307 inputState->filename = f;
308 }
309
310 virtual bool getCommitToPath() const
311 {
312 return commitToPath;
313 }
314
315 virtual void setCommitToPath(bool commit)
316 {
317 commitToPath = commit;
318 }
319
320 /** return a copy of the current text buffer */
321 virtual const ANTLR_USE_NAMESPACE(std)string& getText() const
322 {
323 return text;
324 }
325
326 virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s)
327 {
328 text = s;
329 }
330
331 virtual void resetText()
332 {
333 text = "";
334 inputState->tokenStartColumn = inputState->column;
335 inputState->tokenStartLine = inputState->line;
336 }
337
338 virtual RefToken getTokenObject() const
339 {
340 return _returnToken;
341 }
342
343 /** Used to keep track of line breaks, needs to be called from
344 * within generated lexers when a \n \r is encountered.
345 */
346 virtual void newline()
347 {
348 ++inputState->line;
349 inputState->column = 1;
350 }
351
352 /** Advance the current column number by an appropriate amount according
353 * to the tabsize. This method needs to be explicitly called from the
354 * lexer rules encountering tabs.
355 */
356 virtual void tab()
357 {
358 int c = getColumn();
359 int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop
360 setColumn( nc );
361 }
362 /// set the tabsize. Returns the old tabsize
363 int setTabsize( int size )
364 {
365 int oldsize = tabsize;
366 tabsize = size;
367 return oldsize;
368 }
369 /// Return the tabsize used by the scanner
370 int getTabSize() const
371 {
372 return tabsize;
373 }
374
375 /** Report exception errors caught in nextToken() */
376 virtual void reportError(const RecognitionException& e);
377
378 /** Parser error-reporting function can be overridden in subclass */
379 virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s);
380
381 /** Parser warning-reporting function can be overridden in subclass */
382 virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s);
383
384 virtual InputBuffer& getInputBuffer()
385 {
386 return inputState->getInput();
387 }
388
389 virtual LexerSharedInputState getInputState()
390 {
391 return inputState;
392 }
393
394 /** set the input state for the lexer.
395 * @note state is a reference counted object, hence no reference */
396 virtual void setInputState(LexerSharedInputState state)
397 {
398 inputState = state;
399 }
400
401 /// Set the factory for created tokens
402 virtual void setTokenObjectFactory(factory_type factory)
403 {
404 tokenFactory = factory;
405 }
406
407 /** Test the token text against the literals table
408 * Override this method to perform a different literals test
409 */
410 virtual int testLiteralsTable(int ttype) const
411 {
412 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text);
413 if (i != literals.end())
414 ttype = (*i).second;
415 return ttype;
416 }
417
418 /** Test the text passed in against the literals table
419 * Override this method to perform a different literals test
420 * This is used primarily when you want to test a portion of
421 * a token
422 */
423 virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const
424 {
425 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt);
426 if (i != literals.end())
427 ttype = (*i).second;
428 return ttype;
429 }
430
431 /// Override this method to get more specific case handling
432 virtual int toLower(int c) const
433 {
434 // test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?)
435 // also VC++ 6.0 does this. (see fix 422 (is reverted by this fix)
436 // this one is more structural. Maybe make this configurable.
437 return (c == EOF_CHAR ? EOF_CHAR : tolower(c));
438 }
439
440 /** This method is called by YourLexer::nextToken() when the lexer has
441 * hit EOF condition. EOF is NOT a character.
442 * This method is not called if EOF is reached during
443 * syntactic predicate evaluation or during evaluation
444 * of normal lexical rules, which presumably would be
445 * an IOException. This traps the "normal" EOF condition.
446 *
447 * uponEOF() is called after the complete evaluation of
448 * the previous token and only if your parser asks
449 * for another token beyond that last non-EOF token.
450 *
451 * You might want to throw token or char stream exceptions
452 * like: "Heh, premature eof" or a retry stream exception
453 * ("I found the end of this file, go back to referencing file").
454 */
455 virtual void uponEOF()
456 {
457 }
458
459 /// Methods used to change tracing behavior
460 virtual void traceIndent();
461 virtual void traceIn(const char* rname);
462 virtual void traceOut(const char* rname);
463
464 #ifndef NO_STATIC_CONSTS
465 static const int EOF_CHAR = EOF;
466 #else
467 enum {
468 EOF_CHAR = EOF
469 };
470 #endif
471 protected:
472 ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token
473 /// flag indicating wether consume saves characters
474 bool saveConsumedInput;
475 factory_type tokenFactory; ///< Factory for tokens
476 bool caseSensitive; ///< Is this lexer case sensitive
477 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass
478
479 RefToken _returnToken; ///< used to return tokens w/o using return val
480
481 /// Input state, gives access to input stream, shared among different lexers
482 LexerSharedInputState inputState;
483
484 /** Used during filter mode to indicate that path is desired.
485 * A subsequent scan error will report an error as usual
486 * if acceptPath=true;
487 */
488 bool commitToPath;
489
490 int tabsize; ///< tab size the scanner uses.
491
492 /// Create a new RefToken of type t
493 virtual RefToken makeToken(int t)
494 {
495 RefToken tok = tokenFactory();
496 tok->setType(t);
497 tok->setColumn(inputState->tokenStartColumn);
498 tok->setLine(inputState->tokenStartLine);
499 return tok;
500 }
501
502 /** Tracer class, used when -traceLexer is passed to antlr
503 */
504 class Tracer {
505 private:
506 CharScanner* parser;
507 const char* text;
508
509 Tracer(const Tracer& other); // undefined
510 Tracer& operator=(const Tracer& other); // undefined
511 public:
512 Tracer( CharScanner* p,const char* t )
513 : parser(p), text(t)
514 {
515 parser->traceIn(text);
516 }
517 ~Tracer()
518 {
519 parser->traceOut(text);
520 }
521 };
522
523 int traceDepth;
524 private:
525 CharScanner( const CharScanner& other ); // undefined
526 CharScanner& operator=( const CharScanner& other ); // undefined
527
528 #ifndef NO_STATIC_CONSTS
529 static const int NO_CHAR = 0;
530 #else
531 enum {
532 NO_CHAR = 0
533 };
534 #endif
535 };
536
537 inline int CharScanner::LA(unsigned int i)
538 {
539 int c = inputState->getInput().LA(i);
540
541 if ( caseSensitive )
542 return c;
543 else
544 return toLower(c); // VC 6 tolower bug caught in toLower.
545 }
546
547 inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const
548 {
549 if (scanner->getCaseSensitiveLiterals())
550 return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y);
551 else
552 {
553 #ifdef NO_STRCASECMP
554 return (stricmp(x.c_str(),y.c_str())<0);
555 #else
556 return (strcasecmp(x.c_str(),y.c_str())<0);
557 #endif
558 }
559 }
560
561 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
562 }
563 #endif
564
565 #endif //INC_CharScanner_hpp__