ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/group/trunk/OOPSE-4/src/antlr/CharScanner.hpp
Revision: 2469
Committed: Fri Dec 2 15:38:03 2005 UTC (18 years, 7 months ago) by tim
File size: 13486 byte(s)
Log Message:
End of the Link --> List
Return of the Object-Oriented
replace yacc/lex parser with antlr parser

File Contents

# User Rev Content
1 tim 2469 #ifndef INC_CharScanner_hpp__
2     #define INC_CharScanner_hpp__
3    
4     /* ANTLR Translator Generator
5     * Project led by Terence Parr at http://www.jGuru.com
6     * Software rights: http://www.antlr.org/license.html
7     *
8     * $Id: CharScanner.hpp,v 1.1 2005-12-02 15:38:02 tim Exp $
9     */
10    
11     #include <antlr/config.hpp>
12    
13     #include <map>
14    
15     #ifdef HAS_NOT_CCTYPE_H
16     #include <ctype.h>
17     #else
18     #include <cctype>
19     #endif
20    
21     #if ( _MSC_VER == 1200 )
22     // VC6 seems to need this
23     // note that this is not a standard C++ include file.
24     # include <stdio.h>
25     #endif
26    
27     #include <antlr/TokenStream.hpp>
28     #include <antlr/RecognitionException.hpp>
29     #include <antlr/SemanticException.hpp>
30     #include <antlr/MismatchedCharException.hpp>
31     #include <antlr/InputBuffer.hpp>
32     #include <antlr/BitSet.hpp>
33     #include <antlr/LexerSharedInputState.hpp>
34    
35     #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
36     namespace antlr {
37     #endif
38    
39     class ANTLR_API CharScanner;
40    
41     ANTLR_C_USING(tolower)
42    
43     #ifdef ANTLR_REALLY_NO_STRCASECMP
// Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior
// on the mac has neither, so a minimal fallback is supplied here.
/** Case-insensitive comparison of two NUL-terminated strings.
 *
 * Both strings are walked in lock step; each character is converted with
 * tolower() and the first difference decides the result.
 *
 * @param s1 first string (must be NUL-terminated)
 * @param s2 second string (must be NUL-terminated)
 * @return -1 if s1 orders before s2, 1 if it orders after, 0 if both are
 *         equal when case is ignored
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	while (true)
	{
		// Go through unsigned char: handing a negative char to tolower() is
		// undefined, and comparing the results as int makes bytes >= 0x80
		// order after ASCII, as the POSIX function does.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));
		if (c1 < c2) return -1;
		if (c1 > c2) return 1;
		if (c1 == 0) return 0;	// both strings ended together
	}
}
57     #else
58     #ifdef NO_STRCASECMP
59     ANTLR_C_USING(stricmp)
60     #else
61     ANTLR_C_USING(strcasecmp)
62     #endif
63     #endif
64    
65     /** Functor for the literals map
66     */
67     class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> {
68     private:
69     const CharScanner* scanner;
70     public:
71     #ifdef NO_TEMPLATE_PARTS
72     CharScannerLiteralsLess() {} // not really used, definition to appease MSVC
73     #endif
74     CharScannerLiteralsLess(const CharScanner* theScanner)
75     : scanner(theScanner)
76     {
77     }
78     bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const;
79     // defaults are good enough..
80     // CharScannerLiteralsLess(const CharScannerLiteralsLess&);
81     // CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&);
82     };
83    
84     /** Superclass of generated lexers
85     */
86     class ANTLR_API CharScanner : public TokenStream {
87     protected:
88     typedef RefToken (*factory_type)();
89     public:
90     CharScanner(InputBuffer& cb, bool case_sensitive );
91     CharScanner(InputBuffer* cb, bool case_sensitive );
92     CharScanner(const LexerSharedInputState& state, bool case_sensitive );
93    
94     virtual ~CharScanner()
95     {
96     }
97    
98     virtual int LA(unsigned int i);
99    
100     virtual void append(char c)
101     {
102     if (saveConsumedInput)
103     {
104     size_t l = text.length();
105    
106     if ((l%256) == 0)
107     text.reserve(l+256);
108    
109     text.replace(l,0,&c,1);
110     }
111     }
112    
113     virtual void append(const ANTLR_USE_NAMESPACE(std)string& s)
114     {
115     if( saveConsumedInput )
116     text += s;
117     }
118    
119     virtual void commit()
120     {
121     inputState->getInput().commit();
122     }
123    
124     virtual void consume()
125     {
126     if (inputState->guessing == 0)
127     {
128     int c = LA(1);
129     if (caseSensitive)
130     {
131     append(c);
132     }
133     else
134     {
135     // use input.LA(), not LA(), to get original case
136     // CharScanner.LA() would toLower it.
137     append(inputState->getInput().LA(1));
138     }
139    
140     // RK: in a sense I don't like this automatic handling.
141     if (c == '\t')
142     tab();
143     else
144     inputState->column++;
145     }
146     inputState->getInput().consume();
147     }
148    
149     /** Consume chars until one matches the given char */
150     virtual void consumeUntil(int c)
151     {
152     for(;;)
153     {
154     int la_1 = LA(1);
155     if( la_1 == EOF_CHAR || la_1 == c )
156     break;
157     consume();
158     }
159     }
160    
161     /** Consume chars until one matches the given set */
162     virtual void consumeUntil(const BitSet& set)
163     {
164     for(;;)
165     {
166     int la_1 = LA(1);
167     if( la_1 == EOF_CHAR || set.member(la_1) )
168     break;
169     consume();
170     }
171     }
172    
173     /// Mark the current position and return a id for it
174     virtual unsigned int mark()
175     {
176     return inputState->getInput().mark();
177     }
178     /// Rewind the scanner to a previously marked position
179     virtual void rewind(unsigned int pos)
180     {
181     inputState->getInput().rewind(pos);
182     }
183    
184     /// See if input contains character 'c' throw MismatchedCharException if not
185     virtual void match(int c)
186     {
187     int la_1 = LA(1);
188     if ( la_1 != c )
189     throw MismatchedCharException(la_1, c, false, this);
190     consume();
191     }
192    
193     /** See if input contains element from bitset b
194     * throw MismatchedCharException if not
195     */
196     virtual void match(const BitSet& b)
197     {
198     int la_1 = LA(1);
199    
200     if ( !b.member(la_1) )
201     throw MismatchedCharException( la_1, b, false, this );
202     consume();
203     }
204    
205     /** See if input contains string 's' throw MismatchedCharException if not
206     * @note the string cannot match EOF
207     */
208     virtual void match( const char* s )
209     {
210     while( *s != '\0' )
211     {
212     // the & 0xFF is here to prevent sign extension lateron
213     int la_1 = LA(1), c = (*s++ & 0xFF);
214    
215     if ( la_1 != c )
216     throw MismatchedCharException(la_1, c, false, this);
217    
218     consume();
219     }
220     }
221     /** See if input contains string 's' throw MismatchedCharException if not
222     * @note the string cannot match EOF
223     */
224     virtual void match(const ANTLR_USE_NAMESPACE(std)string& s)
225     {
226     size_t len = s.length();
227    
228     for (size_t i = 0; i < len; i++)
229     {
230     // the & 0xFF is here to prevent sign extension lateron
231     int la_1 = LA(1), c = (s[i] & 0xFF);
232    
233     if ( la_1 != c )
234     throw MismatchedCharException(la_1, c, false, this);
235    
236     consume();
237     }
238     }
239     /** See if input does not contain character 'c'
240     * throw MismatchedCharException if not
241     */
242     virtual void matchNot(int c)
243     {
244     int la_1 = LA(1);
245    
246     if ( la_1 == c )
247     throw MismatchedCharException(la_1, c, true, this);
248    
249     consume();
250     }
251     /** See if input contains character in range c1-c2
252     * throw MismatchedCharException if not
253     */
254     virtual void matchRange(int c1, int c2)
255     {
256     int la_1 = LA(1);
257    
258     if ( la_1 < c1 || la_1 > c2 )
259     throw MismatchedCharException(la_1, c1, c2, false, this);
260    
261     consume();
262     }
263    
264     virtual bool getCaseSensitive() const
265     {
266     return caseSensitive;
267     }
268    
269     virtual void setCaseSensitive(bool t)
270     {
271     caseSensitive = t;
272     }
273    
274     virtual bool getCaseSensitiveLiterals() const=0;
275    
276     /// Get the line the scanner currently is in (starts at 1)
277     virtual int getLine() const
278     {
279     return inputState->line;
280     }
281    
282     /// set the line number
283     virtual void setLine(int l)
284     {
285     inputState->line = l;
286     }
287    
288     /// Get the column the scanner currently is in (starts at 1)
289     virtual int getColumn() const
290     {
291     return inputState->column;
292     }
293     /// set the column number
294     virtual void setColumn(int c)
295     {
296     inputState->column = c;
297     }
298    
299     /// get the filename for the file currently used
300     virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const
301     {
302     return inputState->filename;
303     }
304     /// Set the filename the scanner is using (used in error messages)
305     virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f)
306     {
307     inputState->filename = f;
308     }
309    
310     virtual bool getCommitToPath() const
311     {
312     return commitToPath;
313     }
314    
315     virtual void setCommitToPath(bool commit)
316     {
317     commitToPath = commit;
318     }
319    
320     /** return a copy of the current text buffer */
321     virtual const ANTLR_USE_NAMESPACE(std)string& getText() const
322     {
323     return text;
324     }
325    
326     virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s)
327     {
328     text = s;
329     }
330    
331     virtual void resetText()
332     {
333     text = "";
334     inputState->tokenStartColumn = inputState->column;
335     inputState->tokenStartLine = inputState->line;
336     }
337    
338     virtual RefToken getTokenObject() const
339     {
340     return _returnToken;
341     }
342    
343     /** Used to keep track of line breaks, needs to be called from
344     * within generated lexers when a \n \r is encountered.
345     */
346     virtual void newline()
347     {
348     ++inputState->line;
349     inputState->column = 1;
350     }
351    
352     /** Advance the current column number by an appropriate amount according
353     * to the tabsize. This method needs to be explicitly called from the
354     * lexer rules encountering tabs.
355     */
356     virtual void tab()
357     {
358     int c = getColumn();
359     int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop
360     setColumn( nc );
361     }
362     /// set the tabsize. Returns the old tabsize
363     int setTabsize( int size )
364     {
365     int oldsize = tabsize;
366     tabsize = size;
367     return oldsize;
368     }
369     /// Return the tabsize used by the scanner
370     int getTabSize() const
371     {
372     return tabsize;
373     }
374    
375     /** Report exception errors caught in nextToken() */
376     virtual void reportError(const RecognitionException& e);
377    
378     /** Parser error-reporting function can be overridden in subclass */
379     virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s);
380    
381     /** Parser warning-reporting function can be overridden in subclass */
382     virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s);
383    
384     virtual InputBuffer& getInputBuffer()
385     {
386     return inputState->getInput();
387     }
388    
389     virtual LexerSharedInputState getInputState()
390     {
391     return inputState;
392     }
393    
394     /** set the input state for the lexer.
395     * @note state is a reference counted object, hence no reference */
396     virtual void setInputState(LexerSharedInputState state)
397     {
398     inputState = state;
399     }
400    
401     /// Set the factory for created tokens
402     virtual void setTokenObjectFactory(factory_type factory)
403     {
404     tokenFactory = factory;
405     }
406    
407     /** Test the token text against the literals table
408     * Override this method to perform a different literals test
409     */
410     virtual int testLiteralsTable(int ttype) const
411     {
412     ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text);
413     if (i != literals.end())
414     ttype = (*i).second;
415     return ttype;
416     }
417    
418     /** Test the text passed in against the literals table
419     * Override this method to perform a different literals test
420     * This is used primarily when you want to test a portion of
421     * a token
422     */
423     virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const
424     {
425     ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt);
426     if (i != literals.end())
427     ttype = (*i).second;
428     return ttype;
429     }
430    
431     /// Override this method to get more specific case handling
432     virtual int toLower(int c) const
433     {
434     // test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?)
435     // also VC++ 6.0 does this. (see fix 422 (is reverted by this fix)
436     // this one is more structural. Maybe make this configurable.
437     return (c == EOF_CHAR ? EOF_CHAR : tolower(c));
438     }
439    
440     /** This method is called by YourLexer::nextToken() when the lexer has
441     * hit EOF condition. EOF is NOT a character.
442     * This method is not called if EOF is reached during
443     * syntactic predicate evaluation or during evaluation
444     * of normal lexical rules, which presumably would be
445     * an IOException. This traps the "normal" EOF condition.
446     *
447     * uponEOF() is called after the complete evaluation of
448     * the previous token and only if your parser asks
449     * for another token beyond that last non-EOF token.
450     *
451     * You might want to throw token or char stream exceptions
452     * like: "Heh, premature eof" or a retry stream exception
453     * ("I found the end of this file, go back to referencing file").
454     */
455     virtual void uponEOF()
456     {
457     }
458    
459     /// Methods used to change tracing behavior
460     virtual void traceIndent();
461     virtual void traceIn(const char* rname);
462     virtual void traceOut(const char* rname);
463    
464     #ifndef NO_STATIC_CONSTS
465     static const int EOF_CHAR = EOF;
466     #else
467     enum {
468     EOF_CHAR = EOF
469     };
470     #endif
471     protected:
472     ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token
473     /// flag indicating wether consume saves characters
474     bool saveConsumedInput;
475     factory_type tokenFactory; ///< Factory for tokens
476     bool caseSensitive; ///< Is this lexer case sensitive
477     ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass
478    
479     RefToken _returnToken; ///< used to return tokens w/o using return val
480    
481     /// Input state, gives access to input stream, shared among different lexers
482     LexerSharedInputState inputState;
483    
484     /** Used during filter mode to indicate that path is desired.
485     * A subsequent scan error will report an error as usual
486     * if acceptPath=true;
487     */
488     bool commitToPath;
489    
490     int tabsize; ///< tab size the scanner uses.
491    
492     /// Create a new RefToken of type t
493     virtual RefToken makeToken(int t)
494     {
495     RefToken tok = tokenFactory();
496     tok->setType(t);
497     tok->setColumn(inputState->tokenStartColumn);
498     tok->setLine(inputState->tokenStartLine);
499     return tok;
500     }
501    
502     /** Tracer class, used when -traceLexer is passed to antlr
503     */
504     class Tracer {
505     private:
506     CharScanner* parser;
507     const char* text;
508    
509     Tracer(const Tracer& other); // undefined
510     Tracer& operator=(const Tracer& other); // undefined
511     public:
512     Tracer( CharScanner* p,const char* t )
513     : parser(p), text(t)
514     {
515     parser->traceIn(text);
516     }
517     ~Tracer()
518     {
519     parser->traceOut(text);
520     }
521     };
522    
523     int traceDepth;
524     private:
525     CharScanner( const CharScanner& other ); // undefined
526     CharScanner& operator=( const CharScanner& other ); // undefined
527    
528     #ifndef NO_STATIC_CONSTS
529     static const int NO_CHAR = 0;
530     #else
531     enum {
532     NO_CHAR = 0
533     };
534     #endif
535     };
536    
537     inline int CharScanner::LA(unsigned int i)
538     {
539     int c = inputState->getInput().LA(i);
540    
541     if ( caseSensitive )
542     return c;
543     else
544     return toLower(c); // VC 6 tolower bug caught in toLower.
545     }
546    
547     inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const
548     {
549     if (scanner->getCaseSensitiveLiterals())
550     return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y);
551     else
552     {
553     #ifdef NO_STRCASECMP
554     return (stricmp(x.c_str(),y.c_str())<0);
555     #else
556     return (strcasecmp(x.c_str(),y.c_str())<0);
557     #endif
558     }
559     }
560    
561     #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
562     }
563     #endif
564    
565     #endif //INC_CharScanner_hpp__