TransitionLexer.cxx

Go to the documentation of this file.
00001 /**************************************************************************
00002  *
00003  * Copyright (C) 2010, Jonathan S. Shapiro
00004  * Portions Copyright (C) 2008, Johns Hopkins University
00005  * All rights reserved.
00006  *
00007  * Redistribution and use in source and binary forms, with or
00008  * without modification, are permitted provided that the following
00009  * conditions are met:
00010  *
00011  *   - Redistributions of source code must contain the above 
00012  *     copyright notice, this list of conditions, and the following
00013  *     disclaimer. 
00014  *
00015  *   - Redistributions in binary form must reproduce the above
00016  *     copyright notice, this list of conditions, and the following
00017  *     disclaimer in the documentation and/or other materials 
00018  *     provided with the distribution.
00019  *
00020  *   - Neither the names of the copyright holders nor the names of any
00021  *     of any contributors may be used to endorse or promote products
00022  *     derived from this software without specific prior written
00023  *     permission. 
00024  *
00025  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00026  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00027  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00028  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00029  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00030  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00031  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00032  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00033  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00034  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00035  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036  *
00037  **************************************************************************/
00038 
00039 #include <assert.h>
00040 #include <string.h>
00041 #include <string>
00042 
00043 #include <unicode/uchar.h>
00044 
00045 #include <libsherpa/utf8.hxx>
00046 #include <libsherpa/LexLoc.hxx>
00047 
00048 #include "BUILD/TransitionParser.hxx"
00049 
00050 using namespace sherpa;
00051 
00052 #include "TransitionLexer.hxx"
00053 #include "LitValue.hxx"
00054 
00055 extern const char *TransitionTokenName(int lexTokenNumber);
00056 
00057 bool
00058 TransitionLexer::valid_ident_start(ucs4_t ucs4)
00059 {
00060   // '_' is now handled as a special case in the tokenizer, because it
00061   // has behavioral significance for mixfix identifiers
00062   return u_hasBinaryProperty(ucs4,UCHAR_XID_START);
00063 }
00064 
00065 bool
00066 TransitionLexer::valid_ident_continue(ucs4_t ucs4)
00067 {
00068   // '_' is now handled as a special case in the tokenizer, because it
00069   // has behavioral significance for mixfix identifiers
00070   return u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE);
00071 }
00072 
00073 bool
00074 TransitionLexer::valid_ascii_symbol(ucs4_t ucs4)
00075 {
00076   switch (ucs4) {
00077   case '_':
00078     // '_' is now handled as a special case in the tokenizer, because
00079     // it has behavioral significance for mixfix identifiers
00080     return false;
00081 
00082   case '#':                     // thunked hole marker is "#_"
00083     return false;
00084 
00085   case '@':                     // spacer for non-hole quasi-keywords
00086     return false;
00087 
00088   case ':':                     // this just broke too much stuff
00089     return false;
00090 
00091   case '!':
00092   case '$':
00093   case '%':
00094   case '&':
00095   case '*':
00096   case '+':
00097   case '-':
00098   case '/':
00099   case '<':
00100   case '>':
00101   case '=':
00102   case '?':
00103   case '^':
00104   case '|':
00105   case '~':
00106     return true;
00107 
00108   default:
00109     return false;
00110   }
00111 }
00112 
00113 bool
00114 TransitionLexer::valid_operator_start(ucs4_t ucs4)
00115 {
00116 
00117   // Extended characters are only permitted as the first
00118   // identifier character in lisp identifier mode.
00119   return (valid_ascii_symbol(ucs4));
00120 }
00121 
00122 bool
00123 TransitionLexer::valid_operator_continue(ucs4_t ucs4)
00124 {
00125   // For the moment, extended characters are permitted as 
00126   // continue characters.
00127   return (valid_ascii_symbol(ucs4));
00128 }
00129 
00130 bool
00131 TransitionLexer::valid_ifident_start(ucs4_t ucs4)
00132 {
00133   return (isalpha(ucs4) || ucs4 == '_');
00134   //  return (u_hasBinaryProperty(ucs4,UCHAR_XID_START));
00135 }
00136 
00137 bool
00138 TransitionLexer::valid_ifident_continue(ucs4_t ucs4)
00139 {
00140   return (isalpha(ucs4) || isdigit(ucs4) || ucs4 == '_' || ucs4 == '-');
00141   //  return (u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE) ||
00142   //valid_ifident_punct(ucs4));
00143 }
00144 
00145 bool
00146 TransitionLexer::valid_tv_ident_start(ucs4_t ucs4)
00147 {
00148   return (u_hasBinaryProperty(ucs4,UCHAR_XID_START) || 
00149           ucs4 == '_');
00150 }
00151 
00152 bool
00153 TransitionLexer::valid_tv_ident_continue(ucs4_t ucs4)
00154 {
00155   return (u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE) ||
00156           ucs4 == '_');
00157 }
00158 
00159 TransitionLexer::KeyWord::KeyWord(const char *_nm, LangFlags _whichLang, int _tokValue)
00160 {
00161   nm = _nm;
00162   whichLang = _whichLang;
00163   tokValue = _tokValue;
00164 }
00165 
00176 static bool keywords_sorted = false;
00177 
00178 struct TransitionLexer::KeyWord TransitionLexer::keywords[] = {
00179   TransitionLexer::KeyWord( "=",                lf_block,        '=' ),
00180 //  TransitionLexer::KeyWord( "!=",               lf_block,             tk_NOTEQUALS ),
00181   TransitionLexer::KeyWord( "->",               lf_block,        tk_FNARROW ),
00182   TransitionLexer::KeyWord( "==",               lf_block,        tk_EQUALS ),
00183   TransitionLexer::KeyWord( "and",              lf_block,        tk_AND ),
00184   TransitionLexer::KeyWord( "apply",            lf_block,        tk_APPLY ),
00185   TransitionLexer::KeyWord( "array",            lf_block,        tk_ARRAY ),
00186   TransitionLexer::KeyWord( "ArrayRef",         lf_block,        tk_ARRAY_REF ),
00187   TransitionLexer::KeyWord( "as",               lf_block,        tk_AS ),
00188   TransitionLexer::KeyWord( "assert",           lf_block,        tk_ReservedWord ),
00189   TransitionLexer::KeyWord( "begin",            lf_block,        tk_BEGIN ),
00190   TransitionLexer::KeyWord( "bitc",             lf_version,      tk_BITC ),
00191   TransitionLexer::KeyWord( "bitfield",         lf_block,        tk_BITFIELD ),
00192   TransitionLexer::KeyWord( "bitsizeof",        lf_block,        tk_BITSIZEOF ),
00193 //  TransitionLexer::KeyWord( "block",            lf_block,        tk_BLOCK ),
00194   TransitionLexer::KeyWord( "bool",             lf_block,        tk_BOOL ),
00195   TransitionLexer::KeyWord( "boxed",            lf_block,        tk_BOXED ),
00196   TransitionLexer::KeyWord( "break",            lf_block,        tk_ReservedWord ),
00197   TransitionLexer::KeyWord( "ByRef",            lf_block,        tk_BY_REF ),
00198   TransitionLexer::KeyWord( "case",             lf_block,        tk_CASE ),
00199   TransitionLexer::KeyWord( "catch",            lf_block,        tk_CATCH ),
00200   TransitionLexer::KeyWord( "char",             lf_block,        tk_CHAR ),
00201   TransitionLexer::KeyWord( "check",            lf_block,        tk_ReservedWord ),
00202   TransitionLexer::KeyWord( "closed",           lf_block,        tk_CLOSED ),
00203   TransitionLexer::KeyWord( "cond",             lf_block,        tk_COND ),
00204   TransitionLexer::KeyWord( "const",            lf_block,        tk_CONST ),
00205   TransitionLexer::KeyWord( "constrain",        lf_block,        tk_ReservedWord ),
00206   TransitionLexer::KeyWord( "continue",         lf_block,        tk_CONTINUE ),
00207   TransitionLexer::KeyWord( "declare",          lf_block,        tk_DECLARE ),
00208   TransitionLexer::KeyWord( "deep",             lf_block,        tk_ReservedWord ),
00209   TransitionLexer::KeyWord( "def",              lf_block,        tk_DEF ),
00210   TransitionLexer::KeyWord( "deref",            lf_block,        tk_DEREF ),
00211   TransitionLexer::KeyWord( "disable",          lf_block,        tk_ReservedWord ),
00212   TransitionLexer::KeyWord( "do",               lf_block,        tk_DO ),
00213   TransitionLexer::KeyWord( "do*",              lf_block,        tk_ReservedWord ),
00214   TransitionLexer::KeyWord( "double",           lf_block,        tk_DOUBLE ),
00215   TransitionLexer::KeyWord( "dup",              lf_block,        tk_DUP ),
00216   TransitionLexer::KeyWord( "else",             lf_block,        tk_ELSE ),
00217   TransitionLexer::KeyWord( "enable",           lf_block,        tk_ReservedWord ),
00218   TransitionLexer::KeyWord( "exception",        lf_block,        tk_EXCEPTION ),
00219   TransitionLexer::KeyWord( "external",         lf_block,        tk_EXTERNAL ),
00220   TransitionLexer::KeyWord( "false",            lf_block,        tk_FALSE ),
00221   TransitionLexer::KeyWord( "fill",             lf_block,        tk_FILL ),
00222   TransitionLexer::KeyWord( "float",            lf_block,        tk_FLOAT ),
00223   TransitionLexer::KeyWord( "fn",               lf_block,        tk_FN ),
00224   TransitionLexer::KeyWord( "forall",           lf_block,        tk_FORALL ),
00225   TransitionLexer::KeyWord( "from",             lf_block,        tk_FROM ),
00226 //  TransitionLexer::KeyWord( "giving",           lf_block,        tk_GIVING ),
00227   TransitionLexer::KeyWord( "if",               lf_block,        tk_IF ),
00228   TransitionLexer::KeyWord( "import",           lf_block,        tk_IMPORT ),
00229   TransitionLexer::KeyWord( "impure",           lf_block,        tk_IMPURE ),
00230   TransitionLexer::KeyWord( "in",               lf_block,        tk_IN ),
00231   TransitionLexer::KeyWord( "InnerRef",         lf_block,        tk_INNER_REF ),
00232   TransitionLexer::KeyWord( "instance",         lf_block,        tk_INSTANCE ),
00233   TransitionLexer::KeyWord( "int16",            lf_block,        tk_INT16 ),
00234   TransitionLexer::KeyWord( "int32",            lf_block,        tk_INT32 ),
00235   TransitionLexer::KeyWord( "int64",            lf_block,        tk_INT64 ),
00236   TransitionLexer::KeyWord( "int8",             lf_block,        tk_INT8 ),
00237   TransitionLexer::KeyWord( "interface",        lf_block,        tk_INTERFACE ),
00238   TransitionLexer::KeyWord( "is",               lf_block,        tk_IS ),
00239   TransitionLexer::KeyWord( "label",            lf_block,        tk_LABEL ),
00240   TransitionLexer::KeyWord( "lambda",           lf_block,        tk_LAMBDA ),
00241   TransitionLexer::KeyWord( "let",              lf_block,        tk_LET ),
00242   TransitionLexer::KeyWord( "let*",             lf_block,        tk_ReservedWord ),
00243   TransitionLexer::KeyWord( "letrec",           lf_block,        tk_LETREC ),
00244   TransitionLexer::KeyWord( "location",         lf_block,        tk_ReservedWord ),
00245   TransitionLexer::KeyWord( "loop",             lf_block,        tk_LOOP ),
00246   TransitionLexer::KeyWord( "MakeVector",       lf_block,        tk_MAKE_VECTOR ),
00247   TransitionLexer::KeyWord( "member",           lf_block,        tk_MEMBER ),   /* REDUNDANT */
00248   TransitionLexer::KeyWord( "method",           lf_block,        tk_METHOD ),
00249   TransitionLexer::KeyWord( "mixfix",           lf_block,        tk_MIXFIX ),
00250   TransitionLexer::KeyWord( "module",           lf_block,        tk_MODULE ),
00251   TransitionLexer::KeyWord( "mutable",          lf_block,        tk_MUTABLE ),
00252   TransitionLexer::KeyWord( "namespace",        lf_block,        tk_ReservedWord ),
00253   TransitionLexer::KeyWord( "object",           lf_block,        tk_OBJECT ),
00254   TransitionLexer::KeyWord( "opaque",           lf_block,        tk_OPAQUE ),
00255   TransitionLexer::KeyWord( "or",               lf_block,        tk_OR ),
00256   TransitionLexer::KeyWord( "otherwise",        lf_block,        tk_OTHERWISE ),
00257 //   TransitionLexer::KeyWord( "proclaim",         lf_block,        tk_PROCLAIM ),
00258   TransitionLexer::KeyWord( "provide",          lf_block,        tk_PROVIDE ),
00259   TransitionLexer::KeyWord( "provide!",         lf_block,        tk_ReservedWord ),
00260   TransitionLexer::KeyWord( "pure",             lf_block,        tk_PURE ),
00261   TransitionLexer::KeyWord( "quad",             lf_block,        tk_QUAD ),
00262   TransitionLexer::KeyWord( "read-only",        lf_block,        tk_ReservedWord ),
00263   TransitionLexer::KeyWord( "reference",        lf_block,        tk_PTR ),
00264   TransitionLexer::KeyWord( "repr",             lf_block,        tk_REPR ),
00265   TransitionLexer::KeyWord( "require",          lf_block,        tk_ReservedWord ),
00266   TransitionLexer::KeyWord( "reserved",         lf_block,        tk_RESERVED ),
00267   TransitionLexer::KeyWord( "return",           lf_block,        tk_RETURN ),
00268   TransitionLexer::KeyWord( "sensory",          lf_block,        tk_ReservedWord ),
00269   TransitionLexer::KeyWord( "sizeof",           lf_block,        tk_SIZEOF ),
00270   TransitionLexer::KeyWord( "string",           lf_block,        tk_STRING ),
00271   TransitionLexer::KeyWord( "struct",           lf_block,        tk_STRUCT ),
00272   TransitionLexer::KeyWord( "super",            lf_block,        tk_ReservedWord ),
00273   TransitionLexer::KeyWord( "suspend",          lf_block,        tk_SUSPEND ),  
00274   TransitionLexer::KeyWord( "switch",           lf_block,        tk_SWITCH ),
00275   TransitionLexer::KeyWord( "tag",              lf_block,        tk_TAG ),
00276 //  TransitionLexer::KeyWord( "the",              lf_block,        tk_THE ),
00277   TransitionLexer::KeyWord( "then",             lf_block,        tk_THEN ),
00278   TransitionLexer::KeyWord( "throw",            lf_block,        tk_THROW ),
00279   TransitionLexer::KeyWord( "trait",            lf_block,        tk_TRAIT ),
00280   TransitionLexer::KeyWord( "true",             lf_block,        tk_TRUE ),
00281   TransitionLexer::KeyWord( "try",              lf_block,        tk_TRY ),
00282   TransitionLexer::KeyWord( "tycon",            lf_block,        tk_ReservedWord ),
00283   TransitionLexer::KeyWord( "tyfn",             lf_block,        tk_ReservedWord ),
00284   TransitionLexer::KeyWord( "typecase",         lf_block,        tk_TYPECASE ),
00285   TransitionLexer::KeyWord( "uint16",           lf_block,        tk_UINT16 ),
00286   TransitionLexer::KeyWord( "uint32",           lf_block,        tk_UINT32 ),
00287   TransitionLexer::KeyWord( "uint64",           lf_block,        tk_UINT64 ),
00288   TransitionLexer::KeyWord( "uint8",            lf_block,        tk_UINT8 ),
00289   TransitionLexer::KeyWord( "unboxed",          lf_block,        tk_UNBOXED ),
00290   TransitionLexer::KeyWord( "union",            lf_block,        tk_UNION ),
00291   TransitionLexer::KeyWord( "unless",           lf_block,        tk_UNLESS ),
00292   TransitionLexer::KeyWord( "until",            lf_block,        tk_UNTIL ),
00293   TransitionLexer::KeyWord( "using",            lf_block,        tk_ReservedWord ),
00294   TransitionLexer::KeyWord( "value-at",         lf_block,        tk_ReservedWord ),
00295   TransitionLexer::KeyWord( "vector",           lf_block,        tk_VECTOR ),
00296   TransitionLexer::KeyWord( "version",          lf_version,      tk_VERSION ),
00297   TransitionLexer::KeyWord( "when",             lf_block,        tk_WHEN ),
00298   TransitionLexer::KeyWord( "where",            lf_block,        tk_WHERE ),
00299   TransitionLexer::KeyWord( "word",             lf_block,        tk_WORD )
00300 };
00301 
00302 static int
00303 kwstrcmp(const void *vKey, const void *vCandidate)
00304 {
00305   const char *key = ((const TransitionLexer::KeyWord *) vKey)->nm;
00306   const char *candidate = ((const TransitionLexer::KeyWord *) vCandidate)->nm;
00307 
00308   return strcmp(key, candidate);
00309 }
00310 
00311 int
00312 TransitionLexer::kwCheck(const char *s, int identType)
00313 {
00314   if (ifIdentMode) {
00315     if (!valid_ifident_start(*s))
00316       return tk_ReservedWord;
00317 
00318     for (++s; *s; s++) {
00319       if (!valid_ifident_continue(*s))
00320         return tk_ReservedWord;
00321     }
00322 
00323     return tk_BlkIdent;
00324   }
00325 
00326   KeyWord key = KeyWord(s, lf_block, 0);
00327   KeyWord *entry = 
00328     (KeyWord *)bsearch(&key, keywords, // &OK
00329                        sizeof(keywords)/sizeof(keywords[0]), 
00330                        sizeof(keywords[0]), kwstrcmp);
00331 
00332   // If it is in the token table, and it is accepted in the prevailing
00333   // language variant, return the indicated token type. Note a trick
00334   // here that the very first token accepted may be accepted under the
00335   // lf_version sub-language. Once the first token has been accepted,
00336   // we disable that sub-language.
00337   if (entry) {
00338     if (currentLang & entry->whichLang) {
00339       currentLang &= ~LangFlags(lf_version);
00340 
00341       return entry->tokValue;
00342     }
00343   }
00344 
00345   // Otherwise, check for various reserved words:
00346 
00347   // Things starting with "__":
00348   if (s[0] == '_' && s[1] == '_') {
00349     if (!isRuntimeUoc)
00350       return tk_ReservedWord;
00351   }
00352 
00353   // Things starting with "def" are reserved:
00354   if (s[0] == 'd' && s[1] == 'e' && s[2] == 'f')
00355     return tk_ReservedWord;
00356 
00357   else return identType;
00358 }
00359 
00360 void
00361 TransitionLexer::ReportParseError()
00362 {
00363   errStream << lastToken.loc
00364             << ": syntax error (via yyerror)" << std::endl;
00365   num_errors++;
00366 }
00367  
00368 void
00369 TransitionLexer::ReportParseError(const LexLoc& where, std::string msg)
00370 {
00371   errStream << where
00372             << ": "
00373             << msg << std::endl;
00374 
00375   num_errors++;
00376 }
00377 
00378 void
00379 TransitionLexer::ReportParseWarning(const LexLoc& where, std::string msg)
00380 {
00381   errStream << where
00382             << ": "
00383             << msg << std::endl;
00384 }
00385 
00386 TransitionLexer::TransitionLexer(std::ostream& _err, std::istream& _in, 
00387                        const std::string& origin,
00388                        bool commandLineInput)
00389   :here(origin, 1, 0), inStream(_in), errStream(_err)
00390 {
00391   inStream.unsetf(std::ios_base::skipws);
00392 
00393   if (!keywords_sorted) {
00394     qsort(keywords,
00395          sizeof(keywords)/sizeof(keywords[0]), 
00396          sizeof(keywords[0]), kwstrcmp);
00397     keywords_sorted = true;
00398   }
00399 
00400   // Don't accept block syntax keywords until we see the new version syntax,
00401   // which is accepted under lf_version
00402   currentLang = lf_block | lf_version;
00403   num_errors = 0;
00404   isRuntimeUoc = false;
00405   ifIdentMode = false;
00406   isCommandLineInput = commandLineInput;
00407   debug = false;
00408   nModules = 0;
00409 
00410   showNextError = true;
00411 
00412   lastTokType = EOF;
00413   lastToken = LToken(EOF, "end of file");
00414 }
00415 
00416 ucs4_t
00417 TransitionLexer::getChar()
00418 {
00419   char utf[8];
00420   unsigned char c;
00421 
00422   long ucs4 = pushBackStack.pop();
00423 
00424   if (ucs4 != -1) {
00425     utf8_encode(ucs4, utf);
00426     goto checkDigit;
00427   }
00428 
00429   memset(utf, 0, 8);
00430 
00431   utf[0] = inStream.get();
00432   c = utf[0];
00433   if (inStream.eof())
00434     return EOF;
00435 
00436   if (c <= 127)
00437     goto done;
00438 
00439   utf[1] = inStream.get();
00440   if (inStream.eof())
00441     return EOF;
00442 
00443   if (c <= 223)
00444     goto done;
00445 
00446   utf[2] = inStream.get();
00447   if (inStream.eof())
00448     return EOF;
00449 
00450   if (c <= 239)
00451     goto done;
00452 
00453   utf[3] = inStream.get();
00454   if (inStream.eof())
00455     return EOF;
00456 
00457   if (c <= 247)
00458     goto done;
00459  
00460   utf[4] = inStream.get();
00461   if (inStream.eof())
00462     return EOF;
00463 
00464   if (c <= 251)
00465     goto done;
00466 
00467   utf[5] = inStream.get();
00468   if (inStream.eof())
00469     return EOF;
00470 
00471  done:
00472   ucs4 = utf8_decode(utf, 0);
00473  checkDigit:
00474   thisToken += utf;
00475 
00476   return ucs4;
00477 }
00478 
00479 void
00480 TransitionLexer::ungetChar(ucs4_t c)
00481 {
00482   char utf[8];
00483 
00484   // Never bother to push back EOF, since it is reproduced by the
00485   // input.
00486   if (c == EOF)
00487     return;
00488 
00489   pushBackStack.push(c);
00490 
00491   unsigned len = utf8_encode(c, utf);
00492   thisToken.erase( thisToken.length() - len);
00493 }
00494 
00495 void
00496 TransitionLexer::ungetThisToken()
00497 {
00498   ucs4_t ucsToken[8];
00499   const char *s = thisToken.c_str();
00500   const char *snext = s;
00501   size_t i = 0;
00502 
00503   for( ; *snext && i < 8; i++) {
00504     ucsToken[i] = utf8_decode(s, &snext);
00505     s = snext;
00506   }
00507 
00508   // If the following assert fails, we are backing out something
00509   // excessively large...
00510   assert(*snext == 0);
00511 
00512   for (; i > 0; i--) {
00513     ucs4_t c = ucsToken[i-1];
00514     pushBackStack.push(c);
00515   }
00516 
00517   thisToken.erase();
00518 }
00519 
00520 static bool
00521 isWhiteSpace(ucs4_t c)
00522 {
00523   switch (c) {
00524   case ' ':
00525   case '\t':
00526   case '\n':
00527   case '\r':
00528     return true;
00529   default:
00530     return false;
00531   }
00532 }
00533 
00534 static bool
00535 isCharDelimiter(ucs4_t c)
00536 {
00537   if (c == ')') return true;
00538 
00539   return isWhiteSpace(c);
00540 }
00541 
00542 void
00543 TransitionLexer::showToken(std::ostream& errStream, const LToken& tok)
00544 {
00545   const char *tokTypeName = TransitionTokenName(tok.tokType);
00546   const char *prevTokTypeName = TransitionTokenName(tok.prevTokType);
00547 
00548   errStream << tok.tokType << ": " << tok.loc << ' '
00549             << tokTypeName;
00550 
00551   switch(tok.tokType) {
00552   case tk_TypeVar:
00553   case tk_EffectVar:
00554     errStream << " (" << tok.str << ")";
00555     break;
00556   case tk_BlkIdent:
00557   case tk_NegativeInt:
00558   case tk_Nat:
00559   case tk_Float:
00560   case tk_VersionNumber:
00561     errStream << " (" << tok.str << ")";
00562     break;
00563 
00564   case tk_Char:
00565     errStream << " ('" << tok.str << "')";
00566     break;
00567   case tk_String:
00568     errStream << " (\"" << tok.str << "\")";
00569     break;
00570   default:
00571     break;
00572   }
00573 
00574   errStream << " followed " << prevTokTypeName;
00575 
00576   if (tok.flags) {
00577     errStream << " [";
00578 
00579     if (tok.flags & TF_INSERTED)
00580       errStream << "INSERTED";
00581 
00582     if (tok.flags & TF_BY_PARSER)
00583       errStream << " BY PARSER";
00584 
00585     if (tok.flags & TF_AT_FIRST)
00586       errStream << " AT FIRST";
00587 
00588     if (tok.flags & TF_FIRST_ON_LINE) {
00589       if (tok.flags & TF_INSERTED)
00590         errStream << ", ";
00591       errStream << "FIRST-ON-LINE";
00592     }
00593 
00594     if (tok.flags & TF_REPROCESS) {
00595       if (tok.flags & (TF_INSERTED | TF_FIRST_ON_LINE))
00596         errStream << ", ";
00597       errStream << "REPROCESS";
00598     }
00599     errStream << "]";
00600   }
00601 }
00602 
00603 int
00604 TransitionLexer::lex(ParseType *lvalp)
00605 {
00606   LToken tok = getNextToken();
00607   assert (tok.prevTokType == lastTokType);
00608 
00609   if (debug) {
00610     errStream << "TOKEN ";
00611     showToken(errStream, tok);
00612     errStream << std::endl;
00613   }
00614 
00615   lastTokType = tok.tokType;
00616   lastToken = tok;
00617   
00618   lvalp->tok = tok;
00619   here = tok.endLoc;
00620 
00621   return tok.tokType;
00622 }
00623 
00624 void
00625 TransitionLexer::beginBlock(const LToken& tok)
00626 {
00627   bool inserted = (tok.flags & TF_INSERTED);
00628 
00629 #ifdef LAYOUT_BLOCK_DEBUG
00630   errStream << "  LAYOUT: " << here 
00631             << (inserted ? ": inserted " : ": explicit ")
00632             << "beginBlock()" << std::endl;
00633 #endif
00634 
00635   // Until we find the first token and update it, the semicolon
00636   // comment for a new block is the same as it's parent.
00637   unsigned column = layoutStack ? layoutStack->column : 0;
00638 
00639   boost::shared_ptr<LayoutFrame> lf = 
00640     LayoutFrame::make(lastToken.tokType, inserted, column, tok);
00641 
00642   //  expectingLeftBrace = false; // we're about to return it.
00643   //  learnBlockIndent = true;
00644 
00645   lf->next = layoutStack;
00646   layoutStack = lf;
00647 }
00648 
00649 void
00650 TransitionLexer::endBlock(const LToken& tok)
00651 {
00652   bool inserted = (tok.flags & TF_INSERTED);
00653 
00654   if (inserted && ! layoutStack->inserted) {
00655     std::stringstream ss;
00656     ss << "Inserted close brace balances explicit open brace at " 
00657        << layoutStack->tok.loc
00658        << ".";
00659     ReportParseError(tok.loc, ss.str());
00660   }
00661   else if (!inserted && layoutStack->inserted) {
00662     std::stringstream ss;
00663     ss << "Explicit close brace balances inserted open brace at "
00664        << layoutStack->tok.loc
00665        << ".";
00666     ReportParseError(tok.loc, ss.str());
00667   }
00668 
00669   // considerLineStart = false; // we've done it
00670   layoutStack = layoutStack->next;
00671 
00672 #ifdef LAYOUT_BLOCK_DEBUG
00673   errStream << "  LAYOUT: " << here 
00674             << (inserted ? ": inserted " : ": explicit ")
00675             << "endBlock()";
00676   if (layoutStack)
00677     errStream << " leaving indent at " << layoutStack->column;
00678   errStream << std::endl;
00679 #endif
00680 }
00681 
00682 bool
00683 TransitionLexer::closeToOffset(unsigned offset)
00684 {
00685   // Note that in contrast to closeToOpeningToken, this can safely
00686   // (and usefully) be called multiple times.
00687 
00688   // It is possible for us to be called on the first token in the
00689   // file, or on junk tokens after end of module. In both cases we
00690   // won't have a layout stack yet.
00691   if (!layoutStack)
00692     return false;
00693 
00694   if (!layoutStack->inserted)
00695     return false;
00696 
00697   if (offset >= layoutStack->column)
00698     return false;
00699 
00700 #ifdef LAYOUT_BLOCK_DEBUG
00701   unsigned count = 0;
00702 
00703   errStream << "  LAYOUT: " << here 
00704             << ": closeToOffset inserted '}'" << std::endl;
00705 #endif
00706     
00707   return true;
00708 }
00709 
00710 bool
00711 TransitionLexer::conditionallyInsertSemicolon(unsigned offset)
00712 {
00713   // It is possible for us to be called on the first token in the
00714   // file, or on junk tokens after end of module. In both cases we
00715   // won't have a layout stack yet.
00716   if (!layoutStack)
00717     return false;
00718 
00719   assert(layoutStack);
00720   if (here.offset <= layoutStack->column)
00721     return true;
00722   return false;
00723 }
00724 
00725 LexLoc
00726 TransitionLexer::skipWhiteSpaceAndComments()
00727 {
00728   ucs4_t c;
00729 
00730   LexLoc pos = here;
00731 
00732  startOver:
00733   thisToken.erase();
00734 
00735   c = getChar();
00736 
00737   if (c == '/') {
00738     c = getChar();
00739     if (c == '/') {
00740       do {
00741         c = getChar();
00742       } while (c != '\n' && c != '\r');
00743       // Back out the EOL. We'll handle that with white space
00744       // processing below to simplify layout processing.
00745       ungetChar(c);
00746       pos.updateWith(thisToken);
00747       goto startOver;
00748     }
00749     else if (c == '*') {
00750       for (;;) {
00751         c = getChar();
00752         if (c == '*') {
00753           c = getChar();
00754           if (c == '/') {
00755             break;
00756           }
00757           else if (c == EOF) {
00758             pos.updateWith(thisToken);
00759             return pos;
00760           }
00761           else
00762             ungetChar(c);
00763         }
00764         else if (c == '\n' || c == '\r') {
00765           /* If a multi-line comment spans lines, the next token after
00766              the comment is the first token on that line */
00767           atBeginningOfLine = true;
00768         }
00769         else if (c == EOF) {
00770           pos.updateWith(thisToken);
00771           return pos;
00772         }
00773       }
00774 
00775       pos.updateWith(thisToken);
00776       goto startOver;
00777     }
00778     else {
00779       ungetChar(c);
00780       c = '/';
00781     }
00782   }
00783 
00784   if (isWhiteSpace(c)) {
00785     while (isWhiteSpace(c)) {
00786       if (c == '\n' || c == '\r') {
00787         atBeginningOfLine = true;
00788       }
00789 
00790       c = getChar();
00791     }
00792     ungetChar(c);
00793 
00794     pos.updateWith(thisToken);
00795     goto startOver;
00796   }
00797 
00798   ungetChar(c);
00799 
00800   return pos;
00801 }
00802 
00803 void
00804 TransitionLexer::pushTokenBack(const LToken& tok, bool verbose)
00805 {
00806   if (verbose && debug) {
00807     errStream << "PARSER PUSHED BACK TOKEN ";
00808     showToken(errStream, tok);
00809     errStream << std::endl;
00810   }
00811   pushbackTokens.push_back(tok);
00812 
00813   lastTokType = tok.prevTokType;
00814 
00815   if (verbose && debug) {
00816     errStream << "  Pushback Stack Is:\n";
00817 
00818     for (size_t i = pushbackTokens.size(); i > 0; i--) {
00819       LToken theTok = pushbackTokens[i-1];
00820       errStream << "  " << i-1 << " ";
00821       showToken(errStream, theTok);
00822       errStream << std::endl;
00823     }
00824   }
00825 }
00826 
00827 #if 0
00828 #define RETURN_INSERTED(tok) do {  \
00829     LToken _tok = tok;             \
00830     _tok.flags |= TF_INSERTED;     \
00831     return _tok;                   \
00832   } while(false)
00833 #endif
00834 
// Return the next token for the parser after applying the layout
// (indentation) rules below. The raw token comes from the pushback
// stack when one is pending, otherwise from getNextInputToken().
//
// At most one token is produced per call. When a rule decides that a
// '{', '}' or ';' must be inserted, the raw token is pushed back, the
// inserted token (flagged TF_INSERTED) is returned instead, and the raw
// token is reconsidered on a later call.
//
// NOTE(review): the rules read lastTokType but this function never
// assigns it; presumably the caller records it after each returned
// token -- confirm.
00835 LToken
00836 TransitionLexer::getNextToken()
00837 {
        // A pending pushed-back token takes priority over fresh input.
00838   LToken tok = havePushbackToken() ? popToken() : getNextInputToken();
00839 
        // Inserted tokens are zero-width: both ends are the current lexer
        // position `here` (assigned in getNextInputToken()).
00840   LexLoc startLoc = here;
00841   LexLoc endLoc = here;
00842 
00843   // Rule 1: A left curly-brace is automatically inserted after
00844   // certain tokens:
00845   //
00846   // FIX: Once I clean up the surface syntax, this should also be done
00847   // for tk_DO.
        // NOTE(review): tk_DO already appears in the list below, so the
        // FIX note above looks stale.
00848   bool curlyRequired = ((lastTokType == EOF)        // beginning of file
00849                         // These are braced for bindings:
00850                         || (lastTokType == tk_LET)
00851                         || (lastTokType == tk_LETREC)
00852                         || (lastTokType == tk_LOOP)
00853                         || (lastTokType == tk_SWITCH)
00854                         // These are braced for blocks:
00855                         || (lastTokType == tk_IN)
00856                         || (lastTokType == tk_IS)
00857                         || (lastTokType == tk_DO)
00858                         || (lastTokType == tk_TRY)
00859                         || (lastTokType == tk_THEN)
00860                         || (lastTokType == tk_OTHERWISE)
00861                         || (lastTokType == tk_ELSE)
00862                         || false);
00863 
00864 #ifdef LAYOUT_BLOCK_DEBUG
00865   if (lastTokType < 256 && isprint(lastTokType)) {
00866     errStream << "  LAYOUT: " << startLoc 
00867               << ": last token was '" << (char)lastTokType << "'." << std::endl;
00868   }
00869   else {
00870     errStream << "  LAYOUT: " << startLoc 
00871               << ": last token was type " << lastTokType << std::endl;
00872   }
00873 #endif
00874 
        // An explicit '{' in the input satisfies the requirement; only
        // insert one when the programmer did not write it.
00875   if (curlyRequired && (tok.tokType != '{')) {
          // Defer the real token; it is handled again on a later call.
00876     pushTokenBack(tok);
00877 
00878     LToken newTok = LToken('{', startLoc, endLoc, "{");
00879     newTok.flags |= TF_INSERTED;
00880     newTok.prevTokType = tok.prevTokType;
00881 
00882     beginBlock(newTok);
00883     
00884     // No position update for inserted token.
00885     return newTok;
00886   }
00887 
00888   // Rule 2: If the last token was a '{', then the next token is
00889   // processed specially. Either:
00890   //
00891   //   a) The next token is more indented than the prevailing indent
00892   //      level, in which case it establishes the indent level for the
00893   //      newly introduced block, OR
00894   //   b) The next token is NOT more indented, in which case the
00895   //      block is presumed to have been empty and a '}' is immediately
00896   //      inserted. 
00897   if (lastTokType == '{') {
00898     assert(layoutStack);
00899 
00900 #ifdef LAYOUT_BLOCK_DEBUG
00901     errStream << "  LAYOUT: " << startLoc 
00902               << ": Processing post-{..." << std::endl;
00903 #endif
00904 
00905     if ((startLoc.offset > layoutStack->column) ||
00906         // Top-level context needs special handling if the curly brace
00907         // was implicitly inserted:
00908         (layoutStack->inserted && !layoutStack->next)) {
00909       // Valid indent. Establish indent level for new block:
00910       layoutStack->column = startLoc.offset;
00911 
00912 #ifdef LAYOUT_BLOCK_DEBUG
00913       errStream << "  LAYOUT: " << startLoc 
00914                 << ": set indent to "
00915                 << startLoc.offset
00916                 << ", disable check first token." << std::endl;
00917 #endif
00918     }
00919     else if (layoutStack->inserted && layoutStack->next) {
00920 #ifdef LAYOUT_BLOCK_DEBUG
00921       errStream << "  LAYOUT: " << startLoc 
00922                 << " : close empty inserted block." << std::endl;
00923 #endif
00924       // First token after '{' is at the previous (or earlier) indent
00925       // level, and it was an inserted open block. Close it immediately:
00926       pushTokenBack(tok);
00927 
00928       LToken newTok = LToken('}', startLoc, endLoc, "}");
00929       newTok.flags |= TF_INSERTED;
00930       newTok.prevTokType = tok.prevTokType;
00931 
00932       endBlock(newTok);
00933 
00934       return newTok;
00935     }
          // Remaining case: an explicit (non-inserted) '{' whose first
          // token is not more indented. Nothing is done for it here.
00936   }
00937 
00938   if (tok.flags & TF_FIRST_ON_LINE) {
00939     // Rule 3: If this is the first token on a line, then WHILE the
00940     // current indent level is less than the prevailing indent level,
00941     // close inserted blocks.
          //
          // Only one '}' is emitted per call; the "while" comes from the
          // pushed-back token being examined again on the next call.
00942 
00943     // Consider possible right curly insertion and/or semicolon
00944     // insertion.
00945 
00946 #ifdef LAYOUT_BLOCK_DEBUG
00947     errStream << "  LAYOUT: " << startLoc 
00948               << ": processing first token..." << std::endl;
00949 #endif
00950 
00951     // This hack provided especially for printf-like things. Let indent
00952     // rules be violated for long printf strings:
00953     bool autoCloseOK = !(lastTokType == ','
00954                          || lastTokType == '('
00955                          || tok.tokType == ','
00956                          || tok.tokType == ')'
00957                          || false);
00958 
          // closeToOffset() is defined elsewhere; a true result is taken
          // here to mean "a block must be closed at this offset".
00959     if (autoCloseOK && closeToOffset(startLoc.offset)) {
00960       pushTokenBack(tok);
00961 
00962       LToken newTok = LToken('}', startLoc, endLoc, "}");
00963       newTok.flags |= TF_INSERTED|TF_AT_FIRST;
00964       newTok.prevTokType = tok.prevTokType;
00965 
00966       endBlock(newTok);
00967       return newTok;
00968     }
00969   }
00970 
00971   // Rule 4: At end of file, any outstanding inserted blocks are closed.
        // As with rule 3, one '}' is returned per call while the top of the
        // layout stack is an inserted block; otherwise EOF itself is returned.
00972   if (tok.tokType == EOF) {
00973     if (layoutStack && layoutStack->inserted) {
00974 
00975       pushTokenBack(tok);
00976 
00977       LToken newTok = LToken('}', startLoc, endLoc, "}");
00978       newTok.flags |= TF_INSERTED|TF_AT_FIRST;
00979       newTok.prevTokType = tok.prevTokType;
00980 
00981       endBlock(newTok);
00982       return newTok;
00983     }
00984 
00985     return tok;
00986   }
00987 
00988   // Rule 5: If this is the first token on a line, and the indent
00989   // level is the SAME as the prevailing indent level, insert a
00990   // semicolon UNLESS:
00991   //   a) we just saw one, or
00992   //   b) we are closing the block explicitly.
        //
        // The exclusion list below is broader than (a) and (b): it also
        // suppresses the semicolon after an opener or separator
        // ('{' ',' '(' '[' '=' tk_ASSIGN) and before a closer or a
        // continuation keyword (')' ']' tk_THEN, tk_ELSE, tk_CASE, ...).
00993 
00994   if (tok.flags & TF_FIRST_ON_LINE) {
00995     bool wantAutoSemi = !(lastTokType == ';' ||
00996                           lastTokType == '{' ||
00997                           lastTokType == ',' ||
00998                           lastTokType == '(' ||
00999                           lastTokType == '[' ||
01000                           lastTokType == '=' ||
01001                           lastTokType == tk_ASSIGN ||
01002                           tok.tokType == ')' ||
01003                           tok.tokType == ']' ||
01004                           tok.tokType == ';' ||
01005                           tok.tokType == '}' ||
01006                           tok.tokType == tk_THEN ||
01007                           tok.tokType == tk_ELSE ||
01008                           tok.tokType == tk_CASE ||
01009                           tok.tokType == tk_CATCH ||
01010                           tok.tokType == tk_OTHERWISE ||
01011                           tok.tokType == tk_UNTIL ||
01012                           tok.tokType == tk_IN ||
01013                           tok.tokType == tk_IS ||
01014                           tok.tokType == tk_DO ||
01015                           false);
01016 
01017     if (wantAutoSemi) {
            // The indent comparison itself is delegated to
            // conditionallyInsertSemicolon(), which is defined elsewhere.
01018       if (conditionallyInsertSemicolon(startLoc.offset)) {
01019 #ifdef LAYOUT_BLOCK_DEBUG
01020         errStream << "  LAYOUT: " << startLoc 
01021                   << ": semicolon inserted at offset " 
01022                   << startLoc.offset << std::endl;
01023 #endif
01024         pushTokenBack(tok);
01025 
01026         LToken newTok = LToken(';', startLoc, endLoc, ";");
01027         newTok.flags |= TF_INSERTED|TF_AT_FIRST;
01028         newTok.prevTokType = tok.prevTokType;
01029 
01030         return newTok;
01031       }
01032     }
01033   }
01034 
01035   // Open and close layout contexts when we see explicit open/close
01036   // curly braces:
01037   if (tok.tokType == '{')
01038     beginBlock(tok);
01039 
01040   if (tok.tokType == '}') {
01041     if (!(tok.flags & TF_INSERTED)) {
01042       // Following will stop closing blocks when it reaches the most
01043       // recent explicit block, which is what we want.
01044       if (closeToOffset(0)) {
              // An inserted block is still open inside the explicit one:
              // emit its '}' first and retry the explicit '}' afterwards.
01045         pushTokenBack(tok);
01046 
01047         LToken newTok = LToken('}', startLoc, endLoc, "}");
01048         newTok.flags |= TF_INSERTED|TF_AT_FIRST;
01049         newTok.prevTokType = tok.prevTokType;
01050 
01051         endBlock(newTok);
01052         return newTok;
01053       }
01054     }
01055 
01056     endBlock(tok);
01057   }
01058 
        // No rule fired: hand the raw token through unchanged.
01059   return tok;
01060 }
01061 
// Take the most recently pushed-back token off the pushback stack and
// return it. The caller must make sure a pushed-back token exists (this
// is asserted). The token's prevTokType is overwritten with the lexer's
// current lastTokType before it is returned.
01062 LToken
01063 TransitionLexer::popToken()
01064 {
01065   assert(havePushbackToken());
01066   const size_t topIdx = pushbackTokens.size() - 1;
01067   LToken topTok = pushbackTokens[topIdx];   // copy out before removal
01068   topTok.prevTokType = lastTokType;
01069   pushbackTokens.pop_back();
01070   return topTok;
01071 }
01072 
// RETURN_TOKEN(tok): return `tok` from the function expanding the macro,
// after stamping it with lexer state:
//   - prevTokType is set to the current lastTokType;
//   - if atBeginningOfLine is set, the token is flagged TF_FIRST_ON_LINE
//     and atBeginningOfLine is cleared, so only the first token returned
//     after the flag was raised carries the mark.
// The do { ... } while(false) wrapper makes the expansion a single
// statement. Note that the expansion contains a `return`.
01073 #define RETURN_TOKEN(tok) do {               \
01074     LToken _tok = tok;                       \
01075     _tok.prevTokType = lastTokType;          \
01076     if (atBeginningOfLine) {                 \
01077       _tok.flags |= TF_FIRST_ON_LINE;        \
01078       atBeginningOfLine = false;             \
01079     }                                        \
01080     return _tok;                             \
01081   } while(false)
01082 
// Scan one raw token from the input stream; getNextToken() applies the
// layout rules on top of this.
//
// Steps: skip whitespace and comments (the resulting position is stored
// in `here` and used as the token's start), clear thisToken, read one
// character, and dispatch on it. Every return goes through
// RETURN_TOKEN, so the result carries prevTokType and, when applicable,
// TF_FIRST_ON_LINE.
//
// Input that cannot be tokenized is answered with an EOF token whose
// text is "end of file"; only some of those paths also print a
// diagnostic or bump num_errors.
//
// NOTE(review): thisToken is never appended to in this function;
// presumably getChar()/ungetChar() maintain it as characters are
// consumed and returned -- confirm.
01083 LToken
01084 TransitionLexer::getNextInputToken()
01085 {
01086   // For numbers:
01087   int radix = 10;
01088 
01089   here = skipWhiteSpaceAndComments();
01090 
01091   thisToken.erase();
01092 
01093   ucs4_t c = getChar();
01094 
01096   // We've now eaten comments and white space, so whatever
01097   // location this is, this is where the new token starts:
01099   LexLoc startLoc = here;
01100   LexLoc endLoc = here;
01101 
01102   switch (c) {
01103   case ':':
          // One of ':', ':=' (tk_ASSIGN) or '::' (returned as tk_BlkIdent).
01104     {
01105       int tokID = ':';
01106 
01107       ucs4_t c2 = getChar();
01108       if (c2 == '=')
01109         tokID = tk_ASSIGN;
01110       else if (c2 == ':') {
01111         tokID = tk_BlkIdent;
01112       }
01113       else
01114         ungetChar(c2);
01115 
01116       endLoc.updateWith(thisToken);
01117       RETURN_TOKEN(LToken(tokID, startLoc, endLoc, thisToken));
01118     }
01119   case ';':
01120     {
01121       endLoc.updateWith(thisToken);
01122       RETURN_TOKEN(LToken(c, startLoc, endLoc, thisToken));
01123     }
01124 
01125   case '{':
01126     {
01127       endLoc.updateWith(thisToken);
01128       RETURN_TOKEN(LToken(c, startLoc, endLoc, thisToken));
01129     }
01130   case '}':
01131     {
01132       endLoc.updateWith(thisToken);
01133       RETURN_TOKEN(LToken(c, startLoc, endLoc, thisToken));
01134     }
01135 
01136   case '.':                        // Single character tokens
01137   case ',':
01138   case '[':
01139   case ']':
01140   case '(':
01141   case ')':
01142     {
01143       endLoc.updateWith(thisToken);
01144       RETURN_TOKEN(LToken(c, startLoc, endLoc, thisToken));
01145     }
01146 
01147   case '"':                        // String literal
01148     {
            // Scan to the closing quote. The character after a backslash
            // is consumed unexamined so that an escaped quote does not
            // end the literal.
01149       do {
01150         c = getChar();
01151 
01152         if (c == '\\') {
01153           (void) getChar();        // just ignore it -- will validate later
01154         }
01155       } while (c != '"' && c != EOF);
01156 
01157       if (c == EOF) {
01158         errStream << startLoc
01159                   << ": Unterminated string constant. Missing end quote?"
01160                   << std::endl;
01161         num_errors++;
01162         RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01163       }
01164       
            // A nonzero result is treated as the offset of an offending
            // character. The error is counted but a tk_String is still
            // returned below.
01165       unsigned badpos = LitValue::validate_string(thisToken.c_str());
01166 
01167       if (badpos) {
01168         LexLoc badHere = startLoc;
01169         badHere.offset += badpos;
01170         errStream << badHere.asString()
01171                   << ": Illegal (non-printing) character in string '"
01172                   << thisToken << "'" << std::endl;
01173         num_errors++;
01174       }
01175 
            // The token text drops the first and last character of
            // thisToken (presumably the surrounding quotes).
01176       endLoc.updateWith(thisToken);
01177       RETURN_TOKEN(LToken(tk_String, startLoc, endLoc, 
01178                           thisToken.substr(1, thisToken.size()-2)));
01179     }
01180 
01181   case '\'':                        // Type variable or character literal.
01182     {
01183       // This can signal a type variable or a UCS4 codepoint literal,
01184       // depending on whether a close-single-quote is present.
01185       int tokType = tk_TypeVar;
01186 
01187       int c1 = getChar();       // first character after '
01188       int c2 = getChar();       // second character after '
01189 
01190       if (c1 == EOF || c2 == EOF) {
01191         endLoc.updateWith(thisToken);
01192         RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01193       }
01194 
01195       // Check for simple, one-codepoint character:
01196       if (c2 == '\'') {
01197         /* This is of the form '??', where ?? is a single character
01198            that is not "'" or "\": */
01199         switch (c1) {
01200         case '\'':
01201         case '\\':
01202           {
01203             // These are valid_charpunct() but not actually legal in
01204             // non-escaped characters:
01205             ungetChar(c2);
01206             ungetChar(c1);
01207             endLoc.updateWith(thisToken);
01208             RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01209           }
01210         default:
01211           {
01212             if (LitValue::DecodeCharacter(thisToken) >= 0) {
01213               endLoc.updateWith(thisToken);
01214               RETURN_TOKEN(LToken(tk_Char, startLoc, endLoc, thisToken));
01215             }
01216 
01217             ungetChar(c2);
01218             ungetChar(c1);
01219             endLoc.updateWith(thisToken);
01220             RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01221           }          
01222         }
01223       }
01224       else if (c1 == '\\') {
01225         // Escaped character. We have already consumed the next
01226         // character into c2. Scan forward to the matching '\''
01227         // and then test the result by calling DecodeCharacter to
01228         // validate it:
              //
              // The loop stops on EOF without returning, so that the
              // unterminated-constant diagnostic below is actually
              // reached (mirrors the string-literal case above).
01229         do {
01230           c = getChar();
01235         } while (c != '\'' && c != EOF);
01236 
01237         if (c == EOF) {
01238           errStream << startLoc
01239                     << ": Unterminated character constant. Missing end quote?"
01240                     << std::endl;
01241           num_errors++;
01242           RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01243         }
01244 
01245         if (LitValue::DecodeCharacter(thisToken) >= 0) {
01246           endLoc.updateWith(thisToken);
01247           RETURN_TOKEN(LToken(tk_Char, startLoc, endLoc, thisToken));
01248         }
01249 
01250         endLoc.updateWith(thisToken);
01251         RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01252       }
01253 
01254       // Otherwise it is a type variable.
01255 
01256       // It must be a type variable:
01257 
01258       ungetChar(c2);
01259 
            // A '%' right after the quote selects an effect variable; the
            // identifier proper then starts at the following character.
01260       if (c1 == '%') {
01261         tokType = tk_EffectVar;
01262         c1 = getChar();
01263       }
01264 
01265       if (!valid_tv_ident_start(c1)) {
01266         // FIX: this is bad input
01267         ungetChar(c1);
01268         endLoc.updateWith(thisToken);
01269         RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01270       }
01271 
01272       do {
01273         c = getChar();
01274       } while (valid_tv_ident_continue(c));
01275       ungetChar(c);
01276 
01277       endLoc.updateWith(thisToken);
01278       RETURN_TOKEN(LToken(tokType, startLoc, endLoc, thisToken));
01279     }
01280 
01281     // Leading '-' no longer requires special handling now that we are
01282     // getting rid of the S-expression syntax. Unary negation is
01283     // handled in the parser in the new syntax.
01284   case '0':
01285   case '1':
01286   case '2':
01287   case '3':
01288   case '4':
01289   case '5':
01290   case '6':
01291   case '7':
01292   case '8':
01293   case '9':
01294     {
            // A leading '0' may introduce a radix prefix: 0b (2), 0x (16),
            // 0o (8), or a bare octal digit (8). Anything else leaves the
            // radix at 10.
01295       if (c == '0') {
01296         ucs4_t c2 = getChar();
01297         switch(c2) {
01298         case 'b':
01299           radix = 2;
01300           break;
01301         case 'x':
01302           radix = 16;
01303           break;
01304         case 'o':
01305           radix = 8;
01306           break;
01307         default:
01308           ungetChar(c2);
01309           // 0 followed by legal octal digit is a number:
01310           if (LitValue::digitValue(c2, 8) >= 0) {
01311             radix = 8;
01312             break;
01313           }
01314           // c still holds 0, which is okay.
01315         }
01316       }
01317 
01318       do {
01319         c = getChar();
01320       } while (LitValue::digitValue(c, radix) >= 0);
01321 
01322       /* We are either done with the literal, in which case it is an
01323          integer literal, or we are about to see a decimal point, in
01324          which case it is either a floating point literal or a
01325          language version. */
01326       if (c != '.') {
01327         ungetChar(c);
              // NOTE(review): this case is only entered on a digit, so the
              // '-' test presumably never succeeds any more -- confirm
              // before removing it.
01328         int tokType = (thisToken[0] == '-') ? tk_NegativeInt : tk_Nat;
01329 
01330         endLoc.updateWith(thisToken);
01331         RETURN_TOKEN(LToken(tokType, startLoc, endLoc, thisToken));
01332       }
01333 
01334       // Language version number?
01335       if (currentLang & lf_version) {
01336         if (radix != 10) {
01337           ReportParseError(startLoc, "Language version number must be decimal.");
01338           RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01339         }
01340 
01341         /* Looking for a language version number. */
01342         long count = 0;
01343         do {
01344           c = getChar();
01345           count++;
01346         } while (LitValue::digitValue(c, radix) >= 0);
01347         count--;
01348         ungetChar(c);
01349         endLoc.updateWith(thisToken);
01350         RETURN_TOKEN(LToken(tk_VersionNumber, startLoc, endLoc, thisToken));
01351       }
01352 
01353       // It's a floating point literal.
01354       {
01355         if (radix != 10) {
01356           ReportParseError(startLoc, "Floating point literals must be base 10.");
01357           RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01358         }
01359 
              // Fraction digits; count ends up as the number of digits
              // read after the '.'.
01360         long count = 0;
01361         do {
01362           c = getChar();
01363           count++;
01364         } while (LitValue::digitValue(c, radix) >= 0);
01365         count--;
01366         /* FIX: if count is 0, number is malformed */
01367       }
01368 
01369       /* We are either done with this token or we are looking at a 'e'
01370          indicating start of an exponent. */
01371       if (c != 'e') {
01372         ungetChar(c);
01373         endLoc.updateWith(thisToken);
01374         RETURN_TOKEN(LToken(tk_Float, startLoc, endLoc, thisToken));
01375       }
01376 
01377       /* Need to collect the exponent. Revert to radix 10 until
01378          otherwise proven. */
01379       c = getChar();
01380       radix = 10;
01381 
01382       if (c != '-' && LitValue::digitValue(c, radix) < 0) {
01383         // FIX: Malformed token
01384       }
01385 
01386       do {
01387         c = getChar();
01388       } while (LitValue::digitValue(c, 10) >= 0);
01389 
01390       /* Check for radix marker on exponent */
01391       if (c == 'r') {
              // strtol parses from the start of thisToken, i.e. the
              // leading digits of the whole literal, not the exponent.
01392         radix = strtol(thisToken.c_str(), 0, 10);
01393         if (radix < 0) radix = -radix; // leading sign not part of radix
01394 
01395         long count = 0;
01396         do {
01397           c = getChar();
01398           count++;
01399         } while (LitValue::digitValue(c, radix) >= 0);
01400         count--;
01401         /* FIX: if count is 0, number is malformed */
01402       }
01403 
01404       ungetChar(c);
01405       endLoc.updateWith(thisToken);
01406       RETURN_TOKEN(LToken(tk_Float, startLoc, endLoc, thisToken));
01407     }
01408 
01409   case EOF:
01410     {
01411       endLoc.updateWith(thisToken);
01412       RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01413     }
01414 
01415   default:
01416     if (valid_ident_start(c) || valid_operator_start(c) || c == '_') {
01417       goto identifier_or_operator;
01418     }
01419 
01420     // FIX: Malformed token
01421     endLoc.updateWith(thisToken);
01422     RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01423   }
01424 
01425  identifier_or_operator:
        // Consume any run of leading underscores.
01443   while(c == '_')
01444     c = getChar();
01445 
01446   // Match leading parsed hole
        // Accepted forms: "#e_", "#eT_", "#t_", "#k_". Anything else after
        // the '#' is a malformed identifier.
01447   if (c == '#') {
01448     switch(c = getChar()) {     // syntactic category
01449     case 'e':
01450       {
01451         c = getChar();
01452         switch (c) {
01453         case '_':               // no modifier
01454           ungetChar(c);
01455           break;
01456         case 'T':               // Thunk modifier
01457           break;
01458         default:
01459           goto malformed_ident;
01460         }
01461         break;
01462       }
01463     case 't':                   // no legal modifiers for now
01464     case 'k':
01465       break;
01466     default:
01467       goto malformed_ident;
01468     }
01469 
01470     c = getChar();
01471     if (c != '_') {
01472       goto malformed_ident;
01473     }
01474 
01475     c = getChar();
01476   }
01477 
        // Body: alternating identifier chunks and operator chunks, joined
        // by '_', '@' or '#..._' separators.
01478   while (valid_ident_start(c) || valid_operator_start(c) || c == '_') {
01479     // Grab an alpha chunk or a punctuation chunk:
01480     if (valid_ident_start(c)) {
01481       do {
01482         c = getChar();
01483       } while (valid_ident_continue(c));
01484     }
01485     else if (valid_operator_start(c)) {
01486       do {
01487         c = getChar();
01488       } while (valid_operator_continue(c));
01489     }
01490 
01491     // We need special transitional handling to recognize set! for
01492     // use in S-expressions. This violates the longest match rule, but
01493     // note that it does so on an otherwise illegal token.
01494     //
01495     // This will go away soon.
01496     if (thisToken == "set!") {
01497       int tokType = kwCheck(thisToken.c_str(), tk_BlkIdent);
01498       endLoc.updateWith(thisToken);
01499       RETURN_TOKEN(LToken(tokType, startLoc, endLoc, thisToken));
01500     }
01501 
          // Anything other than a separator ends the token.
01502     if ((c != '#') && (c != '_') && (c != '@'))
01503       goto ident_done;
01504 
01505     // Match internal or trailing separator*
01506     if (c == '#') {
01507       switch(c = getChar()) {     // syntactic category
01508       case 'e':
01509         {
01510           c = getChar();
01511           switch (c) {
01512           case '_':               // no modifier
01513             ungetChar(c);
01514             break;
01515           case 'T':               // Thunk modifier
01516             break;
01517           default:
01518             goto malformed_ident;
01519           }
01520           break;
01521         }
01522       case 't':                   // no legal modifiers for now
01523       case 'k':
01524         break;
01525       default:
01526         goto malformed_ident;
01527       }
01528 
01529       c = getChar();
01530       if (c != '_') {
01531         goto malformed_ident;
01532       }
01533     }
01534     c = getChar();
01535   }
01536  ident_done:
01537 
01538   // We have gone one too far.
01539   ungetChar(c);
01540 
01541   // Might have matched trailing '@' in error. If so, back it out:
01542   if (thisToken[thisToken.size()-1] == '@') {
01543     ReportParseError(startLoc, thisToken + 
01544                      " is not a well-formed identifier. Trailing '@' is not valid.");
01545     RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01546   }
01547   else {
          // kwCheck() picks the token type for this text, with
          // tk_BlkIdent passed as the second argument (presumably the
          // fallback for non-keywords -- confirm).
01548     int tokType = kwCheck(thisToken.c_str(), tk_BlkIdent);
01549     endLoc.updateWith(thisToken);
01550     RETURN_TOKEN(LToken(tokType, startLoc, endLoc, thisToken));
01551   }
01552 
01553  malformed_ident:
01554   ReportParseError(startLoc, thisToken + 
01555                    " is not a well-formed identifier.");
01556   RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01557 }

Generated on Thu May 17 23:59:16 2012 for BitC Compiler by  doxygen 1.4.7