00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 #include <assert.h>
00040 #include <string.h>
00041 #include <string>
00042
00043 #include <unicode/uchar.h>
00044
00045 #include <libsherpa/utf8.hxx>
00046 #include <libsherpa/LexLoc.hxx>
00047
00048 #include "BUILD/TransitionParser.hxx"
00049
00050 using namespace sherpa;
00051
00052 #include "TransitionLexer.hxx"
00053 #include "LitValue.hxx"
00054
00055 extern const char *TransitionTokenName(int lexTokenNumber);
00056
00057 bool
00058 TransitionLexer::valid_ident_start(ucs4_t ucs4)
00059 {
00060
00061
00062 return u_hasBinaryProperty(ucs4,UCHAR_XID_START);
00063 }
00064
00065 bool
00066 TransitionLexer::valid_ident_continue(ucs4_t ucs4)
00067 {
00068
00069
00070 return u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE);
00071 }
00072
00073 bool
00074 TransitionLexer::valid_ascii_symbol(ucs4_t ucs4)
00075 {
00076 switch (ucs4) {
00077 case '_':
00078
00079
00080 return false;
00081
00082 case '#':
00083 return false;
00084
00085 case '@':
00086 return false;
00087
00088 case ':':
00089 return false;
00090
00091 case '!':
00092 case '$':
00093 case '%':
00094 case '&':
00095 case '*':
00096 case '+':
00097 case '-':
00098 case '/':
00099 case '<':
00100 case '>':
00101 case '=':
00102 case '?':
00103 case '^':
00104 case '|':
00105 case '~':
00106 return true;
00107
00108 default:
00109 return false;
00110 }
00111 }
00112
00113 bool
00114 TransitionLexer::valid_operator_start(ucs4_t ucs4)
00115 {
00116
00117
00118
00119 return (valid_ascii_symbol(ucs4));
00120 }
00121
00122 bool
00123 TransitionLexer::valid_operator_continue(ucs4_t ucs4)
00124 {
00125
00126
00127 return (valid_ascii_symbol(ucs4));
00128 }
00129
00130 bool
00131 TransitionLexer::valid_ifident_start(ucs4_t ucs4)
00132 {
00133 return (isalpha(ucs4) || ucs4 == '_');
00134
00135 }
00136
00137 bool
00138 TransitionLexer::valid_ifident_continue(ucs4_t ucs4)
00139 {
00140 return (isalpha(ucs4) || isdigit(ucs4) || ucs4 == '_' || ucs4 == '-');
00141
00142
00143 }
00144
00145 bool
00146 TransitionLexer::valid_tv_ident_start(ucs4_t ucs4)
00147 {
00148 return (u_hasBinaryProperty(ucs4,UCHAR_XID_START) ||
00149 ucs4 == '_');
00150 }
00151
00152 bool
00153 TransitionLexer::valid_tv_ident_continue(ucs4_t ucs4)
00154 {
00155 return (u_hasBinaryProperty(ucs4,UCHAR_XID_CONTINUE) ||
00156 ucs4 == '_');
00157 }
00158
00159 TransitionLexer::KeyWord::KeyWord(const char *_nm, LangFlags _whichLang, int _tokValue)
00160 {
00161 nm = _nm;
00162 whichLang = _whichLang;
00163 tokValue = _tokValue;
00164 }
00165
00176 static bool keywords_sorted = false;
00177
00178 struct TransitionLexer::KeyWord TransitionLexer::keywords[] = {
00179 TransitionLexer::KeyWord( "=", lf_block, '=' ),
00180
00181 TransitionLexer::KeyWord( "->", lf_block, tk_FNARROW ),
00182 TransitionLexer::KeyWord( "==", lf_block, tk_EQUALS ),
00183 TransitionLexer::KeyWord( "and", lf_block, tk_AND ),
00184 TransitionLexer::KeyWord( "apply", lf_block, tk_APPLY ),
00185 TransitionLexer::KeyWord( "array", lf_block, tk_ARRAY ),
00186 TransitionLexer::KeyWord( "ArrayRef", lf_block, tk_ARRAY_REF ),
00187 TransitionLexer::KeyWord( "as", lf_block, tk_AS ),
00188 TransitionLexer::KeyWord( "assert", lf_block, tk_ReservedWord ),
00189 TransitionLexer::KeyWord( "begin", lf_block, tk_BEGIN ),
00190 TransitionLexer::KeyWord( "bitc", lf_version, tk_BITC ),
00191 TransitionLexer::KeyWord( "bitfield", lf_block, tk_BITFIELD ),
00192 TransitionLexer::KeyWord( "bitsizeof", lf_block, tk_BITSIZEOF ),
00193
00194 TransitionLexer::KeyWord( "bool", lf_block, tk_BOOL ),
00195 TransitionLexer::KeyWord( "boxed", lf_block, tk_BOXED ),
00196 TransitionLexer::KeyWord( "break", lf_block, tk_ReservedWord ),
00197 TransitionLexer::KeyWord( "ByRef", lf_block, tk_BY_REF ),
00198 TransitionLexer::KeyWord( "case", lf_block, tk_CASE ),
00199 TransitionLexer::KeyWord( "catch", lf_block, tk_CATCH ),
00200 TransitionLexer::KeyWord( "char", lf_block, tk_CHAR ),
00201 TransitionLexer::KeyWord( "check", lf_block, tk_ReservedWord ),
00202 TransitionLexer::KeyWord( "closed", lf_block, tk_CLOSED ),
00203 TransitionLexer::KeyWord( "cond", lf_block, tk_COND ),
00204 TransitionLexer::KeyWord( "const", lf_block, tk_CONST ),
00205 TransitionLexer::KeyWord( "constrain", lf_block, tk_ReservedWord ),
00206 TransitionLexer::KeyWord( "continue", lf_block, tk_CONTINUE ),
00207 TransitionLexer::KeyWord( "declare", lf_block, tk_DECLARE ),
00208 TransitionLexer::KeyWord( "deep", lf_block, tk_ReservedWord ),
00209 TransitionLexer::KeyWord( "def", lf_block, tk_DEF ),
00210 TransitionLexer::KeyWord( "deref", lf_block, tk_DEREF ),
00211 TransitionLexer::KeyWord( "disable", lf_block, tk_ReservedWord ),
00212 TransitionLexer::KeyWord( "do", lf_block, tk_DO ),
00213 TransitionLexer::KeyWord( "do*", lf_block, tk_ReservedWord ),
00214 TransitionLexer::KeyWord( "double", lf_block, tk_DOUBLE ),
00215 TransitionLexer::KeyWord( "dup", lf_block, tk_DUP ),
00216 TransitionLexer::KeyWord( "else", lf_block, tk_ELSE ),
00217 TransitionLexer::KeyWord( "enable", lf_block, tk_ReservedWord ),
00218 TransitionLexer::KeyWord( "exception", lf_block, tk_EXCEPTION ),
00219 TransitionLexer::KeyWord( "external", lf_block, tk_EXTERNAL ),
00220 TransitionLexer::KeyWord( "false", lf_block, tk_FALSE ),
00221 TransitionLexer::KeyWord( "fill", lf_block, tk_FILL ),
00222 TransitionLexer::KeyWord( "float", lf_block, tk_FLOAT ),
00223 TransitionLexer::KeyWord( "fn", lf_block, tk_FN ),
00224 TransitionLexer::KeyWord( "forall", lf_block, tk_FORALL ),
00225 TransitionLexer::KeyWord( "from", lf_block, tk_FROM ),
00226
00227 TransitionLexer::KeyWord( "if", lf_block, tk_IF ),
00228 TransitionLexer::KeyWord( "import", lf_block, tk_IMPORT ),
00229 TransitionLexer::KeyWord( "impure", lf_block, tk_IMPURE ),
00230 TransitionLexer::KeyWord( "in", lf_block, tk_IN ),
00231 TransitionLexer::KeyWord( "InnerRef", lf_block, tk_INNER_REF ),
00232 TransitionLexer::KeyWord( "instance", lf_block, tk_INSTANCE ),
00233 TransitionLexer::KeyWord( "int16", lf_block, tk_INT16 ),
00234 TransitionLexer::KeyWord( "int32", lf_block, tk_INT32 ),
00235 TransitionLexer::KeyWord( "int64", lf_block, tk_INT64 ),
00236 TransitionLexer::KeyWord( "int8", lf_block, tk_INT8 ),
00237 TransitionLexer::KeyWord( "interface", lf_block, tk_INTERFACE ),
00238 TransitionLexer::KeyWord( "is", lf_block, tk_IS ),
00239 TransitionLexer::KeyWord( "label", lf_block, tk_LABEL ),
00240 TransitionLexer::KeyWord( "lambda", lf_block, tk_LAMBDA ),
00241 TransitionLexer::KeyWord( "let", lf_block, tk_LET ),
00242 TransitionLexer::KeyWord( "let*", lf_block, tk_ReservedWord ),
00243 TransitionLexer::KeyWord( "letrec", lf_block, tk_LETREC ),
00244 TransitionLexer::KeyWord( "location", lf_block, tk_ReservedWord ),
00245 TransitionLexer::KeyWord( "loop", lf_block, tk_LOOP ),
00246 TransitionLexer::KeyWord( "MakeVector", lf_block, tk_MAKE_VECTOR ),
00247 TransitionLexer::KeyWord( "member", lf_block, tk_MEMBER ),
00248 TransitionLexer::KeyWord( "method", lf_block, tk_METHOD ),
00249 TransitionLexer::KeyWord( "mixfix", lf_block, tk_MIXFIX ),
00250 TransitionLexer::KeyWord( "module", lf_block, tk_MODULE ),
00251 TransitionLexer::KeyWord( "mutable", lf_block, tk_MUTABLE ),
00252 TransitionLexer::KeyWord( "namespace", lf_block, tk_ReservedWord ),
00253 TransitionLexer::KeyWord( "object", lf_block, tk_OBJECT ),
00254 TransitionLexer::KeyWord( "opaque", lf_block, tk_OPAQUE ),
00255 TransitionLexer::KeyWord( "or", lf_block, tk_OR ),
00256 TransitionLexer::KeyWord( "otherwise", lf_block, tk_OTHERWISE ),
00257
00258 TransitionLexer::KeyWord( "provide", lf_block, tk_PROVIDE ),
00259 TransitionLexer::KeyWord( "provide!", lf_block, tk_ReservedWord ),
00260 TransitionLexer::KeyWord( "pure", lf_block, tk_PURE ),
00261 TransitionLexer::KeyWord( "quad", lf_block, tk_QUAD ),
00262 TransitionLexer::KeyWord( "read-only", lf_block, tk_ReservedWord ),
00263 TransitionLexer::KeyWord( "reference", lf_block, tk_PTR ),
00264 TransitionLexer::KeyWord( "repr", lf_block, tk_REPR ),
00265 TransitionLexer::KeyWord( "require", lf_block, tk_ReservedWord ),
00266 TransitionLexer::KeyWord( "reserved", lf_block, tk_RESERVED ),
00267 TransitionLexer::KeyWord( "return", lf_block, tk_RETURN ),
00268 TransitionLexer::KeyWord( "sensory", lf_block, tk_ReservedWord ),
00269 TransitionLexer::KeyWord( "sizeof", lf_block, tk_SIZEOF ),
00270 TransitionLexer::KeyWord( "string", lf_block, tk_STRING ),
00271 TransitionLexer::KeyWord( "struct", lf_block, tk_STRUCT ),
00272 TransitionLexer::KeyWord( "super", lf_block, tk_ReservedWord ),
00273 TransitionLexer::KeyWord( "suspend", lf_block, tk_SUSPEND ),
00274 TransitionLexer::KeyWord( "switch", lf_block, tk_SWITCH ),
00275 TransitionLexer::KeyWord( "tag", lf_block, tk_TAG ),
00276
00277 TransitionLexer::KeyWord( "then", lf_block, tk_THEN ),
00278 TransitionLexer::KeyWord( "throw", lf_block, tk_THROW ),
00279 TransitionLexer::KeyWord( "trait", lf_block, tk_TRAIT ),
00280 TransitionLexer::KeyWord( "true", lf_block, tk_TRUE ),
00281 TransitionLexer::KeyWord( "try", lf_block, tk_TRY ),
00282 TransitionLexer::KeyWord( "tycon", lf_block, tk_ReservedWord ),
00283 TransitionLexer::KeyWord( "tyfn", lf_block, tk_ReservedWord ),
00284 TransitionLexer::KeyWord( "typecase", lf_block, tk_TYPECASE ),
00285 TransitionLexer::KeyWord( "uint16", lf_block, tk_UINT16 ),
00286 TransitionLexer::KeyWord( "uint32", lf_block, tk_UINT32 ),
00287 TransitionLexer::KeyWord( "uint64", lf_block, tk_UINT64 ),
00288 TransitionLexer::KeyWord( "uint8", lf_block, tk_UINT8 ),
00289 TransitionLexer::KeyWord( "unboxed", lf_block, tk_UNBOXED ),
00290 TransitionLexer::KeyWord( "union", lf_block, tk_UNION ),
00291 TransitionLexer::KeyWord( "unless", lf_block, tk_UNLESS ),
00292 TransitionLexer::KeyWord( "until", lf_block, tk_UNTIL ),
00293 TransitionLexer::KeyWord( "using", lf_block, tk_ReservedWord ),
00294 TransitionLexer::KeyWord( "value-at", lf_block, tk_ReservedWord ),
00295 TransitionLexer::KeyWord( "vector", lf_block, tk_VECTOR ),
00296 TransitionLexer::KeyWord( "version", lf_version, tk_VERSION ),
00297 TransitionLexer::KeyWord( "when", lf_block, tk_WHEN ),
00298 TransitionLexer::KeyWord( "where", lf_block, tk_WHERE ),
00299 TransitionLexer::KeyWord( "word", lf_block, tk_WORD )
00300 };
00301
00302 static int
00303 kwstrcmp(const void *vKey, const void *vCandidate)
00304 {
00305 const char *key = ((const TransitionLexer::KeyWord *) vKey)->nm;
00306 const char *candidate = ((const TransitionLexer::KeyWord *) vCandidate)->nm;
00307
00308 return strcmp(key, candidate);
00309 }
00310
00311 int
00312 TransitionLexer::kwCheck(const char *s, int identType)
00313 {
00314 if (ifIdentMode) {
00315 if (!valid_ifident_start(*s))
00316 return tk_ReservedWord;
00317
00318 for (++s; *s; s++) {
00319 if (!valid_ifident_continue(*s))
00320 return tk_ReservedWord;
00321 }
00322
00323 return tk_BlkIdent;
00324 }
00325
00326 KeyWord key = KeyWord(s, lf_block, 0);
00327 KeyWord *entry =
00328 (KeyWord *)bsearch(&key, keywords,
00329 sizeof(keywords)/sizeof(keywords[0]),
00330 sizeof(keywords[0]), kwstrcmp);
00331
00332
00333
00334
00335
00336
00337 if (entry) {
00338 if (currentLang & entry->whichLang) {
00339 currentLang &= ~LangFlags(lf_version);
00340
00341 return entry->tokValue;
00342 }
00343 }
00344
00345
00346
00347
00348 if (s[0] == '_' && s[1] == '_') {
00349 if (!isRuntimeUoc)
00350 return tk_ReservedWord;
00351 }
00352
00353
00354 if (s[0] == 'd' && s[1] == 'e' && s[2] == 'f')
00355 return tk_ReservedWord;
00356
00357 else return identType;
00358 }
00359
00360 void
00361 TransitionLexer::ReportParseError()
00362 {
00363 errStream << lastToken.loc
00364 << ": syntax error (via yyerror)" << std::endl;
00365 num_errors++;
00366 }
00367
00368 void
00369 TransitionLexer::ReportParseError(const LexLoc& where, std::string msg)
00370 {
00371 errStream << where
00372 << ": "
00373 << msg << std::endl;
00374
00375 num_errors++;
00376 }
00377
00378 void
00379 TransitionLexer::ReportParseWarning(const LexLoc& where, std::string msg)
00380 {
00381 errStream << where
00382 << ": "
00383 << msg << std::endl;
00384 }
00385
00386 TransitionLexer::TransitionLexer(std::ostream& _err, std::istream& _in,
00387 const std::string& origin,
00388 bool commandLineInput)
00389 :here(origin, 1, 0), inStream(_in), errStream(_err)
00390 {
00391 inStream.unsetf(std::ios_base::skipws);
00392
00393 if (!keywords_sorted) {
00394 qsort(keywords,
00395 sizeof(keywords)/sizeof(keywords[0]),
00396 sizeof(keywords[0]), kwstrcmp);
00397 keywords_sorted = true;
00398 }
00399
00400
00401
00402 currentLang = lf_block | lf_version;
00403 num_errors = 0;
00404 isRuntimeUoc = false;
00405 ifIdentMode = false;
00406 isCommandLineInput = commandLineInput;
00407 debug = false;
00408 nModules = 0;
00409
00410 showNextError = true;
00411
00412 lastTokType = EOF;
00413 lastToken = LToken(EOF, "end of file");
00414 }
00415
00416 ucs4_t
00417 TransitionLexer::getChar()
00418 {
00419 char utf[8];
00420 unsigned char c;
00421
00422 long ucs4 = pushBackStack.pop();
00423
00424 if (ucs4 != -1) {
00425 utf8_encode(ucs4, utf);
00426 goto checkDigit;
00427 }
00428
00429 memset(utf, 0, 8);
00430
00431 utf[0] = inStream.get();
00432 c = utf[0];
00433 if (inStream.eof())
00434 return EOF;
00435
00436 if (c <= 127)
00437 goto done;
00438
00439 utf[1] = inStream.get();
00440 if (inStream.eof())
00441 return EOF;
00442
00443 if (c <= 223)
00444 goto done;
00445
00446 utf[2] = inStream.get();
00447 if (inStream.eof())
00448 return EOF;
00449
00450 if (c <= 239)
00451 goto done;
00452
00453 utf[3] = inStream.get();
00454 if (inStream.eof())
00455 return EOF;
00456
00457 if (c <= 247)
00458 goto done;
00459
00460 utf[4] = inStream.get();
00461 if (inStream.eof())
00462 return EOF;
00463
00464 if (c <= 251)
00465 goto done;
00466
00467 utf[5] = inStream.get();
00468 if (inStream.eof())
00469 return EOF;
00470
00471 done:
00472 ucs4 = utf8_decode(utf, 0);
00473 checkDigit:
00474 thisToken += utf;
00475
00476 return ucs4;
00477 }
00478
00479 void
00480 TransitionLexer::ungetChar(ucs4_t c)
00481 {
00482 char utf[8];
00483
00484
00485
00486 if (c == EOF)
00487 return;
00488
00489 pushBackStack.push(c);
00490
00491 unsigned len = utf8_encode(c, utf);
00492 thisToken.erase( thisToken.length() - len);
00493 }
00494
00495 void
00496 TransitionLexer::ungetThisToken()
00497 {
00498 ucs4_t ucsToken[8];
00499 const char *s = thisToken.c_str();
00500 const char *snext = s;
00501 size_t i = 0;
00502
00503 for( ; *snext && i < 8; i++) {
00504 ucsToken[i] = utf8_decode(s, &snext);
00505 s = snext;
00506 }
00507
00508
00509
00510 assert(*snext == 0);
00511
00512 for (; i > 0; i--) {
00513 ucs4_t c = ucsToken[i-1];
00514 pushBackStack.push(c);
00515 }
00516
00517 thisToken.erase();
00518 }
00519
00520 static bool
00521 isWhiteSpace(ucs4_t c)
00522 {
00523 switch (c) {
00524 case ' ':
00525 case '\t':
00526 case '\n':
00527 case '\r':
00528 return true;
00529 default:
00530 return false;
00531 }
00532 }
00533
00534 static bool
00535 isCharDelimiter(ucs4_t c)
00536 {
00537 if (c == ')') return true;
00538
00539 return isWhiteSpace(c);
00540 }
00541
00542 void
00543 TransitionLexer::showToken(std::ostream& errStream, const LToken& tok)
00544 {
00545 const char *tokTypeName = TransitionTokenName(tok.tokType);
00546 const char *prevTokTypeName = TransitionTokenName(tok.prevTokType);
00547
00548 errStream << tok.tokType << ": " << tok.loc << ' '
00549 << tokTypeName;
00550
00551 switch(tok.tokType) {
00552 case tk_TypeVar:
00553 case tk_EffectVar:
00554 errStream << " (" << tok.str << ")";
00555 break;
00556 case tk_BlkIdent:
00557 case tk_NegativeInt:
00558 case tk_Nat:
00559 case tk_Float:
00560 case tk_VersionNumber:
00561 errStream << " (" << tok.str << ")";
00562 break;
00563
00564 case tk_Char:
00565 errStream << " ('" << tok.str << "')";
00566 break;
00567 case tk_String:
00568 errStream << " (\"" << tok.str << "\")";
00569 break;
00570 default:
00571 break;
00572 }
00573
00574 errStream << " followed " << prevTokTypeName;
00575
00576 if (tok.flags) {
00577 errStream << " [";
00578
00579 if (tok.flags & TF_INSERTED)
00580 errStream << "INSERTED";
00581
00582 if (tok.flags & TF_BY_PARSER)
00583 errStream << " BY PARSER";
00584
00585 if (tok.flags & TF_AT_FIRST)
00586 errStream << " AT FIRST";
00587
00588 if (tok.flags & TF_FIRST_ON_LINE) {
00589 if (tok.flags & TF_INSERTED)
00590 errStream << ", ";
00591 errStream << "FIRST-ON-LINE";
00592 }
00593
00594 if (tok.flags & TF_REPROCESS) {
00595 if (tok.flags & (TF_INSERTED | TF_FIRST_ON_LINE))
00596 errStream << ", ";
00597 errStream << "REPROCESS";
00598 }
00599 errStream << "]";
00600 }
00601 }
00602
00603 int
00604 TransitionLexer::lex(ParseType *lvalp)
00605 {
00606 LToken tok = getNextToken();
00607 assert (tok.prevTokType == lastTokType);
00608
00609 if (debug) {
00610 errStream << "TOKEN ";
00611 showToken(errStream, tok);
00612 errStream << std::endl;
00613 }
00614
00615 lastTokType = tok.tokType;
00616 lastToken = tok;
00617
00618 lvalp->tok = tok;
00619 here = tok.endLoc;
00620
00621 return tok.tokType;
00622 }
00623
00624 void
00625 TransitionLexer::beginBlock(const LToken& tok)
00626 {
00627 bool inserted = (tok.flags & TF_INSERTED);
00628
00629 #ifdef LAYOUT_BLOCK_DEBUG
00630 errStream << " LAYOUT: " << here
00631 << (inserted ? ": inserted " : ": explicit ")
00632 << "beginBlock()" << std::endl;
00633 #endif
00634
00635
00636
00637 unsigned column = layoutStack ? layoutStack->column : 0;
00638
00639 boost::shared_ptr<LayoutFrame> lf =
00640 LayoutFrame::make(lastToken.tokType, inserted, column, tok);
00641
00642
00643
00644
00645 lf->next = layoutStack;
00646 layoutStack = lf;
00647 }
00648
00649 void
00650 TransitionLexer::endBlock(const LToken& tok)
00651 {
00652 bool inserted = (tok.flags & TF_INSERTED);
00653
00654 if (inserted && ! layoutStack->inserted) {
00655 std::stringstream ss;
00656 ss << "Inserted close brace balances explicit open brace at "
00657 << layoutStack->tok.loc
00658 << ".";
00659 ReportParseError(tok.loc, ss.str());
00660 }
00661 else if (!inserted && layoutStack->inserted) {
00662 std::stringstream ss;
00663 ss << "Explicit close brace balances inserted open brace at "
00664 << layoutStack->tok.loc
00665 << ".";
00666 ReportParseError(tok.loc, ss.str());
00667 }
00668
00669
00670 layoutStack = layoutStack->next;
00671
00672 #ifdef LAYOUT_BLOCK_DEBUG
00673 errStream << " LAYOUT: " << here
00674 << (inserted ? ": inserted " : ": explicit ")
00675 << "endBlock()";
00676 if (layoutStack)
00677 errStream << " leaving indent at " << layoutStack->column;
00678 errStream << std::endl;
00679 #endif
00680 }
00681
00682 bool
00683 TransitionLexer::closeToOffset(unsigned offset)
00684 {
00685
00686
00687
00688
00689
00690
00691 if (!layoutStack)
00692 return false;
00693
00694 if (!layoutStack->inserted)
00695 return false;
00696
00697 if (offset >= layoutStack->column)
00698 return false;
00699
00700 #ifdef LAYOUT_BLOCK_DEBUG
00701 unsigned count = 0;
00702
00703 errStream << " LAYOUT: " << here
00704 << ": closeToOffset inserted '}'" << std::endl;
00705 #endif
00706
00707 return true;
00708 }
00709
00710 bool
00711 TransitionLexer::conditionallyInsertSemicolon(unsigned offset)
00712 {
00713
00714
00715
00716 if (!layoutStack)
00717 return false;
00718
00719 assert(layoutStack);
00720 if (here.offset <= layoutStack->column)
00721 return true;
00722 return false;
00723 }
00724
00725 LexLoc
00726 TransitionLexer::skipWhiteSpaceAndComments()
00727 {
00728 ucs4_t c;
00729
00730 LexLoc pos = here;
00731
00732 startOver:
00733 thisToken.erase();
00734
00735 c = getChar();
00736
00737 if (c == '/') {
00738 c = getChar();
00739 if (c == '/') {
00740 do {
00741 c = getChar();
00742 } while (c != '\n' && c != '\r');
00743
00744
00745 ungetChar(c);
00746 pos.updateWith(thisToken);
00747 goto startOver;
00748 }
00749 else if (c == '*') {
00750 for (;;) {
00751 c = getChar();
00752 if (c == '*') {
00753 c = getChar();
00754 if (c == '/') {
00755 break;
00756 }
00757 else if (c == EOF) {
00758 pos.updateWith(thisToken);
00759 return pos;
00760 }
00761 else
00762 ungetChar(c);
00763 }
00764 else if (c == '\n' || c == '\r') {
00765
00766
00767 atBeginningOfLine = true;
00768 }
00769 else if (c == EOF) {
00770 pos.updateWith(thisToken);
00771 return pos;
00772 }
00773 }
00774
00775 pos.updateWith(thisToken);
00776 goto startOver;
00777 }
00778 else {
00779 ungetChar(c);
00780 c = '/';
00781 }
00782 }
00783
00784 if (isWhiteSpace(c)) {
00785 while (isWhiteSpace(c)) {
00786 if (c == '\n' || c == '\r') {
00787 atBeginningOfLine = true;
00788 }
00789
00790 c = getChar();
00791 }
00792 ungetChar(c);
00793
00794 pos.updateWith(thisToken);
00795 goto startOver;
00796 }
00797
00798 ungetChar(c);
00799
00800 return pos;
00801 }
00802
00803 void
00804 TransitionLexer::pushTokenBack(const LToken& tok, bool verbose)
00805 {
00806 if (verbose && debug) {
00807 errStream << "PARSER PUSHED BACK TOKEN ";
00808 showToken(errStream, tok);
00809 errStream << std::endl;
00810 }
00811 pushbackTokens.push_back(tok);
00812
00813 lastTokType = tok.prevTokType;
00814
00815 if (verbose && debug) {
00816 errStream << " Pushback Stack Is:\n";
00817
00818 for (size_t i = pushbackTokens.size(); i > 0; i--) {
00819 LToken theTok = pushbackTokens[i-1];
00820 errStream << " " << i-1 << " ";
00821 showToken(errStream, theTok);
00822 errStream << std::endl;
00823 }
00824 }
00825 }
00826
00827 #if 0
00828 #define RETURN_INSERTED(tok) do { \
00829 LToken _tok = tok; \
00830 _tok.flags |= TF_INSERTED; \
00831 return _tok; \
00832 } while(false)
00833 #endif
00834
00835 LToken
00836 TransitionLexer::getNextToken()
00837 {
00838 LToken tok = havePushbackToken() ? popToken() : getNextInputToken();
00839
00840 LexLoc startLoc = here;
00841 LexLoc endLoc = here;
00842
00843
00844
00845
00846
00847
00848 bool curlyRequired = ((lastTokType == EOF)
00849
00850 || (lastTokType == tk_LET)
00851 || (lastTokType == tk_LETREC)
00852 || (lastTokType == tk_LOOP)
00853 || (lastTokType == tk_SWITCH)
00854
00855 || (lastTokType == tk_IN)
00856 || (lastTokType == tk_IS)
00857 || (lastTokType == tk_DO)
00858 || (lastTokType == tk_TRY)
00859 || (lastTokType == tk_THEN)
00860 || (lastTokType == tk_OTHERWISE)
00861 || (lastTokType == tk_ELSE)
00862 || false);
00863
00864 #ifdef LAYOUT_BLOCK_DEBUG
00865 if (lastTokType < 256 && isprint(lastTokType)) {
00866 errStream << " LAYOUT: " << startLoc
00867 << ": last token was '" << (char)lastTokType << "'." << std::endl;
00868 }
00869 else {
00870 errStream << " LAYOUT: " << startLoc
00871 << ": last token was type " << lastTokType << std::endl;
00872 }
00873 #endif
00874
00875 if (curlyRequired && (tok.tokType != '{')) {
00876 pushTokenBack(tok);
00877
00878 LToken newTok = LToken('{', startLoc, endLoc, "{");
00879 newTok.flags |= TF_INSERTED;
00880 newTok.prevTokType = tok.prevTokType;
00881
00882 beginBlock(newTok);
00883
00884
00885 return newTok;
00886 }
00887
00888
00889
00890
00891
00892
00893
00894
00895
00896
00897 if (lastTokType == '{') {
00898 assert(layoutStack);
00899
00900 #ifdef LAYOUT_BLOCK_DEBUG
00901 errStream << " LAYOUT: " << startLoc
00902 << ": Processing post-{..." << std::endl;
00903 #endif
00904
00905 if ((startLoc.offset > layoutStack->column) ||
00906
00907
00908 (layoutStack->inserted && !layoutStack->next)) {
00909
00910 layoutStack->column = startLoc.offset;
00911
00912 #ifdef LAYOUT_BLOCK_DEBUG
00913 errStream << " LAYOUT: " << startLoc
00914 << ": set indent to "
00915 << startLoc.offset
00916 << ", disable check first token." << std::endl;
00917 #endif
00918 }
00919 else if (layoutStack->inserted && layoutStack->next) {
00920 #ifdef LAYOUT_BLOCK_DEBUG
00921 errStream << " LAYOUT: " << startLoc
00922 << " : close empty inserted block." << std::endl;
00923 #endif
00924
00925
00926 pushTokenBack(tok);
00927
00928 LToken newTok = LToken('}', startLoc, endLoc, "}");
00929 newTok.flags |= TF_INSERTED;
00930 newTok.prevTokType = tok.prevTokType;
00931
00932 endBlock(newTok);
00933
00934 return newTok;
00935 }
00936 }
00937
00938 if (tok.flags & TF_FIRST_ON_LINE) {
00939
00940
00941
00942
00943
00944
00945
00946 #ifdef LAYOUT_BLOCK_DEBUG
00947 errStream << " LAYOUT: " << startLoc
00948 << ": processing first token..." << std::endl;
00949 #endif
00950
00951
00952
00953 bool autoCloseOK = !(lastTokType == ','
00954 || lastTokType == '('
00955 || tok.tokType == ','
00956 || tok.tokType == ')'
00957 || false);
00958
00959 if (autoCloseOK && closeToOffset(startLoc.offset)) {
00960 pushTokenBack(tok);
00961
00962 LToken newTok = LToken('}', startLoc, endLoc, "}");
00963 newTok.flags |= TF_INSERTED|TF_AT_FIRST;
00964 newTok.prevTokType = tok.prevTokType;
00965
00966 endBlock(newTok);
00967 return newTok;
00968 }
00969 }
00970
00971
00972 if (tok.tokType == EOF) {
00973 if (layoutStack && layoutStack->inserted) {
00974
00975 pushTokenBack(tok);
00976
00977 LToken newTok = LToken('}', startLoc, endLoc, "}");
00978 newTok.flags |= TF_INSERTED|TF_AT_FIRST;
00979 newTok.prevTokType = tok.prevTokType;
00980
00981 endBlock(newTok);
00982 return newTok;
00983 }
00984
00985 return tok;
00986 }
00987
00988
00989
00990
00991
00992
00993
00994 if (tok.flags & TF_FIRST_ON_LINE) {
00995 bool wantAutoSemi = !(lastTokType == ';' ||
00996 lastTokType == '{' ||
00997 lastTokType == ',' ||
00998 lastTokType == '(' ||
00999 lastTokType == '[' ||
01000 lastTokType == '=' ||
01001 lastTokType == tk_ASSIGN ||
01002 tok.tokType == ')' ||
01003 tok.tokType == ']' ||
01004 tok.tokType == ';' ||
01005 tok.tokType == '}' ||
01006 tok.tokType == tk_THEN ||
01007 tok.tokType == tk_ELSE ||
01008 tok.tokType == tk_CASE ||
01009 tok.tokType == tk_CATCH ||
01010 tok.tokType == tk_OTHERWISE ||
01011 tok.tokType == tk_UNTIL ||
01012 tok.tokType == tk_IN ||
01013 tok.tokType == tk_IS ||
01014 tok.tokType == tk_DO ||
01015 false);
01016
01017 if (wantAutoSemi) {
01018 if (conditionallyInsertSemicolon(startLoc.offset)) {
01019 #ifdef LAYOUT_BLOCK_DEBUG
01020 errStream << " LAYOUT: " << startLoc
01021 << ": semicolon inserted at offset "
01022 << startLoc.offset << std::endl;
01023 #endif
01024 pushTokenBack(tok);
01025
01026 LToken newTok = LToken(';', startLoc, endLoc, ";");
01027 newTok.flags |= TF_INSERTED|TF_AT_FIRST;
01028 newTok.prevTokType = tok.prevTokType;
01029
01030 return newTok;
01031 }
01032 }
01033 }
01034
01035
01036
01037 if (tok.tokType == '{')
01038 beginBlock(tok);
01039
01040 if (tok.tokType == '}') {
01041 if (!(tok.flags & TF_INSERTED)) {
01042
01043
01044 if (closeToOffset(0)) {
01045 pushTokenBack(tok);
01046
01047 LToken newTok = LToken('}', startLoc, endLoc, "}");
01048 newTok.flags |= TF_INSERTED|TF_AT_FIRST;
01049 newTok.prevTokType = tok.prevTokType;
01050
01051 endBlock(newTok);
01052 return newTok;
01053 }
01054 }
01055
01056 endBlock(tok);
01057 }
01058
01059 return tok;
01060 }
01061
01062 LToken
01063 TransitionLexer::popToken()
01064 {
01065 assert(havePushbackToken());
01066
01067 LToken pbTok = pushbackTokens[pushbackTokens.size()-1];
01068 pushbackTokens.pop_back();
01069 pbTok.prevTokType = lastTokType;
01070 return pbTok;
01071 }
01072
/* Return `tok` from the enclosing TransitionLexer member function.
 * Before returning, the token is stamped with the type of the previously
 * delivered token, and, if a line break has been seen since the last
 * token, it is flagged TF_FIRST_ON_LINE and the pending line-break
 * marker is cleared. */
#define RETURN_TOKEN(tok) do {                  \
    LToken rtTok_ = (tok);                      \
    if (atBeginningOfLine) {                    \
      atBeginningOfLine = false;                \
      rtTok_.flags |= TF_FIRST_ON_LINE;         \
    }                                           \
    rtTok_.prevTokType = lastTokType;           \
    return rtTok_;                              \
  } while(false)
01082
01083 LToken
01084 TransitionLexer::getNextInputToken()
01085 {
01086
01087 int radix = 10;
01088
01089 here = skipWhiteSpaceAndComments();
01090
01091 thisToken.erase();
01092
01093 ucs4_t c = getChar();
01094
01096
01097
01099 LexLoc startLoc = here;
01100 LexLoc endLoc = here;
01101
01102 switch (c) {
01103 case ':':
01104 {
01105 int tokID = ':';
01106
01107 ucs4_t c2 = getChar();
01108 if (c2 == '=')
01109 tokID = tk_ASSIGN;
01110 else if (c2 == ':') {
01111 tokID = tk_BlkIdent;
01112 }
01113 else
01114 ungetChar(c2);
01115
01116 endLoc.updateWith(thisToken);
01117 RETURN_TOKEN(LToken(tokID, startLoc, endLoc, thisToken));
01118 }
01119 case ';':
01120 {
01121 endLoc.updateWith(thisToken);
01122 RETURN_TOKEN(LToken(c, startLoc, endLoc, thisToken));
01123 }
01124
01125 case '{':
01126 {
01127 endLoc.updateWith(thisToken);
01128 RETURN_TOKEN(LToken(c, startLoc, endLoc, thisToken));
01129 }
01130 case '}':
01131 {
01132 endLoc.updateWith(thisToken);
01133 RETURN_TOKEN(LToken(c, startLoc, endLoc, thisToken));
01134 }
01135
01136 case '.':
01137 case ',':
01138 case '[':
01139 case ']':
01140 case '(':
01141 case ')':
01142 {
01143 endLoc.updateWith(thisToken);
01144 RETURN_TOKEN(LToken(c, startLoc, endLoc, thisToken));
01145 }
01146
01147 case '"':
01148 {
01149 do {
01150 c = getChar();
01151
01152 if (c == '\\') {
01153 (void) getChar();
01154 }
01155 } while (c != '"' && c != EOF);
01156
01157 if (c == EOF) {
01158 errStream << startLoc
01159 << ": Unterminated string constant. Missing end quote?"
01160 << std::endl;
01161 num_errors++;
01162 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01163 }
01164
01165 unsigned badpos = LitValue::validate_string(thisToken.c_str());
01166
01167 if (badpos) {
01168 LexLoc badHere = startLoc;
01169 badHere.offset += badpos;
01170 errStream << badHere.asString()
01171 << ": Illegal (non-printing) character in string '"
01172 << thisToken << "'" << std::endl;
01173 num_errors++;
01174 }
01175
01176 endLoc.updateWith(thisToken);
01177 RETURN_TOKEN(LToken(tk_String, startLoc, endLoc,
01178 thisToken.substr(1, thisToken.size()-2)));
01179 }
01180
01181 case '\'':
01182 {
01183
01184
01185 int tokType = tk_TypeVar;
01186
01187 int c1 = getChar();
01188 int c2 = getChar();
01189
01190 if (c1 == EOF || c2 == EOF) {
01191 endLoc.updateWith(thisToken);
01192 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01193 }
01194
01195
01196 if (c2 == '\'') {
01197
01198
01199 switch (c1) {
01200 case '\'':
01201 case '\\':
01202 {
01203
01204
01205 ungetChar(c2);
01206 ungetChar(c1);
01207 endLoc.updateWith(thisToken);
01208 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01209 }
01210 default:
01211 {
01212 if (LitValue::DecodeCharacter(thisToken) >= 0) {
01213 endLoc.updateWith(thisToken);
01214 RETURN_TOKEN(LToken(tk_Char, startLoc, endLoc, thisToken));
01215 }
01216
01217 ungetChar(c2);
01218 ungetChar(c1);
01219 endLoc.updateWith(thisToken);
01220 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01221 }
01222 }
01223 }
01224 else if (c1 == '\\') {
01225
01226
01227
01228
01229 do {
01230 c = getChar();
01231 if (c == EOF) {
01232 endLoc.updateWith(thisToken);
01233 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01234 }
01235 } while (c != '\'' && c != EOF);
01236
01237 if (c == EOF) {
01238 errStream << startLoc
01239 << ": Unterminated character constant. Missing end quote?"
01240 << std::endl;
01241 num_errors++;
01242 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01243 }
01244
01245 if (LitValue::DecodeCharacter(thisToken) >= 0) {
01246 endLoc.updateWith(thisToken);
01247 RETURN_TOKEN(LToken(tk_Char, startLoc, endLoc, thisToken));
01248 }
01249
01250 endLoc.updateWith(thisToken);
01251 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01252 }
01253
01254
01255
01256
01257
01258 ungetChar(c2);
01259
01260 if (c1 == '%') {
01261 tokType = tk_EffectVar;
01262 c1 = getChar();
01263 }
01264
01265 if (!valid_tv_ident_start(c1)) {
01266
01267 ungetChar(c1);
01268 endLoc.updateWith(thisToken);
01269 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01270 }
01271
01272 do {
01273 c = getChar();
01274 } while (valid_tv_ident_continue(c));
01275 ungetChar(c);
01276
01277 endLoc.updateWith(thisToken);
01278 RETURN_TOKEN(LToken(tokType, startLoc, endLoc, thisToken));
01279 }
01280
01281
01282
01283
01284 case '0':
01285 case '1':
01286 case '2':
01287 case '3':
01288 case '4':
01289 case '5':
01290 case '6':
01291 case '7':
01292 case '8':
01293 case '9':
01294 {
01295 if (c == '0') {
01296 ucs4_t c2 = getChar();
01297 switch(c2) {
01298 case 'b':
01299 radix = 2;
01300 break;
01301 case 'x':
01302 radix = 16;
01303 break;
01304 case 'o':
01305 radix = 8;
01306 break;
01307 default:
01308 ungetChar(c2);
01309
01310 if (LitValue::digitValue(c2, 8) >= 0) {
01311 radix = 8;
01312 break;
01313 }
01314
01315 }
01316 }
01317
01318 do {
01319 c = getChar();
01320 } while (LitValue::digitValue(c, radix) >= 0);
01321
01322
01323
01324
01325
01326 if (c != '.') {
01327 ungetChar(c);
01328 int tokType = (thisToken[0] == '-') ? tk_NegativeInt : tk_Nat;
01329
01330 endLoc.updateWith(thisToken);
01331 RETURN_TOKEN(LToken(tokType, startLoc, endLoc, thisToken));
01332 }
01333
01334
01335 if (currentLang & lf_version) {
01336 if (radix != 10) {
01337 ReportParseError(startLoc, "Language version number must be decimal.");
01338 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01339 }
01340
01341
01342 long count = 0;
01343 do {
01344 c = getChar();
01345 count++;
01346 } while (LitValue::digitValue(c, radix) >= 0);
01347 count--;
01348 ungetChar(c);
01349 endLoc.updateWith(thisToken);
01350 RETURN_TOKEN(LToken(tk_VersionNumber, startLoc, endLoc, thisToken));
01351 }
01352
01353
01354 {
01355 if (radix != 10) {
01356 ReportParseError(startLoc, "Floating point literals must be base 10.");
01357 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01358 }
01359
01360 long count = 0;
01361 do {
01362 c = getChar();
01363 count++;
01364 } while (LitValue::digitValue(c, radix) >= 0);
01365 count--;
01366
01367 }
01368
01369
01370
01371 if (c != 'e') {
01372 ungetChar(c);
01373 endLoc.updateWith(thisToken);
01374 RETURN_TOKEN(LToken(tk_Float, startLoc, endLoc, thisToken));
01375 }
01376
01377
01378
01379 c = getChar();
01380 radix = 10;
01381
01382 if (c != '-' && LitValue::digitValue(c, radix) < 0) {
01383
01384 }
01385
01386 do {
01387 c = getChar();
01388 } while (LitValue::digitValue(c, 10) >= 0);
01389
01390
01391 if (c == 'r') {
01392 radix = strtol(thisToken.c_str(), 0, 10);
01393 if (radix < 0) radix = -radix;
01394
01395 long count = 0;
01396 do {
01397 c = getChar();
01398 count++;
01399 } while (LitValue::digitValue(c, radix) >= 0);
01400 count--;
01401
01402 }
01403
01404 ungetChar(c);
01405 endLoc.updateWith(thisToken);
01406 RETURN_TOKEN(LToken(tk_Float, startLoc, endLoc, thisToken));
01407 }
01408
01409 case EOF:
01410 {
01411 endLoc.updateWith(thisToken);
01412 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01413 }
01414
01415 default:
01416 if (valid_ident_start(c) || valid_operator_start(c) || c == '_') {
01417 goto identifier_or_operator;
01418 }
01419
01420
01421 endLoc.updateWith(thisToken);
01422 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01423 }
01424
01425 identifier_or_operator:
01443 while(c == '_')
01444 c = getChar();
01445
01446
01447 if (c == '#') {
01448 switch(c = getChar()) {
01449 case 'e':
01450 {
01451 c = getChar();
01452 switch (c) {
01453 case '_':
01454 ungetChar(c);
01455 break;
01456 case 'T':
01457 break;
01458 default:
01459 goto malformed_ident;
01460 }
01461 break;
01462 }
01463 case 't':
01464 case 'k':
01465 break;
01466 default:
01467 goto malformed_ident;
01468 }
01469
01470 c = getChar();
01471 if (c != '_') {
01472 goto malformed_ident;
01473 }
01474
01475 c = getChar();
01476 }
01477
01478 while (valid_ident_start(c) || valid_operator_start(c) || c == '_') {
01479
01480 if (valid_ident_start(c)) {
01481 do {
01482 c = getChar();
01483 } while (valid_ident_continue(c));
01484 }
01485 else if (valid_operator_start(c)) {
01486 do {
01487 c = getChar();
01488 } while (valid_operator_continue(c));
01489 }
01490
01491
01492
01493
01494
01495
01496 if (thisToken == "set!") {
01497 int tokType = kwCheck(thisToken.c_str(), tk_BlkIdent);
01498 endLoc.updateWith(thisToken);
01499 RETURN_TOKEN(LToken(tokType, startLoc, endLoc, thisToken));
01500 }
01501
01502 if ((c != '#') && (c != '_') && (c != '@'))
01503 goto ident_done;
01504
01505
01506 if (c == '#') {
01507 switch(c = getChar()) {
01508 case 'e':
01509 {
01510 c = getChar();
01511 switch (c) {
01512 case '_':
01513 ungetChar(c);
01514 break;
01515 case 'T':
01516 break;
01517 default:
01518 goto malformed_ident;
01519 }
01520 break;
01521 }
01522 case 't':
01523 case 'k':
01524 break;
01525 default:
01526 goto malformed_ident;
01527 }
01528
01529 c = getChar();
01530 if (c != '_') {
01531 goto malformed_ident;
01532 }
01533 }
01534 c = getChar();
01535 }
01536 ident_done:
01537
01538
01539 ungetChar(c);
01540
01541
01542 if (thisToken[thisToken.size()-1] == '@') {
01543 ReportParseError(startLoc, thisToken +
01544 " is not a well-formed identifier. Trailing '@' is not valid.");
01545 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01546 }
01547 else {
01548 int tokType = kwCheck(thisToken.c_str(), tk_BlkIdent);
01549 endLoc.updateWith(thisToken);
01550 RETURN_TOKEN(LToken(tokType, startLoc, endLoc, thisToken));
01551 }
01552
01553 malformed_ident:
01554 ReportParseError(startLoc, thisToken +
01555 " is not a well-formed identifier.");
01556 RETURN_TOKEN(LToken(EOF, startLoc, endLoc, "end of file"));
01557 }