LitValue.cxx

Go to the documentation of this file.
00001 /**************************************************************************
00002  *
00003  * Copyright (C) 2008, Johns Hopkins University.
00004  * All rights reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or
00007  * without modification, are permitted provided that the following
00008  * conditions are met:
00009  *
00010  *   - Redistributions of source code must contain the above 
00011  *     copyright notice, this list of conditions, and the following
00012  *     disclaimer. 
00013  *
00014  *   - Redistributions in binary form must reproduce the above
00015  *     copyright notice, this list of conditions, and the following
00016  *     disclaimer in the documentation and/or other materials 
00017  *     provided with the distribution.
00018  *
00019  *   - Neither the names of the copyright holders nor the names of any
00020  *     of any contributors may be used to endorse or promote products
00021  *     derived from this software without specific prior written
00022  *     permission. 
00023  *
00024  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00025  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00026  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00027  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00028  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00029  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00030  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00031  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00032  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00033  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00034  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00035  *
00036  **************************************************************************/
00037 
00038 #include <assert.h>
00039 #include <stdint.h>
00040 #include <stdlib.h>
00041 #include <dirent.h>
00042 #include <string.h>
00043 #include <fstream>
00044 #include <iostream>
00045 #include <string>
00046 #include <sstream>
00047 
00048 #include <unicode/uchar.h>
00049 
00050 #include <libsherpa/utf8.hxx>
00051 
00052 #include "LitValue.hxx"
00053 
00054 #define DEBUG_DECODE false
00055 
00056 long 
00057 LitValue::digitValue(ucs4_t ucs4, unsigned radix)
00058 {
00059   long l = -1;
00060 
00061   if (ucs4 >= '0' && ucs4 <= '9')
00062     l = ucs4 - '0';
00063   if (ucs4 >= 'a' && ucs4 <= 'f')
00064     l = ucs4 - 'a' + 10;
00065   if (ucs4 >= 'A' && ucs4 <= 'F')
00066     l = ucs4 - 'A' + 10;
00067 
00068   if (l > radix)
00069     l = -1;
00070   return l;
00071 }
00072 
00073 bool
00074 LitValue::valid_char_printable(ucs4_t ucs4)
00075 {
00076   switch (ucs4) {
00077   case '_':
00078     return true;
00079 
00080     // This should match the list TransitionLexer::valid_ident_punct():
00081   case '!':
00082   case '$':
00083   case '%':
00084   case '&':
00085   case '|':
00086   case '*':
00087   case '+':
00088   case '-':
00089   case '/':
00090   case '<':
00091   case '>':
00092   case '=':
00093   case '?':
00094   case '@':
00095   case '~':
00096     return true;
00097 
00098     // Other characters that can appear "naked" in a character constant:
00099   case '^':
00100   case '.':
00101   case ',':
00102   case ':':
00103   case ';':
00104   case '[':
00105   case ']':
00106   case '\'':
00107   case '#':
00108   case '`':
00109   case '(':
00110   case ')':
00111     return true;
00112   default:
00113     return false;
00114   }
00115 }
00116 
00117 bool
00118 LitValue::valid_charpoint(ucs4_t ucs4)
00119 {
00120   if (valid_char_printable(ucs4))
00121     return true;
00122 
00123   return u_isgraph(ucs4);
00124 }
00125 
00126 bool
00127 LitValue::valid_charpunct(ucs4_t ucs4)
00128 {
00129   if (strchr("!\"#$%&'()*+,-./:;{}<=>?@[\\]^_`|~", ucs4))
00130     return true;
00131   return false;
00132 }
00133 
00134 unsigned
00135 LitValue::validate_string(const char *s)
00136 {
00137   const char *spos = s;
00138   ucs4_t c;
00139 
00140   assert(*spos == '"');
00141   spos++;
00142 
00143   while (*spos != '"') {
00144     const char *snext;
00145     c = DecodeStringCharacter(spos, &snext); //&OK
00146     if (c < 0)
00147       return spos - s;
00148 
00149     spos = snext;
00150   }
00151 
00152   return 0;
00153 }
00154 
00175 
00176 #define ESCAPED_LITERAL(s, cp)  \
00177   { "\\{" s "}" , cp },         \
00178   { "'\\" s "'", cp }
00179 #define SINGLE_LITERAL(s, cp)  \
00180   { "\\" s , cp },             \
00181   { "'\\" s "'", cp }
00182 
00183 LitValue::EscapedLiteral
00184 LitValue::EscapedLiteralMap[] = {
00185   ESCAPED_LITERAL("space", ' '),
00186   ESCAPED_LITERAL("linefeed", '\n'),
00187   SINGLE_LITERAL("n", '\n' ),
00188   ESCAPED_LITERAL("return", '\r'),
00189   SINGLE_LITERAL("r", '\r' ),
00190   ESCAPED_LITERAL("tab", '\t'),
00191   SINGLE_LITERAL("t", '\t' ),
00192   ESCAPED_LITERAL("backspace", '\b'),
00193   SINGLE_LITERAL("b", '\b' ),
00194   ESCAPED_LITERAL("formfeed", '\f'),
00195   SINGLE_LITERAL("f", '\f' ),
00196   SINGLE_LITERAL("s", ' ' ),
00197   ESCAPED_LITERAL("backslash", '\\'),
00198   SINGLE_LITERAL("\\", '\\' ),
00199   ESCAPED_LITERAL("dquote", '\"'),
00200   SINGLE_LITERAL("\"", '\"' ),
00201   ESCAPED_LITERAL("squote", '\''),
00202   SINGLE_LITERAL("\'", '\'' )
00203 };
00204 const size_t LitValue::EscapedLiteralMapLength = 
00205   (sizeof(LitValue::EscapedLiteralMap) /
00206    sizeof(LitValue::EscapedLiteralMap[0]));
00207 
00208 ucs4_t
00209 LitValue::GetEscapedCodePoint(const char *escapedLiteral)
00210 {
00211   for (size_t i = 0; i < EscapedLiteralMapLength; i++) {
00212     if (strcmp(EscapedLiteralMap[i].s, escapedLiteral) == 0) {
00213       return EscapedLiteralMap[i].codePoint;
00214     }
00215   }
00216 
00217   return -1;
00218 }
00219 
00220 ucs4_t
00221 LitValue::DecodeNumericCharacter(const char *s, const char **snext)
00222 {
00223   ucs4_t codePoint = 0;
00224   unsigned radix = 10;
00225 
00226   // Could be unicode escape:
00227   if (s[0] == 'U' && s[1] == '+') {
00228     s = s + 2;
00229     radix = 16;
00230   }
00231   else if ((s[0] == '0') && (s[1] == 'x')) { // hexadecimal
00232     s = s + 2;
00233     radix = 16;
00234   }
00235   else if ((s[0] == '0') && (s[1] == 'o')) { // octal
00236     s = s + 2;
00237     radix = 8;
00238   }
00239   else if ((s[0] == '0') && (s[1] == 'b')) { // binary
00240     s = s + 2;
00241     radix = 2;
00242   }
00243   else if (s[0] == '0') {       // C-style octal
00244     s = s + 1;
00245     radix = 8;
00246   }
00247   else if (!isdigit(s[0])) {
00248     if (snext) *snext = s;
00249     return -1;
00250   }
00251 
00252   for (;;) {
00253     long dv = digitValue(*s, 16);
00254     if (dv < 0)                 // exits on NUL, ', or }
00255       break;
00256 
00257     codePoint *= 16;
00258     codePoint += dv;
00259 
00260     if (codePoint > UCHAR_MAX_VALUE) {
00261       if (snext) *snext = s;
00262       return -1;
00263     }
00264     s++;
00265   }
00266 
00267   if (snext) *snext = s;
00268   return codePoint;
00269 }
00270 
00271 ucs4_t
00272 LitValue::DecodeStringCharacter(const char *s, const char **next)
00273 {
00274   const char *sBegin = s;
00275   const char *snext = s + 1;
00276 
00277   ucs4_t c = sherpa::utf8_decode(s, &snext);
00278 
00279   if (c == ' ') {
00280     if (next) *next = snext;
00281     if (DEBUG_DECODE)
00282       std::cerr << "DecodeStringChar handles { } giving "
00283                 << (ucs4_t)' '
00284                 << std::endl;
00285     return ' ';
00286   }
00287   else if (c != '\\') {
00288     if (!u_isgraph(c)) {
00289       if (DEBUG_DECODE)
00290         std::cerr << "DecodeStringChar handles {"
00291                   << (char) c
00292                   << "} giving -1"
00293                   << std::endl;
00294       return -1;
00295     }
00296 
00297     if (DEBUG_DECODE)
00298       std::cerr << "DecodeStringChar handles {"
00299                 << (char) c
00300                 << "} giving "
00301                 << (ucs4_t)c
00302                 << std::endl;
00303 
00304     if (next) *next = snext;
00305     return c;
00306   }
00307   else {
00308     s = snext;
00309     c = sherpa::utf8_decode(s, &snext);
00310 
00311     if (c == '{' ) {
00312       while (c != '}')
00313         c = sherpa::utf8_decode(s, &snext);
00314     }
00315 
00316     std::string theEscape(sBegin, snext - sBegin);
00317     ucs4_t codePoint = GetEscapedCodePoint(theEscape.c_str());
00318     if (codePoint < 0) {
00319       if (DEBUG_DECODE)
00320         std::cerr << "DecodeStringChar handles {"
00321                   << theEscape
00322                   << "} giving -1"
00323                   << std::endl;
00324       return -1;
00325     }
00326 
00327     if (DEBUG_DECODE)
00328       std::cerr << "DecodeStringChar handles {"
00329                 << theEscape
00330                 << "} giving "
00331                 << (ucs4_t)codePoint
00332                 << std::endl;
00333 
00334     if (next) *next = snext;
00335     return codePoint;
00336   }
00337 }
00338 
00339 ucs4_t
00340 LitValue::DecodeBlockCharacter(const char *s)
00341 {
00342   const char *snext;
00343 
00344   ucs4_t codePoint = GetEscapedCodePoint(s);
00345   if (codePoint >= 0)
00346     return codePoint;
00347 
00348   if (s[1] == '\\') {
00349     // Any remaining escape must be numeric:
00350 
00351     s = s + 2;
00352     codePoint = DecodeNumericCharacter(s, &snext);
00353     if (codePoint < 0)
00354       return codePoint;
00355 
00356     if (snext == s) {
00357       // It wasn't a numeric escape:
00358       return -1;
00359     }
00360 
00361     return codePoint;
00362   }
00363 
00364   else {
00365     s = s + 1;                  // skip the '\'
00366 
00367     // This is a non-escaped character:
00368     codePoint = sherpa::utf8_decode(s, &snext); //&OK
00369     if (codePoint < 0 || snext == s)
00370       return -1;
00371 
00372     if (valid_charpoint(codePoint) || valid_charpunct(codePoint))
00373       return codePoint;
00374   }
00375 
00376   return -1;
00377 }
00378 
00379 ucs4_t
00380 LitValue::DecodeCharacter(const std::string& s)
00381 {
00382   const char *str = s.c_str();
00383 
00384   ucs4_t codePoint = DecodeBlockCharacter(str);
00385 
00386 #if 0
00387   std::cerr << "Decoding character {" << s << "} gives " 
00388             << codePoint << std::endl;
00389 #endif
00390 
00391   return codePoint;
00392 }
00393 
/* True for the three characters that asString() prefixes with a
   backslash when printing a character literal: the double quote, the
   single quote, and the backslash itself. */
static bool
needsBackslashEscape(uint32_t c)
{
  switch (c) {
  case '"':
  case '\'':
  case '\\':
    return true;

  default:
    return false;
  }
}
00399 
/* True when c lies in the ASCII printable range [0x20, 0x7e], i.e.
   from the space character through '~'. Some characters in this range
   still need escaping when printed; that is decided separately. */
static bool
asciiPrintableCharacter(uint32_t c)
{
  const uint32_t firstPrintable = 0x20;  /* space */
  const uint32_t lastPrintable = 0x7e;   /* '~' */

  if (c < firstPrintable)
    return false;

  return c <= lastPrintable;
}
00407 
00408 std::string
00409 LitValue::asString() const
00410 {
00411   std::stringstream ss;
00412 
00413   switch(litType) {
00414   case lt_bool:
00415     return (b == true) ? "true" : "false";
00416 
00417   case lt_char:
00418     {
00419       if (asciiPrintableCharacter(c))
00420         ss << (needsBackslashEscape(c) ? "'\\" : "'")
00421            << (unsigned char) c << "'";
00422       else
00423         ss << (unsigned long)(c);
00424 
00425       return ss.str();
00426     }
00427   case lt_int:
00428     ss << i;                    // defer to bignum printer
00429     return ss.str();
00430 
00431   case lt_float:
00432     {
00433       char buf[256];
00434       snprintf(buf, sizeof(buf), " %f\n", d);
00435       return buf;
00436     }
00437 
00438   case lt_string:
00439     return s;
00440 
00441   default:
00442     return "BAD LITERAL TYPE";
00443   }
00444 }

Generated on Thu May 17 23:59:16 2012 for BitC Compiler by  doxygen 1.4.7