00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 #include <assert.h>
00039 #include <stdint.h>
00040 #include <stdlib.h>
00041 #include <dirent.h>
00042 #include <string.h>
00043 #include <fstream>
00044 #include <iostream>
00045 #include <string>
00046 #include <sstream>
00047
00048 #include <unicode/uchar.h>
00049
00050 #include <libsherpa/utf8.hxx>
00051
00052 #include "LitValue.hxx"
00053
00054 #define DEBUG_DECODE false
00055
00056 long
00057 LitValue::digitValue(ucs4_t ucs4, unsigned radix)
00058 {
00059 long l = -1;
00060
00061 if (ucs4 >= '0' && ucs4 <= '9')
00062 l = ucs4 - '0';
00063 if (ucs4 >= 'a' && ucs4 <= 'f')
00064 l = ucs4 - 'a' + 10;
00065 if (ucs4 >= 'A' && ucs4 <= 'F')
00066 l = ucs4 - 'A' + 10;
00067
00068 if (l > radix)
00069 l = -1;
00070 return l;
00071 }
00072
00073 bool
00074 LitValue::valid_char_printable(ucs4_t ucs4)
00075 {
00076 switch (ucs4) {
00077 case '_':
00078 return true;
00079
00080
00081 case '!':
00082 case '$':
00083 case '%':
00084 case '&':
00085 case '|':
00086 case '*':
00087 case '+':
00088 case '-':
00089 case '/':
00090 case '<':
00091 case '>':
00092 case '=':
00093 case '?':
00094 case '@':
00095 case '~':
00096 return true;
00097
00098
00099 case '^':
00100 case '.':
00101 case ',':
00102 case ':':
00103 case ';':
00104 case '[':
00105 case ']':
00106 case '\'':
00107 case '#':
00108 case '`':
00109 case '(':
00110 case ')':
00111 return true;
00112 default:
00113 return false;
00114 }
00115 }
00116
00117 bool
00118 LitValue::valid_charpoint(ucs4_t ucs4)
00119 {
00120 if (valid_char_printable(ucs4))
00121 return true;
00122
00123 return u_isgraph(ucs4);
00124 }
00125
00126 bool
00127 LitValue::valid_charpunct(ucs4_t ucs4)
00128 {
00129 if (strchr("!\"#$%&'()*+,-./:;{}<=>?@[\\]^_`|~", ucs4))
00130 return true;
00131 return false;
00132 }
00133
00134 unsigned
00135 LitValue::validate_string(const char *s)
00136 {
00137 const char *spos = s;
00138 ucs4_t c;
00139
00140 assert(*spos == '"');
00141 spos++;
00142
00143 while (*spos != '"') {
00144 const char *snext;
00145 c = DecodeStringCharacter(spos, &snext);
00146 if (c < 0)
00147 return spos - s;
00148
00149 spos = snext;
00150 }
00151
00152 return 0;
00153 }
00154
00175
00176 #define ESCAPED_LITERAL(s, cp) \
00177 { "\\{" s "}" , cp }, \
00178 { "'\\" s "'", cp }
00179 #define SINGLE_LITERAL(s, cp) \
00180 { "\\" s , cp }, \
00181 { "'\\" s "'", cp }
00182
00183 LitValue::EscapedLiteral
00184 LitValue::EscapedLiteralMap[] = {
00185 ESCAPED_LITERAL("space", ' '),
00186 ESCAPED_LITERAL("linefeed", '\n'),
00187 SINGLE_LITERAL("n", '\n' ),
00188 ESCAPED_LITERAL("return", '\r'),
00189 SINGLE_LITERAL("r", '\r' ),
00190 ESCAPED_LITERAL("tab", '\t'),
00191 SINGLE_LITERAL("t", '\t' ),
00192 ESCAPED_LITERAL("backspace", '\b'),
00193 SINGLE_LITERAL("b", '\b' ),
00194 ESCAPED_LITERAL("formfeed", '\f'),
00195 SINGLE_LITERAL("f", '\f' ),
00196 SINGLE_LITERAL("s", ' ' ),
00197 ESCAPED_LITERAL("backslash", '\\'),
00198 SINGLE_LITERAL("\\", '\\' ),
00199 ESCAPED_LITERAL("dquote", '\"'),
00200 SINGLE_LITERAL("\"", '\"' ),
00201 ESCAPED_LITERAL("squote", '\''),
00202 SINGLE_LITERAL("\'", '\'' )
00203 };
00204 const size_t LitValue::EscapedLiteralMapLength =
00205 (sizeof(LitValue::EscapedLiteralMap) /
00206 sizeof(LitValue::EscapedLiteralMap[0]));
00207
00208 ucs4_t
00209 LitValue::GetEscapedCodePoint(const char *escapedLiteral)
00210 {
00211 for (size_t i = 0; i < EscapedLiteralMapLength; i++) {
00212 if (strcmp(EscapedLiteralMap[i].s, escapedLiteral) == 0) {
00213 return EscapedLiteralMap[i].codePoint;
00214 }
00215 }
00216
00217 return -1;
00218 }
00219
00220 ucs4_t
00221 LitValue::DecodeNumericCharacter(const char *s, const char **snext)
00222 {
00223 ucs4_t codePoint = 0;
00224 unsigned radix = 10;
00225
00226
00227 if (s[0] == 'U' && s[1] == '+') {
00228 s = s + 2;
00229 radix = 16;
00230 }
00231 else if ((s[0] == '0') && (s[1] == 'x')) {
00232 s = s + 2;
00233 radix = 16;
00234 }
00235 else if ((s[0] == '0') && (s[1] == 'o')) {
00236 s = s + 2;
00237 radix = 8;
00238 }
00239 else if ((s[0] == '0') && (s[1] == 'b')) {
00240 s = s + 2;
00241 radix = 2;
00242 }
00243 else if (s[0] == '0') {
00244 s = s + 1;
00245 radix = 8;
00246 }
00247 else if (!isdigit(s[0])) {
00248 if (snext) *snext = s;
00249 return -1;
00250 }
00251
00252 for (;;) {
00253 long dv = digitValue(*s, 16);
00254 if (dv < 0)
00255 break;
00256
00257 codePoint *= 16;
00258 codePoint += dv;
00259
00260 if (codePoint > UCHAR_MAX_VALUE) {
00261 if (snext) *snext = s;
00262 return -1;
00263 }
00264 s++;
00265 }
00266
00267 if (snext) *snext = s;
00268 return codePoint;
00269 }
00270
00271 ucs4_t
00272 LitValue::DecodeStringCharacter(const char *s, const char **next)
00273 {
00274 const char *sBegin = s;
00275 const char *snext = s + 1;
00276
00277 ucs4_t c = sherpa::utf8_decode(s, &snext);
00278
00279 if (c == ' ') {
00280 if (next) *next = snext;
00281 if (DEBUG_DECODE)
00282 std::cerr << "DecodeStringChar handles { } giving "
00283 << (ucs4_t)' '
00284 << std::endl;
00285 return ' ';
00286 }
00287 else if (c != '\\') {
00288 if (!u_isgraph(c)) {
00289 if (DEBUG_DECODE)
00290 std::cerr << "DecodeStringChar handles {"
00291 << (char) c
00292 << "} giving -1"
00293 << std::endl;
00294 return -1;
00295 }
00296
00297 if (DEBUG_DECODE)
00298 std::cerr << "DecodeStringChar handles {"
00299 << (char) c
00300 << "} giving "
00301 << (ucs4_t)c
00302 << std::endl;
00303
00304 if (next) *next = snext;
00305 return c;
00306 }
00307 else {
00308 s = snext;
00309 c = sherpa::utf8_decode(s, &snext);
00310
00311 if (c == '{' ) {
00312 while (c != '}')
00313 c = sherpa::utf8_decode(s, &snext);
00314 }
00315
00316 std::string theEscape(sBegin, snext - sBegin);
00317 ucs4_t codePoint = GetEscapedCodePoint(theEscape.c_str());
00318 if (codePoint < 0) {
00319 if (DEBUG_DECODE)
00320 std::cerr << "DecodeStringChar handles {"
00321 << theEscape
00322 << "} giving -1"
00323 << std::endl;
00324 return -1;
00325 }
00326
00327 if (DEBUG_DECODE)
00328 std::cerr << "DecodeStringChar handles {"
00329 << theEscape
00330 << "} giving "
00331 << (ucs4_t)codePoint
00332 << std::endl;
00333
00334 if (next) *next = snext;
00335 return codePoint;
00336 }
00337 }
00338
00339 ucs4_t
00340 LitValue::DecodeBlockCharacter(const char *s)
00341 {
00342 const char *snext;
00343
00344 ucs4_t codePoint = GetEscapedCodePoint(s);
00345 if (codePoint >= 0)
00346 return codePoint;
00347
00348 if (s[1] == '\\') {
00349
00350
00351 s = s + 2;
00352 codePoint = DecodeNumericCharacter(s, &snext);
00353 if (codePoint < 0)
00354 return codePoint;
00355
00356 if (snext == s) {
00357
00358 return -1;
00359 }
00360
00361 return codePoint;
00362 }
00363
00364 else {
00365 s = s + 1;
00366
00367
00368 codePoint = sherpa::utf8_decode(s, &snext);
00369 if (codePoint < 0 || snext == s)
00370 return -1;
00371
00372 if (valid_charpoint(codePoint) || valid_charpunct(codePoint))
00373 return codePoint;
00374 }
00375
00376 return -1;
00377 }
00378
00379 ucs4_t
00380 LitValue::DecodeCharacter(const std::string& s)
00381 {
00382 const char *str = s.c_str();
00383
00384 ucs4_t codePoint = DecodeBlockCharacter(str);
00385
00386 #if 0
00387 std::cerr << "Decoding character {" << s << "} gives "
00388 << codePoint << std::endl;
00389 #endif
00390
00391 return codePoint;
00392 }
00393
/// Report whether @p c must be preceded by a backslash when printed
/// inside quotes: true for double quote, single quote and backslash.
static bool
needsBackslashEscape(uint32_t c)
{
  switch (c) {
  case '"':
  case '\'':
  case '\\':
    return true;

  default:
    return false;
  }
}
00399
/// Report whether @p c lies in the printable ASCII range, i.e.
/// 0x20 (space) through 0x7e ('~') inclusive.
static bool
asciiPrintableCharacter(uint32_t c)
{
  // Below space: control characters.
  if (c < 0x20)
    return false;

  // 0x7f (DEL) and everything above it is outside the range.
  if (c >= 0x7f)
    return false;

  return true;
}
00407
00408 std::string
00409 LitValue::asString() const
00410 {
00411 std::stringstream ss;
00412
00413 switch(litType) {
00414 case lt_bool:
00415 return (b == true) ? "true" : "false";
00416
00417 case lt_char:
00418 {
00419 if (asciiPrintableCharacter(c))
00420 ss << (needsBackslashEscape(c) ? "'\\" : "'")
00421 << (unsigned char) c << "'";
00422 else
00423 ss << (unsigned long)(c);
00424
00425 return ss.str();
00426 }
00427 case lt_int:
00428 ss << i;
00429 return ss.str();
00430
00431 case lt_float:
00432 {
00433 char buf[256];
00434 snprintf(buf, sizeof(buf), " %f\n", d);
00435 return buf;
00436 }
00437
00438 case lt_string:
00439 return s;
00440
00441 default:
00442 return "BAD LITERAL TYPE";
00443 }
00444 }