string.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2006, The EROS Group, LLC.
00003  * All rights reserved.
00004  *
00005  * Redistribution and use in source and binary forms, with or
00006  * without modification, are permitted provided that the following
00007  * conditions are met:
00008  *
00009  *   - Redistributions of source code must contain the above 
00010  *     copyright notice, this list of conditions, and the following
00011  *     disclaimer. 
00012  *
00013  *   - Redistributions in binary form must reproduce the above
00014  *     copyright notice, this list of conditions, and the following
00015  *     disclaimer in the documentation and/or other materials 
00016  *     provided with the distribution.
00017  *
00018  *   - Neither the names of the copyright holders nor the names of any
00019  *     of any contributors may be used to endorse or promote products
00020  *     derived from this software without specific prior written
00021  *     permission. 
00022  *
00023  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00024  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00025  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00026  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00027  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00028  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00029  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00030  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00031  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00032  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00033  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00034  */
00035 
00036 #include <unistd.h>
00037 #include <fcntl.h>
00038 #include "BUILD/bitc-runtime.h"
00039 
00048 static bitc_char_t
00049 utf8_decode(const char *s, const char **snext)
00050 {
00051   uint32_t ucs4=0;
00052   const uint8_t *sb = (uint8_t *)s;
00053 
00054   if (*sb <= 127) {
00055     ucs4 = *sb++;
00056   }
00057   else if (*sb <= 223) {
00058     ucs4 = (*sb++ - 192)*64;
00059     ucs4 += (*sb++ - 128);
00060   }
00061   else if (*sb <= 239) {
00062     ucs4 = (*sb++ - 192)*4096;
00063     ucs4 += (*sb++ - 128)*64;
00064     ucs4 += (*sb++ - 128);
00065   }
00066   else if (*sb <= 247) {
00067     ucs4 = (*sb++ - 192)*262144;
00068     ucs4 += (*sb++ - 128)*4096;
00069     ucs4 += (*sb++ - 128)*64;
00070     ucs4 += (*sb++ - 128);
00071   }
00072   else if (*sb <= 251) {
00073     ucs4 = (*sb++ - 192)*16777216;
00074     ucs4 += (*sb++ - 128)*262144;
00075     ucs4 += (*sb++ - 128)*4096;
00076     ucs4 += (*sb++ - 128)*64;
00077     ucs4 += (*sb++ - 128);
00078   }
00079   else if (*sb <= 253) {
00080     ucs4 = (*sb++ - 192)*1073741824;
00081     ucs4 += (*sb++ - 128)*16777216;
00082     ucs4 += (*sb++ - 128)*262144;
00083     ucs4 += (*sb++ - 128)*4096;
00084     ucs4 += (*sb++ - 128)*64;
00085     ucs4 += (*sb++ - 128);
00086   }
00087 
00088   if (snext) *snext = (char *)sb;
00089   return ucs4;
00090 }
00091 
00092 
/**
 * Encode one code point as a NUL-terminated UTF-8 byte sequence.
 *
 * Uses the original 1- to 6-byte UTF-8 forms, so any value up to
 * 0x7FFFFFFF is representable. Larger values produce an empty string.
 *
 * @param ucs4  Code point to encode.
 * @param utf   Output buffer; receives up to six encoded bytes followed
 *              by a terminating 0.
 * @return Number of encoded bytes written, not counting the terminator
 *         (0 when @p ucs4 exceeds 0x7FFFFFFF).
 */
static unsigned
utf8_encode(uint32_t ucs4, char utf[7])
{
  unsigned nbytes;              /* total length of the encoding */
  uint32_t marker;              /* length marker bits of the lead byte */

  if (ucs4 <= 0x7f) {
    nbytes = 1;
    marker = 0x00;
  }
  else if (ucs4 <= 0x7ff) {
    nbytes = 2;
    marker = 0xc0;
  }
  else if (ucs4 <= 0xffff) {
    nbytes = 3;
    marker = 0xe0;
  }
  else if (ucs4 <= 0x1fffff) {
    nbytes = 4;
    marker = 0xf0;
  }
  else if (ucs4 <= 0x3ffffff) {
    nbytes = 5;
    marker = 0xf8;
  }
  else if (ucs4 <= 0x7fffffff) {
    nbytes = 6;
    marker = 0xfc;
  }
  else {
    nbytes = 0;
    marker = 0;
  }

  /* Emit continuation bytes back to front, six payload bits each;
     whatever is left afterwards belongs in the lead byte. */
  uint32_t rest = ucs4;
  for (unsigned pos = nbytes; pos > 1; pos--) {
    utf[pos - 1] = (char)(0x80u | (rest & 0x3fu));
    rest >>= 6;
  }

  if (nbytes > 0)
    utf[0] = (char)(marker + rest);

  utf[nbytes] = 0;

  return nbytes;
}
00144 
00145 
00148 bitc_word_t
00149 DEFUN(bitc_string_length, bitc_string_t *str)
00150 {
00151   bitc_word_t len = 0;
00152   const char *s = str->s;
00153   while (*s) {
00154     utf8_decode(s, &s);
00155     len++;
00156   }
00157 
00158   return len;
00159 }
00160 DEFCLOSURE(bitc_string_length);
00161 
00165 bitc_char_t
00166 DEFUN(bitc_string_nth, bitc_string_t *str, bitc_word_t ndx)
00167 {
00168   bitc_word_t len = ndx;
00169   const char *s = str->s;
00170 
00171   while(*s && len) {
00172     utf8_decode(s, &s);
00173     len--;
00174   }
00175 
00176   if (*s == 0)
00177     BITC_THROW(&val_ExIndexBoundsError);
00178 
00179   return utf8_decode(s, 0);
00180 }
00181 DEFCLOSURE(bitc_string_nth);
00182 
00191 bitc_string_t *
00192 DEFUN(bitc_vector_string, arg_0_bitc_vector_string vec)
00193 {
00194   size_t len = vec->length;
00195   bitc_string_t *tmp = (bitc_string_t *) 
00196     GC_ALLOC_ATOMIC(sizeof(bitc_string_t));
00197 
00198   char *max = (char *) GC_ALLOC_ATOMIC(sizeof(char) * len * 7);
00199   size_t totLen = 0;
00200   for(size_t i=0; i < len; i++) {
00201     char utf8[7];
00202     size_t thisLen = utf8_encode(vec->elem[i], utf8);    
00203     for(size_t j=0; j < thisLen; j++) {
00204       max[totLen] = utf8[thisLen]; 
00205       totLen++;
00206     }
00207   }
00208 
00209   char* exact = (char *) GC_ALLOC_ATOMIC(sizeof(char) * totLen);  
00210   for(size_t i=0; i < totLen; i++)
00211     exact[i] = max[i];
00212 
00213   tmp->length = totLen;
00214   tmp->s = exact;
00215   return tmp;
00216 }
00217 DEFCLOSURE(bitc_vector_string);
00218 
/*
 * Disabled (#if 0) draft of bitc_string_map; none of this is compiled.
 *
 * The body is a copy of bitc_string_nth and is not usable as written:
 *   - it reads `ndx`, which is not a parameter of this function;
 *   - the `fn` argument is never used;
 *   - it returns a decoded character although the declared return type
 *     is bitc_unit_t;
 *   - the DEFCLOSURE below names bitc_string_nth, not bitc_string_map.
 * It has to be rewritten before the #if 0 guard can be removed.
 */
00219 #if 0
00220 bitc_unit_t
00221 DEFUN(bitc_string_map, bitc_string_t *str, CL1* fn)
00222 {
00223   bitc_word_t len = ndx;
00224   const char *s = str->s;
00225 
00226   while(*s && len) {
00227     utf8_decode(s, &s);
00228     len--;
00229   }
00230 
00231   if (*s == 0)
00232     bitc_throw(&val_ExIndexBoundsError);
00233 
00234   return utf8_decode(s, 0);
00235 }
00236 DEFCLOSURE(bitc_string_nth);
00237 #endif

Generated on Sat Feb 4 23:59:30 2012 for BitC Runtime Library by  doxygen 1.4.7