utf8.cxx

Go to the documentation of this file.
00001 /**************************************************************************
00002  *
00003  * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, The EROS
00004  *   Group, LLC.
00005  * Copyright (C) 2004, 2005, 2006, Johns Hopkins University.
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or
00009  * without modification, are permitted provided that the following
00010  * conditions are met:
00011  *
00012  *   - Redistributions of source code must contain the above
00013  *     copyright notice, this list of conditions, and the following
00014  *     disclaimer.
00015  *
00016  *   - Redistributions in binary form must reproduce the above
00017  *     copyright notice, this list of conditions, and the following
00018  *     disclaimer in the documentation and/or other materials
00019  *     provided with the distribution.
00020  *
00021  *   - Neither the names of the copyright holders nor the names of any
00022  *     of any contributors may be used to endorse or promote products
00023  *     derived from this software without specific prior written
00024  *     permission.
00025  *
00026  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00027  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00028  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00029  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00030  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00031  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00032  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00033  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00034  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00035  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00036  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00037  *
00038  **************************************************************************/
00039 
00040 #include <stdint.h>
00041 #include "utf8.hxx"
00042 
00043 namespace sherpa {
// Decode the single UTF-8 encoded character that starts at s and
// return its code point.
//
// If snext is non-null, *snext is set to the first byte following
// the decoded sequence, i.e. the start of the next character.
//
// The original (pre-RFC 3629) UTF-8 definition is implemented:
// sequences of up to six bytes are accepted, so every value in the
// range 0..0x7fffffff can be decoded, including private code points.
//
// Limitations (unchanged, callers must be aware of them):
//
//  - The input is assumed to be canonically encoded (shortest form),
//    but this is not checked. A lexer that implements ungetChar by
//    calling utf8_encode on the character to learn how far to back up
//    will back up by the wrong amount for an over-long encoding.
//  - Continuation bytes are not validated, and up to five bytes after
//    the lead byte are read unconditionally, so s must point at a
//    complete sequence.
//  - A byte in 0x80..0xbf found in lead position is treated like a
//    two-byte lead and yields a meaningless value.
//  - For the lead bytes 0xfe and 0xff the function returns ~0u and
//    does not advance: *snext is set to s itself.
uint32_t
utf8_decode(const char *s, const char **snext)
{
  const uint8_t *sb = (const uint8_t *)s;
  uint32_t ucs4 = ~0u;
  unsigned ntrail = 0;          // continuation bytes still to consume

  // Strip the length tag from the lead byte. The tag differs per
  // sequence length: 0xc0, 0xe0, 0xf0, 0xf8 and 0xfc. Subtracting
  // 0xc0 in every case leaves tag bits in the payload of all
  // sequences longer than two bytes.
  if (*sb <= 127) {
    ucs4 = *sb++;
  }
  else if (*sb <= 223) {
    ucs4 = *sb++ - 192u;
    ntrail = 1;
  }
  else if (*sb <= 239) {
    ucs4 = *sb++ - 224u;
    ntrail = 2;
  }
  else if (*sb <= 247) {
    ucs4 = *sb++ - 240u;
    ntrail = 3;
  }
  else if (*sb <= 251) {
    ucs4 = *sb++ - 248u;
    ntrail = 4;
  }
  else if (*sb <= 253) {
    ucs4 = *sb++ - 252u;
    ntrail = 5;
  }

  // Each continuation byte contributes six payload bits. All of the
  // arithmetic is unsigned so that even malformed input cannot
  // trigger signed overflow.
  while (ntrail > 0) {
    ucs4 = ucs4 * 64u + (*sb++ - 128u);
    ntrail--;
  }

  if (snext) *snext = (const char *)sb;
  return ucs4;
}
00100 
// Encode the code point ucs4 as a NUL-terminated UTF-8 sequence in
// utf and return the number of bytes written, not counting the NUL.
//
// The original six-byte UTF-8 scheme is used, so every value up to
// 0x7fffffff is representable. utf must have room for seven bytes
// (six encoded bytes plus the terminator). For values above
// 0x7fffffff nothing is encoded: utf becomes the empty string and 0
// is returned.
unsigned
utf8_encode(uint32_t ucs4, char utf[7])
{
  unsigned len;                 // total bytes in the encoded sequence
  unsigned tag;                 // length marker carried by the lead byte

  if (ucs4 < 0x80u)            { len = 1; tag = 0x00u; }
  else if (ucs4 < 0x800u)      { len = 2; tag = 0xc0u; }
  else if (ucs4 < 0x10000u)    { len = 3; tag = 0xe0u; }
  else if (ucs4 < 0x200000u)   { len = 4; tag = 0xf0u; }
  else if (ucs4 < 0x4000000u)  { len = 5; tag = 0xf8u; }
  else if (ucs4 < 0x80000000u) { len = 6; tag = 0xfcu; }
  else                         { len = 0; tag = 0x00u; }

  if (len != 0) {
    uint32_t rest = ucs4;

    // Fill the continuation bytes from the last one backwards, six
    // payload bits at a time; what is left belongs in the lead byte.
    for (unsigned pos = len - 1; pos > 0; pos--) {
      utf[pos] = (char)(0x80u | (rest & 0x3fu));
      rest >>= 6;
    }
    utf[0] = (char)(tag | rest);
  }

  utf[len] = 0;

  return len;
}
00144 
00145 } /* namespace sherpa */

Generated on Sat Feb 4 23:59:29 2012 for BitC Compiler by  doxygen 1.4.7