Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members

utf8.cpp File Reference

#include "utf8.h"
#include <cctype>
#include "util.h"

Include dependency graph for utf8.cpp:

Go to the source code of this file.

Functions

static std::string noAbbrev ("")
std::string ReplaceGreekLetterAbbr (std::string str)
bool UTF8Decode (const char *str, int pos, int length, wchar_t &ch)
bool UTF8Decode (const std::string &str, int pos, wchar_t &ch)
int UTF8Encode (wchar_t ch, char *s)
int UTF8Length (const std::string &s)
 Return the number of characters encoded by a UTF-8 string.
wchar_t UTF8Normalize (wchar_t ch)
int UTF8StringCompare (const std::string &s0, const std::string &s1, size_t n)
int UTF8StringCompare (const std::string &s0, const std::string &s1)

Variables

static const char * canonicalAbbrevs []
static const char * greekAlphabet []
static const char * greekAlphabetUTF8 []
unsigned int WGL4_Normalization_00 [256]
unsigned int WGL4_Normalization_01 [256]
unsigned int WGL4_Normalization_02 [256]
unsigned int WGL4_Normalization_03 [256]
unsigned int WGL4_Normalization_04 [256]
unsigned int WGL4_Normalization_1e [256]
unsigned int WGL4_Normalization_21 [256]
unsigned int * WGL4NormalizationTables [256]


Function Documentation

static std::string noAbbrev ("")  [static]
 

Referenced by Greek::canonicalAbbreviation().

std::string ReplaceGreekLetterAbbr (std::string str)
 

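Replaces the Greek letter abbreviation at the beginning of a string with the UTF-8 representation of that letter. A digit 1, 2, or 3 immediately following the abbreviation is also replaced with the corresponding UTF-8 superscript character. If the string does not begin with a known abbreviation, it is returned unchanged.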

Definition at line 765 of file utf8.cpp.

References Greek::abbrevs, greekAlphabetUTF8, UTF8_SUPERSCRIPT_1, UTF8_SUPERSCRIPT_2, and UTF8_SUPERSCRIPT_3.

Referenced by Renderer::addLabel(), Renderer::addSortedLabel(), addStars(), CelListViewItem::CelListViewItem(), StarDatabase::getStarNameList(), SelectionPopup::init(), menuContext(), and CelestiaCore::renderOverlay().

00766 {
00767     std::string ret = str;
00768     
00769     if (str[0] >= 'A' && str[0] <= 'Z' &&
00770         str[1] >= 'A' && str[1] <= 'Z')
00771     {
00772         // Linear search through all letter abbreviations
00773         for (int i = 0; i < Greek::instance->nLetters; i++)
00774         {
00775             const std::string& abbr = Greek::instance->abbrevs[i];
00776             if (str.compare(0, abbr.length(), abbr) == 0)
00777             {
00778                 std::string superscript;
00779                 if (str.length() > abbr.length())
00780                 {
00781                     if (str[abbr.length()] == '1')
00782                         superscript = UTF8_SUPERSCRIPT_1;
00783                     else if (str[abbr.length()] == '2')
00784                         superscript = UTF8_SUPERSCRIPT_2;
00785                     else if (str[abbr.length()] == '3')
00786                         superscript = UTF8_SUPERSCRIPT_3;
00787                 }
00788 
00789                 if (superscript.empty())
00790                 {
00791                     ret = std::string(greekAlphabetUTF8[i]) + str.substr(abbr.length());
00792                 }
00793                 else
00794                 {
00795                     ret = std::string(greekAlphabetUTF8[i]) + superscript +
00796                         str.substr(abbr.length() + 1);
00797                 }
00798                         
00799                 break;
00800             }
00801         }
00802     }
00803     
00804     return ret;
00805 }

bool UTF8Decode (const char *str, int pos, int length, wchar_t &ch)
 

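Decode the UTF-8 character in string str beginning at position pos. The decoded character is returned in ch. The function returns true if a sequence was decoded, and false if the sequence would extend past the end of the string or if its encoded length, as determined from the first byte, is not between 1 and 6. Continuation bytes and overlong encodings are not currently validated.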

Definition at line 365 of file utf8.cpp.

References UTF8EncodedSizeFromFirstByte().

00366 {
00367     unsigned int c0 = (unsigned int) str[pos];
00368     int charlen = UTF8EncodedSizeFromFirstByte(c0);
00369 
00370     // Bad UTF-8 character that extends past end of string
00371     if (pos + charlen > length)
00372         return false;
00373 
00374     // TODO: Should check that the bytes of characters after the first are all
00375     // of the form 10xxxxxx
00376     // TODO: Need to reject overlong encoding sequences
00377 
00378     switch (charlen)
00379     {
00380     case 1:
00381         ch = c0;
00382         return true;
00383 
00384     case 2:
00385         ch = ((c0 & 0x1f) << 6) | ((unsigned int) str[pos + 1] & 0x3f);
00386         return true;
00387 
00388     case 3:
00389         ch = ((c0 & 0x0f) << 12) |
00390             (((unsigned int) str[pos + 1] & 0x3f) << 6) |
00391             ((unsigned int)  str[pos + 2] & 0x3f);
00392         return true;
00393 
00394     case 4:
00395         ch = ((c0 & 0x07) << 18) |
00396             (((unsigned int) str[pos + 1] & 0x3f) << 12) |
00397             (((unsigned int) str[pos + 2] & 0x3f) << 6)  |
00398             ((unsigned int)  str[pos + 3] & 0x3f);
00399         return true;
00400 
00401     case 5:
00402         ch = ((c0 & 0x03) << 24) |
00403             (((unsigned int) str[pos + 1] & 0x3f) << 18) |
00404             (((unsigned int) str[pos + 2] & 0x3f) << 12) |
00405             (((unsigned int) str[pos + 3] & 0x3f) << 6)  |
00406             ((unsigned int)  str[pos + 4] & 0x3f);
00407         return true;
00408 
00409     case 6:
00410         ch = ((c0 & 0x01) << 30) |
00411             (((unsigned int) str[pos + 1] & 0x3f) << 24) |
00412             (((unsigned int) str[pos + 2] & 0x3f) << 18) |
00413             (((unsigned int) str[pos + 3] & 0x3f) << 12) |
00414             (((unsigned int) str[pos + 4] & 0x3f) << 6)  |
00415             ((unsigned int)  str[pos + 5] & 0x3f);
00416         return true;
00417 
00418     default:
00419         return false;
00420     }
00421 }

bool UTF8Decode (const std::string &str, int pos, wchar_t &ch)
 

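Decode the UTF-8 character in string str beginning at position pos. The decoded character is returned in ch. The function returns true if a sequence was decoded, and false if the sequence would extend past the end of the string or if its encoded length, as determined from the first byte, is not between 1 and 6. Continuation bytes and overlong encodings are not currently validated.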

Definition at line 303 of file utf8.cpp.

Referenced by CelestiaCore::charEntered(), TextureFont::getWidth(), Overlay::print(), Console::print(), TextureFont::render(), and UTF8StringCompare().

00304 {
00305     unsigned int c0 = (unsigned int) str[pos];
00306     int charlen = UTF8EncodedSizeFromFirstByte(c0);
00307 
00308     // Bad UTF-8 character that extends past end of string
00309     if (pos + charlen > (int)str.length())
00310         return false;
00311 
00312     // TODO: Should check that the bytes of characters after the first are all
00313     // of the form 10xxxxxx
00314     // TODO: Need to reject overlong encoding sequences
00315 
00316     switch (charlen)
00317     {
00318     case 1:
00319         ch = c0;
00320         return true;
00321 
00322     case 2:
00323         ch = ((c0 & 0x1f) << 6) | ((unsigned int) str[pos + 1] & 0x3f);
00324         return true;
00325 
00326     case 3:
00327         ch = ((c0 & 0x0f) << 12) |
00328             (((unsigned int) str[pos + 1] & 0x3f) << 6) |
00329             ((unsigned int)  str[pos + 2] & 0x3f);
00330         return true;
00331 
00332     case 4:
00333         ch = ((c0 & 0x07) << 18) |
00334             (((unsigned int) str[pos + 1] & 0x3f) << 12) |
00335             (((unsigned int) str[pos + 2] & 0x3f) << 6)  |
00336             ((unsigned int)  str[pos + 3] & 0x3f);
00337         return true;
00338 
00339     case 5:
00340         ch = ((c0 & 0x03) << 24) |
00341             (((unsigned int) str[pos + 1] & 0x3f) << 18) |
00342             (((unsigned int) str[pos + 2] & 0x3f) << 12) |
00343             (((unsigned int) str[pos + 3] & 0x3f) << 6)  |
00344             ((unsigned int)  str[pos + 4] & 0x3f);
00345         return true;
00346 
00347     case 6:
00348         ch = ((c0 & 0x01) << 30) |
00349             (((unsigned int) str[pos + 1] & 0x3f) << 24) |
00350             (((unsigned int) str[pos + 2] & 0x3f) << 18) |
00351             (((unsigned int) str[pos + 3] & 0x3f) << 12) |
00352             (((unsigned int) str[pos + 4] & 0x3f) << 6)  |
00353             ((unsigned int)  str[pos + 5] & 0x3f);
00354         return true;
00355 
00356     default:
00357         return false;
00358     }
00359 }

int UTF8Encode (wchar_t ch, char *s)
 

UTF-8 encode the Unicode character ch into the string s and return the encoded length. There should be space for at least 7 characters in s--up to six encoded bytes, plus one byte for the terminating null character.

Definition at line 427 of file utf8.cpp.

Referenced by Tokenizer::nextToken().

00428 {
00429     if (ch < 0x80)
00430     {
00431         s[0] = (char) ch;
00432         s[1] = '\0';
00433         return 1;
00434     }
00435     else if (ch < 0x800)
00436     {
00437         s[0] = (char) (0xc0 | ((ch & 0x7c0) >> 6));
00438         s[1] = (char) (0x80 | (ch & 0x3f));
00439         s[2] = '\0';
00440         return 2;
00441     }
00442     else if (ch < 0x10000)
00443     {
00444         s[0] = (char) (0xe0 | ((ch & 0xf000) >> 12));
00445         s[1] = (char) (0x80 | ((ch & 0x0fc0) >> 6));
00446         s[2] = (char) (0x80 | ((ch & 0x003f)));
00447         s[3] = '\0';
00448         return 3;
00449     }
00450     else if (ch < 0x200000)
00451     {
00452         s[0] = (char) (0xf0 | ((ch & 0x1c0000) >> 18));
00453         s[1] = (char) (0x80 | ((ch & 0x03f000) >> 12));
00454         s[2] = (char) (0x80 | ((ch & 0x000fc0) >>  6));
00455         s[3] = (char) (0x80 | ((ch & 0x00003f)));
00456         s[4] = '\0';
00457         return 4;
00458     }
00459     else if (ch < 0x4000000)
00460     {
00461         s[0] = (char) (0xf8 | ((ch & 0x3000000) >> 24));
00462         s[1] = (char) (0x80 | ((ch & 0x0fc0000) >> 18));
00463         s[2] = (char) (0x80 | ((ch & 0x003f000) >> 12));
00464         s[3] = (char) (0x80 | ((ch & 0x0000fc0) >>  6));
00465         s[4] = (char) (0x80 | ((ch & 0x000003f)));
00466         s[5] = '\0';
00467         return 5;
00468     }
00469     else
00470     {
00471         s[0] = (char) (0xfc | ((ch & 0x40000000) >> 30));
00472         s[1] = (char) (0x80 | ((ch & 0x3f000000) >> 24));
00473         s[2] = (char) (0x80 | ((ch & 0x00fc0000) >> 18));
00474         s[3] = (char) (0x80 | ((ch & 0x0003f000) >> 12));
00475         s[4] = (char) (0x80 | ((ch & 0x00000fc0) >>  6));
00476         s[5] = (char) (0x80 | ((ch & 0x0000003f)));
00477         s[6] = '\0';
00478         return 6;
00479     }
00480 }

int UTF8Length (const std::string &s)
 

Return the number of characters encoded by a UTF-8 string.

Definition at line 484 of file utf8.cpp.

Referenced by UTF8StringCompare().

00485 {
00486     int len = s.length();
00487     int count = 0;
00488     for (int i = 0; i < len; i++)
00489     {
00490         unsigned int c = (unsigned int) ((unsigned char) s[i]);
00491         if ((c < 0x80) || (c >= 0xc2 && c <= 0xf4))
00492             count++;
00493     }
00494 
00495     return count;
00496 }

wchar_t UTF8Normalize (wchar_t ch)  [inline]
 

Definition at line 499 of file utf8.cpp.

References WGL4NormalizationTables.

Referenced by UTF8StringCompare().

00500 {
00501     unsigned int page = (unsigned int) ch >> 8;
00502     if (page >= 256)
00503         return ch;
00504 
00505     unsigned int* normTable = WGL4NormalizationTables[page];
00506     if (normTable == NULL)
00507         return ch;
00508 
00509     return (wchar_t) normTable[(unsigned int) ch & 0xff];
00510 }

int UTF8StringCompare (const std::string &s0, const std::string &s1, size_t n)
 

Definition at line 555 of file utf8.cpp.

References UTF8Decode(), UTF8EncodedSize(), UTF8Length(), and UTF8Normalize().

00556 {
00557     int len0 = s0.length();
00558     int len1 = s1.length();
00559     int i0 = 0;
00560     int i1 = 0;
00561     while (i0 < len0 && i1 < len1 && n > 0)
00562     {
00563         wchar_t ch0 = 0;
00564         wchar_t ch1 = 0;
00565         if (!UTF8Decode(s0, i0, ch0))
00566             return 1;
00567         if (!UTF8Decode(s1, i1, ch1))
00568             return -1;
00569 
00570         i0 += UTF8EncodedSize(ch0);
00571         i1 += UTF8EncodedSize(ch1);
00572         ch0 = UTF8Normalize(ch0);
00573         ch1 = UTF8Normalize(ch1);
00574 
00575         if (ch0 < ch1)
00576             return -1;
00577         else if (ch0 > ch1)
00578             return 1;
00579 
00580         n--;
00581     }
00582 
00583     if (n == 0)
00584         return 0;
00585 
00586     len0 = UTF8Length(s0);
00587     len1 = UTF8Length(s1);
00588     if (len0 > len1)
00589         return 1;
00590     else if (len0 < len1)
00591         return -1;
00592     else
00593         return 0;
00594 }

int UTF8StringCompare (const std::string &s0, const std::string &s1)
 

Perform a normalized comparison of two UTF-8 strings. The normalization only works for characters in the WGL-4 subset, and no multicharacter translations are performed.

Definition at line 516 of file utf8.cpp.

Referenced by PlanetarySystem::find(), Body::findLocation(), Universe::getCompletion(), NameDatabase< OBJ >::getCompletion(), PlanetarySystem::getCompletion(), and Universe::getCompletionPath().

00517 {
00518     int len0 = s0.length();
00519     int len1 = s1.length();
00520     int i0 = 0;
00521     int i1 = 0;
00522     while (i0 < len0 && i1 < len1)
00523     {
00524         wchar_t ch0 = 0;
00525         wchar_t ch1 = 0;
00526         if (!UTF8Decode(s0, i0, ch0))
00527             return 1;
00528         if (!UTF8Decode(s1, i1, ch1))
00529             return -1;
00530 
00531         i0 += UTF8EncodedSize(ch0);
00532         i1 += UTF8EncodedSize(ch1);
00533         ch0 = UTF8Normalize(ch0);
00534         ch1 = UTF8Normalize(ch1);
00535 
00536         if (ch0 < ch1)
00537             return -1;
00538         else if (ch0 > ch1)
00539             return 1;
00540     }
00541 
00542     if (i0 == len0 && i1 == len1)
00543         return 0;
00544 
00545     len0 = UTF8Length(s0);
00546     len1 = UTF8Length(s1);
00547     if (len0 > len1)
00548         return 1;
00549     else if (len0 < len1)
00550         return -1;
00551     else
00552         return 0;
00553 }


Variable Documentation

const char* canonicalAbbrevs[] [static]
 

Initial value:

{
    "ALF", "BET", "GAM", "DEL", "EPS", "ZET", "ETA", "TET",
    "IOT", "KAP", "LAM", "MU" , "NU" , "XI" , "OMI", "PI" ,
    "RHO", "SIG", "TAU", "UPS", "PHI", "CHI", "PSI", "OME",
}

Definition at line 696 of file utf8.cpp.

Referenced by Greek::Greek().

const char* greekAlphabet[] [static]
 

Initial value:

{
    "Alpha",
    "Beta",
    "Gamma",
    "Delta",
    "Epsilon",
    "Zeta",
    "Eta",
    "Theta",
    "Iota",
    "Kappa",
    "Lambda",
    "Mu",
    "Nu",
    "Xi",
    "Omicron",
    "Pi",
    "Rho",
    "Sigma",
    "Tau",
    "Upsilon",
    "Phi",
    "Chi",
    "Psi",
    "Omega"
}

Definition at line 640 of file utf8.cpp.

Referenced by Greek::Greek().

const char* greekAlphabetUTF8[] [static]
 

Initial value:

{
    "\316\261",
    "\316\262",
    "\316\263",
    "\316\264",
    "\316\265",
    "\316\266",
    "\316\267",
    "\316\270",
    "\316\271",
    "\316\272",
    "\316\273",
    "\316\274",
    "\316\275",
    "\316\276",
    "\316\277",
    "\317\200",
    "\317\201",
    "\317\203",
    "\317\204",
    "\317\205",
    "\317\206",
    "\317\207",
    "\317\210",
    "\317\211",
}

Definition at line 668 of file utf8.cpp.

Referenced by Greek::canonicalAbbreviation(), and ReplaceGreekLetterAbbr().

unsigned int WGL4_Normalization_00[256]
 

Definition at line 14 of file utf8.cpp.

unsigned int WGL4_Normalization_01[256]
 

Definition at line 49 of file utf8.cpp.

unsigned int WGL4_Normalization_02[256]
 

Definition at line 84 of file utf8.cpp.

unsigned int WGL4_Normalization_03[256]
 

Definition at line 119 of file utf8.cpp.

unsigned int WGL4_Normalization_04[256]
 

Definition at line 154 of file utf8.cpp.

unsigned int WGL4_Normalization_1e[256]
 

Definition at line 189 of file utf8.cpp.

unsigned int WGL4_Normalization_21[256]
 

Definition at line 224 of file utf8.cpp.

unsigned int* WGL4NormalizationTables[256]
 

Definition at line 259 of file utf8.cpp.

Referenced by UTF8Normalize().


Generated on Sat Jan 14 22:33:01 2006 for Celestia by  doxygen 1.4.1