Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members

utf8.h File Reference

#include <string>

Include dependency graph for utf8.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Defines

#define UTF8_DEGREE_SIGN   "\302\260"
#define UTF8_MULTIPLICATION_SIGN   "\303\227"
#define UTF8_SUPERSCRIPT_1   "\302\271"
#define UTF8_SUPERSCRIPT_2   "\302\262"
#define UTF8_SUPERSCRIPT_3   "\302\263"

Functions

std::string ReplaceGreekLetterAbbr (std::string)
bool UTF8Decode (const char *str, int pos, int length, wchar_t &ch)
bool UTF8Decode (const std::string &str, int pos, wchar_t &ch)
int UTF8Encode (wchar_t ch, char *s)
int UTF8EncodedSize (wchar_t ch)
int UTF8EncodedSizeFromFirstByte (unsigned int ch)
int UTF8Length (const std::string &s)
 Return the number of characters encoded by a UTF-8 string.
int UTF8StringCompare (const std::string &s0, const std::string &s1, size_t length)
int UTF8StringCompare (const std::string &s0, const std::string &s1)


Define Documentation

#define UTF8_DEGREE_SIGN   "\302\260"
 

Definition at line 15 of file utf8.h.

Referenced by displayAngle().

#define UTF8_MULTIPLICATION_SIGN   "\303\227"
 

Definition at line 16 of file utf8.h.

Referenced by CelestiaCore::renderOverlay().

#define UTF8_SUPERSCRIPT_1   "\302\271"
 

Definition at line 17 of file utf8.h.

Referenced by ReplaceGreekLetterAbbr().

#define UTF8_SUPERSCRIPT_2   "\302\262"
 

Definition at line 18 of file utf8.h.

Referenced by ReplaceGreekLetterAbbr().

#define UTF8_SUPERSCRIPT_3   "\302\263"
 

Definition at line 19 of file utf8.h.

Referenced by ReplaceGreekLetterAbbr().


Function Documentation

std::string ReplaceGreekLetterAbbr (std::string str)
 

Replaces the Greek letter abbreviation at the beginning of a string with the UTF-8 representation of that letter. Also replaces a digit (1, 2, or 3) immediately following the abbreviation with the corresponding UTF-8 superscript.

Definition at line 765 of file utf8.cpp.

References Greek::abbrevs, greekAlphabetUTF8, UTF8_SUPERSCRIPT_1, UTF8_SUPERSCRIPT_2, and UTF8_SUPERSCRIPT_3.

Referenced by Renderer::addLabel(), Renderer::addSortedLabel(), addStars(), CelListViewItem::CelListViewItem(), StarDatabase::getStarNameList(), SelectionPopup::init(), menuContext(), and CelestiaCore::renderOverlay().

00766 {
00767     std::string ret = str;
00768     
00769     if (str[0] >= 'A' && str[0] <= 'Z' &&
00770         str[1] >= 'A' && str[1] <= 'Z')
00771     {
00772         // Linear search through all letter abbreviations
00773         for (int i = 0; i < Greek::instance->nLetters; i++)
00774         {
00775             const std::string& abbr = Greek::instance->abbrevs[i];
00776             if (str.compare(0, abbr.length(), abbr) == 0)
00777             {
00778                 std::string superscript;
00779                 if (str.length() > abbr.length())
00780                 {
00781                     if (str[abbr.length()] == '1')
00782                         superscript = UTF8_SUPERSCRIPT_1;
00783                     else if (str[abbr.length()] == '2')
00784                         superscript = UTF8_SUPERSCRIPT_2;
00785                     else if (str[abbr.length()] == '3')
00786                         superscript = UTF8_SUPERSCRIPT_3;
00787                 }
00788 
00789                 if (superscript.empty())
00790                 {
00791                     ret = std::string(greekAlphabetUTF8[i]) + str.substr(abbr.length());
00792                 }
00793                 else
00794                 {
00795                     ret = std::string(greekAlphabetUTF8[i]) + superscript +
00796                         str.substr(abbr.length() + 1);
00797                 }
00798                         
00799                 break;
00800             }
00801         }
00802     }
00803     
00804     return ret;
00805 }

bool UTF8Decode (const char *str, int pos, int length, wchar_t &ch)
 

Decode the UTF-8 character in string str beginning at position pos, where length is the number of bytes available in str. The decoded character is returned in ch; the return value of the function is true if a valid UTF-8 sequence was successfully decoded.

Definition at line 365 of file utf8.cpp.

References UTF8EncodedSizeFromFirstByte().

00366 {
00367     unsigned int c0 = (unsigned int) str[pos];
00368     int charlen = UTF8EncodedSizeFromFirstByte(c0);
00369 
00370     // Bad UTF-8 character that extends past end of string
00371     if (pos + charlen > length)
00372         return false;
00373 
00374     // TODO: Should check that the bytes of characters after the first are all
00375     // of the form 10xxxxxx
00376     // TODO: Need to reject overlong encoding sequences
00377 
00378     switch (charlen)
00379     {
00380     case 1:
00381         ch = c0;
00382         return true;
00383 
00384     case 2:
00385         ch = ((c0 & 0x1f) << 6) | ((unsigned int) str[pos + 1] & 0x3f);
00386         return true;
00387 
00388     case 3:
00389         ch = ((c0 & 0x0f) << 12) |
00390             (((unsigned int) str[pos + 1] & 0x3f) << 6) |
00391             ((unsigned int)  str[pos + 2] & 0x3f);
00392         return true;
00393 
00394     case 4:
00395         ch = ((c0 & 0x07) << 18) |
00396             (((unsigned int) str[pos + 1] & 0x3f) << 12) |
00397             (((unsigned int) str[pos + 2] & 0x3f) << 6)  |
00398             ((unsigned int)  str[pos + 3] & 0x3f);
00399         return true;
00400 
00401     case 5:
00402         ch = ((c0 & 0x03) << 24) |
00403             (((unsigned int) str[pos + 1] & 0x3f) << 18) |
00404             (((unsigned int) str[pos + 2] & 0x3f) << 12) |
00405             (((unsigned int) str[pos + 3] & 0x3f) << 6)  |
00406             ((unsigned int)  str[pos + 4] & 0x3f);
00407         return true;
00408 
00409     case 6:
00410         ch = ((c0 & 0x01) << 30) |
00411             (((unsigned int) str[pos + 1] & 0x3f) << 24) |
00412             (((unsigned int) str[pos + 2] & 0x3f) << 18) |
00413             (((unsigned int) str[pos + 3] & 0x3f) << 12) |
00414             (((unsigned int) str[pos + 4] & 0x3f) << 6)  |
00415             ((unsigned int)  str[pos + 5] & 0x3f);
00416         return true;
00417 
00418     default:
00419         return false;
00420     }
00421 }

bool UTF8Decode (const std::string &str, int pos, wchar_t &ch)
 

Decode the UTF-8 character in string str beginning at position pos. The decoded character is returned in ch; the return value of the function is true if a valid UTF-8 sequence was successfully decoded.

Definition at line 303 of file utf8.cpp.

References UTF8EncodedSizeFromFirstByte().

00304 {
00305     unsigned int c0 = (unsigned int) str[pos];
00306     int charlen = UTF8EncodedSizeFromFirstByte(c0);
00307 
00308     // Bad UTF-8 character that extends past end of string
00309     if (pos + charlen > (int)str.length())
00310         return false;
00311 
00312     // TODO: Should check that the bytes of characters after the first are all
00313     // of the form 10xxxxxx
00314     // TODO: Need to reject overlong encoding sequences
00315 
00316     switch (charlen)
00317     {
00318     case 1:
00319         ch = c0;
00320         return true;
00321 
00322     case 2:
00323         ch = ((c0 & 0x1f) << 6) | ((unsigned int) str[pos + 1] & 0x3f);
00324         return true;
00325 
00326     case 3:
00327         ch = ((c0 & 0x0f) << 12) |
00328             (((unsigned int) str[pos + 1] & 0x3f) << 6) |
00329             ((unsigned int)  str[pos + 2] & 0x3f);
00330         return true;
00331 
00332     case 4:
00333         ch = ((c0 & 0x07) << 18) |
00334             (((unsigned int) str[pos + 1] & 0x3f) << 12) |
00335             (((unsigned int) str[pos + 2] & 0x3f) << 6)  |
00336             ((unsigned int)  str[pos + 3] & 0x3f);
00337         return true;
00338 
00339     case 5:
00340         ch = ((c0 & 0x03) << 24) |
00341             (((unsigned int) str[pos + 1] & 0x3f) << 18) |
00342             (((unsigned int) str[pos + 2] & 0x3f) << 12) |
00343             (((unsigned int) str[pos + 3] & 0x3f) << 6)  |
00344             ((unsigned int)  str[pos + 4] & 0x3f);
00345         return true;
00346 
00347     case 6:
00348         ch = ((c0 & 0x01) << 30) |
00349             (((unsigned int) str[pos + 1] & 0x3f) << 24) |
00350             (((unsigned int) str[pos + 2] & 0x3f) << 18) |
00351             (((unsigned int) str[pos + 3] & 0x3f) << 12) |
00352             (((unsigned int) str[pos + 4] & 0x3f) << 6)  |
00353             ((unsigned int)  str[pos + 5] & 0x3f);
00354         return true;
00355 
00356     default:
00357         return false;
00358     }
00359 }

int UTF8Encode (wchar_t ch, char *s)
 

UTF-8 encode the Unicode character ch into the string s and return the encoded length. There should be space for at least 7 characters in s--up to six encoded bytes, plus one byte for the terminating null character.

Definition at line 427 of file utf8.cpp.

Referenced by Tokenizer::nextToken().

00428 {
00429     if (ch < 0x80)
00430     {
00431         s[0] = (char) ch;
00432         s[1] = '\0';
00433         return 1;
00434     }
00435     else if (ch < 0x800)
00436     {
00437         s[0] = (char) (0xc0 | ((ch & 0x7c0) >> 6));
00438         s[1] = (char) (0x80 | (ch & 0x3f));
00439         s[2] = '\0';
00440         return 2;
00441     }
00442     else if (ch < 0x10000)
00443     {
00444         s[0] = (char) (0xe0 | ((ch & 0xf000) >> 12));
00445         s[1] = (char) (0x80 | ((ch & 0x0fc0) >> 6));
00446         s[2] = (char) (0x80 | ((ch & 0x003f)));
00447         s[3] = '\0';
00448         return 3;
00449     }
00450     else if (ch < 0x200000)
00451     {
00452         s[0] = (char) (0xf0 | ((ch & 0x1c0000) >> 18));
00453         s[1] = (char) (0x80 | ((ch & 0x03f000) >> 12));
00454         s[2] = (char) (0x80 | ((ch & 0x000fc0) >>  6));
00455         s[3] = (char) (0x80 | ((ch & 0x00003f)));
00456         s[4] = '\0';
00457         return 4;
00458     }
00459     else if (ch < 0x4000000)
00460     {
00461         s[0] = (char) (0xf8 | ((ch & 0x3000000) >> 24));
00462         s[1] = (char) (0x80 | ((ch & 0x0fc0000) >> 18));
00463         s[2] = (char) (0x80 | ((ch & 0x003f000) >> 12));
00464         s[3] = (char) (0x80 | ((ch & 0x0000fc0) >>  6));
00465         s[4] = (char) (0x80 | ((ch & 0x000003f)));
00466         s[5] = '\0';
00467         return 5;
00468     }
00469     else
00470     {
00471         s[0] = (char) (0xfc | ((ch & 0x40000000) >> 30));
00472         s[1] = (char) (0x80 | ((ch & 0x3f000000) >> 24));
00473         s[2] = (char) (0x80 | ((ch & 0x00fc0000) >> 18));
00474         s[3] = (char) (0x80 | ((ch & 0x0003f000) >> 12));
00475         s[4] = (char) (0x80 | ((ch & 0x00000fc0) >>  6));
00476         s[5] = (char) (0x80 | ((ch & 0x0000003f)));
00477         s[6] = '\0';
00478         return 6;
00479     }
00480 }

int UTF8EncodedSize (wchar_t ch)  [inline]
 

Definition at line 29 of file utf8.h.

Referenced by TextureFont::getWidth(), Overlay::print(), Console::print(), TextureFont::render(), and UTF8StringCompare().

00030 {
00031     if (ch < 0x80)
00032         return 1;
00033     else if (ch < 0x800)
00034         return 2;
00035     else if (ch < 0x10000)
00036         return 3;
00037     else if (ch < 0x200000)
00038         return 4;
00039     else if (ch < 0x4000000)
00040         return 5;
00041     else
00042         return 6;
00043 }

int UTF8EncodedSizeFromFirstByte (unsigned int ch)  [inline]
 

Definition at line 45 of file utf8.h.

Referenced by UTF8Decode().

00046 {
00047     int charlen = 1;
00048 
00049     if (ch < 0x80)
00050         charlen = 1;
00051     else if ((ch & 0xe0) == 0xc0)
00052         charlen = 2;
00053     else if ((ch & 0xf0) == 0xe0)
00054         charlen = 3;
00055     else if ((ch & 0xf8) == 0xf0)
00056         charlen = 4;
00057     else if ((ch & 0xfc) == 0xf8)
00058         charlen = 5;
00059     else if ((ch & 0xfe) == 0xfc)
00060         charlen = 6;
00061 
00062     return charlen;
00063 }

int UTF8Length (const std::string &s)
 

Return the number of characters encoded by a UTF-8 string.

Definition at line 484 of file utf8.cpp.

Referenced by UTF8StringCompare().

00485 {
00486     int len = s.length();
00487     int count = 0;
00488     for (int i = 0; i < len; i++)
00489     {
00490         unsigned int c = (unsigned int) ((unsigned char) s[i]);
00491         if ((c < 0x80) || (c >= 0xc2 && c <= 0xf4))
00492             count++;
00493     }
00494 
00495     return count;
00496 }

int UTF8StringCompare (const std::string &s0, const std::string &s1, size_t length)
 

Definition at line 555 of file utf8.cpp.

References UTF8Decode(), UTF8EncodedSize(), UTF8Length(), and UTF8Normalize().

00556 {
00557     int len0 = s0.length();
00558     int len1 = s1.length();
00559     int i0 = 0;
00560     int i1 = 0;
00561     while (i0 < len0 && i1 < len1 && n > 0)
00562     {
00563         wchar_t ch0 = 0;
00564         wchar_t ch1 = 0;
00565         if (!UTF8Decode(s0, i0, ch0))
00566             return 1;
00567         if (!UTF8Decode(s1, i1, ch1))
00568             return -1;
00569 
00570         i0 += UTF8EncodedSize(ch0);
00571         i1 += UTF8EncodedSize(ch1);
00572         ch0 = UTF8Normalize(ch0);
00573         ch1 = UTF8Normalize(ch1);
00574 
00575         if (ch0 < ch1)
00576             return -1;
00577         else if (ch0 > ch1)
00578             return 1;
00579 
00580         n--;
00581     }
00582 
00583     if (n == 0)
00584         return 0;
00585 
00586     len0 = UTF8Length(s0);
00587     len1 = UTF8Length(s1);
00588     if (len0 > len1)
00589         return 1;
00590     else if (len0 < len1)
00591         return -1;
00592     else
00593         return 0;
00594 }

int UTF8StringCompare (const std::string &s0, const std::string &s1)
 

Perform a normalized comparison of two UTF-8 strings. The normalization only works for characters in the WGL-4 subset, and no multicharacter translations are performed.

Definition at line 516 of file utf8.cpp.

References UTF8Decode(), UTF8EncodedSize(), UTF8Length(), and UTF8Normalize().

00517 {
00518     int len0 = s0.length();
00519     int len1 = s1.length();
00520     int i0 = 0;
00521     int i1 = 0;
00522     while (i0 < len0 && i1 < len1)
00523     {
00524         wchar_t ch0 = 0;
00525         wchar_t ch1 = 0;
00526         if (!UTF8Decode(s0, i0, ch0))
00527             return 1;
00528         if (!UTF8Decode(s1, i1, ch1))
00529             return -1;
00530 
00531         i0 += UTF8EncodedSize(ch0);
00532         i1 += UTF8EncodedSize(ch1);
00533         ch0 = UTF8Normalize(ch0);
00534         ch1 = UTF8Normalize(ch1);
00535 
00536         if (ch0 < ch1)
00537             return -1;
00538         else if (ch0 > ch1)
00539             return 1;
00540     }
00541 
00542     if (i0 == len0 && i1 == len1)
00543         return 0;
00544 
00545     len0 = UTF8Length(s0);
00546     len1 = UTF8Length(s1);
00547     if (len0 > len1)
00548         return 1;
00549     else if (len0 < len1)
00550         return -1;
00551     else
00552         return 0;
00553 }


Generated on Sat Jan 14 22:33:01 2006 for Celestia by  doxygen 1.4.1