#include "utf8.h"
#include <cctype>
#include "util.h"

Include dependency graph for utf8.cpp:

Go to the source code of this file.
Functions | |
| static std::string | noAbbrev ("") |
| std::string | ReplaceGreekLetterAbbr (std::string str) |
| bool | UTF8Decode (const char *str, int pos, int length, wchar_t &ch) |
| bool | UTF8Decode (const std::string &str, int pos, wchar_t &ch) |
| int | UTF8Encode (wchar_t ch, char *s) |
| int | UTF8Length (const std::string &s) |
| Return the number of characters encoded by a UTF-8 string. | |
| wchar_t | UTF8Normalize (wchar_t ch) |
| int | UTF8StringCompare (const std::string &s0, const std::string &s1, size_t n) |
| int | UTF8StringCompare (const std::string &s0, const std::string &s1) |
Variables | |
| static const char * | canonicalAbbrevs [] |
| static const char * | greekAlphabet [] |
| static const char * | greekAlphabetUTF8 [] |
| unsigned int | WGL4_Normalization_00 [256] |
| unsigned int | WGL4_Normalization_01 [256] |
| unsigned int | WGL4_Normalization_02 [256] |
| unsigned int | WGL4_Normalization_03 [256] |
| unsigned int | WGL4_Normalization_04 [256] |
| unsigned int | WGL4_Normalization_1e [256] |
| unsigned int | WGL4_Normalization_21 [256] |
| unsigned int * | WGL4NormalizationTables [256] |
Function Documentation

static std::string noAbbrev ("")

Referenced by Greek::canonicalAbbreviation().

std::string ReplaceGreekLetterAbbr (std::string str)

Replaces the Greek letter abbreviation at the beginning of a string with the UTF-8 representation of that letter. The replacement is attempted only when the first two characters of the string are uppercase ASCII letters. If the abbreviation is immediately followed by the digit 1, 2, or 3, that digit is also replaced with the corresponding UTF-8 superscript.
Definition at line 765 of file utf8.cpp. References Greek::abbrevs, greekAlphabetUTF8, UTF8_SUPERSCRIPT_1, UTF8_SUPERSCRIPT_2, and UTF8_SUPERSCRIPT_3. Referenced by Renderer::addLabel(), Renderer::addSortedLabel(), addStars(), CelListViewItem::CelListViewItem(), StarDatabase::getStarNameList(), SelectionPopup::init(), menuContext(), and CelestiaCore::renderOverlay().
00766 {
00767 std::string ret = str;
00768
00769 if (str[0] >= 'A' && str[0] <= 'Z' &&
00770 str[1] >= 'A' && str[1] <= 'Z')
00771 {
00772 // Linear search through all letter abbreviations
00773 for (int i = 0; i < Greek::instance->nLetters; i++)
00774 {
00775 const std::string& abbr = Greek::instance->abbrevs[i];
00776 if (str.compare(0, abbr.length(), abbr) == 0)
00777 {
00778 std::string superscript;
00779 if (str.length() > abbr.length())
00780 {
00781 if (str[abbr.length()] == '1')
00782 superscript = UTF8_SUPERSCRIPT_1;
00783 else if (str[abbr.length()] == '2')
00784 superscript = UTF8_SUPERSCRIPT_2;
00785 else if (str[abbr.length()] == '3')
00786 superscript = UTF8_SUPERSCRIPT_3;
00787 }
00788
00789 if (superscript.empty())
00790 {
00791 ret = std::string(greekAlphabetUTF8[i]) + str.substr(abbr.length());
00792 }
00793 else
00794 {
00795 ret = std::string(greekAlphabetUTF8[i]) + superscript +
00796 str.substr(abbr.length() + 1);
00797 }
00798
00799 break;
00800 }
00801 }
00802 }
00803
00804 return ret;
00805 }
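
bool UTF8Decode (const char *str, int pos, int length, wchar_t &ch)

Decode the UTF-8 character in the buffer str (length bytes long) beginning at position pos. The decoded character is returned in ch. The return value is false if the sequence would extend past the end of the buffer or the first byte does not map to an encoded size of 1 to 6 bytes, and true otherwise. Continuation bytes are not validated and overlong encodings are not rejected (see the TODO notes in the listing).
Definition at line 365 of file utf8.cpp. References UTF8EncodedSizeFromFirstByte().
00366 {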
00367 unsigned int c0 = (unsigned int) str[pos];
00368 int charlen = UTF8EncodedSizeFromFirstByte(c0);
00369
00370 // Bad UTF-8 character that extends past end of string
00371 if (pos + charlen > length)
00372 return false;
00373
00374 // TODO: Should check that the bytes of characters after the first are all
00375 // of the form 10xxxxxx
00376 // TODO: Need to reject overlong encoding sequences
00377
00378 switch (charlen)
00379 {
00380 case 1:
00381 ch = c0;
00382 return true;
00383
00384 case 2:
00385 ch = ((c0 & 0x1f) << 6) | ((unsigned int) str[pos + 1] & 0x3f);
00386 return true;
00387
00388 case 3:
00389 ch = ((c0 & 0x0f) << 12) |
00390 (((unsigned int) str[pos + 1] & 0x3f) << 6) |
00391 ((unsigned int) str[pos + 2] & 0x3f);
00392 return true;
00393
00394 case 4:
00395 ch = ((c0 & 0x07) << 18) |
00396 (((unsigned int) str[pos + 1] & 0x3f) << 12) |
00397 (((unsigned int) str[pos + 2] & 0x3f) << 6) |
00398 ((unsigned int) str[pos + 3] & 0x3f);
00399 return true;
00400
00401 case 5:
00402 ch = ((c0 & 0x03) << 24) |
00403 (((unsigned int) str[pos + 1] & 0x3f) << 18) |
00404 (((unsigned int) str[pos + 2] & 0x3f) << 12) |
00405 (((unsigned int) str[pos + 3] & 0x3f) << 6) |
00406 ((unsigned int) str[pos + 4] & 0x3f);
00407 return true;
00408
00409 case 6:
00410 ch = ((c0 & 0x01) << 30) |
00411 (((unsigned int) str[pos + 1] & 0x3f) << 24) |
00412 (((unsigned int) str[pos + 2] & 0x3f) << 18) |
00413 (((unsigned int) str[pos + 3] & 0x3f) << 12) |
00414 (((unsigned int) str[pos + 4] & 0x3f) << 6) |
00415 ((unsigned int) str[pos + 5] & 0x3f);
00416 return true;
00417
00418 default:
00419 return false;
00420 }
00421 }
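
bool UTF8Decode (const std::string &str, int pos, wchar_t &ch)

Decode the UTF-8 character in string str beginning at position pos. The decoded character is returned in ch. The return value is false if the sequence would extend past the end of the string or the first byte does not map to an encoded size of 1 to 6 bytes, and true otherwise. Continuation bytes are not validated and overlong encodings are not rejected (see the TODO notes in the listing).
Definition at line 303 of file utf8.cpp. Referenced by CelestiaCore::charEntered(), TextureFont::getWidth(), Overlay::print(), Console::print(), TextureFont::render(), and UTF8StringCompare().
00304 {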
00305 unsigned int c0 = (unsigned int) str[pos];
00306 int charlen = UTF8EncodedSizeFromFirstByte(c0);
00307
00308 // Bad UTF-8 character that extends past end of string
00309 if (pos + charlen > (int)str.length())
00310 return false;
00311
00312 // TODO: Should check that the bytes of characters after the first are all
00313 // of the form 10xxxxxx
00314 // TODO: Need to reject overlong encoding sequences
00315
00316 switch (charlen)
00317 {
00318 case 1:
00319 ch = c0;
00320 return true;
00321
00322 case 2:
00323 ch = ((c0 & 0x1f) << 6) | ((unsigned int) str[pos + 1] & 0x3f);
00324 return true;
00325
00326 case 3:
00327 ch = ((c0 & 0x0f) << 12) |
00328 (((unsigned int) str[pos + 1] & 0x3f) << 6) |
00329 ((unsigned int) str[pos + 2] & 0x3f);
00330 return true;
00331
00332 case 4:
00333 ch = ((c0 & 0x07) << 18) |
00334 (((unsigned int) str[pos + 1] & 0x3f) << 12) |
00335 (((unsigned int) str[pos + 2] & 0x3f) << 6) |
00336 ((unsigned int) str[pos + 3] & 0x3f);
00337 return true;
00338
00339 case 5:
00340 ch = ((c0 & 0x03) << 24) |
00341 (((unsigned int) str[pos + 1] & 0x3f) << 18) |
00342 (((unsigned int) str[pos + 2] & 0x3f) << 12) |
00343 (((unsigned int) str[pos + 3] & 0x3f) << 6) |
00344 ((unsigned int) str[pos + 4] & 0x3f);
00345 return true;
00346
00347 case 6:
00348 ch = ((c0 & 0x01) << 30) |
00349 (((unsigned int) str[pos + 1] & 0x3f) << 24) |
00350 (((unsigned int) str[pos + 2] & 0x3f) << 18) |
00351 (((unsigned int) str[pos + 3] & 0x3f) << 12) |
00352 (((unsigned int) str[pos + 4] & 0x3f) << 6) |
00353 ((unsigned int) str[pos + 5] & 0x3f);
00354 return true;
00355
00356 default:
00357 return false;
00358 }
00359 }

int UTF8Encode (wchar_t ch, char *s)

UTF-8 encode the Unicode character ch into the string s and return the encoded length in bytes, not counting the terminating null character. There should be space for at least 7 characters in s: up to six encoded bytes, plus one byte for the terminating null character.
Definition at line 427 of file utf8.cpp. Referenced by Tokenizer::nextToken().
00428 {
00429 if (ch < 0x80)
00430 {
00431 s[0] = (char) ch;
00432 s[1] = '\0';
00433 return 1;
00434 }
00435 else if (ch < 0x800)
00436 {
00437 s[0] = (char) (0xc0 | ((ch & 0x7c0) >> 6));
00438 s[1] = (char) (0x80 | (ch & 0x3f));
00439 s[2] = '\0';
00440 return 2;
00441 }
00442 else if (ch < 0x10000)
00443 {
00444 s[0] = (char) (0xe0 | ((ch & 0xf000) >> 12));
00445 s[1] = (char) (0x80 | ((ch & 0x0fc0) >> 6));
00446 s[2] = (char) (0x80 | ((ch & 0x003f)));
00447 s[3] = '\0';
00448 return 3;
00449 }
00450 else if (ch < 0x200000)
00451 {
00452 s[0] = (char) (0xf0 | ((ch & 0x1c0000) >> 18));
00453 s[1] = (char) (0x80 | ((ch & 0x03f000) >> 12));
00454 s[2] = (char) (0x80 | ((ch & 0x000fc0) >> 6));
00455 s[3] = (char) (0x80 | ((ch & 0x00003f)));
00456 s[4] = '\0';
00457 return 4;
00458 }
00459 else if (ch < 0x4000000)
00460 {
00461 s[0] = (char) (0xf8 | ((ch & 0x3000000) >> 24));
00462 s[1] = (char) (0x80 | ((ch & 0x0fc0000) >> 18));
00463 s[2] = (char) (0x80 | ((ch & 0x003f000) >> 12));
00464 s[3] = (char) (0x80 | ((ch & 0x0000fc0) >> 6));
00465 s[4] = (char) (0x80 | ((ch & 0x000003f)));
00466 s[5] = '\0';
00467 return 5;
00468 }
00469 else
00470 {
00471 s[0] = (char) (0xfc | ((ch & 0x40000000) >> 30));
00472 s[1] = (char) (0x80 | ((ch & 0x3f000000) >> 24));
00473 s[2] = (char) (0x80 | ((ch & 0x00fc0000) >> 18));
00474 s[3] = (char) (0x80 | ((ch & 0x0003f000) >> 12));
00475 s[4] = (char) (0x80 | ((ch & 0x00000fc0) >> 6));
00476 s[5] = (char) (0x80 | ((ch & 0x0000003f)));
00477 s[6] = '\0';
00478 return 6;
00479 }
00480 }

int UTF8Length (const std::string &s)

Return the number of characters encoded by a UTF-8 string.
Definition at line 484 of file utf8.cpp. Referenced by UTF8StringCompare().
00485 {
00486 int len = s.length();
00487 int count = 0;
00488 for (int i = 0; i < len; i++)
00489 {
00490 unsigned int c = (unsigned int) ((unsigned char) s[i]);
00491 if ((c < 0x80) || (c >= 0xc2 && c <= 0xf4))
00492 count++;
00493 }
00494
00495 return count;
00496 }
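
wchar_t UTF8Normalize (wchar_t ch)

Map the character ch through the WGL4 normalization tables, indexed by page (ch >> 8) and then by the low byte of ch. The character is returned unchanged when its page is 256 or above, or when no table exists for that page.
Definition at line 499 of file utf8.cpp. References WGL4NormalizationTables. Referenced by UTF8StringCompare().
00500 {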
00501 unsigned int page = (unsigned int) ch >> 8;
00502 if (page >= 256)
00503 return ch;
00504
00505 unsigned int* normTable = WGL4NormalizationTables[page];
00506 if (normTable == NULL)
00507 return ch;
00508
00509 return (wchar_t) normTable[(unsigned int) ch & 0xff];
00510 }
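
int UTF8StringCompare (const std::string &s0, const std::string &s1, size_t n)

Perform a normalized comparison of at most the first n characters of two UTF-8 strings. Returns -1, 0, or 1; if a character of s0 cannot be decoded the result is 1, and if a character of s1 cannot be decoded the result is -1.
Definition at line 555 of file utf8.cpp. References UTF8Decode(), UTF8EncodedSize(), UTF8Length(), and UTF8Normalize().
00556 {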
00557 int len0 = s0.length();
00558 int len1 = s1.length();
00559 int i0 = 0;
00560 int i1 = 0;
00561 while (i0 < len0 && i1 < len1 && n > 0)
00562 {
00563 wchar_t ch0 = 0;
00564 wchar_t ch1 = 0;
00565 if (!UTF8Decode(s0, i0, ch0))
00566 return 1;
00567 if (!UTF8Decode(s1, i1, ch1))
00568 return -1;
00569
00570 i0 += UTF8EncodedSize(ch0);
00571 i1 += UTF8EncodedSize(ch1);
00572 ch0 = UTF8Normalize(ch0);
00573 ch1 = UTF8Normalize(ch1);
00574
00575 if (ch0 < ch1)
00576 return -1;
00577 else if (ch0 > ch1)
00578 return 1;
00579
00580 n--;
00581 }
00582
00583 if (n == 0)
00584 return 0;
00585
00586 len0 = UTF8Length(s0);
00587 len1 = UTF8Length(s1);
00588 if (len0 > len1)
00589 return 1;
00590 else if (len0 < len1)
00591 return -1;
00592 else
00593 return 0;
00594 }

int UTF8StringCompare (const std::string &s0, const std::string &s1)

Perform a normalized comparison of two UTF-8 strings. The normalization only works for characters in the WGL-4 subset, and no multicharacter translations are performed.
Definition at line 516 of file utf8.cpp. Referenced by PlanetarySystem::find(), Body::findLocation(), Universe::getCompletion(), NameDatabase< OBJ >::getCompletion(), PlanetarySystem::getCompletion(), and Universe::getCompletionPath().
00517 {
00518 int len0 = s0.length();
00519 int len1 = s1.length();
00520 int i0 = 0;
00521 int i1 = 0;
00522 while (i0 < len0 && i1 < len1)
00523 {
00524 wchar_t ch0 = 0;
00525 wchar_t ch1 = 0;
00526 if (!UTF8Decode(s0, i0, ch0))
00527 return 1;
00528 if (!UTF8Decode(s1, i1, ch1))
00529 return -1;
00530
00531 i0 += UTF8EncodedSize(ch0);
00532 i1 += UTF8EncodedSize(ch1);
00533 ch0 = UTF8Normalize(ch0);
00534 ch1 = UTF8Normalize(ch1);
00535
00536 if (ch0 < ch1)
00537 return -1;
00538 else if (ch0 > ch1)
00539 return 1;
00540 }
00541
00542 if (i0 == len0 && i1 == len1)
00543 return 0;
00544
00545 len0 = UTF8Length(s0);
00546 len1 = UTF8Length(s1);
00547 if (len0 > len1)
00548 return 1;
00549 else if (len0 < len1)
00550 return -1;
00551 else
00552 return 0;
00553 }

Variable Documentation

static const char * canonicalAbbrevs []

Initial value:
{
"ALF", "BET", "GAM", "DEL", "EPS", "ZET", "ETA", "TET",
"IOT", "KAP", "LAM", "MU" , "NU" , "XI" , "OMI", "PI" ,
"RHO", "SIG", "TAU", "UPS", "PHI", "CHI", "PSI", "OME",
}
Definition at line 696 of file utf8.cpp. Referenced by Greek::Greek(). |

static const char * greekAlphabet []

Initial value:
{
"Alpha",
"Beta",
"Gamma",
"Delta",
"Epsilon",
"Zeta",
"Eta",
"Theta",
"Iota",
"Kappa",
"Lambda",
"Mu",
"Nu",
"Xi",
"Omicron",
"Pi",
"Rho",
"Sigma",
"Tau",
"Upsilon",
"Phi",
"Chi",
"Psi",
"Omega"
}
Definition at line 640 of file utf8.cpp. Referenced by Greek::Greek(). |

static const char * greekAlphabetUTF8 []

Initial value:
{
"\316\261",
"\316\262",
"\316\263",
"\316\264",
"\316\265",
"\316\266",
"\316\267",
"\316\270",
"\316\271",
"\316\272",
"\316\273",
"\316\274",
"\316\275",
"\316\276",
"\316\277",
"\317\200",
"\317\201",
"\317\203",
"\317\204",
"\317\205",
"\317\206",
"\317\207",
"\317\210",
"\317\211",
}
Definition at line 668 of file utf8.cpp. Referenced by Greek::canonicalAbbreviation(), and ReplaceGreekLetterAbbr(). |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

unsigned int * WGL4NormalizationTables [256]

Definition at line 259 of file utf8.cpp. Referenced by UTF8Normalize().
1.4.1