gogoWebsite

wchar_t, UTF-8, UTF-16 conversion method

Updated to 2 days ago

Unicode has two encoding forms, UCS-2 and UCS-4. Windows internally follows the UCS-2 standard, which in practice is implemented as UTF-16, whereas most non-Windows systems use UTF-8.

As is well known, wchar_t occupies 2 bytes on Windows but 4 bytes on Linux, so code that relies on wchar_t behaves inconsistently when written for both platforms.

Below are several functions I use to convert between wchar_t and UTF-8, and between wchar_t and UTF-16.

#ifdef WINDOWS
 #include <>
 #include <>
 #include <>
 #else
 #include <>
 #include <>
 #include <>
 #include <>
 #endif

 // Converts a wchar_t buffer to UTF-8.
 //
 // a_szSrc      Source wide-character data. On Windows it must be NUL-terminated:
 //              the whole string is converted and a_nSrcSize is ignored.
 // a_nSrcSize   Size of the source in BYTES (used by the iconv path only). Include
 //              the terminator's bytes if the output should be NUL-terminated.
 // a_szDest     Destination buffer that receives the UTF-8 bytes.
 // a_nDestSize  Capacity of a_szDest in bytes.
 //
 // Returns: on Windows, the value returned by WideCharToMultiByte; elsewhere the
 // value returned by iconv (normally 0), or -1 if the arguments are invalid or
 // the conversion fails.
 int FW2UTF8Convert(const wchar_t* a_szSrc, int a_nSrcSize, char* a_szDest, int a_nDestSize)
 {
 #ifdef WINDOWS
     return WideCharToMultiByte(CP_UTF8, 0, a_szSrc, -1, a_szDest, a_nDestSize, NULL, NULL);
 #else
     // Negative sizes would turn into huge size_t values below.
     if (a_szSrc == NULL || a_szDest == NULL || a_nSrcSize < 0 || a_nDestSize < 0)
     {
         return -1;
     }

     iconv_t env = iconv_open("UTF-8", "WCHAR_T");
     if (env == (iconv_t)-1)
     {
         printf("iconv_open WCHAR_T->UTF8 error%s %d\n", strerror(errno), errno);
         return -1;
     }

     // iconv advances the pointers and decrements the counters it is given, and
     // the counters must be real size_t objects: casting the address of an int
     // to size_t* makes iconv read and write past the int on LP64 targets.
     char* pIn = reinterpret_cast<char*>(const_cast<wchar_t*>(a_szSrc));
     char* pOut = a_szDest;
     size_t nInLeft = static_cast<size_t>(a_nSrcSize);
     size_t nOutLeft = static_cast<size_t>(a_nDestSize);

     const size_t result = iconv(env, &pIn, &nInLeft, &pOut, &nOutLeft);
     const int nErr = errno;  // save before iconv_close can overwrite it
     iconv_close(env);        // always release the descriptor, also on failure

     if (result == (size_t)-1)
     {
         printf("iconv WCHAR_T->UTF8 error %d\n", nErr);
         return -1;
     }
     return (int)result;
 #endif
 }

 // Converts a NUL-terminated UTF-8 string to wchar_t.
 //
 // a_szSrc      NUL-terminated UTF-8 input; the terminator is converted too, so
 //              the output is NUL-terminated when it fits.
 // a_szDest     Destination wide-character buffer.
 // a_nDestSize  Capacity of a_szDest. It is handed to MultiByteToWideChar as its
 //              output size on Windows and to iconv as the number of free output
 //              BYTES elsewhere.
 //              NOTE(review): those two APIs appear to use different units
 //              (characters vs. bytes) - confirm what callers pass.
 //
 // Returns: on Windows, the value returned by MultiByteToWideChar; elsewhere the
 // value returned by iconv (normally 0), or -1 if the arguments are invalid or
 // the conversion fails.
 int FUTF82WConvert(const char* a_szSrc, wchar_t* a_szDest, int a_nDestSize)
 {
 #ifdef WINDOWS
     return MultiByteToWideChar(CP_UTF8, 0, a_szSrc, -1, a_szDest, a_nDestSize);
 #else
     // A negative size would turn into a huge size_t value below.
     if (a_szSrc == NULL || a_szDest == NULL || a_nDestSize < 0)
     {
         return -1;
     }

     iconv_t env = iconv_open("WCHAR_T", "UTF-8");
     if (env == (iconv_t)-1)
     {
         printf("iconv_open UTF8->WCHAR_T error %d\n", errno);
         return -1;
     }

     // iconv needs genuine size_t counters; casting the address of an int to
     // size_t* makes it read and write past the int on LP64 targets.
     char* pIn = const_cast<char*>(a_szSrc);
     char* pOut = reinterpret_cast<char*>(a_szDest);
     size_t nInLeft = strlen(a_szSrc) + 1;  // +1: convert the terminator as well
     size_t nOutLeft = static_cast<size_t>(a_nDestSize);

     const size_t result = iconv(env, &pIn, &nInLeft, &pOut, &nOutLeft);
     const int nErr = errno;  // save before iconv_close can overwrite it
     iconv_close(env);        // always release the descriptor, also on failure

     if (result == (size_t)-1)
     {
         printf("iconv UTF8->WCHAR_T error %d\n", nErr);
         return -1;
     }
     return (int)result;
 #endif
 }

 // Converts wchar_t data to 16-bit code units.
 //
 // a_szSrc      Source wide-character data.
 // a_nSize      Size of the source in BYTES.
 // a_szDest     Destination buffer.
 // a_nDestSize  Capacity of a_szDest in bytes.
 //
 // On Windows the bytes are copied unchanged and a_nSize is returned (-1 if the
 // copy is rejected). Elsewhere iconv converts from "UCS-4-INTERNAL" to
 // "UCS-2-INTERNAL" and its result (normally 0) is returned, or -1 on invalid
 // arguments or conversion failure.
 //
 // NOTE(review): the "-INTERNAL" encoding names look specific to GNU libiconv;
 // other iconv implementations may reject them - confirm for the target platform.
 int FW2UConvert(const wchar_t* a_szSrc, int a_nSize, char* a_szDest, int a_nDestSize)
 {
 #ifdef WINDOWS
     // Report a rejected copy instead of claiming a_nSize bytes were written.
     if (memcpy_s(a_szDest, a_nDestSize, a_szSrc, a_nSize) != 0)
     {
         return -1;
     }
     return a_nSize;
 #else
     // Negative sizes would turn into huge size_t values below.
     if (a_szSrc == NULL || a_szDest == NULL || a_nSize < 0 || a_nDestSize < 0)
     {
         return -1;
     }

     iconv_t env = iconv_open("UCS-2-INTERNAL", "UCS-4-INTERNAL");
     if (env == (iconv_t)-1)
     {
         printf("iconv_open WCHAR_T->UTF16 error%s %d\n", strerror(errno), errno);
         return -1;
     }

     // iconv needs genuine size_t counters; casting the address of an int to
     // size_t* makes it read and write past the int on LP64 targets.
     char* pIn = reinterpret_cast<char*>(const_cast<wchar_t*>(a_szSrc));
     char* pOut = a_szDest;
     size_t nInLeft = static_cast<size_t>(a_nSize);
     size_t nOutLeft = static_cast<size_t>(a_nDestSize);

     const size_t result = iconv(env, &pIn, &nInLeft, &pOut, &nOutLeft);
     const int nErr = errno;  // save before iconv_close can overwrite it
     iconv_close(env);        // always release the descriptor, also on failure

     if (result == (size_t)-1)
     {
         printf("iconv WCHAR_T->UTF16 error %s %d\n", strerror(nErr), nErr);
         return -1;
     }
     return (int)result;
 #endif
 }

 // Converts 16-bit code units to wchar_t.
 //
 // a_szSrc      Source buffer holding the 16-bit data.
 // a_nSize      Size of the source in BYTES.
 // a_szDest     Destination wide-character buffer.
 // a_nDestSize  Capacity of a_szDest in bytes.
 //
 // On Windows the bytes are copied unchanged and a_nSize is returned (-1 if the
 // copy is rejected). Elsewhere iconv converts from "UCS-2-INTERNAL" to
 // "UCS-4-INTERNAL" and its result (normally 0) is returned, or -1 on invalid
 // arguments or conversion failure.
 //
 // NOTE(review): the "-INTERNAL" encoding names look specific to GNU libiconv;
 // other iconv implementations may reject them - confirm for the target platform.
 int FU2WConvert(const char* a_szSrc, int a_nSize, wchar_t* a_szDest, int a_nDestSize)
 {
 #ifdef WINDOWS
     // Report a rejected copy instead of claiming a_nSize bytes were written.
     if (memcpy_s(a_szDest, a_nDestSize, a_szSrc, a_nSize) != 0)
     {
         return -1;
     }
     return a_nSize;
 #else
     // Negative sizes would turn into huge size_t values below.
     if (a_szSrc == NULL || a_szDest == NULL || a_nSize < 0 || a_nDestSize < 0)
     {
         return -1;
     }

     iconv_t env = iconv_open("UCS-4-INTERNAL", "UCS-2-INTERNAL");
     if (env == (iconv_t)-1)
     {
         printf("iconv_open error %d\n", errno);
         return -1;
     }

     // iconv needs genuine size_t counters; casting the address of an int to
     // size_t* makes it read and write past the int on LP64 targets.
     char* pIn = const_cast<char*>(a_szSrc);
     char* pOut = reinterpret_cast<char*>(a_szDest);
     size_t nInLeft = static_cast<size_t>(a_nSize);
     size_t nOutLeft = static_cast<size_t>(a_nDestSize);

     const size_t result = iconv(env, &pIn, &nInLeft, &pOut, &nOutLeft);
     const int nErr = errno;  // save before iconv_close can overwrite it
     iconv_close(env);        // always release the descriptor, also on failure

     if (result == (size_t)-1)
     {
         printf("UTF16 -> WCHAR_T conv error %d\n", nErr);
         return -1;
     }
     return (int)result;
 #endif
 }

P.S. On Linux I use the iconv library, where wchar_t is generally based on the UCS-4 standard.

UCS-4-INTERNAL and UCS-2-INTERNAL use the byte order of the host machine (big-endian or little-endian).

There are also UCS-2LE and UCS-2BE, which denote little-endian and big-endian byte order respectively.

Garbled text (mojibake) in strings is usually caused by mismatched encodings, or by characters that have no representation in the target encoding. Displaying strings correctly therefore often requires an encoding conversion. For convenience, the conversions are collected here into a header-only file that converts between char (ANSI), wchar_t, and UTF-8. In real projects, wchar_t or UTF-8 is recommended, and UTF-8 is strongly recommended.

#pragma once
 #include <Windows.h>
 #include <exception>
 #include <stdexcept>
 #include <string>
 #include <vector>
 
 /*!
  * Encoding conversion namespace
  *
  */
 namespace ZEncode
 {
     /*!
      * Converts a narrow (multi-byte) string to a wide string.
      *
      * \param str narrow string; it is passed as a NUL-terminated C string, so
      *            conversion stops at its first NUL character
      * \param uCodePage code page in which \p str is encoded
      * \return the wide string, without a trailing terminator character
      * \throws std::runtime_error if MultiByteToWideChar reports failure
      */
     static std::wstring A2W(const std::string &str, UINT uCodePage)
     {
         // Sizing call: with an input length of -1 the returned count includes
         // the terminating NUL.
         const int nLength = ::MultiByteToWideChar(uCodePage, 0, str.c_str(), -1, NULL, 0);
         if (0 == nLength)
         {
             throw std::runtime_error("A2W Error");
         }

         std::wstring strW(nLength, L'\0');
         const int nResult = ::MultiByteToWideChar(uCodePage, 0, str.c_str(), -1, &strW[0], nLength);
         if (nResult != nLength)
         {
             throw std::runtime_error("A2W Error");
         }

         // Drop the terminator written by the API so size() matches the text and
         // the result has no embedded trailing NUL.
         strW.resize(nLength - 1);
         return strW;
     }
 
     /*!
      * Converts a wide string to a narrow (multi-byte) string.
      *
      * \param str wide string; it is passed as a NUL-terminated C string, so
      *            conversion stops at its first NUL character
      * \param uCodePage code page to encode the result in
      * \return the narrow string, without a trailing terminator character
      * \throws std::runtime_error if WideCharToMultiByte reports failure
      */
     static std::string W2A(const std::wstring &str, UINT uCodePage)
     {
         // Sizing call: with an input length of -1 the returned count includes
         // the terminating NUL.
         const int nLength = ::WideCharToMultiByte(uCodePage, 0, str.c_str(), -1, NULL, 0, NULL, NULL);
         if (0 == nLength)
         {
             throw std::runtime_error("W2A Error");
         }

         std::string strA(nLength, '\0');
         const int nResult = ::WideCharToMultiByte(uCodePage, 0, str.c_str(), -1, &strA[0], nLength, NULL, NULL);
         if (nResult != nLength)
         {
             throw std::runtime_error("W2A Error");
         }

         // Drop the terminator written by the API so size() matches the text and
         // the result has no embedded trailing NUL.
         strA.resize(nLength - 1);
         return strA;
     }
 
     /*!
      * Re-encodes a narrow string from one code page to another by going
      * through a wide-string intermediate.
      *
      * \param str narrow string to re-encode
      * \param uCodePageFrom code page in which \p str is encoded
      * \param uCodePageTo code page of the returned string
      * \return the re-encoded narrow string
      */
     static std::string A2A(const std::string &str, UINT uCodePageFrom, UINT uCodePageTo)
     {
         const std::wstring strWide = A2W(str, uCodePageFrom);
         return W2A(strWide, uCodePageTo);
     }
 
     /*!
      * Heuristically checks whether a byte buffer looks like UTF-8.
      *
      * Only the byte structure is checked: every lead byte must be followed by
      * the matching number of continuation bytes (10xxxxxx). Sequences of one
      * to four bytes are accepted. Overlong forms, surrogates and out-of-range
      * code points are not rejected. A multi-byte sequence cut off by the end
      * of the buffer ends the scan without marking the buffer invalid.
      *
      * \param pBuffer buffer to inspect
      * \param size number of bytes in \p pBuffer
      * \return true if no structurally invalid sequence was found, otherwise false.
      *
      * \note The result is a best-effort guess and should be used as a hint only.
      */
     static bool IsUTF8(const void* pBuffer, size_t size)
     {
         const unsigned char* cur = static_cast<const unsigned char*>(pBuffer);
         const unsigned char* const end = cur + size;

         while (cur < end)
         {
             const unsigned char lead = *cur;
             size_t nTrail = 0;  // continuation bytes expected after the lead byte

             if (lead < 0x80)       // 0xxxxxxx: ASCII
             {
                 nTrail = 0;
             }
             else if (lead < 0xC0)  // 10xxxxxx: continuation byte without a lead
             {
                 return false;
             }
             else if (lead < 0xE0)  // 110xxxxx: 2-byte sequence
             {
                 nTrail = 1;
             }
             else if (lead < 0xF0)  // 1110xxxx: 3-byte sequence
             {
                 nTrail = 2;
             }
             else if (lead < 0xF8)  // 11110xxx: 4-byte sequence
             {
                 nTrail = 3;
             }
             else                   // 11111xxx: never a UTF-8 lead byte
             {
                 return false;
             }

             // Sequence truncated by the end of the buffer: stop scanning and
             // keep the verdict reached so far.
             if (static_cast<size_t>(end - cur) <= nTrail)
             {
                 break;
             }

             for (size_t i = 1; i <= nTrail; ++i)
             {
                 if ((cur[i] & 0xC0) != 0x80)
                 {
                     return false;
                 }
             }
             cur += nTrail + 1;
         }

         return true;
     }
 
     /*!
     * Check whether the string is UTF-8 encoding
     *
     * \param str Buffer
     * \return Return true if it is, otherwise false.
     *
     * \note The return result is not necessarily completely correct and is only used as a reference purpose.
     */
     static bool IsUTF8(const std::string &str)
     {
         return IsUTF8(str.c_str(), ());
     }
 }
 
 // Convenience macros that call the ZEncode conversion functions with fixed
 // code pages. Each expands to a parenthesized call expression; `str` is
 // forwarded unchanged as the first argument.
 // From CP_ACP text:
 #define ANSI_TO_WCHAR(str) (ZEncode::A2W(str, CP_ACP))
 #define ANSI_TO_UTF8(str) (ZEncode::A2A(str, CP_ACP, CP_UTF8))
 
 // From CP_UTF8 text:
 #define UTF8_TO_ANSI(str) (ZEncode::A2A(str, CP_UTF8, CP_ACP))
 #define UTF8_TO_WCHAR(str) (ZEncode::A2W(str, CP_UTF8))
 
 // From wide text:
 #define WCHAR_TO_ANSI(str) (ZEncode::W2A(str, CP_ACP))
 #define WCHAR_TO_UTF8(str) (ZEncode::W2A(str, CP_UTF8))


     // Round trip: ANSI -> wide -> ANSI must reproduce the original text.
     TEST(ZEncode, ansi_to_wchar_to_ansi)
     {
         const std::string strSource("People's *");

         const std::wstring strWide = ANSI_TO_WCHAR(strSource);
         EXPECT_STREQ(strWide.c_str(), L"People's *");

         const std::string strBack = WCHAR_TO_ANSI(strWide);
         EXPECT_STREQ(strBack.c_str(), "People's *");
     }
 
     // ANSI -> UTF-8 -> wide.
     // The text must be non-ASCII: pure ASCII is also valid UTF-8, so the
     // EXPECT_FALSE check on the ANSI string could never pass with it.
     // NOTE(review): assumes this file is saved in the system ANSI code page and
     // that the code page can represent these characters (e.g. 936) - confirm.
     TEST(ZEncode, ansi_to_utf8_to_wchar)
     {
         std::string strANSI("中文");
         std::string strUTF8 = ANSI_TO_UTF8(strANSI);
         EXPECT_FALSE(ZEncode::IsUTF8(strANSI));
         EXPECT_TRUE(ZEncode::IsUTF8(strUTF8));
         EXPECT_STREQ(UTF8_TO_WCHAR(strUTF8).c_str(), L"中文");
     }
 
     // Round trip: ANSI -> UTF-8 -> ANSI must reproduce the original bytes.
     // The text must be non-ASCII: pure ASCII is also valid UTF-8, so the
     // EXPECT_FALSE check on the ANSI string could never pass with it.
     // NOTE(review): assumes this file is saved in the system ANSI code page and
     // that the code page can represent these characters (e.g. 936) - confirm.
     TEST(ZEncode, ansi_to_utf8_to_ansi)
     {
         std::string strANSI("中文");
         std::string strUTF8 = ANSI_TO_UTF8(strANSI);
         EXPECT_FALSE(ZEncode::IsUTF8(strANSI));
         EXPECT_TRUE(ZEncode::IsUTF8(strUTF8));
         EXPECT_STREQ(UTF8_TO_ANSI(strUTF8).c_str(), strANSI.c_str());
     }
 
     // Round trip: wide -> UTF-8 -> wide must reproduce the original text, and
     // the intermediate string must be recognised as UTF-8.
     TEST(ZEncode, wchar_to_utf8_to_wchar)
     {
         const std::wstring strSource(L"People's *");

         const std::string strEncoded = WCHAR_TO_UTF8(strSource);
         EXPECT_TRUE(ZEncode::IsUTF8(strEncoded));

         const std::wstring strBack = UTF8_TO_WCHAR(strEncoded);
         EXPECT_STREQ(strBack.c_str(), strSource.c_str());
     }

Note: since C++11, a string literal can be forced to UTF-8 encoding by prefixing it with u8, as shown below. (From C++20 on, u8 literals have type const char8_t[] and can no longer initialize a std::string directly.)

1

2

std::string s1 = "People's *";  //Depending on file encoding

std::string s2 = u8"People's *";  //utf-8 encoding