Re: How to convert 2 WCHAR to 1 WCHAR
 
PRMARJORAM ha scritto:
Again, in a nutshell, I'm downloading web pages from foreign websites not 
necessarily using our charset, and I need to display a subset of the textual 
content within a CListCtrl.  I understand I also need to use specific fonts 
to achieve this once I have the correct string representation.
After the cyrillic it will also need to work for other charsets such as 
Arabic etc.
I developed a small MFC test program to try to implement the idea for 
converting an HTML web page to Unicode UTF-16, so the text can be 
displayed and used inside Windows Unicode apps:
http://www.geocities.com/giovanni.dicanio/vc/HtmlTextDecoder.zip
Basically, there are 3 steps: the text is read as a "raw" char array; 
the text is parsed to find the 'charset=' substring; the text is 
converted to Unicode based on the value of charset.
This is the code as implemented in button-click handler:
<code>
     //
     // Step 1 of 3: load the content of the selected file in a raw char
     // array. The bytes are not interpreted yet, because the encoding is
     // still unknown at this point.
     //
     std::vector<BYTE> fileContent;
     if ( ! 
HtmlDecodeHelpers::ReadFileInCharArray(dlgOpenFile.GetPathName(), 
fileContent) )
     {
         // The helper returns 'false' on error: report it and give up.
         AfxMessageBox(IDS_ERROR_IN_OPENING_FILE, MB_OK|MB_ICONERROR);
         return;
     }
     //
     // Step 2 of 3: extract the 'charset' field from the loaded HTML file.
     // The helper returns an empty string when it finds no charset.
     //
     std::string charset = 
HtmlDecodeHelpers::ParseCharsetFromHTMLFile(fileContent);
     if (charset.empty() )
     {
         // Charset not found
         AfxMessageBox(IDS_ERROR_CHARSET_NOT_FOUND, MB_OK|MB_ICONERROR);
         return;
     }
     //
     // Step 3 of 3: convert the loaded file to Unicode, based on the
     // charset specification found above.
     //
     std::wstring unicodeContent;
     if (! 
HtmlDecodeHelpers::ConvertToUnicodeBasedOnCharset(fileContent, charset, 
unicodeContent))
     {
         // The helper returns 'false' on error: report it and give up.
         AfxMessageBox(IDS_ERROR_IN_UNICODE_CONVERSION, MB_OK|MB_ICONERROR);
         return;
     }
     //
     // Show converted text.
     //
     m_txtConverted.SetWindowText(unicodeContent.c_str());
</code>
The code is not perfect and needs more testing (and the parsing 
algorithm should be improved), but in simple tests I performed it seems 
to me to work fine (I tried it on a Latin 1 code page, and a Cyrillic 
code page).
The core functions are those in namespace HtmlDecodeHelpers (in files 
HtmlDecodeHelpers.h/.cpp).
For usage example, see method CTextDecoderDlg::OnBnClickedButtonLoadText().
There is also a subfolder called "Test" with a couple of test HTML files 
  I used.
The "core" function implementations follow:
<code>
//////////////////////////////////////////////////////////////////////////
#include "stdafx.h"                 // Pre-compiled headers
#include "HtmlDecodeHelpers.h"      // Module header
//=======================================================================
// Reads the content of the specified file in an array of BYTEs.
// Returns 'true' on success, 'false' on error.
//=======================================================================
bool HtmlDecodeHelpers::ReadFileInCharArray(
     IN const wchar_t * filename,
     OUT std::vector<BYTE> & fileContent
     )
{
     ASSERT( filename != NULL );
     // Empty destination array
     fileContent.clear();
     // Open file for reading
     CFile file;
     if ( ! file.Open( filename, CFile::modeRead ) )
     {
         // Error in opening file
         return false;
     }
     // Get file length, in bytes
     ULONGLONG fileLen = file.GetLength();
     // Assume that file length is not big enough (< 2GB)
     ASSERT( fileLen < 0x7FFFFFFF );
     // Store file size
     size_t sizeInBytes = static_cast<size_t>( fileLen );
     // Resize vector to store file content
     fileContent.resize( sizeInBytes );
     // Read file content in vector
     size_t readCount = file.Read( &fileContent[0], sizeInBytes );
     ASSERT( readCount == sizeInBytes );
     // Close file
     file.Close();
     // All right
     return true;
}
//=======================================================================
// Given an HTML file content, returns the 'charset' value.
// On error, returns an empty string.
//=======================================================================
std::string HtmlDecodeHelpers::ParseCharsetFromHTMLFile(
     IN const std::vector<BYTE> & fileContent
)
{
     //
     // Find the 'charset' attribute.
     //
     // To do so, build a string based on file content,
     // and call std::string::find method on it.
     //
     //
     // Typical HTML charset specification is as follows:
     //
     // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
     //
     // Build an MBCS string from file content
     const char * source = reinterpret_cast<const char *>( 
&fileContent[0] );
     std::string fileContentString( source, fileContent.size() );
     // Find 'charset=' substring
     size_t charsetIndex = fileContentString.find( "charset=" );
     if (charsetIndex == std::string::npos)
     {
         // Error: no charset specification found
         return "";
     }
     //
     // charset=
     // ||||||||
     // 01234567  --> len = 8
     //
     const size_t charsetLen = 8;
     size_t charsetValueIndex = charsetIndex + charsetLen;
     // Now find the " symbol, that should close the charset specification.
     const char endQuote = '\"';
     size_t endQuoteIndex = fileContentString.find( endQuote, 
charsetValueIndex );
     if ( endQuoteIndex == std::string::npos )
     {
         // Error: no charset specification found
         return "";
     }
     // Extract the charset value
     std::string charsetValue = fileContentString.substr(
         charsetValueIndex, endQuoteIndex - charsetValueIndex );
     // Return it to the caller
     return charsetValue;
}
//=======================================================================
// Given a file content and a charset specification, returns a Unicode
// string obtained from the input file content string, using proper
// encoding (as specified by charset).
// Returns 'true' on success, 'false' on error.
//=======================================================================
bool HtmlDecodeHelpers::ConvertToUnicodeBasedOnCharset(
     IN const std::vector<BYTE> & fileContent,
     IN const std::string & charset,
     OUT std::wstring & unicodeContent
)
{
     // Clear output parameter
     unicodeContent.clear();
     // There must be something in file content
     ASSERT( ! fileContent.empty() );
     if ( fileContent.empty() )
         return false;
     // Charset must be specified
     ASSERT( ! charset.empty() );
     if ( charset.empty() )
         return false;
     //
     // A list of codepage identifiers for MultiByteToWideChar is 
available here:
     //
     // http://msdn.microsoft.com/en-us/library/dd317756(VS.85).aspx
     //
     //
     // This is a map from charset specification to code page value,
     // to be used in MultiByteToWideChar()
     //
     typedef std::map< std::string, UINT > CharsetCodePageMap;
     CharsetCodePageMap charsetToCodePage;
     charsetToCodePage["iso-8859-1"] = 28591;    // ISO 8859-1 Latin 1; 
Western European (ISO)
     charsetToCodePage["iso-8859-2"] = 28592;    // ISO 8859-2 Central 
European; Central European (ISO)
     charsetToCodePage["iso-8859-7"] = 28597;    // ISO 8859-7 Greek
     charsetToCodePage["windows-1251"] = 1251;   // ANSI Cyrillic; 
Cyrillic (Windows)
     charsetToCodePage["koi8-u"] = 21866;        // Ukrainian (KOI8-U); 
Cyrillic (KOI8-U)
     charsetToCodePage["utf-8"] = 65001;         // Unicode (UTF-8)
     charsetToCodePage["utf-7"] = 65000;         // Unicode (UTF-7)
     // TODO: Add more entries here...
     // TODO: This map could be built statically and not each time the 
function is called.
     // Given codepage string identifier (in 'charset'),
     // extracts the integer ID for MultiByteToWideChar
     CharsetCodePageMap::const_iterator it;
     it = charsetToCodePage.find( charset );
     if ( it == charsetToCodePage.end() )
     {
         // Code page not found in table
         return false;
     }
     // Get code page ID value
     UINT codePage = it->second;
     //
     // Convert the original text to a Unicode string, with specified 
codepage
     //
     // Request size of destination buffer for Unicode string
     int destBufferChars = ::MultiByteToWideChar(
         codePage,           // code page for conversion
         0,                  // default flags
         reinterpret_cast<LPCSTR>( &fileContent[0] ), // string to convert
         fileContent.size(), // size in bytes of input string
         NULL,               // destination Unicode buffer
         0                   // request size of destination buffer, in 
WCHAR's
         );
     if (destBufferChars == 0)
     {
         // Failure
         return false;
     }
     // Add +1 to destination buffer size, because we are going to 
terminate it with a L'\0'
     ++destBufferChars;
     // Allocate buffer for destination string
     std::vector< WCHAR > destBuffer(destBufferChars);
     // Convert string to Unicode
     int conversionResult = ::MultiByteToWideChar(
         codePage,           // code page for conversion
         0,                  // default flags
         reinterpret_cast<LPCSTR>( &fileContent[0] ), // string to convert
         fileContent.size(), // size in bytes of input string
         &destBuffer[0],     // destination Unicode buffer
         destBufferChars     // size of destination buffer, in WCHAR's
         );
     if (conversionResult == 0)
     {
         // Failure
         return false;
     }
     // Terminate Unicode string with \0
     destBuffer[destBufferChars - 1] = L'\0';
     // Return the Unicode string in output parameter
     unicodeContent = std::wstring(&destBuffer[0]);
     // All right
     return true;
}
//////////////////////////////////////////////////////////////////////////
</code>
Giovanni