Re: How to convert 2 WCHAR to 1 WCHAR
PRMARJORAM ha scritto:
Again in a nutshell, im downloading webpages from foreign websites not
necessarily using our charset and needing to display a subset of the textual
content within a CListCtrl. I understand I also need to use specific fonts
to acheive this once I have the correct string representation.
After the cyrillic it will also need to work for other charsets such as
Arabic etc.
I developed a small MFC test program to try to implement the idea for
converting an HTML web page to Unicode UTF-16, so the text can be
displayed and used inside Windows Unicode apps:
http://www.geocities.com/giovanni.dicanio/vc/HtmlTextDecoder.zip
Basically, there are 3 steps: the text is read as a "raw" char array;
the text is parsed to find the 'charset=' substring; the text is
converted to Unicode based on the value of charset.
This is the code as implemented in the button-click handler:
<code>
//
// Load content of file in a raw char array.
//
std::vector<BYTE> fileContent;
if ( !
HtmlDecodeHelpers::ReadFileInCharArray(dlgOpenFile.GetPathName(),
fileContent) )
{
AfxMessageBox(IDS_ERROR_IN_OPENING_FILE, MB_OK|MB_ICONERROR);
return;
}
//
// Extract 'charset' field from the loaded HTML file.
//
std::string charset =
HtmlDecodeHelpers::ParseCharsetFromHTMLFile(fileContent);
if (charset.empty() )
{
// Charset not found
AfxMessageBox(IDS_ERROR_CHARSET_NOT_FOUND, MB_OK|MB_ICONERROR);
return;
}
//
// Convert loaded file to Unicode, basing on charset specification.
//
std::wstring unicodeContent;
if (!
HtmlDecodeHelpers::ConvertToUnicodeBasedOnCharset(fileContent, charset,
unicodeContent))
{
AfxMessageBox(IDS_ERROR_IN_UNICODE_CONVERSION, MB_OK|MB_ICONERROR);
return;
}
//
// Show converted text.
//
m_txtConverted.SetWindowText(unicodeContent.c_str());
</code>
The code is not perfect and needs more testing (and the parsing
algorithm should be improved), but in simple tests I performed it seems
to me to work fine (I tried it on a Latin 1 code page, and a Cyrillic
code page).
The core functions are those in namespace HtmlDecodeHelpers (in files
HtmlDecodeHelpers.h/.cpp).
For usage example, see method CTextDecoderDlg::OnBnClickedButtonLoadText().
There is also a subfolder called "Test" with a couple of test HTML files
I used.
The "core" function implementations follow:
<code>
//////////////////////////////////////////////////////////////////////////
#include "stdafx.h" // Pre-compiled headers
#include "HtmlDecodeHelpers.h" // Module header
//=======================================================================
// Reads the whole content of the specified file into an array of BYTEs.
//
// Parameters:
//   filename    - path of the file to read (must not be NULL).
//   fileContent - receives the bytes of the file; it is always cleared
//                 first and is left empty on failure.
//
// Returns 'true' on success, 'false' on error (NULL file name, file
// cannot be opened, file of 2 GB or more, or fewer bytes read than the
// file length). A zero-length file is a success that yields an empty
// array.
//
// NOTE(review): CFile members other than Open() presumably report some
// failures by throwing CFileException; such exceptions are not caught
// here and propagate to the caller -- confirm against the MFC docs.
//=======================================================================
bool HtmlDecodeHelpers::ReadFileInCharArray(
    IN const wchar_t * filename,
    OUT std::vector<BYTE> & fileContent
    )
{
    ASSERT( filename != NULL );

    // Empty destination array
    fileContent.clear();

    // Also guard in release builds, where ASSERT compiles to nothing
    if ( filename == NULL )
        return false;

    // Open file for reading
    CFile file;
    if ( ! file.Open( filename, CFile::modeRead ) )
    {
        // Error in opening file
        return false;
    }

    bool succeeded = false;

    // Get file length, in bytes
    const ULONGLONG fileLen = file.GetLength();

    if ( fileLen == 0 )
    {
        // Nothing to read. Taking &fileContent[0] on an empty vector
        // would be undefined behavior, so skip the read entirely.
        succeeded = true;
    }
    else if ( fileLen < 0x7FFFFFFF )
    {
        // The length fits the UINT byte count taken by CFile::Read
        const UINT sizeInBytes = static_cast<UINT>( fileLen );

        // Resize vector to store file content
        fileContent.resize( sizeInBytes );

        // Read file content in vector; a short read is a failure, because
        // the caller would otherwise get a partly uninitialized buffer
        const UINT readCount = file.Read( &fileContent[0], sizeInBytes );
        succeeded = ( readCount == sizeInBytes );
        if ( ! succeeded )
            fileContent.clear();
    }
    // else: file too big (>= 2 GB) -> failure, content stays empty

    // Close file
    file.Close();

    return succeeded;
}
//=======================================================================
// File-local helpers for ParseCharsetFromHTMLFile.
//=======================================================================
namespace
{
    // Returns true for the characters treated as HTML white space here.
    bool CharsetParse_IsSpace( char ch )
    {
        return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '\f';
    }

    // Returns true for a character that cannot be part of a charset value:
    // white space, either quote, ';' (next content parameter) or '>'
    // (end of tag).
    bool CharsetParse_IsValueTerminator( char ch )
    {
        return CharsetParse_IsSpace( ch )
            || ch == '\"' || ch == '\'' || ch == ';' || ch == '>';
    }

    // Finds 'needle' (which must be lowercase ASCII) in 'haystack', starting
    // at 'from', ignoring ASCII case. Returns std::string::npos if not found.
    size_t CharsetParse_FindNoCase(
        const std::string & haystack,
        const char * needle,
        size_t needleLen,
        size_t from
        )
    {
        for ( size_t pos = from; pos + needleLen <= haystack.size(); ++pos )
        {
            size_t matched = 0;
            while ( matched < needleLen )
            {
                char ch = haystack[pos + matched];
                if ( ch >= 'A' && ch <= 'Z' )
                    ch = static_cast<char>( ch - 'A' + 'a' );
                if ( ch != needle[matched] )
                    break;
                ++matched;
            }
            if ( matched == needleLen )
                return pos;
        }
        return std::string::npos;
    }
}

//=======================================================================
// Given an HTML file content, returns the 'charset' value.
// On error (empty content, or no usable charset specification found),
// returns an empty string.
//
// The recognized forms are, with 'charset' matched ignoring ASCII case
// and optional white space around '=':
//
//   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
//   <meta charset="utf-8">
//   <meta charset='utf-8'>
//   <meta charset=utf-8>
//
// The value is returned exactly as written in the file (its case is not
// changed). The content is scanned as single bytes, so the specification
// itself must be ASCII-compatible.
//=======================================================================
std::string HtmlDecodeHelpers::ParseCharsetFromHTMLFile(
    IN const std::vector<BYTE> & fileContent
    )
{
    // &fileContent[0] is undefined on an empty vector
    if ( fileContent.empty() )
        return "";

    // Build a byte string from file content
    const char * source = reinterpret_cast<const char *>( &fileContent[0] );
    const std::string text( source, fileContent.size() );

    static const char keyword[] = "charset";
    const size_t keywordLen = sizeof( keyword ) - 1;

    size_t searchFrom = 0;
    for ( ;; )
    {
        const size_t keywordIndex =
            CharsetParse_FindNoCase( text, keyword, keywordLen, searchFrom );
        if ( keywordIndex == std::string::npos )
        {
            // Error: no charset specification found
            return "";
        }

        // Skip optional white space, then require '='
        size_t index = keywordIndex + keywordLen;
        while ( index < text.size() && CharsetParse_IsSpace( text[index] ) )
            ++index;

        if ( index < text.size() && text[index] == '=' )
        {
            ++index;
            while ( index < text.size() && CharsetParse_IsSpace( text[index] ) )
                ++index;

            // Skip an optional opening quote: <meta charset="utf-8">
            if ( index < text.size()
                && ( text[index] == '\"' || text[index] == '\'' ) )
            {
                ++index;
            }

            // The value runs up to the first terminator (or end of file)
            const size_t valueIndex = index;
            while ( index < text.size()
                && ! CharsetParse_IsValueTerminator( text[index] ) )
            {
                ++index;
            }

            if ( index > valueIndex )
                return text.substr( valueIndex, index - valueIndex );
        }

        // This occurrence was not a usable "charset=value"; keep looking
        searchFrom = keywordIndex + keywordLen;
    }
}
//=======================================================================
// Given a file content and a charset specification, returns a Unicode
// string obtained from the input file content string, using proper
// encoding (as specified by charset).
// Returns 'true' on success, 'false' on error.
//=======================================================================
bool HtmlDecodeHelpers::ConvertToUnicodeBasedOnCharset(
IN const std::vector<BYTE> & fileContent,
IN const std::string & charset,
OUT std::wstring & unicodeContent
)
{
// Clear output parameter
unicodeContent.clear();
// There must be something in file content
ASSERT( ! fileContent.empty() );
if ( fileContent.empty() )
return false;
// Charset must be specified
ASSERT( ! charset.empty() );
if ( charset.empty() )
return false;
//
// A list of codepage identifiers for MultiByteToWideChar is
available here:
//
// http://msdn.microsoft.com/en-us/library/dd317756(VS.85).aspx
//
//
// This is a map from charset specification to code page value,
// to be used in MultiByteToWideChar()
//
typedef std::map< std::string, UINT > CharsetCodePageMap;
CharsetCodePageMap charsetToCodePage;
charsetToCodePage["iso-8859-1"] = 28591; // ISO 8859-1 Latin 1;
Western European (ISO)
charsetToCodePage["iso-8859-2"] = 28592; // ISO 8859-2 Central
European; Central European (ISO)
charsetToCodePage["iso-8859-7"] = 28597; // ISO 8859-7 Greek
charsetToCodePage["windows-1251"] = 1251; // ANSI Cyrillic;
Cyrillic (Windows)
charsetToCodePage["koi8-u"] = 21866; // Ukrainian (KOI8-U);
Cyrillic (KOI8-U)
charsetToCodePage["utf-8"] = 65001; // Unicode (UTF-8)
charsetToCodePage["utf-7"] = 65000; // Unicode (UTF-7)
// TODO: Add more entries here...
// TODO: This map could be built statically and not each time the
function is called.
// Given codepage string identifier (in 'charset'),
// extracts the integer ID for MultiByteToWideChar
CharsetCodePageMap::const_iterator it;
it = charsetToCodePage.find( charset );
if ( it == charsetToCodePage.end() )
{
// Code page not found in table
return false;
}
// Get code page ID value
UINT codePage = it->second;
//
// Convert the original text to a Unicode string, with specified
codepage
//
// Request size of destination buffer for Unicode string
int destBufferChars = ::MultiByteToWideChar(
codePage, // code page for conversion
0, // default flags
reinterpret_cast<LPCSTR>( &fileContent[0] ), // string to convert
fileContent.size(), // size in bytes of input string
NULL, // destination Unicode buffer
0 // request size of destination buffer, in
WCHAR's
);
if (destBufferChars == 0)
{
// Failure
return false;
}
// Add +1 to destination buffer size, because we are going to
terminate it with a L'\0'
++destBufferChars;
// Allocate buffer for destination string
std::vector< WCHAR > destBuffer(destBufferChars);
// Convert string to Unicode
int conversionResult = ::MultiByteToWideChar(
codePage, // code page for conversion
0, // default flags
reinterpret_cast<LPCSTR>( &fileContent[0] ), // string to convert
fileContent.size(), // size in bytes of input string
&destBuffer[0], // destination Unicode buffer
destBufferChars // size of destination buffer, in WCHAR's
);
if (conversionResult == 0)
{
// Failure
return false;
}
// Terminate Unicode string with \0
destBuffer[destBufferChars - 1] = L'\0';
// Return the Unicode string in output parameter
unicodeContent = std::wstring(&destBuffer[0]);
// All right
return true;
}
//////////////////////////////////////////////////////////////////////////
</code>
Giovanni