Re: How to read Unicode(Big-Endian) text file(s) in Non-MFC
"meme" <meme@myself.com> ha scritto nel messaggio
news:eALO1CmcIHA.4140@TK2MSFTNGP04.phx.gbl...
so I tried ......following.....but I think I missed or messed up something
and therefore all I see is some junk characters when executed ..... :(
You can solve this problem in several ways, there's no one single way.
You might consider this code of mine (it needs more testing and can be
optimized, but it seems to work).
I've put comments in the code, so you can read them.
(I hope that Outlook Express does not scramble the pasted lines...)
You should pay attention to the code of the function ReadFileUtf16BE(), that
reads the content of a UTF-16 BE file, and stores it into a Unicode UTF-16
(LE) string (I used std::wstring, but you can use CStringW as well).
The function WriteFileUtf16BE() is used for testing (it writes a simple
UTF-16 BE file).
In your main(), you can use them like this:
<code>
// Write a test file...
WriteFileUtf16BE(_T("test"));
// Read file content
std::wstring fileText;
ReadFileUtf16BE(_T("test"), fileText);
// ...should check return code, if false --> error
// Show it
MessageBoxW( NULL, fileText.c_str(), L"File content:", MB_OK);
</code>
Here are the functions:
<code>
// Exchanges the values of the two bytes passed by reference:
// after the call b1 holds the old b2 and b2 holds the old b1.
inline void SwapBytes(BYTE & b1, BYTE & b2)
{
    const BYTE saved = b2;
    b2 = b1;
    b1 = saved;
}
//
// Reads a UTF-16 BE file, and returns a Unicode string with its content.
//
// The file must start with the UTF-16 BE byte order mark (bytes FE FF).
// Every following pair of bytes (high byte first) is combined into one
// wchar_t code unit and appended to 'text'; the BOM itself is not stored.
// All code units are kept, including an embedded U+0000 if the file has one.
//
// Returns 'true' on success, 'false' on error (NULL filename, file cannot
// be opened, BOM missing or wrong, odd number of payload bytes, or an I/O
// failure). On error 'text' is left empty.
//
bool ReadFileUtf16BE(
    LPCTSTR filename,       // [in] filename
    std::wstring & text     // [out] file string content
    )
{
    // Closes the file on every exit path, including an exception thrown by
    // the buffer or string allocations below.
    struct FileCloser
    {
        FILE * handle;
        explicit FileCloser(FILE * f) : handle(f) {}
        ~FileCloser() { if ( handle != NULL ) fclose(handle); }
    };

    // Clear output parameter, so it is empty on every failure path
    text.clear();

    // Check filename input parameter
    ASSERT( filename != NULL );
    if ( filename == NULL )
        return false;

    //
    // Open file (binary mode: no newline or Ctrl+Z translation)
    //
    FILE * file = _tfopen(filename, _T("rb"));
    ASSERT( file != NULL );
    if ( file == NULL )
        return false;
    FileCloser closer(file);

    //
    // Check that file is UTF-16 BE
    //
    BYTE bom[2];
    if ( fread( bom, sizeof(bom), 1, file) != 1 )
    {
        // File is too short to even contain a BOM
        ASSERT(FALSE);
        return false;
    }

    // UTF-16 BE BOM is FE FF: reject the file if EITHER byte differs
    if ( bom[0] != 0xFE || bom[1] != 0xFF )
    {
        // No UTF-16 BE (BOM does not match)
        ASSERT(FALSE);
        return false;
    }

    //
    // Get file size, in bytes
    //
    if ( fseek(file, 0L, SEEK_END) != 0 )
    {
        ASSERT(FALSE);
        return false;
    }
    const long size = ftell(file);
    if ( size < 2 )
    {
        // ftell() failed (it returns -1L on error)
        ASSERT(FALSE);
        return false;
    }

    // Payload is everything after the 2-byte BOM; it must be made of
    // whole 16-bit code units.
    const size_t payloadBytes = static_cast<size_t>(size - 2);
    if ( (payloadBytes % 2) != 0 )
    {
        // Truncated file: last code unit has only one byte
        ASSERT(FALSE);
        return false;
    }

    //
    // Read file content (excluding BOM) into memory
    //
    if ( fseek(file, 2L, SEEK_SET) != 0 )
    {
        ASSERT(FALSE);
        return false;
    }
    std::vector<BYTE> buffer( payloadBytes );
    if ( payloadBytes > 0 &&
         fread( &(buffer[0]), 1, payloadBytes, file ) != payloadBytes )
    {
        // Short read or I/O error
        ASSERT(FALSE);
        return false;
    }

    //
    // Convert from BE to native code units: the first byte of each pair is
    // the high byte. Building each wchar_t arithmetically avoids casting
    // the byte buffer to wchar_t* and does not depend on sizeof(wchar_t).
    //
    text.reserve( payloadBytes / 2 );
    for ( size_t i = 0; i < payloadBytes; i += 2 )
    {
        const unsigned int high = buffer[i];
        const unsigned int low  = buffer[i + 1];
        text += static_cast<wchar_t>( (high << 8) | low );
    }

    // All right
    return true;
}
//
// Prepares a small UTF-16 BE test file, to be read back next.
//
// The file content is, in order:
// - UTF-16 BE BOM (FE FF)
// - U+00A9, the copyright sign (00 A9)
// - U+00E9, e with acute accent (00 E9)
//
// If 'filename' is NULL or the file cannot be opened for writing, the
// function does nothing (after asserting in debug builds).
//
void WriteFileUtf16BE(LPCTSTR filename)
{
    // Check filename input parameter
    ASSERT( filename != NULL );
    if ( filename == NULL )
        return;

    // Open file to write in binary mode
    FILE * file = _tfopen(filename, _T("wb") );
    ASSERT( file != NULL );
    if ( file == NULL )
        return; // ASSERT compiles to nothing in release builds: never use a NULL FILE*

    // File content, high byte first (big-endian)
    static const BYTE data[] =
    {
        0xFE, 0xFF,     // UTF-16 BE BOM
        0x00, 0xA9,     // U+00A9 copyright sign
        0x00, 0xE9      // U+00E9 e with acute accent
    };

    // Write file using our memory buffer
    if ( fwrite( data, 1, sizeof(data), file ) != sizeof(data) )
    {
        // Short write or I/O error
        ASSERT(FALSE);
    }

    // Close file
    fclose(file);
}
</code>
HTH,
Giovanni