Re: Run-time overhead of text-based storage formats for numerical data
Rune Allnor wrote:
On 12 Nov, 13:09, Francis Glassborow
<francis.glassbo...@btinternet.com> wrote:
When I get an idle moment I will write a C++ program to investigate
(my experience suggests something more like a factor of 5 or 10)
I would be very interested in seeing that kind of thing.
Here are some measurements that I made. The measurements were made with
the unix tool 'time', which gives the elapsed time for a complete
program run including startup and shutdown.
To simulate that the data is generated and consumed by different
programs (so in-memory buffering is not possible), reading and writing
are done in separate runs. This has the added advantage that you get
separate measurements for reading and writing.
I have tested text files, native binary files and big-endian IEEE
double-format binary files, with the following results:
Writing 10000000 values:
Native binary: 1.716 s (reference value)
IEEE binary: 2.708 s (1.58 times slower)
Text: 22.465 s (13.09 times slower)
Reading 10000000 values:
Native binary: 1.032 s (reference value)
IEEE binary: 2.208 s (2.14 times slower)
Text: 8.693 s (8.42 times slower)
As you can see, there is a factor 8 to 13 between text and native
binary. When comparing with a precisely specified binary format, the
difference becomes about half that factor.
One that takes stuff like locales, input validation and
error checking into account, that is; not just the plain
calls to std::atof() or std::atoi().
The program does not do extensive input validation, because it is
assumed that the input files are created by another automated system
working to the same interface specification. It is unreasonable to
assume that humans will write files containing more than a few hundred
numbers.
The program I tested with is this:
<start code>
#include <math.h>

#include <cctype>
#include <cfloat>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
//#define _DEBUG
using namespace std;
// Storage formats the benchmark can be asked to read or write.
enum Operation
{
    TEXT,    // decimal text, values separated by a blank
    NATIVE,  // the in-memory bytes of a double, copied verbatim
    IEEE     // hand-packed 8-byte sign/exponent/mantissa layout, most significant byte first
};
// Writes `val` to `os` as decimal text followed by a single blank.
//
// The value is emitted with max_digits10 significant digits so that reading
// the text back yields exactly the same double; the stream default of six
// digits would silently truncate the data (2147483647 -> "2.14748e+09").
// The caller's precision setting is restored before returning.
//
// Returns `os`, so the stream state can be tested by the caller.
std::ostream& write_text(std::ostream& os, double val)
{
    const std::streamsize saved_precision =
        os.precision(std::numeric_limits<double>::max_digits10);
    os << val << ' ';
    os.precision(saved_precision);
    return os;
}
// Writes the object representation of `val` (sizeof(double) bytes, in the
// byte order of the running machine) to `os` and returns `os`.
std::ostream& write_native(std::ostream& os, double val)
{
    const char* const raw = reinterpret_cast<const char*>(&val);
    os.write(raw, sizeof val);
    return os;
}
// Writes `val` to `os` as an 8-byte IEEE 754 binary64 value, most
// significant byte first, independent of the host's own representation.
//
// Layout: 1 sign bit, 11 exponent bits (bias 1023), 52 fraction bits.
// Handles zero (including the sign of -0.0), subnormals, normal numbers up
// to and including DBL_MAX, infinities and NaN (written as a quiet NaN).
//
// If the host double has a magnitude that binary64 cannot represent, a
// message is printed to stderr, failbit is set on `os` and nothing is
// written.
//
// Returns `os`, so the stream state can be tested by the caller.
std::ostream& write_ieee(std::ostream& os, double val)
{
    const unsigned int sign = std::signbit(val) ? 1u : 0u;
    const double magnitude = std::fabs(val);

    unsigned int biased_exponent = 0;   // stays 0 for zero and subnormals
    unsigned long long mantissa = 0;    // the 52 stored fraction bits

    if (std::isnan(val))
    {
        biased_exponent = 0x7ffu;
        mantissa = 1ULL << 51;          // top fraction bit set: quiet NaN
    }
    else if (std::isinf(val))
    {
        biased_exponent = 0x7ffu;
    }
    else if (magnitude != 0.0)
    {
        // magnitude == significand * 2^power with significand in [0.5, 1),
        // i.e. (2*significand) * 2^(power-1) with 2*significand in [1, 2).
        int power = 0;
        const double significand = std::frexp(magnitude, &power);

        if (power > 1024 || power < -1073)
        {
            // Only reachable when the host double is wider than binary64.
            std::cerr << "ieee754: exponent out of range" << std::endl;
            os.setstate(std::ios::failbit);
            return os;
        }

        if (power >= -1021)
        {
            // Normal number: stored exponent is (power - 1) + 1023 and the
            // implicit leading 1 (the 0.5 of the significand) is dropped.
            biased_exponent = static_cast<unsigned int>(power + 1022);
            mantissa = static_cast<unsigned long long>(
                std::ldexp(significand - 0.5, 53));
        }
        else
        {
            // Subnormal: value == mantissa * 2^-1074 with exponent field 0.
            mantissa = static_cast<unsigned long long>(
                std::ldexp(magnitude, 1074));
        }
    }

    unsigned char bytes[8];
    bytes[0] = static_cast<unsigned char>((sign << 7) | (biased_exponent >> 4));
    bytes[1] = static_cast<unsigned char>(((biased_exponent & 0xfu) << 4) |
                                          ((mantissa >> 48) & 0xfu));
    for (int i = 2; i < 8; ++i)
    {
        bytes[i] = static_cast<unsigned char>((mantissa >> (8 * (7 - i))) & 0xffu);
    }
    return os.write(reinterpret_cast<const char*>(bytes), sizeof bytes);
}
// Extracts the next whitespace-delimited number from `is` into `val` using
// formatted input, and hands the stream back so the caller can test it.
std::istream& read_text(std::istream& is, double& val)
{
    is >> val;
    return is;
}
// Fills `val` with the next sizeof(double) bytes of `is`, taken verbatim as
// the object representation of a double, and returns `is`.
std::istream& read_native(std::istream& is, double& val)
{
    char* const raw = reinterpret_cast<char*>(&val);
    is.read(raw, sizeof val);
    return is;
}
// Reads one 8-byte IEEE 754 binary64 value, most significant byte first,
// from `is` and stores the decoded number in `val`.
//
// Layout: 1 sign bit, 11 exponent bits (bias 1023), 52 fraction bits.
// Decodes zero (keeping its sign), subnormals, normal numbers, infinities
// and NaN.
//
// If fewer than 8 bytes can be read the stream is left in a failed state and
// `val` is not modified.
//
// Returns `is`, so the stream state can be tested by the caller.
std::istream& read_ieee(std::istream& is, double& val)
{
    unsigned char bytes[8];
    if (!is.read(reinterpret_cast<char*>(bytes), sizeof bytes))
    {
        return is;
    }

    const bool negative = (bytes[0] & 0x80u) != 0;
    const int biased_exponent = ((bytes[0] & 0x7f) << 4) | (bytes[1] >> 4);

    // Low nibble of byte 1 followed by bytes 2..7: the 52 fraction bits.
    unsigned long long mantissa = bytes[1] & 0x0fu;
    for (int i = 2; i < 8; ++i)
    {
        mantissa = (mantissa << 8) | bytes[i];
    }
    const double fraction = static_cast<double>(mantissa);   // exact: < 2^52

    double magnitude;
    if (biased_exponent == 0x7ff)
    {
        magnitude = (mantissa != 0)
            ? std::numeric_limits<double>::quiet_NaN()
            : std::numeric_limits<double>::infinity();
    }
    else if (biased_exponent == 0)
    {
        // Zero and subnormals carry no implicit leading 1.
        magnitude = std::ldexp(fraction, -1074);
    }
    else
    {
        // (1 + fraction/2^52) * 2^(e-1023) == (0.5 + fraction/2^53) * 2^(e-1022)
        magnitude = std::ldexp(0.5 + std::ldexp(fraction, -53),
                               biased_exponent - 1022);
    }

    val = negative ? -magnitude : magnitude;
    return is;
}
// Prints the command-line synopsis to stderr and yields the failure exit
// status, so callers can simply `return usage(...)`.
static int usage(const char* program)
{
    std::cerr << "Usage: " << (program ? program : "<program>") << " <r(ead)|w(rite)>" <<
        " <t(ext)|n(ative)|i(eee)> <N> <filename>" << std::endl;
    return EXIT_FAILURE;
}

// Benchmark driver: reads or writes <N> doubles from/to <filename> in the
// selected storage format.
//
// Arguments (all required):
//   argv[1]  'r...' to read, 'w...' to write (first letter, case-insensitive)
//   argv[2]  't' text, 'n' or 'b' native binary, 'i' IEEE binary; any other
//            letter selects text
//   argv[3]  number of values, parsed by strtoul with base auto-detection
//   argv[4]  file name
//
// Returns EXIT_SUCCESS when all <N> values were processed, EXIT_FAILURE on a
// usage error, when the file cannot be opened, or on any read/write error.
int main(int argc, char** argv)
{
    if (argc != 5)
    {
        return usage(argv[0]);
    }

    // <cctype> functions require a value representable as unsigned char.
    const int direction = std::tolower(static_cast<unsigned char>(argv[1][0]));
    if (direction != 'r' && direction != 'w')
    {
        // Do not fall through to "write": a typo must not clobber a file.
        return usage(argv[0]);
    }
    const bool read_mode = (direction == 'r');

    char* num_end = NULL;
    const unsigned long num = std::strtoul(argv[3], &num_end, 0);
    if (num_end == argv[3] || *num_end != '\0')
    {
        std::cerr << "Invalid number of values \"" << argv[3] << '"' << std::endl;
        return usage(argv[0]);
    }

    Operation op_mode;
    switch (std::tolower(static_cast<unsigned char>(argv[2][0])))
    {
    case 't': default: op_mode = TEXT; break;
    case 'n': case 'b': op_mode = NATIVE; break;
    case 'i': op_mode = IEEE; break;
    }

    bool ok = true;

    //TODO: Insert timing code here
    if (read_mode)
    {
        std::ifstream is(argv[4], (op_mode == TEXT ? std::ios::in : std::ios::binary));
        if (!is.is_open())
        {
            std::cerr << "Cannot open file \"" << argv[4] << "\" for reading" << std::endl;
            return EXIT_FAILURE;
        }

        double value = 0.0;
        for (unsigned long count = 0; count < num; count++)
        {
            switch (op_mode)
            {
            case TEXT: read_text (is, value); break;
            case NATIVE: read_native(is, value); break;
            case IEEE: read_ieee (is, value); break;
            }
            if (!is)
            {
                if (is.eof())
                {
                    std::cerr << "Unexpected EOF after reading " << count
                              << " values from file \"" << argv[4] << '"' << std::endl;
                }
                else
                {
                    std::cerr << "Read error after reading " << count
                              << " values from file \"" << argv[4] << '"' << std::endl;
                }
                ok = false;
                break;
            }
#ifdef _DEBUG
            else
            {
                std::cout << value << '\n';
            }
#endif
        }
    }
    else
    {
        std::ofstream os(argv[4], (op_mode == TEXT ? std::ios::out : std::ios::binary));
        if (!os.is_open())
        {
            std::cerr << "Cannot open file \"" << argv[4] << "\" for writing" << std::endl;
            return EXIT_FAILURE;
        }

        double value = 0.0;
        for (unsigned long count = 0; count < num; count++)
        {
            value = std::rand();
            switch (op_mode)
            {
            case TEXT: write_text (os, value); break;
            case NATIVE: write_native(os, value); break;
            case IEEE: write_ieee (os, value); break;
            }
            if (!os)
            {
                std::cerr << "Write error after writing " << count
                          << " values to file \"" << argv[4] << '"' << std::endl;
                ok = false;
                break;
            }
#ifdef _DEBUG
            else
            {
                std::cout << value << '\n';
            }
#endif
        }

        // Data still sitting in the stream buffer only reaches the file on
        // flush; check it here so that a late failure is not lost when the
        // stream is destroyed.
        if (ok && !os.flush())
        {
            std::cerr << "Write error while flushing file \"" << argv[4] << '"' << std::endl;
            ok = false;
        }
    }
    //TODO: Insert timing code here

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
<end code>
Rune
Bart v Ingen Schenau
--
a.c.l.l.c-c++ FAQ: http://www.comeaucomputing.com/learn/faq
c.l.c FAQ: http://c-faq.com/
c.l.c++ FAQ: http://www.parashift.com/c++-faq-lite/
[ See http://www.gotw.ca/resources/clcm.htm for info about ]
[ comp.lang.c++.moderated. First time posters: Do this! ]