Re: convert 32bit numbers to 64bit (or float to double)

From:

Sebastian Gibb <lists@sebastiangibb.de>

Newsgroups:

comp.lang.c++

Date:

Sat, 26 Jun 2010 20:15:53 +0200

Message-ID:

<i05g4n$oha$1@speranza.aioe.org>

Hello,

Shorten it to the minimum and post it here. Most of us don't click on
links. At least I don't.

Sorry, I thought nobody would read long code without syntax highlighting.
#include <cmath>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;

// some constants from IEEE 754
const int nBitsSingleMantissa = 23;
const int nBitsSingleExzess = 8;
const int nBitsDoubleMantissa = 52;
const int nBitsDoubleExzess = 11;

// old method using by another cpp application
// it is my reference method
double convertWithCast(double value) {
  float x = value;
  return (double)x;
}

// try to simulate the same behaviour without using floats
struct IEEEBinary {
  int signedBit;
  vector<int > exzess;
  vector<int > mantissa;
};

vector<int > swapVectorOrder(const vector<int >& x) {
  vector<int > y;
  for (int i=x.size()-1; i >= 0; --i) {
    y.push_back(x[i]);
  }
  return y;
}

double calcExzess(int nEBits) {
  return pow(2, nEBits-1)-1;
}

IEEEBinary double2binary(double x, int nMBits, int nEBits) {
  // calculate mantissa
  // before point
  int pre = floor(abs(x));
  vector<int > preMantissa;

  while (pre != 0) {
    preMantissa.push_back(pre % 2);
    pre = floor(pre/2.0);
  }

  if (preMantissa.size() > 1)
    preMantissa = swapVectorOrder(preMantissa);

  // after point
  double post = x - floor(x);
  vector<int > postMantissa;
  for (unsigned int i=0; i<2*nMBits; ++i) {
    post = post * 2;
    int pre = floor(post);
    postMantissa.push_back(pre);
    post -= pre;
  }

  vector<int > mantissa = preMantissa;
  mantissa.insert(mantissa.end(), postMantissa.begin(), postMantissa.end());

  // normalize
  vector<int >::iterator it;

  for (it = mantissa.begin(); it != mantissa.end(); ++it) {
    if (*it == 1)
      break;
  }
  // save size for exzess calc
  unsigned int sMantissa = mantissa.size();
  // remove leading zeros and first 1
  it = mantissa.erase(mantissa.begin(), (it+1));
  // save new size for exzess calc
  unsigned int sMantissa2 = mantissa.size();

  // round
  if (mantissa.at(nMBits+1) == 1) {
    mantissa.at(nMBits) = 1;
  }

  // cut
  mantissa.erase(it+nMBits, mantissa.end());
  //mantissa.erase(mantissa.end());

  // exzess
  int ex = calcExzess(nEBits) + preMantissa.size() - (sMantissa-sMantissa2);

  vector<int > exzess;

  while (ex != 0) {
    exzess.push_back(ex % 2);
    ex = floor(ex/2.0);
  }

  // append zeros to exzess
  if (exzess.size() < nEBits) {
    for (unsigned int i=exzess.size(); i<nEBits; ++i)
      exzess.push_back(0);
  }

  exzess = swapVectorOrder(exzess);

  // signed bit
  int signedBit = 0;

  if (x < 0) {
    signedBit = 1;
  }

  // build binary struct
  IEEEBinary bin;
  bin.signedBit = signedBit;
  bin.mantissa = mantissa;
  bin.exzess = exzess;

  return bin;
}

double binary2double(const IEEEBinary& binary) {
  int exzess = 0;

  for (unsigned int i = 0; i < binary.exzess.size(); ++i)
    exzess += binary.exzess[i]*pow(2, binary.exzess.size()-(i+1));

  exzess -= calcExzess(binary.exzess.size());

  double value = pow(2, exzess);

  for (unsigned int i = 0; i < binary.mantissa.size(); ++i) {
    value += binary.mantissa[i]*pow(2, exzess-(int)(i+1));
  }

  if (binary.signedBit == 1)
    value *= (-1);

  return value;
}

// wrapper function
double convertWithoutCast(double value) {
  return binary2double(double2binary(value, nBitsSingleMantissa,
nBitsSingleExzess));
}

int main() {
  vector<double > testValues;
  testValues.push_back(1.0/3.0);
  testValues.push_back(18.4);
  testValues.push_back(0.1);
  testValues.push_back(999.4813232421875);

  for (vector<double >::iterator it=testValues.begin(); it !=
testValues.end(); ++it) {
    double oldConv = convertWithCast(*it);
    double newConv = convertWithoutCast(*it);

    if (oldConv != newConv) {
      cout << setprecision(22) << *it << ": " << oldConv << " != " <<
newConv << endl;
    }
  }

  return 0;
}

// the output:
0.3333333333333333148296:
0.3333333432674407958984 != 0.3333333134651184082031
0.1000000000000000055511:
0.1000000014901161193848 != 0.09999999403953552246094

// it works for
18.4 and 999.4813232421875

I think, I do something wrong because the old method with typical c-cast
returns a different value in comparison to my new method without c-cast.

Kind regards,

Sebastian