Re: bench: binary trees
On Saturday, July 28, 2012 7:20:04 PM UTC-4, Melzzzzz wrote:
I have played a little tree with benchmark from computer language game
site:
http://shootout.alioth.debian.org/u64q/performance.php?test=binarytrees
Interested in gc vs manual I have been very surprised with Java
performance ;)
Seems that Java gc is blazingly fast in comparison with standard
new/delete.
Is it possible that the benchmark never causes GC to run? Is so, that woul=
d also mean any time it took to run destructors was also ignored in the Jav=
a case.
I think this is because gc deallocates in chunks rather piece by
piece, perhaps allocates from pool. Also there are no destructors
calls.
fastest C++ program on this site uses boost pool which calls
destructors and can deallocate, therefore, is slower then C variant
which uses apache's pool, which just deallocate.
So I have coded simple custom pool that does not keeps track of
allocated objects and doesn't call destructors and result is
that it is slightly faster than C ;)
What is clear is that Java garbage collector cannot be bitten
in this case if manual deletion of objects is required.
(as seen here)
http://shootout.alioth.debian.org/u64/performance.php?test=binarytrees
Timings for cpp with my custom alloc
bmaxa@maxa:~/shootout/trees$ time ./binarytreescpp 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m4.188s
user 0m7.036s
sys 0m0.112s
bmaxa@maxa:~/shootout/trees$ time taskset -c 0 ./binarytreescpp 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m7.077s
user 0m6.976s
sys 0m0.084s
Timings for C with apache pool:
bmaxa@maxa:~/shootout/trees$ time ./binarytreesc 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m4.625s
user 0m8.085s
sys 0m0.140s
bmaxa@maxa:~/shootout/trees$ time taskset -c 0 ./binarytreesc 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m8.009s
user 0m7.888s
sys 0m0.116s
code follows if someone is interested:
/* The Computer Language Benchmarks Game
* http://shootout.alioth.debian.org/
*
* contributed by Jon Harrop
* modified by Alex Mizrahi
* modified by Andreas Sch=E4fer
* very minor omp tweak by The Anh Tran
* custom pool Branimir Maksimovic
*/
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <new>
#include <pthread.h>
#include <boost/pool/object_pool.hpp>
const size_t LINE_SIZE = 64;
template <class T>
class Pool{
static const unsigned NELEM = 10000*sizeof(T);
public:
Pool()
: pos_(0)
{
pool_.push_back(alloc());
}
Pool(const Pool&)=delete;
Pool& operator=(const Pool&)=delete;
~Pool()
{
for(auto i : pool_)
{
free(i);
}
}
template <class ...T1>
T* allocate(T1... args)
{
if(NELEM <= pos_)
{
pool_.push_back(alloc());
pos_=0;
}
void* p = &pool_.back()[pos_];
pos_+=sizeof(T);
return new(p)T(args...);
}
private:
static char* alloc()
{
Inner* next = (Inner*)pthread_getspecific(k.key);
if(next == nullptr)
{
next = (Inner*)::operator new(NELEM);
next->next = nullptr;
}
char* p = (char*)next;
next = next->next;
pthread_setspecific(k.key,next);
return p;
}
static void free(void* p)
{
Inner* next = (Inner*)pthread_getspecific(k.key);
Inner* tmp = (Inner*)p;
tmp->next = next;
next = tmp;
pthread_setspecific(k.key,next);
}
std::vector<char*> pool_;
unsigned pos_;
struct Inner{
Inner* next;
};
static struct Key{
Key()
{
pthread_key_create(&key,destroy);
}
~Key()
{
pthread_key_delete(key);
}
static void destroy(void* p)
{
Inner* next = (Inner*)p;
while(next)
{
Inner* tmp = next->next;
::operator delete(next);
next = tmp;
}
}
pthread_key_t key;
}k;
};
template <class T>
typename Pool<T>::Key Pool<T>::k;
struct Node
{
Node *l, *r;
int i;
Node(int i2) :l(0),r(0), i(i2)
{}
Node(Node *l2, int i2, Node *r2) : l(l2), r(r2), i(i2)
{}
Node(const Node&)=delete;
Node& operator=(const Node&)=delete;
int check() const
{
if (l)
return l->check() + i - r->check();
else return i;
}
};
typedef boost::object_pool<Node> NodePool;
Node *make(int i, int d, Pool<Node>& p)
{
if (d > 0)
return p.allocate( make(2*i-1, d-1,p),
i,
make(2*i, d-1,p));
return p.allocate(i);
}
Node *make(int i, int d, NodePool& p)
{
if (d > 0)
return p.construct( make(2*i-1, d-1,p),
i,
make(2*i, d-1,p));
return p.construct(i);
}
int main(int argc, char *argv[])
{
int min_depth = 4;
int max_depth = std::max(min_depth+2,
(argc == 2 ? atoi(argv[1]) : 10));
int stretch_depth = max_depth+1;
// Alloc then dealloc stretchdepth tree
{
Pool<Node> c_pool;
//NodePool c_pool;
Node* c (make(0, stretch_depth,c_pool));
std::cout << "stretch tree of depth " << stretch_depth << "\t "
<< "check: " << c->check() << '\n';
}
Pool<Node> long_lived_tree_pool;
//NodePool long_lived_tree_pool;
Node* long_lived_tree(make(0, max_depth,long_lived_tree_pool));
char *outputstr = (char*)malloc(LINE_SIZE * (max_depth +1) * sizeof=
(char));
#pragma omp parallel for
for (int d = min_depth; d <= max_depth; d += 2)
{
int iterations = 1 << (max_depth - d + min_depth);
int c = 0;
for (int i = 1; i <= iterations; ++i)
{
Pool<Node> pool;
//NodePool pool;
Node *a(make(i, d,pool)), *b(make(-i, d,pool));
c += a->check() + b->check();
}
sprintf(outputstr + LINE_SIZE * d, "%d\t trees of depth %d\t check: %d\=
n", (2 * iterations), d, c);
}
for (int d = min_depth; d <= max_depth; d += 2)
printf("%s", outputstr + (d * LINE_SIZE) );
free(outputstr);
std::cout << "long lived tree of depth " << max_depth << "\t "
<< "check: " << (long_lived_tree->check()) << '\n';
return 0;
}
On Saturday, July 28, 2012 7:20:04 PM UTC-4, Melzzzzz wrote:
I have played a little tree with benchmark from computer language game
site:
http://shootout.alioth.debian.org/u64q/performance.php?test=binarytrees
Interested in gc vs manual I have been very surprised with Java
performance ;)
Seems that Java gc is blazingly fast in comparison with standard
new/delete.
I think this is because gc deallocates in chunks rather piece by
piece, perhaps allocates from pool. Also there are no destructors
calls.
fastest C++ program on this site uses boost pool which calls
destructors and can deallocate, therefore, is slower then C variant
which uses apache's pool, which just deallocate.
So I have coded simple custom pool that does not keeps track of
allocated objects and doesn't call destructors and result is
that it is slightly faster than C ;)
What is clear is that Java garbage collector cannot be bitten
in this case if manual deletion of objects is required.
(as seen here)
http://shootout.alioth.debian.org/u64/performance.php?test=binarytrees
Timings for cpp with my custom alloc
bmaxa@maxa:~/shootout/trees$ time ./binarytreescpp 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m4.188s
user 0m7.036s
sys 0m0.112s
bmaxa@maxa:~/shootout/trees$ time taskset -c 0 ./binarytreescpp 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m7.077s
user 0m6.976s
sys 0m0.084s
Timings for C with apache pool:
bmaxa@maxa:~/shootout/trees$ time ./binarytreesc 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m4.625s
user 0m8.085s
sys 0m0.140s
bmaxa@maxa:~/shootout/trees$ time taskset -c 0 ./binarytreesc 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m8.009s
user 0m7.888s
sys 0m0.116s
code follows if someone is interested:
/* The Computer Language Benchmarks Game
* http://shootout.alioth.debian.org/
*
* contributed by Jon Harrop
* modified by Alex Mizrahi
* modified by Andreas Sch=E4fer
* very minor omp tweak by The Anh Tran
* custom pool Branimir Maksimovic
*/
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <new>
#include <pthread.h>
#include <boost/pool/object_pool.hpp>
const size_t LINE_SIZE = 64;
template <class T>
class Pool{
static const unsigned NELEM = 10000*sizeof(T);
public:
Pool()
: pos_(0)
{
pool_.push_back(alloc());
}
Pool(const Pool&)=delete;
Pool& operator=(const Pool&)=delete;
~Pool()
{
for(auto i : pool_)
{
free(i);
}
}
template <class ...T1>
T* allocate(T1... args)
{
if(NELEM <= pos_)
{
pool_.push_back(alloc());
pos_=0;
}
void* p = &pool_.back()[pos_];
pos_+=sizeof(T);
return new(p)T(args...);
}
private:
static char* alloc()
{
Inner* next = (Inner*)pthread_getspecific(k.key);
if(next == nullptr)
{
next = (Inner*)::operator new(NELEM);
next->next = nullptr;
}
char* p = (char*)next;
next = next->next;
pthread_setspecific(k.key,next);
return p;
}
static void free(void* p)
{
Inner* next = (Inner*)pthread_getspecific(k.key);
Inner* tmp = (Inner*)p;
tmp->next = next;
next = tmp;
pthread_setspecific(k.key,next);
}
std::vector<char*> pool_;
unsigned pos_;
struct Inner{
Inner* next;
};
static struct Key{
Key()
{
pthread_key_create(&key,destroy);
}
~Key()
{
pthread_key_delete(key);
}
static void destroy(void* p)
{
Inner* next = (Inner*)p;
while(next)
{
Inner* tmp = next->next;
::operator delete(next);
next = tmp;
}
}
pthread_key_t key;
}k;
};
template <class T>
typename Pool<T>::Key Pool<T>::k;
struct Node
{
Node *l, *r;
int i;
Node(int i2) :l(0),r(0), i(i2)
{}
Node(Node *l2, int i2, Node *r2) : l(l2), r(r2), i(i2)
{}
Node(const Node&)=delete;
Node& operator=(const Node&)=delete;
int check() const
{
if (l)
return l->check() + i - r->check();
else return i;
}
};
typedef boost::object_pool<Node> NodePool;
Node *make(int i, int d, Pool<Node>& p)
{
if (d > 0)
return p.allocate( make(2*i-1, d-1,p),
i,
make(2*i, d-1,p));
return p.allocate(i);
}
Node *make(int i, int d, NodePool& p)
{
if (d > 0)
return p.construct( make(2*i-1, d-1,p),
i,
make(2*i, d-1,p));
return p.construct(i);
}
int main(int argc, char *argv[])
{
int min_depth = 4;
int max_depth = std::max(min_depth+2,
(argc == 2 ? atoi(argv[1]) : 10));
int stretch_depth = max_depth+1;
// Alloc then dealloc stretchdepth tree
{
Pool<Node> c_pool;
//NodePool c_pool;
Node* c (make(0, stretch_depth,c_pool));
std::cout << "stretch tree of depth " << stretch_depth << "\t "
<< "check: " << c->check() << '\n';
}
Pool<Node> long_lived_tree_pool;
//NodePool long_lived_tree_pool;
Node* long_lived_tree(make(0, max_depth,long_lived_tree_pool));
char *outputstr = (char*)malloc(LINE_SIZE * (max_depth +1) * sizeof=
(char));
#pragma omp parallel for
for (int d = min_depth; d <= max_depth; d += 2)
{
int iterations = 1 << (max_depth - d + min_depth);
int c = 0;
for (int i = 1; i <= iterations; ++i)
{
Pool<Node> pool;
//NodePool pool;
Node *a(make(i, d,pool)), *b(make(-i, d,pool));
c += a->check() + b->check();
}
sprintf(outputstr + LINE_SIZE * d, "%d\t trees of depth %d\t check: %d\=
n", (2 * iterations), d, c);
}
for (int d = min_depth; d <= max_depth; d += 2)
printf("%s", outputstr + (d * LINE_SIZE) );
free(outputstr);
std::cout << "long lived tree of depth " << max_depth << "\t "
<< "check: " << (long_lived_tree->check()) << '\n';
return 0;
}
On Saturday, July 28, 2012 7:20:04 PM UTC-4, Melzzzzz wrote:
I have played a little tree with benchmark from computer language game
site:
http://shootout.alioth.debian.org/u64q/performance.php?test=binarytrees
Interested in gc vs manual I have been very surprised with Java
performance ;)
Seems that Java gc is blazingly fast in comparison with standard
new/delete.
I think this is because gc deallocates in chunks rather piece by
piece, perhaps allocates from pool. Also there are no destructors
calls.
fastest C++ program on this site uses boost pool which calls
destructors and can deallocate, therefore, is slower then C variant
which uses apache's pool, which just deallocate.
So I have coded simple custom pool that does not keeps track of
allocated objects and doesn't call destructors and result is
that it is slightly faster than C ;)
What is clear is that Java garbage collector cannot be bitten
in this case if manual deletion of objects is required.
(as seen here)
http://shootout.alioth.debian.org/u64/performance.php?test=binarytrees
Timings for cpp with my custom alloc
bmaxa@maxa:~/shootout/trees$ time ./binarytreescpp 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m4.188s
user 0m7.036s
sys 0m0.112s
bmaxa@maxa:~/shootout/trees$ time taskset -c 0 ./binarytreescpp 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m7.077s
user 0m6.976s
sys 0m0.084s
Timings for C with apache pool:
bmaxa@maxa:~/shootout/trees$ time ./binarytreesc 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m4.625s
user 0m8.085s
sys 0m0.140s
bmaxa@maxa:~/shootout/trees$ time taskset -c 0 ./binarytreesc 20
stretch tree of depth 21 check: -1
2097152 trees of depth 4 check: -2097152
524288 trees of depth 6 check: -524288
131072 trees of depth 8 check: -131072
32768 trees of depth 10 check: -32768
8192 trees of depth 12 check: -8192
2048 trees of depth 14 check: -2048
512 trees of depth 16 check: -512
128 trees of depth 18 check: -128
32 trees of depth 20 check: -32
long lived tree of depth 20 check: -1
real 0m8.009s
user 0m7.888s
sys 0m0.116s
code follows if someone is interested:
/* The Computer Language Benchmarks Game
* http://shootout.alioth.debian.org/
*
* contributed by Jon Harrop
* modified by Alex Mizrahi
* modified by Andreas Sch=E4fer
* very minor omp tweak by The Anh Tran
* custom pool Branimir Maksimovic
*/
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <new>
#include <pthread.h>
#include <boost/pool/object_pool.hpp>
const size_t LINE_SIZE = 64;
template <class T>
class Pool{
static const unsigned NELEM = 10000*sizeof(T);
public:
Pool()
: pos_(0)
{
pool_.push_back(alloc());
}
Pool(const Pool&)=delete;
Pool& operator=(const Pool&)=delete;
~Pool()
{
for(auto i : pool_)
{
free(i);
}
}
template <class ...T1>
T* allocate(T1... args)
{
if(NELEM <= pos_)
{
pool_.push_back(alloc());
pos_=0;
}
void* p = &pool_.back()[pos_];
pos_+=sizeof(T);
return new(p)T(args...);
}
private:
static char* alloc()
{
Inner* next = (Inner*)pthread_getspecific(k.key);
if(next == nullptr)
{
next = (Inner*)::operator new(NELEM);
next->next = nullptr;
}
char* p = (char*)next;
next = next->next;
pthread_setspecific(k.key,next);
return p;
}
static void free(void* p)
{
Inner* next = (Inner*)pthread_getspecific(k.key);
Inner* tmp = (Inner*)p;
tmp->next = next;
next = tmp;
pthread_setspecific(k.key,next);
}
std::vector<char*> pool_;
unsigned pos_;
struct Inner{
Inner* next;
};
static struct Key{
Key()
{
pthread_key_create(&key,destroy);
}
~Key()
{
pthread_key_delete(key);
}
static void destroy(void* p)
{
Inner* next = (Inner*)p;
while(next)
{
Inner* tmp = next->next;
::operator delete(next);
next = tmp;
}
}
pthread_key_t key;
}k;
};
template <class T>
typename Pool<T>::Key Pool<T>::k;
struct Node
{
Node *l, *r;
int i;
Node(int i2) :l(0),r(0), i(i2)
{}
Node(Node *l2, int i2, Node *r2) : l(l2), r(r2), i(i2)
{}
Node(const Node&)=delete;
Node& operator=(const Node&)=delete;
int check() const
{
if (l)
return l->check() + i - r->check();
else return i;
}
};
typedef boost::object_pool<Node> NodePool;
Node *make(int i, int d, Pool<Node>& p)
{
if (d > 0)
return p.allocate( make(2*i-1, d-1,p),
i,
make(2*i, d-1,p));
return p.allocate(i);
}
Node *make(int i, int d, NodePool& p)
{
if (d > 0)
return p.construct( make(2*i-1, d-1,p),
i,
make(2*i, d-1,p));
return p.construct(i);
}
int main(int argc, char *argv[])
{
int min_depth = 4;
int max_depth = std::max(min_depth+2,
(argc == 2 ? atoi(argv[1]) : 10));
int stretch_depth = max_depth+1;
// Alloc then dealloc stretchdepth tree
{
Pool<Node> c_pool;
//NodePool c_pool;
Node* c (make(0, stretch_depth,c_pool));
std::cout << "stretch tree of depth " << stretch_depth << "\t "
<< "check: " << c->check() << '\n';
}
Pool<Node> long_lived_tree_pool;
//NodePool long_lived_tree_pool;
Node* long_lived_tree(make(0, max_depth,long_lived_tree_pool));
char *outputstr = (char*)malloc(LINE_SIZE * (max_depth +1) * sizeof=
(char));
#pragma omp parallel for
for (int d = min_depth; d <= max_depth; d += 2)
{
int iterations = 1 << (max_depth - d + min_depth);
int c = 0;
for (int i = 1; i <= iterations; ++i)
{
Pool<Node> pool;
//NodePool pool;
Node *a(make(i, d,pool)), *b(make(-i, d,pool));
c += a->check() + b->check();
}
sprintf(outputstr + LINE_SIZE * d, "%d\t trees of depth %d\t check: %d\=
n", (2 * iterations), d, c);
}
for (int d = min_depth; d <= max_depth; d += 2)
printf("%s", outputstr + (d * LINE_SIZE) );
free(outputstr);
std::cout << "long lived tree of depth " << max_depth << "\t "
<< "check: " << (long_lived_tree->check()) << '\n';
return 0;
}