Performance problem with Pthread code
Hi,
I am running the following code on a 4-processor/8-core IBM Blade with
Red Hat Enterprise Linux Server release 5.4 (Tikanga) and GCC 4.1.2.
I am not getting any performance gain from pthread multi-threading.
I get the best performance with 1 thread, and it gets worse with 2, 4,
8, ... threads, which suggests that the threads are actually
running serially. Although I have C++ class definitions, I am not
actually using any C++ features, e.g. std::cout, or anything in the
associated classes.
Any suggestion will be much appreciated.
-----------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include "pthread.h"
#include "Particle.H"
#include "Space.H"
//---------------------------------------------------------------------------
// Number of simulation boxes; sizes the global box[] array.
#define NUM_BOXES 64
// Step count at which a box is considered finished (compared against box[i].step).
#define NUM_STEPS 10
// Extents passed to initialize() for every box.
#define BOX_X_SIZE 100
#define BOX_Y_SIZE 100
// Second argument of initParticles(); presumably the particle radius -- TODO confirm.
#define RADIUS 0.0
// Argument handed to moveParticles(); presumably the integration time step -- TODO confirm.
#define DT 0.01
// Last argument of initialize(); presumably a per-box particle capacity -- TODO confirm.
// NOTE(review): "PARTTICLES" is misspelled, but the name is referenced elsewhere.
#define MAX_PARTTICLES_PER_BOX 100000
// Per-worker record; a pointer to one of these is the argument of threadFunction().
typedef struct ThreadData
{
    int    id;      // Worker index, used in log messages.
    double mySum;   // Sum of the per-box results this worker has produced.
} ThreadData_t;
// One Space object per simulation box, indexed by the box id handed to workers.
Space box[NUM_BOXES];
// Running total accumulated by the workers; updates are guarded by sum_mutex.
double globalSum = 0.0;
// Number of boxes the master has counted as completed; the run ends at NUM_BOXES.
int eggCount = 0;
// Verbosity of the diagnostic printf output; set from the optional second argument.
int loglevel = 0;
// Serializes updates to globalSum.
pthread_mutex_t sum_mutex = PTHREAD_MUTEX_INITIALIZER;
// Single-slot mailbox through which the master offers one box id at a time
// to the worker threads.
typedef struct flag_s
{
    pthread_mutex_t cond_mutex;   // Guards `data` and pairs with cond_var.
    pthread_cond_t  cond_var;     // Workers sleep here until a box id is posted.
    int             data;         // Posted box id, or -1 when the slot is empty.
} flag;

// The one mailbox instance, statically initialized with an empty slot.
flag ourFlag = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, -1 };
// Worker-thread entry point; threadData_ points at the worker's ThreadData_t.
void *threadFunction( void *threadData_ );
//---------------------------------------------------------------------------
int main( int argc, char* argv[] )
{
if( argc < 2 )
{
printf("\nUsage: simulator <num_threads> [loglevel]\n\n");
return( 0 );
}
if( argc > 2 ) loglevel = atoi( argv[2] );
struct timeval time1, time2;
int numThreads = atoi( argv[1] );
printf("Number of threads: %d", numThreads );
if( argc > 2 ) loglevel = atoi( argv[2] );
//
// Initialize each box and particles inside it.
//
for( int i=0; i<NUM_BOXES; ++i )
{
box[i].initialize( 0, 0, BOX_X_SIZE, BOX_Y_SIZE,
MAX_PARTTICLES_PER_BOX );
int n = box[i].initParticles( (u_int)i, RADIUS );
printf("\nNumber of particles in box %d: %d", i, n );
}
//
// Create specified number of threads and assign NUM_BOXES/
num_threads
// boxes to each thread. The last thread may have less boxes than
others.
//
pthread_t *threads = (pthread_t
*)malloc( sizeof(pthread_t)*numThreads );
ThreadData_t *threadData =
(ThreadData_t *)malloc( sizeof(ThreadData_t)*numThreads );
//
// Initialize individual thread data.
//
for( int i=0; i<numThreads; ++i )
{
threadData[i].id = i;
threadData[i].mySum = 0.0;
}
for( int i=0; i<numThreads; ++i )
{
int rc = pthread_create( &threads[i], NULL, threadFunction,
(void*)&threadData[i] );
if( rc != 0 )
printf("\nERROR: Failed to launch thread %d\n", i );
}
sleep(5);
gettimeofday( &time1, NULL );
//
// Master distributes work to the thread pool here.
//
for( int i=0; eggCount<NUM_BOXES; i=(i+1)%NUM_BOXES )
{
if( box[i].step < NUM_STEPS ) // This box is not done yet.
{
int status = pthread_mutex_lock( &ourFlag.cond_mutex );
if( status != 0 )
{
printf("\nERROR: lock failed on cond_mutex.\n");
exit( -1 );
}
ourFlag.data = i; // Send box i to the thread pool.
status = pthread_cond_broadcast( &ourFlag.cond_var );
//status = pthread_cond_signal( &ourFlag.cond_var );
if( status != 0 )
{
printf("\nERROR: signal failed on cond_var.\n");
exit( -1 );
}
status = pthread_mutex_unlock( &ourFlag.cond_mutex );
if( status != 0 )
{
printf("\nERROR: unlock failed on cond_mutex.\n");
exit( -1 );
}
if( loglevel > 2 )
printf("\nWaiting on thread pool for box %d", i );
while( ourFlag.data != -1 ) // Wait until a worker picks this
box up.
{
if( eggCount >= NUM_BOXES ) break; // This should not happen!
}
if( loglevel > 2 ) printf("\nBox %d taken.", i );
}
else if( box[i].step == NUM_STEPS )
{
if( loglevel > 0 ) printf("\nBox %d just completed.", i );
++eggCount;
box[i].step++; // increment beyond NUM_STEP to discard this
box.
}
else // This box is already done, move to the next one.
{
if( loglevel > 1 ) printf("\nBox %d already completed.", i );
}
if( eggCount >= NUM_BOXES ) // Check if all boxes are already
done.
{
printf("\nAll boxes completed.");
printf("\n\t***Global sum of velocity squares: %.5f\n",
globalSum );
}
}
gettimeofday( &time2, NULL );
double etime = time2.tv_sec - time1.tv_sec +
( time2.tv_usec - time1.tv_usec )/1000000.0;
printf("\n\t***Elapsed time: %.5f seconds\n\n", etime );
for( int i=0; i<numThreads; ++i )
pthread_join( threads[i], NULL );
pthread_mutex_destroy( &sum_mutex );
free( threads );
free( threadData );
pthread_exit( NULL );
}
//---------------------------------------------------------------------------
void *threadFunction( void *threadData_ )
{
ThreadData_t *threadData = (ThreadData_t*)threadData_;
while( eggCount < NUM_BOXES )
{
int status = pthread_mutex_lock( &ourFlag.cond_mutex );
if( status != 0 )
{
printf("ERROR: lock failed on cond_mutex.\n");
exit( -1 );
}
while( ourFlag.data == -1 && eggCount < NUM_BOXES )
{
if( loglevel > 0 )
printf("\nThread blocking: %d", threadData->id );
status = pthread_cond_wait( &ourFlag.cond_var,
&ourFlag.cond_mutex );
if( status != 0 )
{
printf("ERROR: wait failed on condition variable.\n");
exit( -1 );
}
}
//
// Get the the box id the Master has given.
//
int boxId = ourFlag.data;
ourFlag.data = -1; // Let the Master know the given box is
taken.
if( loglevel > 0 )
printf("\nThread %d processing box %d", threadData->id, boxId );
status = pthread_mutex_unlock( &ourFlag.cond_mutex );
if( status != 0 )
{
printf("ERROR: unlock failed on cond_mutex.\n");
exit( -1 );
}
//
// Perform just 1 step on the given box here.
//
if( box[boxId].step < NUM_STEPS )
box[boxId].moveParticles( DT );
if( box[boxId].step >= NUM_STEPS ) // All steps done on this box.
{
//
// Sum up velocity squares in this box.
//
double boxSum = box[boxId].getVelocitySquare();
if( loglevel > 0 )
printf("\n\t***Total velocity square in box %d: %.5f",
boxId, boxSum );
//
// Accumulate velocity squares of boxes done by this thread.
//
threadData->mySum += boxSum ;
//
// Protect the shared data using mutex.
//
status = pthread_mutex_lock( &sum_mutex );
if( status != 0 )
{
printf("ERROR: lock failed on sum_mutex.\n");
exit( -1 );
}
globalSum += boxSum;
status = pthread_mutex_unlock( &sum_mutex );
if( status != 0 )
{
printf("ERROR: unlock failed on sum_mutex.\n");
exit( -1 );
}
}
} // end while( eggCount < NUM_BOXES )
return( threadData_ );
}
//---------------------------------------------------------------------------