STL std::inplace_merge and Visual 2008 performance
Hi,
I am currently migrating our code from Visual .Net 2003 (ie 7.1) to visual
2005 SP1 (8.0) and Visual 2008 SP1 (9.0).
We have some algorithm in our software using a lot the STL
"std::inplace_merge" function. I think I found a slowdown with the new
version, on a unitary test software (included below).
Visual 2005SP1: 345ms
Visual 2005SP1: 587ms
Intel C++ 10.1: 340ms (with VC2008)
(on a Windows XP, each of the executable includes the correct MSVCR/MSVCP
versions, tested on a quad-core processor hardware as well as reproduced on a
2 x Xeon PIV processors).
Analysis
--------
The code generated by Visual 2008 seems to have a lot of asm instr related
to the stack (using esp register) whereas the one from Visual 2005 doesn't
include a single one.
Questions
---------
1) Is there an obvious error on the compiler flags used ?
2) Is this slowdown related to the code generator (or optimizer) of Visual
2008 ?
I am puzzled by that because I think Visual C++ team runs serious
performance regression tests.
Any hint would be appreciated.
TIA,
Tof.
Precise versions used
---------------------
Intel compiler: 10.1.025
Visual 2005 compiler: 14.00.50727.762
Visual 2008 compiler: 15.00.30729.01
Code used
---------
1) .bat to compile things:
REM Visual 8.0
call "%VS80COMNTOOLS%\vsvars32.bat"
cl.exe /Zi /Ox /Ob2 /Oi /Oy /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D
"_USRDLL" /D "_VC80_UPGRADE=0x0710" /D "_WINDLL" /D "_MBCS" /FD /EHsc /MD
/arch:SSE2 /fp:fast /Zc:wchar_t- /D_CRT_SECURE_NO_WARNINGS
/D_CRT_SECURE_NO_DEPRECATE /D_SECURE_SCL=0 /D_SCL_SECURE_NO_DEPRECATE
/D_HAS_ITERATOR_DEBUGGING=0 /DNDEBUG /Fe"SortedVector2005SP1.exe"
SortedVector.cpp /link /OPT:REF /OPT:ICF
REM Visual 9.0
call "%VS90COMNTOOLS%\vsvars32.bat"
cl.exe /Zi /Ox /Ob2 /Oi /Oy /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D
"_USRDLL" /D "_VC80_UPGRADE=0x0710" /D "_WINDLL" /D "_MBCS" /FD /EHsc /MD
/arch:SSE2 /fp:fast /Zc:wchar_t- /D_CRT_SECURE_NO_WARNINGS
/D_CRT_SECURE_NO_DEPRECATE /D_SECURE_SCL=0 /D_SCL_SECURE_NO_DEPRECATE
/D_HAS_ITERATOR_DEBUGGING=0 /DNDEBUG /Fe"SortedVector2008SP1.exe"
SortedVector.cpp /link /OPT:REF /OPT:ICF
REM Intel 10.1
call "C:\Program Files\Intel\Compiler\C++\10.1.025\IA32\Bin\ICLVars.bat"
icl.exe /Zi /Ox /Ob2 /Oi /Oy /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D
"_USRDLL" /D "_VC80_UPGRADE=0x0710" /D "_WINDLL" /D "_MBCS" /FD /EHsc /MD
/arch:SSE2 /fp:fast /Zc:wchar_t- /D_CRT_SECURE_NO_WARNINGS
/D_CRT_SECURE_NO_DEPRECATE /D_SECURE_SCL=0 /D_SCL_SECURE_NO_DEPRECATE
/D_HAS_ITERATOR_DEBUGGING=0 /DNDEBUG /Fe"SortedVector2008SP1-intel.exe"
SortedVector.cpp /link /OPT:REF /OPT:ICF
pause
2) .bat to execute things
SortedVector2005SP1.exe
SortedVector2008SP1.exe
SortedVector2008SP1-intel.exe
pause
3) Code used (file named: SortedVector.cpp)
// SortedVector.cpp : Defines the entry point for the console application.
#include <windows.h>
#include <vector>
#include <set>
#include <algorithm>
#include <iostream>
#include <new.h>
#include <malloc.h>
// Define the functions for VTune
// In order to NOT ship VTuneAPI.DLL as a dependency to the client
// we need to dynically get the pointers to the pause/resume API.
// Hence this code has the advantage to make no implict compile or link
// dependencies towards VTune.
static void (__cdecl *VTResume)(void) = NULL;
static void (__cdecl *VTPause) (void) = NULL;
static bool VTuneInitialized = false;
// Init the VTPause and VTResume functions,
// and returns false if VTune is not present
bool InitVtunePointers()
{
HINSTANCE pVTuneDLL = LoadLibrary( "vtuneapi.dll" );
if( pVTuneDLL )
{
VTResume = (void(__cdecl *)())GetProcAddress( pVTuneDLL, "VTResume" );
VTPause = (void(__cdecl *)())GetProcAddress( pVTuneDLL, "VTPause" );
}
return !(NULL==pVTuneDLL);
}
class GeometryFace {
};
typedef std::vector<GeometryFace *> VectorContainer;
int main(int argc, char ** argv[])
{
static int LFH = 0;
LARGE_INTEGER li;
QueryPerformanceFrequency( &li );
InitVtunePointers();
HANDLE DefaultHeap = (HANDLE) _get_heap_handle();
if(NULL == DefaultHeap)
{
printf("There is a problem to compact the CRT heap");
exit(0);
}
else
{
if(0 == LFH)
{
LFH = 1;
// Copy - paste from MSDN
ULONG ulEnableLFH = 2;
if (HeapSetInformation((PVOID)DefaultHeap, HeapCompatibilityInformation,
&ulEnableLFH, sizeof(ulEnableLFH)))
printf("Enabling Low Fragmentation Heap succeeded\n");
else
printf("Enabling Low Fragmentation Heap failed\n");
}
bool ValidationRes = HeapValidate(DefaultHeap, 0, NULL);
SIZE_T MaxBlock = HeapCompact(DefaultHeap, 0);
printf("The maximum size for the CRT heap after compaction is %d,
validation is %s\n", (int) MaxBlock, (ValidationRes) ? "true" : "false");
}
//////////////////////////////////////
// Fill-in foo vector with random values
unsigned int MAX_ELEMENT = 2*1000*1000;
unsigned int ToRemoveNbr = 100;
unsigned int ToMergeNbr = 1000;
unsigned int IterationNbr = 100;
std::ostream_iterator<VectorContainer::value_type,
char,
std::char_traits<char> > out(std::cout, " ");
VectorContainer foo;
foo.reserve(MAX_ELEMENT);
std::set<int> RandomNumbers;
std::srand( 123456 );
int count=0;
while ( RandomNumbers.size() < MAX_ELEMENT )
{
RandomNumbers.insert( RandomNumbers.end(), (rand() * rand()) %
(MAX_ELEMENT * 10));
}
std::cout << std::endl;
std::set<int>::iterator RandomNumberIt = RandomNumbers.begin();
for(unsigned int i=0; i<MAX_ELEMENT; i++)
{
VectorContainer::value_type temp = (VectorContainer::value_type)
*(RandomNumberIt++);
foo.push_back(temp);
}
////////////////////////////////////////
// sort of vector
std::sort(foo.begin(), foo.end());
std::cout << "foo size: " << foo.size() << std::endl;
std::copy(foo.end () - 50, foo.end (), out);
std::cout << std::endl;
////////////////////////////////////////
// Copy of vector
VectorContainer bar(foo.begin(), foo.end());
std::cout << "bar size: " << foo.size() << std::endl;
std::copy(bar.end () - 50, bar.end (), out);
std::cout << std::endl;
///////////////////////////////////////
// LE vecteur a merger
VectorContainer ToBeMerged;
std::set<int> RandomNumbers2;
while ( RandomNumbers2.size() < ToMergeNbr )
{
RandomNumbers2.insert( RandomNumbers2.end(), (rand() * rand()) %
(MAX_ELEMENT * 10));
}
std::cout << std::endl;
std::set<int>::iterator RandomNumberIt2 = RandomNumbers2.begin();
for(unsigned int i=0; i<ToMergeNbr; i++)
{
VectorContainer::value_type temp = (VectorContainer::value_type)
*(RandomNumberIt2++);
ToBeMerged.push_back(temp);
}
std::sort(ToBeMerged.begin(), ToBeMerged.end());
////////////////////////////////////////
// inplace_merge srt --> dest
__int64 TotalTicks = 0;
LARGE_INTEGER timeBefore, timeAfter;
QueryPerformanceCounter( &timeBefore );
for(int ii=0; ii<100; ii++){
// Prep
int NewlyInserted = foo.size();
LARGE_INTEGER timeBefore, timeAfter;
for(unsigned int i=0; i<ToMergeNbr; i++)
{
foo.push_back(ToBeMerged[i]);
}
if(VTResume != NULL)
VTResume();
QueryPerformanceCounter( &timeBefore );
// Merge
//DebugBreak();
std::inplace_merge(foo.begin(),
foo.begin()+ NewlyInserted,
foo.end());
QueryPerformanceCounter( &timeAfter );
if(VTPause != NULL)
VTPause();
TotalTicks += timeAfter.QuadPart - timeBefore.QuadPart;
// On recommence
foo.clear();
foo.resize(bar.size());
std::copy(bar.begin(), bar.end(), foo.begin());
}
double timeInSeconds = (double)TotalTicks * 1000.0 / li.QuadPart;
printf("Took: %f ms\n", timeInSeconds);
return 0;
}