Vectorization of template functions
 
Hello all,
I am attempting to vectorize a few template functions with the Intel
compiler, but without much success so far. Ok granted, this question
is not 100% c++, but it is related enough that I felt I could post it
here. Also, I did ask in the Intel forums, without much success. And
maybe there are some c++ coders in here that are familiar with the
Intel compiler.
The code below highlights the problem I have. Essentially, I have a
template class that handles images (i.e. large matrices). The template
parameter represents the data type held by the image (e.g. uint8,
float, ...).
I have many functions that apply certain filters to these images, and
I would like to vectorize them. Obviously, since the argument of the
functions is an instance of a template class (the image), the function
itself is a template. The name of the problematic function is 'test',
defined towards the bottom of the code. 'test' is called in 2
different ways.
First, in the main, I defined 2 images, 'gray' and 'tmp', and call
'test' using them as argument. Now when I compile with -QxN, the inner
loop within 'test' is vectorized. Good.
However, I cannot always have the definition of the template function
in the same file as the main, because there are simply too many
template functions defined. Hence I define the template in another
source file, but I have to instantiate it with the actual parameters
it is going to be used, otherwise the linker will not find the code. I
tried to reproduce this organization in a single file. Just below the
definition of 'test', I inserted a line that instantiates 'test' for
the parameters with which it is going to be used. The compiler
understands that correctly, but now says that it cannot vectorize the
inner loop! The message is : "loop was not vectorized: dereference too
complex". Why is the dereference now too complex, when the compiler
handled it just fine for the other instantiation in the main?!
Any help would be much appreciated.
Alex
##### CUT #####
#include <windows.h>
#include <stdio.h>
#include <math.h>
// Short aliases for the element types an image can hold.
typedef unsigned char u8;  // unsigned byte element
typedef float f32;         // single-precision floating-point element
////////////////////
///// MEMORY ROUTINES
////////////////////
// Alignment, in bytes, requested for every buffer handed out by
// AllocateMemory; ComputeAlignedWidth also derives the row padding from it.
enum { MemoryAlignment = 64 };
// Allocates `size` bytes through _aligned_malloc, requesting an address
// aligned to MemoryAlignment bytes. Returns whatever _aligned_malloc returns;
// the block must be given back with ReleaseMemory.
void* AllocateMemory(size_t size)
{
    void* const block = _aligned_malloc(size, MemoryAlignment);
    return block;
}
// Hands `memblock` to _aligned_free, the counterpart of the _aligned_malloc
// call made in AllocateMemory.
void ReleaseMemory(void *memblock)
{
    _aligned_free(memblock);
}
// Rounds `width` (a count of elements) up to the nearest multiple of the
// number of floats that fit in MemoryAlignment bytes, and returns that
// padded width.
//
// The rounding is done in pure integer arithmetic. Going through `float`
// (as ceil((float)width / n) would) silently loses precision once `width`
// exceeds 2^24 and can then yield a padded width *smaller* than `width`.
// The quotient/remainder form below matches ceil() for every int input,
// including zero and negative values, and never adds to `width`, so it
// cannot overflow before the final multiplication.
int ComputeAlignedWidth(int width)
{
    const int elems_per_block =
        static_cast<int>(MemoryAlignment / sizeof(float));

    int blocks = width / elems_per_block;
    if (width % elems_per_block > 0)
        ++blocks;  // partial block: round up

    return blocks * elems_per_block;
}
////////////////////
///// CLASS DECLARATION
////////////////////
// Plain image container holding `depth` planes of `height` rows, each row
// stored with `width_padded` elements of type T (only the first `width`
// elements of a row are image data).
//
// The struct does not manage the lifetime of `data` automatically: the
// buffer is obtained with Allocate() and must be returned with Release().
// The destructor is empty and copies are shallow.
template <typename T>
struct Image
{
public: // members
    // logical dimensions of the image
    int width, height, depth;

    // actual number of elements per row in the buffer; rows are padded
    // (see ComputeAlignedWidth) for optimisation purposes
    int width_padded;

    // inclusive index range of the valid rows and columns
    int firstRow, lastRow, firstCol, lastCol;

    // pointer to the image data (NULL when nothing is allocated)
    T* data;

public: // methods
    // Creates an empty image: all dimensions zero, no buffer.
    Image():
        width(0), height(0), depth(0),
        width_padded(0),
        firstRow(0), lastRow(0), firstCol(0), lastCol(0),
        data(NULL)
    {
    }

    // Does not free `data`; call Release() explicitly.
    ~Image()
    {
    }

    // Allocates a buffer of width_padded * height * depth elements using the
    // dimensions previously stored by SetDimensions(). Any buffer already
    // held in `data` is NOT freed here. `data` is set to whatever
    // AllocateMemory returns, so callers should check it before use.
    void Allocate()
    {
        // Widen each factor before multiplying: an int * int * int product
        // overflows for large images before it is ever converted to size_t.
        const size_t element_count = static_cast<size_t>(width_padded)
                                   * static_cast<size_t>(height)
                                   * static_cast<size_t>(depth);
        data = static_cast<T*>(AllocateMemory(element_count * sizeof(T)));
    }

    // Returns the buffer to ReleaseMemory and clears the pointer so the
    // image never keeps a dangling `data` and a repeated Release() does not
    // hand the same block back twice.
    void Release()
    {
        ReleaseMemory(data);
        data = NULL;
    }

    // pixel access is left to derived types:
    // virtual T& operator() (int row, int col)

    // Stores the logical dimensions, derives the padded row width and resets
    // the valid row/column range to the whole image. Does not (re)allocate.
    void SetDimensions(int h, int w, int d)
    {
        height = h;
        width = w;
        depth = d;
        width_padded = ComputeAlignedWidth(width);
        firstRow = 0;
        firstCol = 0;
        lastRow = height - 1;
        lastCol = width - 1;
    }

    // Size of the whole image in bytes; with `padded` the row padding is
    // included.
    int GetTotalSize(bool padded = false) const
    {
        if (padded) return width_padded * height * depth * sizeof(T);
        else        return width * height * depth * sizeof(T);
    }

    // Number of elements in the whole image (all planes).
    int GetImageSize(bool padded = false) const
    {
        if (padded) return width_padded * height * depth;
        else        return width * height * depth;
    }

    // Number of elements in a single plane.
    int GetPlaneSize(bool padded = false) const
    {
        if (padded) return width_padded * height;
        else        return width * height;
    }
};
// Single-plane image: an Image<T> whose depth starts at 1, with element
// access by (row, col).
//
// Image<T> is a dependent base class, so its members and its constructor
// are not found by unqualified lookup inside this template. They are
// therefore named explicitly (`Image<T>()`, `this->member`), which is what
// standard two-phase name lookup requires; compilers that accept the
// unqualified spelling do so as an extension.
template <typename T>
struct GrayImage : public Image<T>
{
public: // methods
    // Creates an empty image with depth preset to 1.
    GrayImage():
        Image<T>()
    {
        this->depth = 1;
    }

    // Returns a reference to the element at (row, col). Rows are
    // `width_padded` elements apart. No bounds checking is performed.
    T& operator() (int row, int col)
    {
        return this->data[row * this->width_padded + col];
    }
};
// Writes input(row, col) + 1 into output(row, col) for every row in
// [input.firstRow, input.lastRow] and every column in
// [input.firstCol, input.lastCol].
//
// `output` must have a buffer covering at least the same row/column range;
// this is not checked. The two images are assumed not to overlap, which is
// what the `ivdep` pragma on the inner loop tells the compiler.
template <typename T>
void test(GrayImage<T> &input, GrayImage<T> &output)
{
    const int firstR = input.firstRow, lastR = input.lastRow;
    const int firstC = input.firstCol, lastC = input.lastCol;

    for (int row = firstR; row <= lastR; ++row) {
        // The row offset does not change inside the inner loop, so compute
        // the row base pointers once. The inner loop is then a plain
        // unit-stride load/add/store on two pointers, with no member loads
        // or index multiplications left in its body.
        // NOTE(review): intended to be easier for the vectorizer to analyse;
        // confirm with the compiler's vectorization report.
        const T* src = input.data  + row * input.width_padded;
        T*       dst = output.data + row * output.width_padded;
#pragma ivdep
        for (int col = firstC; col <= lastC; ++col) {
            dst[col] = src[col] + 1;
        }
    }
}
// Explicit instantiation of test() for f32 images, so the definition is
// emitted in this translation unit even without a call site.
template void test<f32>(GrayImage<f32>&, GrayImage<f32>&);
// Driver: builds two 2000x2000 single-plane f32 images, runs test() on them
// and frees the buffers. Returns 0 on success, 1 if a buffer could not be
// allocated.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    GrayImage<f32> gray, tmp;

    gray.SetDimensions(2000, 2000, 1);
    gray.Allocate();

    tmp.SetDimensions(gray.height, gray.width, 1);
    tmp.Allocate();

    // Allocate() stores whatever the allocator returned, so a failed
    // allocation must be caught here before test() dereferences the buffers.
    if (gray.data == NULL || tmp.data == NULL) {
        fprintf(stderr, "failed to allocate image buffers\n");
        gray.Release();
        tmp.Release();
        return 1;
    }

    // NOTE: the contents of `gray` are never initialised; only the loop
    // itself matters for this experiment, not the values it produces.
    test(gray, tmp);

    gray.Release();
    tmp.Release();
    return 0;
}
##### CUT #####