I have some templated vector code to unroll a loop, see below. I am
trying to test this code against a simple a*a + b*b + c*c vector dot
product to see if it truly is producing the same code. I ran the code
below and had a look at the assembly listing and it does look like the
inlining works, but unfortunately the optimising power of the compiler is
so great that it replaced my code with a simple:

     int main()
     {
         printf("%f", 14.0f);
         return 0;
     }

which is great, but not what I want for profiling purposes :-). I still
want some optimisation for inlining the code, just not so much. NOTE: I
only have the basic optimise-for-speed settings, nothing fancy like
global intrinsic functions....

// Minimal aggregate wrapper around a single storage member.
//
// Value is the storage type held in `_value`; instantiating with an array
// type (e.g. Vector<float[3]>) yields a fixed-size vector that can be
// brace-initialised like a plain array. The struct declares no constructors,
// so it remains an aggregate.
//
// The template parameter is spelled without a leading underscore because
// identifiers of the form _Uppercase are reserved for the implementation.
template<typename Value>
struct Vector
{
     Value _value;  // the wrapped storage (an array for Vector<T[N]>)
};

template<typename _Vector> struct VectorDotProd {};

template<typename _Type, int _Length>
struct VectorDotProd<Vector<_Type[_Length]> >
     typedef Vector<_Type[_Length]> _Vector;

     __inline static float Apply(_Vector const & vector0,
                    _Vector const & vector1)
         return Impl<_Type, _Length-1>::Apply(vector0, vector1);

     template<typename _Type, int _Index>
     struct Impl
         __inline static float Apply(_Vector const & vector0,
                    _Vector const & vector1)
             return vector0._value[_Index] * vector1._value[_Index]
                 + Impl<_Type, _Index-1>::Apply(vector0, vector1);

     template<typename _Type>
     struct Impl<_Type, 0>
         __inline static float Apply(_Vector const & vector0,
                    _Vector const & vector1)
             return vector0._value[0] * vector1._value[0];


int main()
     typedef Vector<float[3]> Vector3f;
     Vector3f v0 = {1.0f, 2.0f, 3.0f};
     Vector3f v1 = {1.0f, 2.0f, 3.0f};

     float result = VectorDotProd<Vector3f>::Apply(v0, v1);
     _RPTF1(_CRT_WARN, "v0.v1 = %f\n", result);

     printf("%f\n", result);

     return 0;

