Forward Scattering - The Weblog of Nicholas Chapman

Multiplication in C++ is left to right associative - e.g.

y = a * b * c

means

y = (a * b) * c

Note that although the mathematical operation of multiplication is associative, e.g. (a * b) * c = a * (b * c), the same is not true for floating point multiplication, due to intermediate rounding error.

Therefore in default circumstances, the compiler may not optimise

y = a * b * c

y = a * (b * c)

as it may give different results. However, if 'fast math' mode is enabled, compilers may in theory perform this 'reassociation'.

This can be important for a number of reasons - one is that b and c may be constants, and the y = a * (b * c) may be inside a loop. Performing reassociation means that the b*c multiplication may be 'hoisted' outside the loop, so only one multiplication is done instead of two.

Clang performs this reassociation optimisation:


float f()
{
  	const float b = 2.f;
	const float c = 3.f;
	float sum = 1.f;
	for(int i=0; i<1024; ++i)
	{
		const float a = (float)(sum + 1.f);
	
		const float y = a * b * c;
		sum += y;
	}
  
  return sum;
}

gives:

.LCPI0_0:
	.long	1065353216              # float 1
.LCPI0_1:
	.long	1086324736              # float 6
f():                                  # @f()
	movss	.LCPI0_0(%rip), %xmm1
	movl	$1024, %eax             # imm = 0x400
	movss	.LCPI0_1(%rip), %xmm2
	movaps	%xmm1, %xmm0
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
	movaps	%xmm0, %xmm3
	addss	%xmm1, %xmm0
	mulss	%xmm2, %xmm0
	addss	%xmm3, %xmm0
	decl	%eax
	jne	.LBB0_1
	ret

(See the program on http://gcc.godbolt.org/)

Note that there is only one multiplication (the mulss instruction) inside the loop. Also b * c is precomputed. (note the 'float 6' constant).

Visual Studio 2012, however, gives:

static float func()
{
00000001408750A0  sub         rsp,18h  
  	const float b = 2.f;
	const float c = 3.f;
	float sum = 1.f;
00000001408750A4  movss       xmm4,dword ptr [__real@3f800000 (0140D9C67Ch)]  
00000001408750AC  movss       xmm5,dword ptr [__real@40000000 (0140D9CBF8h)]  
00000001408750B4  movaps      xmmword ptr [rsp],xmm6  
00000001408750B8  mov         eax,80h  
00000001408750BD  movaps      xmm3,xmm4  
00000001408750C0  movss       xmm6,dword ptr [__real@40400000 (0140D9CD58h)]  
00000001408750C8  nop         dword ptr [rax+rax]  
	for(int i=0; i<1024; ++i)
	{
		const float a = (float)(sum + 1.f);
00000001408750D0  movaps      xmm0,xmm3  
00000001408750D3  addss       xmm0,xmm4  
	
		const float y = a * b * c;
00000001408750D7  mulss       xmm0,xmm5  
00000001408750DB  mulss       xmm0,xmm6  
		sum += y;
00000001408750DF  addss       xmm3,xmm0  
00000001408750E3  movaps      xmm1,xmm3  
00000001408750E6  addss       xmm1,xmm4  
00000001408750EA  mulss       xmm1,xmm5  
		sum += y;
00000001408750EE  mulss       xmm1,xmm6  
00000001408750F2  addss       xmm3,xmm1  
00000001408750F6  movaps      xmm0,xmm3  
00000001408750F9  addss       xmm0,xmm4  
00000001408750FD  mulss       xmm0,xmm5  
0000000140875101  mulss       xmm0,xmm6  
0000000140875105  addss       xmm3,xmm0  
0000000140875109  movaps      xmm1,xmm3  
000000014087510C  addss       xmm1,xmm4  
0000000140875110  mulss       xmm1,xmm5  
0000000140875114  mulss       xmm1,xmm6  
0000000140875118  addss       xmm3,xmm1  
000000014087511C  movaps      xmm0,xmm3  
000000014087511F  addss       xmm0,xmm4  
0000000140875123  mulss       xmm0,xmm5  
0000000140875127  mulss       xmm0,xmm6  
000000014087512B  addss       xmm3,xmm0  
000000014087512F  movaps      xmm1,xmm3  
0000000140875132  addss       xmm1,xmm4  
0000000140875136  mulss       xmm1,xmm5  
000000014087513A  mulss       xmm1,xmm6  
000000014087513E  addss       xmm3,xmm1  
0000000140875142  movaps      xmm2,xmm3  
0000000140875145  addss       xmm2,xmm4  
0000000140875149  mulss       xmm2,xmm5  
000000014087514D  mulss       xmm2,xmm6  
0000000140875151  addss       xmm3,xmm2  
0000000140875155  movaps      xmm1,xmm3  
0000000140875158  addss       xmm1,xmm4  
000000014087515C  mulss       xmm1,xmm5  
0000000140875160  mulss       xmm1,xmm6  
0000000140875164  addss       xmm3,xmm1  
0000000140875168  dec         rax  
000000014087516B  jne         func+30h (01408750D0h)  
	}
  
	return sum;
0000000140875171  movaps      xmm0,xmm3  
}
0000000140875174  movaps      xmm6,xmmword ptr [rsp]  
0000000140875178  add         rsp,18h  
000000014087517C  ret

Although VS has unrolled the loop a bit, you can see that there are two multiplications done, with a dependency between them, e.g. the first has to finish before the second is done. So VS doesn't seem to do the reassociation optimisation, even with fast maths enabled (/fp:fast), which is unfortunate.

EDIT: Bug posted here: https://connect.microsoft.com/VisualStudio/feedback/details/1075102.

Also optimisation still not done in VS 2013.