As we saw in the main text, the compiler generated this version:
times2(int):
lea eax, [rdi+rdi]
ret
Interestingly, it didn't do that in the most naive way possible, which would be:
times2(int):
mov eax, edi
add eax, eax
ret
We need to understand some basics of x86-64 assembly to really get how this
works. First, the registers beginning with r are 64-bit registers, and the
registers beginning with e are 32-bit. The e registers are not really new
registers but the lower 32 bits of the respective r register. So, for example,
eax refers to the lower 32 bits of rax, and edi to the lower 32 bits of rdi.
In the Godbolt compilers we target, int corresponds to a 32-bit integer, which
is why we're using e-prefixed registers.
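To see the r registers in action, we only need a 64-bit type. This variant is my addition, not from the main text, but if we hand the compiler a long version of times2, I'd expect GCC -O2 to produce something like the assembly below:

long times2_64(long n) {
    return n*2;
}

times2_64(long):
lea rax, [rdi+rdi]
ret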
Also, due to the x86-64 Application Binary Interface (ABI), the first integer
argument to a function goes in the rdi register, which is why we're loading it
from there (well, from its 32-bit part, edi) into eax. But, you may think:
"Since the value is already in edi, why don't we do the addition in edi and
return that?" In other words, why not do this?
times2(int):
add edi, edi
ret edi
The problem is that ret doesn't accept an argument; it simply returns to where
the function was called from. So, we need a convention in the ABI for where the
callee should place a returned value, and for integer values, that's eax. So,
we can't willy-nilly decide to place the return value in edi.
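For completeness, if we insisted on doing the addition in edi, we'd still have to move the result into eax before returning. My sketch of that (not compiler output) ends up with just as many instructions as the naive version, so there's nothing to gain:

times2(int):
add edi, edi
mov eax, edi
ret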
Let's now come back to the original code, which uses the lea instruction. The
name comes from "Load Effective Address". lea was created for address
computations, which usually look like x*y + z, for some x, y, and z (e.g., p[5]
translates to (uint8_t*)p + 4*5 if p has type int*). To allow such computations,
lea allows pretty complex expressions to be computed in a single instruction.
One such expression is x+x.
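To get a feel for this, here are two hypothetical functions (mine, not from the main text) where I'd expect GCC -O2 to use lea's base + index*scale + displacement form; the expected assembly is in the comments:

int times5(int n) {
    // expected: lea eax, [rdi+rdi*4], i.e., n + n*4
    return n*5;
}

int scaled_offset(int n) {
    // expected: lea eax, [rdi*8+3], i.e., n*8 + 3
    return n*8 + 3;
}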
So, the code the compiler output is better than our naive version because it uses fewer instructions, which is better both in terms of speed and code size.1 This is also why the compiler didn't use:
times2(int):
mov eax, edi
shl eax, 1
ret
That's a left shift, i.e.:
int times2(int n) {
    return n<<1;
}
That's what most people who've had some experience with optimization would expect, since a general rule of thumb is that multiplications by powers of 2 should be replaced by left shifts. But again, that takes more instructions for no gain.
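The rule of thumb isn't useless, though: lea's scale factor maxes out at 8, so for larger powers of 2 I'd expect the compiler to fall back to a shift. A hypothetical example (again, not from the main text):

int times16(int n) {
    // expected (GCC -O2 style): mov eax, edi / sal eax, 4
    return n*16;
}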
Finally, the lea instruction computes a 64-bit value but moves the low 32 bits
to eax. Why does it do a 64-bit computation? It seems it could have been:
times2(int):
lea eax, [edi+edi]
ret
Thankfully, the Reddit community helped in getting to a possible answer (comment
by Bryce Wilson)! Using the 64-bit register is the default, which means we need
to communicate less information, which in turn means smaller code size. In
short, the rdi version needs one less byte (godbolt).
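Concretely, 64-bit addressing is the default in 64-bit mode, while using 32-bit registers in the address requires the 0x67 address-size override prefix. Assuming I'm reading the encodings right, that's exactly where the extra byte goes:

lea eax, [rdi+rdi]   ; 8D 04 3F      (3 bytes)
lea eax, [edi+edi]   ; 67 8D 04 3F   (4 bytes; note the extra 0x67)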
My translation of ICX's version annotated with the original assembly
Float2 ProjectSphere(float x, float z, float r, float t, float RS) {
    // movaps xmm5, xmm3
    // mulss xmm5, xmm1
    float tz = t*z;
    // movaps xmm6, xmm2
    // mulss xmm6, xmm0
    float rx = r*x;
    // mulss xmm3, xmm0
    float tx = t*x;
    // mulss xmm2, xmm1
    float rz = r*z;
    // movaps xmm1, xmm5
    // addss xmm1, xmm6
    float A = t*z + r*x;
    // subss xmm5, xmm6
    float B = t*z - r*x;
    // unpcklps xmm1, xmm5
    float v = {A, B, A, B};
    // rcpps xmm5, xmm1
    float rec_v = {1/A, 1/B, 1/A, 1/B};
    // movaps xmm0, xmm3
    // subss xmm0, xmm2
    float Min_s = tx - rz;
    // addss xmm2, xmm3
    float Max_s = tx + rz;
    // unpcklps xmm0, xmm2
    float MinMax = {Min_s, Max_s, Min_s, Max_s};
    // shufps xmm4, xmm4, 0
    float RS_v = {RS, RS, RS, RS};

    // Focus on only the first two elements from here on.
    // Anyway, these are what we return.

    // mulps xmm0, xmm4
    float MinMaxRS = MinMax*RS_v;   // = {Min_s*RS, Max_s*RS}
    // movaps xmm2, xmm0
    // mulps xmm2, xmm5
    float Div = MinMaxRS*rec_v;     // = {Min_s*RS/A, Max_s*RS/B}
    // mulps xmm1, xmm2
    float Clear = v*Div;            // = {Min_s*RS, Max_s*RS}
    // subps xmm0, xmm1
    float res = MinMaxRS - Clear;   // = {0, 0}
    // mulps xmm0, xmm5
    res = res*rec_v;                // = {0, 0}
    // addps xmm0, xmm2
    res = res + Div;                // = {Min_s*RS/A, Max_s*RS/B}
    return res;
}
Testing the accuracy of different versions
// test.c
#include <stdio.h>
#include <assert.h>
#include <math.h>

typedef struct Float2 {
    float x, y;
} Float2;

Float2 ProjectSphere_gt(float x, float z, float r, float t, float ResultScale) {
    float tz = t*z;
    float rx = r*x;
    float tx = t*x;
    float rz = r*z;
    float A = (tz + rx);
    float B = (tz - rx);
    float Min = (tx - rz) / A;
    float Max = (tx + rz) / B;
    Float2 Res = {Min*ResultScale, Max*ResultScale};
    return Res;
}

Float2 ProjectSphere(float x, float z, float r, float t, float ResultScale);
Float2 ProjectSphere2(float x, float z, float r, float t, float ResultScale);

int fequal(float a, float b, float epsilon) {
    return fabs(a - b) < epsilon;
}

int main(void) {
    float x = 10.2;
    float z = 1.2;
    float r = 7.6;
    float t = 1.2;
    float ResultScale = 1.4;
    Float2 gt = ProjectSphere_gt(x, z, r, t, ResultScale);
    Float2 res1 = ProjectSphere(x, z, r, t, ResultScale);
    Float2 res2 = ProjectSphere2(x, z, r, t, ResultScale);
    printf("gt.x: %f\n", gt.x);
    printf("gt.y: %f\n", gt.y);
    printf("res1.x: %f\n", res1.x);
    printf("res2.x: %f\n", res2.x);
    printf("res1.y: %f\n", res1.y);
    printf("res2.y: %f\n", res2.y);
    // If you reduce the epsilon, it will fail. For other values, the
    // error will be even higher.
    assert(fequal(res1.x, res2.x, 1e-4));
    assert(fequal(res1.y, res2.y, 1e-4));
    return 0;
}
# orig.s
.intel_syntax noprefix
.global ProjectSphere
ProjectSphere:
movaps xmm5, xmm3
mulss xmm5, xmm1
movaps xmm6, xmm2
mulss xmm6, xmm0
mulss xmm3, xmm0
mulss xmm2, xmm1
movaps xmm1, xmm5
addss xmm1, xmm6
subss xmm5, xmm6
unpcklps xmm1, xmm5
rcpps xmm5, xmm1
movaps xmm0, xmm3
subss xmm0, xmm2
addss xmm2, xmm3
unpcklps xmm0, xmm2
shufps xmm4, xmm4, 0
mulps xmm0, xmm4
movaps xmm2, xmm0
mulps xmm2, xmm5
mulps xmm1, xmm2
subps xmm0, xmm1
mulps xmm0, xmm5
addps xmm0, xmm2
ret
# my.s
.intel_syntax noprefix
.global ProjectSphere2
ProjectSphere2:
movaps xmm5, xmm3
mulss xmm5, xmm1
movaps xmm6, xmm2
mulss xmm6, xmm0
mulss xmm3, xmm0
mulss xmm2, xmm1
movaps xmm1, xmm5
addss xmm1, xmm6
subss xmm5, xmm6
unpcklps xmm1, xmm5
rcpps xmm5, xmm1
movaps xmm0, xmm3
subss xmm0, xmm2
addss xmm2, xmm3
unpcklps xmm0, xmm2
shufps xmm4, xmm4, 0
mulps xmm0, xmm4
movaps xmm2, xmm0
mulps xmm2, xmm5
movaps xmm0, xmm2
ret
Now, compile and run with no optimizations (just to be sure nothing fancy happens to ground truth), e.g.:
gcc test.c orig.s my.s -o test -lm && ./test
The register pressure is the number of live values at a program point. A value
is live at a program point if it is needed at some (other) point in the future.
To make that concrete, let me reproduce the ProjectSphere code:
 1 | Float2 ProjectSphere(float x, float z, float r, float t, float scale) {
 2 |     float tz = t*z;
 3 |     float rx = r*x;
 4 |     float tx = t*x;
 5 |     float rz = r*z;
 6 |
 7 |     float A = (tz + rx);
 8 |     float B = (tz - rx);
 9 |
10 |     float Min = (tx - rz) / A;
11 |     float Max = (tx + rz) / B;
12 |
13 |     Float2 Res = {Min*scale, Max*scale};
14 |     return Res;
15 | }
The value r*z (or equivalently, the value the variable rz holds) is live at
line 5 in my code above, because it is needed in lines 10 and 11. The more live
values we have, the more registers we need to keep them, hence the increased
"pressure" to find registers to put values in. Also, note that if a register
holds a live value, we can't reuse it to hold another value, because that would
throw away the live value it holds. Note also that all this is relevant for
you, the programmer, not just the compiler: both when you're writing assembly
yourself, and when you care about the assembly you get.
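Here's a minimal example of liveness (my own sketch, unrelated to ProjectSphere), with the live values written out as comments:

int f(int a, int b) {  // a and b are live on entry
    int x = a + b;     // x becomes live; a and b are still live (used below)
    int y = a * b;     // last use of a and b: they die here; x and y are live
    return x + y;      // last use of x and y
}

Between the last two lines, only x and y are live, so a register that was holding, say, a is free to be reused.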
As the register pressure increases, at some point we just don't have enough
registers to hold all live values at a program point. One way to deal with
register pressure is to try to shorten the live range, as people call it, of
values. Basically, a value's live range is the range from the point where it's
defined to its last use. If we go back to my code, tx's range is from line 4 to
line 10. But, if you think about it, why should we keep t*x in a register for
so long? After all, it's not needed in the intermediate computations (lines 7, 8).
So, we can rewrite the code as:
Float2 ProjectSphere(float x, float z, float r, float t, float scale) {
    float tz = t*z;
    float rx = r*x;
    float A = (tz + rx);
    float B = (tz - rx);
    // I moved these two down here!
    float tx = t*x;
    float rz = r*z;
    float Min = (tx - rz) / A;
    float Max = (tx + rz) / B;
    Float2 Res = {Min*scale, Max*scale};
    return Res;
}
That eases the register pressure across the function. Once we compute A and B,
we can throw away the values tz and rx and, for example, use the registers that
were holding them to now hold tx and rz.
But sometimes there's just nothing you can do; you simply don't have enough registers to hold all values. In this case, you have to "spill" a value. To "spill" a value means to store it to the stack (for a while) so that you can use the register for some other need. You can load the value back later. That may happen in code that looks like:
<medium register pressure code>
<suddenly register pressure increases, e.g., I have a big expression>
<register pressure falls back again>
So, you may then see the compiler generate code like:
<medium register pressure code>
; Store eax's value in the stack
mov DWORD PTR [...], eax
; Now you have one more register in this high-register-pressure section
<suddenly register pressure increases, e.g., I have a big expression>
; Restore the value
mov eax, DWORD PTR [...]
The other case is when, even within the high-register-pressure section, you don't have enough registers to hold all values. In that case, you need to spill and reload within that section, too. But how much pressure is too much? Let's try to create an example in which there are more live values than the registers we have available; or, well, enough live values to make the compiler use the stack. How many registers do we need in the body of the following loop?
void foo(int *p, int n, int a, int b) {
    for (int i = 0; i < n; ++i) {
        p[i] = a*b;
    }
}
The naive response would be 5: one for each of a, b, p, i, and n. But no!
First, the compiler eliminates i by rewriting the loop to:
void foo(int *p, int n, int a, int b) {
    int *end = p + n; // i.e., ((uint8_t*)p) + n*4
    for (; p < end; ++p) {
        *p = a*b;
    }
}
But second, loop-invariant code motion is out there to get you, and it turns the code into:
void foo(int *p, int n, int a, int b) {
    int *end = p + n;
    int tmp = a*b;
    for (; p < end; ++p) {
        *p = tmp;
    }
}
So, if we give the original code to GCC with -O3, we'll get this assembly for
the loop (godbolt):
.L3:
; *p = tmp;
mov DWORD PTR [rdi], edx
; ++p;
add rdi, 4
; p < end;
cmp rdi, rax
jne .L3
So, the compiler nicely managed to use only 3 registers: rdi for p, rax for
end, and edx for tmp! And you can keep adding arguments:
void foo(int *p, int n, int a, int b, int c, int d) {
    for (int i = 0; i < n; ++i) {
        p[i] = a*b*c*d;
    }
}
The register pressure will simply not increase inside the loop. The compiler
will generate code that computes the expression outside the loop body, and it
will use only one register to hold the result. To increase the register
pressure, we make one key observation: we still need one register to hold tmp
across the iterations. So, we can increase the register pressure by writing
code that uses multiple expressions, with one caveat: the expressions have to
be different. Otherwise, if two expressions are the same, the compiler will
compute the expression once and reuse the result. Given this observation, we
can start creating problems for the compiler by writing code that looks like
this:
void foo(int *p1, int *p2, int n, int x1, int x2, int x3, int x4) {
    for (int i = 0; i < n; ++i) {
        p1[i] = x1*x2;
        p2[i] = x3*x4;
    }
}
Let's see the assembly (godbolt):
.L3:
mov DWORD PTR [rdi+rax], r8d
mov DWORD PTR [rsi+rax], r9d
add rax, 4
cmp rdx, rax
jne .L3
Oooops... Now we see the problem. Suddenly the pressure increased, because the
compiler now needs 6 registers: rdi and rsi for the pointers, r8d and r9d for
the two products, rax for the offset, and rdx for the bound. How far can we
take this? Pretty far:
void foo(int *p1, int *p2, int *p3, int *p4,
         int *p5, int *p6,
         int n,
         int x1, int x2, int x3, int x4,
         int x5, int x6, int x7, int x8,
         int x9, int x10, int x11, int x12) {
    for (int i = 0; i < n; ++i) {
        p1[i] = x1*x2;
        p2[i] = x3*x4;
        p3[i] = x5*x6;
        p4[i] = x7*x8;
        p5[i] = x9*x10;
        p6[i] = x11*x12;
    }
}
The compiler still manages to keep everything in registers (godbolt):
.L3:
mov DWORD PTR [rdi+rax], r14d
mov DWORD PTR [rsi+rax], r13d
mov DWORD PTR [r10+rax], r12d
mov DWORD PTR [rcx+rax], ebp
mov DWORD PTR [r8+rax], ebx
mov DWORD PTR [r9+rax], r11d
add rax, 4
cmp rdx, rax
jne .L3
I'll now introduce one more pointer, p7, but I'll reuse the expression x11*x12.
void foo(int *p1, int *p2, int *p3, int *p4,
         int *p5, int *p6, int *p7,
         int n,
         int x1, int x2, int x3, int x4,
         int x5, int x6, int x7, int x8,
         int x9, int x10, int x11, int x12) {
    for (int i = 0; i < n; ++i) {
        p1[i] = x1*x2;
        p2[i] = x3*x4;
        p3[i] = x5*x6;
        p4[i] = x7*x8;
        p5[i] = x9*x10;
        int tmp = x11*x12;
        p6[i] = tmp;
        p7[i] = tmp;
    }
}
Everything is still in registers (godbolt):
.L3:
mov DWORD PTR [rdi+rax], r14d
mov DWORD PTR [rsi+rax], r13d
mov DWORD PTR [r10+rax], r12d
mov DWORD PTR [r11+rax], ebp
mov DWORD PTR [r8+rax], ebx
mov DWORD PTR [r9+rax], edx
mov DWORD PTR [r15+rax], edx
add rax, 4
cmp rcx, rax
jne .L3
The compiler is really fighting it! It has used almost every register. Let's
add one more argument, x13, to make it fall apart:
void foo(int *p1, int *p2, int *p3, int *p4,
         int *p5, int *p6, int *p7,
         int n,
         int x1, int x2, int x3, int x4,
         int x5, int x6, int x7, int x8,
         int x9, int x10, int x11, int x12,
         // CHANGE
         int x13) {
    for (int i = 0; i < n; ++i) {
        p1[i] = x1*x2;
        p2[i] = x3*x4;
        p3[i] = x5*x6;
        p4[i] = x7*x8;
        p5[i] = x9*x10;
        int tmp = x11*x12;
        p6[i] = tmp;
        // CHANGE
        p7[i] = tmp*x13;
    }
}
Now, we managed to make the compiler load from the stack (godbolt):
.L3:
; Oops
mov rdx, QWORD PTR [rsp+56]
mov DWORD PTR [rdi+rax], r14d
mov DWORD PTR [rsi+rax], r13d
mov DWORD PTR [r10+rax], r12d
mov DWORD PTR [r11+rax], ebp
mov DWORD PTR [r8+rax], ebx
mov DWORD PTR [r9+rax], ecx
mov DWORD PTR [rdx+rax], r15d
add rax, 4
; Oops
cmp QWORD PTR [rsp-8], rax
jne .L3
Great, but how realistic is this example? Not very, really. As far as I know, few innermost loops have that many live values (in fact, I couldn't think of one). That said, when you have whole loop nests, it becomes much more probable that not all of your values can stay in registers for the whole nest.
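As a sketch of why (hypothetical code, not something I measured): anything the outer loop computes and still needs after the inner loop stays live across every inner iteration, competing for registers with the inner loop's own values.

void normalize_rows(float *a, int rows, int cols) {
    for (int i = 0; i < rows; ++i) {
        float *row = a + (long)i*cols; // live across both inner loops
        float sum = 0.0f;              // also live across both inner loops
        for (int j = 0; j < cols; ++j)
            sum += row[j];
        for (int j = 0; j < cols; ++j)
            row[j] /= sum;             // row and sum were live this whole time
    }
}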