/*
 * Returns the number of characters in the NUL-terminated string s,
 * not counting the terminator. s must not be NULL.
 *
 * Declared `static inline` rather than plain `inline`: in C99/C11 a plain
 * `inline` definition does not provide an external definition, so any call
 * the compiler chooses not to expand inline would be left unresolved at
 * link time. With internal linkage the function is always available,
 * whether or not the call is actually inlined.
 */
static inline size_t StrLenA(const char *s)
{
    size_t n = 0;

    /* Advance until the terminator has been read, counting each
       non-NUL character on the way. */
    while (*s++) {
        n++;
    }
    return n;
}
Today I realized that my compiler does not inline functions declared as inline (I found out by comparing clock ticks). This happens even when using the __forceinline specifier.
Pelles C Version 7.00.355
ASFLAGS -AIA32 -Gz
CCFLAGS -std:C11 -Tx86-coff -Ot -Ob1 -fp:precise -W1 -Gz -Ze
Where am I wrong?
From the PellesC help for /Ob option:
Quote: When the /Ob1 option is specified, a definition of the inline function must be visible before it can be expanded into other functions.
When the /Ob2 option is specified, there is no such limitation. The compiler will collect all visible function definitions and emit the referenced ones at the end of the compilation - either with a body, or as inlined code. The /Ob2 option may be the most convenient, but please note that it will require more memory than the /Ob1 option - often much more.
This means that, with /Ob1, the definition of the function to be inlined
must precede the call:
// Demonstration of how declaration order affects inlining with /Ob1:
// the same function is called once before and once after its definition.

// Prototype only: the body of StrLenA is not yet visible at this point.
__forceinline int StrLenA(const char *s);
// Calls StrLenA while only its prototype has been seen.
int foo1(void)
{
return StrLenA("Pippo"); //Will *not* be inlined! (definition not yet visible)
}
// Definition: counts the characters of s that precede the terminating NUL.
__forceinline int StrLenA(const char *s)
{
int n = 0;
while (*s++) n++; // the post-increment also steps past the NUL that ends the loop
return n;
}
// Calls StrLenA after its full definition has been seen.
int foo2(void)
{
return StrLenA("Pippo"); //Will be inlined! (definition precedes this call)
}
You can use the switch /Ob2, but it could be too much ... aggressive :(
This is the disassembly of the above code, compiled with the /Ob1 switch, which shows what was just explained:
1: __forceinline int StrLenA(const char *s);
2:
3: int foo1(void)
_foo1@0:
[00000000] 55 push ebp
[00000001] 89E5 mov ebp,esp
4: {
5: return StrLenA("Pippo"); //Will *not* be inlined!
[00000003] 6800000000 push @8
[00000008] E833000000 call 00000040
[0000000D] 5D pop ebp
[0000000E] C3 ret
6: }
7:
8: __forceinline int StrLenA(const char *s)
9: {
10: int n = 0;
11: while (*s++) n++;
12: return n;
13: }
14:
15: int foo2(void)
_foo2@0:
[0000000F] 55 push ebp
[00000010] 89E5 mov ebp,esp
[00000012] 83EC08 sub esp,8
16: {
17: return StrLenA("Pippo"); //Will be inlined!
[00000015] C745FC00000000 mov dword ptr [ebp-4],@8
[0000001C] C745F800000000 mov dword ptr [ebp-8],0
[00000023] EB03 jmp 00000028
[00000025] FF45F8 inc dword ptr [ebp-8]
[00000028] 8B45FC mov eax,dword ptr [ebp-4]
[0000002B] 8D5001 lea edx,[eax+1]
[0000002E] 8955FC mov dword ptr [ebp-4],edx
[00000031] 0FBE00 movsx eax,byte ptr [eax]
[00000034] 83F800 cmp eax,0
[00000037] 75EC jne 00000025
[00000039] 8B45F8 mov eax,dword ptr [ebp-8]
[0000003C] 89EC mov esp,ebp
[0000003E] 5D pop ebp
[0000003F] C3 ret
_StrLenA@4:
[00000040] 55 push ebp
[00000041] 89E5 mov ebp,esp
[00000043] 83EC04 sub esp,4
[00000046] C745FC00000000 mov dword ptr [ebp-4],0
[0000004D] EB03 jmp 00000052
[0000004F] FF45FC inc dword ptr [ebp-4]
[00000052] 8B4508 mov eax,dword ptr [ebp+8]
[00000055] 8D5001 lea edx,[eax+1]
[00000058] 895508 mov dword ptr [ebp+8],edx
[0000005B] 0FBE00 movsx eax,byte ptr [eax]
[0000005E] 83F800 cmp eax,0
[00000061] 75EC jne 0000004F
[00000063] 8B45FC mov eax,dword ptr [ebp-4]
[00000066] 89EC mov esp,ebp
[00000068] 5D pop ebp
[00000069] C20400 ret 4
SUMMARY
156 .debug$S
40 .debug$T
10 .drectve
6 .rdata
6C .text
Ok frankie, thanks a lot!
Now I have seen some bizarre behavior:
#define WIN32_LEAN_AND_MEAN /* speed up compilations */
#include <windows.h>
#include <stdio.h>
#include <time.h>
/*
 * Counts the characters of the NUL-terminated string s, excluding the
 * terminator. This is the ordinary (not force-inlined) variant that the
 * benchmark in WinMain calls as its baseline.
 */
int StrLen(const char *s)
{
    int count = 0;

    while (*s++ != '\0') {
        ++count;
    }
    return count;
}
/*
 * Counts the characters of the NUL-terminated string s, excluding the
 * terminator. Same loop as StrLen, but marked __forceinline so the
 * compiler is asked to expand it at each call site.
 */
__forceinline int StrLenInline(const char *s)
{
    int count = 0;

    while (*s++ != '\0') {
        ++count;
    }
    return count;
}
int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpszCmdLine, int nCmdShow)
{
char buf[64];
char *str = "Pippo";
char *s = NULL;
int n;
clock_t t;
t = clock();
for(unsigned int u = 0; u<100000000; u++)
n = StrLen(str);
t = clock() - t;
sprintf(buf, "StrLen return: %d \nClock ticks: %u ", n, t);
MessageBox(NULL, buf, "Test", MB_OK);
t = clock();
for(unsigned int u = 0; u<100000000; u++)
n = StrLenInline(str);
t = clock() - t;
sprintf(buf, "StrLenInline return: %d \nClock ticks: %u ", n, t);
MessageBox(NULL, buf, "Test", MB_OK);
t = clock();
for(unsigned int u = 0; u<100000000; u++)
{
n = 0;
s = str;
while (*s++) n++;
}
t = clock() - t;
sprintf(buf, "Inline code result: %d \nClock ticks: %u ", n, t);
MessageBox(NULL, buf, "Test", MB_OK);
////////////////////////////////////////////////////
// Again - (second stage)
t = clock();
for(unsigned int u = 0; u<100000000; u++)
n = StrLen(str);
t = clock() - t;
sprintf(buf, "StrLen return: %d \nClock ticks: %u ", n, t);
MessageBox(NULL, buf, "Test", MB_OK);
t = clock();
for(unsigned int u = 0; u<100000000; u++)
n = StrLenInline(str);
t = clock() - t;
sprintf(buf, "StrLenInline return: %d \nClock ticks: %u ", n, t);
MessageBox(NULL, buf, "Test", MB_OK);
t = clock();
for(unsigned int u = 0; u<100000000; u++)
{
n = 0;
s = str;
while (*s++) n++;
}
t = clock() - t;
sprintf(buf, "Inline code result: %d \nClock ticks: %u ", n, t);
MessageBox(NULL, buf, "Test", MB_OK);
return 0;
}
Outputs:
StrLen return: 5 Clock ticks: 796
StrLenInline return: 5 Clock ticks: 844
Inline code result: 5 Clock ticks: 843
StrLen return: 5 Clock ticks: 796
StrLenInline return: 5 Clock ticks: 406
Inline code result: 5 Clock ticks: 422
|
|
It seems that the benefits of inlining only show up in the second stage.
This is mainly due to the limited registers available on a 32bit machine.
If you use the debugger you will see that the inlined functions use different register arrangements in different parts of the code. The registers used depend on which of them are available or more convenient to use at that point. Choosing the registers is called 'register allocation' in compiler theory; when there are not enough registers, some values have to be 'spilled' to memory.
Also, different registers have different timings for some operations, and sometimes push or pop operations are required to 'free' a register for the routine.
As proof, observe the timing of StrLen, which is called in the standard way: because it always runs the same code (same registers, same memory accesses, etc.), its timing is perfectly constant.
Indeed! I noticed that this behavior changes from machine to machine (I tried both a Pentium III processor and a Core i3).
I also saw that the /Ot and /Ox compiler options for speed optimization affect the behavior of inline functions, sometimes negatively.
Honestly, I'm having growing doubts about the real benefits of inline functions.
Might it be cheaper to use the /Ot compiler optimization instead of declaring inline functions?
First of all, take any medicine in moderation ;D
Excessive use of anything is, in 99% of cases, not a good choice.
That said, inlining is beneficial when the routine's code is shorter than its prologue and epilogue:
/*
 * Converts the text that starts at the third character of s (s + 2) to an
 * int with atoi and returns the result.
 *
 * s must point to a NUL-terminated string of at least two characters;
 * with a shorter string, s + 2 would point past the terminator.
 * The string is only read, so the parameter is const-qualified; callers
 * passing a plain char * are unaffected.
 */
int __forceinline GetMyInt(const char *s)
{
    return atoi(s + 2);
}
Or when it is called so many times that the prologue and epilogue weigh significantly on the execution time. In your example, the time spent in the loop makes the benefit of inlining negligible.
Of course we have not considered the effects on processor pipelining...