Handmade Hero»Forums»Code
Mirolyub Hristov
6 posts
Building with only one translation unit
Edited by Mirolyub Hristov on
Like I said in the Q&A, you can use #pragma optimize so that only one translation unit is needed.

I tested removing the separate compilation of handmade_optimized.cpp and it seems to work great. In fact, I get better performance for DrawRectangleQuickly (37cy/h instead of 42cy/h). I am using Visual Studio 2015 Community Edition, so I had to disable some of the new warnings, but the changes are minor.

I've attached the diff.

(Edit: I can't seem to attach the file, so here it is inline)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
diff -r ba92a84dd948 -r fcef0eb84ce4 .hgignore
--- a/.hgignore	Wed Sep 09 19:18:38 2015 +0200
+++ b/.hgignore	Wed Sep 09 19:39:55 2015 +0200
@@ -0,0 +1,3 @@
+syntax: glob
+build/
+handmade/data/*.hha
diff -r ba92a84dd948 -r fcef0eb84ce4 handmade/code/build.bat
--- a/handmade/code/build.bat	Wed Sep 09 19:18:38 2015 +0200
+++ b/handmade/code/build.bat	Wed Sep 09 19:39:55 2015 +0200
@@ -1,6 +1,6 @@
 @echo off
 
-set CommonCompilerFlags=-Od -MTd -nologo -fp:fast -fp:except- -Gm- -GR- -EHa- -Zo -Oi -WX -W4 -wd4201 -wd4100 -wd4189 -wd4505 -wd4127 -DHANDMADE_INTERNAL=1 -DHANDMADE_SLOW=1 -DHANDMADE_WIN32=1 -FC -Z7
+set CommonCompilerFlags=-Od -MTd -nologo -fp:fast -fp:except- -Gm- -GR- -EHa- -Zo -Oi -WX -W4 -wd4201 -wd4100 -wd4189 -wd4505 -wd4127 -wd4456 -wd4457 -DHANDMADE_INTERNAL=1 -DHANDMADE_SLOW=1 -DHANDMADE_WIN32=1 -FC -Z7
 set CommonLinkerFlags= -incremental:no -opt:ref user32.lib gdi32.lib winmm.lib
 
 REM TODO - can we just build both with one exe?
@@ -19,8 +19,7 @@
 REM 64-bit build
 REM Optimization switches /wO2
 echo WAITING FOR PDB > lock.tmp
-cl %CommonCompilerFlags% -DDebugRecordArray=DebugRecords_Optimized -O2 -I..\iaca-win64\ -c ..\handmade\code\handmade_optimized.cpp -Fohandmade_optimized.obj -LD
-cl %CommonCompilerFlags% -DDebugRecordArray=DebugRecords_Main -I..\iaca-win64\ ..\handmade\code\handmade.cpp handmade_optimized.obj -Fmhandmade.map -LD /link -incremental:no -opt:ref -PDB:handmade_%random%.pdb -EXPORT:GameGetSoundSamples -EXPORT:GameUpdateAndRender -EXPORT:DEBUGGameFrameEnd
+cl %CommonCompilerFlags% -DDebugRecordArray=DebugRecords_Main -O2 -I..\iaca-win64\ ..\handmade\code\handmade.cpp -Fmhandmade.map -LD /link -incremental:no -opt:ref -PDB:handmade_%random%.pdb -EXPORT:GameGetSoundSamples -EXPORT:GameUpdateAndRender -EXPORT:DEBUGGameFrameEnd
 del lock.tmp
 cl %CommonCompilerFlags% ..\handmade\code\win32_handmade.cpp -Fmwin32_handmade.map /link %CommonLinkerFlags%
 popd
diff -r ba92a84dd948 -r fcef0eb84ce4 handmade/code/handmade.cpp
--- a/handmade/code/handmade.cpp	Wed Sep 09 19:18:38 2015 +0200
+++ b/handmade/code/handmade.cpp	Wed Sep 09 19:39:55 2015 +0200
@@ -1,3 +1,4 @@
+#pragma optimize("", off)
 /* ========================================================================
    $File: $
    $Date: $
@@ -1697,4 +1698,8 @@
     OutputPlayingSounds(&GameState->AudioState, SoundBuffer, TranState->Assets, &TranState->TranArena);
 }
 
+#pragma optimize("", on)
+#include "handmade_optimized.cpp"
+#pragma optimize("", off)
+
 #include "handmade_debug.cpp"
diff -r ba92a84dd948 -r fcef0eb84ce4 handmade/code/handmade_debug.cpp
--- a/handmade/code/handmade_debug.cpp	Wed Sep 09 19:18:38 2015 +0200
+++ b/handmade/code/handmade_debug.cpp	Wed Sep 09 19:39:55 2015 +0200
@@ -342,16 +342,12 @@
 
 debug_record DebugRecordArray[__COUNTER__];
 
-extern u32 const DebugRecords_Optimized_Count;
-debug_record DebugRecords_Optimized[];
-
 extern "C" DEBUG_GAME_FRAME_END(DEBUGGameFrameEnd)
 {
     debug_state *DebugState = (debug_state *)Memory->DebugStorage;
     if(DebugState)
     {
         DebugState->CounterCount = 0;
-        UpdateDebugRecords(DebugState, DebugRecords_Optimized_Count, DebugRecords_Optimized);
         UpdateDebugRecords(DebugState, ArrayCount(DebugRecords_Main), DebugRecords_Main);
 
         DebugState->FrameEndInfos[DebugState->SnapshotIndex] = *Info;
diff -r ba92a84dd948 -r fcef0eb84ce4 handmade/code/handmade_intrinsics.h
--- a/handmade/code/handmade_intrinsics.h	Wed Sep 09 19:18:38 2015 +0200
+++ b/handmade/code/handmade_intrinsics.h	Wed Sep 09 19:39:55 2015 +0200
@@ -74,7 +74,7 @@
 inline real32
 AbsoluteValue(real32 Real32)
 {
-    real32 Result = fabs(Real32);
+    real32 Result = fabsf(Real32);
     return(Result);
 }
 
diff -r ba92a84dd948 -r fcef0eb84ce4 handmade/code/handmade_optimized.cpp
--- a/handmade/code/handmade_optimized.cpp	Wed Sep 09 19:18:38 2015 +0200
+++ b/handmade/code/handmade_optimized.cpp	Wed Sep 09 19:39:55 2015 +0200
@@ -6,9 +6,6 @@
    $Notice: (C) Copyright 2015 by Molly Rocket, Inc. All Rights Reserved. $
    ======================================================================== */
 
-#define internal
-#include "handmade.h"
-
 #if 0
 #include <iacaMarks.h>
 #else
@@ -372,6 +369,3 @@
         }
     }
 }
-
-extern u32 const DebugRecords_Optimized_Count = __COUNTER__;
-debug_record DebugRecords_Optimized[DebugRecords_Optimized_Count];
Mārtiņš Možeiko
2559 posts / 2 projects
Building with only one translation unit
Doesn't this prevent building a fully optimized build, because #pragma optimize("", off) always turns off optimizations?
Mirolyub Hristov
6 posts
Building with only one translation unit
Yes, it does. That's the point. We only want the contents of handmade_optimized.cpp to be optimized.
Mārtiņš Možeiko
2559 posts / 2 projects
Building with only one translation unit
We don't always want to do that. Sometimes we want to see how fast the game runs with everything optimized. Otherwise those performance counter numbers Casey is working on now will be pretty useless. They only mean something when the code is optimized.
Mirolyub Hristov
6 posts
Building with only one translation unit
You're right. The #pragma optimize lines should be wrapped in an #ifdef HANDMADE_CONTROLLED_OPTIMIZATIONS or something like that, so that all of the code can be optimized when necessary.
Mārtiņš Možeiko
2559 posts / 2 projects
Building with only one translation unit
Oh, right. That makes sense.
Casey Muratori
801 posts / 1 project
Casey Muratori is a programmer at Molly Rocket on the game 1935 and is the host of the educational programming series Handmade Hero.
Building with only one translation unit
That diff looks like _exactly_ what we already tried on stream, and it didn't work? I'm not sure why it is working for you. Maybe we had some kind of typo or something weird like this?

Does anyone happen to remember which episode it was where I try this, so we can check what I did specifically?

- Casey
Kim
Kim Jørgensen
64 posts
Building with only one translation unit
Edited by Kim Jørgensen on
See https://forums.handmadehero.org/j...eos/game-architecture/day095.html from 47:53

/Kim
Casey Muratori
801 posts / 1 project
Casey Muratori is a programmer at Molly Rocket on the game 1935 and is the host of the educational programming series Handmade Hero.
Building with only one translation unit
Yeah, so we did exactly what OP is suggesting, and it didn't work. So what gives?

https://youtu.be/JTIzGsGqQaA?t=3070

- Casey
Mārtiņš Možeiko
2559 posts / 2 projects
Building with only one translation unit
Edited by Mārtiņš Možeiko on
My guess is that all inline functions called from DrawBitmap also need to be compiled with optimizations. This includes all kinds of vector functions from the handmade_math.h header.

For example, if I take the Day 95 source (build.bat uses /O2) and modify the beginning of the handmade.cpp file like this:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
#pragma optimize ("", off)
/* ========================================================================
   $File: $
   $Date: $
   $Revision: $
   $Creator: Casey Muratori $
   $Notice: (C) Copyright 2014 by Molly Rocket, Inc. All Rights Reserved. $
   ======================================================================== */

#include "handmade.h"
#pragma optimize ("", on)
#include "handmade_render_group.h"
#include "handmade_render_group.cpp"
#pragma optimize ("", off)
#include "handmade_world.cpp"
#include "handmade_random.h"
#include "handmade_sim_region.cpp"
#include "handmade_entity.cpp"

Then everything is slow as before.

But if I modify it like this:
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
#pragma optimize ("", off)
/* ========================================================================
   $File: $
   $Date: $
   $Revision: $
   $Creator: Casey Muratori $
   $Notice: (C) Copyright 2014 by Molly Rocket, Inc. All Rights Reserved. $
   ======================================================================== */

#pragma optimize ("", on)
#include "handmade.h"
#include "handmade_render_group.h"
#include "handmade_render_group.cpp"
#pragma optimize ("", off)
#include "handmade_world.cpp"
#include "handmade_random.h"
#include "handmade_sim_region.cpp"
#include "handmade_entity.cpp"


Then everything is fast. handmade.h includes handmade_math.h.

This means all code in handmade_math.h is optimized. So if you call any v2/v3/v4 function from a part of the code that is not optimized (handmade_world.cpp), you won't be able to easily debug it - it will all be inlined (I verified).

MSVC apparently decides on optimization for inline functions at their definition point, not where they are called. Tricky.
Casey Muratori
801 posts / 1 project
Casey Muratori is a programmer at Molly Rocket on the game 1935 and is the host of the educational programming series Handmade Hero.
Building with only one translation unit
That is what I hypothesized at the end of that stream, in fact :( That's too bad. But I suppose it makes sense.

We could give it a shot again, knowing that's definitely the case, and see if there's a reasonable way for us to enclose things...

- Casey
Ameen Sayegh
51 posts
Building with only one translation unit
Edited by Ameen Sayegh on
So we have to turn optimization on from the command line by using the /O2 switch, then we have to mark the entire code with
1
#pragma optimize("", off)
and then the code that we want to optimize with
1
#pragma optimize("", on)
Again!

And it doesn't work the other way around, where you just turn optimization on for the code you want.

Good to know. But that is just silly.



EDIT: After I wrote the post I tried it, and I noticed that pragma optimize("", off) doesn't turn off optimization completely; some functions are still being inlined.
I also tried pragma optimize("gts", off). It stopped the inlining, but the result is still not the same as compiling without the /O2 switch.
What I mean is: marking the entire translation unit with pragma optimize("gts", off) _does not completely_ cancel the /O2 switch.
It is good enough, I guess, but just so you are aware of that.
511 posts
Building with only one translation unit
Don't judge compilers without having seen their insides. I only have limited experience with compiler code, having only skimmed the LLVM source and docs, but I can see how they got to that behavior.

Whether a function can be inlined depends on how much code is inside of it. So when optimization is on during definition then the compiler will evaluate the cost of the function and whether it's inlineable, but if optimization is off then it just unconditionally says that the function is too expensive to inline.

Then at call time the compiler will check whether the function can be inlined. (Though I have no idea why they don't also check whether optimizations are enabled at the call site before doing the inline.)

If no optimization flags are enabled then it will not even initialize the meta data required for the optimizations.
Ameen Sayegh
51 posts
Building with only one translation unit
Edited by Ameen Sayegh on
First, my complaint is not about why I have to explicitly mark the functions being called to be optimized (even though there should have been an option to say "recursively optimize all the functions that get called").
My complaint is: why do I have to turn optimization on from the command line, then turn it off for the entire translation unit, and then on again for the function I want optimized? That doesn't make any sense.

And no, the compiler knows which functions need to be optimized before it starts doing optimizations, because the pragma is handled at the parsing stage, which comes before the optimization and code generation stages.
That's why I'm saying it is silly because this should be totally possible.

Second, yes, optimizing a single function might not be simpler than optimizing the entire translation unit, but it should not be more complicated than that, and the inlining problem is there whether it is optimizing the entire translation unit or a single function recursively.
Christopher LaBauve
1 posts
Building with only one translation unit
I may be completely misunderstanding the docs, but in my code I'm using

1
2
3
4
5
#pragma optimize("gts", on)

// code to be optimized

#pragma optimize("", on)

"When you use the on parameter [with ""], it resets the optimizations to those that you specified with the /O compiler option."