<div dir="ltr"><br><div class="gmail_extra"><br><div class="gmail_quote">On Thu, Dec 4, 2014 at 2:55 PM, Brendan Gregg <span dir="ltr">&lt;<a href="mailto:brendan.d.gregg@gmail.com" target="_blank">brendan.d.gregg@gmail.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><div>G'Day,<br><br>I've hacked hotspot to return the frame pointer, in part to see what this involves, and also to have a working prototype for analysis. Along with an agent to resolve symbols, this has allowed full stack profiling using Linux perf_events. The following flame graphs show the resulting profiles.<br><br>A mixed mode CPU flame graph of a vert.x benchmark (click to zoom):<br><br><a href="http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-vertx.svg" target="_blank">http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-vertx.svg</a><br><br>Same thing, but this time disabling inlining, to show more frames:<br><br><a href="http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-flamegraph.svg" target="_blank">http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-flamegraph.svg</a><br><br>As expected, performance is worse without inlining. You can compare the flame graphs side by side to see why. 
Less time spent doing work / I/O!<br><br><a href="https://github.com/brendangregg/Misc/blob/master/java/openjdk8_b132-fp.diff" target="_blank">https://github.com/brendangregg/Misc/blob/master/java/openjdk8_b132-fp.diff</a> is my patch, <br></div></blockquote><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"><div dir="ltr">[...]<br></div></blockquote><div><br></div><div>In case there's problems with the patch URL, the patch is:<br><br>--- openjdk8clean/hotspot/src/cpu/x86/vm/<a href="http://x86_64.ad">x86_64.ad</a>    2014-03-04 02:52:11.000000000 +0000<br>+++ openjdk8/hotspot/src/cpu/x86/vm/<a href="http://x86_64.ad">x86_64.ad</a>    2014-11-08 01:10:49.686044933 +0000<br>@@ -166,10 +166,9 @@<br> // 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ )<br> //<br> <br>-// Class for all pointer registers (including RSP)<br>+// Class for all pointer registers (including RSP, excluding RBP)<br> reg_class any_reg(RAX, RAX_H,<br>                   RDX, RDX_H,<br>-                  RBP, RBP_H,<br>                   RDI, RDI_H,<br>                   RSI, RSI_H,<br>                   RCX, RCX_H,<br>@@ -184,10 +183,9 @@<br>                   R14, R14_H,<br>                   R15, R15_H);<br> <br>-// Class for all pointer registers except RSP<br>+// Class for all pointer registers except RSP and RBP<br> reg_class ptr_reg(RAX, RAX_H,<br>                   RDX, RDX_H,<br>-                  RBP, RBP_H,<br>                   RDI, RDI_H,<br>                   RSI, RSI_H,<br>                   RCX, RCX_H,<br>@@ -199,9 +197,8 @@<br>                   R13, R13_H,<br>                   R14, R14_H);<br> <br>-// Class for all pointer registers except RAX and RSP<br>+// Class for all pointer registers except RAX, RSP and RBP<br> reg_class ptr_no_rax_reg(RDX, RDX_H,<br>-                         RBP, RBP_H,<br>                          RDI, RDI_H,<br>                          RSI, RSI_H,<br>             
             RCX, RCX_H,<br>@@ -226,9 +223,8 @@<br>                          R13, R13_H,<br>                          R14, R14_H);<br> <br>-// Class for all pointer registers except RAX, RBX and RSP<br>+// Class for all pointer registers except RAX, RBX, RSP and RBP<br> reg_class ptr_no_rax_rbx_reg(RDX, RDX_H,<br>-                             RBP, RBP_H,<br>                              RDI, RDI_H,<br>                              RSI, RSI_H,<br>                              RCX, RCX_H,<br>@@ -260,10 +256,9 @@<br> // Singleton class for TLS pointer<br> reg_class ptr_r15_reg(R15, R15_H);<br> <br>-// Class for all long registers (except RSP)<br>+// Class for all long registers (except RSP and RBP)<br> reg_class long_reg(RAX, RAX_H,<br>                    RDX, RDX_H,<br>-                   RBP, RBP_H,<br>                    RDI, RDI_H,<br>                    RSI, RSI_H,<br>                    RCX, RCX_H,<br>@@ -275,9 +270,8 @@<br>                    R13, R13_H,<br>                    R14, R14_H);<br> <br>-// Class for all long registers except RAX, RDX (and RSP)<br>-reg_class long_no_rax_rdx_reg(RBP, RBP_H,<br>-                              RDI, RDI_H,<br>+// Class for all long registers except RAX, RDX (and RSP, RBP)<br>+reg_class long_no_rax_rdx_reg(RDI, RDI_H,<br>                               RSI, RSI_H,<br>                               RCX, RCX_H,<br>                               RBX, RBX_H,<br>@@ -288,9 +282,8 @@<br>                               R13, R13_H,<br>                               R14, R14_H);<br> <br>-// Class for all long registers except RCX (and RSP)<br>-reg_class long_no_rcx_reg(RBP, RBP_H,<br>-                          RDI, RDI_H,<br>+// Class for all long registers except RCX (and RSP, RBP)<br>+reg_class long_no_rcx_reg(RDI, RDI_H,<br>                           RSI, RSI_H,<br>                           RAX, RAX_H,<br>                           RDX, RDX_H,<br>@@ -302,9 +295,8 @@<br>                           R13, R13_H,<br>                     
      R14, R14_H);<br> <br>-// Class for all long registers except RAX (and RSP)<br>-reg_class long_no_rax_reg(RBP, RBP_H,<br>-                          RDX, RDX_H,<br>+// Class for all long registers except RAX (and RSP, RBP)<br>+reg_class long_no_rax_reg(RDX, RDX_H,<br>                           RDI, RDI_H,<br>                           RSI, RSI_H,<br>                           RCX, RCX_H,<br>@@ -325,10 +317,9 @@<br> // Singleton class for RDX long register<br> reg_class long_rdx_reg(RDX, RDX_H);<br> <br>-// Class for all int registers (except RSP)<br>+// Class for all int registers (except RSP and RBP)<br> reg_class int_reg(RAX,<br>                   RDX,<br>-                  RBP,<br>                   RDI,<br>                   RSI,<br>                   RCX,<br>@@ -340,10 +331,9 @@<br>                   R13,<br>                   R14);<br> <br>-// Class for all int registers except RCX (and RSP)<br>+// Class for all int registers except RCX (and RSP, RBP)<br> reg_class int_no_rcx_reg(RAX,<br>                          RDX,<br>-                         RBP,<br>                          RDI,<br>                          RSI,<br>                          RBX,<br>@@ -355,8 +345,7 @@<br>                          R14);<br> <br> // Class for all int registers except RAX, RDX (and RSP)<br>-reg_class int_no_rax_rdx_reg(RBP,<br>-                             RDI,<br>+reg_class int_no_rax_rdx_reg(RDI,<br>                              RSI,<br>                              RCX,<br>                              RBX,<br>@@ -718,6 +707,7 @@<br>     st->print("# stack bang");<br>     st->print("\n\t");<br>     st->print("pushq   rbp\t# Save rbp");<br>+    // BDG consider: st->print("movq    rbp, rsp\t# ");<br>     if (framesize) {<br>       st->print("\n\t");<br>       st->print("subq    rsp, #%d\t# Create frame",framesize);<br>--- openjdk8clean/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp    2014-03-04 02:52:11.000000000 +0000<br>+++ 
openjdk8/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp    2014-11-07 23:57:11.589593723 +0000<br>@@ -5236,6 +5236,7 @@<br>     // We always push rbp, so that on return to interpreter rbp, will be<br>     // restored correctly and we can correct the stack.<br>     push(rbp);<br>+    mov(rbp, rsp);<br>     // Remove word for ebp<br>     framesize -= wordSize;<br> <br>--- openjdk8clean/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp    2014-03-04 02:52:10.000000000 +0000<br>+++ openjdk8/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp    2014-11-07 23:57:21.933257882 +0000<br>@@ -358,6 +358,7 @@<br>   generate_stack_overflow_check(frame_size_in_bytes);<br> <br>   push(rbp);<br>+  mov(rbp, rsp);<br> #ifdef TIERED<br>   // c2 leaves fpu stack dirty. Clean it on entry<br>   if (UseSSE < 2 ) {<br><br><br></div><div>Brendan <br></div></div></div></div>