2 回答

TA貢獻1826條經驗 獲得超6個贊
我不是 Go 專家,但 Java 確實在優化循環。
假設您有一個 3GHz 的單核處理器,這樣每條指令大約耗時 0.3ns,讓我們假設每個增量都是一條指令。0.3ns * 20 billion = 6s
這就是在沒有任何優化的情況下粗略估計的性能。
你可以通過給你的程序提供 -XX:LoopUnrollLimit=1 來驗證 java 在這裡做了一些欺騙。這告訴 JVM 幾乎不執行循環展開,因此可以防止大多數 JIT 優化在您的示例中發生。
這樣做之後,你的 java 示例在我的機器上的運行時間現在是 6s,這與 Go 基準測試相當。
在 Go 版本中也可能有一個選項可以啟用循環展開等優化(請參閱 Go 手冊)。
最後,這再次表明微基準測試很難做到正確。它們經常讓人自欺欺人地做出不正確的假設。

TA貢獻1934條經驗 獲得超2個贊
以下是我的一些觀察。我將展示我從編譯該程序中獲得的一些 Intel 語法匯編代碼。我正在使用 Compiler Explorer。要理解下面的內容,你不必了解很多匯編,這裡最重要的元素是大小,越大越慢。如果可以的話,我會把這篇文章寫得更短一些,但是生成的代碼出奇地龐大,而且我對 Go 的了解還不夠多,不知道什麼是無用的。如果您想查看每個語句在匯編中轉換成什麼,Compiler Explorer 將為您突出顯示所有內容。
TL;DR:
在我看來,Go 編譯器生成的代碼是一團災難性的混亂,C++ 代碼得到了很好的優化,而 Java 與 Go 相比很小。JIT 編譯可能會對 Java 代碼產生重大影響,不過這段代碼對於循環展開加內聯優化(預先計算 count 的值)來說也可能過於復雜。
Go 代碼編譯成這個怪物:
// "".main -- Go assembler listing of the benchmark's main function.
// Operands are written source-first, destination-last. The text directive
// declares a 224-byte frame and 0 bytes of arguments. Branch operands such as
// 835, 806 or 257 are numeric targets whose labels are not part of this
// listing, so the notes below infer control flow from context.
text "".main(SB), $224-0
// Prologue check: CX is loaded from TLS and an address below SP is compared
// with the word at 16(CX); the jls target is presumably the
// runtime.morestack_noctxt call at the end of the function.
movq (TLS), CX
leaq -96(SP), AX
cmpq AX, 16(CX)
jls 835
// Reserve the 224-byte frame, save the caller's BP and point BP at the save slot.
subq $224, SP
movq BP, 216(SP)
leaq 216(SP), BP
funcdata $0, gclocals·f6bd6b3389b872033d462029172c8612(SB)
funcdata $1, gclocals·17283ea8379a997487dd6f8baf7ae6ea(SB)
pcdata $0, $0
// First time.Now call: the three words at (SP), 8(SP), 16(SP) are copied
// into the local time.t·2+160/168/176(SP).
call time.Now(SB)
movq 16(SP), AX
movq 8(SP), CX
movq (SP), DX
movq DX, time.t·2+160(SP)
movq CX, time.t·2+168(SP)
movq AX, time.t·2+176(SP)
// shrq/shlq by 63 isolate the top bit of the first word; jeq is taken when
// that bit is clear.
movq time.t·2+160(SP), AX
movq AX, CX
shrq $63, AX
shlq $63, AX
testq $-1, AX
jeq 806
// Bit set: split the first word into a shifted field (CX) and its low 30 bits
// (DX, mask 1073741823), then combine them as CX * 1000000000 + DX plus
// constants. Presumably the inlined conversion of the time value to
// nanoseconds -- TODO confirm against the Go source, which is not shown here.
movq CX, DX
shlq $1, CX
shrq $31, CX
movq $59453308800, BX
addq BX, CX
andq $1073741823, DX
movlqsx DX, DX
imulq $1000000000, CX
addq DX, CX
movq $-6795364578871345152, DX
addq DX, CX
// Signed division by 1,000,000 done by multiplication: 4835703278458516699 is
// ceil(2^82 / 10^6), so the high half of the product shifted right by 18,
// minus the sign of the dividend, is the quotient. It is stored in "".start.
movq $4835703278458516699, AX
imulq CX
sarq $63, CX
sarq $18, DX
subq CX, DX
movq DX, "".start+72(SP)
// Zero AX and CX, then jump into the loop structure.
xorl AX, AX
movq AX, CX
jmp 257
// Inner loop: CX and AX are each incremented once per iteration until CX
// reaches 2000000000. AX is the value later stored to "".count.
incq CX
incq AX
cmpq CX, $2000000000
jlt 213
// Outer loop step: reload "".i, increment it, spill the running count to
// "".count, and leave the loop once the new i is >= 10.
movq "".i+80(SP), SI
incq SI
movq "".start+72(SP), DX
movq $59453308800, BX
movq AX, CX
movq SI, AX
movq CX, "".count+88(SP)
cmpq AX, $10
jge 404
// Store i, copy it to a temporary, and pass type.int plus the temporary's
// address to runtime.convT2E64; its two result words fill autotmp_23.
movq AX, "".i+80(SP)
movq AX, ""..autotmp_24+112(SP)
xorps X0, X0
movups X0, ""..autotmp_23+120(SP)
leaq type.int(SB), CX
movq CX, (SP)
leaq ""..autotmp_24+112(SP), DX
movq DX, 8(SP)
pcdata $0, $1
call runtime.convT2E64(SB)
movq 24(SP), AX
movq 16(SP), CX
movq CX, ""..autotmp_23+120(SP)
movq AX, ""..autotmp_23+128(SP)
// fmt.Printf call: format string pointer and its length (11), then the
// address of autotmp_23 followed by the values 1 and 1.
leaq go.string."On step %d\n"(SB), AX
movq AX, (SP)
movq $11, 8(SP)
leaq ""..autotmp_23+120(SP), CX
movq CX, 16(SP)
movq $1, 24(SP)
movq $1, 32(SP)
pcdata $0, $1
call fmt.Printf(SB)
// Reload the running count into AX, zero CX, and jump back (presumably to the
// inner loop's compare).
movq "".count+88(SP), AX
xorl CX, CX
jmp 219
pcdata $0, $2
// Second time.Now call, stored in time.t·2+136/144/152(SP) and put through
// the same top-bit test and arithmetic as the first one.
call time.Now(SB)
movq 16(SP), AX
movq 8(SP), CX
movq (SP), DX
movq DX, time.t·2+136(SP)
movq CX, time.t·2+144(SP)
movq AX, time.t·2+152(SP)
movq time.t·2+136(SP), AX
movq AX, CX
shrq $63, AX
shlq $63, AX
testq $-1, AX
jeq 787
movq CX, DX
shlq $1, CX
shrq $31, CX
movq $59453308800, BX
addq BX, CX
imulq $1000000000, CX
andq $1073741823, DX
movlqsx DX, DX
addq DX, CX
movq $-6795364578871345152, DX
leaq (DX)(CX*1), AX
movq AX, "".~R0+64(SP)
// Same divide-by-1,000,000 sequence; afterwards "".start is subtracted and
// the difference is stored in autotmp_29.
movq $4835703278458516699, CX
imulq CX
sarq $18, DX
movq "".~R0+64(SP), CX
sarq $63, CX
subq CX, DX
movq "".start+72(SP), CX
subq CX, DX
movq DX, ""..autotmp_29+104(SP)
// Copy count to autotmp_30 and zero the 32 bytes of autotmp_28.
movq "".count+88(SP), CX
movq CX, ""..autotmp_30+96(SP)
xorps X0, X0
movups X0, ""..autotmp_28+184(SP)
movups X0, ""..autotmp_28+200(SP)
// runtime.convT2E64 with type.int64 and the elapsed value; results go to the
// first two words of autotmp_28.
leaq type.int64(SB), CX
movq CX, (SP)
leaq ""..autotmp_29+104(SP), CX
movq CX, 8(SP)
pcdata $0, $3
call runtime.convT2E64(SB)
movq 16(SP), CX
movq 24(SP), DX
movq CX, ""..autotmp_28+184(SP)
movq DX, ""..autotmp_28+192(SP)
// runtime.convT2E64 with type.int and the count copy; results go to the last
// two words of autotmp_28.
leaq type.int(SB), CX
movq CX, (SP)
leaq ""..autotmp_30+96(SP), CX
movq CX, 8(SP)
pcdata $0, $3
call runtime.convT2E64(SB)
movq 24(SP), CX
movq 16(SP), DX
movq DX, ""..autotmp_28+200(SP)
movq CX, ""..autotmp_28+208(SP)
// Final fmt.Printf call: format string pointer and its length (40), then the
// address of autotmp_28 followed by the values 2 and 2.
leaq go.string."Total time took: %d to get at count: %d\n"(SB), CX
movq CX, (SP)
movq $40, 8(SP)
leaq ""..autotmp_28+184(SP), CX
movq CX, 16(SP)
movq $2, 24(SP)
movq $2, 32(SP)
pcdata $0, $3
call fmt.Printf(SB)
// Epilogue: restore BP, release the frame, return.
movq 216(SP), BP
addq $224, SP
ret
// Out-of-line paths, one per time value: each loads the second word of the
// stored value into CX and jumps back into the main path. Presumably these
// are the targets of the jeq 787 / jeq 806 branches above.
movq time.t·2+144(SP), BX
movq CX, DX
movq BX, CX
jmp 501
movq time.t·2+168(SP), SI
movq CX, DX
movq $59453308800, BX
movq SI, CX
jmp 144
nop
pcdata $0, $-1
// Calls runtime.morestack_noctxt and then jumps to 0 (presumably the start of
// this function, to retry after the prologue check failed).
call runtime.morestack_noctxt(SB)
jmp 0
// "".init -- compiler-emitted package initializer (8-byte frame, 0 bytes of
// arguments). It is driven by the byte "".initdone·.
text "".init(SB), $8-0
// Prologue check of SP against the word at 16(CX); the jls target is
// presumably the runtime.morestack_noctxt call at the end.
movq (TLS), CX
cmpq SP, 16(CX)
jls 89
subq $8, SP
movq BP, (SP)
leaq (SP), BP
funcdata $0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
// Load initdone· and compare it with 1. If it is above 1, fall through and
// return immediately.
movblzx "".initdone·(SB), AX
cmpb AL, $1
jls 47
movq (SP), BP
addq $8, SP
ret
// initdone· <= 1: when it is not exactly 1, jne skips the throwinit call;
// when it is exactly 1, runtime.throwinit is called.
jne 56
pcdata $0, $0
call runtime.throwinit(SB)
undef
// Set initdone· to 1, call fmt.init and time.init, then set initdone· to 2
// (presumably "in progress" and "done" -- TODO confirm).
movb $1, "".initdone·(SB)
pcdata $0, $0
call fmt.init(SB)
pcdata $0, $0
call time.init(SB)
movb $2, "".initdone·(SB)
movq (SP), BP
addq $8, SP
ret
nop
pcdata $0, $-1
// Calls runtime.morestack_noctxt, then jumps to 0 (presumably a retry from
// the start of the function).
call runtime.morestack_noctxt(SB)
jmp 0
// type..hash.[2]interface {} -- compiler-generated (DUPOK) hash routine for a
// two-element interface array. Frame 40 bytes, 24 bytes of arguments and
// results: "".p at +48(SP), "".h at +56(SP), result "".~r2 at +64(SP).
text type..hash.[2]interface {}(SB), DUPOK, $40-24
movq (TLS), CX
cmpq SP, 16(CX)
jls 103
subq $40, SP
movq BP, 32(SP)
leaq 32(SP), BP
funcdata $0, gclocals·d4dc2f11db048877dbc0f60a22b4adb3(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
// AX = 0 is the element index and CX = h the running hash; jump to the loop
// test (presumably the cmpq below).
xorl AX, AX
movq "".h+56(SP), CX
jmp 82
// Loop body: save the index, compute p + index*16 (shlq $4), and call
// runtime.nilinterhash with that address and the running hash. The word at
// 16(SP) after the call becomes the new running hash.
movq AX, "".i+24(SP)
shlq $4, AX
movq "".p+48(SP), BX
addq BX, AX
movq AX, (SP)
movq CX, 8(SP)
pcdata $0, $0
call runtime.nilinterhash(SB)
movq 16(SP), CX
movq "".i+24(SP), AX
incq AX
// Repeat while index < 2, then store the running hash as the result.
cmpq AX, $2
jlt 38
movq CX, "".~r2+64(SP)
movq 32(SP), BP
addq $40, SP
ret
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
// type..eq.[2]interface {} -- compiler-generated (DUPOK) equality routine for
// a two-element interface array. Frame 48 bytes, 24 bytes of arguments and
// results: "".p at +56(SP), "".q at +64(SP), one-byte result "".~r2 at +72(SP).
text type..eq.[2]interface {}(SB), DUPOK, $48-24
movq (TLS), CX
cmpq SP, 16(CX)
jls 155
subq $48, SP
movq BP, 40(SP)
leaq 40(SP), BP
funcdata $0, gclocals·8f9cec06d1ae35cc9900c511c5e4bdab(SB)
funcdata $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
// Start with index 0 and jump to the loop test (presumably the cmpq below).
xorl AX, AX
jmp 46
// Loop increment: reload the saved index and add 1.
movq ""..autotmp_8+32(SP), CX
leaq 1(CX), AX
// Leave the loop once index >= 2 (presumably to the "store 1" exit below).
cmpq AX, $2
jge 140
// Load both words of element [index] from p and from q (16-byte stride).
movq AX, CX
shlq $4, AX
movq "".p+56(SP), DX
movq 8(AX)(DX*1), BX
movq (AX)(DX*1), SI
movq "".q+64(SP), DI
movq 8(AX)(DI*1), R8
movq (AX)(DI*1), AX
// Differing first words end the comparison early (presumably at the
// "store 0" exit below).
cmpq SI, AX
jne 125
// Otherwise call runtime.efaceeq with p's first word and both second words.
// A non-zero result byte continues the loop.
movq CX, ""..autotmp_8+32(SP)
movq SI, (SP)
movq BX, 8(SP)
movq R8, 16(SP)
pcdata $0, $0
call runtime.efaceeq(SB)
movblzx 24(SP), AX
testb AL, AL
jne 37
// Exit storing 0 in the result.
movb $0, "".~r2+72(SP)
movq 40(SP), BP
addq $48, SP
ret
// Exit storing 1 in the result.
movb $1, "".~r2+72(SP)
movq 40(SP), BP
addq $48, SP
ret
nop
pcdata $0, $-1
call runtime.morestack_noctxt(SB)
jmp 0
我不知道其中大部分在做什麼。我只能希望其中大部分是某種 GC 代碼。我查找了如何為 Go 編譯器啟用優化,但我所能找到的只是如何禁用優化。
相比之下,我在 C++ 中查看了類似的函數:
#include <cstdio>
#include <chrono>
#include <cinttypes>
using namespace std::chrono;
// Returns the time elapsed since system_clock's epoch, converted to
// std::chrono::milliseconds with duration_cast.
milliseconds getMS()
{
return duration_cast< milliseconds >(
system_clock::now().time_since_epoch()
);
}
// Benchmark driver: prints "On step i" for i = 0..9, runs 2,000,000,000
// increments of count per step, then prints the elapsed milliseconds and count.
int main()
{
// NOTE(review): count is an int incremented 10 * 2,000,000,000 =
// 20,000,000,000 times in total. Where int is 32 bits this exceeds INT_MAX,
// and signed integer overflow is undefined behaviour in C++.
int count = 0;
milliseconds millis = getMS();
for(int i = 0; i < 10; ++i)
{
printf("On step %d\n", i);
for(int j = 0; j < 2000000000; ++j)
{
++count;
}
}
// Elapsed wall-clock time between the two getMS() samples.
milliseconds time = getMS() - millis;
// PRId64 is used for time.count(); count is printed with %d.
printf("Total time took: %" PRId64 " to get at count: %d\n", time.count(), count);
}
在沒有優化的情況下編譯為(編譯器:x86-64 clang (trunk, probably 6.0.0),標誌:-std=c++0x -O0):
# main as compiled at -O0 (Intel syntax): every local lives in a stack slot
# addressed from rbp and is reloaded and stored around each operation.
# Slots as used below:
#   [rbp - 8]   incremented in the inner loop, passed in edx to the last printf
#   [rbp - 16]  result of the first getMS() call
#   [rbp - 20]  outer loop counter (compared with 10)
#   [rbp - 24]  inner loop counter (compared with 2000000000)
#   [rbp - 40]  result of the second getMS() call
main: # @main
push rbp
mov rbp, rsp
sub rsp, 48
mov dword ptr [rbp - 4], 0
mov dword ptr [rbp - 8], 0
call getMS()
mov qword ptr [rbp - 16], rax
mov dword ptr [rbp - 20], 0
.LBB3_1: # =>This Loop Header: Depth=1
# Outer loop test: leave once the counter reaches 10.
cmp dword ptr [rbp - 20], 10
jge .LBB3_8
# printf(.L.str, counter); al = 0 because no vector registers carry
# arguments for this variadic call.
mov esi, dword ptr [rbp - 20]
movabs rdi, offset .L.str
mov al, 0
call printf
mov dword ptr [rbp - 24], 0
mov dword ptr [rbp - 44], eax # 4-byte Spill
.LBB3_3: # Parent Loop BB3_1 Depth=1
# Inner loop: two load/add/store sequences per iteration, one for
# [rbp - 8] and one for the inner counter [rbp - 24].
cmp dword ptr [rbp - 24], 2000000000
jge .LBB3_6
mov eax, dword ptr [rbp - 8]
add eax, 1
mov dword ptr [rbp - 8], eax
mov eax, dword ptr [rbp - 24]
add eax, 1
mov dword ptr [rbp - 24], eax
jmp .LBB3_3
.LBB3_6: # in Loop: Header=BB3_1 Depth=1
jmp .LBB3_7
.LBB3_7: # in Loop: Header=BB3_1 Depth=1
# Outer loop increment.
mov eax, dword ptr [rbp - 20]
add eax, 1
mov dword ptr [rbp - 20], eax
jmp .LBB3_1
.LBB3_8:
# Second getMS() sample; operator- is called with the addresses of the two
# samples and count() is called on its result.
call getMS()
mov qword ptr [rbp - 40], rax
lea rdi, [rbp - 40]
lea rsi, [rbp - 16]
call std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
mov qword ptr [rbp - 32], rax
lea rdi, [rbp - 32]
call std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
# printf(.L.str.1, elapsed in rsi, [rbp - 8] in edx).
mov edx, dword ptr [rbp - 8]
movabs rdi, offset .L.str.1
mov rsi, rax
mov al, 0
call printf
# Return the value kept in [rbp - 4] (set to 0 at entry).
mov edx, dword ptr [rbp - 4]
mov dword ptr [rbp - 48], eax # 4-byte Spill
mov eax, edx
add rsp, 48
pop rbp
ret
# Format strings referenced by the two printf calls above.
.L.str:
.asciz "On step %d\n"
.L.str.1:
.asciz "Total time took: %ld to get at count: %d\n"
實際上還有很多代碼,但它只是 chrono 的實現,在優化後的代碼中它只是一個庫函數調用。我還刪除了 getMS 的實現,因為它主要是一個包裝器方法。
通過 O1(大小)優化,這變成:
# main as compiled at -O1 (Intel syntax). No 2000000000-iteration loop
# remains: the only loop left calls printf ten times, and the count argument
# of the final printf is the immediate -1474836480.
main: # @main
push rbx
sub rsp, 32
call getMS()
mov qword ptr [rsp + 24], rax
# ebx is the loop counter, starting at 0.
xor ebx, ebx
.LBB3_1: # =>This Inner Loop Header: Depth=1
# printf(.L.str, ebx); eax is zeroed before the variadic call.
mov edi, offset .L.str
xor eax, eax
mov esi, ebx
call printf
add ebx, 1
cmp ebx, 10
jne .LBB3_1
# Second getMS() sample, then operator- and count() as library calls.
call getMS()
mov qword ptr [rsp + 8], rax
lea rdi, [rsp + 8]
lea rsi, [rsp + 24]
call std::common_type<std::chrono::duration<long, std::ratio<1l, 1000l> >, std::chrono::duration<long, std::ratio<1l, 1000l> > >::type std::chrono::operator-<long, std::ratio<1l, 1000l>, long, std::ratio<1l, 1000l> >(std::chrono::duration<long, std::ratio<1l, 1000l> > const&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&)
mov qword ptr [rsp + 16], rax
lea rdi, [rsp + 16]
call std::chrono::duration<long, std::ratio<1l, 1000l> >::count() const
mov rcx, rax
mov edi, offset .L.str.1
# The count is a compile-time constant: -1474836480 equals 20,000,000,000
# reduced modulo 2^32 and read as a signed 32-bit value.
mov edx, -1474836480
xor eax, eax
mov rsi, rcx
call printf
# Return 0.
xor eax, eax
add rsp, 32
pop rbx
ret
# Format strings referenced by the two printf calls above.
.L.str:
.asciz "On step %d\n"
.L.str.1:
.asciz "Total time took: %ld to get at count: %d\n"
O2(速度)和 O3(最大)優化本質上歸結為展開的外循環(僅用於打印語句)和預先計算的計數值。
這主要顯示了 Go 生成的糟糕代碼和 C++ 中發生的一些優化。但是這些都沒有準確地顯示 Java 字節碼包含什麼,或者如果它運行了足夠多次,JIT 會把它壓縮成什麼。所以這是 Java 字節碼:
// Disassembled bytecode of countToTwentyBillion. In this listing an operand
// printed on its own line belongs to the instruction on the line above it.
// Local slots as used below:
//   0  long accumulator, starts at 0 and is increased by 1 in the inner loop
//   2  long start time from System.currentTimeMillis()
//   4  int outer loop counter (compared with 10); reused after the loops as
//      a long holding the end time
//   5  int inner loop counter (compared with 2000000000)
public static void countToTwentyBillion();
Code:
0: lconst_0
1: lstore_0
2: invokestatic #2
// Method java/lang/System.currentTimeMillis:()J
5: lstore_2
6: iconst_0
7: istore
4
// Outer loop test: jump to offset 68 once local 4 >= 10.
9: iload
4
11: bipush
10
13: if_icmpge
68
// Build "On step " + local 4 with a StringBuilder and print it via println.
16: getstatic
#3
// Field java/lang/System.out:Ljava/io/PrintStream;
19: new
#4
// class java/lang/StringBuilder
22: dup
23: invokespecial #5
// Method java/lang/StringBuilder.'<init>':()V
26: ldc
#6
// String On step
28: invokevirtual #7
// Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
31: iload
4
33: invokevirtual #8
// Method java/lang/StringBuilder.append:(I)Ljava/lang/StringBuilder;
36: invokevirtual #9
// Method java/lang/StringBuilder.toString:()Ljava/lang/String;
39: invokevirtual #10
// Method java/io/PrintStream.println:(Ljava/lang/String;)V
// Inner loop: local 5 runs from 0 while it is below 2000000000.
42: iconst_0
43: istore
5
45: iload
5
47: ldc
#11
// int 2000000000
49: if_icmpge
62
// Inner loop body: local 0 = local 0 + 1 as a long, then local 5 += 1.
52: lload_0
53: lconst_1
54: ladd
55: lstore_0
56: iinc
5, 1
59: goto
45
// Outer loop increment, then back to the test at offset 9.
62: iinc
4, 1
65: goto
9
// After the loops: take the end time and store it as a long in local 4.
68: invokestatic #2
// Method java/lang/System.currentTimeMillis:()J
71: lstore
4
// Build and print the summary line: (end - start), then the accumulator.
73: getstatic
#3
// Field java/lang/System.out:Ljava/io/PrintStream;
76: new
#4
// class java/lang/StringBuilder
79: dup
80: invokespecial #5
// Method java/lang/StringBuilder.'<init>':()V
83: ldc
#12
// String Total time took:
85: invokevirtual #7
// Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
88: lload
4
90: lload_2
91: lsub
92: invokevirtual #13
// Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
95: ldc
#14
// String ms to get at count:
97: invokevirtual #7
// Method java/lang/StringBuilder.append:(Ljava/lang/String;)Ljava/lang/StringBuilder;
100: lload_0
101: invokevirtual #13
// Method java/lang/StringBuilder.append:(J)Ljava/lang/StringBuilder;
104: invokevirtual #9
// Method java/lang/StringBuilder.toString:()Ljava/lang/String;
107: invokevirtual #10
// Method java/io/PrintStream.println:(Ljava/lang/String;)V
110: return
不幸的是,目前我不想編譯 hsdis 和 JIT 代碼,但它最終可能看起來像一些 C++ 示例。根據我對 JIT 的了解,它可能能夠預先計算計數值。但是這段代碼有點復雜(就循環而言),這可能會使快速 JIT 優化變得更加困難。
- 2 回答
- 0 關注
- 144 瀏覽
添加回答
舉報