diff --git a/src/cpu/ppc/vm/assembler_ppc.hpp b/src/cpu/ppc/vm/assembler_ppc.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.hpp
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp
@@ -503,6 +503,9 @@
     LVSL_OPCODE    = (31u << OPCODE_SHIFT |   6u << 1),
     LVSR_OPCODE    = (31u << OPCODE_SHIFT |  38u << 1),
 
+    LXVD2X_OPCODE  = (31u << OPCODE_SHIFT | 844u << 1),
+    STXVD2X_OPCODE = (31u << OPCODE_SHIFT | 972u << 1),
+
     // Vector Permute and Formatting
     VPKPX_OPCODE   = (4u  << OPCODE_SHIFT | 782u      ),
     VPKSHSS_OPCODE = (4u  << OPCODE_SHIFT | 398u      ),
@@ -2065,6 +2068,10 @@
   inline void mtvscr(   VectorRegister b);
   inline void mfvscr(   VectorRegister d);
 
+  // VSX Vector instructions.
+  inline void lxvd2x(  VectorRegister d, Register a, Register b);
+  inline void stxvd2x( VectorRegister d, Register a, Register b);
+
   // AES (introduced with Power 8)
   inline void vcipher(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vcipherlast( VectorRegister d, VectorRegister a, VectorRegister b);
diff --git a/src/cpu/ppc/vm/assembler_ppc.inline.hpp b/src/cpu/ppc/vm/assembler_ppc.inline.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp
@@ -720,6 +720,8 @@
 inline void Assembler::stvxl( VectorRegister d, Register s1, Register s2) { emit_int32( STVXL_OPCODE | vrt(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::lvsl(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSL_OPCODE  | vrt(d) | ra0mem(s1) | rb(s2)); }
 inline void Assembler::lvsr(  VectorRegister d, Register s1, Register s2) { emit_int32( LVSR_OPCODE  | vrt(d) | ra0mem(s1) | rb(s2)); }
+inline void Assembler::lxvd2x( VectorRegister d, Register s1, Register s2) { emit_int32( LXVD2X_OPCODE  | vrt(d) | ra(s1) | rb(s2)); }
+inline void Assembler::stxvd2x(VectorRegister d, Register s1, Register s2) { emit_int32( STXVD2X_OPCODE | vrt(d) | ra(s1) | rb(s2)); }
 
 inline void Assembler::vpkpx(   VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKPX_OPCODE   | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vpkshss( VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VPKSHSS_OPCODE | vrt(d) | vra(a) | vrb(b)); }
diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
@@ -1726,22 +1726,72 @@
     Register tmp1 = R6_ARG4;
     Register tmp2 = R7_ARG5;
     Register tmp3 = R8_ARG6;
-    Register tmp4 = R0;
-
-    Label l_1, l_2, l_3, l_4;
-
-    { // FasterArrayCopy
+    Register tmp4 = R9_ARG7;
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6;
+
+    // Vector registers used by the VSX copy loop.
+    VectorRegister vTmp1 = VR1;
+    VectorRegister vTmp2 = VR2;
+    VectorRegister vTmp3 = VR3;
+    VectorRegister vTmp4 = VR4;
+
+    __ cmpwi(CCR0, R5_ARG3, 15); // Compare immediate; the result of the comparison is placed into CCR0.
+    __ ble(CCR0, l_6);           // Copy 4 at a time (original code) if <= 15 elements remain.
+
+    { // VSX ArrayCopy
+      __ srdi(tmp1, R5_ARG3, 4);
+      __ andi_(R5_ARG3, R5_ARG3, 15);
+      __ mtctr(tmp1);
+
+      __ li(tmp2, 16);
+      __ li(tmp3, 32);
+      __ li(tmp4, 48);
+
+      __ bind(l_5);
+
+      // Copy 16 elements (128 bytes in total) per iteration.
+      __ lxvd2x(vTmp1, 0, R3_ARG1);
+      __ stxvd2x(vTmp1, 0, R4_ARG2);
+      __ lxvd2x(vTmp1, tmp2, R3_ARG1);
+      __ stxvd2x(vTmp1, tmp2, R4_ARG2);
+      __ lxvd2x(vTmp1, tmp3, R3_ARG1);
+      __ stxvd2x(vTmp1, tmp3, R4_ARG2);
+      __ lxvd2x(vTmp1, tmp4, R3_ARG1);
+      __ stxvd2x(vTmp1, tmp4, R4_ARG2);
+      __ addi(R3_ARG1, R3_ARG1, 64);
+      __ addi(R4_ARG2, R4_ARG2, 64);
+
+      __ lxvd2x(vTmp1, 0, R3_ARG1);
+      __ stxvd2x(vTmp1, 0, R4_ARG2);
+      __ lxvd2x(vTmp1, tmp2, R3_ARG1);
+      __ stxvd2x(vTmp1, tmp2, R4_ARG2);
+      __ lxvd2x(vTmp1, tmp3, R3_ARG1);
+      __ stxvd2x(vTmp1, tmp3, R4_ARG2);
+      __ lxvd2x(vTmp1, tmp4, R3_ARG1);
+      __ stxvd2x(vTmp1, tmp4, R4_ARG2);
+      __ addi(R3_ARG1, R3_ARG1, 64);
+      __ addi(R4_ARG2, R4_ARG2, 64);
+
+      __ bdnz(l_5);
+    }
+
+    __ bind(l_6);
+
+    { // FasterArrayCopy
+
       __ cmpwi(CCR0, R5_ARG3, 3);
       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
 
       __ srdi(tmp1, R5_ARG3, 2);
       __ andi_(R5_ARG3, R5_ARG3, 3);
      __ mtctr(tmp1);
 
-      __ bind(l_4);
+
       // Use unrolled version for mass copying (copy 4 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
       // Therefore, the following sequence is made for the good of both.
+      __ bind(l_4);
       __ ld(tmp1, 0, R3_ARG1);
       __ ld(tmp2, 8, R3_ARG1);
       __ ld(tmp3, 16, R3_ARG1);
@@ -1752,6 +1802,7 @@
       __ std(tmp4, 24, R4_ARG2);
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
+
       __ bdnz(l_4);
     }
 
@@ -1769,9 +1820,9 @@
       __ ldu(R0, 8, R3_ARG1);
       __ stdu(R0, 8, R4_ARG2);
       __ bdnz(l_2);
-    }
 
     __ bind(l_1);
+    }
   }
 
   // Generate stub for disjoint long copy.  If "aligned" is true, the
diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp
@@ -643,6 +643,7 @@
   a->vpmsumb(VR0, VR1, VR2);                   // code[11] -> vpmsumb
   a->tcheck(0);                                // code[12] -> tcheck
   a->mfdscr(R0);                               // code[13] -> mfdscr
+  a->lxvd2x(VR0, 0, R3_ARG1);                  // code[14] -> vsx
   a->blr();
 
   // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it.
@@ -691,6 +692,7 @@
   if (code[feature_cntr++]) features |= vpmsumb_m;
   if (code[feature_cntr++]) features |= tcheck_m;
   if (code[feature_cntr++]) features |= mfdscr_m;
+  if (code[feature_cntr++]) features |= vsx_m;
 
   // Print the detection code.
   if (PrintAssembly) {
diff --git a/src/cpu/ppc/vm/vm_version_ppc.hpp b/src/cpu/ppc/vm/vm_version_ppc.hpp
--- a/src/cpu/ppc/vm/vm_version_ppc.hpp
+++ b/src/cpu/ppc/vm/vm_version_ppc.hpp
@@ -46,6 +46,7 @@
     vpmsumb,
     tcheck,
     mfdscr,
+    vsx,
     num_features // last entry to count features
   };
   enum Feature_Flag_Set {
@@ -64,6 +65,7 @@
     vpmsumb_m             = (1 << vpmsumb),
     tcheck_m              = (1 << tcheck ),
     mfdscr_m              = (1 << mfdscr ),
+    vsx_m                 = (1 << vsx    ),
     all_features_m        = (unsigned long)-1
   };
 
@@ -97,6 +99,7 @@
   static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; }
   static bool has_tcheck()  { return (_features & tcheck_m)  != 0; }
   static bool has_mfdscr()  { return (_features & mfdscr_m)  != 0; }
+  static bool has_vsx()     { return (_features & vsx_m)     != 0; }
 
   // Assembler testing
   static void allow_all();
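
Note (not part of the patch): for readers without the Power ISA at hand, the sketch below mirrors what the new "VSX ArrayCopy" loop above emits, written as plain C++ against the GCC/Clang VSX intrinsics from altivec.h. It assumes a POWER7-or-later target built with -mvsx; the helper name copy_longs_vsx is made up for illustration and is not part of HotSpot.

// Illustration only: same structure as the stub's l_5 loop -- 16 longs
// (128 bytes) per iteration, as two groups of four 16-byte lxvd2x/stxvd2x
// pairs with the base pointers bumped by 64 bytes in between.
#include <altivec.h>
#include <cstddef>

static void copy_longs_vsx(const long* from, long* to, std::size_t count) {
  while (count >= 16) {
    for (int group = 0; group < 2; ++group) {
      const double* s = reinterpret_cast<const double*>(from);
      double*       d = reinterpret_cast<double*>(to);
      vec_vsx_st(vec_vsx_ld( 0, s),  0, d);   // lxvd2x/stxvd2x at offset 0
      vec_vsx_st(vec_vsx_ld(16, s), 16, d);   // ... offset 16
      vec_vsx_st(vec_vsx_ld(32, s), 32, d);   // ... offset 32
      vec_vsx_st(vec_vsx_ld(48, s), 48, d);   // ... offset 48
      from += 8;   // advance 64 bytes
      to   += 8;
    }
    count -= 16;
  }
  // Tail: the stub falls back to the 4-at-a-time FasterArrayCopy block and
  // then to single-element copies; a plain loop is enough for illustration.
  while (count-- > 0) {
    *to++ = *from++;
  }
}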
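Also for reference: LXVD2X_OPCODE and STXVD2X_OPCODE are X-form encodings, i.e. primary opcode 31 in the top six bits and the extended opcode in bits 21..30, which is why 844 and 972 are shifted left by one. Assuming the usual field placement behind vrt/ra/rb (RT at bit 21, RA at bit 16, RB at bit 11), the emitted instruction words can be sanity-checked on any host with the snippet below; the printed values should match what a disassembler shows for lxvd2x/stxvd2x vs1,r3,r4.

// Illustration only: rebuild the words Assembler::lxvd2x/stxvd2x would emit,
// under the field-placement assumptions stated above.
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned OPCODE_SHIFT = 26;  // primary opcode occupies the top 6 bits
  const std::uint32_t LXVD2X_OPCODE  = (31u << OPCODE_SHIFT) | (844u << 1);
  const std::uint32_t STXVD2X_OPCODE = (31u << OPCODE_SHIFT) | (972u << 1);

  // lxvd2x/stxvd2x vs1,r3,r4: T=1, RA=3, RB=4.
  std::uint32_t lx = LXVD2X_OPCODE  | (1u << 21) | (3u << 16) | (4u << 11);
  std::uint32_t st = STXVD2X_OPCODE | (1u << 21) | (3u << 16) | (4u << 11);
  std::printf("lxvd2x  vs1,r3,r4 -> 0x%08x\n", lx);  // expected 0x7c232698
  std::printf("stxvd2x vs1,r3,r4 -> 0x%08x\n", st);  // expected 0x7c232798
  return 0;
}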