diff --git a/src/cpu/ppc/vm/assembler_ppc.hpp b/src/cpu/ppc/vm/assembler_ppc.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.hpp
+++ b/src/cpu/ppc/vm/assembler_ppc.hpp
@@ -624,6 +624,7 @@
     VNOR_OPCODE    = (4u  << OPCODE_SHIFT | 1284u     ),
     VOR_OPCODE     = (4u  << OPCODE_SHIFT | 1156u     ),
     VXOR_OPCODE    = (4u  << OPCODE_SHIFT | 1220u     ),
+    VRLD_OPCODE    = (4u  << OPCODE_SHIFT |  196u     ),
     VRLB_OPCODE    = (4u  << OPCODE_SHIFT |    4u     ),
     VRLW_OPCODE    = (4u  << OPCODE_SHIFT |  132u     ),
     VRLH_OPCODE    = (4u  << OPCODE_SHIFT |   68u     ),
@@ -2047,6 +2048,7 @@
   inline void vnor(    VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vor(     VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vxor(    VectorRegister d, VectorRegister a, VectorRegister b);
+  inline void vrld(    VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlb(    VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlw(    VectorRegister d, VectorRegister a, VectorRegister b);
   inline void vrlh(    VectorRegister d, VectorRegister a, VectorRegister b);
diff --git a/src/cpu/ppc/vm/assembler_ppc.inline.hpp b/src/cpu/ppc/vm/assembler_ppc.inline.hpp
--- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp
+++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp
@@ -839,6 +839,7 @@
 inline void Assembler::vnor(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VNOR_OPCODE  | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vor(     VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VOR_OPCODE   | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vxor(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VXOR_OPCODE  | vrt(d) | vra(a) | vrb(b)); }
+inline void Assembler::vrld(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLD_OPCODE  | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlb(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLB_OPCODE  | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlw(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLW_OPCODE  | vrt(d) | vra(a) | vrb(b)); }
 inline void Assembler::vrlh(    VectorRegister d, VectorRegister a, VectorRegister b) { emit_int32( VRLH_OPCODE  | vrt(d) | vra(a) | vrb(b)); }
diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
@@ -2417,6 +2417,411 @@
     return start;
   }
 
+  // Arguments for generated stub (little endian only):
+  //   R3_ARG1   - source byte array address
+  //   R4_ARG2   - destination byte array address
+  //   R5_ARG3   - round key array
+  address generate_aescrypt_encryptBlock() {
+    assert(UseAES, "need AES instructions");
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+
+    address start = __ function_entry();
+
+    Label L_doLast;
+
+    Register from     = R3_ARG1;  // source array address
+    Register to       = R4_ARG2;  // destination array address
+    Register key      = R5_ARG3;  // round key array
+
+    Register keylen   = R8;
+    Register temp     = R9;
+    Register keypos   = R10;
+    Register hex      = R11;
+    Register fifteen  = R12;
+
+    VectorRegister vRet     = VR0;
+
+    VectorRegister vKey1    = VR1;
+    VectorRegister vKey2    = VR2;
+    VectorRegister vKey3    = VR3;
+    VectorRegister vKey4    = VR4;
+
+    VectorRegister fromPerm = VR20;
+    VectorRegister keyPerm  = VR21;
+    VectorRegister toPerm   = VR22;
+    VectorRegister fSplt    = VR23;
+
+    VectorRegister vTmp1    = VR27;
+    VectorRegister vTmp2    = VR28;
+    VectorRegister vTmp3    = VR29;
+    VectorRegister vTmp4    = VR31;
+
+    VectorRegister vLow     = VR18;
+    VectorRegister vHigh    = VR19;
+
+    __ li (hex, 16);
+    __ li (fifteen, 15);
+    __ vspltisb (fSplt, 0x0f);
+
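+    // Unaligned-load idiom: the two lvx below fetch the aligned quadwords
+    // bracketing the 16 source bytes, lvsl yields a permute control for the
+    // misalignment, the xor with fSplt (0x0f) adjusts that control for
+    // little-endian byte numbering, and vperm merges the pair into vRet.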
+    // load unaligned from[0-15] to vRet
+    __ lvx (vRet, 0, from);
+    __ lvx (vTmp1, fifteen, from);
+    __ lvsl (fromPerm, 0, from);
+    __ vxor (fromPerm, fromPerm, fSplt);
+    __ vperm (vRet, vRet, vTmp1, fromPerm);
+
+    // load keylen (44 or 52 or 60)
+    __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
+
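+    // keyPerm, built from lvsr and adjusted below for the little-endian
+    // layout, is the permute control used to assemble the round keys: each
+    // 16-byte key is vperm-merged from two consecutive aligned lvx loads,
+    // so the key array need not be quadword aligned.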
+    // to load keys
+    __ lvsr (keyPerm, 0, key);
+    __ vxor (vTmp2, vTmp2, vTmp2);
+    __ vspltisb (vTmp2, -16);
+    __ vrld (keyPerm, keyPerm, vTmp2);
+    __ vrld (keyPerm, keyPerm, vTmp2);
+    __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
+
+    // load the 1st round key to vKey1
+    __ li (keypos, 0);
+    __ lvx (vKey1, keypos, key);
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey1, vTmp1, vKey1, keyPerm);
+
+    // 1st round
+    __ vxor (vRet, vRet, vKey1);
+
+    // load the 2nd round key to vKey1
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 3rd round key to vKey2
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 4th round key to vKey3
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 5th round key to vKey4
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // 2nd - 5th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 6th round key to vKey1
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 7th round key to vKey2
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // load the 8th round key to vKey3
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey3, vTmp2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey4
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey4, vTmp1, vTmp2, keyPerm);
+
+    // 6th - 9th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+    __ vcipher (vRet, vRet, vKey3);
+    __ vcipher (vRet, vRet, vKey4);
+
+    // load the 10th round key to vKey1
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey2
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 4 rounds
+    __ cmpwi (CCR0, keylen, 44);
+    __ beq (CCR0, L_doLast);
+
+    // 10th - 11th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+
+    // load the 12th round key to vKey1
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 13th round key to vKey2
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 2 rounds
+    __ cmpwi (CCR0, keylen, 52);
+    __ beq (CCR0, L_doLast);
+
+    // 12th - 13th rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipher (vRet, vRet, vKey2);
+
+    // load the 14th round key to vKey1
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey1, vTmp2, vTmp1, keyPerm);
+
+    // load the 15th round key to vKey2
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey2, vTmp1, vTmp2, keyPerm);
+
+    __ bind(L_doLast);
+
+    // last two rounds
+    __ vcipher (vRet, vRet, vKey1);
+    __ vcipherlast (vRet, vRet, vKey2);
+
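+    // Unaligned-store idiom: rotate the result into store position, build a
+    // byte-select mask from the destination alignment, then read-modify-write
+    // the two quadwords spanning to[0-15] with vsel so that bytes outside the
+    // block are preserved.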
+    __ neg (temp, to);
+    __ lvsr (toPerm, 0, temp);
+    __ vspltisb (vTmp2, -1);
+    __ vxor (vTmp1, vTmp1, vTmp1);
+    __ vperm (vTmp2, vTmp2, vTmp1, toPerm);
+    __ vxor (toPerm, toPerm, fSplt);
+    __ lvx (vTmp1, 0, to);
+    __ vperm (vRet, vRet, vRet, toPerm);
+    __ vsel (vTmp1, vTmp1, vRet, vTmp2);
+    __ lvx (vTmp4, fifteen, to);
+    __ stvx (vTmp1, 0, to);
+    __ vsel (vRet, vRet, vTmp4, vTmp2);
+    __ stvx (vRet, fifteen, to);
+
+    __ blr();
+    return start;
+  }
+
+  // Arguments for generated stub (little endian only):
+  //   R3_ARG1   - source byte array address
+  //   R4_ARG2   - destination byte array address
+  //   R5_ARG3   - K (key) in little endian int array
+  address generate_aescrypt_decryptBlock() {
+    assert(UseAES, "need AES instructions");
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+
+    address start = __ function_entry();
+
+    Label L_doLast;
+    Label L_do44;
+    Label L_do52;
+    Label L_do60;
+
+    Register from     = R3_ARG1;  // source array address
+    Register to       = R4_ARG2;  // destination array address
+    Register key      = R5_ARG3;  // round key array
+
+    Register keylen   = R8;
+    Register temp     = R9;
+    Register keypos   = R10;
+    Register hex      = R11;
+    Register fifteen  = R12;
+
+    VectorRegister vRet     = VR0;
+
+    VectorRegister vKey1    = VR1;
+    VectorRegister vKey2    = VR2;
+    VectorRegister vKey3    = VR3;
+    VectorRegister vKey4    = VR4;
+    VectorRegister vKey5    = VR5;
+    VectorRegister vKey6    = VR6;
+    VectorRegister vKey7    = VR7;
+    VectorRegister vKey8    = VR8;
+    VectorRegister vKey9    = VR9;
+    VectorRegister vKey10   = VR10;
+    VectorRegister vKey11   = VR11;
+    VectorRegister vKey12   = VR12;
+    VectorRegister vKey13   = VR13;
+    VectorRegister vKey14   = VR14;
+    VectorRegister vKey15   = VR15;
+
+    VectorRegister fromPerm = VR20;
+    VectorRegister keyPerm  = VR21;
+    VectorRegister toPerm   = VR22;
+    VectorRegister fSplt    = VR23;
+
+    VectorRegister vTmp1    = VR27;
+    VectorRegister vTmp2    = VR28;
+    VectorRegister vTmp3    = VR29;
+    VectorRegister vTmp4    = VR31;
+
+    VectorRegister vLow     = VR18;
+    VectorRegister vHigh    = VR19;
+
+    __ li (hex, 16);
+    __ li (fifteen, 15);
+    __ vspltisb (fSplt, 0x0f);
+
+    // load unaligned from[0-15] to vRet
+    __ lvx (vRet, 0, from);
+    __ lvx (vTmp1, fifteen, from);
+    __ lvsl (fromPerm, 0, from);
+    __ vxor (fromPerm, fromPerm, fSplt);
+    __ vperm (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
+
+    // load keylen (44 or 52 or 60)
+    __ lwz (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
+
+    // to load keys
+    __ lvsr (keyPerm, 0, key);
+    __ vxor (vTmp2, vTmp2, vTmp2);
+    __ vspltisb (vTmp2, -16);
+    __ vrld (keyPerm, keyPerm, vTmp2);
+    __ vrld (keyPerm, keyPerm, vTmp2);
+    __ vsldoi (keyPerm, keyPerm, keyPerm, 8);
+
+    // load the 1st round key to vKey1
+    __ li (keypos, 0);
+    __ lvx (vKey1, keypos, key);
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey1, vTmp1, vKey1, keyPerm);
+
+    // load the 2nd round key to vKey2
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey2, vTmp2, vTmp1, keyPerm);
+
+    // load the 3rd round key to vKey3
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey3, vTmp1, vTmp2, keyPerm);
+
+    // load the 4th round key to vKey4
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey4, vTmp2, vTmp1, keyPerm);
+
+    // load the 5th round key to vKey5
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey5, vTmp1, vTmp2, keyPerm);
+
+    // load the 6th round key to vKey6
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey6, vTmp2, vTmp1, keyPerm);
+
+    // load the 7th round key to vKey7
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey7, vTmp1, vTmp2, keyPerm);
+
+    // load the 8th round key to vKey8
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey8, vTmp2, vTmp1, keyPerm);
+
+    // load the 9th round key to vKey9
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey9, vTmp1, vTmp2, keyPerm);
+
+    // load the 10th round key to vKey10
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey10, vTmp2, vTmp1, keyPerm);
+
+    // load the 11th round key to vKey11
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey11, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 4 rounds
+    __ cmpwi (CCR0, keylen, 44);
+    __ beq (CCR0, L_do44);
+
+    // load the 12th round key to vKey12
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey12, vTmp2, vTmp1, keyPerm);
+
+    // load the 13th round key to vKey13
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey13, vTmp1, vTmp2, keyPerm);
+
+    // if all round keys are loaded, skip next 2 rounds
+    __ cmpwi (CCR0, keylen, 52);
+    __ beq (CCR0, L_do52);
+
+    // load the 14th round key to vKey14
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp2, keypos, key);
+    __ vperm (vKey14, vTmp2, vTmp1, keyPerm);
+
+    // load the 15th round key to vKey15
+    __ addi (keypos, keypos, 16);
+    __ lvx (vTmp1, keypos, key);
+    __ vperm (vKey15, vTmp1, vTmp2, keyPerm);
+
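+    // vncipher consumes the expanded key in reverse order and, unlike the x86
+    // path, works on the plain encryption key schedule (see
+    // get_key_start_from_aescrypt_object in library_call.cpp), so all round
+    // keys are kept live in vKey1-vKey15 and applied from the top round
+    // downwards; the keylen checks above branch to L_do44/L_do52 so that 128-
+    // and 192-bit keys start at their highest round key.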
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, @@ -2718,6 +3119,10 @@ StubRoutines::_montgomerySquare = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square); } + + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + } public: diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp --- a/src/cpu/ppc/vm/vm_version_ppc.cpp +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp @@ -112,7 +112,7 @@ // Create and print feature-string. char buf[(num_features+1) * 16]; // Max 16 chars per feature. jio_snprintf(buf, sizeof(buf), - "ppc64%s%s%s%s%s%s%s%s%s%s%s%s", + "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s", (has_fsqrt() ? " fsqrt" : ""), (has_isel() ? " isel" : ""), (has_lxarxeh() ? " lxarxeh" : ""), @@ -124,6 +124,7 @@ (has_vand() ? " vand" : ""), (has_lqarx() ? " lqarx" : ""), (has_vcipher() ? " vcipher" : ""), + (has_vcipher() ? " aes" : ""), (has_vpmsumb() ? " vpmsumb" : ""), (has_tcheck() ? " tcheck" : "") // Make sure number of %s matches num_features! @@ -186,6 +187,28 @@ } // The AES intrinsic stubs require AES instruction support. +#if defined(VM_LITTLE_ENDIAN) + if (has_vcipher()) { + if (FLAG_IS_DEFAULT(UseAES)) { + UseAES = true; + } + } else if (UseAES) { + if (!FLAG_IS_DEFAULT(UseAES)) + warning("AES instructions are not available on this CPU"); + FLAG_SET_DEFAULT(UseAES, false); + } + + if (UseAES && has_vcipher()) { + if (FLAG_IS_DEFAULT(UseAESIntrinsics)) { + UseAESIntrinsics = true; + } + } else if (UseAESIntrinsics) { + if (!FLAG_IS_DEFAULT(UseAESIntrinsics)) + warning("AES intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + +#else if (UseAES) { warning("AES instructions are not available on this CPU"); FLAG_SET_DEFAULT(UseAES, false); @@ -195,11 +218,8 @@ warning("AES intrinsics are not available on this CPU"); FLAG_SET_DEFAULT(UseAESIntrinsics, false); } +#endif - if (UseAESCTRIntrinsics) { - warning("AES/CTR intrinsics are not available on this CPU"); - FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); - } if (UseGHASHIntrinsics) { warning("GHASH intrinsics are not available on this CPU"); diff --git a/src/share/vm/opto/graphKit.cpp b/src/share/vm/opto/graphKit.cpp --- a/src/share/vm/opto/graphKit.cpp +++ b/src/share/vm/opto/graphKit.cpp @@ -1680,6 +1680,8 @@ Node* GraphKit::load_array_element(Node* ctl, Node* ary, Node* idx, const TypeAryPtr* arytype) { const Type* elemtype = arytype->elem(); BasicType elembt = elemtype->array_element_basic_type(); + if (elembt == T_NARROWOOP) + elembt = T_OBJECT; Node* adr = array_element_address(ary, idx, elembt, arytype->size()); Node* ld = make_load(ctl, adr, elemtype, elembt, arytype, MemNode::unordered); return ld; diff --git a/src/share/vm/opto/library_call.cpp b/src/share/vm/opto/library_call.cpp --- a/src/share/vm/opto/library_call.cpp +++ b/src/share/vm/opto/library_call.cpp @@ -6065,7 +6065,19 @@ //------------------------------get_key_start_from_aescrypt_object----------------------- Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) { +#ifdef PPC64 + // MixColumns for decryption can be reduced by preprocessing MixColumns with round keys. + // Intel's extention is based on this optimization and AESCrypt generates round keys by preprocessing MixColumns. + // However, ppc64 vncipher processes MixColumns and require the same round keys with encryption. 
+  Node* objSessionK = load_field_from_object(aescrypt_object, "sessionK", "[Ljava/lang/Object;", /*is_exact*/ false);
+  assert (objSessionK != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
+  if (objSessionK == NULL) return (Node *) NULL;
+  Node* objAESCryptKey = load_array_element(control(), objSessionK, intcon(0), TypeAryPtr::OOPS);
+#else
   Node* objAESCryptKey = load_field_from_object(aescrypt_object, "K", "[I", /*is_exact*/ false);
+#endif
   assert (objAESCryptKey != NULL, "wrong version of com.sun.crypto.provider.AESCrypt");
   if (objAESCryptKey == NULL) return (Node *) NULL;