diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
@@ -1723,55 +1723,202 @@
   //      count: R5_ARG3 treated as signed
   //
   void generate_disjoint_long_copy_core(bool aligned) {
-    Register tmp1 = R6_ARG4;
-    Register tmp2 = R7_ARG5;
-    Register tmp3 = R8_ARG6;
-    Register tmp4 = R0;
-
-    Label l_1, l_2, l_3, l_4;
-
-    { // FasterArrayCopy
-      __ cmpwi(CCR0, R5_ARG3, 3);
-      __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
-
-      __ srdi(tmp1, R5_ARG3, 2);
-      __ andi_(R5_ARG3, R5_ARG3, 3);
-      __ mtctr(tmp1);
-
-      __ bind(l_4);
-      // Use unrolled version for mass copying (copy 4 elements a time).
-      // Load feeding store gets zero latency on Power6, however not on Power5.
-      // Therefore, the following sequence is made for the good of both.
-      __ ld(tmp1, 0, R3_ARG1);
-      __ ld(tmp2, 8, R3_ARG1);
-      __ ld(tmp3, 16, R3_ARG1);
-      __ ld(tmp4, 24, R3_ARG1);
-      __ std(tmp1, 0, R4_ARG2);
-      __ std(tmp2, 8, R4_ARG2);
-      __ std(tmp3, 16, R4_ARG2);
-      __ std(tmp4, 24, R4_ARG2);
-      __ addi(R3_ARG1, R3_ARG1, 32);
-      __ addi(R4_ARG2, R4_ARG2, 32);
-      __ bdnz(l_4);
-    }
-
-    // copy 1 element at a time
-    __ bind(l_3);
-    __ cmpwi(CCR0, R5_ARG3, 0);
-    __ beq(CCR0, l_1);
-
-    { // FasterArrayCopy
-      __ mtctr(R5_ARG3);
-      __ addi(R3_ARG1, R3_ARG1, -8);
-      __ addi(R4_ARG2, R4_ARG2, -8);
-
-      __ bind(l_2);
-      __ ldu(R0, 8, R3_ARG1);
-      __ stdu(R0, 8, R4_ARG2);
-      __ bdnz(l_2);
-
-    }
-    __ bind(l_1);
+    Register tmp1 = R6_ARG4;
+    Register tmp2 = R7_ARG5;
+    Register tmp3 = R8_ARG6;
+    Register tmp4 = R9_ARG7;
+
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
+
+    // vector registers
+    VectorRegister vTmp1 = VR1;
+    VectorRegister vTmp2 = VR2;
+    VectorRegister vTmp3 = VR3;
+    VectorRegister vTmp4 = VR4;
+    VectorRegister vPerm = VR5;
+
+    __ cmpwi(CCR0, R5_ARG3, 15); // compare immediate; the result is placed into CCR0
+    __ ble(CCR0, l_10);          // fall back to the original code if <= 15 elements remain
+
+    // Check the 16-byte alignment of to and from.
+    __ andi_(tmp2, R4_ARG2, 15);  // to
+    __ clrldi(tmp3, R3_ARG1, 60); // from
+
+    __ cmpld(CCR0, tmp2, tmp3);
+    __ bne(CCR0, l_6); // to and from alignments do not match
+
+    __ cmpdi(CCR0, tmp3, 0);
+    __ beq(CCR0, l_7); // to and from are both 16-byte aligned
+
+    // Copy one element (8 bytes) first to reach 16-byte alignment.
+    {
+      __ ld(tmp1, 0, R3_ARG1);
+      __ std(tmp1, 0, R4_ARG2);
+      __ addi(R3_ARG1, R3_ARG1, 8);
+      __ addi(R4_ARG2, R4_ARG2, 8);
+      __ addi(R5_ARG3, R5_ARG3, -1);
+    }
+
+    __ bind(l_7);
+    __ li(tmp2, 16);
+    __ li(tmp3, 32);
+    __ li(tmp4, 48);
+
+    { // VMX ArrayCopy
+      __ cmpwi(CCR0, R5_ARG3, 15); // compare immediate; the result is placed into CCR0
+      __ ble(CCR0, l_10); // copy 4 at a time if <= 15 elements remain
+
+      __ srdi(tmp1, R5_ARG3, 4);
+      __ andi_(R5_ARG3, R5_ARG3, 15);
+      __ mtctr(tmp1); // move tmp1 into the count register (CTR)
+
+      __ bind(l_5);
+      // Copy 16 elements (128 bytes) per iteration.
+      __ lvx(vTmp1, 0, R3_ARG1);
+      __ stvx(vTmp1, 0, R4_ARG2);
+      __ lvx(vTmp1, tmp2, R3_ARG1);
+      __ stvx(vTmp1, tmp2, R4_ARG2);
+      __ lvx(vTmp1, tmp3, R3_ARG1);
+      __ stvx(vTmp1, tmp3, R4_ARG2);
+      __ lvx(vTmp1, tmp4, R3_ARG1);
+      __ stvx(vTmp1, tmp4, R4_ARG2);
+      __ addi(R3_ARG1, R3_ARG1, 64);
+      __ addi(R4_ARG2, R4_ARG2, 64);
+
+      __ lvx(vTmp1, 0, R3_ARG1);
+      __ stvx(vTmp1, 0, R4_ARG2);
+      __ lvx(vTmp1, tmp2, R3_ARG1);
+      __ stvx(vTmp1, tmp2, R4_ARG2);
+      __ lvx(vTmp1, tmp3, R3_ARG1);
+      __ stvx(vTmp1, tmp3, R4_ARG2);
+      __ lvx(vTmp1, tmp4, R3_ARG1);
+      __ stvx(vTmp1, tmp4, R4_ARG2);
+      __ addi(R3_ARG1, R3_ARG1, 64);
+      __ addi(R4_ARG2, R4_ARG2, 64);
+
+      __ bdnz(l_5);
+    }
+
+    __ b(l_10);
+
+    // Either from or to is not 16-byte aligned.
+    __ bind(l_6);
+
+    __ lvsr(vPerm, 0, R3_ARG1); // for little-endian
+    __ cmpdi(CCR0, tmp2, 0);    // if to is aligned
+    __ beq(CCR0, l_8);
+    {
+      __ lvsr(vPerm, 0, R4_ARG2); // for little-endian
+      __ ld(tmp1, 0, R3_ARG1);    // copy the first element (8 bytes)
+      __ std(tmp1, 0, R4_ARG2);
+      __ addi(R3_ARG1, R3_ARG1, 8);
+      __ addi(R4_ARG2, R4_ARG2, 8);
+      __ addi(R5_ARG3, R5_ARG3, -1); // the last element is copied after l_9
+    }
+
+    __ bind(l_8); // to is now 16-byte aligned; from is read via lvx/vperm
+
+    {
+      __ cmpwi(CCR0, R5_ARG3, 15);
+      __ ble(CCR0, l_10); // copy 4 at a time if <= 15 elements remain
+      __ srdi(tmp1, R5_ARG3, 4);
+      __ andi_(R5_ARG3, R5_ARG3, 15);
+      __ mtctr(tmp1); // move tmp1 into the count register (CTR)
+
+      __ li(tmp2, 16);
+      __ li(tmp3, 32);
+      __ li(tmp4, 48);
+
+      __ lvx(vTmp1, 0, R3_ARG1);
+      __ bind(l_9); // copy 16 elements (128 bytes) per iteration
+
+      __ lvx(vTmp2, tmp2, R3_ARG1);
+      __ vperm(vTmp3, vTmp2, vTmp1, vPerm);
+      __ stvx(vTmp3, 0, R4_ARG2);    // bytes 0..16
+      __ lvx(vTmp1, tmp3, R3_ARG1);
+      __ vperm(vTmp3, vTmp1, vTmp2, vPerm);
+      __ stvx(vTmp3, tmp2, R4_ARG2); // bytes 16..32
+      __ lvx(vTmp2, tmp4, R3_ARG1);
+      __ vperm(vTmp3, vTmp2, vTmp1, vPerm);
+      __ stvx(vTmp3, tmp3, R4_ARG2); // bytes 32..48
+
+      __ addi(R3_ARG1, R3_ARG1, 64);
+
+      __ lvx(vTmp1, 0, R3_ARG1);
+      __ vperm(vTmp3, vTmp1, vTmp2, vPerm);
+      __ stvx(vTmp3, tmp4, R4_ARG2); // bytes 48..64
+
+      __ addi(R4_ARG2, R4_ARG2, 64);
+
+      // second 64 bytes of the iteration
+      __ lvx(vTmp2, tmp2, R3_ARG1);
+      __ vperm(vTmp3, vTmp2, vTmp1, vPerm);
+      __ stvx(vTmp3, 0, R4_ARG2);    // bytes 0..16
+      __ lvx(vTmp1, tmp3, R3_ARG1);
+      __ vperm(vTmp3, vTmp1, vTmp2, vPerm);
+      __ stvx(vTmp3, tmp2, R4_ARG2); // bytes 16..32
+      __ lvx(vTmp2, tmp4, R3_ARG1);
+      __ vperm(vTmp3, vTmp2, vTmp1, vPerm);
+      __ stvx(vTmp3, tmp3, R4_ARG2); // bytes 32..48
+
+      __ addi(R3_ARG1, R3_ARG1, 64);
+
+      __ lvx(vTmp1, 0, R3_ARG1);
+      __ vperm(vTmp3, vTmp1, vTmp2, vPerm);
+      __ stvx(vTmp3, tmp4, R4_ARG2); // bytes 48..64
+
+      __ addi(R4_ARG2, R4_ARG2, 64);
+
+      __ bdnz(l_9);
+
+    }
+
+    __ bind(l_10);
+
+    { // FasterArrayCopy
+      __ cmpwi(CCR0, R5_ARG3, 3);
+      __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
+
+      __ srdi(tmp1, R5_ARG3, 2);
+      __ andi_(R5_ARG3, R5_ARG3, 3);
+      __ mtctr(tmp1);
+
+      __ bind(l_4);
+      // Use unrolled version for mass copying (copy 4 elements a time).
+      // Load feeding store gets zero latency on Power6, however not on Power5.
+      // Therefore, the following sequence is made for the good of both.
+      __ ld(tmp1, 0, R3_ARG1);
+      __ ld(tmp2, 8, R3_ARG1);
+      __ ld(tmp3, 16, R3_ARG1);
+      __ ld(tmp4, 24, R3_ARG1);
+      __ std(tmp1, 0, R4_ARG2);
+      __ std(tmp2, 8, R4_ARG2);
+      __ std(tmp3, 16, R4_ARG2);
+      __ std(tmp4, 24, R4_ARG2);
+      __ addi(R3_ARG1, R3_ARG1, 32);
+      __ addi(R4_ARG2, R4_ARG2, 32);
+      __ bdnz(l_4);
+    }
+
+    // copy 1 element at a time
+    __ bind(l_3);
+    __ cmpwi(CCR0, R5_ARG3, 0);
+    __ beq(CCR0, l_1);
+
+    { // FasterArrayCopy
+      __ mtctr(R5_ARG3);
+      __ addi(R3_ARG1, R3_ARG1, -8);
+      __ addi(R4_ARG2, R4_ARG2, -8);
+
+      __ bind(l_2);
+      __ ldu(R0, 8, R3_ARG1);
+      __ stdu(R0, 8, R4_ARG2);
+      __ bdnz(l_2);
+
+    }
+    __ bind(l_1);
+
   }

   // Generate stub for disjoint long copy. If "aligned" is true, the
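
For reference, the following standalone C++ sketch outlines the strategy the new stub follows: copy one element to reach 16-byte alignment when possible, stream 16 elements (128 bytes) per iteration, then finish with the pre-existing 4-at-a-time and 1-at-a-time tails. It is not part of the patch; the function name is illustrative and plain element copies stand in for the lvx/stvx (or lvsr/vperm) instructions.

// Minimal sketch of the copy strategy, assuming non-overlapping 8-byte-aligned buffers.
#include <cstdint>
#include <cstddef>
#include <iostream>

static void disjoint_long_copy_sketch(const int64_t* from, int64_t* to, ptrdiff_t count) {
  // Head: if both pointers have the same misalignment within a 16-byte block,
  // copy one element first. For 8-byte-aligned long arrays the only possible
  // offset is 8, so one element brings both pointers to 16-byte alignment.
  if (count > 15 &&
      (reinterpret_cast<uintptr_t>(from) & 15) == (reinterpret_cast<uintptr_t>(to) & 15) &&
      (reinterpret_cast<uintptr_t>(to) & 15) != 0) {
    *to++ = *from++;
    --count;
  }
  // Bulk: 16 elements (128 bytes) per iteration; the stub uses VMX lvx/stvx here,
  // or lvsr/vperm when the source and destination alignments differ.
  while (count > 15) {
    for (int i = 0; i < 16; ++i) to[i] = from[i];
    from += 16; to += 16; count -= 16;
  }
  // Tail: 4 elements at a time, then 1 at a time, as in the original code.
  while (count > 3) {
    to[0] = from[0]; to[1] = from[1]; to[2] = from[2]; to[3] = from[3];
    from += 4; to += 4; count -= 4;
  }
  while (count-- > 0) *to++ = *from++;
}

int main() {
  int64_t src[40], dst[40];
  for (int i = 0; i < 40; ++i) src[i] = i;
  disjoint_long_copy_sketch(src, dst, 40);
  std::cout << dst[39] << "\n"; // prints 39
  return 0;
}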