
src/cpu/aarch64/vm/stubGenerator_aarch64.cpp

rev 11452 : 8159063: aarch64: optimise unaligned array copy long
Reviewed-by: aph, adinn


 784     int unit = wordSize * direction;
 785     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 786 
 787     int offset;
 788     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 789       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 790     const Register stride = r13;
 791 
 792     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 793     assert_different_registers(s, d, count, rscratch1);
 794 
 795     Label again, drain;
 796     const char *stub_name;
 797     if (direction == copy_forwards)
 798       stub_name = "forward_copy_longs";
 799     else
 800       stub_name = "backward_copy_longs";
 801     StubCodeMark mark(this, "StubRoutines", stub_name);
 802     __ align(CodeEntryAlignment);
 803     __ bind(start);






 804     if (direction == copy_forwards) {
 805       __ sub(s, s, bias);
 806       __ sub(d, d, bias);
 807     }
 808 
 809 #ifdef ASSERT
 810     // Make sure we are never given < 8 words
 811     {
 812       Label L;
 813       __ cmp(count, 8);
 814       __ br(Assembler::GE, L);
 815       __ stop("generate_copy_longs called with < 8 words");
 816       __ bind(L);
 817     }
 818 #endif
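
The bias subtraction above is what lets every load and store in the block
loop use fixed positive multiples of unit with a single pre-indexed pointer
update per 64 byte block. A rough C model of the forwards SIMD fill that
follows (an editorial sketch with invented names, not the generated code;
it assumes wordSize == 8, so unit == 8 and bias == 32):

    #include <string.h>

    // Editorial sketch: fill one 64 byte block, with s already biased
    // down by 32 bytes.  Each q register is modelled as 16 bytes.
    static const char *fill_block_sketch(const char *s, char v[4][16]) {
      memcpy(v[0], s + 32, 16);   // ldpq v0, v1, [s, #4 * unit]
      memcpy(v[1], s + 48, 16);
      s += 64;                    // pre-index: s += 8 * unit, then load
      memcpy(v[2], s,      16);   // ldpq v2, v3, [s, #8 * unit]!
      memcpy(v[3], s + 16, 16);
      return s;                   // still 32 bytes behind the next block
    }
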
 819 
 820     // Fill 8 registers
 821     if (UseSIMDForMemoryOps) {
 822       __ ldpq(v0, v1, Address(s, 4 * unit));
 823       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));


 884       } else {
 885         __ ldp(t0, t1, Address(s, 2 * unit));
 886         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 887         __ stp(t0, t1, Address(d, 2 * unit));
 888         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 889       }
 890       __ bind(L1);
 891 
 892       if (direction == copy_forwards) {
 893         __ add(s, s, bias);
 894         __ add(d, d, bias);
 895       }
 896 
 897       __ tbz(count, 1, L2);
 898       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 899       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 900       __ bind(L2);
 901     }
 902 
 903     __ ret(lr);
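
After the 8-word main loop only the low bits of count can still be set, and
the tbz tests above turn each of bits 2 and 1 into one fixed-size sub-block
copy. The net effect for the forwards case, as an editorial C sketch
(hypothetical names, not the generated code):

    #include <stdint.h>

    // Editorial sketch: tail of the aligned copy, count in 8-byte words.
    static void copy_tail_sketch(const int64_t *s, int64_t *d, uint64_t count) {
      if (count & 4) {            // tbz count, 2, L1
        for (int i = 0; i < 4; i++) d[i] = s[i];
        s += 4;  d += 4;
      }
      if (count & 2) {            // tbz count, 1, L2
        d[0] = s[0];
        d[1] = s[1];
      }
    }
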

 904   }
 905 
 906   // Small copy: less than 16 bytes.
 907   //
 908   // NB: Ignores all of the bits of count which represent more than 15
 909   // bytes, so a caller doesn't have to mask them.
 910 
 911   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 912     bool is_backwards = step < 0;
 913     size_t granularity = uabs(step);
 914     int direction = is_backwards ? -1 : 1;
 915     int unit = wordSize * direction;
 916 
 917     Label Lpair, Lword, Lint, Lshort, Lbyte;
 918 
 919     assert(granularity
 920            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 921 
 922     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 923 


1007 
1008       __ stp(t0, t1, Address(d, 0));
1009       __ stp(t2, t3, Address(d, 16));
1010       __ stp(t4, t5, Address(dend, -32));
1011       __ stp(t6, t7, Address(dend, -16));
1012     }
1013     __ b(finish);
1014 
1015     // 17..32 bytes
1016     __ bind(copy32);
1017     __ ldp(t0, t1, Address(s, 0));
1018     __ ldp(t2, t3, Address(send, -16));
1019     __ stp(t0, t1, Address(d, 0));
1020     __ stp(t2, t3, Address(dend, -16));
1021     __ b(finish);
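
The copy32 block above relies on overlapping accesses: one 16 byte pair is
copied from each end of the range, and for lengths below 32 the two pairs
simply overlap, so no length-dependent branch is needed. A minimal C sketch
of the idea (illustrative names, not the generated code):

    #include <string.h>

    // Editorial sketch: copy 17..32 bytes with two overlapping 16 byte moves.
    static void copy_17_to_32_sketch(char *d, const char *s, size_t len) {
      char head[16], tail[16];
      memcpy(head, s, 16);              // ldp t0, t1, [s]
      memcpy(tail, s + len - 16, 16);   // ldp t2, t3, [send, #-16]
      memcpy(d, head, 16);              // stp t0, t1, [d]
      memcpy(d + len - 16, tail, 16);   // stp t2, t3, [dend, #-16]
    }
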
1022 
1023     // 65..80/96 bytes
1024     // (96 bytes if SIMD because we do 32 bytes per instruction)
1025     __ bind(copy80);
1026     if (UseSIMDForMemoryOps) {
1027       __ ldpq(v0, v1, Address(s, 0));
1028       __ ldpq(v2, v3, Address(s, 32));
1029       __ ldpq(v4, v5, Address(send, -32));
1030       __ stpq(v0, v1, Address(d, 0));
1031       __ stpq(v2, v3, Address(d, 32));
1032       __ stpq(v4, v5, Address(dend, -32));
1033     } else {
1034       __ ldp(t0, t1, Address(s, 0));
1035       __ ldp(t2, t3, Address(s, 16));
1036       __ ldp(t4, t5, Address(s, 32));
1037       __ ldp(t6, t7, Address(s, 48));
1038       __ ldp(t8, t9, Address(send, -16));
1039 
1040       __ stp(t0, t1, Address(d, 0));
1041       __ stp(t2, t3, Address(d, 16));
1042       __ stp(t4, t5, Address(d, 32));
1043       __ stp(t6, t7, Address(d, 48));
1044       __ stp(t8, t9, Address(dend, -16));
1045     }
1046     __ b(finish);
1047 
1048     // 0..16 bytes
1049     __ bind(copy16);
1050     __ cmp(count, 8/granularity);
1051     __ br(Assembler::LO, copy8);




 784     int unit = wordSize * direction;
 785     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 786 
 787     int offset;
 788     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 789       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 790     const Register stride = r13;
 791 
 792     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 793     assert_different_registers(s, d, count, rscratch1);
 794 
 795     Label again, drain;
 796     const char *stub_name;
 797     if (direction == copy_forwards)
 798       stub_name = "forward_copy_longs";
 799     else
 800       stub_name = "backward_copy_longs";
 801     StubCodeMark mark(this, "StubRoutines", stub_name);
 802     __ align(CodeEntryAlignment);
 803     __ bind(start);
 804 
 805     Label unaligned_copy_long;
 806     if (AvoidUnalignedAccesses) {
 807       __ tbnz(d, 3, unaligned_copy_long);
 808     }
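
The tbnz above dispatches on nothing but bit 3 of the destination: for a
long copy d is already 8 byte aligned, so that single bit says whether the
paired 16 byte accesses of the main path would be misaligned. A one-line C
model (editorial sketch, hypothetical name):

    #include <stdint.h>
    #include <stdbool.h>

    // Editorial sketch: assumes d is 8 byte aligned, as it is for a long copy.
    static bool needs_unaligned_path_sketch(const void *d) {
      return ((uintptr_t)d & 8) != 0;   // tbnz d, 3, unaligned_copy_long
    }
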
 809 
 810     if (direction == copy_forwards) {
 811       __ sub(s, s, bias);
 812       __ sub(d, d, bias);
 813     }
 814 
 815 #ifdef ASSERT
 816     // Make sure we are never given < 8 words
 817     {
 818       Label L;
 819       __ cmp(count, 8);
 820       __ br(Assembler::GE, L);
 821       __ stop("generate_copy_longs called with < 8 words");
 822       __ bind(L);
 823     }
 824 #endif
 825 
 826     // Fill 8 registers
 827     if (UseSIMDForMemoryOps) {
 828       __ ldpq(v0, v1, Address(s, 4 * unit));
 829       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));


 890       } else {
 891         __ ldp(t0, t1, Address(s, 2 * unit));
 892         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 893         __ stp(t0, t1, Address(d, 2 * unit));
 894         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 895       }
 896       __ bind(L1);
 897 
 898       if (direction == copy_forwards) {
 899         __ add(s, s, bias);
 900         __ add(d, d, bias);
 901       }
 902 
 903       __ tbz(count, 1, L2);
 904       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 905       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 906       __ bind(L2);
 907     }
 908 
 909     __ ret(lr);
 910 
 911     if (AvoidUnalignedAccesses) {
 912       Label drain, again;
 913       // Register order for storing. Order is different for backward copy.
 914 
 915       __ bind(unaligned_copy_long);
 916 
 917       // source address is even aligned, target odd aligned
 918       //
 919       // When forward copying word pairs we read long pairs at offsets
 920       // {0, 2, 4, 6} (in long words). When backwards copying we read
 921       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 922       // address by -2 words in the forwards case so we can compute the
 923       // source offsets for both as {2, 4, 6, 8} * unit, where unit = 1
 924       // or -1 word.
 925       //
 926       // When forward copying we need to store 1 word, 3 pairs and
 927       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 928       // zero offset we adjust the destination by -1 word, which means
 929       // we have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 930       //
 931       // When backwards copying we need to store 1 word, 3 pairs and
 932       // then 1 word at offsets {-1, -3, -5, -7, -8}, i.e. we use
 933       // offsets {1, 3, 5, 7, 8} * unit.
 934 
 935       if (direction == copy_forwards) {
 936         __ sub(s, s, 16);
 937         __ sub(d, d, 8);
 938       }
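
With d moved back by one word, the forwards store pattern below writes one
lone word, three pairs and a final lone word, which puts every stp on a
16 byte aligned address; only the two single str instructions touch the odd
word addresses. As an editorial C sketch of one 64 byte block, with offsets
taken relative to the original, unbiased d (names are illustrative):

    #include <stdint.h>

    // Editorial sketch: forwards store of one block in the unaligned path.
    static void store_block_forwards_sketch(int64_t *d, const int64_t t[8]) {
      d[0] = t[0];                  // str t0      -- 8 byte aligned only
      d[1] = t[1];  d[2] = t[2];    // stp t1, t2  -- 16 byte aligned
      d[3] = t[3];  d[4] = t[4];    // stp t3, t4
      d[5] = t[5];  d[6] = t[6];    // stp t5, t6
      d[7] = t[7];                  // str t7      -- 8 byte aligned only
    }
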
 939 
 940       // Fill 8 registers
 941       //
 942       // for forwards copy s was offset by -16 from the original input
 943       // value of s, so the register contents are at these offsets
 944       // relative to the 64 byte block addressed by that original input
 945       // and so on for each successive 64 byte block when s is updated
 946       //
 947       // t0 at offset 0,  t1 at offset 8
 948       // t2 at offset 16, t3 at offset 24
 949       // t4 at offset 32, t5 at offset 40
 950       // t6 at offset 48, t7 at offset 56
 951 
 952       // for backwards copy s was not offset so the register contents
 953       // are at these offsets into the preceding 64 byte block
 954       // relative to that original input and so on for each successive
 955       // preceding 64 byte block when s is updated. This explains the
 956       // slightly counter-intuitive looking pattern of register usage
 957       // in the stp instructions for backwards copy.
 958       //
 959       // t0 at offset -16, t1 at offset -8
 960       // t2 at offset -32, t3 at offset -24
 961       // t4 at offset -48, t5 at offset -40
 962       // t6 at offset -64, t7 at offset -56
 963 
 964       __ ldp(t0, t1, Address(s, 2 * unit));
 965       __ ldp(t2, t3, Address(s, 4 * unit));
 966       __ ldp(t4, t5, Address(s, 6 * unit));
 967       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 968 
 969       __ subs(count, count, 16);
 970       __ br(Assembler::LO, drain);
 971 
 972       int prefetch = PrefetchCopyIntervalInBytes;
 973       bool use_stride = false;
 974       if (direction == copy_backwards) {
 975          use_stride = prefetch > 256;
 976          prefetch = -prefetch;
 977          if (use_stride) __ mov(stride, prefetch);
 978       }
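
The stride register exists because a backwards copy wants a negative
prefetch offset, and (my reading of the addressing forms, stated as an
assumption) the immediate variants of prfm only reach small negative
displacements, so distances beyond 256 bytes are held in a register
instead. A sketch of the decision (editorial, hypothetical names):

    #include <stdbool.h>

    // Editorial sketch of the dispatch above, not the generated code.
    static void pick_prefetch_form_sketch(int interval, bool backwards,
                                          bool *use_stride, int *offset) {
      *use_stride = false;
      *offset = interval;                 // prfm [s, #interval]
      if (backwards) {
        *use_stride = interval > 256;     // too far for a negative immediate
        *offset = -interval;              // mov stride, offset (if use_stride)
      }
    }
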
 979 
 980       __ bind(again);
 981 
 982       if (PrefetchCopyIntervalInBytes > 0)
 983         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 984 
 985       if (direction == copy_forwards) {
 986        // allowing for the offset of -8 the store instructions place
 987        // registers into the target 64 byte block at the following
 988        // offsets
 989        //
 990        // t0 at offset 0
 991        // t1 at offset 8,  t2 at offset 16
 992        // t3 at offset 24, t4 at offset 32
 993        // t5 at offset 40, t6 at offset 48
 994        // t7 at offset 56
 995 
 996         __ str(t0, Address(d, 1 * unit));
 997         __ stp(t1, t2, Address(d, 2 * unit));
 998         __ ldp(t0, t1, Address(s, 2 * unit));
 999         __ stp(t3, t4, Address(d, 4 * unit));
1000         __ ldp(t2, t3, Address(s, 4 * unit));
1001         __ stp(t5, t6, Address(d, 6 * unit));
1002         __ ldp(t4, t5, Address(s, 6 * unit));
1003         __ str(t7, Address(__ pre(d, 8 * unit)));
1004         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1005       } else {
1006        // d was not offset when we started so the registers are
1007        // written into the 64 byte block preceding d with the following
1008        // offsets
1009        //
1010        // t1 at offset -8
1011        // t3 at offset -24, t0 at offset -16
1012        // t5 at offset -40, t2 at offset -32
1013        // t7 at offset -56, t4 at offset -48
1014        //                   t6 at offset -64
1015        //
1016        // note that this matches the offsets previously noted for the
1017        // loads
1018 
1019         __ str(t1, Address(d, 1 * unit));
1020         __ stp(t3, t0, Address(d, 3 * unit));
1021         __ ldp(t0, t1, Address(s, 2 * unit));
1022         __ stp(t5, t2, Address(d, 5 * unit));
1023         __ ldp(t2, t3, Address(s, 4 * unit));
1024         __ stp(t7, t4, Address(d, 7 * unit));
1025         __ ldp(t4, t5, Address(s, 6 * unit));
1026         __ str(t6, Address(__ pre(d, 8 * unit)));
1027         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1028       }
1029 
1030       __ subs(count, count, 8);
1031       __ br(Assembler::HS, again);
1032 
1033       // Drain
1034       //
1035       // this uses the same pattern of offsets and register arguments
1036       // as above
1037       __ bind(drain);
1038       if (direction == copy_forwards) {
1039         __ str(t0, Address(d, 1 * unit));
1040         __ stp(t1, t2, Address(d, 2 * unit));
1041         __ stp(t3, t4, Address(d, 4 * unit));
1042         __ stp(t5, t6, Address(d, 6 * unit));
1043         __ str(t7, Address(__ pre(d, 8 * unit)));
1044       } else {
1045         __ str(t1, Address(d, 1 * unit));
1046         __ stp(t3, t0, Address(d, 3 * unit));
1047         __ stp(t5, t2, Address(d, 5 * unit));
1048         __ stp(t7, t4, Address(d, 7 * unit));
1049         __ str(t6, Address(__ pre(d, 8 * unit)));
1050       }
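
Together, the preload, the again loop and the drain above form a one block
deep software pipeline: a 64 byte block is always held in registers, each
iteration stores the previously loaded block while loading the next, and
the drain flushes the final block once fewer than 8 words remain. An
equivalent structure in C (editorial sketch, not a line-for-line
transcription; count is in 8 byte words and is at least 8):

    #include <stdint.h>
    #include <string.h>

    // Editorial sketch of the pipelined main loop plus drain.
    static void pipelined_copy_sketch(const int64_t *s, int64_t *d, size_t count) {
      int64_t t[8];
      memcpy(t, s, sizeof t);  s += 8;       // preload first block
      count -= 8;
      while (count >= 8) {                   // subs count, count, 8; br HS again
        memcpy(d, t, sizeof t);  d += 8;     // store previous block
        memcpy(t, s, sizeof t);  s += 8;     // load next block
        count -= 8;
      }
      memcpy(d, t, sizeof t);                // drain: store the last block
    }
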
1051       // now we need to copy any remaining part block which may
1052       // include a 4 word subblock and/or a 2 word subblock.
1053       // bits 2 and 1 in the count are the tell-tale for whether we
1054       // have each such subblock
1055       {
1056         Label L1, L2;
1057         __ tbz(count, exact_log2(4), L1);
1058        // this is the same as above but copying only 4 longs hence
1059        // with only one intervening stp between the str instructions
1060        // but note that the offsets and registers still follow the
1061        // same pattern
1062         __ ldp(t0, t1, Address(s, 2 * unit));
1063         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1064         if (direction == copy_forwards) {
1065           __ str(t0, Address(d, 1 * unit));
1066           __ stp(t1, t2, Address(d, 2 * unit));
1067           __ str(t3, Address(__ pre(d, 4 * unit)));
1068         } else {
1069           __ str(t1, Address(d, 1 * unit));
1070           __ stp(t3, t0, Address(d, 3 * unit));
1071           __ str(t2, Address(__ pre(d, 4 * unit)));
1072         }
1073         __ bind(L1);
1074 
1075         __ tbz(count, 1, L2);
1076        // this is the same as above but copying only 2 longs hence
1077        // there is no intervening stp between the str instructions
1078        // but note that the offset and register patterns are still
1079        // the same
1080         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1081         if (direction == copy_forwards) {
1082           __ str(t0, Address(d, 1 * unit));
1083           __ str(t1, Address(__ pre(d, 2 * unit)));
1084         } else {
1085           __ str(t1, Address(d, 1 * unit));
1086           __ str(t0, Address(__ pre(d, 2 * unit)));
1087         }
1088         __ bind(L2);
1089 
1090        // for forwards copy we need to re-adjust the offsets we
1091        // applied so that s and d follow the last words written
1092 
1093        if (direction == copy_forwards) {
1094          __ add(s, s, 16);
1095          __ add(d, d, 8);
1096        }
1097 
1098       }
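
The tail keeps the same lone-word-then-pair shape as the main pattern, so
the stp in the 4 word case still lands on a 16 byte aligned address. Net
effect for the forwards case, with offsets relative to the original,
unbiased d (editorial C sketch, hypothetical names):

    #include <stdint.h>

    // Editorial sketch: unaligned-path tail, count in 8 byte words.
    static void unaligned_tail_forwards_sketch(int64_t *d, const int64_t *s,
                                               uint64_t count) {
      if (count & 4) {              // tbz count, 2, L1
        d[0] = s[0];                // str t0
        d[1] = s[1];  d[2] = s[2];  // stp t1, t2  -- 16 byte aligned
        d[3] = s[3];                // str t3
        s += 4;  d += 4;
      }
      if (count & 2) {              // tbz count, 1, L2
        d[0] = s[0];                // str t0
        d[1] = s[1];                // str t1
      }
    }
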
1099 
1100       __ ret(lr);
1101       }
1102   }
1103 
1104   // Small copy: less than 16 bytes.
1105   //
1106   // NB: Ignores all of the bits of count which represent more than 15
1107   // bytes, so a caller doesn't have to mask them.
1108 
1109   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1110     bool is_backwards = step < 0;
1111     size_t granularity = uabs(step);
1112     int direction = is_backwards ? -1 : 1;
1113     int unit = wordSize * direction;
1114 
1115     Label Lpair, Lword, Lint, Lshort, Lbyte;
1116 
1117     assert(granularity
1118            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1119 
1120     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
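
Only the bits of count that encode fewer than 16 bytes are ever consulted
here, which is what lets a caller pass an unmasked element count. A small
C model of that contract (editorial sketch, hypothetical name):

    #include <stddef.h>

    // Editorial sketch: bytes this routine will actually move.
    static size_t small_copy_bytes_sketch(size_t count, size_t granularity) {
      size_t mask = (16 / granularity) - 1;   // 15 for bytes, 1 for longs
      return (count & mask) * granularity;
    }
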
1121 


1205 
1206       __ stp(t0, t1, Address(d, 0));
1207       __ stp(t2, t3, Address(d, 16));
1208       __ stp(t4, t5, Address(dend, -32));
1209       __ stp(t6, t7, Address(dend, -16));
1210     }
1211     __ b(finish);
1212 
1213     // 17..32 bytes
1214     __ bind(copy32);
1215     __ ldp(t0, t1, Address(s, 0));
1216     __ ldp(t2, t3, Address(send, -16));
1217     __ stp(t0, t1, Address(d, 0));
1218     __ stp(t2, t3, Address(dend, -16));
1219     __ b(finish);
1220 
1221     // 65..80/96 bytes
1222     // (96 bytes if SIMD because we do 32 bytes per instruction)
1223     __ bind(copy80);
1224     if (UseSIMDForMemoryOps) {
1225       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));

1226       __ ldpq(v4, v5, Address(send, -32));
1227       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));

1228       __ stpq(v4, v5, Address(dend, -32));
1229     } else {
1230       __ ldp(t0, t1, Address(s, 0));
1231       __ ldp(t2, t3, Address(s, 16));
1232       __ ldp(t4, t5, Address(s, 32));
1233       __ ldp(t6, t7, Address(s, 48));
1234       __ ldp(t8, t9, Address(send, -16));
1235 
1236       __ stp(t0, t1, Address(d, 0));
1237       __ stp(t2, t3, Address(d, 16));
1238       __ stp(t4, t5, Address(d, 32));
1239       __ stp(t6, t7, Address(d, 48));
1240       __ stp(t8, t9, Address(dend, -16));
1241     }
1242     __ b(finish);
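
The ld4/st4 pair introduced above de-interleaves 64 bytes across four
vector registers on the load and re-interleaves them on the store, so the
net result is a plain 64 byte copy; only the in-register byte layout
differs from the ldpq/stpq version it replaces. An editorial C model of
that round trip (illustrative name, not the generated code):

    // Editorial sketch: ld4 {v0.16b-v3.16b}, [s] followed by
    // st4 {v0.16b-v3.16b}, [d] copies 64 bytes unchanged.
    static void ld4_st4_roundtrip_sketch(char *d, const char *s) {
      char v[4][16];
      for (int i = 0; i < 16; i++)        // ld4: v[j][i] = s[4*i + j]
        for (int j = 0; j < 4; j++)
          v[j][i] = s[4 * i + j];
      for (int i = 0; i < 16; i++)        // st4: d[4*i + j] = v[j][i]
        for (int j = 0; j < 4; j++)
          d[4 * i + j] = v[j][i];
    }
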
1243 
1244     // 0..16 bytes
1245     __ bind(copy16);
1246     __ cmp(count, 8/granularity);
1247     __ br(Assembler::LO, copy8);

