    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);
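
    // When AvoidUnalignedAccesses is set, destinations that are only
    // 8-byte aligned (bit 3 of d set, i.e. d % 16 == 8) divert to a
    // variant whose paired stores all land on 16-byte boundaries.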
    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }
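    // (After this bias, the first load below at 2 * unit, or 4 * unit
    // in the SIMD case, addresses the original value of s; backward
    // copies need no bias because unit is already negative.)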

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
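      // ... (the rest of the register fill, the main aligned-copy loop
      // and its drain are not shown in this excerpt; it resumes below
      // inside the tail-handling block's own UseSIMDForMemoryOps test)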
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

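      // bit 1 of count flags a final 2 word subblock (a 4 word
      // subblock is handled by a tbz on bit 2 in the code not shown)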
      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

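      // Example (forwards): with unit == 1 word, after s -= 16 and
      // d -= 8 below, the loads at {2, 4, 6, 8} * unit read the eight
      // longs at the original s + {0..56} and the stores at
      // {1, 2, 4, 6, 8} * unit write the original d + {0..56}.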
      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. This explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);
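      // Eight longs are now buffered in t0..t7; subtracting 16 pays
      // for those plus the eight the loop body loads ahead, so LO here
      // means a full iteration cannot run and we go straight to the
      // drain.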

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }
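      // The backwards prefetch distance is negative, and prfm can only
      // encode small negative offsets as an immediate (the unscaled
      // form covers -256..255), so larger distances are prefetched via
      // the stride register.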

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 byte block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 byte block preceding d with the
        // following offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }
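      // Each pass stores the eight longs loaded on the previous pass
      // while refilling t0..t7 for the next, software-pipelining the
      // loop so loads and stores stay in flight together.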

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
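      // (e.g. a residual count of 6 has bit 2 set, giving a 4 long
      // subblock, and bit 1 set, giving a final 2 long subblock)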
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

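    // ... (the body of copy_memory_small is not shown in this excerpt;
    // it resumes partway through the following medium-size copy code,
    // in what appears to be its 33..64 byte case)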

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
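    // The first ldp/stp covers bytes [0, 16) and the second covers
    // [count - 16, count); for counts below 32 the two ranges simply
    // overlap in the middle, which is cheaper than computing an exact
    // split.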
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
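      // ld4/st4 move 64 bytes as a single load and a single store; st4
      // re-interleaves exactly what ld4 de-interleaved, so the pair
      // amounts to a straight 64 byte copy.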
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);
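    // count is in elements of 'granularity' bytes here, so
    // 8/granularity is the number of elements making up 8 bytes.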