/*
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// Major contributions by LS

#ifndef CPU_S390_VM_COPY_S390_HPP
#define CPU_S390_VM_COPY_S390_HPP

// Inline functions for memory copy and fill.

// HeapWordSize (the size of class HeapWord) is 8 Bytes (the size of a
// pointer variable), since we always run the _LP64 model. As a consequence,
// HeapWord* memory ranges are always assumed to be doubleword-aligned,
// having a size which is an integer multiple of HeapWordSize.
//
// Dealing only with doubleword-aligned doubleword units has important
// positive performance and data access consequences. Many of the move
// instructions perform particularly well under these circumstances.
// Data access is "doubleword-concurrent", except for MVC and XC.
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
// by use of the special padding byte 0xb1, where required. For copying,
// we use padding byte 0xb0 to prevent the D-cache from being polluted.
//
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
// This is optimal, even if just one HeapWord is copied. However, MVC
// copying is not atomic, i.e. not "doubleword concurrent" by definition.
//
// If the -mmvcle compiler option is specified, memcpy translates into
// code such that the entire memory range is copied or preset with just
// one MVCLE instruction.
//
// *to = *from is transformed into a MVC instruction already with -O1.
// Thus, for atomic copy operations, (inline) assembler code is required
// to guarantee atomic data accesses.
//
// For large (len >= MVCLEThreshold) chunks of memory, we exploit
// special H/W support of z/Architecture:
// 1) copy short piece of memory to page-align address(es)
// 2) copy largest part (all contained full pages) of memory using mvcle instruction.
//    z/Architecture processors have special H/W support for page-aligned storage
//    where len is an int multiple of page size. In that case, up to 4 cache lines are
//    processed in parallel and L1 cache is not polluted.
// 3) copy the remaining piece of memory.
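//    As an illustrative sketch only (PAGE_SIZE and the names below are
//    assumptions made for this comment, not identifiers from this file,
//    and len is assumed to span at least one full page), the split amounts to:
//      head = (PAGE_SIZE - ((uintptr_t)to & (PAGE_SIZE-1))) & (PAGE_SIZE-1); // step 1
//      body = (len - head) & ~(size_t)(PAGE_SIZE-1);                         // step 2 (MVCLE)
//      tail = len - head - body;                                             // step 3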
//
// Measurement classifications:
// very rare - <= 10.000 calls AND <= 1.000 usec elapsed
// rare      - <= 100.000 calls AND <= 10.000 usec elapsed
// some      - <= 1.000.000 calls AND <= 100.000 usec elapsed
// freq      - <= 10.000.000 calls AND <= 1.000.000 usec elapsed
// very freq - > 10.000.000 calls OR > 1.000.000 usec elapsed

#undef USE_INLINE_ASM

static void copy_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static void copy_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static bool has_destructive_overlap(char* from, char* to, size_t byte_count) {
  return (from < to) && ((to-from) < (ptrdiff_t)byte_count);
}

#ifdef USE_INLINE_ASM

//--------------------------------------------------------------
// Atomic copying. Atomicity is given by the minimum of source
// and target alignment. Refer to mail comm with Tim Slegel/IBM.
// Only usable for disjoint source and target.
//--------------------------------------------------------------
#define MOVE8_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */ \
    "MVC     0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */ \
    : [to]       "+Q" (_to)        /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : \
    : "cc"                         /* clobbered */ \
  ); \
}
#define MOVE8_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */ \
    "MVC     0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */ \
    : [to]       "+Q" (_to)        /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : \
    : "cc"                         /* clobbered */ \
  ); \
}
#define MOVE8_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */ \
    "MVC     0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */ \
    : [to]       "+Q" (_to)        /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : \
    : "cc"                         /* clobbered */ \
  ); \
}
#define MOVE8_ATOMIC_1(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */ \
    "MVC     0(8,%[toaddr]),0(%[fromaddr])  \n\t" /* move data            */ \
    : [to]       "+Q" (_to)        /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : \
    : "cc"                         /* clobbered */ \
  ); \
}

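// Usage note: the MOVE8_ATOMIC_n macros are the building blocks used by
// pd_aligned_disjoint_words() below; e.g. a disjoint copy of 2 HeapWords maps
// to MOVE8_ATOMIC_2(to,from), i.e. a single 16-byte MVC on doubleword-aligned
// operands.
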
//--------------------------------------------------------------
// Atomic copying of 8-byte entities.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 8-byte aligned.
//--------------------------------------------------------------
#define COPY8_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      3,%[from]         \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]   \n\t" /* address of to area   */ \
    "LMG     0,3,0(3)          \n\t" /* load data            */ \
    "STMG    0,3,0(%[toaddr])  \n\t" /* store data           */ \
    : [to]     "+Q" (_to)            /* outputs   */ \
    , [from]   "+Q" (_from)          /* outputs   */ \
    , [toaddr] "=a" (toaddr)         /* inputs    */ \
    : \
    : "cc", "r0", "r1", "r2", "r3"   /* clobbered */ \
  ); \
}
#define COPY8_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      2,%[from]         \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]   \n\t" /* address of to area   */ \
    "LMG     0,2,0(2)          \n\t" /* load data            */ \
    "STMG    0,2,0(%[toaddr])  \n\t" /* store data           */ \
    : [to]     "+Q" (_to)            /* outputs   */ \
    , [from]   "+Q" (_from)          /* outputs   */ \
    , [toaddr] "=a" (toaddr)         /* inputs    */ \
    : \
    : "cc", "r0", "r1", "r2"         /* clobbered */ \
  ); \
}
#define COPY8_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      1,%[from]         \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]   \n\t" /* address of to area   */ \
    "LMG     0,1,0(1)          \n\t" /* load data            */ \
    "STMG    0,1,0(%[toaddr])  \n\t" /* store data           */ \
    : [to]     "+Q" (_to)            /* outputs   */ \
    , [from]   "+Q" (_from)          /* outputs   */ \
    , [toaddr] "=a" (toaddr)         /* inputs    */ \
    : \
    : "cc", "r0", "r1"               /* clobbered */ \
  ); \
}
#define COPY8_ATOMIC_1(_to,_from) { \
  unsigned long addr; \
  asm( \
    "LG      %[addr],%[from]   \n\t" /* address of from area */ \
    "LG      0,0(0,%[addr])    \n\t" /* load data            */ \
    "LG      %[addr],%[to]     \n\t" /* address of to area   */ \
    "STG     0,0(0,%[addr])    \n\t" /* store data           */ \
    : [to]   "+Q" (_to)              /* outputs   */ \
    , [from] "+Q" (_from)            /* outputs   */ \
    , [addr] "=a" (addr)             /* inputs    */ \
    : \
    : "cc", "r0"                     /* clobbered */ \
  ); \
}

//--------------------------------------------------------------
// Atomic copying of 4-byte entities.
// At most 4 (four) entities are copied.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 4-byte aligned.
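// (Used by pd_conjoint_jints_atomic() below to copy up to four overlapping jints.)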
//--------------------------------------------------------------
#define COPY4_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      3,%[from]         \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]   \n\t" /* address of to area   */ \
    "LM      0,3,0(3)          \n\t" /* load data            */ \
    "STM     0,3,0(%[toaddr])  \n\t" /* store data           */ \
    : [to]     "+Q" (_to)            /* outputs   */ \
    , [from]   "+Q" (_from)          /* outputs   */ \
    , [toaddr] "=a" (toaddr)         /* inputs    */ \
    : \
    : "cc", "r0", "r1", "r2", "r3"   /* clobbered */ \
  ); \
}
#define COPY4_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      2,%[from]         \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]   \n\t" /* address of to area   */ \
    "LM      0,2,0(2)          \n\t" /* load data            */ \
    "STM     0,2,0(%[toaddr])  \n\t" /* store data           */ \
    : [to]     "+Q" (_to)            /* outputs   */ \
    , [from]   "+Q" (_from)          /* outputs   */ \
    , [toaddr] "=a" (toaddr)         /* inputs    */ \
    : \
    : "cc", "r0", "r1", "r2"         /* clobbered */ \
  ); \
}
#define COPY4_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      1,%[from]         \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]   \n\t" /* address of to area   */ \
    "LM      0,1,0(1)          \n\t" /* load data            */ \
    "STM     0,1,0(%[toaddr])  \n\t" /* store data           */ \
    : [to]     "+Q" (_to)            /* outputs   */ \
    , [from]   "+Q" (_from)          /* outputs   */ \
    , [toaddr] "=a" (toaddr)         /* inputs    */ \
    : \
    : "cc", "r0", "r1"               /* clobbered */ \
  ); \
}
#define COPY4_ATOMIC_1(_to,_from) { \
  unsigned long addr; \
  asm( \
    "LG      %[addr],%[from]   \n\t" /* address of from area */ \
    "L       0,0(0,%[addr])    \n\t" /* load data            */ \
    "LG      %[addr],%[to]     \n\t" /* address of to area   */ \
    "ST      0,0(0,%[addr])    \n\t" /* store data           */ \
    : [to]   "+Q" (_to)              /* outputs   */ \
    , [from] "+Q" (_from)            /* outputs   */ \
    , [addr] "=a" (addr)             /* inputs    */ \
    : \
    : "cc", "r0"                     /* clobbered */ \
  ); \
}

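// Copy a short, variable-length chunk with a single EXecuted MVC instruction
// (one MVC moves at most 256 bytes). The EXRL-based variant directly below is
// kept disabled (#if 0) until gcc supports EXRL; the active variant
// materializes the MVC template address with LARL and uses EX instead.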
#if 0 // Waiting for gcc to support EXRL.
#define MVC_MEMCOPY(_to,_from,_len) \
  if (VM_Version::has_ExecuteExtensions()) { \
    asm("\t" \
      "   LAY     1,-1(0,%[len])             \n\t" /* decr for MVC      */ \
      "   EXRL    1,1f                       \n\t" /* execute MVC instr */ \
      "   BRC     15,2f                      \n\t" /* skip template     */ \
      "1: MVC     0(%[len],%[to]),0(%[from]) \n\t" \
      "2: BCR     0,0                        \n\t" \
      : [to]   "+Q" (_to)    /* outputs   */ \
      , [from] "+Q" (_from)  /* outputs   */ \
      : [len]  "r"  (_len)   /* inputs    */ \
      : "cc", "r1"           /* clobbered */ \
    ); \
  } else { \
    asm("\t" \
      "   LARL    2,3f                       \n\t" \
      "   LAY     1,-1(0,%[len])             \n\t" /* decr for MVC      */ \
      "   EX      1,0(2)                     \n\t" /* execute MVC instr */ \
      "   BRC     15,4f                      \n\t" /* skip template     */ \
      "3: MVC     0(%[len],%[to]),0(%[from]) \n\t" \
      "4: BCR     0,0                        \n\t" \
      : [to]   "+Q" (_to)    /* outputs   */ \
      , [from] "+Q" (_from)  /* outputs   */ \
      : [len]  "r"  (_len)   /* inputs    */ \
      : "cc", "r1", "r2"     /* clobbered */ \
    ); \
  }
#else
#define MVC_MEMCOPY(_to,_from,_len) \
{ unsigned long toaddr;   unsigned long tolen; \
  unsigned long fromaddr; unsigned long target; \
  asm("\t" \
    "   LTGR    %[tolen],%[len]               \n\t" /* decr for MVC        */ \
    "   BRC     8,2f                          \n\t" /* do nothing for l=0  */ \
    "   AGHI    %[tolen],-1                   \n\t" \
    "   LG      %[toaddr],%[to]               \n\t" \
    "   LG      %[fromaddr],%[from]           \n\t" \
    "   LARL    %[target],1f                  \n\t" /* addr of MVC instr   */ \
    "   EX      %[tolen],0(%[target])         \n\t" /* execute MVC instr   */ \
    "   BRC     15,2f                         \n\t" /* skip template       */ \
    "1: MVC     0(1,%[toaddr]),0(%[fromaddr]) \n\t" \
    "2: BCR     0,0                           \n\t" /* nop a branch target */ \
    : [to]       "+Q" (_to)       /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [tolen]    "=a" (tolen) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    , [target]   "=a" (target) \
    : [len]      "r"  (_len)      /* inputs    */ \
    : "cc"                        /* clobbered */ \
  ); \
}
#endif

#if 0 // code snippet to be used for debugging
      /* ASSERT code BEGIN */ \
      "   LARL    %[len],5f     \n\t" \
      "   LARL    %[mta],4f     \n\t" \
      "   SLGR    %[len],%[mta] \n\t" \
      "   CGHI    %[len],16     \n\t" \
      "   BRC     7,9f          \n\t" /* block size != 16  */ \
      \
      "   LARL    %[len],1f     \n\t" \
      "   SLGR    %[len],%[mta] \n\t" \
      "   CGHI    %[len],256    \n\t" \
      "   BRC     7,9f          \n\t" /* list len != 256   */ \
      \
      "   LGR     0,0           \n\t" /* artificial SIGILL */ \
      "9: BRC     7,-2          \n\t" \
      "   LARL    %[mta],1f     \n\t" /* restore MVC table begin */ \
      /* ASSERT code END */
#endif

// Optimized copying for data less than 4k
// - no destructive overlap
// - 0 <= _n_bytes <= 4096
// This macro needs to be gcc-compiled with -march=z990. Otherwise, the
// LAY instruction is not available.
#define MVC_MULTI(_to,_from,_n_bytes) \
{ unsigned long toaddr; \
  unsigned long fromaddr; \
  unsigned long movetable; \
  unsigned long len; \
  asm("\t" \
    "   LTGFR   %[len],%[nby]            \n\t" \
    "   LG      %[ta],%[to]              \n\t" /* address of to area    */ \
    "   BRC     8,1f                     \n\t" /* nothing to copy       */ \
    \
    "   NILL    %[nby],255               \n\t" /* # bytes mod 256       */ \
    "   LG      %[fa],%[from]            \n\t" /* address of from area  */ \
    "   BRC     8,3f                     \n\t" /* no rest, skip copying */ \
    \
    "   LARL    %[mta],2f                \n\t" /* MVC template addr     */ \
    "   AHI     %[nby],-1                \n\t" /* adjust for EX MVC     */ \
    \
    "   EX      %[nby],0(%[mta])         \n\t" /* only rightmost        */ \
                                               /* 8 bits of nby used    */ \
    /* Since nby is <= 4096 on entry to this code, we do not need */ \
    /* zero extension before using it in addr calc.               */ \
    "   LA      %[fa],1(%[nby],%[fa])    \n\t" /* adjust from addr      */ \
    "   LA      %[ta],1(%[nby],%[ta])    \n\t" /* adjust to addr        */ \
    \
    "3: SRAG    %[nby],%[len],8          \n\t" /* # cache lines         */ \
    "   LARL    %[mta],1f                \n\t" /* MVC table end         */ \
    "   BRC     8,1f                     \n\t" /* nothing to copy       */ \
    \
    /* Insert ASSERT code here if required. */ \
    \
    \
    "   LNGFR   %[nby],%[nby]            \n\t" /* negative offset into     */ \
    "   SLLG    %[nby],%[nby],4          \n\t" /* MVC table 16-byte blocks */ \
    "   BC      15,0(%[nby],%[mta])      \n\t" /* branch to block #ncl     */ \
    \
    "2: MVC     0(1,%[ta]),0(%[fa])      \n\t" /* MVC template     */ \
    \
    "4: MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 4096 == l        */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "5: MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 3840 <= l < 4096 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 3584 <= l < 3840 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 3328 <= l < 3584 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 3072 <= l < 3328 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 2816 <= l < 3072 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 2560 <= l < 2816 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 2304 <= l < 2560 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 2048 <= l < 2304 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 1792 <= l < 2048 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 1536 <= l < 1792 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 1280 <= l < 1536 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /* 1024 <= l < 1280 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /*  768 <= l < 1024 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /*  512 <= l <  768 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "   MVC     0(256,%[ta]),0(%[fa])    \n\t" /*  256 <= l <  512 */ \
    "   LAY     %[ta],256(0,%[ta])       \n\t" \
    "   LA      %[fa],256(0,%[fa])       \n\t" \
    "1: BCR     0,0                      \n\t" /* nop as branch target */ \
    : [to]   "+Q" (_to)        /* outputs   */ \
    , [from] "+Q" (_from) \
    , [ta]   "=a" (toaddr) \
    , [fa]   "=a" (fromaddr) \
    , [mta]  "=a" (movetable) \
    , [nby]  "+a" (_n_bytes) \
    , [len]  "=a" (len) \
    : \
    : "cc"                     /* clobbered */ \
  ); \
}

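// Descriptive note on the MVC_MULTI dispatch: with n = len>>8 full 256-byte
// blocks still to copy (1 <= n <= 16), %[nby] is turned into the negative
// byte offset -n*16 (LNGFR/SLLG), and the BC branches to "label 1 - n*16".
// Each table entry (MVC 6 bytes + LAY 6 bytes + LA 4 bytes) is exactly
// 16 bytes, so execution enters the table n entries before its end and
// falls through to the nop at label 1 after copying n*256 bytes.
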
#define MVCLE_MEMCOPY(_to,_from,_len) \
  asm( \
    "   LG      0,%[to]     \n\t" /* address of to area   */ \
    "   LG      2,%[from]   \n\t" /* address of from area */ \
    "   LGR     1,%[len]    \n\t" /* len of to area       */ \
    "   LGR     3,%[len]    \n\t" /* len of from area     */ \
    "1: MVCLE   0,2,176     \n\t" /* copy storage, bypass cache (0xb0) */ \
    "   BRC     1,1b        \n\t" /* retry if interrupted */ \
    : [to]   "+Q" (_to)            /* outputs   */ \
    , [from] "+Q" (_from)          /* outputs   */ \
    : [len]  "r"  (_len)           /* inputs    */ \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */ \
  );

#define MVCLE_MEMINIT(_to,_val,_len) \
  asm( \
    "   LG      0,%[to]         \n\t" /* address of to area   */ \
    "   LGR     1,%[len]        \n\t" /* len of to area       */ \
    "   XGR     3,3             \n\t" /* from area len = 0    */ \
    "1: MVCLE   0,2,0(%[val])   \n\t" /* init storage         */ \
    "   BRC     1,1b            \n\t" /* retry if interrupted */ \
    : [to]  "+Q" (_to)                /* outputs   */ \
    : [len] "r"  (_len)               /* inputs    */ \
    , [val] "r"  (_val)               /* inputs    */ \
    : "cc", "r0", "r1", "r3"          /* clobbered */ \
  );
#define MVCLE_MEMZERO(_to,_len) \
  asm( \
    "   LG      0,%[to]     \n\t" /* address of to area   */ \
    "   LGR     1,%[len]    \n\t" /* len of to area       */ \
    "   XGR     3,3         \n\t" /* from area len = 0    */ \
    "1: MVCLE   0,2,0       \n\t" /* clear storage        */ \
    "   BRC     1,1b        \n\t" /* retry if interrupted */ \
    : [to]  "+Q" (_to)            /* outputs   */ \
    : [len] "r"  (_len)           /* inputs    */ \
    : "cc", "r0", "r1", "r3"      /* clobbered */ \
  );

// Clear a stretch of memory, 0 <= _len <= 256.
// There is no alignment prereq.
// There is no test for len out of range specified above.
#define XC_MEMZERO_256(_to,_len) \
{ unsigned long toaddr; unsigned long tolen; \
  unsigned long target; \
  asm("\t" \
    "   LTGR    %[tolen],%[len]              \n\t" /* decr for MVC        */ \
    "   BRC     8,2f                         \n\t" /* do nothing for l=0  */ \
    "   AGHI    %[tolen],-1                  \n\t" /* adjust for EX XC    */ \
    "   LARL    %[target],1f                 \n\t" /* addr of XC instr    */ \
    "   LG      %[toaddr],%[to]              \n\t" /* addr of data area   */ \
    "   EX      %[tolen],0(%[target])        \n\t" /* execute XC instr    */ \
    "   BRC     15,2f                        \n\t" /* skip template       */ \
    "1: XC      0(1,%[toaddr]),0(%[toaddr])  \n\t" \
    "2: BCR     0,0                          \n\t" /* nop a branch target */ \
    : [to]     "+Q" (_to)      /* outputs   */ \
    , [tolen]  "=a" (tolen) \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)     /* inputs    */ \
    : "cc"                     /* clobbered */ \
  ); \
}

// Clear a stretch of memory, 256 < _len.
// XC_MEMZERO_256 may be used to clear shorter areas.
//
// The code
// - first zeroes a few bytes to align on a HeapWord.
//   This step is currently inactive because all calls seem
//   to have their data aligned on HeapWord boundaries.
// - then zeroes a few HeapWords to align on a cache line.
// - then zeroes entire cache lines in a loop.
// - then zeroes the remaining (partial) cache line.
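//
// As an illustrative sketch (the names here are ad hoc for this comment; the
// real work is done by the inline asm below), the active variant splits the
// length as:
//   head  = (256 - ((uintptr_t)to & 255)) & 255; // bytes up to the next cache line, cleared by an EXecuted XC
//   lines = (len - head) >> 8;                   // full 256-byte cache lines, cleared in the XC loop
//   tail  = (len - head) & 255;                  // leftover bytes, cleared by a final EXecuted XC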
#if 1
#define XC_MEMZERO_ANY(_to,_len) \
{ unsigned long toaddr; unsigned long tolen; \
  unsigned long len8;   unsigned long len256; \
  unsigned long target; unsigned long lenx; \
  asm("\t" \
    "   LTGR    %[tolen],%[len]                   \n\t" /*                      */ \
    "   BRC     8,2f                              \n\t" /* do nothing for l=0   */ \
    "   LG      %[toaddr],%[to]                   \n\t" /* addr of data area    */ \
    "   LARL    %[target],1f                      \n\t" /* addr of XC instr     */ \
    " " \
    "   LCGR    %[len256],%[toaddr]               \n\t" /* cache line alignment */ \
    "   NILL    %[len256],0xff                    \n\t" \
    "   BRC     8,4f                              \n\t" /* already aligned      */ \
    "   NILH    %[len256],0x00                    \n\t" /* zero extend          */ \
    "   LLGFR   %[len256],%[len256]               \n\t" \
    "   LAY     %[lenx],-1(,%[len256])            \n\t" \
    "   EX      %[lenx],0(%[target])              \n\t" /* execute XC instr     */ \
    "   LA      %[toaddr],0(%[len256],%[toaddr])  \n\t" \
    "   SGR     %[tolen],%[len256]                \n\t" /* adjust len           */ \
    " " \
    "4: SRAG    %[lenx],%[tolen],8                \n\t" /* # cache lines        */ \
    "   BRC     8,6f                              \n\t" /* no full cache lines  */ \
    "5: XC      0(256,%[toaddr]),0(%[toaddr])     \n\t" \
    "   LA      %[toaddr],256(,%[toaddr])         \n\t" \
    "   BRCTG   %[lenx],5b                        \n\t" /* iterate              */ \
    " " \
    "6: NILL    %[tolen],0xff                     \n\t" /* leftover bytes       */ \
    "   BRC     8,2f                              \n\t" /* done if none         */ \
    "   LAY     %[lenx],-1(,%[tolen])             \n\t" \
    "   EX      %[lenx],0(%[target])              \n\t" /* execute XC instr     */ \
    "   BRC     15,2f                             \n\t" /* skip template        */ \
    " " \
    "1: XC      0(1,%[toaddr]),0(%[toaddr])       \n\t" \
    "2: BCR     0,0                               \n\t" /* nop a branch target  */ \
    : [to]     "+Q" (_to)      /* outputs   */ \
    , [lenx]   "=a" (lenx) \
    , [len256] "=a" (len256) \
    , [tolen]  "=a" (tolen) \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)     /* inputs    */ \
    : "cc"                     /* clobbered */ \
  ); \
}
#else
#define XC_MEMZERO_ANY(_to,_len) \
{ unsigned long toaddr; unsigned long tolen; \
  unsigned long len8;   unsigned long len256; \
  unsigned long target; unsigned long lenx; \
  asm("\t" \
    "   LTGR    %[tolen],%[len]                   \n\t" /*                      */ \
    "   BRC     8,2f                              \n\t" /* do nothing for l=0   */ \
    "   LG      %[toaddr],%[to]                   \n\t" /* addr of data area    */ \
    "   LARL    %[target],1f                      \n\t" /* addr of XC instr     */ \
    " " \
    "   LCGR    %[len8],%[toaddr]                 \n\t" /* HeapWord alignment   */ \
    "   NILL    %[len8],0x07                      \n\t" \
    "   BRC     8,3f                              \n\t" /* already aligned      */ \
    "   NILH    %[len8],0x00                      \n\t" /* zero extend          */ \
    "   LLGFR   %[len8],%[len8]                   \n\t" \
    "   LAY     %[lenx],-1(,%[len8])              \n\t" \
    "   EX      %[lenx],0(%[target])              \n\t" /* execute XC instr     */ \
    "   LA      %[toaddr],0(%[len8],%[toaddr])    \n\t" \
    "   SGR     %[tolen],%[len8]                  \n\t" /* adjust len           */ \
    " " \
    "3: LCGR    %[len256],%[toaddr]               \n\t" /* cache line alignment */ \
    "   NILL    %[len256],0xff                    \n\t" \
    "   BRC     8,4f                              \n\t" /* already aligned      */ \
    "   NILH    %[len256],0x00                    \n\t" /* zero extend          */ \
    "   LLGFR   %[len256],%[len256]               \n\t" \
    "   LAY     %[lenx],-1(,%[len256])            \n\t" \
    "   EX      %[lenx],0(%[target])              \n\t" /* execute XC instr     */ \
    "   LA      %[toaddr],0(%[len256],%[toaddr])  \n\t" \
    "   SGR     %[tolen],%[len256]                \n\t" /* adjust len           */ \
    " " \
    "4: SRAG    %[lenx],%[tolen],8                \n\t" /* # cache lines        */ \
    "   BRC     8,6f                              \n\t" /* no full cache lines  */ \
    "5: XC      0(256,%[toaddr]),0(%[toaddr])     \n\t" \
    "   LA      %[toaddr],256(,%[toaddr])         \n\t" \
    "   BRCTG   %[lenx],5b                        \n\t" /* iterate              */ \
    " " \
    "6: NILL    %[tolen],0xff                     \n\t" /* leftover bytes       */ \
    "   BRC     8,2f                              \n\t" /* done if none         */ \
    "   LAY     %[lenx],-1(,%[tolen])             \n\t" \
    "   EX      %[lenx],0(%[target])              \n\t" /* execute XC instr     */ \
    "   BRC     15,2f                             \n\t" /* skip template        */ \
    " " \
    "1: XC      0(1,%[toaddr]),0(%[toaddr])       \n\t" \
    "2: BCR     0,0                               \n\t" /* nop a branch target  */ \
    : [to]     "+Q" (_to)      /* outputs   */ \
    , [lenx]   "=a" (lenx) \
    , [len8]   "=a" (len8) \
    , [len256] "=a" (len256) \
    , [tolen]  "=a" (tolen) \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)     /* inputs    */ \
    : "cc"                     /* clobbered */ \
  ); \
}
#endif
#endif // USE_INLINE_ASM

//*************************************//
//   D I S J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very frequent, some tests frequent.

  // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
  // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
  // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
  //
  // No special exploit needed. H/W discovers suitable situations itself.
  //
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) copy short piece of memory to page-align address(es)
  // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) copy the remaining piece of memory.
  //
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  // Optimized copying for data less than 4k
  switch (count) {
    case 0: return;
    case 1: MOVE8_ATOMIC_1(to,from)
            return;
    case 2: MOVE8_ATOMIC_2(to,from)
            return;
    // case 3: MOVE8_ATOMIC_3(to,from)
    //         return;
    // case 4: MOVE8_ATOMIC_4(to,from)
    //         return;
    default:
      if (len_bytes <= 4096) {
        MVC_MULTI(to,from,len_bytes)
        return;
      }
      // else
      MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
      return;
  }
#else
  // Fallback code.
  switch (count) {
    case 0:
      return;

    case 1:
      *to = *from;
      return;

    case 2:
      *to++ = *from++;
      *to = *from;
      return;

    case 3:
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    case 4:
      *to++ = *from++;
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    default:
      while (count-- > 0)
        *(to++) = *(from++);
      return;
  }
#endif
}

static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: < 4k calls.
  assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}

static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very rare.
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}


//*************************************//
//   C O N J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: between some and lower end of frequent.
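  // Overlap handling: destructively overlapping areas are copied backwards (or,
  // for up to 4 HeapWords, with the load-then-store COPY8_ATOMIC_n macros);
  // everything else is treated like the disjoint case (MVCLE in the inline-asm
  // path, delegation to pd_aligned_disjoint_words() in the fallback).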

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  return;
#else
  // Fallback code.
  if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
    HeapWord t1, t2, t3;
    switch (count) {
      case 0:
        return;

      case 1:
        *to = *from;
        return;

      case 2:
        t1 = *(from+1);
        *to = *from;
        *(to+1) = t1;
        return;

      case 3:
        t1 = *(from+1);
        t2 = *(from+2);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        return;

      case 4:
        t1 = *(from+1);
        t2 = *(from+2);
        t3 = *(from+3);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        *(to+3) = t3;
        return;

      default:
        from += count;
        to   += count;
        while (count-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_disjoint_words(from, to, count);
#endif
}

static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {

  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_conjoint_words(from, to, count);
}

static void pd_conjoint_bytes(void* from, void* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in))
    (void)memmove(to, from, count_in);
  else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  if (has_destructive_overlap((char*)from, (char*)to, count))
    (void)memmove(to, from, count);
  else
    (void)memcpy(to, from, count);
#endif
}

//**************************************************//
//   C O N J O I N T   A T O M I C   C O P Y I N G  //
//**************************************************//

static void pd_conjoint_bytes_atomic(void* from, void* to, size_t count) {
  // Call arraycopy stubs to do the job.
  pd_conjoint_bytes(from, to, count); // bytes are always accessed atomically.
}

static void pd_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerShort)) {
    // Use optimizations from shared code where no z-specific optimization exists.
    copy_conjoint_jshorts_atomic(from, to, count);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
#endif
}

static void pd_conjoint_jints_atomic(jint* from, jint* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerInt)) {
    switch (count_in) {
      case 4: COPY4_ATOMIC_4(to,from)
              return;
      case 3: COPY4_ATOMIC_3(to,from)
              return;
      case 2: COPY4_ATOMIC_2(to,from)
              return;
      case 1: COPY4_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        // Use optimizations from shared code where no z-specific optimization exists.
        copy_conjoint_jints_atomic(from, to, count_in);
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerInt;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
#endif
}

static void pd_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else {
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    if (count_in < 8) {
      from += count_in;
      to   += count_in;
      while (count_in-- > 0)
        *(--to) = *(--from); // Copy backwards, areas overlap destructively.
      return;
    }
    // else {
    from += count_in-1;
    to   += count_in-1;
    if (count_in&0x01) {
      *(to--) = *(from--);
      count_in--;
    }
    for (; count_in>0; count_in-=2) {
      *to     = *from;
      *(to-1) = *(from-1);
      to   -= 2;
      from -= 2;
    }
  }
  else
    pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
#endif
}

static void pd_conjoint_oops_atomic(oop* from, oop* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerOop;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    from += count_in;
    to   += count_in;
    while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
    return;
  }
  // else
  pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
  return;
#endif
}

static void pd_arrayof_conjoint_bytes(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_bytes_atomic(from, to, count);
}

static void pd_arrayof_conjoint_jshorts(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jshorts_atomic((jshort*)from, (jshort*)to, count);
}

static void pd_arrayof_conjoint_jints(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
}

static void pd_arrayof_conjoint_jlongs(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);
}

static void pd_arrayof_conjoint_oops(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_oops_atomic((oop*)from, (oop*)to, count);
}

//**********************************************//
//   M E M O R Y   I N I T I A L I S A T I O N  //
//**********************************************//

static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
  // JVM2008: very rare, only in some tests.
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value. Use memset instead of copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) init short piece of memory to page-align address
  // 2) init largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.

  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  MVCLE_MEMINIT(to_bytes, value, len_bytes)

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}

static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
  // JVM2008: < 4k calls.
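  // The zero and all-ones patterns are special-cased below; any other value is
  // replicated into a 64-bit pattern and stored one HeapWord at a time, e.g.
  // (illustrative) value 0xDEADBEEF is written as 0xDEADBEEFDEADBEEF.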
  if (value == 0) {
    pd_zero_to_words(tohw, count);
    return;
  }
  if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
    return;
  }
  julong* to = (julong*) tohw;
  julong  v  = ((julong) value << 32) | value;
  while (count-- > 0) {
    *to++ = v;
  }
}

static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  // JVM2008: very frequent, but virtually all calls are with value == 0.
  pd_fill_to_words(tohw, count, value);
}

//**********************************//
//   M E M O R Y   C L E A R I N G  //
//**********************************//

// Delegate to pd_zero_to_bytes. It also works HeapWord-atomic.
// Distinguish between simple and large zero_to_words.
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

// Delegate to pd_zero_to_bytes. It also works HeapWord-atomic.
static void pd_zero_to_words_large(HeapWord* tohw, size_t count) {
  // JVM2008: generally frequent, some tests show very frequent calls.
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

static void pd_zero_to_bytes(void* to, size_t count) {
  // JVM2008: some calls (generally), some tests frequent
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or, at least, sequential
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, implementation was changed to use (multiple) XC instructions.

  const long line_size = 256;
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  if (len_bytes <= line_size) {
    XC_MEMZERO_256(to_bytes, len_bytes);
  } else {
    XC_MEMZERO_ANY(to_bytes, len_bytes);
  }

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}

#endif // CPU_S390_VM_COPY_S390_HPP