/*
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// Major contributions by LS

#ifndef CPU_S390_VM_COPY_S390_HPP
#define CPU_S390_VM_COPY_S390_HPP

// Inline functions for memory copy and fill.

// HeapWordSize (the size of class HeapWord) is 8 Bytes (the size of a
// pointer variable), since we always run the _LP64 model. As a consequence,
// HeapWord* memory ranges are always assumed to be doubleword-aligned,
// having a size which is an integer multiple of HeapWordSize.
//
// Dealing only with doubleword-aligned doubleword units has important
// positive performance and data access consequences. Many of the move
// instructions perform particularly well under these circumstances.
// Data access is "doubleword-concurrent", except for MVC and XC.
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
// by use of the special padding byte 0xb1, where required. For copying,
// we use padding byte 0xb0 to prevent the D-cache from being polluted.
//
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
// This is optimal, even if just one HeapWord is copied. However, MVC
// copying is not atomic, i.e. not "doubleword concurrent" by definition.
//
// If the -mmvcle compiler option is specified, memcpy translates into
// code such that the entire memory range is copied or preset with just
// one MVCLE instruction.
// *to = *from is transformed into an MVC instruction already at -O1.
// Thus, for atomic copy operations, (inline) assembler code is required
// to guarantee atomic data accesses.
//
// For large (len >= MVCLEThreshold) chunks of memory, we exploit
// special H/W support of z/Architecture:
// 1) copy short piece of memory to page-align address(es)
// 2) copy largest part (all contained full pages) of memory using mvcle instruction.
//    z/Architecture processors have special H/W support for page-aligned storage
//    where len is an int multiple of page size. In that case, up to 4 cache lines are
//    processed in parallel and L1 cache is not polluted.
// 3) copy the remaining piece of memory.
//
//  Measurement classifications:
//  very rare - <=     10.000 calls AND <=     1.000 usec elapsed
//       rare - <=    100.000 calls AND <=    10.000 usec elapsed
//       some - <=  1.000.000 calls AND <=   100.000 usec elapsed
//       freq - <= 10.000.000 calls AND <= 1.000.000 usec elapsed
//  very freq - >  10.000.000 calls OR  >  1.000.000 usec elapsed

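// The three-step strategy above, sketched in plain C for illustration only.
// This is a simplified sketch, not the shipped implementation (the function
// name and the fixed 4 KB page size are assumptions; memcpy and uintptr_t
// come from <string.h> and <stdint.h>). The real routines below use inline
// assembly where USE_INLINE_ASM is defined.
#if 0
static void large_copy_sketch(char* to, const char* from, size_t len) {
  const size_t page = 4096;
  // 1) Copy a short head so that the target address is page-aligned.
  size_t head = (page - ((uintptr_t)to & (page - 1))) & (page - 1);
  if (head > len) { head = len; }
  memcpy(to, from, head);
  to += head; from += head; len -= head;
  // 2) Bulk-copy all contained full pages (MVCLE in the real code).
  size_t bulk = len & ~(page - 1);
  memcpy(to, from, bulk);
  to += bulk; from += bulk; len -= bulk;
  // 3) Copy the remaining tail.
  memcpy(to, from, len);
}
#endif
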
#undef USE_INLINE_ASM

static void copy_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static void copy_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static bool has_destructive_overlap(char* from, char* to, size_t byte_count) {
  return (from < to) && ((to-from) < (ptrdiff_t)byte_count);
}
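// Example: with from == 0x1000, to == 0x1008, and byte_count == 16, the
// ranges overlap such that a forward byte-wise copy would overwrite source
// bytes before they have been read, so has_destructive_overlap() returns
// true. A target below the source (to < from) is never destructive for
// forward copying, hence the (from < to) test.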

#ifdef USE_INLINE_ASM

  //--------------------------------------------------------------
  // Atomic copying. Atomicity is given by the minimum of source
  // and target alignment. Refer to mail comm with Tim Slegel/IBM.
  // Only usable for disjoint source and target.
  //--------------------------------------------------------------
  #define MOVE8_ATOMIC_4(_to,_from) {                            \
    unsigned long toaddr;                                        \
    unsigned long fromaddr;                                      \
    asm(                                                         \
      "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
      "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
      "MVC     0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [toaddr]   "=a"  (toaddr)                                \
      , [fromaddr] "=a"  (fromaddr)                              \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }
  #define MOVE8_ATOMIC_3(_to,_from) {                            \
    unsigned long toaddr;                                        \
    unsigned long fromaddr;                                      \
    asm(                                                         \
      "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
      "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
      "MVC     0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [toaddr]   "=a"  (toaddr)                                \
      , [fromaddr] "=a"  (fromaddr)                              \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }
  #define MOVE8_ATOMIC_2(_to,_from) {                            \
    unsigned long toaddr;                                        \
    unsigned long fromaddr;                                      \
    asm(                                                         \
      "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
      "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
      "MVC     0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [toaddr]   "=a"  (toaddr)                                \
      , [fromaddr] "=a"  (fromaddr)                              \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }
  #define MOVE8_ATOMIC_1(_to,_from) {                            \
    unsigned long toaddr;                                        \
    unsigned long fromaddr;                                      \
    asm(                                                         \
      "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
      "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
      "MVC     0(8,%[toaddr]),0(%[fromaddr]) \n\t"  /* move data */ \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [toaddr]   "=a"  (toaddr)                                \
      , [fromaddr] "=a"  (fromaddr)                              \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }

  //--------------------------------------------------------------
  // Atomic copying of 8-byte entities.
  // Conjoint/disjoint property does not matter. Entities are first
  // loaded and then stored.
  // _to and _from must be 8-byte aligned.
  //--------------------------------------------------------------
  #define COPY8_ATOMIC_4(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      3,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LMG     0,3,0(3)         \n\t" /* load data            */ \
      "STMG    0,3,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* outputs   */            \
      :                                                          \
      : "cc",  "r0", "r1", "r2", "r3" /* clobbered */            \
    );                                                           \
  }
  #define COPY8_ATOMIC_3(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      2,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LMG     0,2,0(2)         \n\t" /* load data            */ \
      "STMG    0,2,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* outputs   */            \
      :                                                          \
      : "cc",  "r0", "r1", "r2"       /* clobbered */            \
    );                                                           \
  }
  #define COPY8_ATOMIC_2(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      1,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LMG     0,1,0(1)         \n\t" /* load data            */ \
      "STMG    0,1,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* outputs   */            \
      :                                                          \
      : "cc",  "r0", "r1"             /* clobbered */            \
    );                                                           \
  }
  #define COPY8_ATOMIC_1(_to,_from) {                            \
    unsigned long addr;                                          \
    asm(                                                         \
      "LG      %[addr],%[from]  \n\t" /* address of from area */ \
      "LG      0,0(0,%[addr])   \n\t" /* load data            */ \
      "LG      %[addr],%[to]    \n\t" /* address of to area   */ \
      "STG     0,0(0,%[addr])   \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [addr]   "=a"  (addr)         /* outputs   */            \
      :                                                          \
      : "cc",  "r0"                   /* clobbered */            \
    );                                                           \
  }
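
  // Note on usage: the macros take the pointer variables themselves as
  // memory operands ("Q" constraint); the contained LG then loads the
  // pointer value from that variable. E.g., COPY8_ATOMIC_2(to,from) with
  // HeapWord* to, from (both 8-byte aligned) copies 16 bytes via a single
  // LMG/STMG pair, clobbering r0 and r1.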

  //--------------------------------------------------------------
  // Atomic copying of 4-byte entities.
  // The macro suffix gives the number of entities copied (1 to 4).
  // Conjoint/disjoint property does not matter. Entities are first
  // loaded and then stored.
  // _to and _from must be 4-byte aligned.
  //--------------------------------------------------------------
  #define COPY4_ATOMIC_4(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      3,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LM      0,3,0(3)         \n\t" /* load data            */ \
      "STM     0,3,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* outputs   */            \
      :                                                          \
      : "cc",  "r0", "r1", "r2", "r3" /* clobbered */            \
    );                                                           \
  }
  #define COPY4_ATOMIC_3(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      2,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LM      0,2,0(2)         \n\t" /* load data            */ \
      "STM     0,2,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* outputs   */            \
      :                                                          \
      : "cc",  "r0", "r1", "r2"       /* clobbered */            \
    );                                                           \
  }
  #define COPY4_ATOMIC_2(_to,_from) {                            \
    unsigned long toaddr;                                        \
    asm(                                                         \
      "LG      1,%[from]        \n\t" /* address of from area */ \
      "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
      "LM      0,1,0(1)         \n\t" /* load data            */ \
      "STM     0,1,0(%[toaddr]) \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [toaddr] "=a"  (toaddr)       /* outputs   */            \
      :                                                          \
      : "cc",  "r0", "r1"             /* clobbered */            \
    );                                                           \
  }
  #define COPY4_ATOMIC_1(_to,_from) {                            \
    unsigned long addr;                                          \
    asm(                                                         \
      "LG      %[addr],%[from]  \n\t" /* address of from area */ \
      "L       0,0(0,%[addr])   \n\t" /* load data            */ \
      "LG      %[addr],%[to]    \n\t" /* address of to area   */ \
      "ST      0,0(0,%[addr])   \n\t" /* store data           */ \
      : [to]     "+Q"  (_to)          /* outputs   */            \
      , [from]   "+Q"  (_from)        /* outputs   */            \
      , [addr]   "=a"  (addr)         /* outputs   */            \
      :                                                          \
      : "cc",  "r0"                   /* clobbered */            \
    );                                                           \
  }

#if 0  // Waiting for gcc to support EXRL.
  #define MVC_MEMCOPY(_to,_from,_len)                                \
    if (VM_Version::has_ExecuteExtensions()) {                       \
      asm("\t"                                                       \
      "    LAY     1,-1(0,%[len])      \n\t" /* decr for MVC  */     \
      "    EXRL    1,1f                \n\t" /* execute MVC instr */ \
      "    BRC     15,2f               \n\t" /* skip template */     \
      "1:  MVC     0(%[len],%[to]),0(%[from]) \n\t"                  \
      "2:  BCR     0,0                 \n\t"                         \
      : [to]   "+Q"  (_to)             /* outputs   */               \
      , [from] "+Q"  (_from)           /* outputs   */               \
      : [len]  "r"   (_len)            /* inputs    */               \
      : "cc",  "r1"                    /* clobbered */               \
      );                                                             \
    } else {                                                         \
      asm("\t"                                                       \
      "    LARL    2,3f                \n\t"                         \
      "    LAY     1,-1(0,%[len])      \n\t" /* decr for MVC  */     \
      "    EX      1,0(2)              \n\t" /* execute MVC instr */ \
      "    BRC     15,4f               \n\t" /* skip template */     \
      "3:  MVC     0(%[len],%[to]),0(%[from])  \n\t"                 \
      "4:  BCR     0,0                 \n\t"                         \
      : [to]   "+Q"  (_to)             /* outputs   */               \
      , [from] "+Q"  (_from)           /* outputs   */               \
      : [len]  "r"   (_len)            /* inputs    */               \
      : "cc",  "r1", "r2"              /* clobbered */               \
      );                                                             \
    }
#else
  #define MVC_MEMCOPY(_to,_from,_len)                                \
  { unsigned long toaddr;   unsigned long tolen;                     \
    unsigned long fromaddr; unsigned long target;                    \
      asm("\t"                                                       \
      "    LTGR    %[tolen],%[len]     \n\t" /* copy len, test for 0 */ \
      "    BRC     8,2f                \n\t" /* do nothing for l=0 */\
      "    AGHI    %[tolen],-1         \n\t" /* decr for EX MVC   */ \
      "    LG      %[toaddr],%[to]     \n\t"                         \
      "    LG      %[fromaddr],%[from] \n\t"                         \
      "    LARL    %[target],1f        \n\t" /* addr of MVC instr */ \
      "    EX      %[tolen],0(%[target])         \n\t" /* execute MVC instr */ \
      "    BRC     15,2f                         \n\t" /* skip template */     \
      "1:  MVC     0(1,%[toaddr]),0(%[fromaddr]) \n\t"                         \
      "2:  BCR     0,0                 \n\t" /* nop as branch target */\
      : [to]       "+Q"  (_to)         /* outputs   */               \
      , [from]     "+Q"  (_from)                                     \
      , [tolen]    "=a"  (tolen)                                     \
      , [toaddr]   "=a"  (toaddr)                                    \
      , [fromaddr] "=a"  (fromaddr)                                  \
      , [target]   "=a"  (target)                                    \
      : [len]       "r"  (_len)        /* inputs    */               \
      : "cc"                           /* clobbered */               \
      );                                                             \
  }
#endif
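
  // How the EX trick above works: EX ORs the low-order byte of the length
  // register into the second instruction byte of the MVC template at label 1.
  // The template is coded with length 1, which assembles to a length code of
  // 0, so the executed MVC moves (tolen + 1) bytes. Example: _len == 16
  // yields tolen == 15 and an executed "MVC 0(16,toaddr),0(fromaddr)".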

  #if 0  // code snippet to be used for debugging
      /* ASSERT code BEGIN */                                                \
      "    LARL    %[len],5f       \n\t"                                     \
      "    LARL    %[mta],4f       \n\t"                                     \
      "    SLGR    %[len],%[mta]   \n\t"                                     \
      "    CGHI    %[len],16       \n\t"                                     \
      "    BRC     7,9f            \n\t"      /* block size !=  16 */        \
                                                                             \
      "    LARL    %[len],1f       \n\t"                                     \
      "    SLGR    %[len],%[mta]   \n\t"                                     \
      "    CGHI    %[len],256      \n\t"                                     \
      "    BRC     7,9f            \n\t"      /* list len   != 256 */        \
                                                                             \
      "    LGR     0,0             \n\t"      /* artificial SIGILL */        \
      "9:  BRC     7,-2            \n\t"                                     \
      "    LARL    %[mta],1f       \n\t"      /* restore MVC table begin */  \
      /* ASSERT code END   */
  #endif

  // Optimized copying for data of at most 4 KB
  // - no destructive overlap
  // - 0 <= _n_bytes <= 4096
  // This macro needs to be gcc-compiled with -march=z990. Otherwise, the
  // LAY instruction is not available.
  #define MVC_MULTI(_to,_from,_n_bytes)                                      \
  { unsigned long toaddr;                                                    \
    unsigned long fromaddr;                                                  \
    unsigned long movetable;                                                 \
    unsigned long len;                                                       \
      asm("\t"                                                               \
      "    LTGFR   %[len],%[nby]   \n\t"                                     \
      "    LG      %[ta],%[to]     \n\t"      /* address of to area   */     \
      "    BRC     8,1f            \n\t"      /* nothing to copy   */        \
                                                                             \
      "    NILL    %[nby],255      \n\t"      /* # bytes mod 256      */     \
      "    LG      %[fa],%[from]   \n\t"      /* address of from area */     \
      "    BRC     8,3f            \n\t"      /* no rest, skip copying */    \
                                                                             \
      "    LARL    %[mta],2f       \n\t"      /* MVC template addr */        \
      "    AHI     %[nby],-1       \n\t"      /* adjust for EX MVC  */       \
                                                                             \
      "    EX      %[nby],0(%[mta]) \n\t"     /* only rightmost */           \
                                              /* 8 bits of nby used */       \
      /* Since nby is <= 4096 on entry to this code, we need no */           \
      /* zero extension before using it in addr calc.           */           \
      "    LA      %[fa],1(%[nby],%[fa]) \n\t"/* adjust from addr */         \
      "    LA      %[ta],1(%[nby],%[ta]) \n\t"/* adjust to   addr */         \
                                                                             \
      "3:  SRAG    %[nby],%[len],8 \n\t"      /* # cache lines     */        \
      "    LARL    %[mta],1f       \n\t"      /* MVC table begin   */        \
      "    BRC     8,1f            \n\t"      /* nothing to copy   */        \
                                                                             \
      /* Insert ASSERT code here if required. */                             \
                                                                             \
                                                                             \
      "    LNGFR   %[nby],%[nby]   \n\t"      /* negative offset into     */ \
      "    SLLG    %[nby],%[nby],4 \n\t"      /* MVC table 16-byte blocks */ \
      "    BC      15,0(%[nby],%[mta]) \n\t"  /* branch to block #ncl  */    \
                                                                             \
      "2:  MVC     0(1,%[ta]),0(%[fa]) \n\t"  /* MVC template */             \
                                                                             \
      "4:  MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 4096 == l        */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "5:  MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3840 <= l < 4096 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3584 <= l < 3840 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3328 <= l < 3584 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3072 <= l < 3328 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2816 <= l < 3072 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2560 <= l < 2816 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2304 <= l < 2560 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2048 <= l < 2304 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1792 <= l < 2048 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1536 <= l < 1792 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1280 <= l < 1536 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1024 <= l < 1280 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  768 <= l < 1024 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  512 <= l <  768 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  256 <= l <  512 */      \
      "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
      "    LA      %[fa],256(0,%[fa])      \n\t"                             \
      "1:  BCR     0,0                     \n\t" /* nop as branch target */  \
      : [to]       "+Q"  (_to)          /* outputs   */          \
      , [from]     "+Q"  (_from)                                 \
      , [ta]       "=a"  (toaddr)                                \
      , [fa]       "=a"  (fromaddr)                              \
      , [mta]      "=a"  (movetable)                             \
      , [nby]      "+a"  (_n_bytes)                              \
      , [len]      "=a"  (len)                                   \
      :                                                          \
      : "cc"                            /* clobbered */          \
    );                                                           \
  }
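
  // Dispatch into the MVC block table above: SRAG puts the number of full
  // 256-byte chunks into nby, LNGFR/SLLG turn that into a negative byte
  // offset (each table block is 16 bytes: MVC + LAY + LA), and the BC
  // branches that many blocks back from label 1. Example: _n_bytes == 1000
  // first copies the 232-byte remainder via the EX'd MVC template, then
  // enters the table three blocks before its end to move 3 * 256 bytes.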

  #define MVCLE_MEMCOPY(_to,_from,_len)                           \
    asm(                                                          \
      "    LG      0,%[to]     \n\t"   /* address of to area   */ \
      "    LG      2,%[from]   \n\t"   /* address of from area */ \
      "    LGR     1,%[len]    \n\t"   /* len of to area       */ \
      "    LGR     3,%[len]    \n\t"   /* len of from area     */ \
      "1:  MVCLE   0,2,176     \n\t"   /* copy storage, bypass cache (0xb0) */ \
      "    BRC     1,1b        \n\t"   /* retry if interrupted */ \
      : [to]   "+Q"  (_to)             /* outputs   */            \
      , [from] "+Q"  (_from)           /* outputs   */            \
      : [len]  "r"   (_len)            /* inputs    */            \
      : "cc",  "r0", "r1", "r2", "r3"  /* clobbered */            \
    );

  #define MVCLE_MEMINIT(_to,_val,_len)                            \
    asm(                                                          \
      "    LG      0,%[to]       \n\t" /* address of to area   */ \
      "    LGR     1,%[len]      \n\t" /* len of to area       */ \
      "    XGR     3,3           \n\t" /* from area len = 0    */ \
      "1:  MVCLE   0,2,0(%[val]) \n\t" /* init storage         */ \
      "    BRC     1,1b          \n\t" /* retry if interrupted */ \
      : [to]   "+Q"  (_to)             /* outputs   */            \
      : [len]  "r"   (_len)            /* inputs    */            \
      , [val]  "r"   (_val)            /* inputs    */            \
      : "cc",  "r0", "r1", "r3"        /* clobbered */            \
    );
  #define MVCLE_MEMZERO(_to,_len)                                 \
    asm(                                                          \
      "    LG      0,%[to]       \n\t" /* address of to area   */ \
      "    LGR     1,%[len]      \n\t" /* len of to area       */ \
      "    XGR     3,3           \n\t" /* from area len = 0    */ \
      "1:  MVCLE   0,2,0         \n\t" /* clear storage        */ \
      "    BRC     1,1b          \n\t" /* retry if interrupted */ \
      : [to]   "+Q"  (_to)             /* outputs   */            \
      : [len]  "r"   (_len)            /* inputs    */            \
      : "cc",  "r0", "r1", "r3"        /* clobbered */            \
    );
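
  // MVCLE_MEMINIT and MVCLE_MEMZERO rely on a documented MVCLE property:
  // once the source operand (here of length zero, see XGR 3,3) is
  // exhausted, the rest of the destination is filled with the padding
  // byte given in the third operand. MEMZERO is MEMINIT with pad byte 0.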

  // Clear a stretch of memory, 0 <= _len <= 256.
  // There is no alignment prereq.
  // The _len range stated above is not checked at runtime.
  #define XC_MEMZERO_256(_to,_len)                                 \
{ unsigned long toaddr;   unsigned long tolen;                     \
  unsigned long target;                                            \
    asm("\t"                                                       \
    "    LTGR    %[tolen],%[len]     \n\t" /* copy len, test for 0 */ \
    "    BRC     8,2f                \n\t" /* do nothing for l=0 */ \
    "    AGHI    %[tolen],-1         \n\t" /* adjust for EX XC  */ \
    "    LARL    %[target],1f        \n\t" /* addr of XC instr  */ \
    "    LG      %[toaddr],%[to]     \n\t" /* addr of data area */ \
    "    EX      %[tolen],0(%[target])       \n\t" /* execute XC instr  */ \
    "    BRC     15,2f                       \n\t" /* skip template */     \
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                         \
    "2:  BCR     0,0                 \n\t" /* nop as branch target */\
    : [to]       "+Q"  (_to)         /* outputs   */               \
    , [tolen]    "=a"  (tolen)                                     \
    , [toaddr]   "=a"  (toaddr)                                    \
    , [target]   "=a"  (target)                                    \
    : [len]       "r"  (_len)        /* inputs    */               \
    : "cc"                           /* clobbered */               \
    );                                                             \
}

  // Clear a stretch of memory, 256 < _len.
  // XC_MEMZERO_256 may be used to clear shorter areas.
  //
  // The code
  // - first zeroes a few bytes to align on a HeapWord.
  //   This step is currently inactive because all calls seem
  //   to have their data aligned on HeapWord boundaries.
  // - then zeroes a few HeapWords to align on a cache line.
  // - then zeroes entire cache lines in a loop.
  // - then zeroes the remaining (partial) cache line.
#if 1
  #define XC_MEMZERO_ANY(_to,_len)                                    \
{ unsigned long toaddr;   unsigned long tolen;                        \
  unsigned long len256;                                               \
  unsigned long target;   unsigned long lenx;                         \
    asm("\t"                                                          \
    "    LTGR    %[tolen],%[len]      \n\t" /* copy len, test for 0 */\
    "    BRC     8,2f                 \n\t" /* do nothing for l=0*/   \
    "    LG      %[toaddr],%[to]      \n\t" /* addr of data area */   \
    "    LARL    %[target],1f         \n\t" /* addr of XC instr  */   \
    " "                                                               \
    "    LCGR    %[len256],%[toaddr]  \n\t" /* cache line alignment */\
    "    NILL    %[len256],0xff       \n\t"                           \
    "    BRC     8,4f                 \n\t" /* already aligned     */ \
    "    NILH    %[len256],0x00       \n\t" /* zero extend         */ \
    "    LLGFR   %[len256],%[len256]  \n\t"                           \
    "    LAY     %[lenx],-1(,%[len256]) \n\t"                         \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
    "    LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"               \
    "    SGR     %[tolen],%[len256]   \n\t" /* adjust len          */ \
    " "                                                               \
    "4:  SRAG    %[lenx],%[tolen],8   \n\t" /* # cache lines       */ \
    "    BRC     8,6f                 \n\t" /* no full cache lines */ \
    "5:  XC      0(256,%[toaddr]),0(%[toaddr]) \n\t"                  \
    "    LA      %[toaddr],256(,%[toaddr]) \n\t"                      \
    "    BRCTG   %[lenx],5b           \n\t" /* iterate             */ \
    " "                                                               \
    "6:  NILL    %[tolen],0xff        \n\t" /* leftover bytes      */ \
    "    BRC     8,2f                 \n\t" /* done if none        */ \
    "    LAY     %[lenx],-1(,%[tolen]) \n\t"                          \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
    "    BRC     15,2f                \n\t" /* skip template       */ \
    " "                                                               \
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                    \
    "2:  BCR     0,0                  \n\t" /* nop as branch target */\
    : [to]       "+Q"  (_to)         /* outputs   */               \
    , [lenx]     "=a"  (lenx)                                      \
    , [len256]   "=a"  (len256)                                    \
    , [tolen]    "=a"  (tolen)                                     \
    , [toaddr]   "=a"  (toaddr)                                    \
    , [target]   "=a"  (target)                                    \
    : [len]       "r"  (_len)        /* inputs    */               \
    : "cc"                           /* clobbered */               \
    );                                                             \
}
#else
  #define XC_MEMZERO_ANY(_to,_len)                                    \
{ unsigned long toaddr;   unsigned long tolen;                        \
  unsigned long len8;     unsigned long len256;                       \
  unsigned long target;   unsigned long lenx;                         \
    asm("\t"                                                          \
    "    LTGR    %[tolen],%[len]      \n\t" /* copy len, test for 0 */\
    "    BRC     8,2f                 \n\t" /* do nothing for l=0*/   \
    "    LG      %[toaddr],%[to]      \n\t" /* addr of data area */   \
    "    LARL    %[target],1f         \n\t" /* addr of XC instr  */   \
    " "                                                               \
    "    LCGR    %[len8],%[toaddr]    \n\t" /* HeapWord alignment  */ \
    "    NILL    %[len8],0x07         \n\t"                           \
    "    BRC     8,3f                 \n\t" /* already aligned     */ \
    "    NILH    %[len8],0x00         \n\t" /* zero extend         */ \
    "    LLGFR   %[len8],%[len8]      \n\t"                           \
    "    LAY     %[lenx],-1(,%[len8]) \n\t"                           \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr */    \
    "    LA      %[toaddr],0(%[len8],%[toaddr]) \n\t"                 \
    "    SGR     %[tolen],%[len8]     \n\t" /* adjust len          */ \
    " "                                                               \
    "3:  LCGR    %[len256],%[toaddr]  \n\t" /* cache line alignment */\
    "    NILL    %[len256],0xff       \n\t"                           \
    "    BRC     8,4f                 \n\t" /* already aligned     */ \
    "    NILH    %[len256],0x00       \n\t" /* zero extend         */ \
    "    LLGFR   %[len256],%[len256]  \n\t"                           \
    "    LAY     %[lenx],-1(,%[len256]) \n\t"                         \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
    "    LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"               \
    "    SGR     %[tolen],%[len256]   \n\t" /* adjust len          */ \
    " "                                                               \
    "4:  SRAG    %[lenx],%[tolen],8   \n\t" /* # cache lines       */ \
    "    BRC     8,6f                 \n\t" /* no full cache lines */ \
    "5:  XC      0(256,%[toaddr]),0(%[toaddr]) \n\t"                  \
    "    LA      %[toaddr],256(,%[toaddr]) \n\t"                      \
    "    BRCTG   %[lenx],5b           \n\t" /* iterate             */ \
    " "                                                               \
    "6:  NILL    %[tolen],0xff        \n\t" /* leftover bytes      */ \
    "    BRC     8,2f                 \n\t" /* done if none        */ \
    "    LAY     %[lenx],-1(,%[tolen]) \n\t"                          \
    "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
    "    BRC     15,2f                \n\t" /* skip template       */ \
    " "                                                               \
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                    \
    "2:  BCR     0,0                  \n\t" /* nop as branch target */\
    : [to]       "+Q"  (_to)         /* outputs   */               \
    , [lenx]     "=a"  (lenx)                                      \
    , [len8]     "=a"  (len8)                                      \
    , [len256]   "=a"  (len256)                                    \
    , [tolen]    "=a"  (tolen)                                     \
    , [toaddr]   "=a"  (toaddr)                                    \
    , [target]   "=a"  (target)                                    \
    : [len]       "r"  (_len)        /* inputs    */               \
    : "cc"                           /* clobbered */               \
    );                                                             \
}
#endif
#endif // USE_INLINE_ASM

//*************************************//
//   D I S J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very frequent, some tests frequent.

  // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
  // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
  // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
  //
  // No special exploit needed. H/W discovers suitable situations itself.
  //
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) copy short piece of memory to page-align address(es)
  // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) copy the remaining piece of memory.
  //
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  // Optimized copying for data of at most 4 KB
  switch (count) {
    case 0: return;
    case 1: MOVE8_ATOMIC_1(to,from)
            return;
    case 2: MOVE8_ATOMIC_2(to,from)
            return;
//  case 3: MOVE8_ATOMIC_3(to,from)
//          return;
//  case 4: MOVE8_ATOMIC_4(to,from)
//          return;
    default:
      if (len_bytes <= 4096) {
        MVC_MULTI(to,from,len_bytes)
        return;
      }
      // else
      MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
      return;
  }
#else
  // Fallback code.
  switch (count) {
    case 0:
      return;

    case 1:
      *to = *from;
      return;

    case 2:
      *to++ = *from++;
      *to = *from;
      return;

    case 3:
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    case 4:
      *to++ = *from++;
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    default:
      while (count-- > 0)
        *(to++) = *(from++);
      return;
  }
#endif
}

static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: < 4k calls.
  assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}

static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very rare.
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}


//*************************************//
//   C O N J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: between some and lower end of frequent.

#ifdef USE_INLINE_ASM
  size_t  count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  return;
#else
  // Fallback code.
  if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
    HeapWord t1, t2, t3;
    switch (count) {
      case 0:
        return;

      case 1:
        *to = *from;
        return;

      case 2:
        t1 = *(from+1);
        *to = *from;
        *(to+1) = t1;
        return;

      case 3:
        t1 = *(from+1);
        t2 = *(from+2);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        return;

      case 4:
        t1 = *(from+1);
        t2 = *(from+2);
        t3 = *(from+3);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        *(to+3) = t3;
        return;

      default:
        from += count;
        to   += count;
        while (count-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_disjoint_words(from, to, count);
#endif
}

static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {

  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_conjoint_words(from, to, count);
}

static void pd_conjoint_bytes(void* from, void* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in))
    (void)memmove(to, from, count_in);
  else {
    jbyte*  to_bytes   = (jbyte*)to;
    jbyte*  from_bytes = (jbyte*)from;
    size_t  len_bytes  = count_in;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  if (has_destructive_overlap((char*)from, (char*)to, count))
    (void)memmove(to, from, count);
  else
    (void)memcpy(to, from, count);
#endif
}

//**************************************************//
//   C O N J O I N T  A T O M I C   C O P Y I N G   //
//**************************************************//

static void pd_conjoint_bytes_atomic(void* from, void* to, size_t count) {
  // Call arraycopy stubs to do the job.
  pd_conjoint_bytes(from, to, count); // bytes are always accessed atomically.
}

static void pd_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerShort)) {
    // Use optimizations from shared code where no z-specific optimization exists.
    copy_conjoint_jshorts_atomic(from, to, count_in);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
#endif
}

static void pd_conjoint_jints_atomic(jint* from, jint* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerInt)) {
    switch (count_in) {
      case 4: COPY4_ATOMIC_4(to,from)
              return;
      case 3: COPY4_ATOMIC_3(to,from)
              return;
      case 2: COPY4_ATOMIC_2(to,from)
              return;
      case 1: COPY4_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        // Use optimizations from shared code where no z-specific optimization exists.
        copy_conjoint_jints_atomic(from, to, count_in);
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerInt;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
#endif
}

static void pd_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    if (count_in < 8) {
      from += count_in;
      to   += count_in;
      while (count_in-- > 0)
         *(--to) = *(--from); // Copy backwards, areas overlap destructively.
      return;
    }
    // else
    from += count_in-1;
    to   += count_in-1;
    if (count_in&0x01) {
      *(to--) = *(from--);
      count_in--;
    }
    for (; count_in>0; count_in-=2) {
      *to     = *from;
      *(to-1) = *(from-1);
      to     -= 2;
      from   -= 2;
    }
  }
  else
    pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
#endif
}

static void pd_conjoint_oops_atomic(oop* from, oop* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerOop;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    from += count_in;
    to   += count_in;
    while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
    return;
  }
  // else
  pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
  return;
#endif
}

static void pd_arrayof_conjoint_bytes(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_bytes_atomic(from, to, count);
}

static void pd_arrayof_conjoint_jshorts(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jshorts_atomic((jshort*)from, (jshort*)to, count);
}

static void pd_arrayof_conjoint_jints(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
}

static void pd_arrayof_conjoint_jlongs(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);
}

static void pd_arrayof_conjoint_oops(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_oops_atomic((oop*)from, (oop*)to, count);
}

//**********************************************//
//  M E M O R Y   I N I T I A L I S A T I O N   //
//**********************************************//

static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
  // JVM2008: very rare, only in some tests.
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value. Use memset instead of copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) init short piece of memory to page-align address
  // 2) init largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.

  jbyte*  to_bytes  = (jbyte*)to;
  size_t  len_bytes = count;

  MVCLE_MEMINIT(to_bytes, value, len_bytes)

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}

static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
  // JVM2008: < 4k calls.
  if (value == 0) {
    pd_zero_to_words(tohw, count);
    return;
  }
  if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
    return;
  }
  julong* to = (julong*) tohw;
  julong  v  = ((julong) value << 32) | value;
  while (count-- > 0) {
    *to++ = v;
  }
}
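
// Worked example of the value replication above: value == 0xDEADBEEF gives
// v == 0xDEADBEEFDEADBEEF, so every 8-byte store fills two 4-byte slots.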

static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  // JVM2008: very frequent, but virtually all calls are with value == 0.
  pd_fill_to_words(tohw, count, value);
}

//**********************************//
//  M E M O R Y   C L E A R I N G   //
//**********************************//

// Delegate to pd_zero_to_bytes. It is also HeapWord-atomic.
// Distinguish between simple and large zero_to_words.
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

// Delegate to pd_zero_to_bytes. It is also HeapWord-atomic.
static void pd_zero_to_words_large(HeapWord* tohw, size_t count) {
  // JVM2008: generally frequent, some tests show very frequent calls.
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

static void pd_zero_to_bytes(void* to, size_t count) {
  // JVM2008: generally some calls; frequent in some tests.
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or at least sequential,
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, the implementation was changed to use (multiple) XC instructions.

  const long line_size = 256;
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  if (len_bytes <= line_size) {
    XC_MEMZERO_256(to_bytes, len_bytes);
  } else {
    XC_MEMZERO_ANY(to_bytes, len_bytes);
  }

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}

#endif // CPU_S390_VM_COPY_S390_HPP