1 /*
   2  * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "runtime/deoptimization.hpp"
  27 #include "runtime/frame.inline.hpp"
  28 #include "runtime/stubRoutines.hpp"
  29 #include "runtime/thread.inline.hpp"
  30 #include "crc32c.h"
  31 
// ALIGNED_(x): compiler-portable spelling of an alignment specifier.
// MSVC gets __declspec(align(x)); every other compiler is assumed to accept
// the GCC-style aligned attribute. In this file it is written as a prefix on
// a variable definition (see _k256).
#ifdef _MSC_VER
#define ALIGNED_(x) __declspec(align(x))
#else
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif
  37 
  38 // Implementation of the platform-specific part of StubRoutines - for
  39 // a description of how to extend it, see the stubRoutines.hpp file.
  40 
// Out-of-class definitions of the static `address` members of
// StubRoutines::x86. Every one starts out as NULL here.
// NOTE(review): presumably each is filled in later by the stub generator with
// the address of the matching generated stub or constant table - the code
// that assigns them is not in this part of the file; confirm there.
address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
address StubRoutines::x86::_upper_word_mask_addr = NULL;
address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
address StubRoutines::x86::_k256_adr = NULL;
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL;

// Tables common to sin and cos.
address StubRoutines::x86::_ONEHALF_adr = NULL;
address StubRoutines::x86::_P_2_adr = NULL;
address StubRoutines::x86::_SC_4_adr = NULL;
address StubRoutines::x86::_Ctable_adr = NULL;
address StubRoutines::x86::_SC_2_adr = NULL;
address StubRoutines::x86::_SC_3_adr = NULL;
address StubRoutines::x86::_SC_1_adr = NULL;
address StubRoutines::x86::_PI_INV_TABLE_adr = NULL;
address StubRoutines::x86::_PI_4_adr = NULL;
address StubRoutines::x86::_PI32INV_adr = NULL;
address StubRoutines::x86::_SIGN_MASK_adr = NULL;
address StubRoutines::x86::_P_1_adr = NULL;
address StubRoutines::x86::_P_3_adr = NULL;
address StubRoutines::x86::_NEG_ZERO_adr = NULL;

// Tables common to sincos and tancot.
address StubRoutines::x86::_L_2il0floatpacket_0_adr = NULL;
address StubRoutines::x86::_Pi4Inv_adr = NULL;
address StubRoutines::x86::_Pi4x3_adr = NULL;
address StubRoutines::x86::_Pi4x4_adr = NULL;
address StubRoutines::x86::_ones_adr = NULL;
  73 
// Six 64-bit constants (three low/high pairs) for the CRC folding code.
// Each shifted entry is cast to uint64_t BEFORE the shift, so the bit that
// leaves the 32-bit value is kept in the 64-bit result.
uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
  /* The fields in this structure are arranged so that they can be
   * picked up two at a time with 128-bit loads.
   *
   * Because of the flipped bit order for this CRC polynomial,
   * the constant for X**N is left-shifted by 1.  This is because
   * a 64 x 64 polynomial multiply produces a 127-bit result
   * but the highest term is always aligned to bit 0 in the container.
   * Pre-shifting by one fixes this, at the cost of potentially making
   * the 32-bit constant no longer fit in a 32-bit container (thus the
   * use of uint64_t, though this is also the size used by the carry-
   * less multiply instruction).
   *
   * In addition, the flipped bit order and highest-term-at-least-bit
   * multiply change the constants used.  The 96-bit result will be
   * aligned to the high-term end of the target 128-bit container,
   * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
   * it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
   *
   * This causes additional problems in the 128-to-64-bit reduction; see the
   * code for details.  By storing a mask in the otherwise unused half of
   * a 128-bit constant, bits can be cleared before multiplication without
   * storing and reloading.  Note that staying on a 128-bit datapath means
   * that some data is uselessly stored and some unused data is intersected
   * with an irrelevant constant.
   *
   * NOTE(review): the first entry is an unshifted all-ones 32-bit value,
   * presumably the mask mentioned above rather than half of a folding
   * constant - confirm against the stub that loads it.
   */

  ((uint64_t) 0xffffffffUL),     /* low  of K_M_64    */
  ((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64    */
  ((uint64_t) 0xba8ccbe8U << 1), /* low  of K_160_96  */
  ((uint64_t) 0x6655004fU << 1), /* high of K_160_96  */
  ((uint64_t) 0xaa2215eaU << 1), /* low  of K_544_480 */
  ((uint64_t) 0xe3720acbU << 1)  /* high of K_544_480 */
};
 109 
/**
 *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
 *
 *  256 32-bit entries, laid out five per row exactly as in the zlib source so
 *  the two can be compared line by line. Entry 128 is 0xedb88320, the
 *  bit-reflected CRC-32 polynomial.
 *  NOTE(review): presumably indexed by one input byte per lookup in the
 *  CRC32 stub - the consumer is not in this part of the file.
 */
juint StubRoutines::x86::_crc_table[] =
{
    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
    0x2d02ef8dUL
};
 168 
 169 #define D 32
 170 #define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)
 171 
 172 #define TILL_CYCLE 31
 173 uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]
 174 
 175 // A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
 176 // Listing 1: Multiplication of normalized polynomials
 177 // "a" and "b" occupy D least significant bits.
 178 uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
 179   uint32_t product = 0;
 180   uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
 181   b_pow_x_table[0] = b;
 182   for (int k = 0; k < D; ++k) {
 183     // If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result.
 184     if ((a & (((uint32_t)1) << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];
 185 
 186     // Compute b_pow_x_table[k+1] = (b ** x**(k+1)) mod P.
 187     if (b_pow_x_table[k] & 1) {
 188       // If degree of (b_pow_x_table[k] * x) is D, then
 189       // degree of (b_pow_x_table[k] * x - P) is less than D.
 190       b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
 191     }
 192     else {
 193       b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
 194     }
 195   }
 196   return product;
 197 }
 198 #undef D
 199 #undef P
 200 
 201 // A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
 202 void crc32c_init_pow_2k(void) {
 203   // _crc32c_pow_2k_table(0) =
 204   // x^(2^k) mod P(x) = x mod P(x) = x
 205   // Since we are operating on a reflected values
 206   // x = 10b, reflect(x) = 0x40000000
 207   _crc32c_pow_2k_table[0] = 0x40000000;
 208 
 209   for (int k = 1; k < TILL_CYCLE; k++) {
 210     // _crc32c_pow_2k_table(k+1) = _crc32c_pow_2k_table(k-1)^2 mod P(x)
 211     uint32_t tmp = _crc32c_pow_2k_table[k - 1];
 212     _crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp);
 213   }
 214 }
 215 
 216 // x^N mod P(x)
 217 uint32_t crc32c_f_pow_n(uint32_t n) {
 218   //            result = 1 (polynomial)
 219   uint32_t one, result = 0x80000000, i = 0;
 220 
 221   while (one = (n & 1), (n == 1 || n - one > 0)) {
 222     if (one) {
 223       result = crc32c_multiply(result, _crc32c_pow_2k_table[i]);
 224     }
 225     n >>= 1;
 226     i++;
 227   }
 228 
 229   return result;
 230 }
 231 
// Pointer to the CRC32C constant table. It has no initializer here;
// generate_CRC32C_table() points it at one of two function-local static
// arrays (the raw constants, or the per-byte lookup tables viewed as juint).
juint *StubRoutines::x86::_crc32c_table;
 233 
 234 void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {
 235 
 236   static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
 237 
 238   crc32c_init_pow_2k();
 239 
 240   pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8);      // 8N * 8 = 64N
 241   pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2);  // 128N
 242 
 243   pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
 244   pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);
 245 
 246   pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
 247   pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
 248             crc32c_f_pow_n(CRC32C_LOW * 8 * 2);
 249 
 250   if (is_pclmulqdq_table_supported) {
 251     _crc32c_table = pow_n;
 252   } else {
 253     static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];
 254 
 255     for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
 256       static juint X_CONST = pow_n[j];
 257       for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
 258       // S. Gueron / Information Processing Letters 112 (2012) 184
 259       // Algorithm 3: Generating a carry-less multiplication lookup table.
 260       // Input: A 32-bit constant, X_CONST.
 261       // Output: A table of 256 entries, each one is a 64-bit quadword,
 262       // that can be used for computing "byte" * X_CONST, for a given byte.
 263         pclmulqdq_table[j * 256 + i] =
 264           ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
 265           ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
 266           ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
 267       }
 268     }
 269     _crc32c_table = (juint*)pclmulqdq_table;
 270   }
 271 }
 272 
// 64 32-bit words, requested to be 64-byte aligned via ALIGNED_(64).
// The values are the SHA-256 round constants K[0..63] (FIPS 180-4, 4.2.2),
// four per row in round order.
// NOTE(review): the alignment is presumably for aligned vector loads in the
// SHA-256 stub - the consumer is not in this part of the file.
ALIGNED_(64) juint StubRoutines::x86::_k256[] =
{
    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
    0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
    0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
    0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
    0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
    0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
    0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
    0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
    0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
    0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
    0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
    0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
    0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
    0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
    0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};