1 /* 2 * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "runtime/deoptimization.hpp" 27 #include "runtime/frame.inline.hpp" 28 #include "runtime/stubRoutines.hpp" 29 #include "runtime/thread.inline.hpp" 30 #include "crc32c.h" 31 32 #ifdef _MSC_VER 33 #define ALIGNED_(x) __declspec(align(x)) 34 #else 35 #define ALIGNED_(x) __attribute__ ((aligned(x))) 36 #endif 37 38 // Implementation of the platform-specific part of StubRoutines - for 39 // a description of how to extend it, see the stubRoutines.hpp file. 40 41 address StubRoutines::x86::_verify_mxcsr_entry = NULL; 42 address StubRoutines::x86::_key_shuffle_mask_addr = NULL; 43 address StubRoutines::x86::_counter_shuffle_mask_addr = NULL; 44 address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL; 45 address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL; 46 address StubRoutines::x86::_upper_word_mask_addr = NULL; 47 address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL; 48 address StubRoutines::x86::_k256_adr = NULL; 49 #ifdef _LP64 50 address StubRoutines::x86::_k256_W_adr = NULL; 51 #endif 52 address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL; 53 54 //tables common for sin and cos 55 address StubRoutines::x86::_ONEHALF_adr = NULL; 56 address StubRoutines::x86::_P_2_adr = NULL; 57 address StubRoutines::x86::_SC_4_adr = NULL; 58 address StubRoutines::x86::_Ctable_adr = NULL; 59 address StubRoutines::x86::_SC_2_adr = NULL; 60 address StubRoutines::x86::_SC_3_adr = NULL; 61 address StubRoutines::x86::_SC_1_adr = NULL; 62 address StubRoutines::x86::_PI_INV_TABLE_adr = NULL; 63 address StubRoutines::x86::_PI_4_adr = NULL; 64 address StubRoutines::x86::_PI32INV_adr = NULL; 65 address StubRoutines::x86::_SIGN_MASK_adr = NULL; 66 address StubRoutines::x86::_P_1_adr = NULL; 67 address StubRoutines::x86::_P_3_adr = NULL; 68 address StubRoutines::x86::_NEG_ZERO_adr = NULL; 69 70 //tables common for sincos and tancot 71 address StubRoutines::x86::_L_2il0floatpacket_0_adr = NULL; 72 address StubRoutines::x86::_Pi4Inv_adr = NULL; 73 address StubRoutines::x86::_Pi4x3_adr = NULL; 74 address StubRoutines::x86::_Pi4x4_adr = NULL; 75 address StubRoutines::x86::_ones_adr = NULL; 76 77 uint64_t StubRoutines::x86::_crc_by128_masks[] = 78 { 79 /* The fields in this structure are arranged so that they can be 80 * picked up two at a time with 128-bit loads. 81 * 82 * Because of flipped bit order for this CRC polynomials 83 * the constant for X**N is left-shifted by 1. This is because 84 * a 64 x 64 polynomial multiply produces a 127-bit result 85 * but the highest term is always aligned to bit 0 in the container. 86 * Pre-shifting by one fixes this, at the cost of potentially making 87 * the 32-bit constant no longer fit in a 32-bit container (thus the 88 * use of uint64_t, though this is also the size used by the carry- 89 * less multiply instruction. 90 * 91 * In addition, the flipped bit order and highest-term-at-least-bit 92 * multiply changes the constants used. The 96-bit result will be 93 * aligned to the high-term end of the target 128-bit container, 94 * not the low-term end; that is, instead of a 512-bit or 576-bit fold, 95 * instead it is a 480 (=512-32) or 544 (=512+64-32) bit fold. 96 * 97 * This cause additional problems in the 128-to-64-bit reduction; see the 98 * code for details. By storing a mask in the otherwise unused half of 99 * a 128-bit constant, bits can be cleared before multiplication without 100 * storing and reloading. Note that staying on a 128-bit datapath means 101 * that some data is uselessly stored and some unused data is intersected 102 * with an irrelevant constant. 103 */ 104 105 ((uint64_t) 0xffffffffUL), /* low of K_M_64 */ 106 ((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64 */ 107 ((uint64_t) 0xba8ccbe8U << 1), /* low of K_160_96 */ 108 ((uint64_t) 0x6655004fU << 1), /* high of K_160_96 */ 109 ((uint64_t) 0xaa2215eaU << 1), /* low of K_544_480 */ 110 ((uint64_t) 0xe3720acbU << 1) /* high of K_544_480 */ 111 }; 112 113 /** 114 * crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h 115 */ 116 juint StubRoutines::x86::_crc_table[] = 117 { 118 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, 119 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, 120 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, 121 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, 122 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, 123 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, 124 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, 125 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, 126 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, 127 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, 128 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, 129 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, 130 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, 131 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, 132 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, 133 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, 134 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, 135 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, 136 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, 137 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, 138 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, 139 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, 140 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, 141 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, 142 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, 143 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, 144 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, 145 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, 146 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, 147 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, 148 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, 149 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, 150 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, 151 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, 152 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, 153 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, 154 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, 155 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, 156 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, 157 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, 158 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, 159 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, 160 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, 161 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, 162 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, 163 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, 164 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, 165 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, 166 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, 167 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, 168 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, 169 0x2d02ef8dUL 170 }; 171 172 #define D 32 173 #define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41) 174 175 #define TILL_CYCLE 31 176 uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0] 177 178 // A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8 179 // Listing 1: Multiplication of normalized polynomials 180 // "a" and "b" occupy D least significant bits. 181 uint32_t crc32c_multiply(uint32_t a, uint32_t b) { 182 uint32_t product = 0; 183 uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P 184 b_pow_x_table[0] = b; 185 for (int k = 0; k < D; ++k) { 186 // If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result. 187 if ((a & (((uint32_t)1) << (D - 1 - k))) != 0) product ^= b_pow_x_table[k]; 188 189 // Compute b_pow_x_table[k+1] = (b ** x**(k+1)) mod P. 190 if (b_pow_x_table[k] & 1) { 191 // If degree of (b_pow_x_table[k] * x) is D, then 192 // degree of (b_pow_x_table[k] * x - P) is less than D. 193 b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P; 194 } 195 else { 196 b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1; 197 } 198 } 199 return product; 200 } 201 #undef D 202 #undef P 203 204 // A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9 205 void crc32c_init_pow_2k(void) { 206 // _crc32c_pow_2k_table(0) = 207 // x^(2^k) mod P(x) = x mod P(x) = x 208 // Since we are operating on a reflected values 209 // x = 10b, reflect(x) = 0x40000000 210 _crc32c_pow_2k_table[0] = 0x40000000; 211 212 for (int k = 1; k < TILL_CYCLE; k++) { 213 // _crc32c_pow_2k_table(k+1) = _crc32c_pow_2k_table(k-1)^2 mod P(x) 214 uint32_t tmp = _crc32c_pow_2k_table[k - 1]; 215 _crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp); 216 } 217 } 218 219 // x^N mod P(x) 220 uint32_t crc32c_f_pow_n(uint32_t n) { 221 // result = 1 (polynomial) 222 uint32_t one, result = 0x80000000, i = 0; 223 224 while (one = (n & 1), (n == 1 || n - one > 0)) { 225 if (one) { 226 result = crc32c_multiply(result, _crc32c_pow_2k_table[i]); 227 } 228 n >>= 1; 229 i++; 230 } 231 232 return result; 233 } 234 235 juint *StubRoutines::x86::_crc32c_table; 236 237 void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) { 238 239 static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 240 241 crc32c_init_pow_2k(); 242 243 pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8); // 8N * 8 = 64N 244 pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2); // 128N 245 246 pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8); 247 pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2); 248 249 pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8); 250 pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] = 251 crc32c_f_pow_n(CRC32C_LOW * 8 * 2); 252 253 if (is_pclmulqdq_table_supported) { 254 _crc32c_table = pow_n; 255 } else { 256 static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256]; 257 258 for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) { 259 static juint X_CONST = pow_n[j]; 260 for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations 261 // S. Gueron / Information Processing Letters 112 (2012) 184 262 // Algorithm 3: Generating a carry-less multiplication lookup table. 263 // Input: A 32-bit constant, X_CONST. 264 // Output: A table of 256 entries, each one is a 64-bit quadword, 265 // that can be used for computing "byte" * X_CONST, for a given byte. 266 pclmulqdq_table[j * 256 + i] = 267 ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^ 268 ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^ 269 ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST); 270 } 271 } 272 _crc32c_table = (juint*)pclmulqdq_table; 273 } 274 } 275 276 ALIGNED_(64) juint StubRoutines::x86::_k256[] = 277 { 278 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 279 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 280 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, 281 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, 282 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, 283 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 284 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 285 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, 286 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, 287 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, 288 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 289 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 290 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, 291 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, 292 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, 293 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL 294 }; 295 296 #ifdef _LP64 297 // used in MacroAssembler::sha256_AVX2 298 // dynamically built from _k256 299 ALIGNED_(64) juint StubRoutines::x86::_k256_W[2*sizeof(StubRoutines::x86::_k256)]; 300 #endif