1 /* 2 * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 */ 23 24 25 package org.graalvm.compiler.lir.amd64; 26 27 import static jdk.vm.ci.amd64.AMD64.k1; 28 import static jdk.vm.ci.amd64.AMD64.k2; 29 import static jdk.vm.ci.amd64.AMD64.k3; 30 import static jdk.vm.ci.amd64.AMD64.rax; 31 import static jdk.vm.ci.amd64.AMD64.rdi; 32 import static jdk.vm.ci.amd64.AMD64.rdx; 33 import static jdk.vm.ci.amd64.AMD64.rsi; 34 import static jdk.vm.ci.amd64.AMD64.rsp; 35 import static jdk.vm.ci.code.ValueUtil.asRegister; 36 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG; 37 38 import org.graalvm.compiler.asm.Label; 39 import org.graalvm.compiler.asm.amd64.AMD64Address; 40 import org.graalvm.compiler.asm.amd64.AMD64Assembler; 41 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler; 42 import org.graalvm.compiler.core.common.LIRKind; 43 import org.graalvm.compiler.lir.LIRInstructionClass; 44 import org.graalvm.compiler.lir.Opcode; 45 import org.graalvm.compiler.lir.asm.CompilationResultBuilder; 46 import org.graalvm.compiler.lir.gen.LIRGeneratorTool; 47 48 import jdk.vm.ci.amd64.AMD64; 49 import jdk.vm.ci.amd64.AMD64Kind; 50 import jdk.vm.ci.code.Register; 51 import jdk.vm.ci.meta.Value; 52 53 @Opcode("AMD64_STRING_COMPRESS") 54 public final class AMD64StringUTF16CompressOp extends AMD64LIRInstruction { 55 public static final LIRInstructionClass<AMD64StringUTF16CompressOp> TYPE = LIRInstructionClass.create(AMD64StringUTF16CompressOp.class); 56 57 @Def({REG}) private Value rres; 58 @Use({REG}) private Value rsrc; 59 @Use({REG}) private Value rdst; 60 @Use({REG}) private Value rlen; 61 62 @Temp({REG}) private Value rsrcTemp; 63 @Temp({REG}) private Value rdstTemp; 64 @Temp({REG}) private Value rlenTemp; 65 66 @Temp({REG}) private Value vtmp1; 67 @Temp({REG}) private Value vtmp2; 68 @Temp({REG}) private Value vtmp3; 69 @Temp({REG}) private Value vtmp4; 70 @Temp({REG}) private Value rtmp5; 71 72 public AMD64StringUTF16CompressOp(LIRGeneratorTool tool, Value res, Value src, Value dst, Value len) { 73 super(TYPE); 74 75 assert asRegister(src).equals(rsi); 76 assert asRegister(dst).equals(rdi); 77 assert asRegister(len).equals(rdx); 78 assert asRegister(res).equals(rax); 79 80 rres = res; 81 rsrcTemp = rsrc = src; 82 rdstTemp = rdst = dst; 83 rlenTemp = rlen = len; 84 85 LIRKind vkind = LIRKind.value(AMD64Kind.V512_BYTE); 86 87 vtmp1 = tool.newVariable(vkind); 88 vtmp2 = tool.newVariable(vkind); 89 vtmp3 = tool.newVariable(vkind); 90 vtmp4 = tool.newVariable(vkind); 91 92 rtmp5 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD)); 93 } 94 95 @Override 96 public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) { 97 Register res = asRegister(rres); 98 Register src = asRegister(rsrc); 99 Register dst = asRegister(rdst); 100 Register len = asRegister(rlen); 101 102 Register tmp1 = asRegister(vtmp1); 103 Register tmp2 = asRegister(vtmp2); 104 Register tmp3 = asRegister(vtmp3); 105 Register tmp4 = asRegister(vtmp4); 106 Register tmp5 = asRegister(rtmp5); 107 108 charArrayCompress(masm, src, dst, len, tmp1, tmp2, tmp3, tmp4, tmp5, res); 109 } 110 111 /** 112 * Compress a UTF16 string which de facto is a Latin1 string into a byte array representation 113 * (buffer). 114 * 115 * @param masm the assembler 116 * @param src (rsi) the start address of source char[] to be compressed 117 * @param dst (rdi) the start address of destination byte[] vector 118 * @param len (rdx) the length 119 * @param tmp1 (xmm) temporary xmm register 120 * @param tmp2 (xmm) temporary xmm register 121 * @param tmp3 (xmm) temporary xmm register 122 * @param tmp4 (xmm) temporary xmm register 123 * @param tmp (gpr) temporary gpr register 124 * @param res (rax) the result code (length on success, zero otherwise) 125 */ 126 private static void charArrayCompress(AMD64MacroAssembler masm, Register src, Register dst, Register len, Register tmp1, 127 Register tmp2, Register tmp3, Register tmp4, Register tmp, Register res) { 128 assert tmp1.getRegisterCategory().equals(AMD64.XMM); 129 assert tmp2.getRegisterCategory().equals(AMD64.XMM); 130 assert tmp3.getRegisterCategory().equals(AMD64.XMM); 131 assert tmp4.getRegisterCategory().equals(AMD64.XMM); 132 133 Label labelReturnLength = new Label(); 134 Label labelReturnZero = new Label(); 135 Label labelDone = new Label(); 136 Label labelBelowThreshold = new Label(); 137 138 assert len.number != res.number; 139 140 masm.push(len); // Save length for return. 141 142 if (masm.supports(AMD64.CPUFeature.AVX512BW) && 143 masm.supports(AMD64.CPUFeature.AVX512VL) && 144 masm.supports(AMD64.CPUFeature.BMI2)) { 145 146 Label labelRestoreK1ReturnZero = new Label(); 147 Label labelAvxPostAlignment = new Label(); 148 149 // If the length of the string is less than 32, we chose not to use the 150 // AVX512 instructions. 151 masm.testl(len, -32); 152 masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelBelowThreshold); 153 154 // First check whether a character is compressible (<= 0xff). 155 // Create mask to test for Unicode chars inside (zmm) vector. 156 masm.movl(res, 0x00ff); 157 masm.evpbroadcastw(tmp2, res); 158 159 masm.kmovq(k3, k1); // Save k1 160 161 masm.testl(len, -64); 162 masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvxPostAlignment); 163 164 masm.movl(tmp, dst); 165 masm.andl(tmp, (32 - 1)); 166 masm.negl(tmp); 167 masm.andl(tmp, (32 - 1)); 168 169 // bail out when there is nothing to be done 170 masm.testl(tmp, tmp); 171 masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvxPostAlignment); 172 173 // Compute (1 << N) - 1 = ~(~0 << N), where N is the remaining number 174 // of characters to process. 175 masm.movl(res, -1); 176 masm.shlxl(res, res, tmp); 177 masm.notl(res); 178 179 masm.kmovd(k1, res); 180 masm.evmovdqu16(tmp1, k1, new AMD64Address(src)); 181 masm.evpcmpuw(k2, k1, tmp1, tmp2, 2 /* le */); 182 masm.ktestd(k2, k1); 183 masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero); 184 185 masm.evpmovwb(new AMD64Address(dst), k1, tmp1); 186 187 masm.addq(src, tmp); 188 masm.addq(src, tmp); 189 masm.addq(dst, tmp); 190 masm.subl(len, tmp); 191 192 masm.bind(labelAvxPostAlignment); 193 // end of alignment 194 Label labelAvx512LoopTail = new Label(); 195 196 masm.movl(tmp, len); 197 masm.andl(tmp, -32); // The vector count (in chars). 198 masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvx512LoopTail); 199 masm.andl(len, 32 - 1); // The tail count (in chars). 200 201 masm.leaq(src, new AMD64Address(src, tmp, AMD64Address.Scale.Times2)); 202 masm.leaq(dst, new AMD64Address(dst, tmp, AMD64Address.Scale.Times1)); 203 masm.negq(tmp); 204 205 Label labelAvx512Loop = new Label(); 206 // Test and compress 32 chars per iteration, reading 512-bit vectors and 207 // writing 256-bit compressed ditto. 208 masm.bind(labelAvx512Loop); 209 masm.evmovdqu16(tmp1, new AMD64Address(src, tmp, AMD64Address.Scale.Times2)); 210 masm.evpcmpuw(k2, tmp1, tmp2, 2 /* le */); 211 masm.kortestd(k2, k2); 212 masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero); 213 214 // All 32 chars in the current vector (chunk) are valid for compression, 215 // write truncated byte elements to memory. 216 masm.evpmovwb(new AMD64Address(dst, tmp, AMD64Address.Scale.Times1), tmp1); 217 masm.addq(tmp, 32); 218 masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelAvx512Loop); 219 220 masm.bind(labelAvx512LoopTail); 221 masm.kmovq(k1, k3); // Restore k1 222 223 // All done if the tail count is zero. 224 masm.testl(len, len); 225 masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelReturnLength); 226 227 // Compute (1 << N) - 1 = ~(~0 << N), where N is the remaining number 228 // of characters to process. 229 masm.movl(res, -1); 230 masm.shlxl(res, res, len); 231 masm.notl(res); 232 233 masm.kmovd(k1, res); 234 masm.evmovdqu16(tmp1, k1, new AMD64Address(src)); 235 masm.evpcmpuw(k2, k1, tmp1, tmp2, 2 /* le */); 236 masm.ktestd(k2, k1); 237 masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero); 238 239 masm.evpmovwb(new AMD64Address(dst), k1, tmp1); 240 241 masm.kmovq(k1, k3); // Restore k1 242 masm.jmp(labelReturnLength); 243 244 masm.bind(labelRestoreK1ReturnZero); 245 masm.kmovq(k1, k3); // Restore k1 246 masm.jmp(labelReturnZero); 247 } 248 249 if (masm.supports(AMD64.CPUFeature.SSE4_2)) { 250 251 Label labelSSETail = new Label(); 252 253 masm.bind(labelBelowThreshold); 254 255 masm.movl(tmp, 0xff00ff00); // Create mask to test for Unicode chars in vectors. 256 257 masm.movl(res, len); 258 masm.andl(res, -16); 259 masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelSSETail); 260 masm.andl(len, 16 - 1); 261 262 // Compress 16 chars per iteration. 263 masm.movdl(tmp1, tmp); 264 masm.pshufd(tmp1, tmp1, 0); // Store Unicode mask in 'vtmp1'. 265 masm.pxor(tmp4, tmp4); 266 267 masm.leaq(src, new AMD64Address(src, res, AMD64Address.Scale.Times2)); 268 masm.leaq(dst, new AMD64Address(dst, res, AMD64Address.Scale.Times1)); 269 masm.negq(res); 270 271 Label lSSELoop = new Label(); 272 // Test and compress 16 chars per iteration, reading 128-bit vectors and 273 // writing 64-bit compressed ditto. 274 masm.bind(lSSELoop); 275 masm.movdqu(tmp2, new AMD64Address(src, res, AMD64Address.Scale.Times2)); // load 276 // 1st 8 277 // characters 278 masm.movdqu(tmp3, new AMD64Address(src, res, AMD64Address.Scale.Times2, 16)); // load 279 // next 8 280 // characters 281 masm.por(tmp4, tmp2); 282 masm.por(tmp4, tmp3); 283 masm.ptest(tmp4, tmp1); // Check for Unicode chars in vector. 284 masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero); 285 286 masm.packuswb(tmp2, tmp3); // Only ASCII chars; compress each to a byte. 287 masm.movdqu(new AMD64Address(dst, res, AMD64Address.Scale.Times1), tmp2); 288 masm.addq(res, 16); 289 masm.jcc(AMD64Assembler.ConditionFlag.NotZero, lSSELoop); 290 291 Label labelCopyChars = new Label(); 292 // Test and compress another 8 chars before final tail copy. 293 masm.bind(labelSSETail); 294 masm.movl(res, len); 295 masm.andl(res, -8); 296 masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelCopyChars); 297 masm.andl(len, 8 - 1); 298 299 masm.movdl(tmp1, tmp); 300 masm.pshufd(tmp1, tmp1, 0); // Store Unicode mask in 'vtmp1'. 301 masm.pxor(tmp3, tmp3); 302 303 masm.movdqu(tmp2, new AMD64Address(src)); 304 masm.ptest(tmp2, tmp1); // Check for Unicode chars in vector. 305 masm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero); 306 masm.packuswb(tmp2, tmp3); // Only ASCII chars; compress each to a byte. 307 masm.movq(new AMD64Address(dst), tmp2); 308 masm.addq(src, 16); 309 masm.addq(dst, 8); 310 311 masm.bind(labelCopyChars); 312 } 313 314 // Compress any remaining characters using a vanilla implementation. 315 masm.testl(len, len); 316 masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelReturnLength); 317 masm.leaq(src, new AMD64Address(src, len, AMD64Address.Scale.Times2)); 318 masm.leaq(dst, new AMD64Address(dst, len, AMD64Address.Scale.Times1)); 319 masm.negq(len); 320 321 Label labelCopyCharsLoop = new Label(); 322 // Compress a single character per iteration. 323 masm.bind(labelCopyCharsLoop); 324 masm.movzwl(res, new AMD64Address(src, len, AMD64Address.Scale.Times2)); 325 masm.testl(res, 0xff00); // Check if Unicode character. 326 masm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero); 327 // An ASCII character; compress to a byte. 328 masm.movb(new AMD64Address(dst, len, AMD64Address.Scale.Times1), res); 329 masm.incrementq(len, 1); 330 masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelCopyCharsLoop); 331 332 // If compression succeeded, return the length. 333 masm.bind(labelReturnLength); 334 masm.pop(res); 335 masm.jmpb(labelDone); 336 337 // If compression failed, return 0. 338 masm.bind(labelReturnZero); 339 masm.xorl(res, res); 340 masm.addq(rsp, 8 /* wordSize */); 341 342 masm.bind(labelDone); 343 } 344 345 }