1 /* 2 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 
22 */ 23 24 25 package org.graalvm.compiler.lir.amd64; 26 27 import static jdk.vm.ci.amd64.AMD64.k1; 28 import static jdk.vm.ci.amd64.AMD64.k2; 29 import static jdk.vm.ci.amd64.AMD64.k3; 30 import static jdk.vm.ci.amd64.AMD64.rax; 31 import static jdk.vm.ci.amd64.AMD64.rdi; 32 import static jdk.vm.ci.amd64.AMD64.rdx; 33 import static jdk.vm.ci.amd64.AMD64.rsi; 34 35 import static jdk.vm.ci.amd64.AMD64.rsp; 36 import static jdk.vm.ci.code.ValueUtil.asRegister; 37 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG; 38 39 import jdk.vm.ci.amd64.AMD64; 40 import org.graalvm.compiler.asm.Label; 41 import org.graalvm.compiler.asm.amd64.AMD64Address; 42 import org.graalvm.compiler.asm.amd64.AMD64Assembler; 43 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler; 44 import org.graalvm.compiler.core.common.LIRKind; 45 import org.graalvm.compiler.lir.LIRInstructionClass; 46 import org.graalvm.compiler.lir.Opcode; 47 import org.graalvm.compiler.lir.asm.CompilationResultBuilder; 48 import org.graalvm.compiler.lir.gen.LIRGeneratorTool; 49 50 import jdk.vm.ci.amd64.AMD64Kind; 51 import jdk.vm.ci.code.Register; 52 import jdk.vm.ci.meta.Value; 53 54 @Opcode("AMD64_STRING_COMPRESS") 55 public final class AMD64StringUTF16CompressOp extends AMD64LIRInstruction { 56 public static final LIRInstructionClass<AMD64StringUTF16CompressOp> TYPE = LIRInstructionClass.create(AMD64StringUTF16CompressOp.class); 57 58 @Def({REG}) private Value rres; 59 @Alive({REG}) private Value rsrc; 60 @Alive({REG}) private Value rdst; 61 @Alive({REG}) private Value rlen; 62 63 @Temp({REG}) private Value vtmp1; 64 @Temp({REG}) private Value vtmp2; 65 @Temp({REG}) private Value vtmp3; 66 @Temp({REG}) private Value vtmp4; 67 @Temp({REG}) private Value rtmp5; 68 69 public AMD64StringUTF16CompressOp(LIRGeneratorTool tool, Value res, Value src, Value dst, Value len) { 70 super(TYPE); 71 72 assert asRegister(src).equals(rsi); 73 assert asRegister(dst).equals(rdi); 74 assert 
asRegister(len).equals(rdx); 75 assert asRegister(res).equals(rax); 76 77 rres = res; 78 rsrc = src; 79 rdst = dst; 80 rlen = len; 81 82 LIRKind vkind = LIRKind.value(AMD64Kind.V512_BYTE); 83 84 vtmp1 = tool.newVariable(vkind); 85 vtmp2 = tool.newVariable(vkind); 86 vtmp3 = tool.newVariable(vkind); 87 vtmp4 = tool.newVariable(vkind); 88 89 rtmp5 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD)); 90 } 91 92 @Override 93 public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) { 94 Register res = asRegister(rres); 95 Register src = asRegister(rsrc); 96 Register dst = asRegister(rdst); 97 Register len = asRegister(rlen); 98 99 Register tmp1 = asRegister(vtmp1); 100 Register tmp2 = asRegister(vtmp2); 101 Register tmp3 = asRegister(vtmp3); 102 Register tmp4 = asRegister(vtmp4); 103 Register tmp5 = asRegister(rtmp5); 104 105 charArrayCompress(masm, src, dst, len, tmp1, tmp2, tmp3, tmp4, tmp5, res); 106 } 107 108 /** 109 * Compress a UTF16 string which de facto is a Latin1 string into a byte array representation 110 * (buffer). 
     *
     * @param masm the assembler
     * @param src (rsi) the start address of source char[] to be compressed
     * @param dst (rdi) the start address of destination byte[] vector
     * @param len (rdx) the length
     * @param tmp1 (xmm) temporary xmm register
     * @param tmp2 (xmm) temporary xmm register
     * @param tmp3 (xmm) temporary xmm register
     * @param tmp4 (xmm) temporary xmm register
     * @param tmp (gpr) temporary gpr register
     * @param res (rax) the result code (length on success, zero otherwise)
     */
    private static void charArrayCompress(AMD64MacroAssembler masm, Register src, Register dst, Register len, Register tmp1,
                    Register tmp2, Register tmp3, Register tmp4, Register tmp, Register res) {
        assert tmp1.getRegisterCategory().equals(AMD64.XMM);
        assert tmp2.getRegisterCategory().equals(AMD64.XMM);
        assert tmp3.getRegisterCategory().equals(AMD64.XMM);
        assert tmp4.getRegisterCategory().equals(AMD64.XMM);

        Label labelReturnLength = new Label();
        Label labelReturnZero = new Label();
        Label labelDone = new Label();
        Label labelBelowThreshold = new Label();

        assert len.number != res.number;

        // Save length for return; every exit path must rebalance the stack
        // (pop in labelReturnLength, addq(rsp, 8) in labelReturnZero).
        masm.push(len);

        if (masm.supports(AMD64.CPUFeature.AVX512BW) &&
                        masm.supports(AMD64.CPUFeature.AVX512VL) &&
                        masm.supports(AMD64.CPUFeature.BMI2)) {

            Label labelRestoreK1ReturnZero = new Label();
            Label labelAvxPostAlignment = new Label();

            // If the length of the string is less than 32, we chose not to use the
            // AVX512 instructions. (len & -32) == 0 iff len < 32 for a non-negative len.
            masm.testl(len, -32);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelBelowThreshold);

            // First check whether a character is compressible (<= 0xff).
            // Create mask to test for Unicode chars inside (zmm) vector:
            // broadcast 0x00ff into every word lane of tmp2.
            masm.movl(res, 0x00ff);
            masm.evpbroadcastw(tmp2, res);

            masm.kmovq(k3, k1); // Save k1; restored on every exit from this branch.

            // Skip the alignment head entirely for lengths below 64 chars.
            masm.testl(len, -64);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvxPostAlignment);

            // tmp = (-dst) & 31 = number of bytes needed to 32-byte align dst.
            masm.movl(tmp, dst);
            masm.andl(tmp, (32 - 1));
            masm.negl(tmp);
            masm.andl(tmp, (32 - 1));

            // bail out when there is nothing to be done (dst already aligned)
            masm.testl(tmp, tmp);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvxPostAlignment);

            // Compute (1 << N) - 1 = ~(~0 << N), where N is the remaining number
            // of characters to process; used as a write mask for the head chars.
            masm.movl(res, -1);
            masm.shlxl(res, res, tmp);
            masm.notl(res);

            // Masked load of the head chars, then verify all of them are <= 0xff.
            masm.kmovd(k1, res);
            masm.evmovdqu16(tmp1, k1, new AMD64Address(src));
            masm.evpcmpuw(k2, k1, tmp1, tmp2, 2 /* le */);
            masm.ktestd(k2, k1);
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            // Masked truncating store: word elements -> byte elements.
            masm.evpmovwb(new AMD64Address(dst), k1, tmp1);

            // Advance pointers past the head: tmp counts chars, so src (2 bytes
            // per char) advances by 2*tmp while dst (1 byte per char) advances by tmp.
            masm.addq(src, tmp);
            masm.addq(src, tmp);
            masm.addq(dst, tmp);
            masm.subl(len, tmp);

            masm.bind(labelAvxPostAlignment);
            // end of alignment
            Label labelAvx512LoopTail = new Label();

            masm.movl(tmp, len);
            masm.andl(tmp, -32); // The vector count (in chars).
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvx512LoopTail);
            masm.andl(len, 32 - 1); // The tail count (in chars).

            // Point src/dst at the end of the vectorized region and count tmp up
            // from -(vector count) to zero.
            masm.leaq(src, new AMD64Address(src, tmp, AMD64Address.Scale.Times2));
            masm.leaq(dst, new AMD64Address(dst, tmp, AMD64Address.Scale.Times1));
            masm.negq(tmp);

            Label labelAvx512Loop = new Label();
            // Test and compress 32 chars per iteration, reading 512-bit vectors and
            // writing 256-bit compressed ditto.
            masm.bind(labelAvx512Loop);
            masm.evmovdqu16(tmp1, new AMD64Address(src, tmp, AMD64Address.Scale.Times2));
            masm.evpcmpuw(k2, tmp1, tmp2, 2 /* le */);
            masm.kortestd(k2, k2);
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            // All 32 chars in the current vector (chunk) are valid for compression,
            // write truncated byte elements to memory.
            masm.evpmovwb(new AMD64Address(dst, tmp, AMD64Address.Scale.Times1), tmp1);
            masm.addq(tmp, 32);
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelAvx512Loop);

            masm.bind(labelAvx512LoopTail);
            masm.kmovq(k1, k3); // Restore k1

            // All done if the tail count is zero.
            masm.testl(len, len);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelReturnLength);

            // Compute (1 << N) - 1 = ~(~0 << N), where N is the remaining number
            // of characters to process; write mask for the final partial vector.
            masm.movl(res, -1);
            masm.shlxl(res, res, len);
            masm.notl(res);

            masm.kmovd(k1, res);
            masm.evmovdqu16(tmp1, k1, new AMD64Address(src));
            masm.evpcmpuw(k2, k1, tmp1, tmp2, 2 /* le */);
            masm.ktestd(k2, k1);
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            masm.evpmovwb(new AMD64Address(dst), k1, tmp1);

            masm.kmovq(k1, k3); // Restore k1
            masm.jmp(labelReturnLength);

            // Failure exit for the AVX512 paths: restore k1 before returning zero.
            masm.bind(labelRestoreK1ReturnZero);
            masm.kmovq(k1, k3); // Restore k1
            masm.jmp(labelReturnZero);
        }

        if (masm.supports(AMD64.CPUFeature.SSE4_2)) {

            Label labelSSETail = new Label();

            // Target of the short-string jump from the AVX512 branch above.
            masm.bind(labelBelowThreshold);

            masm.movl(tmp, 0xff00ff00); // Create mask to test for Unicode chars in vectors.

            masm.movl(res, len);
            masm.andl(res, -16); // The vector count (in chars), multiple of 16.
            masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelSSETail);
            masm.andl(len, 16 - 1); // The tail count (in chars).

            // Compress 16 chars per iteration.
            masm.movdl(tmp1, tmp);
            masm.pshufd(tmp1, tmp1, 0); // Store Unicode mask in 'vtmp1'.
            masm.pxor(tmp4, tmp4);

            // Point src/dst at the end of the region; res counts up from -(vector count).
            masm.leaq(src, new AMD64Address(src, res, AMD64Address.Scale.Times2));
            masm.leaq(dst, new AMD64Address(dst, res, AMD64Address.Scale.Times1));
            masm.negq(res);

            Label lSSELoop = new Label();
            // Test and compress 16 chars per iteration, reading 128-bit vectors and
            // writing 64-bit compressed ditto.
            masm.bind(lSSELoop);
            masm.movdqu(tmp2, new AMD64Address(src, res, AMD64Address.Scale.Times2)); // load 1st 8 characters
            masm.movdqu(tmp3, new AMD64Address(src, res, AMD64Address.Scale.Times2, 16)); // load next 8 characters
            masm.por(tmp4, tmp2);
            masm.por(tmp4, tmp3);
            masm.ptest(tmp4, tmp1); // Check for Unicode chars in vector.
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);

            masm.packuswb(tmp2, tmp3); // Only ASCII chars; compress each to a byte.
            masm.movdqu(new AMD64Address(dst, res, AMD64Address.Scale.Times1), tmp2);
            masm.addq(res, 16);
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, lSSELoop);

            Label labelCopyChars = new Label();
            // Test and compress another 8 chars before final tail copy.
            masm.bind(labelSSETail);
            masm.movl(res, len);
            masm.andl(res, -8);
            masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelCopyChars);
            masm.andl(len, 8 - 1);

            masm.movdl(tmp1, tmp);
            masm.pshufd(tmp1, tmp1, 0); // Store Unicode mask in 'vtmp1'.
            masm.pxor(tmp3, tmp3);

            masm.movdqu(tmp2, new AMD64Address(src));
            masm.ptest(tmp2, tmp1); // Check for Unicode chars in vector.
            masm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);
            masm.packuswb(tmp2, tmp3); // Only ASCII chars; compress each to a byte.
            masm.movq(new AMD64Address(dst), tmp2);
            masm.addq(src, 16); // 8 chars = 16 source bytes ...
            masm.addq(dst, 8);  // ... and 8 destination bytes.

            masm.bind(labelCopyChars);
        }

        // Compress any remaining characters using a vanilla implementation.
        masm.testl(len, len);
        masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelReturnLength);
        masm.leaq(src, new AMD64Address(src, len, AMD64Address.Scale.Times2));
        masm.leaq(dst, new AMD64Address(dst, len, AMD64Address.Scale.Times1));
        masm.negq(len); // len counts up from -(tail count) to zero.

        Label labelCopyCharsLoop = new Label();
        // Compress a single character per iteration.
        masm.bind(labelCopyCharsLoop);
        masm.movzwl(res, new AMD64Address(src, len, AMD64Address.Scale.Times2));
        masm.testl(res, 0xff00); // Check if Unicode character.
        masm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);
        // An ASCII character; compress to a byte.
        masm.movb(new AMD64Address(dst, len, AMD64Address.Scale.Times1), res);
        masm.incrementq(len, 1);
        masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelCopyCharsLoop);

        // If compression succeeded, return the length (pushed at entry).
        masm.bind(labelReturnLength);
        masm.pop(res);
        masm.jmpb(labelDone);

        // If compression failed, return 0; discard the saved length to
        // keep the stack balanced with the push at entry.
        masm.bind(labelReturnZero);
        masm.xorl(res, res);
        masm.addq(rsp, 8 /* wordSize */);

        masm.bind(labelDone);
    }

}