/*
 * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */


package org.graalvm.compiler.lir.amd64;

import static jdk.vm.ci.amd64.AMD64.k1;
import static jdk.vm.ci.amd64.AMD64.k2;
import static jdk.vm.ci.amd64.AMD64.k3;
import static jdk.vm.ci.amd64.AMD64.rax;
import static jdk.vm.ci.amd64.AMD64.rdi;
import static jdk.vm.ci.amd64.AMD64.rdx;
import static jdk.vm.ci.amd64.AMD64.rsi;
import static jdk.vm.ci.amd64.AMD64.rsp;
import static jdk.vm.ci.code.ValueUtil.asRegister;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.Value;
@Opcode("AMD64_STRING_COMPRESS")
public final class AMD64StringUTF16CompressOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64StringUTF16CompressOp> TYPE = LIRInstructionClass.create(AMD64StringUTF16CompressOp.class);

    @Def({REG}) private Value rres;
    @Use({REG}) private Value rsrc;
    @Use({REG}) private Value rdst;
    @Use({REG}) private Value rlen;

    @Temp({REG}) private Value rsrcTemp;
    @Temp({REG}) private Value rdstTemp;
    @Temp({REG}) private Value rlenTemp;

    @Temp({REG}) private Value vtmp1;
    @Temp({REG}) private Value vtmp2;
    @Temp({REG}) private Value vtmp3;
    @Temp({REG}) private Value vtmp4;
    @Temp({REG}) private Value rtmp5;

    public AMD64StringUTF16CompressOp(LIRGeneratorTool tool, Value res, Value src, Value dst, Value len) {
        super(TYPE);

        assert asRegister(src).equals(rsi);
        assert asRegister(dst).equals(rdi);
        assert asRegister(len).equals(rdx);
        assert asRegister(res).equals(rax);

        rres = res;
        rsrcTemp = rsrc = src;
        rdstTemp = rdst = dst;
        rlenTemp = rlen = len;

        LIRKind vkind = LIRKind.value(AMD64Kind.V512_BYTE);

        vtmp1 = tool.newVariable(vkind);
        vtmp2 = tool.newVariable(vkind);
        vtmp3 = tool.newVariable(vkind);
        vtmp4 = tool.newVariable(vkind);

        rtmp5 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        Register res = asRegister(rres);
        Register src = asRegister(rsrc);
        Register dst = asRegister(rdst);
        Register len = asRegister(rlen);

        Register tmp1 = asRegister(vtmp1);
        Register tmp2 = asRegister(vtmp2);
        Register tmp3 = asRegister(vtmp3);
        Register tmp4 = asRegister(vtmp4);
        Register tmp5 = asRegister(rtmp5);

        charArrayCompress(masm, src, dst, len, tmp1, tmp2, tmp3, tmp4, tmp5, res);
    }

    /**
     * Compress a UTF-16 string, which in practice contains only Latin-1 characters, into a byte
     * array representation (buffer). Compression fails as soon as a char above 0xff is
     * encountered.
     *
     * @param masm the assembler
     * @param src (rsi) the start address of the source char[] to be compressed
     * @param dst (rdi) the start address of the destination byte[] vector
     * @param len (rdx) the length (in chars)
     * @param tmp1 (xmm) temporary xmm register
     * @param tmp2 (xmm) temporary xmm register
     * @param tmp3 (xmm) temporary xmm register
     * @param tmp4 (xmm) temporary xmm register
     * @param tmp (gpr) temporary general-purpose register
     * @param res (rax) the result code (length on success, zero otherwise)
     */
    private static void charArrayCompress(AMD64MacroAssembler masm, Register src, Register dst, Register len, Register tmp1,
                    Register tmp2, Register tmp3, Register tmp4, Register tmp, Register res) {
        assert tmp1.getRegisterCategory().equals(AMD64.XMM);
        assert tmp2.getRegisterCategory().equals(AMD64.XMM);
        assert tmp3.getRegisterCategory().equals(AMD64.XMM);
        assert tmp4.getRegisterCategory().equals(AMD64.XMM);

        Label labelReturnLength = new Label();
        Label labelReturnZero = new Label();
        Label labelDone = new Label();
        Label labelBelowThreshold = new Label();

        assert len.number != res.number;

        masm.push(len);      // Save length for return.
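        // The saved length is popped as the result at labelReturnLength; on failure it is
        // discarded at labelReturnZero instead.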

        if (masm.supports(AMD64.CPUFeature.AVX512BW) &&
                        masm.supports(AMD64.CPUFeature.AVX512VL) &&
                        masm.supports(AMD64.CPUFeature.BMI2)) {

            Label labelRestoreK1ReturnZero = new Label();
            Label labelAvxPostAlignment = new Label();

            // If the length of the string is less than 32, we choose not to use the
            // AVX512 instructions.
            masm.testl(len, -32);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelBelowThreshold);

            // A char is compressible iff it is <= 0xff. Create the comparison mask
            // (broadcast 0x00ff into tmp2) used to test chars inside a (zmm) vector.
            masm.movl(res, 0x00ff);
            masm.evpbroadcastw(tmp2, res);

            masm.kmovq(k3, k1);      // Save k1

            masm.testl(len, -64);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvxPostAlignment);
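
            // Align dst to a 32-byte boundary: tmp = (-dst) & 31 is the number of chars to
            // compress before the main 32-char loop.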
            masm.movl(tmp, dst);
            masm.andl(tmp, (32 - 1));
            masm.negl(tmp);
            masm.andl(tmp, (32 - 1));

            // bail out when there is nothing to be done
            masm.testl(tmp, tmp);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvxPostAlignment);

            // Compute (1 << N) - 1 = ~(~0 << N), where N is the remaining number
            // of characters to process.
            masm.movl(res, -1);
            masm.shlxl(res, res, tmp);
            masm.notl(res);
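
            // Process those leading chars under mask k1: load them, verify that each one is
            // at most 0xff, and store the truncated bytes. Bail out if any char is too wide.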
            masm.kmovd(k1, res);
            masm.evmovdqu16(tmp1, k1, new AMD64Address(src));
            masm.evpcmpuw(k2, k1, tmp1, tmp2, 2 /* le */);
            masm.ktestd(k2, k1);
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            masm.evpmovwb(new AMD64Address(dst), k1, tmp1);
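
            // Advance src by tmp chars (2 * tmp bytes), dst by tmp bytes, and shrink the
            // remaining length accordingly.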
            masm.addq(src, tmp);
            masm.addq(src, tmp);
            masm.addq(dst, tmp);
            masm.subl(len, tmp);

            masm.bind(labelAvxPostAlignment);
            // end of alignment
            Label labelAvx512LoopTail = new Label();

            masm.movl(tmp, len);
            masm.andl(tmp, -32);         // The vector count (in chars).
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvx512LoopTail);
            masm.andl(len, 32 - 1);      // The tail count (in chars).
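
            // Point src and dst past the vectorized region and run the loop index tmp from
            // -(vector count) up to zero.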
            masm.leaq(src, new AMD64Address(src, tmp, AMD64Address.Scale.Times2));
            masm.leaq(dst, new AMD64Address(dst, tmp, AMD64Address.Scale.Times1));
            masm.negq(tmp);

            Label labelAvx512Loop = new Label();
            // Test and compress 32 chars per iteration, reading 512-bit vectors and
            // writing 256-bit compressed vectors.
            masm.bind(labelAvx512Loop);
            masm.evmovdqu16(tmp1, new AMD64Address(src, tmp, AMD64Address.Scale.Times2));
            masm.evpcmpuw(k2, tmp1, tmp2, 2 /* le */);
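            // kortestd sets the carry flag only if all 32 compare bits are set, i.e. every
            // char in the chunk is <= 0xff; a clear carry means compression must fail.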
            masm.kortestd(k2, k2);
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            // All 32 chars in the current vector (chunk) are valid for compression,
            // write truncated byte elements to memory.
            masm.evpmovwb(new AMD64Address(dst, tmp, AMD64Address.Scale.Times1), tmp1);
            masm.addq(tmp, 32);
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelAvx512Loop);

            masm.bind(labelAvx512LoopTail);
            masm.kmovq(k1, k3);      // Restore k1

            // All done if the tail count is zero.
            masm.testl(len, len);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelReturnLength);

            // Compute (1 << N) - 1 = ~(~0 << N), where N is the remaining number
            // of characters to process.
            masm.movl(res, -1);
            masm.shlxl(res, res, len);
            masm.notl(res);
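
            // Tail: handle the remaining (< 32) chars under mask k1, in the same way as the
            // pre-alignment step above.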
            masm.kmovd(k1, res);
            masm.evmovdqu16(tmp1, k1, new AMD64Address(src));
            masm.evpcmpuw(k2, k1, tmp1, tmp2, 2 /* le */);
            masm.ktestd(k2, k1);
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            masm.evpmovwb(new AMD64Address(dst), k1, tmp1);

            masm.kmovq(k1, k3);      // Restore k1
            masm.jmp(labelReturnLength);

            masm.bind(labelRestoreK1ReturnZero);
            masm.kmovq(k1, k3);      // Restore k1
            masm.jmp(labelReturnZero);
        }
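
        // SSE4.2 path: taken when AVX-512BW/VL is unavailable, and also reached via
        // labelBelowThreshold for strings shorter than 32 chars.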
        if (masm.supports(AMD64.CPUFeature.SSE4_2)) {

            Label labelSSETail = new Label();

            masm.bind(labelBelowThreshold);

            masm.movl(tmp, 0xff00ff00);  // Create mask to test for Unicode chars in vectors.

            masm.movl(res, len);
            masm.andl(res, -16);
            masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelSSETail);
            masm.andl(len, 16 - 1);

            // Compress 16 chars per iteration.
            masm.movdl(tmp1, tmp);
            masm.pshufd(tmp1, tmp1, 0);    // Store Unicode mask in 'vtmp1'.
            masm.pxor(tmp4, tmp4);
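            // tmp4 accumulates (ORs together) every vector loaded in the loop below, so the
            // ptest against the mask catches any char above 0xff.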

            masm.leaq(src, new AMD64Address(src, res, AMD64Address.Scale.Times2));
            masm.leaq(dst, new AMD64Address(dst, res, AMD64Address.Scale.Times1));
            masm.negq(res);

            Label lSSELoop = new Label();
            // Test and compress 16 chars per iteration, reading two 128-bit vectors and
            // writing one 128-bit vector of compressed bytes.
            masm.bind(lSSELoop);
            // Load the first 8 characters, then the next 8 characters.
            masm.movdqu(tmp2, new AMD64Address(src, res, AMD64Address.Scale.Times2));
            masm.movdqu(tmp3, new AMD64Address(src, res, AMD64Address.Scale.Times2, 16));
            masm.por(tmp4, tmp2);
            masm.por(tmp4, tmp3);
            masm.ptest(tmp4, tmp1);        // Check for Unicode chars in vector.
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);

            masm.packuswb(tmp2, tmp3);     // Only Latin-1 chars; compress each to a byte.
            masm.movdqu(new AMD64Address(dst, res, AMD64Address.Scale.Times1), tmp2);
            masm.addq(res, 16);
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, lSSELoop);
            Label labelCopyChars = new Label();
            // Test and compress another 8 chars before final tail copy.
            masm.bind(labelSSETail);
            masm.movl(res, len);
            masm.andl(res, -8);
            masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelCopyChars);
            masm.andl(len, 8 - 1);

            masm.movdl(tmp1, tmp);
            masm.pshufd(tmp1, tmp1, 0);    // Store Unicode mask in 'vtmp1'.
            masm.pxor(tmp3, tmp3);

            masm.movdqu(tmp2, new AMD64Address(src));
            masm.ptest(tmp2, tmp1);        // Check for Unicode chars in vector.
            masm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);
            masm.packuswb(tmp2, tmp3);     // Only Latin-1 chars; compress each to a byte.
            masm.movq(new AMD64Address(dst), tmp2);
            masm.addq(src, 16);
            masm.addq(dst, 8);

            masm.bind(labelCopyChars);
        }

        // Compress any remaining characters using a vanilla implementation.
        masm.testl(len, len);
        masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelReturnLength);
        masm.leaq(src, new AMD64Address(src, len, AMD64Address.Scale.Times2));
        masm.leaq(dst, new AMD64Address(dst, len, AMD64Address.Scale.Times1));
        masm.negq(len);

        Label labelCopyCharsLoop = new Label();
        // Compress a single character per iteration.
        masm.bind(labelCopyCharsLoop);
        masm.movzwl(res, new AMD64Address(src, len, AMD64Address.Scale.Times2));
        masm.testl(res, 0xff00);     // Check if Unicode character.
        masm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);
        // A Latin-1 character; compress to a byte.
        masm.movb(new AMD64Address(dst, len, AMD64Address.Scale.Times1), res);
        masm.incrementq(len, 1);
        masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelCopyCharsLoop);

        // If compression succeeded, return the length.
        masm.bind(labelReturnLength);
        masm.pop(res);
        masm.jmpb(labelDone);

        // If compression failed, return 0.
        masm.bind(labelReturnZero);
        masm.xorl(res, res);
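        // Drop the length that was pushed on entry without restoring it.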
        masm.addq(rsp, 8 /* wordSize */);

        masm.bind(labelDone);
    }

}