1 /*
   2  * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 
  25 package org.graalvm.compiler.lir.amd64;
  26 
  27 import static jdk.vm.ci.amd64.AMD64.k1;
  28 import static jdk.vm.ci.amd64.AMD64.k2;
  29 import static jdk.vm.ci.amd64.AMD64.k3;
  30 import static jdk.vm.ci.amd64.AMD64.rax;
  31 import static jdk.vm.ci.amd64.AMD64.rdi;
  32 import static jdk.vm.ci.amd64.AMD64.rdx;
  33 import static jdk.vm.ci.amd64.AMD64.rsi;
  34 
  35 import static jdk.vm.ci.amd64.AMD64.rsp;
  36 import static jdk.vm.ci.code.ValueUtil.asRegister;
  37 import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
  38 
  39 import jdk.vm.ci.amd64.AMD64;
  40 import org.graalvm.compiler.asm.Label;
  41 import org.graalvm.compiler.asm.amd64.AMD64Address;
  42 import org.graalvm.compiler.asm.amd64.AMD64Assembler;
  43 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
  44 import org.graalvm.compiler.core.common.LIRKind;
  45 import org.graalvm.compiler.lir.LIRInstructionClass;
  46 import org.graalvm.compiler.lir.Opcode;
  47 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
  48 import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
  49 
  50 import jdk.vm.ci.amd64.AMD64Kind;
  51 import jdk.vm.ci.code.Register;
  52 import jdk.vm.ci.meta.Value;
  53 
@Opcode("AMD64_STRING_COMPRESS")
public final class AMD64StringUTF16CompressOp extends AMD64LIRInstruction {
    public static final LIRInstructionClass<AMD64StringUTF16CompressOp> TYPE = LIRInstructionClass.create(AMD64StringUTF16CompressOp.class);

    // Result (rax): the original length on success, zero if any char was > 0xff.
    @Def({REG}) private Value rres;
    // Base address of the source char[] — pinned to rsi (asserted in the constructor).
    @Alive({REG}) private Value rsrc;
    // Base address of the destination byte[] — pinned to rdi.
    @Alive({REG}) private Value rdst;
    // Number of chars to compress — pinned to rdx.
    @Alive({REG}) private Value rlen;

    // Vector temporaries (allocated as 512-bit so the AVX512 path can use them as zmm;
    // the SSE path only touches their low 128 bits).
    @Temp({REG}) private Value vtmp1;
    @Temp({REG}) private Value vtmp2;
    @Temp({REG}) private Value vtmp3;
    @Temp({REG}) private Value vtmp4;
    // General-purpose scratch register.
    @Temp({REG}) private Value rtmp5;

    /**
     * Creates the compress op. The fixed-register constraints (rsi/rdi/rdx/rax) mirror the
     * HotSpot stub calling convention for this intrinsic and are enforced by assertions.
     */
    public AMD64StringUTF16CompressOp(LIRGeneratorTool tool, Value res, Value src, Value dst, Value len) {
        super(TYPE);

        assert asRegister(src).equals(rsi);
        assert asRegister(dst).equals(rdi);
        assert asRegister(len).equals(rdx);
        assert asRegister(res).equals(rax);

        rres = res;
        rsrc = src;
        rdst = dst;
        rlen = len;

        LIRKind vkind = LIRKind.value(AMD64Kind.V512_BYTE);

        vtmp1 = tool.newVariable(vkind);
        vtmp2 = tool.newVariable(vkind);
        vtmp3 = tool.newVariable(vkind);
        vtmp4 = tool.newVariable(vkind);

        rtmp5 = tool.newVariable(LIRKind.value(AMD64Kind.DWORD));
    }

    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        // Unpack the LIR values into concrete registers and delegate to the stub emitter.
        Register res = asRegister(rres);
        Register src = asRegister(rsrc);
        Register dst = asRegister(rdst);
        Register len = asRegister(rlen);

        Register tmp1 = asRegister(vtmp1);
        Register tmp2 = asRegister(vtmp2);
        Register tmp3 = asRegister(vtmp3);
        Register tmp4 = asRegister(vtmp4);
        Register tmp5 = asRegister(rtmp5);

        charArrayCompress(masm, src, dst, len, tmp1, tmp2, tmp3, tmp4, tmp5, res);
    }

    /**
     * Compress a UTF16 string which de facto is a Latin1 string into a byte array representation
     * (buffer).
     *
     * <p>Emits up to three code paths selected by CPU features: an AVX512 (BW+VL+BMI2) path
     * processing 32 chars per iteration, an SSE4.2 path processing 16 (then 8) chars per
     * iteration, and a scalar tail loop. Compression aborts with a zero result as soon as any
     * char above 0xff is encountered.
     *
     * @param masm the assembler
     * @param src (rsi) the start address of source char[] to be compressed
     * @param dst (rdi) the start address of destination byte[] vector
     * @param len (rdx) the length
     * @param tmp1 (xmm) temporary xmm register
     * @param tmp2 (xmm) temporary xmm register
     * @param tmp3 (xmm) temporary xmm register
     * @param tmp4 (xmm) temporary xmm register
     * @param tmp (gpr) temporary gpr register
     * @param res (rax) the result code (length on success, zero otherwise)
     */
    private static void charArrayCompress(AMD64MacroAssembler masm, Register src, Register dst, Register len, Register tmp1,
                    Register tmp2, Register tmp3, Register tmp4, Register tmp, Register res) {
        assert tmp1.getRegisterCategory().equals(AMD64.XMM);
        assert tmp2.getRegisterCategory().equals(AMD64.XMM);
        assert tmp3.getRegisterCategory().equals(AMD64.XMM);
        assert tmp4.getRegisterCategory().equals(AMD64.XMM);

        Label labelReturnLength = new Label();
        Label labelReturnZero = new Label();
        Label labelDone = new Label();
        Label labelBelowThreshold = new Label();

        assert len.number != res.number;

        masm.push(len);      // Save length for return.
                             // NOTE: every exit path must rebalance this push — either via
                             // pop(res) (success) or addq(rsp, 8) (failure).

        if (masm.supports(AMD64.CPUFeature.AVX512BW) &&
                        masm.supports(AMD64.CPUFeature.AVX512VL) &&
                        masm.supports(AMD64.CPUFeature.BMI2)) {

            Label labelRestoreK1ReturnZero = new Label();
            Label labelAvxPostAlignment = new Label();

            // If the length of the string is less than 32, we chose not to use the
            // AVX512 instructions.
            masm.testl(len, -32);   // Any bit >= 2^5 set, i.e. len >= 32?
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelBelowThreshold);

            // First check whether a character is compressible (<= 0xff).
            // Create mask to test for Unicode chars inside (zmm) vector.
            masm.movl(res, 0x00ff);
            masm.evpbroadcastw(tmp2, res);   // tmp2 = 0x00ff replicated into every word lane.

            masm.kmovq(k3, k1);      // Save k1

            // If len < 64 there is no point aligning dst; go straight to the main loop setup.
            masm.testl(len, -64);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvxPostAlignment);

            // Compute how many chars to process to bring dst up to a 32-byte boundary:
            // tmp = (-dst) & 31.
            masm.movl(tmp, dst);
            masm.andl(tmp, (32 - 1));
            masm.negl(tmp);
            masm.andl(tmp, (32 - 1));

            // bail out when there is nothing to be done
            masm.testl(tmp, tmp);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvxPostAlignment);

            // Compute (1 << N) - 1 = ~(~0 << N), where N is the remaining number
            // of characters to process.
            masm.movl(res, -1);
            masm.shlxl(res, res, tmp);
            masm.notl(res);

            // Masked pre-loop: load tmp chars under mask k1, verify all are <= 0xff,
            // then store the truncated bytes.
            masm.kmovd(k1, res);
            masm.evmovdqu16(tmp1, k1, new AMD64Address(src));
            masm.evpcmpuw(k2, k1, tmp1, tmp2, 2 /* le */);
            masm.ktestd(k2, k1);    // Carry set iff k2 covers all of k1 (every char compressible).
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            masm.evpmovwb(new AMD64Address(dst), k1, tmp1);

            // Advance: src by 2*tmp bytes (chars), dst by tmp bytes; tmp chars consumed.
            masm.addq(src, tmp);
            masm.addq(src, tmp);
            masm.addq(dst, tmp);
            masm.subl(len, tmp);

            masm.bind(labelAvxPostAlignment);
            // end of alignment
            Label labelAvx512LoopTail = new Label();

            masm.movl(tmp, len);
            masm.andl(tmp, -32);         // The vector count (in chars).
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelAvx512LoopTail);
            masm.andl(len, 32 - 1);      // The tail count (in chars).

            // Point src/dst past the vectorized region and run tmp from -count up to 0,
            // so the loop-termination test is a plain flags check on addq.
            masm.leaq(src, new AMD64Address(src, tmp, AMD64Address.Scale.Times2));
            masm.leaq(dst, new AMD64Address(dst, tmp, AMD64Address.Scale.Times1));
            masm.negq(tmp);

            Label labelAvx512Loop = new Label();
            // Test and compress 32 chars per iteration, reading 512-bit vectors and
            // writing 256-bit compressed ditto.
            masm.bind(labelAvx512Loop);
            masm.evmovdqu16(tmp1, new AMD64Address(src, tmp, AMD64Address.Scale.Times2));
            masm.evpcmpuw(k2, tmp1, tmp2, 2 /* le */);
            masm.kortestd(k2, k2);  // Carry set iff k2 is all-ones (all 32 chars <= 0xff).
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            // All 32 chars in the current vector (chunk) are valid for compression,
            // write truncated byte elements to memory.
            masm.evpmovwb(new AMD64Address(dst, tmp, AMD64Address.Scale.Times1), tmp1);
            masm.addq(tmp, 32);
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelAvx512Loop);

            masm.bind(labelAvx512LoopTail);
            masm.kmovq(k1, k3);      // Restore k1

            // All done if the tail count is zero.
            masm.testl(len, len);
            masm.jcc(AMD64Assembler.ConditionFlag.Zero, labelReturnLength);

            // Compute (1 << N) - 1 = ~(~0 << N), where N is the remaining number
            // of characters to process.
            masm.movl(res, -1);
            masm.shlxl(res, res, len);
            masm.notl(res);

            // Masked tail: same pattern as the pre-loop, for the final len (< 32) chars.
            masm.kmovd(k1, res);
            masm.evmovdqu16(tmp1, k1, new AMD64Address(src));
            masm.evpcmpuw(k2, k1, tmp1, tmp2, 2 /* le */);
            masm.ktestd(k2, k1);
            masm.jcc(AMD64Assembler.ConditionFlag.CarryClear, labelRestoreK1ReturnZero);

            masm.evpmovwb(new AMD64Address(dst), k1, tmp1);

            masm.kmovq(k1, k3);      // Restore k1
            masm.jmp(labelReturnLength);

            // Failure exit for the AVX512 path: k1 must be restored before bailing out.
            masm.bind(labelRestoreK1ReturnZero);
            masm.kmovq(k1, k3);      // Restore k1
            masm.jmp(labelReturnZero);
        }

        if (masm.supports(AMD64.CPUFeature.SSE4_2)) {

            Label labelSSETail = new Label();

            // Entry point for strings too short for the AVX512 path.
            masm.bind(labelBelowThreshold);

            masm.movl(tmp, 0xff00ff00);  // Create mask to test for Unicode chars in vectors.

            masm.movl(res, len);
            masm.andl(res, -16);         // Vector count (chars, multiple of 16).
            masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelSSETail);
            masm.andl(len, 16 - 1);      // Tail count (in chars).

            // Compress 16 chars per iteration.
            masm.movdl(tmp1, tmp);
            masm.pshufd(tmp1, tmp1, 0);    // Store Unicode mask in 'vtmp1'.
            masm.pxor(tmp4, tmp4);         // tmp4 accumulates high bytes seen so far.

            // Same negative-index trick as the AVX512 loop: res runs from -count up to 0.
            masm.leaq(src, new AMD64Address(src, res, AMD64Address.Scale.Times2));
            masm.leaq(dst, new AMD64Address(dst, res, AMD64Address.Scale.Times1));
            masm.negq(res);

            Label lSSELoop = new Label();
            // Test and compress 16 chars per iteration, reading 128-bit vectors and
            // writing 64-bit compressed ditto.
            masm.bind(lSSELoop);
            masm.movdqu(tmp2, new AMD64Address(src, res, AMD64Address.Scale.Times2));     // Load 1st 8 characters.
            masm.movdqu(tmp3, new AMD64Address(src, res, AMD64Address.Scale.Times2, 16)); // Load next 8 characters.
            masm.por(tmp4, tmp2);
            masm.por(tmp4, tmp3);
            masm.ptest(tmp4, tmp1);        // Check for Unicode chars in vector.
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);

            masm.packuswb(tmp2, tmp3);     // Only ASCII chars; compress each to a byte.
            masm.movdqu(new AMD64Address(dst, res, AMD64Address.Scale.Times1), tmp2);
            masm.addq(res, 16);
            masm.jcc(AMD64Assembler.ConditionFlag.NotZero, lSSELoop);

            Label labelCopyChars = new Label();
            // Test and compress another 8 chars before final tail copy.
            masm.bind(labelSSETail);
            masm.movl(res, len);
            masm.andl(res, -8);
            masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelCopyChars);
            masm.andl(len, 8 - 1);

            masm.movdl(tmp1, tmp);
            masm.pshufd(tmp1, tmp1, 0);    // Store Unicode mask in 'vtmp1'.
            masm.pxor(tmp3, tmp3);         // Zero the high half fed into packuswb.

            masm.movdqu(tmp2, new AMD64Address(src));
            masm.ptest(tmp2, tmp1);        // Check for Unicode chars in vector.
            masm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);
            masm.packuswb(tmp2, tmp3);     // Only ASCII chars; compress each to a byte.
            masm.movq(new AMD64Address(dst), tmp2);
            masm.addq(src, 16);            // 8 chars consumed = 16 source bytes.
            masm.addq(dst, 8);

            masm.bind(labelCopyChars);
        }

        // Compress any remaining characters using a vanilla implementation.
        masm.testl(len, len);
        masm.jccb(AMD64Assembler.ConditionFlag.Zero, labelReturnLength);
        masm.leaq(src, new AMD64Address(src, len, AMD64Address.Scale.Times2));
        masm.leaq(dst, new AMD64Address(dst, len, AMD64Address.Scale.Times1));
        masm.negq(len);

        Label labelCopyCharsLoop = new Label();
        // Compress a single character per iteration.
        masm.bind(labelCopyCharsLoop);
        masm.movzwl(res, new AMD64Address(src, len, AMD64Address.Scale.Times2));
        masm.testl(res, 0xff00);     // Check if Unicode character.
        masm.jccb(AMD64Assembler.ConditionFlag.NotZero, labelReturnZero);
        // An ASCII character; compress to a byte.
        masm.movb(new AMD64Address(dst, len, AMD64Address.Scale.Times1), res);
        masm.incrementq(len, 1);
        masm.jcc(AMD64Assembler.ConditionFlag.NotZero, labelCopyCharsLoop);

        // If compression succeeded, return the length.
        masm.bind(labelReturnLength);
        masm.pop(res);               // res = length saved by push(len) at entry.
        masm.jmpb(labelDone);

        // If compression failed, return 0.
        masm.bind(labelReturnZero);
        masm.xorl(res, res);
        masm.addq(rsp, 8 /* wordSize */);   // Discard the saved length to rebalance the stack.

        masm.bind(labelDone);
    }

}