--- old/src/share/vm/opto/superword.cpp 2015-05-12 15:37:04.762829714 +0200 +++ new/src/share/vm/opto/superword.cpp 2015-05-12 15:37:04.678829718 +0200 @@ -232,6 +232,13 @@ // if unaligned memory access is not allowed because number of // iterations in pre-loop will be not enough to align it. create_pack = false; + } else { + SWPointer p2(best_align_to_mem_ref, this); + if (align_to_ref_p.invar() != p2.invar()) { + // Do not vectorize memory accesses with different invariants + // if unaligned memory accesses are not allowed. + create_pack = false; + } } } } else { @@ -450,24 +457,50 @@ if (ABS(span) == p.memory_size()) return true; - // If initial offset from start of object is computable, - // compute alignment within the vector. + // If the initial offset from start of the object is computable, + // check if the pre-loop can align the final offset accordingly. + // + // In other words: Can we find an i such that the offset + // after i pre-loop iterations is aligned to vw? + // (init_offset + pre_loop) % vw == 0 (1) + // where + // pre_loop = i * span + // is the number of bytes added to the offset by i pre-loop iterations. + // + // For this to hold we need pre_loop to increase init_offset by + // pre_loop = vw - (init_offset % vw) + // + // This is only possible if pre_loop is divisible by span because each + // pre-loop iteration increases the initial offset by 'span' bytes: + // (vw - (init_offset % vw)) % span == 0 + // int vw = vector_width_in_bytes(p.mem()); assert(vw > 1, "sanity"); - if (vw % span == 0) { - Node* init_nd = pre_end->init_trip(); - if (init_nd->is_Con() && p.invar() == NULL) { - int init = init_nd->bottom_type()->is_int()->get_con(); - - int init_offset = init * p.scale_in_bytes() + p.offset_in_bytes(); - assert(init_offset >= 0, "positive offset from object start"); - + Node* init_nd = pre_end->init_trip(); + if (init_nd->is_Con() && p.invar() == NULL) { + int init = init_nd->bottom_type()->is_int()->get_con(); + int init_offset = init * p.scale_in_bytes() + p.offset_in_bytes(); + assert(init_offset >= 0, "positive offset from object start"); + if (vw % span == 0) { + // If vm is a multiple of span, we use formula (1). if (span > 0) { return (vw - (init_offset % vw)) % span == 0; } else { assert(span < 0, "nonzero stride * scale"); return (init_offset % vw) % -span == 0; } + } else if (span % vw == 0) { + // If span is a multiple of vw, we can simplify formula (1) to: + // (init_offset + i * span) % vw == 0 + // => + // (init_offset % vw) + ((i * span) % vw) == 0 + // => + // init_offset % vw == 0 + // + // Because we add a multiple of vw to the initial offset, the final + // offset is a multiple of vw if and only if init_offset is a multiple. + // + return (init_offset % vw) == 0; } } return false; @@ -479,17 +512,23 @@ SWPointer align_to_ref_p(mem_ref, this); int offset = align_to_ref_p.offset_in_bytes(); int scale = align_to_ref_p.scale_in_bytes(); + int elt_size = align_to_ref_p.memory_size(); int vw = vector_width_in_bytes(mem_ref); assert(vw > 1, "sanity"); - int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; - // At least one iteration is executed in pre-loop by default. As result - // several iterations are needed to align memory operations in main-loop even - // if offset is 0. - int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); - int elt_size = align_to_ref_p.memory_size(); - assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), - err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); - int iv_adjustment = iv_adjustment_in_bytes/elt_size; + int iv_adjustment; + if (scale != 0) { + int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1; + // At least one iteration is executed in pre-loop by default. As result + // several iterations are needed to align memory operations in main-loop even + // if offset is 0. + int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw)); + assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0), + err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size)); + iv_adjustment = iv_adjustment_in_bytes/elt_size; + } else { + // This memory op is not dependent on iv (scale == 0) + iv_adjustment = 0; + } #ifndef PRODUCT if (TraceSuperWord) --- old/src/share/vm/opto/superword.hpp 2015-05-12 15:37:05.086829699 +0200 +++ new/src/share/vm/opto/superword.hpp 2015-05-12 15:37:05.010829703 +0200 @@ -41,7 +41,7 @@ // Exploiting SuperWord Level Parallelism with // Multimedia Instruction Sets // by -// Samuel Larsen and Saman Amarasighe +// Samuel Larsen and Saman Amarasinghe // MIT Laboratory for Computer Science // date // May 2000 @@ -432,7 +432,7 @@ Node* _base; // NULL if unsafe nonheap reference Node* _adr; // address pointer - jint _scale; // multipler for iv (in bytes), 0 if no loop iv + jint _scale; // multiplier for iv (in bytes), 0 if no loop iv jint _offset; // constant offset (in bytes) Node* _invar; // invariant offset (in bytes), NULL if none bool _negate_invar; // if true then use: (0 - _invar) --- /dev/null 2015-05-12 07:27:52.428998979 +0200 +++ new/test/compiler/loopopts/superword/TestVectorizationWithInvariant.java 2015-05-12 15:37:05.310829689 +0200 @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +import com.oracle.java.testlibrary.*; +import sun.misc.Unsafe; + +/** + * @test + * @bug 8078497 + * @summary Tests correct alignment of vectors with loop invariant offset. + * @library /testlibrary + * @run main TestVectorizationWithInvariant + */ +public class TestVectorizationWithInvariant { + + private static Unsafe unsafe; + private static final long BYTE_ARRAY_OFFSET; + private static final long CHAR_ARRAY_OFFSET; + + static { + unsafe = Utils.getUnsafe(); + BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class); + CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class); + } + + public static void main(String[] args) throws Exception { + byte[] byte_array1 = new byte[1000]; + byte[] byte_array2 = new byte[1000]; + char[] char_array = new char[1000]; + + for (int i = 0; i < 20_000; ++i) { + copyByteToChar(byte_array1, byte_array2, char_array, 1); + copyCharToByte(char_array, byte_array1, 1); + copyCharToByteAligned(char_array, byte_array1); + copyCharToByteUnaligned(char_array, byte_array1); + } + } + + /* + * Copy multiple consecutive chars from a byte array to a given offset in a char array + * to trigger C2's superword optimization. The offset in the byte array is independent + * of the loop induction variable and can be set to an arbitrary value. It may then not + * be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization + * should only be done in this case if unaligned memory accesses are allowed. + */ + public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) { + off = (int) BYTE_ARRAY_OFFSET + (off << 1); + byte[] src = src1; + for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) { + // Copy 8 chars from src to dst + unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0)); + unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2)); + unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4)); + unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6)); + unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8)); + unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10)); + unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12)); + unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14)); + + // Prevent loop invariant code motion of char read. + src = (src == src1) ? src2 : src1; + } + } + + /* + * Copy multiple consecutive chars from a char array to a given offset in a byte array + * to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'. + */ + public static void copyCharToByte(char[] src, byte[] dst, int off) { + off = (int) BYTE_ARRAY_OFFSET + (off << 1); + for (int i = 0; i < 100; i = i + 8) { + // Copy 8 chars from src to dst + unsafe.putChar(dst, off + 0, src[i + 0]); + unsafe.putChar(dst, off + 2, src[i + 1]); + unsafe.putChar(dst, off + 4, src[i + 2]); + unsafe.putChar(dst, off + 6, src[i + 3]); + unsafe.putChar(dst, off + 8, src[i + 4]); + unsafe.putChar(dst, off + 10, src[i + 5]); + unsafe.putChar(dst, off + 12, src[i + 6]); + unsafe.putChar(dst, off + 14, src[i + 7]); + } + } + + /* + * Variant of copyCharToByte with a constant destination array offset. + * The loop should always be vectorized because both the LoadUS and StoreC + * operations can be aligned. + */ + public static void copyCharToByteAligned(char[] src, byte[] dst) { + final int off = (int) BYTE_ARRAY_OFFSET; + for (int i = 8; i < 100; i = i + 8) { + // Copy 8 chars from src to dst + unsafe.putChar(dst, off + 0, src[i + 0]); + unsafe.putChar(dst, off + 2, src[i + 1]); + unsafe.putChar(dst, off + 4, src[i + 2]); + unsafe.putChar(dst, off + 6, src[i + 3]); + unsafe.putChar(dst, off + 8, src[i + 4]); + unsafe.putChar(dst, off + 10, src[i + 5]); + unsafe.putChar(dst, off + 12, src[i + 6]); + unsafe.putChar(dst, off + 14, src[i + 7]); + } + } + + /* + * Variant of copyCharToByte with a constant destination array offset. The + * loop should only be vectorized if unaligned memory operations are allowed + * because not both the LoadUS and the StoreC can be aligned. + */ + public static void copyCharToByteUnaligned(char[] src, byte[] dst) { + final int off = (int) BYTE_ARRAY_OFFSET + 2; + for (int i = 0; i < 100; i = i + 8) { + // Copy 8 chars from src to dst + unsafe.putChar(dst, off + 0, src[i + 0]); + unsafe.putChar(dst, off + 2, src[i + 1]); + unsafe.putChar(dst, off + 4, src[i + 2]); + unsafe.putChar(dst, off + 6, src[i + 3]); + unsafe.putChar(dst, off + 8, src[i + 4]); + unsafe.putChar(dst, off + 10, src[i + 5]); + unsafe.putChar(dst, off + 12, src[i + 6]); + unsafe.putChar(dst, off + 14, src[i + 7]); + } + } +}