--- old/common/autoconf/flags.m4	2016-12-02 11:14:25.324327569 -0500
+++ new/common/autoconf/flags.m4	2016-12-02 11:14:23.720236604 -0500
@@ -23,6 +23,101 @@
 # questions.
 #
 
+################################################################################
+# Setup ABI profile (for arm/aarch64). Handles --with-abi-profile: maps the
+# profile to arch and float flags appended to EXTRA_CFLAGS, EXTRA_CXXFLAGS and
+# EXTRA_LDFLAGS, sets JDK_ARCH_ABI_PROP_NAME and the bundle platform name.
+AC_DEFUN([FLAGS_SETUP_ABI_PROFILE],
+[
+  AC_ARG_WITH(abi-profile, [AS_HELP_STRING([--with-abi-profile],
+      [specify ABI profile for ARM builds (arm-vfp-sflt,arm-vfp-hflt,arm-sflt, armv5-vfp-sflt,armv6-vfp-hflt,arm64,aarch64) @<:@toolchain dependent@:>@ ])])
+
+  if test "x$with_abi_profile" != x; then
+    if test "x$OPENJDK_TARGET_CPU" != xarm && \
+        test "x$OPENJDK_TARGET_CPU" != xaarch64; then
+      AC_MSG_ERROR([--with-abi-profile only available on arm/aarch64])
+    fi
+
+    OPENJDK_TARGET_ABI_PROFILE=$with_abi_profile
+    AC_MSG_CHECKING([for ABI profile])
+    AC_MSG_RESULT([$OPENJDK_TARGET_ABI_PROFILE])
+
+    if test "x$OPENJDK_TARGET_ABI_PROFILE" = xarm-vfp-sflt; then
+      ARM_FLOAT_TYPE=vfp-sflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv7-a -mthumb'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarm-vfp-hflt; then
+      ARM_FLOAT_TYPE=vfp-hflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv7-a -mthumb'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarm-sflt; then
+      ARM_FLOAT_TYPE=sflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv5t -marm'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarmv5-vfp-sflt; then
+      ARM_FLOAT_TYPE=vfp-sflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv5t -marm'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarmv6-vfp-hflt; then
+      ARM_FLOAT_TYPE=vfp-hflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv6 -marm'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarm64; then
+      # No special flags, just need to trigger setting JDK_ARCH_ABI_PROP_NAME
+      ARM_FLOAT_TYPE=
+      ARM_ARCH_TYPE_FLAGS=
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xaarch64; then
+      # No special flags, just need to trigger setting JDK_ARCH_ABI_PROP_NAME
+      ARM_FLOAT_TYPE=
+      ARM_ARCH_TYPE_FLAGS=
+    else
+      AC_MSG_ERROR([Invalid ABI profile: "$OPENJDK_TARGET_ABI_PROFILE"])
+    fi
+
+    if test "x$ARM_FLOAT_TYPE" = xvfp-sflt; then
+      ARM_FLOAT_TYPE_FLAGS='-mfloat-abi=softfp -mfpu=vfp -DFLOAT_ARCH=-vfp-sflt'
+    elif test "x$ARM_FLOAT_TYPE" = xvfp-hflt; then
+      ARM_FLOAT_TYPE_FLAGS='-mfloat-abi=hard -mfpu=vfp -DFLOAT_ARCH=-vfp-hflt'
+    elif test "x$ARM_FLOAT_TYPE" = xsflt; then
+      ARM_FLOAT_TYPE_FLAGS='-msoft-float -mfpu=vfp'
+    fi
+    AC_MSG_CHECKING([for $ARM_FLOAT_TYPE floating point flags])
+    AC_MSG_RESULT([$ARM_FLOAT_TYPE_FLAGS])
+
+    AC_MSG_CHECKING([for arch type flags])
+    AC_MSG_RESULT([$ARM_ARCH_TYPE_FLAGS])
+
+    # Now set JDK_ARCH_ABI_PROP_NAME. This is equivalent to the last part of the
+    # autoconf target triplet.
+    [ JDK_ARCH_ABI_PROP_NAME=`$ECHO $OPENJDK_TARGET_AUTOCONF_NAME | $SED -e 's/.*-\([^-]*\)$/\1/'` ]
+    # Sanity check that it is a known ABI.
+    if test "x$JDK_ARCH_ABI_PROP_NAME" != xgnu && \
+        test "x$JDK_ARCH_ABI_PROP_NAME" != xgnueabi  && \
+        test "x$JDK_ARCH_ABI_PROP_NAME" != xgnueabihf; then
+          AC_MSG_WARN([Unknown autoconf target triplet ABI: "$JDK_ARCH_ABI_PROP_NAME"])
+    fi
+    AC_MSG_CHECKING([for ABI property name])
+    AC_MSG_RESULT([$JDK_ARCH_ABI_PROP_NAME])
+    AC_SUBST(JDK_ARCH_ABI_PROP_NAME)
+
+    # Pass these on to the open part of configure as if they were set using
+    # --with-extra-c[xx]flags.
+    EXTRA_CFLAGS="$EXTRA_CFLAGS $ARM_ARCH_TYPE_FLAGS $ARM_FLOAT_TYPE_FLAGS"
+    EXTRA_CXXFLAGS="$EXTRA_CXXFLAGS $ARM_ARCH_TYPE_FLAGS $ARM_FLOAT_TYPE_FLAGS"
+    # Get rid of annoying "note: the mangling of 'va_list' has changed in GCC 4.4"
+    # FIXME: This should not really be set using extra_cflags.
+    if test "x$OPENJDK_TARGET_CPU" = xarm; then
+        EXTRA_CFLAGS="$EXTRA_CFLAGS -Wno-psabi"
+        EXTRA_CXXFLAGS="$EXTRA_CXXFLAGS -Wno-psabi"
+    fi
+    # Also add JDK_ARCH_ABI_PROP_NAME define, but only to CFLAGS.
+    EXTRA_CFLAGS="$EXTRA_CFLAGS -DJDK_ARCH_ABI_PROP_NAME='\"\$(JDK_ARCH_ABI_PROP_NAME)\"'"
+    # And pass the architecture flags to the linker as well
+    EXTRA_LDFLAGS="$EXTRA_LDFLAGS $ARM_ARCH_TYPE_FLAGS $ARM_FLOAT_TYPE_FLAGS"
+  fi
+
+  # When building with an abi profile, the name of that profile is appended on the
+  # bundle platform, which is used in bundle names.
+  if test "x$OPENJDK_TARGET_ABI_PROFILE" != x; then
+    OPENJDK_TARGET_BUNDLE_PLATFORM="$OPENJDK_TARGET_OS_BUNDLE-$OPENJDK_TARGET_ABI_PROFILE"
+  fi
+])
+
 # Reset the global CFLAGS/LDFLAGS variables and initialize them with the
 # corresponding configure arguments instead
 AC_DEFUN_ONCE([FLAGS_SETUP_USER_SUPPLIED_FLAGS],
@@ -306,9 +401,17 @@
       PICFLAG='-fPIC'
       SHARED_LIBRARY_FLAGS='-shared'
       SET_EXECUTABLE_ORIGIN='-Wl,-rpath,\$$ORIGIN[$]1'
-      SET_SHARED_LIBRARY_ORIGIN="-Wl,-z,origin $SET_EXECUTABLE_ORIGIN"
       SET_SHARED_LIBRARY_NAME='-Wl,-soname=[$]1'
       SET_SHARED_LIBRARY_MAPFILE='-Wl,-version-script=[$]1'
+
+      # arm specific settings
+      if test "x$OPENJDK_TARGET_CPU_ARCH" = "xarm"; then
+        # '-Wl,-z,origin' isn't used on arm.
+        SET_SHARED_LIBRARY_ORIGIN='-Wl,-rpath,\$$$$ORIGIN[$]1'
+      else
+        SET_SHARED_LIBRARY_ORIGIN="-Wl,-z,origin $SET_EXECUTABLE_ORIGIN"
+      fi
+
     fi
   elif test "x$TOOLCHAIN_TYPE" = xsolstudio; then
     PICFLAG="-KPIC"
@@ -665,6 +768,7 @@
 AC_DEFUN([FLAGS_SETUP_COMPILER_FLAGS_FOR_JDK],
 [
 
+  FLAGS_SETUP_ABI_PROFILE
   FLAGS_SETUP_COMPILER_FLAGS_FOR_JDK_HELPER([TARGET])
   FLAGS_SETUP_COMPILER_FLAGS_FOR_JDK_HELPER([BUILD], [OPENJDK_BUILD_])
 
@@ -754,6 +858,7 @@
       arm )
         # on arm we don't prevent gcc to omit frame pointer but do prevent strict aliasing
         $2CFLAGS_JDK="${$2CFLAGS_JDK} -fno-strict-aliasing"
+        $2COMMON_CCXXFLAGS_JDK="${$2COMMON_CCXXFLAGS_JDK} -fsigned-char"
         ;;
       ppc )
         # on ppc we don't prevent gcc to omit frame pointer but do prevent strict aliasing
--- old/common/autoconf/generated-configure.sh	2016-12-02 11:14:33.332781714 -0500
+++ new/common/autoconf/generated-configure.sh	2016-12-02 11:14:31.620684624 -0500
@@ -749,6 +749,7 @@
 CFLAGS_JDKLIB
 MACOSX_VERSION_MIN
 CXXSTD_CXXFLAG
+JDK_ARCH_ABI_PROP_NAME
 CXX_O_FLAG_SIZE
 CXX_O_FLAG_NONE
 CXX_O_FLAG_DEBUG
@@ -1139,6 +1140,7 @@
 enable_debug
 with_debug_level
 with_jvm_variants
+with_cpu_port
 with_devkit
 with_sys_root
 with_sysroot
@@ -1188,6 +1190,7 @@
 with_toolchain_version
 with_build_devkit
 with_jtreg
+with_abi_profile
 enable_warnings_as_errors
 with_native_debug_symbols
 enable_debug_symbols
@@ -2030,6 +2033,8 @@
   --with-jvm-variants     JVM variants (separated by commas) to build
                           (server,client,minimal,core,zero,zeroshark,custom)
                           [server]
+  --with-cpu-port         specify sources to use for Hotspot 64-bit ARM port
+                          (arm64,aarch64) [aarch64]
   --with-devkit           use this devkit for compilers, tools and resources
   --with-sys-root         alias for --with-sysroot for backwards compatability
   --with-sysroot          use this directory as sysroot
@@ -2112,6 +2117,10 @@
                           dependent]
   --with-build-devkit     Devkit to use for the build platform toolchain
   --with-jtreg            Regression Test Harness [probed]
+  --with-abi-profile      specify ABI profile for ARM builds
+                          (arm-vfp-sflt,arm-vfp-hflt,arm-sflt,
+                          armv5-vfp-sflt,armv6-vfp-hflt,arm64,aarch64)
+                          [toolchain dependent]
   --with-native-debug-symbols
                           set the native debug symbol configuration (none,
                           internal, external, zipped) [varying]
@@ -3996,6 +4005,12 @@
 # questions.
 #
 
+################################################################################
+#
+# Setup ABI profile (for arm)
+#
+
+
 # Reset the global CFLAGS/LDFLAGS variables and initialize them with the
 # corresponding configure arguments instead
 
@@ -4252,7 +4267,8 @@
 
 # All valid JVM features, regardless of platform
 VALID_JVM_FEATURES="compiler1 compiler2 zero shark minimal dtrace jvmti jvmci \
-    fprof vm-structs jni-check services management all-gcs nmt cds static-build"
+    fprof vm-structs jni-check services management all-gcs nmt cds \
+    static-build link-time-opt"
 
 # All valid JVM variants
 VALID_JVM_VARIANTS="server client minimal core zero zeroshark custom"
@@ -4307,6 +4323,16 @@
 
 
 ################################################################################
+#
+# Specify which sources will be used to build the 64-bit ARM port
+#
+# --with-cpu-port=arm64   will use hotspot/src/cpu/arm
+# --with-cpu-port=aarch64 will use hotspot/src/cpu/aarch64
+#
+
+
+
+################################################################################
 # Check if gtest should be built
 #
 
@@ -5091,7 +5117,7 @@
 #CUSTOM_AUTOCONF_INCLUDE
 
 # Do not change or remove the following line, it is needed for consistency checks:
-DATE_WHEN_GENERATED=1479997584
+DATE_WHEN_GENERATED=1480631660
 
 ###############################################################################
 #
@@ -16708,6 +16734,28 @@
 fi
 
 
+
+
+# Check whether --with-cpu-port was given.
+if test "${with_cpu_port+set}" = set; then :
+  withval=$with_cpu_port;
+fi
+
+
+  if test "x$with_cpu_port" != x; then
+    if test "x$OPENJDK_TARGET_CPU" != xaarch64; then
+      as_fn_error $? "--with-cpu-port only available on aarch64" "$LINENO" 5
+    fi
+
+    if test "x$with_cpu_port" != x; then
+      if test "x$with_cpu_port" != xarm64 && \
+          test "x$with_cpu_port" != xaarch64; then
+        as_fn_error $? "--with-cpu-port must specify arm64 or aarch64" "$LINENO" 5
+      fi
+    fi
+  fi
+
+
   if test "x$with_jvm_variants" = x; then
     with_jvm_variants="server"
   fi
@@ -49093,9 +49141,17 @@
       PICFLAG='-fPIC'
       SHARED_LIBRARY_FLAGS='-shared'
       SET_EXECUTABLE_ORIGIN='-Wl,-rpath,\$$ORIGIN$1'
-      SET_SHARED_LIBRARY_ORIGIN="-Wl,-z,origin $SET_EXECUTABLE_ORIGIN"
       SET_SHARED_LIBRARY_NAME='-Wl,-soname=$1'
       SET_SHARED_LIBRARY_MAPFILE='-Wl,-version-script=$1'
+
+      # arm specific settings
+      if test "x$OPENJDK_TARGET_CPU_ARCH" = "xarm"; then
+        # '-Wl,-z,origin' isn't used on arm.
+        SET_SHARED_LIBRARY_ORIGIN='-Wl,-rpath,\$$$$ORIGIN$1'
+      else
+        SET_SHARED_LIBRARY_ORIGIN="-Wl,-z,origin $SET_EXECUTABLE_ORIGIN"
+      fi
+
     fi
   elif test "x$TOOLCHAIN_TYPE" = xsolstudio; then
     PICFLAG="-KPIC"
@@ -49681,6 +49737,108 @@
 
 
 
+
+# Check whether --with-abi-profile was given.
+if test "${with_abi_profile+set}" = set; then :
+  withval=$with_abi_profile;
+fi
+
+
+  if test "x$with_abi_profile" != x; then
+    if test "x$OPENJDK_TARGET_CPU" != xarm && \
+        test "x$OPENJDK_TARGET_CPU" != xaarch64; then
+      as_fn_error $? "--with-abi-profile only available on arm/aarch64" "$LINENO" 5
+    fi
+
+    OPENJDK_TARGET_ABI_PROFILE=$with_abi_profile
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ABI profle" >&5
+$as_echo_n "checking for ABI profle... " >&6; }
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $OPENJDK_TARGET_ABI_PROFILE" >&5
+$as_echo "$OPENJDK_TARGET_ABI_PROFILE" >&6; }
+
+    if test "x$OPENJDK_TARGET_ABI_PROFILE" = xarm-vfp-sflt; then
+      ARM_FLOAT_TYPE=vfp-sflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv7-a -mthumb'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarm-vfp-hflt; then
+      ARM_FLOAT_TYPE=vfp-hflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv7-a -mthumb'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarm-sflt; then
+      ARM_FLOAT_TYPE=sflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv5t -marm'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarmv5-vfp-sflt; then
+      ARM_FLOAT_TYPE=vfp-sflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv5t -marm'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarmv6-vfp-hflt; then
+      ARM_FLOAT_TYPE=vfp-hflt
+      ARM_ARCH_TYPE_FLAGS='-march=armv6 -marm'
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xarm64; then
+      # No special flags, just need to trigger setting JDK_ARCH_ABI_PROP_NAME
+      ARM_FLOAT_TYPE=
+      ARM_ARCH_TYPE_FLAGS=
+    elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xaarch64; then
+      # No special flags, just need to trigger setting JDK_ARCH_ABI_PROP_NAME
+      ARM_FLOAT_TYPE=
+      ARM_ARCH_TYPE_FLAGS=
+    else
+      as_fn_error $? "Invalid ABI profile: \"$OPENJDK_TARGET_ABI_PROFILE\"" "$LINENO" 5
+    fi
+
+    if test "x$ARM_FLOAT_TYPE" = xvfp-sflt; then
+      ARM_FLOAT_TYPE_FLAGS='-mfloat-abi=softfp -mfpu=vfp -DFLOAT_ARCH=-vfp-sflt'
+    elif test "x$ARM_FLOAT_TYPE" = xvfp-hflt; then
+      ARM_FLOAT_TYPE_FLAGS='-mfloat-abi=hard -mfpu=vfp -DFLOAT_ARCH=-vfp-hflt'
+    elif test "x$ARM_FLOAT_TYPE" = xsflt; then
+      ARM_FLOAT_TYPE_FLAGS='-msoft-float -mfpu=vfp'
+    fi
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ARM_FLOAT_TYPE floating point flags" >&5
+$as_echo_n "checking for $ARM_FLOAT_TYPE floating point flags... " >&6; }
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ARM_FLOAT_TYPE_FLAGS" >&5
+$as_echo "$ARM_FLOAT_TYPE_FLAGS" >&6; }
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for arch type flags" >&5
+$as_echo_n "checking for arch type flags... " >&6; }
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ARM_ARCH_TYPE_FLAGS" >&5
+$as_echo "$ARM_ARCH_TYPE_FLAGS" >&6; }
+
+    # Now set JDK_ARCH_ABI_PROP_NAME. This is equivalent to the last part of the
+    # autoconf target triplet.
+     JDK_ARCH_ABI_PROP_NAME=`$ECHO $OPENJDK_TARGET_AUTOCONF_NAME | $SED -e 's/.*-\([^-]*\)$/\1/'`
+    # Sanity check that it is a known ABI.
+    if test "x$JDK_ARCH_ABI_PROP_NAME" != xgnu && \
+        test "x$JDK_ARCH_ABI_PROP_NAME" != xgnueabi  && \
+        test "x$JDK_ARCH_ABI_PROP_NAME" != xgnueabihf; then
+          { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Unknown autoconf target triplet ABI: \"$JDK_ARCH_ABI_PROP_NAME\"" >&5
+$as_echo "$as_me: WARNING: Unknown autoconf target triplet ABI: \"$JDK_ARCH_ABI_PROP_NAME\"" >&2;}
+    fi
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for ABI property name" >&5
+$as_echo_n "checking for ABI property name... " >&6; }
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $JDK_ARCH_ABI_PROP_NAME" >&5
+$as_echo "$JDK_ARCH_ABI_PROP_NAME" >&6; }
+
+
+    # Pass these on to the open part of configure as if they were set using
+    # --with-extra-c[xx]flags.
+    EXTRA_CFLAGS="$EXTRA_CFLAGS $ARM_ARCH_TYPE_FLAGS $ARM_FLOAT_TYPE_FLAGS"
+    EXTRA_CXXFLAGS="$EXTRA_CXXFLAGS $ARM_ARCH_TYPE_FLAGS $ARM_FLOAT_TYPE_FLAGS"
+    # Get rid of annoying "note: the mangling of 'va_list' has changed in GCC 4.4"
+    # FIXME: This should not really be set using extra_cflags.
+    if test "x$OPENJDK_TARGET_CPU" = xarm; then
+        EXTRA_CFLAGS="$EXTRA_CFLAGS -Wno-psabi"
+        EXTRA_CXXFLAGS="$EXTRA_CXXFLAGS -Wno-psabi"
+    fi
+    # Also add JDK_ARCH_ABI_PROP_NAME define, but only to CFLAGS.
+    EXTRA_CFLAGS="$EXTRA_CFLAGS -DJDK_ARCH_ABI_PROP_NAME='\"\$(JDK_ARCH_ABI_PROP_NAME)\"'"
+    # And pass the architecture flags to the linker as well
+    EXTRA_LDFLAGS="$EXTRA_LDFLAGS $ARM_ARCH_TYPE_FLAGS $ARM_FLOAT_TYPE_FLAGS"
+  fi
+
+  # When building with an abi profile, the name of that profile is appended on the
+  # bundle platform, which is used in bundle names.
+  if test "x$OPENJDK_TARGET_ABI_PROFILE" != x; then
+    OPENJDK_TARGET_BUNDLE_PLATFORM="$OPENJDK_TARGET_OS_BUNDLE-$OPENJDK_TARGET_ABI_PROFILE"
+  fi
+
+
   # Special extras...
   if test "x$TOOLCHAIN_TYPE" = xsolstudio; then
     if test "x$OPENJDK_TARGET_CPU_ARCH" = "xsparc"; then
@@ -49832,6 +49990,7 @@
       arm )
         # on arm we don't prevent gcc to omit frame pointer but do prevent strict aliasing
         CFLAGS_JDK="${CFLAGS_JDK} -fno-strict-aliasing"
+        COMMON_CCXXFLAGS_JDK="${COMMON_CCXXFLAGS_JDK} -fsigned-char"
         ;;
       ppc )
         # on ppc we don't prevent gcc to omit frame pointer but do prevent strict aliasing
@@ -50655,6 +50814,7 @@
       arm )
         # on arm we don't prevent gcc to omit frame pointer but do prevent strict aliasing
         OPENJDK_BUILD_CFLAGS_JDK="${OPENJDK_BUILD_CFLAGS_JDK} -fno-strict-aliasing"
+        OPENJDK_BUILD_COMMON_CCXXFLAGS_JDK="${OPENJDK_BUILD_COMMON_CCXXFLAGS_JDK} -fsigned-char"
         ;;
       ppc )
         # on ppc we don't prevent gcc to omit frame pointer but do prevent strict aliasing
@@ -52823,6 +52983,19 @@
 $as_echo "$JVM_FEATURES" >&6; }
   fi
 
+  # Override hotspot cpu definitions for ARM platforms
+  if test "x$OPENJDK_TARGET_CPU" = xarm; then
+    HOTSPOT_TARGET_CPU=arm_32
+    HOTSPOT_TARGET_CPU_DEFINE="ARM32"
+    JVM_LDFLAGS="$JVM_LDFLAGS -fsigned-char"
+    JVM_CFLAGS="$JVM_CFLAGS -DARM -fsigned-char"
+  elif test "x$OPENJDK_TARGET_CPU" = xaarch64 && test "x$with_cpu_port" = xarm64; then
+    HOTSPOT_TARGET_CPU=arm_64
+    HOTSPOT_TARGET_CPU_ARCH=arm
+    JVM_LDFLAGS="$JVM_LDFLAGS -fsigned-char"
+    JVM_CFLAGS="$JVM_CFLAGS -DARM -fsigned-char"
+  fi
+
   # Verify that dependencies are met for explicitly set features.
   if   [[ " $JVM_FEATURES " =~ " jvmti " ]]   && !   [[ " $JVM_FEATURES " =~ " services " ]]  ; then
     as_fn_error $? "Specified JVM feature 'jvmti' requires feature 'services'" "$LINENO" 5
@@ -52882,6 +53055,13 @@
     JVM_FEATURES_jvmci=""
   fi
 
+  if test "x$OPENJDK_TARGET_CPU" = xarm ; then
+    # Default to use link time optimizations on minimal on arm
+    JVM_FEATURES_link_time_opt="link-time-opt"
+  else
+    JVM_FEATURES_link_time_opt=""
+  fi
+
   # All variants but minimal (and custom) get these features
   NON_MINIMAL_FEATURES="$NON_MINIMAL_FEATURES jvmti fprof vm-structs jni-check services management all-gcs nmt cds"
 
@@ -52889,7 +53069,7 @@
   JVM_FEATURES_server="compiler1 compiler2 $NON_MINIMAL_FEATURES $JVM_FEATURES $JVM_FEATURES_jvmci"
   JVM_FEATURES_client="compiler1 $NON_MINIMAL_FEATURES $JVM_FEATURES $JVM_FEATURES_jvmci"
   JVM_FEATURES_core="$NON_MINIMAL_FEATURES $JVM_FEATURES"
-  JVM_FEATURES_minimal="compiler1 minimal $JVM_FEATURES"
+  JVM_FEATURES_minimal="compiler1 minimal $JVM_FEATURES $JVM_FEATURES_link_time_opt"
   JVM_FEATURES_zero="zero $NON_MINIMAL_FEATURES $JVM_FEATURES"
   JVM_FEATURES_zeroshark="zero shark $NON_MINIMAL_FEATURES $JVM_FEATURES"
   JVM_FEATURES_custom="$JVM_FEATURES"
--- old/common/autoconf/hotspot.m4	2016-12-02 11:14:39.793148072 -0500
+++ new/common/autoconf/hotspot.m4	2016-12-02 11:14:38.217058694 -0500
@@ -25,7 +25,8 @@
 
 # All valid JVM features, regardless of platform
 VALID_JVM_FEATURES="compiler1 compiler2 zero shark minimal dtrace jvmti jvmci \
-    fprof vm-structs jni-check services management all-gcs nmt cds static-build"
+    fprof vm-structs jni-check services management all-gcs nmt cds \
+    static-build link-time-opt"
 
 # All valid JVM variants
 VALID_JVM_VARIANTS="server client minimal core zero zeroshark custom"
@@ -69,6 +70,8 @@
   AC_ARG_WITH([jvm-variants], [AS_HELP_STRING([--with-jvm-variants],
       [JVM variants (separated by commas) to build (server,client,minimal,core,zero,zeroshark,custom) @<:@server@:>@])])
 
+  SETUP_HOTSPOT_TARGET_CPU_PORT
+
   if test "x$with_jvm_variants" = x; then
     with_jvm_variants="server"
   fi
@@ -204,6 +207,19 @@
     AC_MSG_RESULT([$JVM_FEATURES])
   fi
 
+  # Override hotspot cpu definitions for ARM platforms
+  if test "x$OPENJDK_TARGET_CPU" = xarm; then
+    HOTSPOT_TARGET_CPU=arm_32
+    HOTSPOT_TARGET_CPU_DEFINE="ARM32"
+    JVM_LDFLAGS="$JVM_LDFLAGS -fsigned-char"
+    JVM_CFLAGS="$JVM_CFLAGS -DARM -fsigned-char"
+  elif test "x$OPENJDK_TARGET_CPU" = xaarch64 && test "x$with_cpu_port" = xarm64; then
+    HOTSPOT_TARGET_CPU=arm_64
+    HOTSPOT_TARGET_CPU_ARCH=arm
+    JVM_LDFLAGS="$JVM_LDFLAGS -fsigned-char"
+    JVM_CFLAGS="$JVM_CFLAGS -DARM -fsigned-char"
+  fi
+
   # Verify that dependencies are met for explicitly set features.
   if HOTSPOT_CHECK_JVM_FEATURE(jvmti) && ! HOTSPOT_CHECK_JVM_FEATURE(services); then
     AC_MSG_ERROR([Specified JVM feature 'jvmti' requires feature 'services'])
@@ -263,6 +279,13 @@
     JVM_FEATURES_jvmci=""
   fi
 
+  if test "x$OPENJDK_TARGET_CPU" = xarm ; then
+    # Default to using link-time optimization for the minimal variant on arm
+    JVM_FEATURES_link_time_opt="link-time-opt"
+  else
+    JVM_FEATURES_link_time_opt=""
+  fi
+
   # All variants but minimal (and custom) get these features
   NON_MINIMAL_FEATURES="$NON_MINIMAL_FEATURES jvmti fprof vm-structs jni-check services management all-gcs nmt cds"
 
@@ -270,7 +293,7 @@
   JVM_FEATURES_server="compiler1 compiler2 $NON_MINIMAL_FEATURES $JVM_FEATURES $JVM_FEATURES_jvmci"
   JVM_FEATURES_client="compiler1 $NON_MINIMAL_FEATURES $JVM_FEATURES $JVM_FEATURES_jvmci"
   JVM_FEATURES_core="$NON_MINIMAL_FEATURES $JVM_FEATURES"
-  JVM_FEATURES_minimal="compiler1 minimal $JVM_FEATURES"
+  JVM_FEATURES_minimal="compiler1 minimal $JVM_FEATURES $JVM_FEATURES_link_time_opt"
   JVM_FEATURES_zero="zero $NON_MINIMAL_FEATURES $JVM_FEATURES"
   JVM_FEATURES_zeroshark="zero shark $NON_MINIMAL_FEATURES $JVM_FEATURES"
   JVM_FEATURES_custom="$JVM_FEATURES"
@@ -320,6 +343,33 @@
 ])
 
 ################################################################################
+#
+# Specify which sources will be used to build the 64-bit ARM port
+#
+# --with-cpu-port=arm64   will use hotspot/src/cpu/arm
+# --with-cpu-port=aarch64 will use hotspot/src/cpu/aarch64
+#
+AC_DEFUN([SETUP_HOTSPOT_TARGET_CPU_PORT],
+[
+  AC_ARG_WITH(cpu-port, [AS_HELP_STRING([--with-cpu-port],
+      [specify sources to use for Hotspot 64-bit ARM port (arm64,aarch64) @<:@aarch64@:>@ ])])
+
+  if test "x$with_cpu_port" != x; then
+    # Selecting a port only makes sense for 64-bit ARM targets.
+    if test "x$OPENJDK_TARGET_CPU" != xaarch64; then
+      AC_MSG_ERROR([--with-cpu-port only available on aarch64])
+    fi
+
+    # Only the two known port names are accepted.
+    if test "x$with_cpu_port" != xarm64 && \
+        test "x$with_cpu_port" != xaarch64; then
+      AC_MSG_ERROR([--with-cpu-port must specify arm64 or aarch64])
+    fi
+  fi
+])
+
+
+################################################################################
 # Check if gtest should be built
 #
 AC_DEFUN_ONCE([HOTSPOT_ENABLE_DISABLE_GTEST],
--- old/hotspot/make/gensrc/GensrcAdlc.gmk	2016-12-02 11:14:45.541474048 -0500
+++ new/hotspot/make/gensrc/GensrcAdlc.gmk	2016-12-02 11:14:43.917381949 -0500
@@ -114,6 +114,10 @@
     ADLCFLAGS += -U_LP64
   endif
 
+  ifeq ($(HOTSPOT_TARGET_CPU_ARCH), arm)
+    ADLCFLAGS += -DARM=1
+  endif
+
   ##############################################################################
   # Concatenate all ad source files into a single file, which will be fed to
   # adlc. Also include a #line directive at the start of every included file
--- old/hotspot/make/lib/CompileJvm.gmk	2016-12-02 11:14:50.777770989 -0500
+++ new/hotspot/make/lib/CompileJvm.gmk	2016-12-02 11:14:49.217682519 -0500
@@ -139,6 +139,20 @@
 ################################################################################
 # Platform specific setup
 
+# ARM source selection
+
+ifeq ($(OPENJDK_TARGET_OS)-$(OPENJDK_TARGET_CPU), linux-arm)
+  JVM_EXCLUDE_PATTERNS += arm_64
+
+else ifeq ($(OPENJDK_TARGET_OS)-$(OPENJDK_TARGET_CPU), linux-aarch64)
+  # The open aarch64 port is named "aarch64"; exclude it when
+  # HOTSPOT_TARGET_CPU_ARCH is set to arm, since in that case we
+  # want the hybrid sources.
+  ifeq ($(HOTSPOT_TARGET_CPU_ARCH), arm)
+    JVM_EXCLUDE_PATTERNS += arm_32 aarch64
+  endif
+endif
+
 ifneq ($(filter $(OPENJDK_TARGET_OS), linux macosx windows), )
   JVM_PRECOMPILED_HEADER := $(HOTSPOT_TOPDIR)/src/share/vm/precompiled/precompiled.hpp
 endif
--- old/hotspot/make/lib/JvmFeatures.gmk	2016-12-02 11:14:55.950064299 -0500
+++ new/hotspot/make/lib/JvmFeatures.gmk	2016-12-02 11:14:54.317971746 -0500
@@ -146,3 +146,109 @@
       memBaseline.cpp memReporter.cpp mallocTracker.cpp virtualMemoryTracker.cpp nmtCommon.cpp \
       memTracker.cpp nmtDCmd.cpp mallocSiteTable.cpp
 endif
+
+################################################################################
+
+ifeq ($(call check-jvm-feature, link-time-opt), true)
+  # NOTE: Disable automatic optimization level and let the explicit cflag control
+  # optimization level instead. This activates O3 on slowdebug builds, just
+  # like the old build, but it's probably not right.
+  JVM_OPTIMIZATION :=
+  JVM_CFLAGS_FEATURES += -O3 -flto
+  JVM_LDFLAGS_FEATURES += -O3 -flto -fwhole-program -fno-strict-aliasing
+endif
+
+ifeq ($(call check-jvm-feature, minimal), true)
+  ifeq ($(call check-jvm-feature, link-time-opt), false)
+    JVM_OPTIMIZATION := SIZE
+    OPT_SPEED_SRC := \
+        allocation.cpp \
+        assembler.cpp \
+        assembler_linux_arm.cpp \
+        barrierSet.cpp \
+        basicLock.cpp \
+        biasedLocking.cpp \
+        bytecode.cpp \
+        bytecodeInterpreter.cpp \
+        bytecodeInterpreter_x86.cpp \
+        c1_Compilation.cpp \
+        c1_Compiler.cpp \
+        c1_GraphBuilder.cpp \
+        c1_LinearScan.cpp \
+        c1_LIR.cpp \
+        ciEnv.cpp \
+        ciObjectFactory.cpp \
+        codeBlob.cpp \
+        constantPool.cpp \
+        constMethod.cpp \
+        classLoader.cpp \
+        classLoaderData.cpp \
+        classFileParser.cpp \
+        classFileStream.cpp \
+        cpCache.cpp \
+        defNewGeneration.cpp \
+        frame_arm.cpp \
+        genCollectedHeap.cpp \
+        generation.cpp \
+        genMarkSweep.cpp \
+        growableArray.cpp \
+        handles.cpp \
+        hashtable.cpp \
+        heap.cpp \
+        icache.cpp \
+        icache_arm.cpp \
+        instanceKlass.cpp \
+        invocationCounter.cpp \
+        iterator.cpp \
+        javaCalls.cpp \
+        javaClasses.cpp \
+        jniFastGetField_arm.cpp \
+        jvm.cpp \
+        jvm_linux.cpp \
+        linkResolver.cpp \
+        klass.cpp \
+        klassVtable.cpp \
+        markSweep.cpp \
+        memRegion.cpp \
+        memoryPool.cpp \
+        method.cpp \
+        methodHandles.cpp \
+        methodHandles_arm.cpp \
+        methodLiveness.cpp \
+        metablock.cpp \
+        metaspace.cpp \
+        mutex.cpp \
+        mutex_linux.cpp \
+        mutexLocker.cpp \
+        nativeLookup.cpp \
+        objArrayKlass.cpp \
+        os_linux.cpp \
+        os_linux_arm.cpp \
+        placeHolders.cpp \
+        quickSort.cpp \
+        resourceArea.cpp \
+        rewriter.cpp \
+        sharedRuntime.cpp \
+        signature.cpp \
+        space.cpp \
+        stackMapTable.cpp \
+        symbolTable.cpp \
+        systemDictionary.cpp \
+        symbol.cpp \
+        synchronizer.cpp \
+        threadLS_bsd_x86.cpp \
+        threadLS_linux_arm.cpp \
+        threadLS_linux_x86.cpp \
+        timer.cpp \
+        typeArrayKlass.cpp \
+        unsafe.cpp \
+        utf8.cpp \
+        vmSymbols.cpp \
+        #
+
+    $(foreach s, $(OPT_SPEED_SRC), \
+        $(eval BUILD_LIBJVM_$s_OPTIMIZATION := HIGHEST_JVM))
+
+    BUILD_LIBJVM_systemDictionary.cpp_CXXFLAGS := -fno-optimize-sibling-calls
+  endif
+endif
--- old/hotspot/src/jdk.hotspot.agent/linux/native/libsaproc/libproc.h	2016-12-02 11:15:01.206362373 -0500
+++ new/hotspot/src/jdk.hotspot.agent/linux/native/libsaproc/libproc.h	2016-12-02 11:14:59.610271862 -0500
@@ -80,6 +80,12 @@
 #include <asm/ptrace.h>
 #endif
 
+#if defined(arm) || defined(arm64)
+struct user_regs_struct {
+    unsigned long   regs[ELF_NGREG];     /* integer and fp regs */
+};
+#endif
+
 // This C bool type must be int for compatibility with Linux calls and
 // it would be a mistake to equivalence it to C++ bool on many platforms
 
--- old/hotspot/src/share/vm/c1/c1_Runtime1.cpp	2016-12-02 11:15:06.514663396 -0500
+++ new/hotspot/src/share/vm/c1/c1_Runtime1.cpp	2016-12-02 11:15:04.906572205 -0500
@@ -33,7 +33,6 @@
 #include "classfile/systemDictionary.hpp"
 #include "classfile/vmSymbols.hpp"
 #include "code/codeBlob.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "code/compiledIC.hpp"
 #include "code/pcDesc.hpp"
 #include "code/scopeDesc.hpp"
@@ -189,52 +188,44 @@
   int frame_size;
   bool must_gc_arguments;
 
-  if (!CodeCacheExtensions::skip_compiler_support()) {
-    // bypass useless code generation
-    Compilation::setup_code_buffer(&code, 0);
-
-    // create assembler for code generation
-    StubAssembler* sasm = new StubAssembler(&code, name_for(id), id);
-    // generate code for runtime stub
-    oop_maps = generate_code_for(id, sasm);
-    assert(oop_maps == NULL || sasm->frame_size() != no_frame_size,
-           "if stub has an oop map it must have a valid frame size");
+  Compilation::setup_code_buffer(&code, 0);
+
+  // create assembler for code generation
+  StubAssembler* sasm = new StubAssembler(&code, name_for(id), id);
+  // generate code for runtime stub
+  oop_maps = generate_code_for(id, sasm);
+  assert(oop_maps == NULL || sasm->frame_size() != no_frame_size,
+         "if stub has an oop map it must have a valid frame size");
 
 #ifdef ASSERT
-    // Make sure that stubs that need oopmaps have them
-    switch (id) {
-      // These stubs don't need to have an oopmap
-    case dtrace_object_alloc_id:
-    case g1_pre_barrier_slow_id:
-    case g1_post_barrier_slow_id:
-    case slow_subtype_check_id:
-    case fpu2long_stub_id:
-    case unwind_exception_id:
-    case counter_overflow_id:
+  // Make sure that stubs that need oopmaps have them
+  switch (id) {
+    // These stubs don't need to have an oopmap
+  case dtrace_object_alloc_id:
+  case g1_pre_barrier_slow_id:
+  case g1_post_barrier_slow_id:
+  case slow_subtype_check_id:
+  case fpu2long_stub_id:
+  case unwind_exception_id:
+  case counter_overflow_id:
 #if defined(SPARC) || defined(PPC32)
-    case handle_exception_nofpu_id:  // Unused on sparc
+  case handle_exception_nofpu_id:  // Unused on sparc
 #endif
-      break;
+    break;
 
-      // All other stubs should have oopmaps
-    default:
-      assert(oop_maps != NULL, "must have an oopmap");
-    }
+    // All other stubs should have oopmaps
+  default:
+    assert(oop_maps != NULL, "must have an oopmap");
+  }
 #endif
 
-    // align so printing shows nop's instead of random code at the end (SimpleStubs are aligned)
-    sasm->align(BytesPerWord);
-    // make sure all code is in code buffer
-    sasm->flush();
-
-    frame_size = sasm->frame_size();
-    must_gc_arguments = sasm->must_gc_arguments();
-  } else {
-    /* ignored values */
-    oop_maps = NULL;
-    frame_size = 0;
-    must_gc_arguments = false;
-  }
+  // align so printing shows nop's instead of random code at the end (SimpleStubs are aligned)
+  sasm->align(BytesPerWord);
+  // make sure all code is in code buffer
+  sasm->flush();
+
+  frame_size = sasm->frame_size();
+  must_gc_arguments = sasm->must_gc_arguments();
   // create blob - distinguish a few special cases
   CodeBlob* blob = RuntimeStub::new_runtime_stub(name_for(id),
                                                  &code,
--- old/hotspot/src/share/vm/code/codeBlob.cpp	2016-12-02 11:15:11.674956025 -0500
+++ new/hotspot/src/share/vm/code/codeBlob.cpp	2016-12-02 11:15:10.106867101 -0500
@@ -25,7 +25,6 @@
 #include "precompiled.hpp"
 #include "code/codeBlob.hpp"
 #include "code/codeCache.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "code/relocInfo.hpp"
 #include "compiler/disassembler.hpp"
 #include "interpreter/bytecode.hpp"
@@ -228,7 +227,6 @@
 
   BufferBlob* blob = NULL;
   unsigned int size = sizeof(BufferBlob);
-  CodeCacheExtensions::size_blob(name, &buffer_size);
   // align the size to CodeEntryAlignment
   size = CodeBlob::align_code_offset(size);
   size += round_to(buffer_size, oopSize);
@@ -312,7 +310,6 @@
 
   MethodHandlesAdapterBlob* blob = NULL;
   unsigned int size = sizeof(MethodHandlesAdapterBlob);
-  CodeCacheExtensions::size_blob("MethodHandles adapters", &buffer_size);
   // align the size to CodeEntryAlignment
   size = CodeBlob::align_code_offset(size);
   size += round_to(buffer_size, oopSize);
@@ -354,13 +351,11 @@
 {
   RuntimeStub* stub = NULL;
   ThreadInVMfromUnknown __tiv;  // get to VM state in case we block on CodeCache_lock
-  if (!CodeCacheExtensions::skip_code_generation()) {
-    // bypass useless code generation
+  {
     MutexLockerEx mu(CodeCache_lock, Mutex::_no_safepoint_check_flag);
     unsigned int size = CodeBlob::allocation_size(cb, sizeof(RuntimeStub));
     stub = new (size) RuntimeStub(stub_name, cb, size, frame_complete, frame_size, oop_maps, caller_must_gc_arguments);
   }
-  stub = (RuntimeStub*) CodeCacheExtensions::handle_generated_blob(stub, stub_name);
 
   trace_new_stub(stub, "RuntimeStub - ", stub_name);
 
--- old/hotspot/src/share/vm/code/codeBlob.hpp	2016-12-02 11:15:16.959255688 -0500
+++ new/hotspot/src/share/vm/code/codeBlob.hpp	2016-12-02 11:15:15.143152699 -0500
@@ -40,8 +40,7 @@
     MethodProfiled      = 1,    // Execution level 2 and 3 (profiled) nmethods
     NonNMethod          = 2,    // Non-nmethods like Buffers, Adapters and Runtime Stubs
     All                 = 3,    // All types (No code cache segmentation)
-    Pregenerated        = 4,    // Special blobs, managed by CodeCacheExtensions
-    NumTypes            = 5     // Number of CodeBlobTypes
+    NumTypes            = 4     // Number of CodeBlobTypes
   };
 };
 
--- old/hotspot/src/share/vm/code/stubs.cpp	2016-12-02 11:15:22.183551948 -0500
+++ new/hotspot/src/share/vm/code/stubs.cpp	2016-12-02 11:15:20.579460980 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -262,16 +262,3 @@
   }
 }
 
-// Fixup for pregenerated code
-void StubQueue::fix_buffer(address buffer, address queue_end, address buffer_end, int number_of_stubs) {
-  const int extra_bytes = CodeEntryAlignment;
-  _stub_buffer = buffer;
-  _queue_begin = 0;
-  _queue_end = queue_end - buffer;
-  _number_of_stubs = number_of_stubs;
-  int size = buffer_end - buffer;
-  // Note: _buffer_limit must differ from _queue_end in the iteration loops
-  // => add extra space at the end (preserving alignment for asserts) if needed
-  if (buffer_end == queue_end) size += extra_bytes;
-  _buffer_limit = _buffer_size = size;
-}
--- old/hotspot/src/share/vm/code/stubs.hpp	2016-12-02 11:15:27.263840039 -0500
+++ new/hotspot/src/share/vm/code/stubs.hpp	2016-12-02 11:15:25.655748848 -0500
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -217,8 +217,6 @@
   void  verify();                                // verifies the stub queue
   void  print();                                 // prints information about the stub queue
 
-  // Fixup for pregenerated code
-  void fix_buffer(address buffer, address queue_end, address buffer_end, int number_of_stubs);
 };
 
 #endif // SHARE_VM_CODE_STUBS_HPP
--- old/hotspot/src/share/vm/interpreter/interpreterRuntime.cpp	2016-12-02 11:15:34.352242006 -0500
+++ new/hotspot/src/share/vm/interpreter/interpreterRuntime.cpp	2016-12-02 11:15:32.736150362 -0500
@@ -27,7 +27,6 @@
 #include "classfile/systemDictionary.hpp"
 #include "classfile/vmSymbols.hpp"
 #include "code/codeCache.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "compiler/compileBroker.hpp"
 #include "compiler/disassembler.hpp"
 #include "gc/shared/collectedHeap.hpp"
@@ -1199,7 +1198,6 @@
     ICache::invalidate_range(handler, insts_size);
     _handler = handler + insts_size;
   }
-  CodeCacheExtensions::handle_generated_handler(handler, buffer->name(), _handler);
   return handler;
 }
 
@@ -1208,7 +1206,7 @@
     // use slow signature handler if we can't do better
     int handler_index = -1;
     // check if we can use customized (fast) signature handler
-    if (UseFastSignatureHandlers && CodeCacheExtensions::support_fast_signature_handlers() && method->size_of_parameters() <= Fingerprinter::max_size_of_parameters) {
+    if (UseFastSignatureHandlers && method->size_of_parameters() <= Fingerprinter::max_size_of_parameters) {
       // use customized signature handler
       MutexLocker mu(SignatureHandlerLibrary_lock);
       // make sure data structure is initialized
@@ -1225,15 +1223,6 @@
           round_to((intptr_t)_buffer, CodeEntryAlignment) - (address)_buffer;
         CodeBuffer buffer((address)(_buffer + align_offset),
                           SignatureHandlerLibrary::buffer_size - align_offset);
-        if (!CodeCacheExtensions::support_dynamic_code()) {
-          // we need a name for the signature (for lookups or saving)
-          const int SYMBOL_SIZE = 50;
-          char *symbolName = NEW_RESOURCE_ARRAY(char, SYMBOL_SIZE);
-          // support for named signatures
-          jio_snprintf(symbolName, SYMBOL_SIZE,
-                       "native_" UINT64_FORMAT, fingerprint);
-          buffer.set_name(symbolName);
-        }
         InterpreterRuntime::SignatureHandlerGenerator(method, &buffer).generate(fingerprint);
         // copy into code heap
         address handler = set_handler(&buffer);
--- old/hotspot/src/share/vm/interpreter/templateInterpreter.cpp	2016-12-02 11:15:39.876555277 -0500
+++ new/hotspot/src/share/vm/interpreter/templateInterpreter.cpp	2016-12-02 11:15:38.024450250 -0500
@@ -23,7 +23,6 @@
  */
 
 #include "precompiled.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "interpreter/interpreter.hpp"
 #include "interpreter/interpreterRuntime.hpp"
 #include "interpreter/interp_masm.hpp"
@@ -52,29 +51,10 @@
     TraceTime timer("Interpreter generation", TRACETIME_LOG(Info, startuptime));
     int code_size = InterpreterCodeSize;
     NOT_PRODUCT(code_size *= 4;)  // debug uses extra interpreter code space
-#if INCLUDE_JVMTI
-    if (CodeCacheExtensions::saving_generated_interpreter()) {
-      // May requires several versions of the codelets.
-      // Final size will automatically be optimized.
-      code_size *= 2;
-    }
-#endif
     _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL,
                           "Interpreter");
     TemplateInterpreterGenerator g(_code);
   }
-  if (PrintInterpreter) {
-    if (CodeCacheExtensions::saving_generated_interpreter() &&
-        CodeCacheExtensions::use_pregenerated_interpreter()) {
-      ResourceMark rm;
-      tty->print("Printing the newly generated interpreter first");
-      print();
-      tty->print("Printing the pregenerated interpreter next");
-    }
-  }
-
-  // Install the pregenerated interpreter code before printing it
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::TemplateInterpreter);
 
   if (PrintInterpreter) {
     ResourceMark rm;
--- old/hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp	2016-12-02 11:15:45.384867643 -0500
+++ new/hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp	2016-12-02 11:15:43.516761705 -0500
@@ -23,7 +23,6 @@
  */
 
 #include "precompiled.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "interpreter/interpreter.hpp"
 #include "interpreter/interpreterRuntime.hpp"
 #include "interpreter/interp_masm.hpp"
@@ -55,219 +54,213 @@
 };
 
 void TemplateInterpreterGenerator::generate_all() {
-  // Loop, in case we need several variants of the interpreter entries
-  do {
-    if (!CodeCacheExtensions::skip_code_generation()) {
-      // bypass code generation when useless
-      { CodeletMark cm(_masm, "slow signature handler");
-        AbstractInterpreter::_slow_signature_handler = generate_slow_signature_handler();
-      }
+  { CodeletMark cm(_masm, "slow signature handler");
+    AbstractInterpreter::_slow_signature_handler = generate_slow_signature_handler();
+  }
 
-      { CodeletMark cm(_masm, "error exits");
-        _unimplemented_bytecode    = generate_error_exit("unimplemented bytecode");
-        _illegal_bytecode_sequence = generate_error_exit("illegal bytecode sequence - method not verified");
-      }
+  { CodeletMark cm(_masm, "error exits");
+    _unimplemented_bytecode    = generate_error_exit("unimplemented bytecode");
+    _illegal_bytecode_sequence = generate_error_exit("illegal bytecode sequence - method not verified");
+  }
 
 #ifndef PRODUCT
-      if (TraceBytecodes) {
-        CodeletMark cm(_masm, "bytecode tracing support");
-        Interpreter::_trace_code =
-          EntryPoint(
-                     generate_trace_code(btos),
-                     generate_trace_code(ztos),
-                     generate_trace_code(ctos),
-                     generate_trace_code(stos),
-                     generate_trace_code(atos),
-                     generate_trace_code(itos),
-                     generate_trace_code(ltos),
-                     generate_trace_code(ftos),
-                     generate_trace_code(dtos),
-                     generate_trace_code(vtos)
-                     );
-      }
+  if (TraceBytecodes) {
+    CodeletMark cm(_masm, "bytecode tracing support");
+    Interpreter::_trace_code =
+      EntryPoint(
+                 generate_trace_code(btos),
+                 generate_trace_code(ztos),
+                 generate_trace_code(ctos),
+                 generate_trace_code(stos),
+                 generate_trace_code(atos),
+                 generate_trace_code(itos),
+                 generate_trace_code(ltos),
+                 generate_trace_code(ftos),
+                 generate_trace_code(dtos),
+                 generate_trace_code(vtos)
+                 );
+  }
 #endif // !PRODUCT
 
-      { CodeletMark cm(_masm, "return entry points");
-        const int index_size = sizeof(u2);
-        for (int i = 0; i < Interpreter::number_of_return_entries; i++) {
-          Interpreter::_return_entry[i] =
-            EntryPoint(
-                       generate_return_entry_for(itos, i, index_size),
-                       generate_return_entry_for(itos, i, index_size),
-                       generate_return_entry_for(itos, i, index_size),
-                       generate_return_entry_for(itos, i, index_size),
-                       generate_return_entry_for(atos, i, index_size),
-                       generate_return_entry_for(itos, i, index_size),
-                       generate_return_entry_for(ltos, i, index_size),
-                       generate_return_entry_for(ftos, i, index_size),
-                       generate_return_entry_for(dtos, i, index_size),
-                       generate_return_entry_for(vtos, i, index_size)
-                       );
-        }
-      }
+  { CodeletMark cm(_masm, "return entry points");
+    const int index_size = sizeof(u2);
+    for (int i = 0; i < Interpreter::number_of_return_entries; i++) {
+      Interpreter::_return_entry[i] =
+        EntryPoint(
+                   generate_return_entry_for(itos, i, index_size),
+                   generate_return_entry_for(itos, i, index_size),
+                   generate_return_entry_for(itos, i, index_size),
+                   generate_return_entry_for(itos, i, index_size),
+                   generate_return_entry_for(atos, i, index_size),
+                   generate_return_entry_for(itos, i, index_size),
+                   generate_return_entry_for(ltos, i, index_size),
+                   generate_return_entry_for(ftos, i, index_size),
+                   generate_return_entry_for(dtos, i, index_size),
+                   generate_return_entry_for(vtos, i, index_size)
+                   );
+    }
+  }
 
-      { CodeletMark cm(_masm, "invoke return entry points");
-        // These states are in order specified in TosState, except btos/ztos/ctos/stos are
-        // really the same as itos since there is no top of stack optimization for these types
-        const TosState states[] = {itos, itos, itos, itos, itos, ltos, ftos, dtos, atos, vtos, ilgl};
-        const int invoke_length = Bytecodes::length_for(Bytecodes::_invokestatic);
-        const int invokeinterface_length = Bytecodes::length_for(Bytecodes::_invokeinterface);
-        const int invokedynamic_length = Bytecodes::length_for(Bytecodes::_invokedynamic);
-
-        for (int i = 0; i < Interpreter::number_of_return_addrs; i++) {
-          TosState state = states[i];
-          assert(state != ilgl, "states array is wrong above");
-          Interpreter::_invoke_return_entry[i] = generate_return_entry_for(state, invoke_length, sizeof(u2));
-          Interpreter::_invokeinterface_return_entry[i] = generate_return_entry_for(state, invokeinterface_length, sizeof(u2));
-          Interpreter::_invokedynamic_return_entry[i] = generate_return_entry_for(state, invokedynamic_length, sizeof(u4));
-        }
-      }
+  { CodeletMark cm(_masm, "invoke return entry points");
+    // These states are in order specified in TosState, except btos/ztos/ctos/stos are
+    // really the same as itos since there is no top of stack optimization for these types
+    const TosState states[] = {itos, itos, itos, itos, itos, ltos, ftos, dtos, atos, vtos, ilgl};
+    const int invoke_length = Bytecodes::length_for(Bytecodes::_invokestatic);
+    const int invokeinterface_length = Bytecodes::length_for(Bytecodes::_invokeinterface);
+    const int invokedynamic_length = Bytecodes::length_for(Bytecodes::_invokedynamic);
+
+    for (int i = 0; i < Interpreter::number_of_return_addrs; i++) {
+      TosState state = states[i];
+      assert(state != ilgl, "states array is wrong above");
+      Interpreter::_invoke_return_entry[i] = generate_return_entry_for(state, invoke_length, sizeof(u2));
+      Interpreter::_invokeinterface_return_entry[i] = generate_return_entry_for(state, invokeinterface_length, sizeof(u2));
+      Interpreter::_invokedynamic_return_entry[i] = generate_return_entry_for(state, invokedynamic_length, sizeof(u4));
+    }
+  }
 
-      { CodeletMark cm(_masm, "earlyret entry points");
-        Interpreter::_earlyret_entry =
-          EntryPoint(
-                     generate_earlyret_entry_for(btos),
-                     generate_earlyret_entry_for(ztos),
-                     generate_earlyret_entry_for(ctos),
-                     generate_earlyret_entry_for(stos),
-                     generate_earlyret_entry_for(atos),
-                     generate_earlyret_entry_for(itos),
-                     generate_earlyret_entry_for(ltos),
-                     generate_earlyret_entry_for(ftos),
-                     generate_earlyret_entry_for(dtos),
-                     generate_earlyret_entry_for(vtos)
-                     );
-      }
+  { CodeletMark cm(_masm, "earlyret entry points");
+    Interpreter::_earlyret_entry =
+      EntryPoint(
+                 generate_earlyret_entry_for(btos),
+                 generate_earlyret_entry_for(ztos),
+                 generate_earlyret_entry_for(ctos),
+                 generate_earlyret_entry_for(stos),
+                 generate_earlyret_entry_for(atos),
+                 generate_earlyret_entry_for(itos),
+                 generate_earlyret_entry_for(ltos),
+                 generate_earlyret_entry_for(ftos),
+                 generate_earlyret_entry_for(dtos),
+                 generate_earlyret_entry_for(vtos)
+                 );
+  }
 
-      { CodeletMark cm(_masm, "deoptimization entry points");
-        for (int i = 0; i < Interpreter::number_of_deopt_entries; i++) {
-          Interpreter::_deopt_entry[i] =
-            EntryPoint(
-                       generate_deopt_entry_for(itos, i),
-                       generate_deopt_entry_for(itos, i),
-                       generate_deopt_entry_for(itos, i),
-                       generate_deopt_entry_for(itos, i),
-                       generate_deopt_entry_for(atos, i),
-                       generate_deopt_entry_for(itos, i),
-                       generate_deopt_entry_for(ltos, i),
-                       generate_deopt_entry_for(ftos, i),
-                       generate_deopt_entry_for(dtos, i),
-                       generate_deopt_entry_for(vtos, i)
-                       );
-        }
-      }
+  { CodeletMark cm(_masm, "deoptimization entry points");
+    for (int i = 0; i < Interpreter::number_of_deopt_entries; i++) {
+      Interpreter::_deopt_entry[i] =
+        EntryPoint(
+                   generate_deopt_entry_for(itos, i),
+                   generate_deopt_entry_for(itos, i),
+                   generate_deopt_entry_for(itos, i),
+                   generate_deopt_entry_for(itos, i),
+                   generate_deopt_entry_for(atos, i),
+                   generate_deopt_entry_for(itos, i),
+                   generate_deopt_entry_for(ltos, i),
+                   generate_deopt_entry_for(ftos, i),
+                   generate_deopt_entry_for(dtos, i),
+                   generate_deopt_entry_for(vtos, i)
+                   );
+    }
+  }
 
-      { CodeletMark cm(_masm, "result handlers for native calls");
-        // The various result converter stublets.
-        int is_generated[Interpreter::number_of_result_handlers];
-        memset(is_generated, 0, sizeof(is_generated));
-
-        for (int i = 0; i < Interpreter::number_of_result_handlers; i++) {
-          BasicType type = types[i];
-          if (!is_generated[Interpreter::BasicType_as_index(type)]++) {
-            Interpreter::_native_abi_to_tosca[Interpreter::BasicType_as_index(type)] = generate_result_handler_for(type);
-          }
-        }
+  { CodeletMark cm(_masm, "result handlers for native calls");
+    // The various result converter stublets.
+    int is_generated[Interpreter::number_of_result_handlers];
+    memset(is_generated, 0, sizeof(is_generated));
+
+    for (int i = 0; i < Interpreter::number_of_result_handlers; i++) {
+      BasicType type = types[i];
+      if (!is_generated[Interpreter::BasicType_as_index(type)]++) {
+        Interpreter::_native_abi_to_tosca[Interpreter::BasicType_as_index(type)] = generate_result_handler_for(type);
       }
+    }
+  }
 
-      { CodeletMark cm(_masm, "continuation entry points");
-        Interpreter::_continuation_entry =
-          EntryPoint(
-                     generate_continuation_for(btos),
-                     generate_continuation_for(ztos),
-                     generate_continuation_for(ctos),
-                     generate_continuation_for(stos),
-                     generate_continuation_for(atos),
-                     generate_continuation_for(itos),
-                     generate_continuation_for(ltos),
-                     generate_continuation_for(ftos),
-                     generate_continuation_for(dtos),
-                     generate_continuation_for(vtos)
-                     );
-      }
+  { CodeletMark cm(_masm, "continuation entry points");
+    Interpreter::_continuation_entry =
+      EntryPoint(
+                 generate_continuation_for(btos),
+                 generate_continuation_for(ztos),
+                 generate_continuation_for(ctos),
+                 generate_continuation_for(stos),
+                 generate_continuation_for(atos),
+                 generate_continuation_for(itos),
+                 generate_continuation_for(ltos),
+                 generate_continuation_for(ftos),
+                 generate_continuation_for(dtos),
+                 generate_continuation_for(vtos)
+                 );
+  }
 
-      { CodeletMark cm(_masm, "safepoint entry points");
-        Interpreter::_safept_entry =
-          EntryPoint(
-                     generate_safept_entry_for(btos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(ztos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(ctos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(stos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(atos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(itos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(ltos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(ftos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(dtos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
-                     generate_safept_entry_for(vtos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint))
-                     );
-      }
+  { CodeletMark cm(_masm, "safepoint entry points");
+    Interpreter::_safept_entry =
+      EntryPoint(
+                 generate_safept_entry_for(btos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(ztos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(ctos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(stos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(atos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(itos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(ltos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(ftos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(dtos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint)),
+                 generate_safept_entry_for(vtos, CAST_FROM_FN_PTR(address, InterpreterRuntime::at_safepoint))
+                 );
+  }
 
-      { CodeletMark cm(_masm, "exception handling");
-        // (Note: this is not safepoint safe because thread may return to compiled code)
-        generate_throw_exception();
-      }
+  { CodeletMark cm(_masm, "exception handling");
+    // (Note: this is not safepoint safe because thread may return to compiled code)
+    generate_throw_exception();
+  }
 
-      { CodeletMark cm(_masm, "throw exception entrypoints");
-        Interpreter::_throw_ArrayIndexOutOfBoundsException_entry = generate_ArrayIndexOutOfBounds_handler("java/lang/ArrayIndexOutOfBoundsException");
-        Interpreter::_throw_ArrayStoreException_entry            = generate_klass_exception_handler("java/lang/ArrayStoreException"                 );
-        Interpreter::_throw_ArithmeticException_entry            = generate_exception_handler("java/lang/ArithmeticException"           , "/ by zero");
-        Interpreter::_throw_ClassCastException_entry             = generate_ClassCastException_handler();
-        Interpreter::_throw_NullPointerException_entry           = generate_exception_handler("java/lang/NullPointerException"          , NULL       );
-        Interpreter::_throw_StackOverflowError_entry             = generate_StackOverflowError_handler();
-      }
+  { CodeletMark cm(_masm, "throw exception entrypoints");
+    Interpreter::_throw_ArrayIndexOutOfBoundsException_entry = generate_ArrayIndexOutOfBounds_handler("java/lang/ArrayIndexOutOfBoundsException");
+    Interpreter::_throw_ArrayStoreException_entry            = generate_klass_exception_handler("java/lang/ArrayStoreException"                 );
+    Interpreter::_throw_ArithmeticException_entry            = generate_exception_handler("java/lang/ArithmeticException"           , "/ by zero");
+    Interpreter::_throw_ClassCastException_entry             = generate_ClassCastException_handler();
+    Interpreter::_throw_NullPointerException_entry           = generate_exception_handler("java/lang/NullPointerException"          , NULL       );
+    Interpreter::_throw_StackOverflowError_entry             = generate_StackOverflowError_handler();
+  }
 
 
 
 #define method_entry(kind)                                              \
-      { CodeletMark cm(_masm, "method entry point (kind = " #kind ")"); \
-        Interpreter::_entry_table[Interpreter::kind] = generate_method_entry(Interpreter::kind); \
-        Interpreter::update_cds_entry_table(Interpreter::kind); \
-      }
+  { CodeletMark cm(_masm, "method entry point (kind = " #kind ")"); \
+    Interpreter::_entry_table[Interpreter::kind] = generate_method_entry(Interpreter::kind); \
+    Interpreter::update_cds_entry_table(Interpreter::kind); \
+  }
 
-      // all non-native method kinds
-      method_entry(zerolocals)
-      method_entry(zerolocals_synchronized)
-      method_entry(empty)
-      method_entry(accessor)
-      method_entry(abstract)
-      method_entry(java_lang_math_sin  )
-      method_entry(java_lang_math_cos  )
-      method_entry(java_lang_math_tan  )
-      method_entry(java_lang_math_abs  )
-      method_entry(java_lang_math_sqrt )
-      method_entry(java_lang_math_log  )
-      method_entry(java_lang_math_log10)
-      method_entry(java_lang_math_exp  )
-      method_entry(java_lang_math_pow  )
-      method_entry(java_lang_math_fmaF )
-      method_entry(java_lang_math_fmaD )
-      method_entry(java_lang_ref_reference_get)
-
-      AbstractInterpreter::initialize_method_handle_entries();
-
-      // all native method kinds (must be one contiguous block)
-      Interpreter::_native_entry_begin = Interpreter::code()->code_end();
-      method_entry(native)
-      method_entry(native_synchronized)
-      Interpreter::_native_entry_end = Interpreter::code()->code_end();
-
-      method_entry(java_util_zip_CRC32_update)
-      method_entry(java_util_zip_CRC32_updateBytes)
-      method_entry(java_util_zip_CRC32_updateByteBuffer)
-      method_entry(java_util_zip_CRC32C_updateBytes)
-      method_entry(java_util_zip_CRC32C_updateDirectByteBuffer)
-
-      method_entry(java_lang_Float_intBitsToFloat);
-      method_entry(java_lang_Float_floatToRawIntBits);
-      method_entry(java_lang_Double_longBitsToDouble);
-      method_entry(java_lang_Double_doubleToRawLongBits);
+  // all non-native method kinds
+  method_entry(zerolocals)
+  method_entry(zerolocals_synchronized)
+  method_entry(empty)
+  method_entry(accessor)
+  method_entry(abstract)
+  method_entry(java_lang_math_sin  )
+  method_entry(java_lang_math_cos  )
+  method_entry(java_lang_math_tan  )
+  method_entry(java_lang_math_abs  )
+  method_entry(java_lang_math_sqrt )
+  method_entry(java_lang_math_log  )
+  method_entry(java_lang_math_log10)
+  method_entry(java_lang_math_exp  )
+  method_entry(java_lang_math_pow  )
+  method_entry(java_lang_math_fmaF )
+  method_entry(java_lang_math_fmaD )
+  method_entry(java_lang_ref_reference_get)
+
+  AbstractInterpreter::initialize_method_handle_entries();
+
+  // all native method kinds (must be one contiguous block)
+  Interpreter::_native_entry_begin = Interpreter::code()->code_end();
+  method_entry(native)
+  method_entry(native_synchronized)
+  Interpreter::_native_entry_end = Interpreter::code()->code_end();
+
+  method_entry(java_util_zip_CRC32_update)
+  method_entry(java_util_zip_CRC32_updateBytes)
+  method_entry(java_util_zip_CRC32_updateByteBuffer)
+  method_entry(java_util_zip_CRC32C_updateBytes)
+  method_entry(java_util_zip_CRC32C_updateDirectByteBuffer)
+
+  method_entry(java_lang_Float_intBitsToFloat);
+  method_entry(java_lang_Float_floatToRawIntBits);
+  method_entry(java_lang_Double_longBitsToDouble);
+  method_entry(java_lang_Double_doubleToRawLongBits);
 
 #undef method_entry
 
-      // Bytecodes
-      set_entry_points_for_all_bytes();
-    }
-  } while (CodeCacheExtensions::needs_other_interpreter_variant());
+  // Bytecodes
+  set_entry_points_for_all_bytes();
 
   // installation of code in other places in the runtime
   // (ExcutableCodeManager calls not needed to copy the entries)
@@ -314,9 +307,6 @@
 
 
 void TemplateInterpreterGenerator::set_entry_points(Bytecodes::Code code) {
-  if (CodeCacheExtensions::skip_template_interpreter_entries(code)) {
-    return;
-  }
   CodeletMark cm(_masm, Bytecodes::name(code), code);
   // initialize entry points
   assert(_unimplemented_bytecode    != NULL, "should have been generated before");
@@ -347,7 +337,6 @@
   EntryPoint entry(bep, zep, cep, sep, aep, iep, lep, fep, dep, vep);
   Interpreter::_normal_table.set_entry(code, entry);
   Interpreter::_wentry_point[code] = wep;
-  CodeCacheExtensions::completed_template_interpreter_entries(_masm, code);
 }
 
 
--- old/hotspot/src/share/vm/memory/virtualspace.cpp	2016-12-02 11:15:50.381150970 -0500
+++ new/hotspot/src/share/vm/memory/virtualspace.cpp	2016-12-02 11:15:48.629051613 -0500
@@ -23,7 +23,6 @@
  */
 
 #include "precompiled.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "logging/log.hpp"
 #include "memory/resourceArea.hpp"
 #include "memory/virtualspace.hpp"
@@ -592,7 +591,7 @@
 ReservedCodeSpace::ReservedCodeSpace(size_t r_size,
                                      size_t rs_align,
                                      bool large) :
-  ReservedSpace(r_size, rs_align, large, /*executable*/ CodeCacheExtensions::support_dynamic_code()) {
+  ReservedSpace(r_size, rs_align, large, /*executable*/ true) {
   MemTracker::record_virtual_memory_type((address)base(), mtCode);
 }
 
--- old/hotspot/src/share/vm/precompiled/precompiled.hpp	2016-12-02 11:15:55.761456075 -0500
+++ new/hotspot/src/share/vm/precompiled/precompiled.hpp	2016-12-02 11:15:54.105362162 -0500
@@ -66,7 +66,6 @@
 # include "classfile/vmSymbols.hpp"
 # include "code/codeBlob.hpp"
 # include "code/codeCache.hpp"
-# include "code/codeCacheExtensions.hpp"
 # include "code/compressedStream.hpp"
 # include "code/debugInfo.hpp"
 # include "code/debugInfoRec.hpp"
--- old/hotspot/src/share/vm/prims/methodHandles.cpp	2016-12-02 11:16:01.073757323 -0500
+++ new/hotspot/src/share/vm/prims/methodHandles.cpp	2016-12-02 11:15:59.509668628 -0500
@@ -26,7 +26,6 @@
 #include "classfile/javaClasses.inline.hpp"
 #include "classfile/stringTable.hpp"
 #include "code/codeCache.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "code/dependencyContext.hpp"
 #include "compiler/compileBroker.hpp"
 #include "interpreter/interpreter.hpp"
@@ -94,7 +93,6 @@
     StubCodeMark mark(this, "MethodHandle::interpreter_entry", vmIntrinsics::name_at(iid));
     address entry = MethodHandles::generate_method_handle_interpreter_entry(_masm, iid);
     if (entry != NULL) {
-      CodeCacheExtensions::handle_generated_pc(entry, vmIntrinsics::name_at(iid));
       Interpreter::set_entry_for_kind(mk, entry);
     }
     // If the entry is not set, it will throw AbstractMethodError.
--- old/hotspot/src/share/vm/runtime/arguments.cpp	2016-12-02 11:16:06.098042239 -0500
+++ new/hotspot/src/share/vm/runtime/arguments.cpp	2016-12-02 11:16:04.385945150 -0500
@@ -27,7 +27,6 @@
 #include "classfile/javaAssertions.hpp"
 #include "classfile/stringTable.hpp"
 #include "classfile/symbolTable.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "gc/shared/cardTableRS.hpp"
 #include "gc/shared/genCollectedHeap.hpp"
 #include "gc/shared/referenceProcessor.hpp"
@@ -1877,7 +1876,6 @@
 #endif // _LP64
 #endif // !ZERO
 
-  CodeCacheExtensions::set_ergonomics_flags();
 }
 
 void Arguments::set_parallel_gc_flags() {
--- old/hotspot/src/share/vm/runtime/init.cpp	2016-12-02 11:16:11.266335320 -0500
+++ new/hotspot/src/share/vm/runtime/init.cpp	2016-12-02 11:16:09.526236644 -0500
@@ -25,7 +25,6 @@
 #include "precompiled.hpp"
 #include "classfile/stringTable.hpp"
 #include "classfile/symbolTable.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "code/icBuffer.hpp"
 #include "gc/shared/collectedHeap.hpp"
 #include "interpreter/bytecodes.hpp"
@@ -105,20 +104,15 @@
   classLoader_init1();
   compilationPolicy_init();
   codeCache_init();
-  CodeCacheExtensions::initialize();
   VM_Version_init();
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::VMVersion);
   os_init_globals();
   stubRoutines_init1();
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::StubRoutines1);
   jint status = universe_init();  // dependent on codeCache_init and
                                   // stubRoutines_init1 and metaspace_init.
   if (status != JNI_OK)
     return status;
 
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::Universe);
   interpreter_init();  // before any methods loaded
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::Interpreter);
   invocationCounter_init();  // before any methods loaded
   marksweep_init();
   accessFlags_init();
@@ -148,7 +142,6 @@
   javaClasses_init();   // must happen after vtable initialization
   stubRoutines_init2(); // note: StubRoutines need 2-phase init
   MethodHandles::generate_adapters();
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::StubRoutines2);
 
 #if INCLUDE_NMT
   // Solaris stack is walkable only after stubRoutines are set up.
@@ -162,7 +155,6 @@
     CommandLineFlags::printFlags(tty, false, PrintFlagsRanges);
   }
 
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::InitGlobals);
   return JNI_OK;
 }
 
--- old/hotspot/src/share/vm/runtime/sharedRuntime.cpp	2016-12-02 11:16:16.362624319 -0500
+++ new/hotspot/src/share/vm/runtime/sharedRuntime.cpp	2016-12-02 11:16:14.690529498 -0500
@@ -28,7 +28,6 @@
 #include "classfile/vmSymbols.hpp"
 #include "code/codeCache.hpp"
 #include "code/compiledIC.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "code/scopeDesc.hpp"
 #include "code/vtableStubs.hpp"
 #include "compiler/abstractCompiler.hpp"
@@ -2567,27 +2566,15 @@
   if (_adapters != NULL) return;
   _adapters = new AdapterHandlerTable();
 
-  if (!CodeCacheExtensions::skip_compiler_support()) {
-    // Create a special handler for abstract methods.  Abstract methods
-    // are never compiled so an i2c entry is somewhat meaningless, but
-    // throw AbstractMethodError just in case.
-    // Pass wrong_method_abstract for the c2i transitions to return
-    // AbstractMethodError for invalid invocations.
-    address wrong_method_abstract = SharedRuntime::get_handle_wrong_method_abstract_stub();
-    _abstract_method_handler = AdapterHandlerLibrary::new_entry(new AdapterFingerPrint(0, NULL),
-                                                                StubRoutines::throw_AbstractMethodError_entry(),
-                                                                wrong_method_abstract, wrong_method_abstract);
-  } else {
-    // Adapters are not supposed to be used.
-    // Generate a special one to cause an error if used (and store this
-    // singleton in place of the useless _abstract_method_error adapter).
-    address entry = (address) &unexpected_adapter_call;
-    _abstract_method_handler = AdapterHandlerLibrary::new_entry(new AdapterFingerPrint(0, NULL),
-                                                                entry,
-                                                                entry,
-                                                                entry);
-
-  }
+  // Create a special handler for abstract methods.  Abstract methods
+  // are never compiled so an i2c entry is somewhat meaningless, but
+  // throw AbstractMethodError just in case.
+  // Pass wrong_method_abstract for the c2i transitions to return
+  // AbstractMethodError for invalid invocations.
+  address wrong_method_abstract = SharedRuntime::get_handle_wrong_method_abstract_stub();
+  _abstract_method_handler = AdapterHandlerLibrary::new_entry(new AdapterFingerPrint(0, NULL),
+                                                              StubRoutines::throw_AbstractMethodError_entry(),
+                                                              wrong_method_abstract, wrong_method_abstract);
 }
 
 AdapterHandlerEntry* AdapterHandlerLibrary::new_entry(AdapterFingerPrint* fingerprint,
@@ -2638,17 +2625,6 @@
     // make sure data structure is initialized
     initialize();
 
-    // during dump time, always generate adapters, even if the
-    // compiler has been turned off.
-    if (!DumpSharedSpaces && CodeCacheExtensions::skip_compiler_support()) {
-      // adapters are useless and should not be used, including the
-      // abstract_method_handler. However, some callers check that
-      // an adapter was installed.
-      // Return the singleton adapter, stored into _abstract_method_handler
-      // and modified to cause an error if we ever call it.
-      return _abstract_method_handler;
-    }
-
     if (method->is_abstract()) {
       return _abstract_method_handler;
     }
--- old/hotspot/src/share/vm/runtime/stubCodeGenerator.cpp	2016-12-02 11:16:21.866936456 -0500
+++ new/hotspot/src/share/vm/runtime/stubCodeGenerator.cpp	2016-12-02 11:16:20.214842770 -0500
@@ -26,7 +26,6 @@
 #include "asm/macroAssembler.hpp"
 #include "asm/macroAssembler.inline.hpp"
 #include "code/codeCache.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "compiler/disassembler.hpp"
 #include "oops/oop.inline.hpp"
 #include "prims/forte.hpp"
--- old/hotspot/src/share/vm/runtime/stubRoutines.cpp	2016-12-02 11:16:27.099233167 -0500
+++ new/hotspot/src/share/vm/runtime/stubRoutines.cpp	2016-12-02 11:16:25.495142200 -0500
@@ -24,7 +24,6 @@
 
 #include "precompiled.hpp"
 #include "asm/codeBuffer.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "memory/resourceArea.hpp"
 #include "oops/oop.inline.hpp"
 #include "runtime/interfaceSupport.hpp"
@@ -204,12 +203,6 @@
 
 // simple tests of generated arraycopy functions
 static void test_arraycopy_func(address func, int alignment) {
-  if (CodeCacheExtensions::use_pregenerated_interpreter() || !CodeCacheExtensions::is_executable(func)) {
-    // Exit safely if stubs were generated but cannot be used.
-    // Also excluding pregenerated interpreter since the code may depend on
-    // some registers being properly initialized (for instance Rthread)
-    return;
-  }
   int v = 0xcc;
   int v2 = 0x11;
   jlong lbuffer[8];
--- old/hotspot/src/share/vm/runtime/thread.cpp	2016-12-02 11:16:33.847615848 -0500
+++ new/hotspot/src/share/vm/runtime/thread.cpp	2016-12-02 11:16:31.635490404 -0500
@@ -29,7 +29,6 @@
 #include "classfile/systemDictionary.hpp"
 #include "classfile/vmSymbols.hpp"
 #include "code/codeCache.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "code/scopeDesc.hpp"
 #include "compiler/compileBroker.hpp"
 #include "compiler/compileTask.hpp"
@@ -3842,8 +3841,6 @@
     }
   }
 
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::CreateVM);
-
   create_vm_timer.end();
 #ifdef ASSERT
   _vm_complete = true;
--- old/hotspot/src/share/vm/runtime/vm_operations.cpp	2016-12-02 11:16:38.975906660 -0500
+++ new/hotspot/src/share/vm/runtime/vm_operations.cpp	2016-12-02 11:16:37.363815242 -0500
@@ -26,7 +26,6 @@
 #include "classfile/symbolTable.hpp"
 #include "classfile/vmSymbols.hpp"
 #include "code/codeCache.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "compiler/compileBroker.hpp"
 #include "gc/shared/isGCActiveMark.hpp"
 #include "logging/log.hpp"
@@ -390,7 +389,6 @@
 Thread * VM_Exit::_shutdown_thread = NULL;
 
 int VM_Exit::set_vm_exited() {
-  CodeCacheExtensions::complete_step(CodeCacheExtensionsSteps::LastStep);
 
   Thread * thr_cur = Thread::current();
 
--- old/hotspot/src/share/vm/runtime/vm_version.cpp	2016-12-02 11:16:43.996191347 -0500
+++ new/hotspot/src/share/vm/runtime/vm_version.cpp	2016-12-02 11:16:42.380099706 -0500
@@ -23,7 +23,6 @@
  */
 
 #include "precompiled.hpp"
-#include "code/codeCacheExtensions.hpp"
 #include "logging/log.hpp"
 #include "memory/universe.hpp"
 #include "oops/oop.inline.hpp"
@@ -127,9 +126,6 @@
 
 
 const char* Abstract_VM_Version::vm_info_string() {
-  if (CodeCacheExtensions::use_pregenerated_interpreter()) {
-    return "interpreted mode, pregenerated";
-  }
   switch (Arguments::mode()) {
     case Arguments::_int:
       return UseSharedSpaces ? "interpreted mode, sharing" : "interpreted mode";
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/abstractInterpreter_arm.cpp	2016-12-02 11:16:47.768405262 -0500
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "interpreter/bytecode.hpp"
+#include "interpreter/interpreter.hpp"
+#include "oops/constMethod.hpp"
+#include "oops/method.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/handles.inline.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/synchronizer.hpp"
+#include "utilities/macros.hpp"
+
+int AbstractInterpreter::BasicType_as_index(BasicType type) {  // maps a BasicType to an index below number_of_result_handlers
+  int i = 0;
+  switch (type) {
+#ifdef AARCH64  // AARCH64: six indices (0..5); int, long, void, float and double all share index 4
+    case T_BOOLEAN: i = 0; break;
+    case T_CHAR   : i = 1; break;
+    case T_BYTE   : i = 2; break;
+    case T_SHORT  : i = 3; break;
+    case T_INT    : // fall through
+    case T_LONG   : // fall through
+    case T_VOID   : // fall through
+    case T_FLOAT  : // fall through
+    case T_DOUBLE : i = 4; break;
+    case T_OBJECT : // fall through
+    case T_ARRAY  : i = 5; break;
+#else  // non-AARCH64: ten indices (0..9); only T_OBJECT and T_ARRAY share one
+    case T_VOID   : i = 0; break;
+    case T_BOOLEAN: i = 1; break;
+    case T_CHAR   : i = 2; break;
+    case T_BYTE   : i = 3; break;
+    case T_SHORT  : i = 4; break;
+    case T_INT    : i = 5; break;
+    case T_OBJECT : // fall through
+    case T_ARRAY  : i = 6; break;
+    case T_LONG   : i = 7; break;
+    case T_FLOAT  : i = 8; break;
+    case T_DOUBLE : i = 9; break;
+#endif // AARCH64
+    default       : ShouldNotReachHere();  // every BasicType not listed above is rejected
+  }
+  assert(0 <= i && i < AbstractInterpreter::number_of_result_handlers, "index out of bounds");
+  return i;
+}
+
+// Math intrinsic kinds are never compiled: the interpreter would prefer the compiled version to the intrinsic one.
+bool AbstractInterpreter::can_be_compiled(methodHandle m) {
+  bool compilable = true;
+  switch (method_kind(m)) {
+    case Interpreter::java_lang_math_sin     : // fall thru
+    case Interpreter::java_lang_math_cos     : // fall thru
+    case Interpreter::java_lang_math_tan     : // fall thru
+    case Interpreter::java_lang_math_abs     : // fall thru
+    case Interpreter::java_lang_math_log     : // fall thru
+    case Interpreter::java_lang_math_log10   : // fall thru
+    case Interpreter::java_lang_math_sqrt    :
+      compilable = false; break;
+    default: break;  // every other method kind may be compiled
+  }
+  return compilable;
+}
+
+// Number of stack words a top-level interpreter activation of `method` needs.
+int AbstractInterpreter::size_top_interpreter_activation(Method* method) {
+  // Allowance tied to the call stub; see generate_call_stub.
+  const int call_stub_words = AARCH64_ONLY(24) NOT_AARCH64(12);
+
+  // A synchronized method needs room for one monitor to get into the method.
+  const int monitor_words = method->is_synchronized() ? frame::interpreter_frame_monitor_size() : 0;
+
+  // Fixed overhead runs from sender SP through the expression stack bottom;
+  // update this whenever something is added to or removed from that area.
+  const int fixed_words = frame::sender_sp_offset - frame::interpreter_frame_initial_sp_offset;
+
+  // Locals plus the method's maximum expression stack.
+  const int locals_and_stack = (method->max_locals() + method->max_stack()) * Interpreter::stackElementWords;
+  return monitor_words + fixed_words + locals_and_stack + call_stub_words;
+}
+
+// Deoptimization helpers for the asm based interpreter.
+int AbstractInterpreter::size_activation(int max_stack,
+                                         int tempcount,
+                                         int extra_args,
+                                         int moncount,
+                                         int callee_param_count,
+                                         int callee_locals,
+                                         bool is_top_frame) {
+  // This arithmetic must exactly parallel the frame setup done in
+  // TemplateInterpreterGenerator::generate_fixed_frame.
+
+  // Fixed size of an interpreter frame.
+  const int fixed_words = frame::sender_sp_offset - frame::interpreter_frame_initial_sp_offset;
+
+  // Our own locals were accounted for by the caller (or by last_frame_adjust on
+  // the transition), and the callee's parameters are already covered, so only
+  // the callee's locals beyond its parameters are added here.
+  const int extra_local_words = (callee_locals - callee_param_count) * Interpreter::stackElementWords;
+  const int monitor_words = moncount * frame::interpreter_frame_monitor_size();
+  const int temp_words = tempcount * Interpreter::stackElementWords;
+
+  int size = fixed_words + extra_local_words + monitor_words + temp_words + extra_args;
+#ifdef AARCH64
+  // Round up to a whole number of stack-alignment units, expressed in words.
+  size = round_to(size, StackAlignmentInBytes/BytesPerWord);
+#endif // AARCH64
+  return size;
+}
+
+void AbstractInterpreter::layout_activation(Method* method,
+                                            int tempcount,
+                                            int popframe_extra_args,
+                                            int moncount,
+                                            int caller_actual_parameters,
+                                            int callee_param_count,
+                                            int callee_locals,
+                                            frame* caller,
+                                            frame* interpreter_frame,
+                                            bool is_top_frame,
+                                            bool is_bottom_frame) {
+
+  // Fill in the skeletal frame interpreter_frame: method, locals, monitors,
+  // stack top, sender sp, constant pool cache and mirror.
+  // The frame is guaranteed to be the right size, as determined by a previous
+  // call to size_activation(), and to be walkable even in its skeletal state.
+  // NOTE: nothing is returned; the offsets computed below are in words, not bytes.
+
+  // Locals in stack words; extra_locals counts only the non-parameter locals.
+  int max_locals = method->max_locals() * Interpreter::stackElementWords;
+  int extra_locals = (method->max_locals() - method->size_of_parameters()) * Interpreter::stackElementWords;
+
+#ifdef ASSERT
+  assert(caller->sp() == interpreter_frame->sender_sp(), "Frame not properly walkable");
+#endif
+
+  interpreter_frame->interpreter_frame_set_method(method);
+  // NOTE the difference in using sender_sp and interpreter_frame_sender_sp
+  // interpreter_frame_sender_sp is the original sp of the caller (the unextended_sp)
+  // and sender_sp is (fp + sender_sp_offset*wordSize)
+
+#ifdef AARCH64
+  intptr_t* locals;
+  if (caller->is_interpreted_frame()) {
+    // attach locals to the expression stack of caller interpreter frame
+    locals = caller->interpreter_frame_tos_address() + caller_actual_parameters*Interpreter::stackElementWords - 1;
+  } else {
+    assert (is_bottom_frame, "should be");
+    locals = interpreter_frame->fp() + frame::sender_sp_offset + method->max_locals() - 1;  // NOTE(review): max_locals() is not scaled by stackElementWords here, unlike the local max_locals - confirm
+  }
+
+  if (TraceDeoptimization) {
+    tty->print_cr("layout_activation:");
+
+    if (caller->is_entry_frame()) {
+      tty->print("entry ");
+    }
+    if (caller->is_compiled_frame()) {
+      tty->print("compiled ");
+    }
+    if (caller->is_interpreted_frame()) {
+      tty->print("interpreted ");
+    }
+    tty->print_cr("caller: sp=%p, unextended_sp=%p, fp=%p, pc=%p", caller->sp(), caller->unextended_sp(), caller->fp(), caller->pc());
+    tty->print_cr("interpreter_frame: sp=%p, unextended_sp=%p, fp=%p, pc=%p", interpreter_frame->sp(), interpreter_frame->unextended_sp(), interpreter_frame->fp(), interpreter_frame->pc());
+    tty->print_cr("method: max_locals = %d, size_of_parameters = %d", method->max_locals(), method->size_of_parameters());
+    tty->print_cr("caller_actual_parameters = %d", caller_actual_parameters);
+    tty->print_cr("locals = %p", locals);
+  }
+
+#ifdef ASSERT
+  if (caller_actual_parameters != method->size_of_parameters()) {  // the parameter count was adjusted: check that the adjustment is one of the expected kinds
+    assert(caller->is_interpreted_frame(), "adjusted caller_actual_parameters, but caller is not interpreter frame");
+    Bytecode_invoke inv(caller->interpreter_frame_method(), caller->interpreter_frame_bci());
+
+    if (is_bottom_frame) {
+      assert(caller_actual_parameters == 0, "invalid adjusted caller_actual_parameters value for bottom frame");
+      assert(inv.is_invokedynamic() || inv.is_invokehandle(), "adjusted caller_actual_parameters for bottom frame, but not invokedynamic/invokehandle");
+    } else {
+      assert(caller_actual_parameters == method->size_of_parameters()+1, "invalid adjusted caller_actual_parameters value");
+      assert(!inv.is_invokedynamic() && MethodHandles::has_member_arg(inv.klass(), inv.name()), "adjusted caller_actual_parameters, but no member arg");
+    }
+  }
+  if (caller->is_interpreted_frame()) {  // per caller kind, check that the locals area chains correctly to the caller
+    intptr_t* locals_base = (locals - method->max_locals()*Interpreter::stackElementWords + 1);
+    locals_base = (intptr_t*)round_down((intptr_t)locals_base, StackAlignmentInBytes);
+    assert(interpreter_frame->sender_sp() <= locals_base, "interpreter-to-interpreter frame chaining");
+
+  } else if (caller->is_compiled_frame()) {
+    assert(locals + 1 <= caller->unextended_sp(), "compiled-to-interpreter frame chaining");
+
+  } else {
+    assert(caller->is_entry_frame(), "should be");
+    assert(locals + 1 <= caller->fp(), "entry-to-interpreter frame chaining");
+  }
+#endif // ASSERT
+
+#else
+  intptr_t* locals = interpreter_frame->sender_sp() + max_locals - 1;
+#endif // AARCH64
+
+  interpreter_frame->interpreter_frame_set_locals(locals);
+  BasicObjectLock* montop = interpreter_frame->interpreter_frame_monitor_begin();
+  BasicObjectLock* monbot = montop - moncount;  // monitor area ends moncount entries below its beginning
+  interpreter_frame->interpreter_frame_set_monitor_end(monbot);
+
+  // Set last_sp (on AARCH64: stack_top and extended_sp instead)
+  intptr_t* stack_top = (intptr_t*) monbot  -
+    tempcount*Interpreter::stackElementWords -
+    popframe_extra_args;
+#ifdef AARCH64
+  interpreter_frame->interpreter_frame_set_stack_top(stack_top);
+
+  intptr_t* extended_sp = (intptr_t*) monbot  -
+    (method->max_stack() + 1) * Interpreter::stackElementWords - // +1 is reserved slot for exception handler
+    popframe_extra_args;
+  extended_sp = (intptr_t*)round_down((intptr_t)extended_sp, StackAlignmentInBytes);
+  interpreter_frame->interpreter_frame_set_extended_sp(extended_sp);
+#else
+  interpreter_frame->interpreter_frame_set_last_sp(stack_top);
+#endif // AARCH64
+
+  // All frames but the initial (oldest) interpreter frame we fill in have a
+  // value for sender_sp that allows walking the stack but isn't
+  // truly correct. Correct the value here.
+
+#ifdef AARCH64
+  if (caller->is_interpreted_frame()) {
+    intptr_t* sender_sp = (intptr_t*)round_down((intptr_t)caller->interpreter_frame_tos_address(), StackAlignmentInBytes);
+    interpreter_frame->set_interpreter_frame_sender_sp(sender_sp);
+
+  } else {
+    // in case of non-interpreter caller sender_sp of the oldest frame is already
+    // set to valid value
+  }
+#else
+  if (extra_locals != 0 &&
+      interpreter_frame->sender_sp() == interpreter_frame->interpreter_frame_sender_sp() ) {
+    interpreter_frame->set_interpreter_frame_sender_sp(caller->sp() + extra_locals);
+  }
+#endif // AARCH64
+
+  *interpreter_frame->interpreter_frame_cache_addr() =
+    method->constants()->cache();  // constant pool cache of the method
+  *interpreter_frame->interpreter_frame_mirror_addr() =
+    method->method_holder()->java_mirror();  // java mirror of the method's holder class
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/arm.ad	2016-12-02 11:16:52.812691310 -0500
@@ -0,0 +1,14428 @@
+//
+// Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This code is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License version 2 only, as
+// published by the Free Software Foundation.
+//
+// This code is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+//
+
+// ARM Architecture Description File
+
+//----------DEFINITION BLOCK---------------------------------------------------
+// Define name --> value mappings to inform the ADLC of an integer valued name
+// Current support includes integer values in the range [0, 0x7FFFFFFF]
+// Format:
+//        int_def  <name>         ( <int_value>, <expression>);
+// Generated Code in ad_<arch>.hpp
+//        #define  <name>   (<expression>)
+//        // value == <int_value>
+// Generated code in ad_<arch>.cpp adlc_verification()
+//        assert( <name> == <int_value>, "Expect (<expression>) to equal <int_value>");
+//
+definitions %{
+// The default cost (of an ALU instruction), and a cost far larger than any other defined here.
+  int_def DEFAULT_COST      (    100,     100);
+  int_def HUGE_COST         (1000000, 1000000);
+
+// Memory refs are twice as expensive as the default.
+  int_def MEMORY_REF_COST   (    200, DEFAULT_COST * 2);
+
+// Branches and calls are even more expensive: three times the default.
+  int_def BRANCH_COST       (    300, DEFAULT_COST * 3);
+  int_def CALL_COST         (    300, DEFAULT_COST * 3);
+%}
+
+
+//----------SOURCE BLOCK-------------------------------------------------------
+// This is a block of C++ code which provides values, functions, and
+// definitions necessary in the rest of the architecture description
+source_hpp %{
+// Header information of the source block.
+// Method declarations/definitions which are used outside
+// the ad-scope can conveniently be defined here.
+//
+// To keep related declarations/definitions/uses close together,
+// we switch between source %{ }% and source_hpp %{ }% freely as needed.
+
+// Does destination need to be loaded in a register then passed to a
+// branch instruction?
+extern bool maybe_far_call(const CallNode *n);
+extern bool maybe_far_call(const MachCallNode *n);
+static inline bool cache_reachable() {  // forwards to MacroAssembler::_cache_fully_reachable()
+  return MacroAssembler::_cache_fully_reachable();
+}
+
+#ifdef AARCH64  // 32-bit load/store aliases: the _w forms on AARCH64, plain ldr/str otherwise
+#define ldr_32 ldr_w
+#define str_32 str_w
+#else
+#define ldr_32 ldr
+#define str_32 str
+#define tst_32 tst
+#define teq_32 teq
+#endif  // NOTE(review): tst_32/teq_32 are defined only on the non-AARCH64 branch here - presumably provided elsewhere for AARCH64; confirm
+#if 1
+extern bool PrintOptoAssembly;
+#endif
+
+class c2 {  // declaration only; return_value() is defined outside this header block
+public:
+  static OptoRegPair return_value(int ideal_reg);  // NOTE(review): presumably the register pair holding a return value of the given ideal register kind - confirm at the definition
+};
+
+class CallStubImpl {
+
+  //--------------------------------------------------------------
+  //---<  Used for optimization in Compile::Shorten_branches  >---
+  //--------------------------------------------------------------
+
+ public:
+  // Size of a call trampoline stub; always 0 here.
+  static uint size_call_trampoline() {
+    return 0; // no call trampolines on this platform
+  }
+
+  // Number of relocations needed by a call trampoline stub; always 0 here.
+  static uint reloc_call_trampoline() {
+    return 0; // no call trampolines on this platform
+  }
+};
+
+class HandlerImpl {  // emitters for the exception and deopt handlers, plus their sizes
+
+ public:
+
+  static int emit_exception_handler(CodeBuffer &cbuf);
+  static int emit_deopt_handler(CodeBuffer& cbuf);
+
+  static uint size_exception_handler() {
+#ifdef AARCH64
+    // Three instruction slots (ldr_literal; br; pad) followed by one word for the literal.
+    return 3 * Assembler::InstructionSize + wordSize;
+#else
+    return ( 3 * 4 );  // NOTE(review): presumably three 4-byte instructions - confirm against emit_exception_handler
+#endif
+  }
+
+
+  static uint size_deopt_handler() {
+    return ( 9 * 4 );  // NOTE(review): presumably nine 4-byte instructions - confirm against emit_deopt_handler
+  }
+
+};
+
+%}
+
+source %{
+#define __ _masm.
+
+static FloatRegister reg_to_FloatRegister_object(int register_encoding);
+static Register reg_to_register_object(int register_encoding);
+
+
+// ****************************************************************************
+
+// REQUIRED FUNCTIONALITY
+
+// Tells whether a safepoint node takes the polling page address as an input.
+// ARM has no absolute addressing, so the address must be supplied: always true.
+bool SafePointNode::needs_polling_address_input() {
+  return true;
+}
+
+// Emits a breakpoint that is caught by the debugger (an aid for debugging the compiler).
+void emit_break(CodeBuffer &cbuf) {
+  MacroAssembler masm(&cbuf);
+  masm.breakpoint();
+}
+
+#ifndef PRODUCT
+void MachBreakpointNode::format( PhaseRegAlloc *, outputStream *st ) const {
+  st->print("TA");  // the listing text for a breakpoint node is just this mnemonic
+}
+#endif
+
+void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
+  emit_break(cbuf);  // same breakpoint sequence as the free function emit_break
+}
+
+uint MachBreakpointNode::size(PhaseRegAlloc *ra_) const {
+  return MachNode::size(ra_);  // no hand-computed size; defers to the generic MachNode::size
+}
+
+
+void emit_nop(CodeBuffer &cbuf) {  // appends a single nop to cbuf
+  MacroAssembler masm(&cbuf);
+  masm.nop();
+}
+
+
+void emit_call_reloc(CodeBuffer &cbuf, const MachCallNode *n, MachOper *m, RelocationHolder const& rspec) {  // emits the call for node n to m->method() with relocation rspec
+  int ret_addr_offset0 = n->as_MachCall()->ret_addr_offset();  // offset the node predicts; checked by the final assert
+  int call_site_offset = cbuf.insts()->mark_off();  // NOTE(review): read before set_inst_mark() below - presumably the caller already marked the call site; confirm
+  MacroAssembler _masm(&cbuf);
+  __ set_inst_mark(); // needed in emit_to_interp_stub() to locate the call
+  address target = (address)m->method();
+  assert(n->as_MachCall()->entry_point() == target, "sanity");
+  assert(maybe_far_call(n) == !__ reachable_from_cache(target), "sanity");
+  assert(cache_reachable() == __ cache_fully_reachable(), "sanity");
+
+  assert(target != NULL, "need real address");
+
+  int ret_addr_offset = -1;
+  if (rspec.type() == relocInfo::runtime_call_type) {
+    __ call(target, rspec);
+    ret_addr_offset = __ offset();  // return address is the offset right after the call
+  } else {
+    // scratches Rtemp; patchable_call itself reports the return address offset
+    ret_addr_offset = __ patchable_call(target, rspec, true);
+  }
+  assert(ret_addr_offset - call_site_offset == ret_addr_offset0, "fix ret_addr_offset()");
+}
+
+//=============================================================================
+// REQUIRED FUNCTIONALITY for encoding: both emitters have empty bodies here.
+void emit_lo(CodeBuffer &cbuf, int val) {  }
+void emit_hi(CodeBuffer &cbuf, int val) {  }
+
+
+//=============================================================================
+const RegMask& MachConstantBaseNode::_out_RegMask = PTR_REG_mask();
+
+int Compile::ConstantTable::calculate_table_base_offset() const {
+#ifdef AARCH64
+  return 0;  // no bias on AARCH64
+#else
+  // Bias the base by half the table size, limited to the simm10 range:
+  // flds/fldd take an 8-bit offset scaled by 4 (+/- 1024), ldr/ldrb a 12-bit offset (+/- 4096).
+  const int centered = -(size() / 2);
+  if (Assembler::is_simm10(centered)) {
+    return centered;
+  }
+  return Assembler::min_simm10();
+#endif
+}
+
+bool MachConstantBaseNode::requires_postalloc_expand() const { return false; }  // never requests post-allocation expansion
+void MachConstantBaseNode::postalloc_expand(GrowableArray <Node *> *nodes, PhaseRegAlloc *ra_) {
+  ShouldNotReachHere();  // consistent with requires_postalloc_expand() returning false
+}
+
+void MachConstantBaseNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const {
+  Compile::ConstantTable& constant_table = ra_->C->constant_table();
+  MacroAssembler _masm(&cbuf);
+
+  // Register encoded for this node by the allocator; it receives the table base.
+  const Register base_reg = as_Register(ra_->get_encode(this));
+  CodeSection* const consts = __ code()->consts();
+  int consts_size = consts->align_at_start(consts->size());
+  assert(constant_table.size() == consts_size, "must be: %d == %d", constant_table.size(), consts_size);
+
+  // Materialize the constant table base: section start minus the table base offset.
+  address baseaddr = consts->start() + -(constant_table.table_base_offset());
+  RelocationHolder rspec = internal_word_Relocation::spec(baseaddr);
+  __ mov_address(base_reg, baseaddr, rspec);
+}
+
+uint MachConstantBaseNode::size(PhaseRegAlloc*) const {  // NOTE(review): presumably must match the mov_address sequence produced by emit() - confirm
+#ifdef AARCH64
+  return 5 * Assembler::InstructionSize;  // five instruction slots
+#else
+  return 8;  // bytes; NOTE(review): presumably two 4-byte instructions - confirm
+#endif
+}
+
+#ifndef PRODUCT
+void MachConstantBaseNode::format(PhaseRegAlloc* ra_, outputStream* st) const {
+  char reg[128];  // receives the register name written by dump_register
+  ra_->dump_register(this, reg);
+  st->print("MOV_SLOW    &constanttable,%s\t! constant table base", reg);
+}
+#endif
+
+#ifndef PRODUCT
+void MachPrologNode::format( PhaseRegAlloc *ra_, outputStream *st ) const {  // prints a listing of the prolog: nops, stack bang, push, SP adjustment
+  Compile* C = ra_->C;
+
+  for (int i = 0; i < OptoPrologueNops; i++) {
+    st->print_cr("NOP"); st->print("\t");
+  }
+#ifdef AARCH64
+  if (OptoPrologueNops <= 0) {
+    st->print_cr("NOP\t! required for safe patching");
+    st->print("\t");
+  }
+#endif
+
+  size_t framesize = C->frame_size_in_bytes();
+  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
+  int bangsize = C->bang_size_in_bytes();
+  // Remove two words for return addr and fp,
+  framesize -= 2*wordSize;
+  bangsize -= 2*wordSize;
+
+  // Calls to C2R adapters often do not accept exceptional returns.
+  // We require that their callers must bang for them.  But be careful, because
+  // some VM calls (such as call site linkage) can use several kilobytes of
+  // stack.  But the stack safety zone should account for that.
+  // See bugs 4446381, 4468289, 4497237.
+  if (C->need_stack_bang(bangsize)) {
+    st->print_cr("! stack bang (%d bytes)", bangsize); st->print("\t");
+  }
+  st->print_cr("PUSH   R_FP|R_LR_LR"); st->print("\t");
+  if (framesize != 0) {
+    st->print   ("SUB    R_SP, R_SP, " SIZE_FORMAT,framesize);
+  }
+}
+#endif
+
+void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {  // emits the prolog: nops, optional stack bang, push of FP/LR, SP adjustment
+  Compile* C = ra_->C;
+  MacroAssembler _masm(&cbuf);
+
+  for (int i = 0; i < OptoPrologueNops; i++) {
+    __ nop();
+  }
+#ifdef AARCH64
+  if (OptoPrologueNops <= 0) {
+    __ nop(); // required for safe patching by patch_verified_entry()
+  }
+#endif
+
+  size_t framesize = C->frame_size_in_bytes();
+  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
+  int bangsize = C->bang_size_in_bytes();
+  // Remove two words for return addr and fp,
+  framesize -= 2*wordSize;
+  bangsize -= 2*wordSize;
+
+  // Calls to C2R adapters often do not accept exceptional returns.
+  // We require that their callers must bang for them.  But be careful, because
+  // some VM calls (such as call site linkage) can use several kilobytes of
+  // stack.  But the stack safety zone should account for that.
+  // See bugs 4446381, 4468289, 4497237.
+  if (C->need_stack_bang(bangsize)) {
+    __ arm_stack_overflow_check(bangsize, Rtemp);
+  }
+
+  __ raw_push(FP, LR);
+  if (framesize != 0) {
+    __ sub_slow(SP, SP, framesize);  // framesize already excludes the two words pushed above
+  }
+
+  // Record frame_complete only when emitting into the buffer named
+  // "Compile::Fill_buffer"; an offset taken from a scratch buffer is not valid.
+  if (strcmp(cbuf.name(), "Compile::Fill_buffer") == 0) {
+    C->set_frame_complete( __ offset() );
+  }
+
+  if (C->has_mach_constant_base_node()) {
+    // NOTE: We set the table base offset here because users might be
+    // emitted before MachConstantBaseNode.
+    Compile::ConstantTable& constant_table = C->constant_table();
+    constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
+  }
+}
+
+uint MachPrologNode::size(PhaseRegAlloc *ra_) const {
+  return MachNode::size(ra_);  // no hand-computed size; defers to the generic MachNode::size
+}
+
+int MachPrologNode::reloc() const {
+  return 10; // a large enough number (not an exact count)
+}
+
+//=============================================================================
+#ifndef PRODUCT
+void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream *st ) const {
+  // Print the epilog listing: an ADD that releases the frame (less the two
+  // words handled by POP), the POP itself, and the return poll when one is done.
+  size_t frame_bytes = ra_->C->frame_size_in_bytes();
+  frame_bytes -= 2*wordSize;
+
+  if (frame_bytes != 0) {
+    st->print("ADD    R_SP, R_SP, " SIZE_FORMAT "\n\t",frame_bytes);
+  }
+  st->print("POP    R_FP|R_LR_LR");
+
+  // Nothing more to list unless this epilog polls and belongs to a method compilation.
+  if (!do_polling() || !ra_->C->is_method_compilation()) return;
+  st->print("\n\t");
+#ifdef AARCH64
+  if (MacroAssembler::page_reachable_from_cache(os::get_polling_page())) {
+    st->print("ADRP     Rtemp, #PollAddr\t! Load Polling address\n\t");
+    st->print("LDR      ZR,[Rtemp + #PollAddr & 0xfff]\t!Poll for Safepointing");
+  } else {
+    st->print("mov_slow Rtemp, #PollAddr\t! Load Polling address\n\t");
+    st->print("LDR      ZR,[Rtemp]\t!Poll for Safepointing");
+  }
+#else
+  st->print("MOV    Rtemp, #PollAddr\t! Load Polling address\n\t");
+  st->print("LDR    Rtemp,[Rtemp]\t!Poll for Safepointing");
+#endif
+}
+#endif
+
+void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
+  MacroAssembler _masm(&cbuf);
+
+  // Release the frame (less the two words that raw_pop handles), then
+  // restore FP and LR.
+  size_t frame_bytes = ra_->C->frame_size_in_bytes();
+  frame_bytes -= 2*wordSize;
+  if (frame_bytes != 0) {
+    __ add_slow(SP, SP, frame_bytes);
+  }
+  __ raw_pop(FP, LR);
+
+  // The return safepoint poll is emitted only for polling method compilations.
+  if (!do_polling() || !ra_->C->is_method_compilation()) return;
+#ifdef AARCH64
+  if (false && MacroAssembler::page_reachable_from_cache(os::get_polling_page())) {
+/* FIXME: TODO
+      __ relocate(relocInfo::xxx);
+      __ adrp(Rtemp, (intptr_t)os::get_polling_page());
+      __ relocate(relocInfo::poll_return_type);
+      int offset = os::get_polling_page() & 0xfff;
+      __ ldr(ZR, Address(Rtemp + offset));
+*/
+  } else {
+    __ mov_address(Rtemp, (address)os::get_polling_page(), symbolic_Relocation::polling_page_reference);
+    __ relocate(relocInfo::poll_return_type);
+    __ ldr(ZR, Address(Rtemp));
+  }
+#else
+  // Materializing the polling page address is usually one or two instructions
+  __ mov_address(Rtemp, (address)os::get_polling_page(), symbolic_Relocation::polling_page_reference);
+  __ relocate(relocInfo::poll_return_type);
+  __ ldr(Rtemp, Address(Rtemp));
+#endif
+}
+
+uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
+#ifdef AARCH64
+  // Reserve one extra instruction for the alignment nop that mov_address's bind_literal may add
+  return MachNode::size(ra_) + 1 * Assembler::InstructionSize;
+#else
+  return MachNode::size(ra_);
+#endif
+}
+
+int MachEpilogNode::reloc() const {
+  return 16; // a fixed, deliberately generous number rather than an exact count
+}
+
+const Pipeline * MachEpilogNode::pipeline() const {
+  return MachNode::pipeline_class(); // no dedicated pipeline class: use MachNode's
+}
+
+int MachEpilogNode::safepoint_offset() const {
+  assert( do_polling(), "no return for this epilog node");
+  // No offset is computed on this port: the call below flags the method as unimplemented.
+  Unimplemented();
+  return 0;
+}
+
+//=============================================================================
+
+// Classify an OptoReg name: rc_bad (not valid), rc_stack, rc_int or rc_float.
+enum RC { rc_bad, rc_int, rc_float, rc_stack };
+static enum RC rc_class( OptoReg::Name reg ) {
+  if (!OptoReg::is_valid(reg)) return rc_bad;
+  if (OptoReg::is_stack(reg)) return rc_stack;
+  VMReg r = OptoReg::as_VMReg(reg);
+  if (r->is_Register()) return rc_int;
+  assert(r->is_FloatRegister(), "must be"); // anything left must be a float register
+  return rc_float;
+}
+
+static inline bool is_iRegLd_memhd(OptoReg::Name src_first, OptoReg::Name src_second, int offset) {
+#ifdef AARCH64
+  return is_memoryHD(offset);
+#else
+  // Usable only for an even-encoded register immediately followed by its partner.
+  const int lo_enc = Matcher::_regEncode[src_first];
+  const int hi_enc = Matcher::_regEncode[src_second];
+  const bool even_odd_pair = ((lo_enc & 1) == 0) && (hi_enc == lo_enc + 1);
+  if (!even_odd_pair) tty->print_cr("CAUGHT BAD LDRD/STRD");
+  return even_odd_pair && is_memoryHD(offset);
+#endif
+}
+
+uint MachSpillCopyNode::implementation( CodeBuffer *cbuf,
+                                        PhaseRegAlloc *ra_,
+                                        bool do_size,
+                                        outputStream* st ) const {
+  // Get registers to move: first/second halves of the source in(1) and of this node's result
+  OptoReg::Name src_second = ra_->get_reg_second(in(1));
+  OptoReg::Name src_first = ra_->get_reg_first(in(1));
+  OptoReg::Name dst_second = ra_->get_reg_second(this );
+  OptoReg::Name dst_first = ra_->get_reg_first(this );
+
+  enum RC src_second_rc = rc_class(src_second);
+  enum RC src_first_rc = rc_class(src_first);
+  enum RC dst_second_rc = rc_class(dst_second);
+  enum RC dst_first_rc = rc_class(dst_first);
+
+  assert( OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register" );
+
+  // Generate spill code!  'size' accumulates the byte count, 4 per instruction.
+  int size = 0;
+
+  if (src_first == dst_first && src_second == dst_second)
+    return size;            // Self copy, no move
+
+#ifdef TODO
+  if (bottom_type()->isa_vect() != NULL) {
+  }
+#endif
+
+  // Shared code does not expect instruction set capability based bailouts here.
+  // Handle offset unreachable bailout with minimal change in shared code.
+  // Bailout only for real instruction emit.
+  // This requires a single comment change in shared code. ( see output.cpp "Normal" instruction case )
+
+  MacroAssembler _masm(cbuf);
+
+  // --------------------------------------
+  // Check for mem-mem move.  Load into unused float registers and fall into
+  // the float-store case.
+  if (src_first_rc == rc_stack && dst_first_rc == rc_stack) {
+    int offset = ra_->reg2offset(src_first);
+    if (cbuf && !is_memoryfp(offset)) {
+      ra_->C->record_method_not_compilable("unable to handle large constant offsets");
+      return 0;
+    } else {
+      if (src_second_rc != rc_bad) {
+        assert((src_first&1)==0 && src_first+1 == src_second, "pair of registers must be aligned/contiguous");
+        src_first     = OptoReg::Name(R_mem_copy_lo_num);
+        src_second    = OptoReg::Name(R_mem_copy_hi_num);
+        src_first_rc  = rc_float;
+        src_second_rc = rc_float;
+        if (cbuf) {
+          __ ldr_double(Rmemcopy, Address(SP, offset));
+        } else if (!do_size) {
+          st->print(LDR_DOUBLE "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first),offset);
+        }
+      } else {
+        src_first     = OptoReg::Name(R_mem_copy_lo_num);
+        src_first_rc  = rc_float;
+        if (cbuf) {
+          __ ldr_float(Rmemcopy, Address(SP, offset));
+        } else if (!do_size) {
+          st->print(LDR_FLOAT "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first),offset);
+        }
+      }
+      size += 4;
+    }
+  }
+
+  if (src_second_rc == rc_stack && dst_second_rc == rc_stack) {
+    Unimplemented();
+  }
+
+  // --------------------------------------
+  // Check for integer reg-reg copy
+  if (src_first_rc == rc_int && dst_first_rc == rc_int) {
+    // Else normal reg-reg copy
+    assert( src_second != dst_first, "smashed second before evacuating it" );
+    if (cbuf) {
+      __ mov(reg_to_register_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[src_first]));
+#ifndef PRODUCT
+    } else if (!do_size) {
+      st->print("MOV    R_%s, R_%s\t# spill",
+                Matcher::regName[dst_first],
+                Matcher::regName[src_first]);
+#endif
+    }
+#ifdef AARCH64
+    if (src_first+1 == src_second && dst_first+1 == dst_second) {
+      return size + 4;
+    }
+#endif
+    size += 4;
+  }
+
+  // Check for integer store
+  if (src_first_rc == rc_int && dst_first_rc == rc_stack) {
+    int offset = ra_->reg2offset(dst_first);
+    if (cbuf && !is_memoryI(offset)) {
+      ra_->C->record_method_not_compilable("unable to handle large constant offsets");
+      return 0;
+    } else {
+      if (src_second_rc != rc_bad && is_iRegLd_memhd(src_first, src_second, offset)) {
+        assert((src_first&1)==0 && src_first+1 == src_second, "pair of registers must be aligned/contiguous");
+        if (cbuf) {
+          __ str_64(reg_to_register_object(Matcher::_regEncode[src_first]), Address(SP, offset));
+#ifndef PRODUCT
+        } else if (!do_size) {
+          if (size != 0) st->print("\n\t");
+          st->print(STR_64 "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first), offset);
+#endif
+        }
+        return size + 4;
+      } else {
+        if (cbuf) {
+          __ str_32(reg_to_register_object(Matcher::_regEncode[src_first]), Address(SP, offset));
+#ifndef PRODUCT
+        } else if (!do_size) {
+          if (size != 0) st->print("\n\t");
+          st->print(STR_32 "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first), offset);
+#endif
+        }
+      }
+    }
+    size += 4;
+  }
+
+  // Check for integer load
+  if (dst_first_rc == rc_int && src_first_rc == rc_stack) {
+    int offset = ra_->reg2offset(src_first);
+    if (cbuf && !is_memoryI(offset)) {
+      ra_->C->record_method_not_compilable("unable to handle large constant offsets");
+      return 0;
+    } else {
+      if (src_second_rc != rc_bad && is_iRegLd_memhd(dst_first, dst_second, offset)) {
+        assert((src_first&1)==0 && src_first+1 == src_second, "pair of registers must be aligned/contiguous");
+        if (cbuf) {
+          __ ldr_64(reg_to_register_object(Matcher::_regEncode[dst_first]), Address(SP, offset));
+#ifndef PRODUCT
+        } else if (!do_size) {
+          if (size != 0) st->print("\n\t");
+          st->print(LDR_64 "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(dst_first), offset);
+#endif
+        }
+        return size + 4;
+      } else {
+        if (cbuf) {
+          __ ldr_32(reg_to_register_object(Matcher::_regEncode[dst_first]), Address(SP, offset));
+#ifndef PRODUCT
+        } else if (!do_size) {
+          if (size != 0) st->print("\n\t");
+          st->print(LDR_32 "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(dst_first), offset);
+#endif
+        }
+      }
+    }
+    size += 4;
+  }
+
+  // Check for float reg-reg copy
+  if (src_first_rc == rc_float && dst_first_rc == rc_float) {
+    if (src_second_rc != rc_bad) {
+      assert((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers must be aligned/contiguous");
+      if (cbuf) {
+      __ mov_double(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first]));
+#ifndef PRODUCT
+      } else if (!do_size) {
+        st->print(MOV_DOUBLE "    R_%s, R_%s\t# spill",
+                  Matcher::regName[dst_first],
+                  Matcher::regName[src_first]);
+#endif
+      }
+      return 4;
+    }
+    if (cbuf) {
+      __ mov_float(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first]));
+#ifndef PRODUCT
+    } else if (!do_size) {
+      st->print(MOV_FLOAT "    R_%s, R_%s\t# spill",
+                Matcher::regName[dst_first],
+                Matcher::regName[src_first]);
+#endif
+    }
+    size = 4;
+  }
+
+  // Check for float store
+  if (src_first_rc == rc_float && dst_first_rc == rc_stack) {
+    int offset = ra_->reg2offset(dst_first);
+    if (cbuf && !is_memoryfp(offset)) {
+      ra_->C->record_method_not_compilable("unable to handle large constant offsets");
+      return 0;
+    } else {
+      // Further check for aligned-adjacent pair, so we can use a double store
+      if (src_second_rc != rc_bad) {
+        assert((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers and stack slots must be aligned/contiguous");
+        if (cbuf) {
+          __ str_double(reg_to_FloatRegister_object(Matcher::_regEncode[src_first]), Address(SP, offset));
+#ifndef PRODUCT
+        } else if (!do_size) {
+          if (size != 0) st->print("\n\t");
+          st->print(STR_DOUBLE "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first),offset);
+#endif
+        }
+        return size + 4;
+      } else {
+        if (cbuf) {
+          __ str_float(reg_to_FloatRegister_object(Matcher::_regEncode[src_first]), Address(SP, offset));
+#ifndef PRODUCT
+        } else if (!do_size) {
+          if (size != 0) st->print("\n\t");
+          st->print(STR_FLOAT "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first),offset);
+#endif
+        }
+      }
+    }
+    size += 4;
+  }
+
+  // Check for float load
+  if (dst_first_rc == rc_float && src_first_rc == rc_stack) {
+    int offset = ra_->reg2offset(src_first);
+    if (cbuf && !is_memoryfp(offset)) {
+      ra_->C->record_method_not_compilable("unable to handle large constant offsets");
+      return 0;
+    } else {
+      // Further check for aligned-adjacent pair, so we can use a double load
+      if (src_second_rc != rc_bad) {
+        assert((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers and stack slots must be aligned/contiguous");
+        if (cbuf) {
+          __ ldr_double(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), Address(SP, offset));
+#ifndef PRODUCT
+        } else if (!do_size) {
+          if (size != 0) st->print("\n\t");
+          st->print(LDR_DOUBLE "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(dst_first),offset);
+#endif
+        }
+        return size + 4;
+      } else {
+        if (cbuf) {
+          __ ldr_float(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), Address(SP, offset));
+#ifndef PRODUCT
+        } else if (!do_size) {
+          if (size != 0) st->print("\n\t");
+          st->print(LDR_FLOAT "   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(dst_first),offset);
+#endif
+        }
+      }
+    }
+    size += 4;
+  }
+
+  // check for int reg -> float reg move
+  if (src_first_rc == rc_int && dst_first_rc == rc_float) {
+    // Further check for aligned-adjacent pair, so we can use a single instruction
+    if (src_second_rc != rc_bad) {
+      assert((dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers must be aligned/contiguous");
+      assert((src_first&1)==0 && src_first+1 == src_second, "pairs of registers must be aligned/contiguous");
+      assert(src_second_rc == rc_int && dst_second_rc == rc_float, "unsupported");
+      if (cbuf) {
+#ifdef AARCH64
+        __ fmov_dx(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[src_first]));
+#else
+        __ fmdrr(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[src_first]), reg_to_register_object(Matcher::_regEncode[src_second]));
+#endif
+#ifndef PRODUCT
+      } else if (!do_size) {
+        if (size != 0) st->print("\n\t");
+#ifdef AARCH64
+        st->print("FMOV_DX   R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(src_first));
+#else
+        st->print("FMDRR   R_%s, R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(src_first), OptoReg::regname(src_second));
+#endif
+#endif
+      }
+      return size + 4;
+    } else {
+      if (cbuf) {
+        __ fmsr(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[src_first]));
+#ifndef PRODUCT
+      } else if (!do_size) {
+        if (size != 0) st->print("\n\t");
+        st->print(FMSR "   R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(src_first));
+#endif
+      }
+      size += 4;
+    }
+  }
+
+  // check for float reg -> int reg move
+  if (src_first_rc == rc_float && dst_first_rc == rc_int) {
+    // Further check for aligned-adjacent pair, so we can use a single instruction
+    if (src_second_rc != rc_bad) {
+      assert((src_first&1)==0 && src_first+1 == src_second, "pairs of registers must be aligned/contiguous");
+      assert((dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers must be aligned/contiguous");
+      assert(src_second_rc == rc_float && dst_second_rc == rc_int, "unsupported");
+      if (cbuf) {
+#ifdef AARCH64
+        __ fmov_xd(reg_to_register_object(Matcher::_regEncode[dst_first]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first]));
+#else
+        __ fmrrd(reg_to_register_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[dst_second]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first]));
+#endif
+#ifndef PRODUCT
+      } else if (!do_size) {
+        if (size != 0) st->print("\n\t");
+#ifdef AARCH64
+        st->print("FMOV_XD R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(src_first));
+#else
+        st->print("FMRRD   R_%s, R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(dst_second), OptoReg::regname(src_first));
+#endif
+#endif
+      }
+      return size + 4;
+    } else {
+      if (cbuf) {
+        __ fmrs(reg_to_register_object(Matcher::_regEncode[dst_first]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first]));
+#ifndef PRODUCT
+      } else if (!do_size) {
+        if (size != 0) st->print("\n\t");
+        st->print(FMRS "   R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(src_first));
+#endif
+      }
+      size += 4;
+    }
+  }
+
+  // --------------------------------------------------------------------
+  // Check for hi bits still needing moving.  Only happens for misaligned
+  // arguments to native calls.
+  if (src_second == dst_second)
+    return size;               // Self copy; no move
+  assert( src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad" );
+
+#ifndef AARCH64
+  // Check for integer reg-reg copy.  Hi bits are stuck up in the top
+  // 32-bits of a 64-bit register, but are needed in low bits of another
+  // register (else it's a hi-bits-to-hi-bits copy which should have
+  // happened already as part of a 64-bit move)
+  if (src_second_rc == rc_int && dst_second_rc == rc_int) {
+    if (cbuf) {
+      __ mov(reg_to_register_object(Matcher::_regEncode[dst_second]), reg_to_register_object(Matcher::_regEncode[src_second]));
+#ifndef PRODUCT
+    } else if (!do_size) {
+      if (size != 0) st->print("\n\t");
+      st->print("MOV    R_%s, R_%s\t# spill high",
+                Matcher::regName[dst_second],
+                Matcher::regName[src_second]);
+#endif
+    }
+    return size+4;
+  }
+
+  // Check for high word integer store
+  if (src_second_rc == rc_int && dst_second_rc == rc_stack) {
+    int offset = ra_->reg2offset(dst_second);
+
+    if (cbuf && !is_memoryP(offset)) {
+      ra_->C->record_method_not_compilable("unable to handle large constant offsets");
+      return 0;
+    } else {
+      if (cbuf) {
+        __ str(reg_to_register_object(Matcher::_regEncode[src_second]), Address(SP, offset));
+#ifndef PRODUCT
+      } else if (!do_size) {
+        if (size != 0) st->print("\n\t");
+        st->print("STR   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_second), offset);
+#endif
+      }
+    }
+    return size + 4;
+  }
+
+  // Check for high word integer load
+  if (dst_second_rc == rc_int && src_second_rc == rc_stack) {
+    int offset = ra_->reg2offset(src_second);
+    if (cbuf && !is_memoryP(offset)) {
+      ra_->C->record_method_not_compilable("unable to handle large constant offsets");
+      return 0;
+    } else {
+      if (cbuf) {
+        __ ldr(reg_to_register_object(Matcher::_regEncode[dst_second]), Address(SP, offset));
+#ifndef PRODUCT
+      } else if (!do_size) {
+        if (size != 0) st->print("\n\t");
+        st->print("LDR   R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(dst_second), offset);
+#endif
+      }
+    }
+    return size + 4;
+  }
+#endif
+
+  Unimplemented();
+  return 0; // Mute compiler
+}
+
+#ifndef PRODUCT
+void MachSpillCopyNode::format( PhaseRegAlloc *ra_, outputStream *st ) const {
+  implementation( NULL, ra_, false, st ); // no code buffer, do_size off: print the spill to st
+}
+#endif
+
+void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
+  implementation( &cbuf, ra_, false, NULL ); // code buffer supplied: emit instructions, no stream
+}
+
+uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
+  return implementation( NULL, ra_, true, NULL ); // do_size: neither emits nor prints, returns the byte count
+}
+
+//=============================================================================
+#ifndef PRODUCT
+void MachNopNode::format( PhaseRegAlloc *, outputStream *st ) const {
+  st->print("NOP \t# %d bytes pad for loops and calls", 4 * _count); // 4 bytes per nop, as in size()
+}
+#endif
+
+void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc * ) const {
+  MacroAssembler _masm(&cbuf);
+  for (int emitted = 0; _count > emitted; ++emitted) {
+    __ nop(); // one nop per unit of _count
+  }
+}
+
+uint MachNopNode::size(PhaseRegAlloc *ra_) const {
+  return 4 * _count; // emit() writes _count nops, counted as 4 bytes each
+}
+
+
+//=============================================================================
+#ifndef PRODUCT
+void BoxLockNode::format( PhaseRegAlloc *ra_, outputStream *st ) const {
+  const int dst_reg = ra_->get_reg_first(this);
+  const int box_offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
+  st->print("ADD    %s,R_SP+#%d",Matcher::regName[dst_reg], box_offset);
+}
+#endif
+
+void BoxLockNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
+  MacroAssembler _masm(&cbuf);
+  const int box_offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
+  const int dst_enc = ra_->get_encode(this);
+  Register dst = reg_to_register_object(dst_enc);
+
+  if (is_aimm(box_offset)) {
+    __ add(dst, SP, box_offset); // offset is usable as an add immediate
+    return;
+  }
+  // Otherwise materialize the offset in dst first, then add SP to it.
+  __ mov_slow(dst, box_offset);
+#ifdef AARCH64
+  __ add(dst, SP, dst, ex_lsl);
+#else
+  __ add(dst, SP, dst);
+#endif
+}
+
+uint BoxLockNode::size(PhaseRegAlloc *ra_) const {
+  // BoxLockNode is not a MachNode, so we can't just call MachNode::size(ra_);
+  assert(ra_ == ra_->C->regalloc(), "sanity");
+  return ra_->C->scratch_emit_size(this); // ask the compile object for the scratch-emit size instead
+}
+
+//=============================================================================
+#ifndef PRODUCT
+#ifdef AARCH64
+#define R_RTEMP "R_R16"
+#else
+#define R_RTEMP "R_R12"
+#endif
+void MachUEPNode::format( PhaseRegAlloc *ra_, outputStream *st ) const {
+  st->print_cr("\nUEP:");
+  if (UseCompressedClassPointers) { // compressed klass: listed as a LDR_w followed by decode_klass
+    st->print_cr("\tLDR_w " R_RTEMP ",[R_R0 + oopDesc::klass_offset_in_bytes]\t! Inline cache check");
+    st->print_cr("\tdecode_klass " R_RTEMP);
+  } else {
+    st->print_cr("\tLDR   " R_RTEMP ",[R_R0 + oopDesc::klass_offset_in_bytes]\t! Inline cache check");
+  }
+  st->print_cr("\tCMP   " R_RTEMP ",R_R8" );
+  st->print   ("\tB.NE  SharedRuntime::handle_ic_miss_stub"); // last line: no trailing newline
+}
+#endif
+
+void MachUEPNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
+  MacroAssembler _masm(&cbuf);
+  Register iCache  = reg_to_register_object(Matcher::inline_cache_reg_encode());
+  assert(iCache == Ricklass, "should be");
+  Register receiver = R0;
+
+  __ load_klass(Rtemp, receiver);
+  __ cmp(Rtemp, iCache); // receiver's klass against the inline cache register
+#ifdef AARCH64
+  Label match;
+  __ b(match, eq); // on equality, branch over the jump to the ic-miss stub
+  __ jump(SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type, Rtemp);
+  __ bind(match);
+#else
+  __ jump(SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type, noreg, ne); // jump issued with the ne condition
+#endif
+}
+
+uint MachUEPNode::size(PhaseRegAlloc *ra_) const {
+  return MachNode::size(ra_); // no UEP-specific sizing: defer to MachNode::size()
+}
+
+
+//=============================================================================
+
+// Emit exception handler code.
+int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
+  MacroAssembler _masm(&cbuf);
+
+  // Open a stub of size_exception_handler() bytes; a NULL start is reported as a failure.
+  if (__ start_a_stub(size_exception_handler()) == NULL) {
+    ciEnv::current()->record_failure("CodeCache is full");
+    return 0;  // CodeBuffer::expand failed
+  }
+
+  const int offset = __ offset();
+
+  // OK to trash LR, because exception blob will kill it
+  __ jump(OptoRuntime::exception_blob()->entry_point(), relocInfo::runtime_call_type, LR_tmp);
+
+  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
+
+  __ end_a_stub();
+
+  return offset;
+}
+
+int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
+  // Can't use any of the current frame's registers as we may have deopted
+  // at a poll and everything can be live.
+  MacroAssembler _masm(&cbuf);
+
+  // Open a stub of size_deopt_handler() bytes; a NULL start is reported as a failure.
+  if (__ start_a_stub(size_deopt_handler()) == NULL) {
+    ciEnv::current()->record_failure("CodeCache is full");
+    return 0;  // CodeBuffer::expand failed
+  }
+
+  const int offset = __ offset();
+  const address deopt_pc = __ pc();
+
+#ifdef AARCH64
+  // See LR saved by caller in sharedRuntime_arm.cpp
+  // see also hse1 ws
+  // see also LIR_Assembler::emit_deopt_handler
+
+  __ raw_push(LR, LR); // preserve LR in both slots
+  __ mov_relative_address(LR, deopt_pc);
+  __ str(LR, Address(SP, 1 * wordSize)); // save deopt PC
+  // OK to kill LR, because deopt blob will restore it from SP[0]
+  __ jump(SharedRuntime::deopt_blob()->unpack(), relocInfo::runtime_call_type, LR_tmp);
+#else
+  __ sub(SP, SP, wordSize); // make room for saved PC
+  __ push(LR); // save LR that may be live when we get here
+  __ mov_relative_address(LR, deopt_pc);
+  __ str(LR, Address(SP, wordSize)); // save deopt PC
+  __ pop(LR); // restore LR
+  __ jump(SharedRuntime::deopt_blob()->unpack(), relocInfo::runtime_call_type, noreg);
+#endif
+
+  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
+
+  __ end_a_stub();
+  return offset;
+}
+
+const bool Matcher::match_rule_supported(int opcode) {
+  if (!has_match_rule(opcode))
+    return false; // no match rule at all for this opcode
+
+  switch (opcode) {
+  case Op_PopCountI:
+  case Op_PopCountL:
+    if (!UsePopCountInstruction)
+      return false;
+    break; // otherwise falls out of the switch to the default 'true'
+  case Op_LShiftCntV:
+  case Op_RShiftCntV:
+  case Op_AddVB:
+  case Op_AddVS:
+  case Op_AddVI:
+  case Op_AddVL:
+  case Op_SubVB:
+  case Op_SubVS:
+  case Op_SubVI:
+  case Op_SubVL:
+  case Op_MulVS:
+  case Op_MulVI:
+  case Op_LShiftVB:
+  case Op_LShiftVS:
+  case Op_LShiftVI:
+  case Op_LShiftVL:
+  case Op_RShiftVB:
+  case Op_RShiftVS:
+  case Op_RShiftVI:
+  case Op_RShiftVL:
+  case Op_URShiftVB:
+  case Op_URShiftVS:
+  case Op_URShiftVI:
+  case Op_URShiftVL:
+  case Op_AndV:
+  case Op_OrV:
+  case Op_XorV:
+    return VM_Version::has_simd(); // every opcode listed above depends on SIMD
+  case Op_LoadVector:
+  case Op_StoreVector:
+  case Op_AddVF:
+  case Op_SubVF:
+  case Op_MulVF:
+#ifdef AARCH64
+    return VM_Version::has_simd();
+#else
+    return VM_Version::has_vfp() || VM_Version::has_simd(); // either unit is enough for this group
+#endif
+  case Op_AddVD:
+  case Op_SubVD:
+  case Op_MulVD:
+  case Op_DivVF:
+  case Op_DivVD:
+#ifdef AARCH64
+    return VM_Version::has_simd();
+#else
+    return VM_Version::has_vfp(); // this group is gated on VFP only
+#endif
+  }
+
+  return true;  // Per default match rules are supported.
+}
+
+const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
+
+  // TODO
+  // identify extra cases that we might want to provide match rules for
+  // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
+  // Until such rules are added here, vlen is ignored and the answer is
+  // whatever match_rule_supported() reports for the opcode.
+
+  return match_rule_supported(opcode);
+}
+
+const bool Matcher::has_predicated_vectors(void) {
+  return false; // unconditionally reported as absent
+}
+
+const int Matcher::float_pressure(int default_pressure_threshold) {
+  return default_pressure_threshold; // the caller's default is passed through unchanged
+}
+
+int Matcher::regnum_to_fpu_offset(int regnum) {
+  return regnum - 32; // The FP registers are in the second chunk, hence the fixed bias of 32
+}
+
+// Vector width in bytes: MaxVectorSize for every element type (bt is not consulted).
+const int Matcher::vector_width_in_bytes(BasicType bt) {
+  return MaxVectorSize;
+}
+
+// Ideal register for a vector of the given size in bytes:
+// 8 maps to Op_VecD and 16 maps to Op_VecX.
+const int Matcher::vector_ideal_reg(int size) {
+  assert(MaxVectorSize >= size, "");
+  if (size == 8)  return Op_VecD;
+  if (size == 16) return Op_VecX;
+  // No other size is expected here.
+  ShouldNotReachHere();
+  return 0;
+}
+
+const int Matcher::vector_shift_count_ideal_reg(int size) {
+  return vector_ideal_reg(size); // shift counts use the same ideal register as a vector of that size
+}
+
+// Upper limit on vector size (number of elements) loaded into a vector.
+const int Matcher::max_vector_size(const BasicType bt) {
+  assert(is_java_primitive(bt), "only primitive type vectors");
+  return vector_width_in_bytes(bt)/type2aelembytes(bt); // vector width divided by type2aelembytes(bt)
+}
+
+const int Matcher::min_vector_size(const BasicType bt) {
+  assert(is_java_primitive(bt), "only primitive type vectors");
+  return 8/type2aelembytes(bt); // a fixed 8 divided by type2aelembytes(bt)
+}
+
+// ARM doesn't support misaligned vector stores/loads.
+const bool Matcher::misaligned_vectors_ok() {
+  return false;
+}
+
+// ARM doesn't support AES intrinsics, so this always answers false.
+const bool Matcher::pass_original_key_for_aes() {
+  return false;
+}
+
+const bool Matcher::convL2FSupported(void) {
+#ifdef AARCH64
+  return true; // reported as supported on AArch64 builds only
+#else
+  return false;
+#endif
+}
+
+// Is this branch offset short enough that a short branch can be used?
+//
+// NOTE: If the platform does not provide any short branch variants, then
+//       this method should return false for offset 0.
+bool Matcher::is_short_branch_offset(int rule, int br_size, int offset) {
+  // The passed offset is relative to the address of the branch.
+  // On ARM a branch displacement is calculated relative to the address
+  // of the branch + 8, so a real check would look like:
+  //   offset -= 8;
+  //   return (Assembler::is_simm24(offset));
+  // It is disabled: every offset is currently reported as not short.
+  return false;
+}
+
+const bool Matcher::isSimpleConstant64(jlong value) {
+  // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?
+#ifdef AARCH64
+  return (value == 0); // only the constant zero qualifies
+#else
+  return false;
+#endif
+}
+
+// No scaling for the parameter of the ClearArray node.
+const bool Matcher::init_array_count_is_in_bytes = true;
+
+#ifdef AARCH64
+const int Matcher::long_cmove_cost() { return 1; }
+#else
+// Needs 2 CMOV's for longs, hence a cost of 2.
+const int Matcher::long_cmove_cost() { return 2; }
+#endif
+
+#ifdef AARCH64
+const int Matcher::float_cmove_cost() { return 1; }
+#else
+// CMOVF/CMOVD are expensive on ARM: their cost is set to ConditionalMoveLimit.
+const int Matcher::float_cmove_cost() { return ConditionalMoveLimit; }
+#endif
+
+// Does the CPU require late expand (see block.cpp for description of late expand)?  Not here.
+const bool Matcher::require_postalloc_expand = false;
+
+// Do we need to mask the count passed to shift instructions, or does
+// the CPU only look at the lower 5/6 bits anyway?
+// FIXME: does this handle vector shifts as well?
+#ifdef AARCH64
+const bool Matcher::need_masked_shift_count = false;
+#else
+const bool Matcher::need_masked_shift_count = true;
+#endif
+
+const bool Matcher::convi2l_type_required = true; // same value for both the AARCH64 and non-AARCH64 build
+
+// Should the Matcher clone shifts on addressing modes, expecting them
+// to be subsumed into complex addressing expressions, or compute them
+// into registers?  The decision is left to clone_base_plus_offset_address().
+bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
+  return clone_base_plus_offset_address(m, mstack, address_visited);
+}
+
+void Compile::reshape_address(AddPNode* addp) { // intentionally empty: addp is left untouched
+}
+
+bool Matcher::narrow_oop_use_complex_address() {
+  NOT_LP64(ShouldNotCallThis()); // presumably only active on non-LP64 builds, where this must not be called
+  assert(UseCompressedOops, "only for compressed oops code");
+  return false;
+}
+
+bool Matcher::narrow_klass_use_complex_address() {
+  NOT_LP64(ShouldNotCallThis()); // presumably only active on non-LP64 builds, where this must not be called
+  assert(UseCompressedClassPointers, "only for compressed klass code");
+  return false;
+}
+
+bool Matcher::const_oop_prefer_decode() {
+  NOT_LP64(ShouldNotCallThis()); // presumably only active on non-LP64 builds, where this must not be called
+  return true;
+}
+
+bool Matcher::const_klass_prefer_decode() {
+  NOT_LP64(ShouldNotCallThis()); // presumably only active on non-LP64 builds, where this must not be called
+  return true;
+}
+
+// Is it better to copy float constants, or load them directly from memory?
+// Intel can load a float constant from a direct address, requiring no
+// extra registers.  Most RISCs have to materialize an address into a
+// register first, so they do better copying the constant from the stack.
+const bool Matcher::rematerialize_float_constants = false;
+
+// If the CPU can load and store mis-aligned doubles directly then no fixup is
+// needed.  Else we split the double into 2 integer pieces and move it
+// piece-by-piece.  Only happens when passing doubles into C code as the
+// Java calling convention forces doubles to be aligned.
+#ifdef AARCH64
+// On stack replacement support:
+// We don't need Load[DL]_unaligned support, because the interpreter stack
+// has correct alignment
+const bool Matcher::misaligned_doubles_ok = true;
+#else
+const bool Matcher::misaligned_doubles_ok = false;
+#endif
+
+// No-op on ARM: node and idx are ignored.
+void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
+}
+
+// Advertise here if the CPU requires explicit rounding operations
+// to implement the UseStrictFP mode.  This port does not.
+const bool Matcher::strict_fp_requires_explicit_rounding = false;
+
+// Are floats converted to double when stored to stack during deoptimization?
+// ARM does not handle callee-save floats.
+bool Matcher::float_in_double() {
+  return false;  // the answer is unconditionally "no"
+}
+
+// Do ints take an entire long register or just half?
+// Note that we if-def off of _LP64.
+// The relevant question is how the int is callee-saved.  In _LP64
+// the whole long is written but de-opt'ing will have to extract
+// the relevant 32 bits; in not-_LP64 only the low 32 bits are written.
+#ifdef _LP64
+const bool Matcher::int_in_long = true;   // _LP64: an int occupies a whole long register
+#else
+const bool Matcher::int_in_long = false;  // not-_LP64: an int occupies only the low 32 bits
+#endif
+
+// Tells whether the allocator register 'reg' is ever used as an argument.
+// Used on startup to build the trampoline stubs in generateOptoStub.
+// Registers not accepted here will be killed by the VM call in the trampoline,
+// and arguments in those registers will not be available to the callee.
+bool Matcher::can_be_java_arg( int reg ) {
+#ifdef AARCH64
+  // Integer candidates: from R0 (inclusive) up to R8 (exclusive).
+  const bool gpr_arg = (R_R0_num <= reg) && (reg < R_R8_num);
+  // FP candidates: V0 .. V7b, restricted to numbers with (reg & 3) < 2.
+  const bool fpr_arg = (R_V0_num <= reg) && (reg <= R_V7b_num) && ((reg & 3) < 2);
+#else
+  // Integer candidates: exactly R0, R1, R2 and R3.
+  const bool gpr_arg = (reg == R_R0_num) || (reg == R_R1_num) ||
+                       (reg == R_R2_num) || (reg == R_R3_num);
+  // FP candidates: S0 through S13, both ends inclusive.
+  const bool fpr_arg = (R_S0_num <= reg) && (reg <= R_S13_num);
+#endif
+  return gpr_arg || fpr_arg;
+}
+
+bool Matcher::is_spillable_arg( int reg ) {
+  return can_be_java_arg(reg);  // same register set as can_be_java_arg()
+}
+
+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+  return false;  // 'divisor' is ignored: the answer is "no" for every value
+}
+
+// Register for DIVI projection of divmodI
+RegMask Matcher::divI_proj_mask() {
+  ShouldNotReachHere();  // calling this is treated as a VM error
+  return RegMask();      // empty mask; only present to satisfy the return type
+}
+
+// Register for MODI projection of divmodI
+RegMask Matcher::modI_proj_mask() {
+  ShouldNotReachHere();  // calling this is treated as a VM error
+  return RegMask();      // empty mask; only present to satisfy the return type
+}
+
+// Register for DIVL projection of divmodL
+RegMask Matcher::divL_proj_mask() {
+  ShouldNotReachHere();  // calling this is treated as a VM error
+  return RegMask();      // empty mask; only present to satisfy the return type
+}
+
+// Register for MODL projection of divmodL
+RegMask Matcher::modL_proj_mask() {
+  ShouldNotReachHere();  // calling this is treated as a VM error
+  return RegMask();      // empty mask; only present to satisfy the return type
+}
+
+const RegMask Matcher::method_handle_invoke_SP_save_mask() {
+  return FP_REGP_mask();  // mask of the FP_REGP register class
+}
+
+bool maybe_far_call(const CallNode *n) {  // true unless the entry point is reported reachable
+  return !MacroAssembler::_reachable_from_cache(n->as_Call()->entry_point());
+}
+
+bool maybe_far_call(const MachCallNode *n) {  // true unless the entry point is reported reachable
+  return !MacroAssembler::_reachable_from_cache(n->as_MachCall()->entry_point());
+}
+
+%}
+
+//----------ENCODING BLOCK-----------------------------------------------------
+// This block specifies the encoding classes used by the compiler to output
+// byte streams.  Encoding classes are parameterized macros used by
+// Machine Instruction Nodes in order to generate the bit encoding of the
+// instruction.  Operands specify their base encoding interface with the
+// interface keyword.  Four interfaces are currently supported:
+// REG_INTER, CONST_INTER, MEMORY_INTER, & COND_INTER.  REG_INTER causes an
+// operand to generate a function which returns its register number when
+// queried.   CONST_INTER causes an operand to generate a function which
+// returns the value of the constant when queried.  MEMORY_INTER causes an
+// operand to generate four functions which return the Base Register, the
+// Index Register, the Scale Value, and the Offset Value of the operand when
+// queried.  COND_INTER causes an operand to generate six functions which
+// return the encoding code (ie - encoding bits for the instruction)
+// associated with each basic boolean condition for a conditional instruction.
+//
+// Instructions specify two basic values for encoding.  Again, a function
+// is available to check if the constant displacement is an oop. They use the
+// ins_encode keyword to specify their encoding classes (which must be
+// a sequence of enc_class names, and their parameters, specified in
+// the encoding block), and they use the
+// opcode keyword to specify, in order, their primary, secondary, and
+// tertiary opcode.  Only the opcode sections which a particular instruction
+// needs for encoding need to be specified.
+encode %{
+  enc_class call_epilog %{
+    // Empty encoding class: nothing is emitted.
+  %}
+
+  enc_class Java_To_Runtime (method meth) %{
+    // CALL directly to the runtime entry 'meth', tagged with a runtime_call relocation
+    emit_call_reloc(cbuf, as_MachCall(), $meth, runtime_call_Relocation::spec());
+  %}
+
+  enc_class Java_Static_Call (method meth) %{
+    // CALL to fixup routine.  Fixup routine uses ScopeDesc info to determine
+    // who we intended to call.
+
+    if ( !_method) {  // no method recorded on this node: emit as a runtime call
+      emit_call_reloc(cbuf, as_MachCall(), $meth, runtime_call_Relocation::spec());
+    } else {
+      int method_index = resolved_method_index(cbuf);
+      RelocationHolder rspec = _optimized_virtual ? opt_virtual_call_Relocation::spec(method_index)
+                                                  : static_call_Relocation::spec(method_index);
+      emit_call_reloc(cbuf, as_MachCall(), $meth, rspec);
+
+      // Emit stubs for static call.
+      address stub = CompiledStaticCall::emit_to_interp_stub(cbuf);
+      if (stub == NULL) {  // stub could not be emitted: record the failure and stop encoding
+        ciEnv::current()->record_failure("CodeCache is full");
+        return;
+      }
+    }
+  %}
+
+  enc_class save_last_PC %{
+    // preserve mark (it is put back once the two instructions are emitted)
+    address mark = cbuf.insts()->mark();
+    debug_only(int off0 = cbuf.insts_size());
+    MacroAssembler _masm(&cbuf);
+    int ret_addr_offset = as_MachCall()->ret_addr_offset();
+    __ adr(LR, mark + ret_addr_offset);  // LR = mark + the call's return address offset
+    __ str(LR, Address(Rthread, JavaThread::last_Java_pc_offset()));  // store it as the thread's last Java pc
+    debug_only(int off1 = cbuf.insts_size());
+    assert(off1 - off0 == 2 * Assembler::InstructionSize, "correct size prediction");  // exactly adr + str
+    // restore mark
+    cbuf.insts()->set_mark(mark);
+  %}
+
+  enc_class preserve_SP %{
+    // preserve mark (it is put back once the single mov is emitted)
+    address mark = cbuf.insts()->mark();
+    debug_only(int off0 = cbuf.insts_size());
+    MacroAssembler _masm(&cbuf);
+    // FP is preserved across all calls, even compiled calls.
+    // Use it to preserve SP in places where the callee might change the SP.
+    __ mov(Rmh_SP_save, SP);
+    debug_only(int off1 = cbuf.insts_size());
+    assert(off1 - off0 == Assembler::InstructionSize, "correct size prediction");  // exactly one instruction
+    // restore mark
+    cbuf.insts()->set_mark(mark);
+  %}
+
+  enc_class restore_SP %{
+    MacroAssembler _masm(&cbuf);
+    __ mov(SP, Rmh_SP_save);  // inverse of preserve_SP: copy the saved value back into SP
+  %}
+
+  enc_class Java_Dynamic_Call (method meth) %{
+    MacroAssembler _masm(&cbuf);
+    Register R8_ic_reg = reg_to_register_object(Matcher::inline_cache_reg_encode());
+    assert(R8_ic_reg == Ricklass, "should be");
+    __ set_inst_mark();  // mark the start of the inline cache load; read back below via inst_mark()
+#ifdef AARCH64
+// TODO: see C1 LIR_Assembler::ic_call()
+    InlinedAddress oop_literal((address)Universe::non_oop_word());
+    int offset = __ offset();
+    int fixed_size = mov_oop_size * 4;  // expected byte size of the load sequence, checked by the assert below
+    if (VM_Version::prefer_moves_over_load_literal()) {
+      uintptr_t val = (uintptr_t)Universe::non_oop_word();
+      __ movz(R8_ic_reg, (val >>  0) & 0xffff,  0);  // build the 64-bit value 16 bits at a time
+      __ movk(R8_ic_reg, (val >> 16) & 0xffff, 16);
+      __ movk(R8_ic_reg, (val >> 32) & 0xffff, 32);
+      __ movk(R8_ic_reg, (val >> 48) & 0xffff, 48);
+    } else {
+      __ ldr_literal(R8_ic_reg, oop_literal);  // literal itself is bound after the call, see below
+    }
+    assert(__ offset() - offset == fixed_size, "bad mov_oop size");
+#else
+    __ movw(R8_ic_reg, ((unsigned int)Universe::non_oop_word()) & 0xffff);  // low 16 bits
+    __ movt(R8_ic_reg, ((unsigned int)Universe::non_oop_word()) >> 16);     // high 16 bits
+#endif
+    address  virtual_call_oop_addr = __ inst_mark();
+    // CALL to fixup routine.  Fixup routine uses ScopeDesc info to determine
+    // who we intended to call.
+    int method_index = resolved_method_index(cbuf);
+    __ relocate(virtual_call_Relocation::spec(virtual_call_oop_addr, method_index));
+    emit_call_reloc(cbuf, as_MachCall(), $meth, RelocationHolder::none);  // relocation was already emitted above
+#ifdef AARCH64
+    if (!VM_Version::prefer_moves_over_load_literal()) {
+      Label skip_literal;
+      __ b(skip_literal);  // jump over the inlined literal data
+      int off2 = __ offset();
+      __ bind_literal(oop_literal);
+      if (__ offset() - off2 == wordSize) {
+        // no padding, so insert nop for worst-case sizing
+        __ nop();
+      }
+      __ bind(skip_literal);
+    }
+#endif
+  %}
+
+  enc_class LdReplImmI(immI src, regD dst, iRegI tmp, int cnt, int wth) %{
+    // FIXME: load from constant table?
+    // Load a constant replicated "count" times with width "width" (in bytes)
+    int count = $cnt$$constant;
+    int width = $wth$$constant;
+    assert(count*width == 4, "sanity");  // the replicated pattern must fill exactly one 32-bit word
+    int val = $src$$constant;
+    if (width < 4) {
+      int bit_width = width * 8;
+      val &= (((int)1) << bit_width) - 1; // mask off sign bits
+      for (int i = 0; i < count - 1; i++) {
+        val |= (val << bit_width);  // widen the pattern by one more element per iteration
+      }
+    }
+    MacroAssembler _masm(&cbuf);
+
+    if (val == -1) {
+      __ mvn($tmp$$Register, 0);   // all-ones word
+    } else if (val == 0) {
+      __ mov($tmp$$Register, 0);   // all-zeros word
+    } else {
+      __ movw($tmp$$Register, val & 0xffff);             // low 16 bits
+      __ movt($tmp$$Register, (unsigned int)val >> 16);  // high 16 bits
+    }
+    __ fmdrr($dst$$FloatRegister, $tmp$$Register, $tmp$$Register);  // tmp supplies both source words of dst
+  %}
+
+  enc_class LdReplImmF(immF src, regD dst, iRegI tmp) %{
+    // Replicate float con 2 times and pack into vector (8 bytes) in regD.
+    float fval = $src$$constant;
+    int val = jint_cast(fval);  // raw bits of the float via the VM helper, not a type-punning pointer cast
+    MacroAssembler _masm(&cbuf);
+
+    if (val == -1) {
+      __ mvn($tmp$$Register, 0);   // all-ones bit pattern
+    } else if (val == 0) {
+      __ mov($tmp$$Register, 0);   // all-zeros bit pattern
+    } else {
+      __ movw($tmp$$Register, val & 0xffff);             // low 16 bits
+      __ movt($tmp$$Register, (unsigned int)val >> 16);  // high 16 bits
+    }
+    __ fmdrr($dst$$FloatRegister, $tmp$$Register, $tmp$$Register);  // tmp supplies both source words of dst
+  %}
+
+  enc_class enc_String_Compare(R0RegP str1, R1RegP str2, R2RegI cnt1, R3RegI cnt2, iRegI result, iRegI tmp1, iRegI tmp2) %{
+    Label Ldone, Lloop;
+    MacroAssembler _masm(&cbuf);
+
+    Register   str1_reg = $str1$$Register;
+    Register   str2_reg = $str2$$Register;
+    Register   cnt1_reg = $cnt1$$Register; // int
+    Register   cnt2_reg = $cnt2$$Register; // int
+    Register   tmp1_reg = $tmp1$$Register;
+    Register   tmp2_reg = $tmp2$$Register;
+    Register result_reg = $result$$Register;
+
+    assert_different_registers(str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp1_reg, tmp2_reg);  // result_reg is not in this list
+
+    // Compute the minimum of the string lengths (kept in cnt1_reg) and the
+    // difference of the string lengths (kept in tmp2_reg)
+
+    // See if the lengths are different, and calculate min in cnt1_reg.
+    // Stash diff in tmp2 in case we need it for a tie-breaker.
+    __ subs_32(tmp2_reg, cnt1_reg, cnt2_reg);
+#ifdef AARCH64
+    Label Lskip;
+    __ _lsl_w(cnt1_reg, cnt1_reg, exact_log2(sizeof(jchar))); // scale the limit
+    __ b(Lskip, mi);
+    __ _lsl_w(cnt1_reg, cnt2_reg, exact_log2(sizeof(jchar))); // scale the limit
+    __ bind(Lskip);
+#else
+    __ mov(cnt1_reg, AsmOperand(cnt1_reg, lsl, exact_log2(sizeof(jchar)))); // scale the limit
+    __ mov(cnt1_reg, AsmOperand(cnt2_reg, lsl, exact_log2(sizeof(jchar))), pl); // scale the limit
+#endif
+
+    // from here on cnt1_reg, cnt2_reg and tmp1_reg are reused under new names
+    // Note:  limit_reg holds the string length pre-scaled by 2
+    Register limit_reg = cnt1_reg;
+    Register  chr2_reg = cnt2_reg;
+    Register  chr1_reg = tmp1_reg;
+    // str{12} are the base pointers
+
+    // Is the minimum length zero?
+    __ cmp_32(limit_reg, 0);
+    if (result_reg != tmp2_reg) {
+      __ mov(result_reg, tmp2_reg, eq);
+    }
+    __ b(Ldone, eq);
+
+    // Load first characters
+    __ ldrh(chr1_reg, Address(str1_reg, 0));
+    __ ldrh(chr2_reg, Address(str2_reg, 0));
+
+    // Compare first characters
+    __ subs(chr1_reg, chr1_reg, chr2_reg);
+    if (result_reg != chr1_reg) {
+      __ mov(result_reg, chr1_reg, ne);
+    }
+    __ b(Ldone, ne);
+
+    {
+      // Check after comparing first character to see if strings are equivalent
+      // Check if the strings start at same location
+      __ cmp(str1_reg, str2_reg);
+      // Check if the length difference is zero
+      __ cond_cmp(tmp2_reg, 0, eq);
+      __ mov(result_reg, 0, eq); // result is zero
+      __ b(Ldone, eq);
+      // Strings might not be equal
+    }
+
+    __ subs(chr1_reg, limit_reg, 1 * sizeof(jchar));  // chr1_reg = limit minus one char; zero means nothing is left
+    if (result_reg != tmp2_reg) {
+      __ mov(result_reg, tmp2_reg, eq);
+    }
+    __ b(Ldone, eq);
+
+    // Shift str1_reg and str2_reg to the end of the arrays, negate limit
+    __ add(str1_reg, str1_reg, limit_reg);
+    __ add(str2_reg, str2_reg, limit_reg);
+    __ neg(limit_reg, chr1_reg);  // limit = -(limit-2)
+
+    // Compare the rest of the characters
+    __ bind(Lloop);
+    __ ldrh(chr1_reg, Address(str1_reg, limit_reg));
+    __ ldrh(chr2_reg, Address(str2_reg, limit_reg));
+    __ subs(chr1_reg, chr1_reg, chr2_reg);
+    if (result_reg != chr1_reg) {
+      __ mov(result_reg, chr1_reg, ne);
+    }
+    __ b(Ldone, ne);
+
+    __ adds(limit_reg, limit_reg, sizeof(jchar));  // step the negative index towards zero
+    __ b(Lloop, ne);
+
+    // If strings are equal up to min length, return the length difference.
+    if (result_reg != tmp2_reg) {
+      __ mov(result_reg, tmp2_reg);
+    }
+
+    // Every earlier exit already placed its value in result_reg before branching here.
+    __ bind(Ldone);
+  %}
+
+  enc_class enc_String_Equals(R0RegP str1, R1RegP str2, R2RegI cnt, iRegI result, iRegI tmp1, iRegI tmp2) %{
+    Label Lword_loop, Lpost_word, Lchar, Lchar_loop, Ldone, Lequal;
+    MacroAssembler _masm(&cbuf);
+
+    Register   str1_reg = $str1$$Register;
+    Register   str2_reg = $str2$$Register;
+    Register    cnt_reg = $cnt$$Register; // int
+    Register   tmp1_reg = $tmp1$$Register;
+    Register   tmp2_reg = $tmp2$$Register;
+    Register result_reg = $result$$Register;
+
+    assert_different_registers(str1_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, result_reg);
+
+    __ cmp(str1_reg, str2_reg); //same char[] ?
+    __ b(Lequal, eq);
+
+    __ cbz_32(cnt_reg, Lequal); // count == 0
+
+    // rename registers: cnt_reg becomes the byte limit, tmp1/tmp2 hold characters
+    Register limit_reg = cnt_reg;
+    Register  chr1_reg = tmp1_reg;
+    Register  chr2_reg = tmp2_reg;
+
+    __ logical_shift_left(limit_reg, limit_reg, exact_log2(sizeof(jchar)));  // char count -> byte count
+
+    //check for alignment and position the pointers to the ends
+    __ orr(chr1_reg, str1_reg, str2_reg);
+    __ tst(chr1_reg, 0x3);
+
+    // notZero means at least one not 4-byte aligned.
+    // We could optimize the case when both arrays are not aligned
+    // but it is not frequent case and it requires additional checks.
+    __ b(Lchar, ne);
+
+    // Compare char[] arrays aligned to 4 bytes.
+    __ char_arrays_equals(str1_reg, str2_reg, limit_reg, result_reg,
+                          chr1_reg, chr2_reg, Ldone);
+
+    __ b(Lequal); // equal
+
+    // char by char compare
+    __ bind(Lchar);
+    __ mov(result_reg, 0);  // value left in result_reg if a mismatch branches to Ldone
+    __ add(str1_reg, limit_reg, str1_reg);
+    __ add(str2_reg, limit_reg, str2_reg);
+    __ neg(limit_reg, limit_reg); //negate count
+
+    // Lchar_loop: walk the negative index up to zero, one char at a time
+    __ bind(Lchar_loop);
+    __ ldrh(chr1_reg, Address(str1_reg, limit_reg));
+    __ ldrh(chr2_reg, Address(str2_reg, limit_reg));
+    __ cmp(chr1_reg, chr2_reg);
+    __ b(Ldone, ne);
+    __ adds(limit_reg, limit_reg, sizeof(jchar));
+    __ b(Lchar_loop, ne);
+
+    __ bind(Lequal);
+    __ mov(result_reg, 1);  //equal
+
+    __ bind(Ldone);
+  %}
+
+  enc_class enc_Array_Equals(R0RegP ary1, R1RegP ary2, iRegI tmp1, iRegI tmp2, iRegI tmp3, iRegI result) %{
+    Label Lvector, Ldone, Lloop, Lequal;
+    MacroAssembler _masm(&cbuf);
+
+    Register   ary1_reg = $ary1$$Register;
+    Register   ary2_reg = $ary2$$Register;
+    Register   tmp1_reg = $tmp1$$Register;
+    Register   tmp2_reg = $tmp2$$Register;
+    Register   tmp3_reg = $tmp3$$Register;
+    Register result_reg = $result$$Register;
+
+    assert_different_registers(ary1_reg, ary2_reg, tmp1_reg, tmp2_reg, tmp3_reg, result_reg);
+
+    int length_offset  = arrayOopDesc::length_offset_in_bytes();
+    int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+    // return true if the same array, false if exactly one of them is null
+#ifdef AARCH64
+    __ cmp(ary1_reg, ary2_reg);
+    __ b(Lequal, eq);
+
+    __ mov(result_reg, 0);  // preset "not equal" for every early exit to Ldone below
+
+    __ cbz(ary1_reg, Ldone); // not equal
+
+    __ cbz(ary2_reg, Ldone); // not equal
+#else
+    __ teq(ary1_reg, ary2_reg);
+    __ mov(result_reg, 1, eq);
+    __ b(Ldone, eq); // equal
+
+    __ tst(ary1_reg, ary1_reg);
+    __ mov(result_reg, 0, eq);
+    __ b(Ldone, eq);    // not equal
+
+    __ tst(ary2_reg, ary2_reg);
+    __ mov(result_reg, 0, eq);
+    __ b(Ldone, eq);    // not equal
+#endif
+
+    //load the lengths of arrays
+    __ ldr_s32(tmp1_reg, Address(ary1_reg, length_offset)); // int
+    __ ldr_s32(tmp2_reg, Address(ary2_reg, length_offset)); // int
+
+    // return false if the two arrays are not equal length
+#ifdef AARCH64
+    __ cmp_w(tmp1_reg, tmp2_reg);
+    __ b(Ldone, ne);    // not equal
+
+    __ cbz_w(tmp1_reg, Lequal); // zero-length arrays are equal
+#else
+    __ teq_32(tmp1_reg, tmp2_reg);
+    __ mov(result_reg, 0, ne);
+    __ b(Ldone, ne);    // not equal
+
+    __ tst(tmp1_reg, tmp1_reg);
+    __ mov(result_reg, 1, eq);
+    __ b(Ldone, eq);    // zero-length arrays are equal
+#endif
+
+    // advance both array pointers by base_offset (the T_CHAR array base offset)
+    __ add(ary1_reg, ary1_reg, base_offset);
+    __ add(ary2_reg, ary2_reg, base_offset);
+
+    // renaming registers
+    Register chr1_reg  =  tmp3_reg;   // for characters in ary1
+    Register chr2_reg  =  tmp2_reg;   // for characters in ary2
+    Register limit_reg =  tmp1_reg;   // length
+
+    // set byte count (element count scaled by sizeof(jchar))
+    __ logical_shift_left_32(limit_reg, limit_reg, exact_log2(sizeof(jchar)));
+
+    // Compare char[] arrays aligned to 4 bytes.
+    __ char_arrays_equals(ary1_reg, ary2_reg, limit_reg, result_reg,
+                          chr1_reg, chr2_reg, Ldone);
+    __ bind(Lequal);
+    __ mov(result_reg, 1);  //equal
+
+    __ bind(Ldone);
+    %}
+%}
+
+//----------FRAME--------------------------------------------------------------
+// Definition of frame structure and management information.
+//
+//  S T A C K   L A Y O U T    Allocators stack-slot number
+//                             |   (to get allocators register number
+//  G  Owned by    |        |  v    add VMRegImpl::stack0)
+//  r   CALLER     |        |
+//  o     |        +--------+      pad to even-align allocators stack-slot
+//  w     V        |  pad0  |        numbers; owned by CALLER
+//  t   -----------+--------+----> Matcher::_in_arg_limit, unaligned
+//  h     ^        |   in   |  5
+//        |        |  args  |  4   Holes in incoming args owned by SELF
+//  |     |        |        |  3
+//  |     |        +--------+
+//  V     |        | old out|      Empty on Intel, window on Sparc
+//        |    old |preserve|      Must be even aligned.
+//        |     SP-+--------+----> Matcher::_old_SP, 8 (or 16 in LP64)-byte aligned
+//        |        |   in   |  3   area for Intel ret address
+//     Owned by    |preserve|      Empty on Sparc.
+//       SELF      +--------+
+//        |        |  pad2  |  2   pad to align old SP
+//        |        +--------+  1
+//        |        | locks  |  0
+//        |        +--------+----> VMRegImpl::stack0, 8 (or 16 in LP64)-byte aligned
+//        |        |  pad1  | 11   pad to align new SP
+//        |        +--------+
+//        |        |        | 10
+//        |        | spills |  9   spills
+//        V        |        |  8   (pad0 slot for callee)
+//      -----------+--------+----> Matcher::_out_arg_limit, unaligned
+//        ^        |  out   |  7
+//        |        |  args  |  6   Holes in outgoing args owned by CALLEE
+//     Owned by    +--------+
+//      CALLEE     | new out|  6   Empty on Intel, window on Sparc
+//        |    new |preserve|      Must be even-aligned.
+//        |     SP-+--------+----> Matcher::_new_SP, even aligned
+//        |        |        |
+//
+// Note 1: Only region 8-11 is determined by the allocator.  Region 0-5 is
+//         known from SELF's arguments and the Java calling convention.
+//         Region 6-7 is determined per call site.
+// Note 2: If the calling convention leaves holes in the incoming argument
+//         area, those holes are owned by SELF.  Holes in the outgoing area
+//         are owned by the CALLEE.  Holes should not be necessary in the
+//         incoming area, as the Java calling convention is completely under
+//         the control of the AD file.  Doubles can be sorted and packed to
+//         avoid holes.  Holes in the outgoing arguments may be necessary for
+//         varargs C calling conventions.
+// Note 3: Region 0-3 is even aligned, with pad2 as needed.  Region 3-5 is
+//         even aligned with pad0 as needed.
+//         Region 6 is even aligned.  Region 6-7 is NOT even aligned;
+//         region 6-11 is even aligned; it may be padded out more so that
+//         the region from SP to FP meets the minimum stack alignment.
+
+frame %{
+  // What direction does stack grow in (assumed to be same for native & Java)
+  stack_direction(TOWARDS_LOW);
+
+  // These two registers define part of the calling convention
+  // between compiled code and the interpreter.
+  inline_cache_reg(R_Ricklass);          // Inline Cache Register or Method* for I2C
+  interpreter_method_oop_reg(R_Rmethod); // Method Oop Register when calling interpreter
+
+  // Optional: name the operand used by cisc-spilling to access [stack_pointer + offset]
+  cisc_spilling_operand_name(indOffset);
+
+  // Number of stack slots consumed by a Monitor enter
+  sync_stack_slots(1 * VMRegImpl::slots_per_word);
+
+  // Compiled code's Frame Pointer
+#ifdef AARCH64
+  frame_pointer(R_SP);
+#else
+  frame_pointer(R_R13);  // R13 is the stack pointer on 32-bit ARM
+#endif
+
+  // Stack alignment requirement
+  stack_alignment(StackAlignmentInBytes);
+  //  LP64: Alignment size in bytes (128-bit -> 16 bytes)
+  // !LP64: Alignment size in bytes (64-bit  ->  8 bytes)
+
+  // Number of stack slots between incoming argument block and the start of
+  // a new frame.  The PROLOG must add this many slots to the stack.  The
+  // EPILOG must remove this many slots.
+  // FP + LR
+  in_preserve_stack_slots(2 * VMRegImpl::slots_per_word);
+
+  // Number of outgoing stack slots killed above the out_preserve_stack_slots
+  // for calls to C.  Supports the var-args backing area for register parms.
+  // ADLC doesn't support parsing expressions, so I folded the math by hand.
+  varargs_C_out_slots_killed( 0);
+
+  // The after-PROLOG location of the return address.  Location of
+  // return address specifies a type (REG or STACK) and a number
+  // representing the register number (i.e. - use a register name) or
+  // stack slot.
+  // Ret Addr is on stack in slot 0 if no locks or verification or alignment.
+  // Otherwise, it is above the locks and verification slot and alignment word
+  return_addr(STACK - 1*VMRegImpl::slots_per_word +
+              round_to((Compile::current()->in_preserve_stack_slots() +
+                        Compile::current()->fixed_slots()),
+                       stack_alignment_in_slots()));
+
+  // Body of function which returns an OptoRegs array locating
+  // arguments either in registers or in stack slots for calling
+  // java
+  calling_convention %{
+    (void) SharedRuntime::java_calling_convention(sig_bt, regs, length, is_outgoing);  // return value deliberately discarded
+
+  %}
+
+  // Body of function which returns an OptoRegs array locating
+  // arguments either in registers or in stack slots for calling
+  // C.
+  c_calling_convention %{
+    // This is obviously always outgoing
+    (void) SharedRuntime::c_calling_convention(sig_bt, regs, /*regs2=*/NULL, length);
+  %}
+
+  // Location of compiled Java return values.  Same as C
+  return_value %{
+    return c2::return_value(ideal_reg);
+  %}
+
+%}
+
+//----------ATTRIBUTES---------------------------------------------------------
+//----------Instruction Attributes---------------------------------------------
+ins_attrib ins_cost(DEFAULT_COST); // Required cost attribute
+ins_attrib ins_size(32);           // Required size attribute (in bits)
+ins_attrib ins_short_branch(0);    // Required flag: is this instruction a
+                                   // non-matching short branch variant of some
+                                   // long branch?
+
+//----------OPERANDS-----------------------------------------------------------
+// Operand definitions must precede instruction definitions for correct parsing
+// in the ADLC because operands constitute user defined types which are used in
+// instruction definitions.
+
+//----------Simple Operands----------------------------------------------------
+// Immediate Operands
+// Integer Immediate: 32-bit (no predicate, so every ConI matches)
+operand immI() %{
+  match(ConI);
+
+  op_cost(0);
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);  // operand is queried for its constant value
+%}
+
+// Integer Immediate: 8-bit unsigned - for VMOV
+operand immU8() %{
+  predicate(0 <= n->get_int() && (n->get_int() <= 255));  // accepts 0..255 only
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: 16-bit unsigned (0..65535), only when VM_Version::supports_movw()
+operand immI16() %{
+  predicate((n->get_int() >> 16) == 0 && VM_Version::supports_movw());  // no bits set above bit 15
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+#ifndef AARCH64
+// Integer Immediate: offset for half and double word loads and stores
+operand immIHD() %{
+  predicate(is_memoryHD(n->get_int()));  // validity of the offset is decided by is_memoryHD()
+  match(ConI);
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: offset for fp loads and stores
+operand immIFP() %{
+  predicate(is_memoryfp(n->get_int()) && ((n->get_int() & 3) == 0));  // is_memoryfp() must accept it and it must be a multiple of 4
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+#endif
+
+// Valid scale values for addressing modes and shifts: 5-bit unsigned
+operand immU5() %{
+  predicate(0 <= n->get_int() && (n->get_int() <= 31));  // accepts 0..31 only
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the upper half of the 6-bit unsigned range, i.e. 32..63
+operand immU6Big() %{
+  predicate(n->get_int() >= 32 && n->get_int() <= 63);  // same value range as immI_32_63
+  match(ConI);
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 0
+operand immI0() %{
+  predicate(n->get_int() == 0);  // exact match on 0
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 1
+operand immI_1() %{
+  predicate(n->get_int() == 1);  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 2
+operand immI_2() %{
+  predicate(n->get_int() == 2);  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 3
+operand immI_3() %{
+  predicate(n->get_int() == 3);  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 4
+operand immI_4() %{
+  predicate(n->get_int() == 4);  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 8
+operand immI_8() %{
+  predicate(n->get_int() == 8);  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: non-negative (any value that fits in 31 unsigned bits)
+operand immU31()
+%{
+  predicate(n->get_int() >= 0);  // rejects every negative int
+  match(ConI);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the values 32-63 (both ends inclusive)
+operand immI_32_63() %{
+  predicate(n->get_int() >= 32 && n->get_int() <= 63);  // same value range as immU6Big
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Immediates for special shifts (sign extend)
+
+// Integer Immediate: the value 16 (one of the special shift amounts for sign extension)
+operand immI_16() %{
+  predicate(n->get_int() == 16);  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 24 (one of the special shift amounts for sign extension)
+operand immI_24() %{
+  predicate(n->get_int() == 24);  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 255 (0xFF, all low 8 bits set)
+operand immI_255() %{
+  predicate( n->get_int() == 255 );  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: the value 65535 (0xFFFF, all low 16 bits set)
+operand immI_65535() %{
+  predicate(n->get_int() == 65535);  // exact match; no other int is accepted
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediates for arithmetic instructions
+
+operand aimmI() %{
+  predicate(is_aimm(n->get_int()));  // the constant itself must be accepted by is_aimm()
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // operand is queried for its constant value
+%}
+
+operand aimmIneg() %{
+  predicate(is_aimm(-n->get_int()));  // the NEGATED constant must be accepted by is_aimm()
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // still yields the original (un-negated) constant
+%}
+
+operand aimmU31() %{
+  predicate((0 <= n->get_int()) && is_aimm(n->get_int()));  // non-negative AND accepted by is_aimm()
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // operand is queried for its constant value
+%}
+
+// Integer Immediates for logical instructions
+
+operand limmI() %{
+  predicate(is_limmI(n->get_int()));  // the constant itself must be accepted by is_limmI()
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // operand is queried for its constant value
+%}
+
+operand limmIlow8() %{
+  predicate(is_limmI_low(n->get_int(), 8));  // NOTE(review): presumably restricts the check to the low 8 bits - confirm against is_limmI_low()
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // operand is queried for its constant value
+%}
+
+operand limmU31() %{
+  predicate(0 <= n->get_int() && is_limmI(n->get_int()));  // non-negative AND accepted by is_limmI()
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // operand is queried for its constant value
+%}
+
+operand limmIn() %{
+  predicate(is_limmI(~n->get_int()));  // the bitwise COMPLEMENT of the constant must be accepted by is_limmI()
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // still yields the original (un-complemented) constant
+%}
+
+#ifdef AARCH64
+// Long Immediate: for logical instruction (AARCH64 builds only)
+operand limmL() %{
+  predicate(is_limmL(n->get_long()));  // the constant itself must be accepted by is_limmL()
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand limmLn() %{
+  predicate(is_limmL(~n->get_long()));  // the bitwise COMPLEMENT of the constant must be accepted by is_limmL()
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // still yields the original (un-complemented) constant
+%}
+
+// Long Immediate: for arithmetic instruction (AARCH64 builds only)
+operand aimmL() %{
+  predicate(is_aimm(n->get_long()));  // the constant itself must be accepted by is_aimm()
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand aimmLneg() %{
+  predicate(is_aimm(-n->get_long()));  // the NEGATED constant must be accepted by is_aimm()
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // still yields the original (un-negated) constant
+%}
+#endif // AARCH64
+
+// Long Immediate: the value 0xFF (255)
+operand immL_FF() %{
+  predicate( n->get_long() == 0xFFL );  // exact match; no other long is accepted
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Long Immediate: the value 0xFFFF (65535)
+operand immL_FFFF() %{
+  predicate( n->get_long() == 0xFFFFL );  // exact match; no other long is accepted
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Pointer Immediate: 32 or 64-bit (no predicate, so every ConP matches)
+operand immP() %{
+  match(ConP);
+
+  op_cost(5);  // non-zero cost, unlike the predicated pointer immediate immP0
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immP0() %{
+  predicate(n->get_ptr() == 0);  // only the null pointer constant is accepted
+  match(ConP);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);  // operand is queried for its constant value
+%}
+
+operand immP_poll() %{
+  predicate(n->get_ptr() != 0 && n->get_ptr() == (intptr_t)os::get_polling_page());  // non-null and equal to the polling page address
+  match(ConP);
+
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Narrow Pointer Immediate (no predicate, so every ConN matches)
+operand immN()
+%{
+  match(ConN);
+
+  op_cost(10);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immNKlass()
+%{
+  match(ConNKlass);  // no predicate, so every ConNKlass matches
+
+  op_cost(10);  // same cost as immN
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// NULL Pointer Immediate (narrow form: a ConN whose narrow constant is 0)
+operand immN0()
+%{
+  predicate(n->get_narrowcon() == 0);  // exact match on 0
+  match(ConN);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immL() %{
+  match(ConL);  // no predicate, so every ConL matches
+  op_cost(40);  // non-zero cost, unlike the predicated long immediates
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immL0() %{
+  predicate(n->get_long() == 0L);  // only the long value 0 is accepted
+  match(ConL);
+  op_cost(0);
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Long Immediate: 16-bit unsigned (0..65535), only when VM_Version::supports_movw()
+operand immL16() %{
+  predicate(n->get_long() >= 0 && n->get_long() < (1<<16)  && VM_Version::supports_movw());  // 0 <= value < 65536
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Long Immediate: low 32-bit mask
+operand immL_32bits() %{
+  predicate(n->get_long() == 0xFFFFFFFFL);
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Double Immediate
+operand immD() %{
+  match(ConD);
+
+  op_cost(40);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Double Immediate: +0.0d.
+operand immD0() %{
+  predicate(jlong_cast(n->getd()) == 0); // presumably a raw-bits compare, so -0.0d is excluded -- TODO confirm jlong_cast semantics
+
+  match(ConD);
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand imm8D() %{
+  predicate(Assembler::double_num(n->getd()).can_be_imm8());
+  match(ConD);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Float Immediate
+operand immF() %{
+  match(ConF);
+
+  op_cost(20);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Float Immediate: +0.0f
+operand immF0() %{
+  predicate(jint_cast(n->getf()) == 0); // presumably a raw-bits compare, so -0.0f is excluded -- TODO confirm jint_cast semantics
+  match(ConF);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Float Immediate: encoded as 8 bits
+operand imm8F() %{
+  predicate(Assembler::float_num(n->getf()).can_be_imm8());
+  match(ConF);
+
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Register Operands
+// Integer Register
+operand iRegI() %{
+  constraint(ALLOC_IN_RC(int_reg));
+  match(RegI);
+  match(R0RegI);
+  match(R1RegI);
+  match(R2RegI);
+  match(R3RegI);
+#ifdef AARCH64
+  match(ZRRegI);
+#else
+  match(R12RegI);
+#endif
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+// Pointer Register
+operand iRegP() %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(RegP);
+  match(R0RegP);
+  match(R1RegP);
+  match(R2RegP);
+  match(RExceptionRegP);
+  match(R8RegP);
+  match(R9RegP);
+  match(RthreadRegP); // FIXME: move to sp_ptr_RegP?
+  match(R12RegP);
+  match(LRRegP);
+
+  match(sp_ptr_RegP);
+  match(store_ptr_RegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+// GPRs + Rthread + SP
+operand sp_ptr_RegP() %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(RegP);
+  match(iRegP);
+  match(SPRegP); // FIXME: check cost
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+#ifdef AARCH64
+// Like sp_ptr_reg, but exclude regs (Aarch64 SP) that can't be
+// stored directly.  Includes ZR, so can't be used as a destination.
+operand store_ptr_RegP() %{
+  constraint(ALLOC_IN_RC(store_ptr_reg));
+  match(RegP);
+  match(iRegP);
+  match(ZRRegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand store_RegI() %{
+  constraint(ALLOC_IN_RC(store_reg));
+  match(RegI);
+  match(iRegI);
+  match(ZRRegI);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand store_RegL() %{
+  constraint(ALLOC_IN_RC(store_ptr_reg)); // NOTE(review): pointer store class reused for longs (store_RegI/store_RegN use store_reg) -- confirm intended
+  match(RegL);
+  match(iRegL);
+  match(ZRRegL);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand store_RegN() %{
+  constraint(ALLOC_IN_RC(store_reg));
+  match(RegN);
+  match(iRegN);
+  match(ZRRegN);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+#endif
+
+operand R0RegP() %{
+  constraint(ALLOC_IN_RC(R0_regP));
+  match(iRegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand R1RegP() %{
+  constraint(ALLOC_IN_RC(R1_regP));
+  match(iRegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand R2RegP() %{
+  constraint(ALLOC_IN_RC(R2_regP));
+  match(iRegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand RExceptionRegP() %{
+  constraint(ALLOC_IN_RC(Rexception_regP));
+  match(iRegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand RthreadRegP() %{
+  constraint(ALLOC_IN_RC(Rthread_regP));
+  match(iRegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand IPRegP() %{
+  constraint(ALLOC_IN_RC(IP_regP));
+  match(iRegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand LRRegP() %{
+  constraint(ALLOC_IN_RC(LR_regP));
+  match(iRegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand R0RegI() %{
+  constraint(ALLOC_IN_RC(R0_regI));
+  match(iRegI);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand R1RegI() %{
+  constraint(ALLOC_IN_RC(R1_regI));
+  match(iRegI);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand R2RegI() %{
+  constraint(ALLOC_IN_RC(R2_regI));
+  match(iRegI);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand R3RegI() %{
+  constraint(ALLOC_IN_RC(R3_regI));
+  match(iRegI);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+#ifndef AARCH64
+operand R12RegI() %{
+  constraint(ALLOC_IN_RC(R12_regI));
+  match(iRegI);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+#endif
+
+// Long Register
+operand iRegL() %{
+  constraint(ALLOC_IN_RC(long_reg));
+  match(RegL);
+#ifdef AARCH64
+  match(iRegLd);
+#else
+  match(R0R1RegL);
+  match(R2R3RegL);
+#endif
+//match(iRegLex);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand iRegLd() %{
+  constraint(ALLOC_IN_RC(long_reg_align));
+  match(iRegL); // FIXME: allows unaligned R11/R12?
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+#ifndef AARCH64
+// first long arg, or return value
+operand R0R1RegL() %{
+  constraint(ALLOC_IN_RC(R0R1_regL));
+  match(iRegL);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand R2R3RegL() %{
+  constraint(ALLOC_IN_RC(R2R3_regL));
+  match(iRegL);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+#endif
+
+// Condition Code Flag Register
+operand flagsReg() %{
+  constraint(ALLOC_IN_RC(int_flags));
+  match(RegFlags);
+
+  format %{ "apsr" %}
+  interface(REG_INTER);
+%}
+
+// Result of compare to 0 (TST)
+operand flagsReg_EQNELTGE() %{
+  constraint(ALLOC_IN_RC(int_flags));
+  match(RegFlags);
+
+  format %{ "apsr_EQNELTGE" %}
+  interface(REG_INTER);
+%}
+
+// Condition Code Register, unsigned comparisons.
+operand flagsRegU() %{
+  constraint(ALLOC_IN_RC(int_flags));
+  match(RegFlags);
+#ifdef TODO
+  match(RegFlagsP);
+#endif
+
+  format %{ "apsr_U" %}
+  interface(REG_INTER);
+%}
+
+// Condition Code Register, pointer comparisons.
+operand flagsRegP() %{
+  constraint(ALLOC_IN_RC(int_flags));
+  match(RegFlags);
+
+  format %{ "apsr_P" %}
+  interface(REG_INTER);
+%}
+
+// Condition Code Register, long comparisons.
+#ifndef AARCH64
+operand flagsRegL_LTGE() %{
+  constraint(ALLOC_IN_RC(int_flags));
+  match(RegFlags);
+
+  format %{ "apsr_L_LTGE" %}
+  interface(REG_INTER);
+%}
+
+operand flagsRegL_EQNE() %{
+  constraint(ALLOC_IN_RC(int_flags));
+  match(RegFlags);
+
+  format %{ "apsr_L_EQNE" %}
+  interface(REG_INTER);
+%}
+
+operand flagsRegL_LEGT() %{
+  constraint(ALLOC_IN_RC(int_flags));
+  match(RegFlags);
+
+  format %{ "apsr_L_LEGT" %}
+  interface(REG_INTER);
+%}
+#endif
+
+// Condition Code Register, floating comparisons, unordered same as "less".
+operand flagsRegF() %{
+  constraint(ALLOC_IN_RC(float_flags));
+  match(RegFlags);
+
+  format %{ "fpscr_F" %}
+  interface(REG_INTER);
+%}
+
+// Vectors
+operand vecD() %{
+  constraint(ALLOC_IN_RC(actual_dflt_reg));
+  match(VecD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg));
+  match(VecX);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand regD() %{
+  constraint(ALLOC_IN_RC(actual_dflt_reg));
+  match(RegD);
+  match(regD_low);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand regF() %{
+  constraint(ALLOC_IN_RC(sflt_reg));
+  match(RegF);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand regD_low() %{
+  constraint(ALLOC_IN_RC(dflt_low_reg));
+  match(RegD);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+// Special Registers
+
+// Inline Cache and Method Registers
+operand inline_cache_regP(iRegP reg) %{
+  constraint(ALLOC_IN_RC(Ricklass_regP));
+  match(reg);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand interpreter_method_oop_regP(iRegP reg) %{
+  constraint(ALLOC_IN_RC(Rmethod_regP));
+  match(reg);
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+
+//----------Complex Operands---------------------------------------------------
+// Indirect Memory Reference
+operand indirect(sp_ptr_RegP reg) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(reg);
+
+  op_cost(100);
+  format %{ "[$reg]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp(0x0);
+  %}
+%}
+
+#ifdef AARCH64
+// Indirect with scaled*1 uimm12 offset
+operand indOffsetU12ScaleB(sp_ptr_RegP reg, immUL12 offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // never taken: this operand sits inside an enclosing #ifdef AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+// Indirect with scaled*2 uimm12 offset
+operand indOffsetU12ScaleS(sp_ptr_RegP reg, immUL12x2 offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // never taken: this operand sits inside an enclosing #ifdef AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+// Indirect with scaled*4 uimm12 offset
+operand indOffsetU12ScaleI(sp_ptr_RegP reg, immUL12x4 offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // never taken: this operand sits inside an enclosing #ifdef AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+// Indirect with scaled*8 uimm12 offset
+operand indOffsetU12ScaleL(sp_ptr_RegP reg, immUL12x8 offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // never taken: this operand sits inside an enclosing #ifdef AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+// Indirect with scaled*16 uimm12 offset
+operand indOffsetU12ScaleQ(sp_ptr_RegP reg, immUL12x16 offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // never taken: this operand sits inside an enclosing #ifdef AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+#else // ! AARCH64
+
+// Indirect with Offset in ]-4096, 4096[
+operand indOffset12(sp_ptr_RegP reg, immI12 offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // always taken: this operand sits inside the enclosing !AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+// Indirect with offset for float load/store
+operand indOffsetFP(sp_ptr_RegP reg, immIFP offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // always taken: this operand sits inside the enclosing !AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+// Indirect with Offset for half and double words
+operand indOffsetHD(sp_ptr_RegP reg, immIHD offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // always taken: this operand sits inside the enclosing !AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+// Indirect with Offset and Offset+4 in ]-1024, 1024[
+operand indOffsetFPx2(sp_ptr_RegP reg, immX10x2 offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // always taken: this operand sits inside the enclosing !AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+
+// Indirect with Offset and Offset+4 in ]-4096, 4096[
+operand indOffset12x2(sp_ptr_RegP reg, immI12x2 offset) %{
+  constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(AddP reg offset);
+
+  op_cost(100);
+  format %{ "[$reg + $offset]" %}
+  interface(MEMORY_INTER) %{
+    base($reg);
+#ifdef AARCH64
+    index(0xff); // 0xff => no index
+#else // always taken: this operand sits inside the enclosing !AARCH64 region
+    index(0xf); // PC => no index
+#endif
+    scale(0x0);
+    disp($offset);
+  %}
+%}
+#endif // !AARCH64
+
+// Indirect with Register Index
+operand indIndex(iRegP addr, iRegX index) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr index);
+
+  op_cost(100);
+  format %{ "[$addr + $index]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale(0x0);
+    disp(0x0);
+  %}
+%}
+
+#ifdef AARCH64
+// Indirect Memory Times Scale Plus Index Register
+operand indIndexScaleS(iRegP addr, iRegX index, immI_1 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX index scale));
+
+  op_cost(100);
+  format %{"[$addr + $index << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x0);
+  %}
+%}
+
+// Indirect Memory Times Scale Plus 32-bit Index Register
+operand indIndexIScaleS(iRegP addr, iRegI index, immI_1 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX (ConvI2L index) scale));
+
+  op_cost(100);
+  format %{"[$addr + $index.w << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x7fffffff); // sxtw: presumably a marker telling the encoder to sign-extend the 32-bit index -- TODO confirm
+  %}
+%}
+
+// Indirect Memory Times Scale Plus Index Register
+operand indIndexScaleI(iRegP addr, iRegX index, immI_2 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX index scale));
+
+  op_cost(100);
+  format %{"[$addr + $index << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x0);
+  %}
+%}
+
+// Indirect Memory Times Scale Plus 32-bit Index Register
+operand indIndexIScaleI(iRegP addr, iRegI index, immI_2 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX (ConvI2L index) scale));
+
+  op_cost(100);
+  format %{"[$addr + $index.w << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x7fffffff); // sxtw: presumably a marker telling the encoder to sign-extend the 32-bit index -- TODO confirm
+  %}
+%}
+
+// Indirect Memory Times Scale Plus Index Register
+operand indIndexScaleL(iRegP addr, iRegX index, immI_3 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX index scale));
+
+  op_cost(100);
+  format %{"[$addr + $index << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x0);
+  %}
+%}
+
+// Indirect Memory Times Scale Plus 32-bit Index Register
+operand indIndexIScaleL(iRegP addr, iRegI index, immI_3 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX (ConvI2L index) scale));
+
+  op_cost(100);
+  format %{"[$addr + $index.w << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x7fffffff); // sxtw: presumably a marker telling the encoder to sign-extend the 32-bit index -- TODO confirm
+  %}
+%}
+
+// Indirect Memory Times Scale Plus Index Register
+operand indIndexScaleQ(iRegP addr, iRegX index, immI_4 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX index scale));
+
+  op_cost(100);
+  format %{"[$addr + $index << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x0);
+  %}
+%}
+
+// Indirect Memory Times Scale Plus 32-bit Index Register
+operand indIndexIScaleQ(iRegP addr, iRegI index, immI_4 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX (ConvI2L index) scale));
+
+  op_cost(100);
+  format %{"[$addr + $index.w << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x7fffffff); // sxtw: presumably a marker telling the encoder to sign-extend the 32-bit index -- TODO confirm
+  %}
+%}
+#else
+// Indirect Memory Times Scale Plus Index Register
+operand indIndexScale(iRegP addr, iRegX index, immU5 scale) %{
+  constraint(ALLOC_IN_RC(ptr_reg));
+  match(AddP addr (LShiftX index scale));
+
+  op_cost(100);
+  format %{"[$addr + $index << $scale]" %}
+  interface(MEMORY_INTER) %{
+    base($addr);
+    index($index);
+    scale($scale);
+    disp(0x0);
+  %}
+%}
+#endif
+
+// Operands for expressing Control Flow
+// NOTE:  Label is a predefined operand which should not be redefined in
+//        the AD file.  It is generically handled within the ADLC.
+
+//----------Conditional Branch Operands----------------------------------------
+// Comparison Op  - This is the operation of the comparison, and is limited to
+//                  the following set of codes:
+//                  L (<), LE (<=), G (>), GE (>=), E (==), NE (!=)
+//
+// Other attributes of the comparison, such as unsignedness, are specified
+// by the comparison instruction that sets a condition code flags register.
+// That result is represented by a flags operand whose subtype is appropriate
+// to the unsignedness (etc.) of the comparison.
+//
+// Later, the instruction which matches both the Comparison Op (a Bool) and
+// the flags (produced by the Cmp) specifies the coding of the comparison op
+// by matching a specific subtype of Bool operand below, such as cmpOpU.
+
+operand cmpOp() %{
+  match(Bool);
+
+  format %{ "" %}
+  interface(COND_INTER) %{
+    equal(0x0); // EQ
+    not_equal(0x1); // NE
+    less(0xb); // LT
+    greater_equal(0xa); // GE
+    less_equal(0xd); // LE
+    greater(0xc); // GT
+    overflow(0x0); // unsupported/unimplemented
+    no_overflow(0x0); // unsupported/unimplemented
+  %}
+%}
+
+// integer comparison with 0, signed
+operand cmpOp0() %{
+  match(Bool);
+
+  format %{ "" %}
+  interface(COND_INTER) %{
+    equal(0x0); // EQ
+    not_equal(0x1); // NE
+    less(0x4); // MI (negative flag set)
+    greater_equal(0x5); // PL (negative flag clear)
+    less_equal(0xd); // unsupported
+    greater(0xc); // unsupported
+    overflow(0x0); // unsupported/unimplemented
+    no_overflow(0x0); // unsupported/unimplemented
+  %}
+%}
+
+// Comparison Op, unsigned
+operand cmpOpU() %{
+  match(Bool);
+
+  format %{ "u" %}
+  interface(COND_INTER) %{
+    equal(0x0); // EQ
+    not_equal(0x1); // NE
+    less(0x3); // LO (CC), unsigned lower
+    greater_equal(0x2); // HS (CS), unsigned higher or same
+    less_equal(0x9); // LS, unsigned lower or same
+    greater(0x8); // HI, unsigned higher
+    overflow(0x0); // unsupported/unimplemented
+    no_overflow(0x0); // unsupported/unimplemented
+  %}
+%}
+
+// Comparison Op, pointer (same as unsigned)
+operand cmpOpP() %{
+  match(Bool);
+
+  format %{ "p" %}
+  interface(COND_INTER) %{
+    equal(0x0); // EQ
+    not_equal(0x1); // NE
+    less(0x3); // LO (CC), unsigned lower
+    greater_equal(0x2); // HS (CS), unsigned higher or same
+    less_equal(0x9); // LS, unsigned lower or same
+    greater(0x8); // HI, unsigned higher
+    overflow(0x0); // unsupported/unimplemented
+    no_overflow(0x0); // unsupported/unimplemented
+  %}
+%}
+
+operand cmpOpL() %{
+  match(Bool);
+
+  format %{ "L" %}
+  interface(COND_INTER) %{
+    equal(0x0); // EQ
+    not_equal(0x1); // NE
+    less(0xb); // LT
+    greater_equal(0xa); // GE
+    less_equal(0xd); // LE
+    greater(0xc); // GT
+    overflow(0x0); // unsupported/unimplemented
+    no_overflow(0x0); // unsupported/unimplemented
+  %}
+%}
+
+operand cmpOpL_commute() %{
+  match(Bool);
+
+  format %{ "L" %}
+  interface(COND_INTER) %{
+    equal(0x0); // EQ
+    not_equal(0x1); // NE
+    less(0xc); // GT: sense mirrored relative to cmpOpL
+    greater_equal(0xd); // LE: sense mirrored relative to cmpOpL
+    less_equal(0xa); // GE: sense mirrored relative to cmpOpL
+    greater(0xb); // LT: sense mirrored relative to cmpOpL
+    overflow(0x0); // unsupported/unimplemented
+    no_overflow(0x0); // unsupported/unimplemented
+  %}
+%}
+
+//----------OPERAND CLASSES----------------------------------------------------
+// Operand Classes are groups of operands that are used to simplify
+// instruction definitions by not requiring the AD writer to specify separate
+// instructions for every form of operand when the instruction accepts
+// multiple operand types with the same basic encoding and format.  The classic
+// case of this is memory operands.
+#ifdef AARCH64
+opclass memoryB(indirect, indIndex, indOffsetU12ScaleB);
+opclass memoryS(indirect, indIndex, indIndexScaleS, indIndexIScaleS, indOffsetU12ScaleS);
+opclass memoryI(indirect, indIndex, indIndexScaleI, indIndexIScaleI, indOffsetU12ScaleI);
+opclass memoryL(indirect, indIndex, indIndexScaleL, indIndexIScaleL, indOffsetU12ScaleL);
+opclass memoryP(indirect, indIndex, indIndexScaleL, indIndexIScaleL, indOffsetU12ScaleL);
+opclass memoryQ(indirect, indIndex, indIndexScaleQ, indIndexIScaleQ, indOffsetU12ScaleQ);
+opclass memoryF(indirect, indIndex, indIndexScaleI, indIndexIScaleI, indOffsetU12ScaleI);
+opclass memoryD(indirect, indIndex, indIndexScaleL, indIndexIScaleL, indOffsetU12ScaleL);
+
+opclass memoryScaledS(indIndexScaleS, indIndexIScaleS);
+opclass memoryScaledI(indIndexScaleI, indIndexIScaleI);
+opclass memoryScaledL(indIndexScaleL, indIndexIScaleL);
+opclass memoryScaledP(indIndexScaleL, indIndexIScaleL);
+opclass memoryScaledQ(indIndexScaleQ, indIndexIScaleQ);
+opclass memoryScaledF(indIndexScaleI, indIndexIScaleI);
+opclass memoryScaledD(indIndexScaleL, indIndexIScaleL);
+// when ldrex/strex is used:
+opclass memoryex ( indirect );
+opclass indIndexMemory( indIndex );
+opclass memoryvld ( indirect /* , write back mode not implemented */ );
+
+#else
+
+opclass memoryI ( indirect, indOffset12, indIndex, indIndexScale );
+opclass memoryP ( indirect, indOffset12, indIndex, indIndexScale );
+opclass memoryF ( indirect, indOffsetFP );
+opclass memoryF2 ( indirect, indOffsetFPx2 );
+opclass memoryD ( indirect, indOffsetFP );
+opclass memoryfp( indirect, indOffsetFP );
+opclass memoryB ( indirect, indIndex, indOffsetHD );
+opclass memoryS ( indirect, indIndex, indOffsetHD );
+opclass memoryL ( indirect, indIndex, indOffsetHD );
+
+opclass memoryScaledI(indIndexScale);
+opclass memoryScaledP(indIndexScale);
+
+// when ldrex/strex is used:
+opclass memoryex ( indirect );
+opclass indIndexMemory( indIndex );
+opclass memorylong ( indirect, indOffset12x2 );
+opclass memoryvld ( indirect /* , write back mode not implemented */ );
+#endif
+
+//----------PIPELINE-----------------------------------------------------------
+pipeline %{
+
+//----------ATTRIBUTES---------------------------------------------------------
+attributes %{
+  fixed_size_instructions;           // Fixed size instructions
+  max_instructions_per_bundle = 4;   // Up to 4 instructions per bundle
+  instruction_unit_size = 4;         // An instruction is 4 bytes long
+  instruction_fetch_unit_size = 16;  // The processor fetches one line
+  instruction_fetch_units = 1;       // of 16 bytes
+
+  // List of nop instructions
+  nops( Nop_A0, Nop_A1, Nop_MS, Nop_FA, Nop_BR );
+%}
+
+//----------RESOURCES----------------------------------------------------------
+// Resources are the functional units available to the machine
+resources(A0, A1, MS, BR, FA, FM, IDIV, FDIV, IALU = A0 | A1);
+
+//----------PIPELINE DESCRIPTION-----------------------------------------------
+// Pipeline Description specifies the stages in the machine's pipeline
+
+pipe_desc(A, P, F, B, I, J, S, R, E, C, M, W, X, T, D);
+
+//----------PIPELINE CLASSES---------------------------------------------------
+// Pipeline Classes describe the stages in which input and output are
+// referenced by the hardware pipeline.
+
+// Integer ALU reg-reg operation
+pipe_class ialu_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{
+    single_instruction;
+    dst   : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU reg-reg long operation
+pipe_class ialu_reg_reg_2(iRegL dst, iRegL src1, iRegL src2) %{
+    instruction_count(2);
+    dst   : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R;
+    IALU  : R;
+%}
+
+// Integer ALU reg-reg long dependent operation
+pipe_class ialu_reg_reg_2_dep(iRegL dst, iRegL src1, iRegL src2, flagsReg cr) %{
+    instruction_count(1); multiple_bundles;
+    dst   : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    cr    : E(write);
+    IALU  : R(2);
+%}
+
+// Integer ALU reg-imm operation
+pipe_class ialu_reg_imm(iRegI dst, iRegI src1) %{
+    single_instruction;
+    dst   : E(write);
+    src1  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU reg-reg operation with condition code
+pipe_class ialu_cc_reg_reg(iRegI dst, iRegI src1, iRegI src2, flagsReg cr) %{
+    single_instruction;
+    dst   : E(write);
+    cr    : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU zero-reg operation
+pipe_class ialu_zero_reg(iRegI dst, immI0 zero, iRegI src2) %{
+    single_instruction;
+    dst   : E(write);
+    src2  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU zero-reg operation with condition code only
+pipe_class ialu_cconly_zero_reg(flagsReg cr, iRegI src) %{
+    single_instruction;
+    cr    : E(write);
+    src   : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU reg-reg operation with condition code only
+pipe_class ialu_cconly_reg_reg(flagsReg cr, iRegI src1, iRegI src2) %{
+    single_instruction;
+    cr    : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU reg-imm operation with condition code only
+pipe_class ialu_cconly_reg_imm(flagsReg cr, iRegI src1) %{
+    single_instruction;
+    cr    : E(write);
+    src1  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU reg-reg-zero operation with condition code only
+pipe_class ialu_cconly_reg_reg_zero(flagsReg cr, iRegI src1, iRegI src2, immI0 zero) %{
+    single_instruction;
+    cr    : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU reg-imm-zero operation with condition code only
+pipe_class ialu_cconly_reg_imm_zero(flagsReg cr, iRegI src1, immI0 zero) %{
+    single_instruction;
+    cr    : E(write);
+    src1  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU reg-reg operation with condition code, src1 modified
+pipe_class ialu_cc_rwreg_reg(flagsReg cr, iRegI src1, iRegI src2) %{
+    single_instruction;
+    cr    : E(write);
+    src1  : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R;
+%}
+
+pipe_class cmpL_reg(iRegI dst, iRegL src1, iRegL src2, flagsReg cr ) %{
+    multiple_bundles;
+    dst   : E(write)+4;
+    cr    : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R(3);
+    BR    : R(2);
+%}
+
+// Integer ALU operation
+pipe_class ialu_none(iRegI dst) %{
+    single_instruction;
+    dst   : E(write);
+    IALU  : R;
+%}
+
+// Integer ALU reg operation
+pipe_class ialu_reg(iRegI dst, iRegI src) %{
+    single_instruction; may_have_no_code;
+    dst   : E(write);
+    src   : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU reg conditional operation
+// This instruction has a 1 cycle stall, and cannot execute
+// in the same cycle as the instruction setting the condition
+// code. We kludge this by pretending to read the condition code
+// 1 cycle earlier, and by marking the functional units as busy
+// for 2 cycles with the result available 1 cycle later than
+// is really the case.
+pipe_class ialu_reg_flags( iRegI op2_out, iRegI op2_in, iRegI op1, flagsReg cr ) %{
+    single_instruction;
+    op2_out : C(write);
+    op1     : R(read);
+    cr      : R(read);       // This is really E, with a 1 cycle stall
+    BR      : R(2);
+    MS      : R(2);
+%}
+
+// Integer ALU reg operation
+pipe_class ialu_move_reg_L_to_I(iRegI dst, iRegL src) %{
+    single_instruction; may_have_no_code;
+    dst   : E(write);
+    src   : R(read);
+    IALU  : R;
+%}
+pipe_class ialu_move_reg_I_to_L(iRegL dst, iRegI src) %{
+    single_instruction; may_have_no_code;
+    dst   : E(write);
+    src   : R(read);
+    IALU  : R;
+%}
+
+// Two integer ALU reg operations
+pipe_class ialu_reg_2(iRegL dst, iRegL src) %{
+    instruction_count(2);
+    dst   : E(write);
+    src   : R(read);
+    A0    : R;
+    A1    : R;
+%}
+
+// Two integer ALU reg operations
+pipe_class ialu_move_reg_L_to_L(iRegL dst, iRegL src) %{
+    instruction_count(2); may_have_no_code;
+    dst   : E(write);
+    src   : R(read);
+    A0    : R;
+    A1    : R;
+%}
+
+// Integer ALU imm operation
+pipe_class ialu_imm(iRegI dst) %{
+    single_instruction;
+    dst   : E(write);
+    IALU  : R;
+%}
+
+pipe_class ialu_imm_n(iRegI dst) %{
+    single_instruction;
+    dst   : E(write);
+    IALU  : R;
+%}
+
+// Integer ALU reg-reg with carry operation
+pipe_class ialu_reg_reg_cy(iRegI dst, iRegI src1, iRegI src2, iRegI cy) %{
+    single_instruction;
+    dst   : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU cc operation
+pipe_class ialu_cc(iRegI dst, flagsReg cc) %{
+    single_instruction;
+    dst   : E(write);
+    cc    : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU cc / second IALU operation
+pipe_class ialu_reg_ialu( iRegI dst, iRegI src ) %{
+    instruction_count(1); multiple_bundles;
+    dst   : E(write)+1;
+    src   : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU cc / second IALU operation
+pipe_class ialu_reg_reg_ialu( iRegI dst, iRegI p, iRegI q ) %{
+    instruction_count(1); multiple_bundles;
+    dst   : E(write)+1;
+    p     : R(read);
+    q     : R(read);
+    IALU  : R;
+%}
+
+// Integer ALU hi-lo-reg operation
+pipe_class ialu_hi_lo_reg(iRegI dst, immI src) %{
+    instruction_count(1); multiple_bundles;
+    dst   : E(write)+1;
+    IALU  : R(2);
+%}
+
+// Long Constant
+pipe_class loadConL( iRegL dst, immL src ) %{
+    instruction_count(2); multiple_bundles;
+    dst   : E(write)+1;
+    IALU  : R(2);
+    IALU  : R(2);
+%}
+
+// Pointer Constant
+pipe_class loadConP( iRegP dst, immP src ) %{
+    instruction_count(0); multiple_bundles;
+    fixed_latency(6);
+%}
+
+// Polling Address
+pipe_class loadConP_poll( iRegP dst, immP_poll src ) %{
+    dst   : E(write);
+    IALU  : R;
+%}
+
+// Long Constant small
+pipe_class loadConLlo( iRegL dst, immL src ) %{
+    instruction_count(2);
+    dst   : E(write);
+    IALU  : R;
+    IALU  : R;
+%}
+
+// [PHH] This is wrong for 64-bit.  See LdImmF/D.
+pipe_class loadConFD(regF dst, immF src, iRegP tmp) %{
+    instruction_count(1); multiple_bundles;
+    src   : R(read);
+    dst   : M(write)+1;
+    IALU  : R;
+    MS    : E;
+%}
+
+// Integer ALU nop operation
+pipe_class ialu_nop() %{
+    single_instruction;
+    IALU  : R;
+%}
+
+// Integer ALU nop operation (uses the A0 unit)
+pipe_class ialu_nop_A0() %{
+    single_instruction;
+    A0    : R;
+%}
+
+// Integer ALU nop operation (uses the A1 unit)
+pipe_class ialu_nop_A1() %{
+    single_instruction;
+    A1    : R;
+%}
+
+// Integer Multiply reg-reg operation
+pipe_class imul_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{
+    single_instruction;
+    dst   : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    MS    : R(5);
+%}
+
+pipe_class mulL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{
+    single_instruction;
+    dst   : E(write)+4;
+    src1  : R(read);
+    src2  : R(read);
+    MS    : R(6);
+%}
+
+// Integer Divide reg-reg
+pipe_class sdiv_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI temp, flagsReg cr) %{
+    instruction_count(1); multiple_bundles;
+    dst   : E(write);
+    temp  : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    temp  : R(read);
+    MS    : R(38);
+%}
+
+// Long Divide
+pipe_class divL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{
+    dst  : E(write)+71;
+    src1 : R(read);
+    src2 : R(read)+1;
+    MS   : R(70);
+%}
+
+// Floating Point Add Float
+pipe_class faddF_reg_reg(regF dst, regF src1, regF src2) %{
+    single_instruction;
+    dst   : X(write);
+    src1  : E(read);
+    src2  : E(read);
+    FA    : R;
+%}
+
+// Floating Point Add Double
+pipe_class faddD_reg_reg(regD dst, regD src1, regD src2) %{
+    single_instruction;
+    dst   : X(write);
+    src1  : E(read);
+    src2  : E(read);
+    FA    : R;
+%}
+
+// Floating Point Conditional Move (float) based on integer flags
+pipe_class int_conditional_float_move (cmpOp cmp, flagsReg cr, regF dst, regF src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    cr    : R(read);
+    FA    : R(2);
+    BR    : R(2);
+%}
+
+// Double Conditional Move based on integer flags: cr read in R, src read in E, dst written in X; FA and BR each claimed at R(2)
+pipe_class int_conditional_double_move (cmpOp cmp, flagsReg cr, regD dst, regD src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    cr    : R(read);
+    FA    : R(2);
+    BR    : R(2);
+%}
+
+// Floating Point Multiply Float: sources read in E, dst written in X, FM claimed at R
+pipe_class fmulF_reg_reg(regF dst, regF src1, regF src2) %{
+    single_instruction;
+    dst   : X(write);
+    src1  : E(read);
+    src2  : E(read);
+    FM    : R;
+%}
+
+// Floating Point Multiply Double: same stage/resource shape as the float variant, with regD operands
+pipe_class fmulD_reg_reg(regD dst, regD src1, regD src2) %{
+    single_instruction;
+    dst   : X(write);
+    src1  : E(read);
+    src2  : E(read);
+    FM    : R;
+%}
+
+// Floating Point Divide Float: FM claimed at R, plus FDIV claimed at C with a cycle count of 14
+pipe_class fdivF_reg_reg(regF dst, regF src1, regF src2) %{
+    single_instruction;
+    dst   : X(write);
+    src1  : E(read);
+    src2  : E(read);
+    FM    : R;
+    FDIV  : C(14);
+%}
+
+// Floating Point Divide Double: FM claimed at R, plus FDIV claimed at C with a cycle count of 17
+pipe_class fdivD_reg_reg(regD dst, regD src1, regD src2) %{
+    single_instruction;
+    dst   : X(write);
+    src1  : E(read);
+    src2  : E(read);
+    FM    : R;
+    FDIV  : C(17);
+%}
+
+// Floating Point Move/Negate/Abs Float (single source): src read in E, dst written in W
+pipe_class faddF_reg(regF dst, regF src) %{
+    single_instruction;
+    dst   : W(write);
+    src   : E(read);
+    FA    : R(1); // explicit cycle count of 1; the double variant writes plain "R"
+%}
+
+// Floating Point Move/Negate/Abs Double (single source): src read in E, dst written in W, FA claimed at R
+pipe_class faddD_reg(regD dst, regD src) %{
+    single_instruction;
+    dst   : W(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert F->D: src read in E, dst written in X, FA claimed at R
+pipe_class fcvtF2D(regD dst, regF src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert I->D: src is declared regF here (NOTE(review): presumably the int already sits in an FP register -- confirm)
+pipe_class fcvtI2D(regD dst, regF src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert LHi->D: both operands are declared regD; src read in E, dst written in X, FA claimed at R
+pipe_class fcvtLHi2D(regD dst, regD src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert L->D: iRegL src read in E, regD dst written in X, FA claimed at R
+pipe_class fcvtL2D(regD dst, iRegL src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert L->F: iRegL src read in E, regF dst written in X, FA claimed at R
+pipe_class fcvtL2F(regF dst, iRegL src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert D->F (NOTE(review): operand types read regD dst / regF src, the reverse of D->F -- confirm whether intended)
+pipe_class fcvtD2F(regD dst, regF src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Convert I->L scheduled on the FA resource; operands are declared as FP registers (regD dst, regF src)
+pipe_class fcvtI2L(regD dst, regF src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert D->I: one instruction that may span multiple bundles; dst write is X plus 6
+pipe_class fcvtD2I(iRegI dst, regD src, flagsReg cr) %{
+    instruction_count(1); multiple_bundles;
+    dst   : X(write)+6;
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert D->L: one instruction that may span multiple bundles; dst write is X plus 6
+pipe_class fcvtD2L(regD dst, regD src, flagsReg cr) %{
+    instruction_count(1); multiple_bundles;
+    dst   : X(write)+6;
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert F->I: one instruction that may span multiple bundles; dst (declared regF) write is X plus 6
+pipe_class fcvtF2I(regF dst, regF src, flagsReg cr) %{
+    instruction_count(1); multiple_bundles;
+    dst   : X(write)+6;
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert F->L: one instruction that may span multiple bundles; dst (declared regD) write is X plus 6
+pipe_class fcvtF2L(regD dst, regF src, flagsReg cr) %{
+    instruction_count(1); multiple_bundles;
+    dst   : X(write)+6;
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Convert I->F: both operands declared regF; src read in E, dst written in X, FA claimed at R
+pipe_class fcvtI2F(regF dst, regF src) %{
+    single_instruction;
+    dst   : X(write);
+    src   : E(read);
+    FA    : R;
+%}
+
+// Floating Point Compare (float): sources read in E, FP flags cr written in X; the zero operand has no stage entry
+pipe_class faddF_fcc_reg_reg_zero(flagsRegF cr, regF src1, regF src2, immI0 zero) %{
+    single_instruction;
+    cr    : X(write);
+    src1  : E(read);
+    src2  : E(read);
+    FA    : R;
+%}
+
+// Floating Point Compare (double): sources read in E, FP flags cr written in X; the zero operand has no stage entry
+pipe_class faddD_fcc_reg_reg_zero(flagsRegF cr, regD src1, regD src2, immI0 zero) %{
+    single_instruction;
+    cr    : X(write);
+    src1  : E(read);
+    src2  : E(read);
+    FA    : R;
+%}
+
+// Floating Add Nop: claims only the FA resource at R; used by instruct Nop_FA
+pipe_class fadd_nop() %{
+    single_instruction;
+    FA  : R;
+%}
+
+// Integer Store to Memory: address operand read in R, stored value read in C, MS claimed at R
+pipe_class istore_mem_reg(memoryI mem, iRegI src) %{
+    single_instruction;
+    mem   : R(read);
+    src   : C(read);
+    MS    : R;
+%}
+
+// Store to Memory where the source is an sp_ptr_RegP operand; same stages as istore_mem_reg
+pipe_class istore_mem_spORreg(memoryI mem, sp_ptr_RegP src) %{
+    single_instruction;
+    mem   : R(read);
+    src   : C(read);
+    MS    : R;
+%}
+
+// Float Store from a register: address read in R, source read in C, MS claimed at R
+pipe_class fstoreF_mem_reg(memoryF mem, RegF src) %{
+    single_instruction;
+    mem : R(read);
+    src : C(read);
+    MS  : R;
+%}
+
+// Float Store of the immF0 constant: only the address operand has a stage entry (no src read)
+pipe_class fstoreF_mem_zero(memoryF mem, immF0 src) %{
+    single_instruction;
+    mem : R(read);
+    MS  : R;
+%}
+
+// Double Store from a register: declared with instruction_count(1) rather than single_instruction
+pipe_class fstoreD_mem_reg(memoryD mem, RegD src) %{
+    instruction_count(1);
+    mem : R(read);
+    src : C(read);
+    MS  : R;
+%}
+
+// Double Store of the immD0 constant: only the address operand has a stage entry (no src read)
+pipe_class fstoreD_mem_zero(memoryD mem, immD0 src) %{
+    single_instruction;
+    mem : R(read);
+    MS  : R;
+%}
+
+// Integer Load (when sign bit propagation not needed): address read in R, dst written in C
+pipe_class iload_mem(iRegI dst, memoryI mem) %{
+    single_instruction;
+    mem : R(read);
+    dst : C(write);
+    MS  : R;
+%}
+
+// Integer Load (when sign bit propagation or masking is needed): dst is written in M instead of C
+pipe_class iload_mask_mem(iRegI dst, memoryI mem) %{
+    single_instruction;
+    mem : R(read);
+    dst : M(write);
+    MS  : R;
+%}
+
+// Float Load: address read in R, dst written in M, MS claimed at R
+pipe_class floadF_mem(regF dst, memoryF mem) %{
+    single_instruction;
+    mem : R(read);
+    dst : M(write);
+    MS  : R;
+%}
+
+// Double Load: address read in R, dst written in M; used by the loadD/loadDoff instructs
+pipe_class floadD_mem(regD dst, memoryD mem) %{
+    instruction_count(1); multiple_bundles; // Again, unaligned argument is only multiple case
+    mem : R(read);
+    dst : M(write);
+    MS  : R;
+%}
+
+// Memory Nop: claims only the MS resource at R; used by instruct Nop_MS
+pipe_class mem_nop() %{
+    single_instruction;
+    MS  : R;
+%}
+
+pipe_class sethi(iRegP dst, immI src) %{
+    single_instruction; // immediate-to-pointer-register class: only dst has a stage entry
+    dst  : E(write);
+    IALU : R;
+%}
+
+pipe_class loadPollP(iRegP poll) %{
+    single_instruction; // NOTE(review): presumably the safepoint-poll load -- confirm against its users
+    poll : R(read);     // the poll register is only read; no destination operand is modelled
+    MS   : R;
+%}
+
+pipe_class br(Universe br, label labl) %{
+    single_instruction_with_delay_slot; // branch with no register/flag operands modelled
+    BR  : R;
+%}
+
+pipe_class br_cc(Universe br, cmpOp cmp, flagsReg cr, label labl) %{
+    single_instruction_with_delay_slot; // branch on condition codes
+    cr    : E(read);                    // flags are read in stage E
+    BR    : R;
+%}
+
+pipe_class br_reg(Universe br, cmpOp cmp, iRegI op1, label labl) %{
+    single_instruction_with_delay_slot; // branch on a register value
+    op1 : E(read);
+    BR  : R;
+    MS  : R; // unlike br/br_cc, this class also claims MS
+%}
+
+pipe_class br_nop() %{
+    single_instruction; // nop on the branch unit; used by instruct Nop_BR
+    BR  : R;
+%}
+
+pipe_class simple_call(method meth) %{
+    instruction_count(2); multiple_bundles; force_serialization; // two instructions, serialized
+    fixed_latency(100); // fixed latency of 100 instead of per-operand stages
+    BR  : R(1);
+    MS  : R(1);
+    A0  : R(1);
+%}
+
+pipe_class compiled_call(method meth) %{
+    instruction_count(1); multiple_bundles; force_serialization; // one instruction, serialized
+    fixed_latency(100);
+    MS  : R(1); // only MS is claimed
+%}
+
+pipe_class call(method meth) %{
+    instruction_count(0); multiple_bundles; force_serialization; // zero-instruction class, claims no resources
+    fixed_latency(100);
+%}
+
+pipe_class tail_call(Universe ignore, label labl) %{
+    single_instruction; has_delay_slot; // same BR/MS usage as ret, plus a fixed latency
+    fixed_latency(100);
+    BR  : R(1);
+    MS  : R(1);
+%}
+
+pipe_class ret(Universe ignore) %{
+    single_instruction; has_delay_slot; // return: claims BR and MS at R, no fixed latency
+    BR  : R(1);
+    MS  : R(1);
+%}
+
+// The real do-nothing guy: zero instructions, no operands, no resources
+pipe_class empty( ) %{
+    instruction_count(0);
+%}
+
+pipe_class long_memory_op() %{
+    instruction_count(0); multiple_bundles; force_serialization; // serialized, zero-instruction class
+    fixed_latency(25);
+    MS  : R(1);
+%}
+
+// Check-cast (partial subtype check): array and match read in R; IALU and BR claimed at R(2), MS at R
+pipe_class partial_subtype_check_pipe(Universe ignore, iRegP array, iRegP match ) %{
+    array : R(read);
+    match  : R(read);
+    IALU   : R(2);
+    BR     : R(2);
+    MS     : R;
+%}
+
+// Convert FPU flags into +1,0,-1: FP sources read in E, integer dst written in E; claims FA, MS(2) and BR(2) at R
+pipe_class floating_cmp( iRegI dst, regF src1, regF src2 ) %{
+    src1  : E(read);
+    src2  : E(read);
+    dst   : E(write);
+    FA    : R;
+    MS    : R(2);
+    BR    : R(2);
+%}
+
+// Compare for p < q, and conditionally add y: p, q and y are read in E; IALU claimed at R with a cycle count of 3
+pipe_class cadd_cmpltmask( iRegI p, iRegI q, iRegI y ) %{
+    p     : E(read);
+    q     : E(read);
+    y     : E(read);
+    IALU  : R(3);
+%}
+
+// Perform a compare, then move conditionally (NOTE(review): original text said "in a branch delay slot" -- confirm this still applies to ARM)
+pipe_class min_max( iRegI src2, iRegI srcdst ) %{
+    src2   : E(read);
+    srcdst : E(read);
+    IALU   : R;
+    BR     : R;
+%}
+
+// Define the pipe class for the Nop node: MachNop is bound to ialu_nop
+define %{
+   MachNop = ialu_nop;
+%}
+
+%}
+
+//----------INSTRUCTIONS-------------------------------------------------------
+
+//------------Special Nop instructions for bundling - no match rules-----------
+// Nop using the A0 functional unit (no match rule; bound to pipe class ialu_nop_A0)
+instruct Nop_A0() %{
+  ins_pipe(ialu_nop_A0);
+%}
+
+// Nop using the A1 functional unit (no match rule; bound to pipe class ialu_nop_A1)
+instruct Nop_A1( ) %{
+  ins_pipe(ialu_nop_A1);
+%}
+
+// Nop using the memory functional unit (no match rule; bound to pipe class mem_nop)
+instruct Nop_MS( ) %{
+  ins_pipe(mem_nop);
+%}
+
+// Nop using the floating add functional unit (no match rule; bound to pipe class fadd_nop)
+instruct Nop_FA( ) %{
+  ins_pipe(fadd_nop);
+%}
+
+// Nop using the branch functional unit (no match rule; bound to pipe class br_nop)
+instruct Nop_BR( ) %{
+  ins_pipe(br_nop);
+%}
+
+//----------Load/Store/Move Instructions---------------------------------------
+//----------Load Instructions--------------------------------------------------
+// Load Byte (8bit signed): matches LoadB and emits a single 4-byte LDRSB
+instruct loadB(iRegI dst, memoryB mem) %{
+  match(Set dst (LoadB mem));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRSB   $dst,$mem\t! byte -> int" %}
+  ins_encode %{
+    // High 32 bits are harmlessly set on Aarch64
+    __ ldrsb($dst$$Register, $mem$$Address);
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Byte (8bit signed) into a Long Register: folds ConvI2L(LoadB) into the load
+instruct loadB2L(iRegL dst, memoryB mem) %{
+  match(Set dst (ConvI2L (LoadB mem)));
+  ins_cost(MEMORY_REF_COST);
+
+#ifdef AARCH64
+  size(4);
+  format %{ "LDRSB $dst,$mem\t! byte -> long"  %}
+  ins_encode %{
+    __ ldrsb($dst$$Register, $mem$$Address);
+  %}
+#else
+  size(8);
+  format %{ "LDRSB $dst.lo,$mem\t! byte -> long\n\t"
+            "ASR   $dst.hi,$dst.lo,31" %}
+  ins_encode %{
+    __ ldrsb($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), AsmOperand($dst$$Register, asr, 31)); // high word = low word >> 31 (arithmetic)
+  %}
+#endif
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Unsigned Byte (8bit UNsigned) into an int reg: matches LoadUB and emits a single LDRB
+instruct loadUB(iRegI dst, memoryB mem) %{
+  match(Set dst (LoadUB mem));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRB   $dst,$mem\t! ubyte -> int" %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address);
+  %}
+  ins_pipe(iload_mem);
+%}
+
+// Load Unsigned Byte (8bit UNsigned) into a Long Register: folds ConvI2L(LoadUB) into the load
+instruct loadUB2L(iRegL dst, memoryB mem) %{
+  match(Set dst (ConvI2L (LoadUB mem)));
+  ins_cost(MEMORY_REF_COST);
+
+#ifdef AARCH64
+  size(4);
+  format %{ "LDRB  $dst,$mem\t! ubyte -> long"  %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address);
+  %}
+#else
+  size(8);
+  format %{ "LDRB  $dst.lo,$mem\t! ubyte -> long\n\t"
+            "MOV   $dst.hi,0" %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // unsigned source: high word is simply zeroed
+  %}
+#endif
+  ins_pipe(iload_mem);
+%}
+
+// Load Unsigned Byte (8 bit UNsigned) with immediate mask into Long Register: load, then AND with the mask
+instruct loadUB2L_limmI(iRegL dst, memoryB mem, limmIlow8 mask) %{
+  match(Set dst (ConvI2L (AndI (LoadUB mem) mask)));
+
+#ifdef AARCH64
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST);
+  size(8);
+  format %{ "LDRB  $dst,$mem\t! ubyte -> long\n\t"
+            "AND  $dst,$dst,$mask" %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address);
+    __ andr($dst$$Register, $dst$$Register, limmI_low($mask$$constant, 8)); // NOTE(review): limmI_low presumably keeps the low 8 mask bits -- confirm
+  %}
+#else
+  ins_cost(MEMORY_REF_COST + 2*DEFAULT_COST);
+  size(12);
+  format %{ "LDRB  $dst.lo,$mem\t! ubyte -> long\n\t"
+            "MOV   $dst.hi,0\n\t"
+            "AND  $dst.lo,$dst.lo,$mask" %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // high word zeroed; only the low word is masked below
+    __ andr($dst$$Register, $dst$$Register, limmI_low($mask$$constant, 8));
+  %}
+#endif
+  ins_pipe(iload_mem);
+%}
+
+// Load Short (16bit signed)
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented. Matches LoadS of (mem + off).
+instruct loadSoff(iRegI dst, memoryScaledS mem, aimmX off, iRegP tmp) %{
+  match(Set dst (LoadS (AddP mem off)));
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp);
+  size(4 * 2);
+
+  format %{ "LDRSH   $dst,$mem+$off\t! short temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, tmp as base
+    __ ldrsh($dst$$Register, nmem);
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+#endif
+
+instruct loadS(iRegI dst, memoryS mem) %{
+  match(Set dst (LoadS mem)); // plain signed 16-bit load
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRSH   $dst,$mem\t! short" %}
+  ins_encode %{
+    __ ldrsh($dst$$Register, $mem$$Address); // single LDRSH
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Short (16 bit signed) to Byte (8 bit signed): replaces (LoadS << 24) >> 24 with one LDRSB
+instruct loadS2B(iRegI dst, memoryS mem, immI_24 twentyfour) %{
+  match(Set dst (RShiftI (LShiftI (LoadS mem) twentyfour) twentyfour));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+
+  format %{ "LDRSB   $dst,$mem\t! short -> byte" %}
+  ins_encode %{
+    // High 32 bits are harmlessly set on Aarch64
+    __ ldrsb($dst$$Register, $mem$$Address); // NOTE(review): reading the byte at $mem presumably relies on little-endian layout -- confirm
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Short (16bit signed) into a Long Register: folds ConvI2L(LoadS) into the load
+instruct loadS2L(iRegL dst, memoryS mem) %{
+  match(Set dst (ConvI2L (LoadS mem)));
+  ins_cost(MEMORY_REF_COST);
+
+#ifdef AARCH64
+  size(4);
+  format %{ "LDRSH $dst,$mem\t! short -> long"  %}
+  ins_encode %{
+    __ ldrsh($dst$$Register, $mem$$Address);
+  %}
+#else
+  size(8);
+  format %{ "LDRSH $dst.lo,$mem\t! short -> long\n\t"
+            "ASR   $dst.hi,$dst.lo,31" %}
+  ins_encode %{
+    __ ldrsh($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), AsmOperand($dst$$Register, asr, 31)); // high word = low word >> 31 (arithmetic)
+  %}
+#endif
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Unsigned Short/Char (16bit UNsigned)
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented. Matches LoadUS of (mem + off).
+instruct loadUSoff(iRegI dst, memoryScaledS mem, aimmX off, iRegP tmp) %{
+  match(Set dst (LoadUS (AddP mem off)));
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp);
+  size(4 * 2);
+
+  format %{ "LDRH   $dst,$mem+$off\t! ushort/char temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, tmp as base
+    __ ldrh($dst$$Register, nmem);
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+instruct loadUS(iRegI dst, memoryS mem) %{
+  match(Set dst (LoadUS mem)); // plain unsigned 16-bit (char) load
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRH   $dst,$mem\t! ushort/char" %}
+  ins_encode %{
+    __ ldrh($dst$$Register, $mem$$Address); // single LDRH
+  %}
+  ins_pipe(iload_mem);
+%}
+
+// Load Unsigned Short/Char (16 bit UNsigned) to Byte (8 bit signed): replaces (LoadUS << 24) >> 24 with one LDRSB
+instruct loadUS2B(iRegI dst, memoryB mem, immI_24 twentyfour) %{
+  match(Set dst (RShiftI (LShiftI (LoadUS mem) twentyfour) twentyfour));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRSB   $dst,$mem\t! ushort -> byte" %}
+  ins_encode %{
+    __ ldrsb($dst$$Register, $mem$$Address); // NOTE(review): operand is memoryB, presumably because a byte load is emitted -- confirm
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Unsigned Short/Char (16bit UNsigned) into a Long Register: folds ConvI2L(LoadUS) into the load
+instruct loadUS2L(iRegL dst, memoryS mem) %{
+  match(Set dst (ConvI2L (LoadUS mem)));
+  ins_cost(MEMORY_REF_COST);
+
+#ifdef AARCH64
+  size(4);
+  format %{ "LDRH  $dst,$mem\t! ushort -> long"  %}
+  ins_encode %{
+    __ ldrh($dst$$Register, $mem$$Address);
+  %}
+#else
+  size(8);
+  format %{ "LDRH  $dst.lo,$mem\t! ushort -> long\n\t"
+            "MOV   $dst.hi, 0" %}
+  ins_encode %{
+    __ ldrh($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // unsigned source: high word is simply zeroed
+  %}
+#endif
+  ins_pipe(iload_mem);
+%}
+
+// Load Unsigned Short/Char (16bit UNsigned) with mask 0xFF into a Long Register: the AND is replaced by a byte load
+instruct loadUS2L_immI_255(iRegL dst, memoryB mem, immI_255 mask) %{
+  match(Set dst (ConvI2L (AndI (LoadUS mem) mask)));
+  ins_cost(MEMORY_REF_COST);
+
+#ifdef AARCH64
+  size(4);
+  format %{ "LDRB  $dst,$mem"  %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address); // NOTE(review): byte at $mem stands in for (ushort & 0xFF), presumably little-endian -- confirm
+  %}
+#else
+  size(8);
+  format %{ "LDRB  $dst.lo,$mem\t! \n\t"
+            "MOV   $dst.hi, 0" %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // high word zeroed
+  %}
+#endif
+  ins_pipe(iload_mem);
+%}
+
+// Load Unsigned Short/Char (16bit UNsigned) with a immediate mask into a Long Register: LDRH, then AND with the mask
+instruct loadUS2L_limmI(iRegL dst, memoryS mem, limmI mask) %{
+  match(Set dst (ConvI2L (AndI (LoadUS mem) mask)));
+#ifdef AARCH64
+  ins_cost(MEMORY_REF_COST + 1*DEFAULT_COST);
+
+  size(8);
+  format %{ "LDRH   $dst,$mem\t! ushort/char & mask -> long\n\t"
+            "AND    $dst,$dst,$mask" %}
+  ins_encode %{
+    __ ldrh($dst$$Register, $mem$$Address);
+    __ andr($dst$$Register, $dst$$Register, (uintx)$mask$$constant); // mask is cast to uintx before the AND
+  %}
+#else
+  ins_cost(MEMORY_REF_COST + 2*DEFAULT_COST);
+
+  size(12);
+  format %{ "LDRH   $dst,$mem\t! ushort/char & mask -> long\n\t"
+            "MOV    $dst.hi, 0\n\t"
+            "AND    $dst,$dst,$mask" %}
+  ins_encode %{
+    __ ldrh($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // high word zeroed; the AND below touches the low word only
+    __ andr($dst$$Register, $dst$$Register, $mask$$constant);
+  %}
+#endif
+  ins_pipe(iload_mem);
+%}
+
+// Load Integer
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented. Matches LoadI of (mem + off).
+instruct loadIoff(iRegI dst, memoryScaledI mem, aimmX off, iRegP tmp) %{
+  match(Set dst (LoadI (AddP mem off)));
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp);
+  size(4 * 2);
+
+  format %{ "ldr_s32 $dst,$mem+$off\t! int temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, tmp as base
+    __ ldr_s32($dst$$Register, nmem);
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+instruct loadI(iRegI dst, memoryI mem) %{
+  match(Set dst (LoadI mem)); // plain 32-bit int load
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "ldr_s32 $dst,$mem\t! int" %}
+  ins_encode %{
+    __ ldr_s32($dst$$Register, $mem$$Address); // emitted through the ldr_s32 assembler helper
+  %}
+  ins_pipe(iload_mem);
+%}
+
+// Load Integer to Byte (8 bit signed): replaces (LoadI << 24) >> 24 with one LDRSB
+instruct loadI2B(iRegI dst, memoryS mem, immI_24 twentyfour) %{
+  match(Set dst (RShiftI (LShiftI (LoadI mem) twentyfour) twentyfour));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+
+  format %{ "LDRSB   $dst,$mem\t! int -> byte" %}
+  ins_encode %{
+    __ ldrsb($dst$$Register, $mem$$Address); // NOTE(review): byte at $mem stands in for the int's low byte, presumably little-endian -- confirm
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Integer to Unsigned Byte (8 bit UNsigned): replaces (LoadI & 0xFF) with one LDRB
+instruct loadI2UB(iRegI dst, memoryB mem, immI_255 mask) %{
+  match(Set dst (AndI (LoadI mem) mask));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+
+  format %{ "LDRB   $dst,$mem\t! int -> ubyte" %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address); // the mask operand is not emitted; the byte load takes its place
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Integer to Short (16 bit signed): replaces (LoadI << 16) >> 16 with one LDRSH
+instruct loadI2S(iRegI dst, memoryS mem, immI_16 sixteen) %{
+  match(Set dst (RShiftI (LShiftI (LoadI mem) sixteen) sixteen));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRSH   $dst,$mem\t! int -> short" %}
+  ins_encode %{
+    __ ldrsh($dst$$Register, $mem$$Address); // the shift operands are not emitted; the halfword load takes their place
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Integer to Unsigned Short (16 bit UNsigned): replaces (LoadI & 0xFFFF) with one LDRH
+instruct loadI2US(iRegI dst, memoryS mem, immI_65535 mask) %{
+  match(Set dst (AndI (LoadI mem) mask));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRH   $dst,$mem\t! int -> ushort/char" %}
+  ins_encode %{
+    __ ldrh($dst$$Register, $mem$$Address); // the mask operand is not emitted; the halfword load takes its place
+  %}
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Integer into a Long Register: folds ConvI2L(LoadI) into the load
+instruct loadI2L(iRegL dst, memoryI mem) %{
+  match(Set dst (ConvI2L (LoadI mem)));
+#ifdef AARCH64
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRSW $dst.lo,$mem\t! int -> long"  %}
+  ins_encode %{
+    __ ldr_s32($dst$$Register, $mem$$Address); // format prints LDRSW; the encode goes through ldr_s32
+  %}
+#else
+  ins_cost(MEMORY_REF_COST);
+
+  size(8);
+  format %{ "LDR   $dst.lo,$mem\t! int -> long\n\t"
+            "ASR   $dst.hi,$dst.lo,31\t! int->long" %}
+  ins_encode %{
+    __ ldr($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), AsmOperand($dst$$Register, asr, 31)); // high word = low word >> 31 (arithmetic)
+  %}
+#endif
+  ins_pipe(iload_mask_mem);
+%}
+
+// Load Integer with mask 0xFF into a Long Register: the AND is replaced by a byte load
+instruct loadI2L_immI_255(iRegL dst, memoryB mem, immI_255 mask) %{
+  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
+#ifdef AARCH64
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDRB   $dst.lo,$mem\t! int & 0xFF -> long"  %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address);
+  %}
+#else
+  ins_cost(MEMORY_REF_COST);
+
+  size(8);
+  format %{ "LDRB   $dst.lo,$mem\t! int & 0xFF -> long\n\t"
+            "MOV    $dst.hi, 0" %}
+  ins_encode %{
+    __ ldrb($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // masked value is non-negative: high word zeroed
+  %}
+#endif
+  ins_pipe(iload_mem);
+%}
+
+// Load Integer with mask 0xFFFF into a Long Register: the AND is replaced by a halfword load
+instruct loadI2L_immI_65535(iRegL dst, memoryS mem, immI_65535 mask) %{
+  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
+  ins_cost(MEMORY_REF_COST);
+
+#ifdef AARCH64
+  size(4);
+  format %{ "LDRH   $dst,$mem\t! int & 0xFFFF -> long" %}
+  ins_encode %{
+    __ ldrh($dst$$Register, $mem$$Address);
+  %}
+#else
+  size(8);
+  format %{ "LDRH   $dst,$mem\t! int & 0xFFFF -> long\n\t"
+            "MOV    $dst.hi, 0" %}
+  ins_encode %{
+    __ ldrh($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // masked value is non-negative: high word zeroed
+  %}
+#endif
+  ins_pipe(iload_mask_mem);
+%}
+
+#ifdef AARCH64
+// Load Integer with an immediate mask into a Long Register (AArch64 only): load, then AND with the mask
+instruct loadI2L_limmI(iRegL dst, memoryI mem, limmI mask) %{
+  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
+  ins_cost(MEMORY_REF_COST + 1*DEFAULT_COST);
+
+  size(8);
+  format %{ "LDRSW $dst,$mem\t! int -> long\n\t"
+            "AND   $dst,$dst,$mask" %}
+
+  ins_encode %{
+    __ ldr_s32($dst$$Register, $mem$$Address);
+    __ andr($dst$$Register, $dst$$Register, (uintx)$mask$$constant); // mask is cast to uintx before the AND
+  %}
+  ins_pipe(iload_mem);
+%}
+#else
+// Load Integer with a 31-bit immediate mask into a Long Register (32-bit ARM only): LDR, zero high word, AND low word
+instruct loadI2L_limmU31(iRegL dst, memoryI mem, limmU31 mask) %{
+  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
+  ins_cost(MEMORY_REF_COST + 2*DEFAULT_COST);
+
+  size(12);
+  format %{ "LDR   $dst.lo,$mem\t! int -> long\n\t"
+            "MOV    $dst.hi, 0\n\t"
+            "AND   $dst,$dst,$mask" %}
+
+  ins_encode %{
+    __ ldr($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // high word is set to zero unconditionally
+    __ andr($dst$$Register, $dst$$Register, $mask$$constant);
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+#ifdef AARCH64
+// Load Integer with mask into a Long Register (AArch64 only): mask is materialized in $tmp, then ANDed
+// FIXME: use signedRegI mask, remove tmp?
+instruct loadI2L_immI(iRegL dst, memoryI mem, immI mask, iRegI tmp) %{
+  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
+  effect(TEMP dst, TEMP tmp);
+
+  ins_cost(MEMORY_REF_COST + 3*DEFAULT_COST);
+  format %{ "LDRSW    $dst,$mem\t! int & mask -> long\n\t"
+            "MOV_SLOW $tmp,$mask\n\t"
+            "AND      $dst,$dst,$tmp" %}
+  ins_encode %{
+    __ ldrsw($dst$$Register, $mem$$Address);
+    __ mov_slow($tmp$$Register, $mask$$constant); // no size() is declared for this instruct
+    __ andr($dst$$Register, $dst$$Register, $tmp$$Register);
+  %}
+  ins_pipe(iload_mem);
+%}
+#else
+// Load Integer with a 31-bit mask into a Long Register (32-bit ARM only): mask is materialized in $tmp, then ANDed
+// FIXME: use iRegI mask, remove tmp?
+instruct loadI2L_immU31(iRegL dst, memoryI mem, immU31 mask, iRegI tmp) %{
+  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
+  effect(TEMP dst, TEMP tmp);
+
+  ins_cost(MEMORY_REF_COST + 4*DEFAULT_COST);
+  size(20);
+  format %{ "LDR      $dst.lo,$mem\t! int & 31-bit mask -> long\n\t"
+            "MOV      $dst.hi, 0\n\t"
+            "MOV_SLOW $tmp,$mask\n\t"
+            "AND      $dst.lo,$dst.lo,$tmp" %}
+  ins_encode %{
+    __ ldr($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // high word is set to zero unconditionally
+    __ mov_slow($tmp$$Register, $mask$$constant);
+    __ andr($dst$$Register, $dst$$Register, $tmp$$Register);
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+// Load Unsigned Integer into a Long Register: folds (ConvI2L(LoadI) & immL_32bits) into a zero-extending load
+instruct loadUI2L(iRegL dst, memoryI mem, immL_32bits mask) %{
+  match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
+  ins_cost(MEMORY_REF_COST);
+
+#ifdef AARCH64
+//size(4);
+  format %{ "LDR_w $dst,$mem\t! uint -> long" %}
+  ins_encode %{
+    __ ldr_w($dst$$Register, $mem$$Address); // size(4) is commented out above, so no fixed size is asserted here
+  %}
+#else
+  size(8);
+  format %{ "LDR   $dst.lo,$mem\t! uint -> long\n\t"
+            "MOV   $dst.hi,0" %}
+  ins_encode %{
+    __ ldr($dst$$Register, $mem$$Address);
+    __ mov($dst$$Register->successor(), 0); // high word zeroed
+  %}
+#endif
+  ins_pipe(iload_mem);
+%}
+
+// Load Long
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented. Matches LoadL of (mem + off).
+instruct loadLoff(iRegLd dst, memoryScaledL mem, aimmX off, iRegP tmp) %{
+  match(Set dst (LoadL (AddP mem off)));
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp);
+  size(4 * 2);
+
+  format %{ "LDR    $dst,$mem+$off\t! long temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, tmp as base
+    __ ldr($dst$$Register, nmem);
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+instruct loadL(iRegLd dst, memoryL mem ) %{
+#ifdef AARCH64
+  // no predicate: the load is already atomic for Aarch64
+#else
+  predicate(!((LoadLNode*)n)->require_atomic_access()); // 32-bit: only when atomic access is not required
+#endif
+  match(Set dst (LoadL mem));
+  effect(TEMP dst);
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "ldr_64  $dst,$mem\t! long" %}
+  ins_encode %{
+    __ ldr_64($dst$$Register, $mem$$Address); // single 4-byte instruction via the ldr_64 helper
+  %}
+  ins_pipe(iload_mem);
+%}
+
+#ifndef AARCH64
+instruct loadL_2instr(iRegL dst, memorylong mem ) %{
+  predicate(!((LoadLNode*)n)->require_atomic_access()); // non-atomic long load as two word loads
+  match(Set dst (LoadL mem));
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST);
+
+  size(8);
+  format %{ "LDR    $dst.lo,$mem \t! long order of instrs reversed if $dst.lo == base($mem)\n\t"
+            "LDR    $dst.hi,$mem+4 or $mem" %}
+  ins_encode %{
+    Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none);
+    Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none); // second word at disp + 4
+
+    if ($dst$$Register == reg_to_register_object($mem$$base)) { // dst.lo is also the base register
+      __ ldr($dst$$Register->successor(), Amemhi); // load hi first so the base is overwritten last
+      __ ldr($dst$$Register, Amemlo);
+    } else {
+      __ ldr($dst$$Register, Amemlo);
+      __ ldr($dst$$Register->successor(), Amemhi);
+    }
+  %}
+  ins_pipe(iload_mem);
+%}
+
+instruct loadL_volatile(iRegL dst, indirect mem ) %{
+  predicate(((LoadLNode*)n)->require_atomic_access()); // only for long loads that require atomic access
+  match(Set dst (LoadL mem));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDMIA    $dst,$mem\t! long" %}
+  ins_encode %{
+    // FIXME: why is ldmia considered atomic?  Should be ldrexd
+    RegisterSet set($dst$$Register);
+    set = set | reg_to_register_object($dst$$reg + 1); // add the next-numbered register to the set
+    __ ldmia(reg_to_register_object($mem$$base), set); // load-multiple from the base register of $mem
+  %}
+  ins_pipe(iload_mem);
+%}
+
+instruct loadL_volatile_fp(iRegL dst, memoryD mem ) %{
+  predicate(((LoadLNode*)n)->require_atomic_access()); // only for long loads that require atomic access
+  match(Set dst (LoadL mem));
+  ins_cost(MEMORY_REF_COST);
+
+  size(8);
+  format %{ "FLDD      S14, $mem\n\t"
+            "FMRRD    $dst, S14\t! long" %}
+  ins_encode %{
+    __ fldd(S14, $mem$$Address); // load through the fixed FP register S14
+    __ fmrrd($dst$$Register, $dst$$Register->successor(), S14); // then move it into the dst / dst->successor() pair
+  %}
+  ins_pipe(iload_mem);
+%}
+
+instruct loadL_unaligned(iRegL dst, memorylong mem ) %{
+  match(Set dst (LoadL_unaligned mem)); // no predicate: applies to every LoadL_unaligned
+  ins_cost(MEMORY_REF_COST);
+
+  size(8);
+  format %{ "LDR    $dst.lo,$mem\t! long order of instrs reversed if $dst.lo == base($mem)\n\t"
+            "LDR    $dst.hi,$mem+4" %}
+  ins_encode %{
+    Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none);
+    Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none); // second word at disp + 4
+
+    if ($dst$$Register == reg_to_register_object($mem$$base)) { // dst.lo is also the base register
+      __ ldr($dst$$Register->successor(), Amemhi); // load hi first so the base is overwritten last
+      __ ldr($dst$$Register, Amemlo);
+    } else {
+      __ ldr($dst$$Register, Amemlo);
+      __ ldr($dst$$Register->successor(), Amemhi);
+    }
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif // !AARCH64
+
+// Load Range: matches LoadRange and emits one load through the ldr_u32 helper
+instruct loadRange(iRegI dst, memoryI mem) %{
+  match(Set dst (LoadRange mem));
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  format %{ "LDR_u32 $dst,$mem\t! range" %}
+  ins_encode %{
+    __ ldr_u32($dst$$Register, $mem$$Address);
+  %}
+  ins_pipe(iload_mem);
+%}
+
+// Load Pointer
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented. Matches LoadP of (mem + off).
+instruct loadPoff(iRegP dst, memoryScaledP mem, aimmX off, iRegP tmp) %{
+  match(Set dst (LoadP (AddP mem off)));
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp);
+  size(4 * 2);
+
+  format %{ "LDR    $dst,$mem+$off\t! ptr temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, tmp as base
+    __ ldr($dst$$Register, nmem);
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+instruct loadP(iRegP dst, memoryP mem) %{
+  match(Set dst (LoadP mem)); // plain pointer load
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+
+  format %{ "LDR   $dst,$mem\t! ptr" %}
+  ins_encode %{
+    __ ldr($dst$$Register, $mem$$Address); // single LDR
+  %}
+  ins_pipe(iload_mem);
+%}
+
+#ifdef XXX
+// FIXME XXXX: only compiled under "#ifdef XXX"; loads into $tmp and then moves $tmp into the SPRegP $dst
+//instruct loadSP(iRegP dst, memoryP mem) %{
+instruct loadSP(SPRegP dst, memoryP mem, iRegP tmp) %{
+  match(Set dst (LoadP mem));
+  effect(TEMP tmp);
+  ins_cost(MEMORY_REF_COST+1);
+  size(8);
+
+  format %{ "LDR   $tmp,$mem\t! ptr\n\t"
+            "MOV   $dst,$tmp\t! ptr" %}
+  ins_encode %{
+    __ ldr($tmp$$Register, $mem$$Address);
+    __ mov($dst$$Register, $tmp$$Register);
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+#ifdef _LP64
+// Load Compressed Pointer
+
+// XXX This variant shouldn't be necessary if 6217251 is implemented. Matches LoadN of (mem + off).
+instruct loadNoff(iRegN dst, memoryScaledI mem, aimmX off, iRegP tmp) %{
+  match(Set dst (LoadN (AddP mem off)));
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp);
+  size(4 * 2);
+
+  format %{ "ldr_u32 $dst,$mem+$off\t! compressed ptr temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, tmp as base
+    __ ldr_u32($dst$$Register, nmem);
+  %}
+  ins_pipe(iload_mem);
+%}
+
+instruct loadN(iRegN dst, memoryI mem) %{
+  match(Set dst (LoadN mem)); // compressed pointer load (only built under _LP64)
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+
+  format %{ "ldr_u32 $dst,$mem\t! compressed ptr" %}
+  ins_encode %{
+    __ ldr_u32($dst$$Register, $mem$$Address); // emitted through the ldr_u32 helper
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+// Load Klass Pointer: matches LoadKlass and emits a single LDR into a pointer register
+instruct loadKlass(iRegP dst, memoryI mem) %{
+  match(Set dst (LoadKlass mem));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+
+  format %{ "LDR   $dst,$mem\t! klass ptr" %}
+  ins_encode %{
+    __ ldr($dst$$Register, $mem$$Address);
+  %}
+  ins_pipe(iload_mem);
+%}
+
+#ifdef _LP64
+// Load narrow Klass Pointer
+instruct loadNKlass(iRegN dst, memoryI mem) %{
+  match(Set dst (LoadNKlass mem)); // compressed klass pointer load into a narrow register
+  ins_cost(MEMORY_REF_COST);
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "ldr_u32 $dst,$mem\t! compressed klass ptr" %}
+  ins_encode %{
+    __ ldr_u32($dst$$Register, $mem$$Address);
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct loadDoff(regD dst, memoryScaledD mem, aimmX off, iRegP tmp) %{
+  match(Set dst (LoadD (AddP mem off))); // LoadD whose address is mem plus an immediate
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "ldr    $dst,$mem+$off\t! double temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ ldr_d($dst$$FloatRegister, nmem);
+  %}
+  ins_pipe(floadD_mem);
+%}
+#endif
+
+instruct loadD(regD dst, memoryD mem) %{
+  match(Set dst (LoadD mem)); // double load into a floating-point register
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  // FIXME: this load needs to be atomic, but the ARMv7 A.R.M. guarantees
+  // that only LDREXD and STREXD are 64-bit single-copy atomic
+  format %{ "FLDD   $dst,$mem" %}
+  ins_encode %{
+    __ ldr_double($dst$$FloatRegister, $mem$$Address);
+  %}
+  ins_pipe(floadD_mem);
+%}
+
+#ifndef AARCH64
+// Load Double - UNaligned
+instruct loadD_unaligned(regD_low dst, memoryF2 mem ) %{
+  match(Set dst (LoadD_unaligned mem)); // double load done as two single-precision loads
+  ins_cost(MEMORY_REF_COST*2+DEFAULT_COST);
+  size(8);
+  format %{ "FLDS    $dst.lo,$mem\t! misaligned double\n"
+          "\tFLDS    $dst.hi,$mem+4\t!" %}
+  ins_encode %{
+    Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // first word, at disp
+    Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none); // second word, at disp + 4
+      __ flds($dst$$FloatRegister, Amemlo);
+      __ flds($dst$$FloatRegister->successor(), Amemhi); // second half goes to the successor register
+  %}
+  ins_pipe(iload_mem);
+%}
+#endif
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct loadFoff(regF dst, memoryScaledF mem, aimmX off, iRegP tmp) %{
+  match(Set dst (LoadF (AddP mem off))); // LoadF whose address is mem plus an immediate
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "ldr    $dst,$mem+$off\t! float temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ ldr_s($dst$$FloatRegister, nmem);
+  %}
+  ins_pipe(floadF_mem);
+%}
+#endif
+
+instruct loadF(regF dst, memoryF mem) %{
+  match(Set dst (LoadF mem)); // float load into a floating-point register
+
+  ins_cost(MEMORY_REF_COST);
+  size(4); // encoding is declared as 4 bytes
+  format %{ "FLDS    $dst,$mem" %}
+  ins_encode %{
+    __ ldr_float($dst$$FloatRegister, $mem$$Address);
+  %}
+  ins_pipe(floadF_mem);
+%}
+
+#ifdef AARCH64
+instruct load_limmI(iRegI dst, limmI src) %{
+  match(Set dst src); // int constant accepted by the limmI operand
+  ins_cost(DEFAULT_COST + 1); // + 1 because MOV is preferred
+  format %{ "ORR_w  $dst, ZR, $src\t! int"  %}
+  ins_encode %{
+    __ orr_w($dst$$Register, ZR, (uintx)$src$$constant); // OR the immediate with ZR
+  %}
+  ins_pipe(ialu_imm);
+%}
+#endif
+
+// Load Constant
+instruct loadConI( iRegI dst, immI src ) %{
+  match(Set dst src); // any int constant (immI operand)
+  ins_cost(DEFAULT_COST * 3/2); // costlier than the rules below that declare no ins_cost
+  format %{ "MOV_SLOW    $dst, $src" %}
+  ins_encode %{
+    __ mov_slow($dst$$Register, $src$$constant);
+  %}
+  ins_pipe(ialu_hi_lo_reg);
+%}
+
+instruct loadConIMov( iRegI dst, immIMov src ) %{
+  match(Set dst src); // int constant accepted by the immIMov operand
+  size(4); // encoding is declared as 4 bytes
+  format %{ "MOV    $dst, $src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant);
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+#ifndef AARCH64
+instruct loadConIMovn( iRegI dst, immIRotn src ) %{
+  match(Set dst src); // int constant accepted by the immIRotn operand
+  size(4); // encoding is declared as 4 bytes
+  format %{ "MVN    $dst, ~$src" %}
+  ins_encode %{
+    __ mvn($dst$$Register, ~$src$$constant); // mvn is given the bitwise complement of the constant
+  %}
+  ins_pipe(ialu_imm_n);
+%}
+#endif
+
+instruct loadConI16( iRegI dst, immI16 src ) %{
+  match(Set dst src); // int constant accepted by the immI16 operand
+  size(4); // encoding is declared as 4 bytes in both builds
+#ifdef AARCH64
+  format %{ "MOVZ_w  $dst, $src" %}
+#else
+  format %{ "MOVW    $dst, $src" %}
+#endif
+  ins_encode %{
+#ifdef AARCH64
+    __ mov_w($dst$$Register, $src$$constant); // AArch64 build
+#else
+    __ movw($dst$$Register, $src$$constant); // 32-bit ARM build
+#endif
+  %}
+  ins_pipe(ialu_imm_n);
+%}
+
+instruct loadConP(iRegP dst, immP src) %{
+  match(Set dst src); // pointer constant (immP operand)
+  ins_cost(DEFAULT_COST * 3/2);
+  format %{ "MOV_SLOW    $dst,$src\t!ptr" %}
+  ins_encode %{
+    relocInfo::relocType constant_reloc = _opnds[1]->constant_reloc(); // relocation kind of the constant operand
+    intptr_t val = $src$$constant;
+    if (constant_reloc == relocInfo::oop_type) {
+      __ mov_oop($dst$$Register, (jobject)val); // oop constant
+    } else if (constant_reloc == relocInfo::metadata_type) {
+      __ mov_metadata($dst$$Register, (Metadata*)val); // metadata constant
+    } else {
+      __ mov_slow($dst$$Register, val); // any other relocation kind: plain value
+    }
+  %}
+  ins_pipe(loadConP);
+%}
+
+
+instruct loadConP_poll(iRegP dst, immP_poll src) %{
+  match(Set dst src); // pointer constant accepted by the immP_poll operand
+  ins_cost(DEFAULT_COST); // cheaper than the generic loadConP rule
+  format %{ "MOV_SLOW    $dst,$src\t!ptr" %}
+  ins_encode %{
+      __ mov_slow($dst$$Register, $src$$constant); // emitted as a plain value, no relocation dispatch
+  %}
+  ins_pipe(loadConP_poll);
+%}
+
+#ifdef AARCH64
+instruct loadConP0(iRegP dst, immP0 src) %{
+  match(Set dst src); // pointer constant accepted by the immP0 operand
+  ins_cost(DEFAULT_COST);
+  format %{ "MOV    $dst,ZR\t!ptr" %}
+  ins_encode %{
+    __ mov($dst$$Register, ZR); // the constant itself is not used; ZR is copied into dst
+  %}
+  ins_pipe(ialu_none);
+%}
+
+instruct loadConN(iRegN dst, immN src) %{
+  match(Set dst src); // compressed pointer constant (immN operand)
+  ins_cost(DEFAULT_COST * 3/2);
+  format %{ "SET    $dst,$src\t! compressed ptr" %}
+  ins_encode %{
+    Register dst = $dst$$Register;
+    // FIXME: use $constanttablebase?
+    __ set_narrow_oop(dst, (jobject)$src$$constant); // constant is passed as a jobject
+  %}
+  ins_pipe(ialu_hi_lo_reg);
+%}
+
+instruct loadConN0(iRegN dst, immN0 src) %{
+  match(Set dst src); // compressed pointer constant accepted by the immN0 operand
+  ins_cost(DEFAULT_COST);
+  format %{ "MOV    $dst,ZR\t! compressed ptr" %}
+  ins_encode %{
+    __ mov($dst$$Register, ZR); // the constant itself is not used; ZR is copied into dst
+  %}
+  ins_pipe(ialu_none);
+%}
+
+instruct loadConNKlass(iRegN dst, immNKlass src) %{
+  match(Set dst src); // compressed klass constant (immNKlass operand)
+  ins_cost(DEFAULT_COST * 3/2);
+  format %{ "SET    $dst,$src\t! compressed klass ptr" %}
+  ins_encode %{
+    Register dst = $dst$$Register;
+    // FIXME: use $constanttablebase?
+    __ set_narrow_klass(dst, (Klass*)$src$$constant); // constant is passed as a Klass pointer
+  %}
+  ins_pipe(ialu_hi_lo_reg);
+%}
+
+instruct load_limmL(iRegL dst, limmL src) %{
+  match(Set dst src); // long constant accepted by the limmL operand
+  ins_cost(DEFAULT_COST);
+  format %{ "ORR    $dst, ZR, $src\t! long"  %}
+  ins_encode %{
+    __ orr($dst$$Register, ZR, (uintx)$src$$constant); // OR the immediate with ZR
+  %}
+  ins_pipe(loadConL);
+%}
+instruct load_immLMov(iRegL dst, immLMov src) %{
+  match(Set dst src); // long constant accepted by the immLMov operand
+  ins_cost(DEFAULT_COST);
+  format %{ "MOV    $dst, $src\t! long"  %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant); // constant is handed to mov directly
+  %}
+  ins_pipe(loadConL);
+%}
+instruct loadConL(iRegL dst, immL src) %{
+  match(Set dst src); // any long constant (immL operand), AArch64 variant
+  ins_cost(DEFAULT_COST * 4); // worst case
+  format %{ "mov_slow   $dst, $src\t! long"  %}
+  ins_encode %{
+    // FIXME: use $constanttablebase?
+    __ mov_slow($dst$$Register, $src$$constant); // whole constant goes into one register
+  %}
+  ins_pipe(loadConL);
+%}
+#else
+instruct loadConL(iRegL dst, immL src) %{
+  match(Set dst src); // any long constant (immL operand), non-AArch64 variant
+  ins_cost(DEFAULT_COST * 4);
+  format %{ "MOV_SLOW   $dst.lo, $src & 0x0FFFFFFFFL \t! long\n\t"
+            "MOV_SLOW   $dst.hi, $src >> 32" %}
+  ins_encode %{
+    __ mov_slow(reg_to_register_object($dst$$reg), $src$$constant & 0x0FFFFFFFFL); // low 32 bits into the first register
+    __ mov_slow(reg_to_register_object($dst$$reg + 1), ((julong)($src$$constant)) >> 32); // high 32 bits into the next register number
+  %}
+  ins_pipe(loadConL);
+%}
+
+instruct loadConL16( iRegL dst, immL16 src ) %{
+  match(Set dst src); // long constant accepted by the immL16 operand
+  ins_cost(DEFAULT_COST * 2); // half the cost of the generic loadConL rule
+
+  size(8); // two 4-byte instructions
+  format %{ "MOVW    $dst.lo, $src \n\t"
+            "MOVW    $dst.hi, 0" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant); // low word gets the constant
+    __ movw($dst$$Register->successor(), 0); // high word (successor register) is zeroed
+  %}
+  ins_pipe(ialu_imm);
+%}
+#endif
+
+instruct loadConF_imm8(regF dst, imm8F src) %{
+  match(Set dst src); // float constant accepted by the imm8F operand
+  ins_cost(DEFAULT_COST);
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "FCONSTS      $dst, $src"%}
+
+  ins_encode %{
+    __ fconsts($dst$$FloatRegister, Assembler::float_num($src$$constant).imm8()); // constant is passed in its imm8 form
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+#ifdef AARCH64
+instruct loadIConF(iRegI dst, immF src) %{
+  match(Set dst src); // float constant placed in an integer register
+  ins_cost(DEFAULT_COST * 2);
+
+  format %{ "MOV_SLOW  $dst, $src\t! loadIConF"  %}
+
+  ins_encode %{
+    // FIXME revisit once 6961697 is in
+    union {
+      jfloat f;
+      int i;
+    } v; // used to read the float constant back as an int
+    v.f = $src$$constant;
+    __ mov_slow($dst$$Register, v.i); // the int view of the float is moved into dst
+  %}
+  ins_pipe(ialu_imm);
+%}
+#endif
+
+instruct loadConF(regF dst, immF src, iRegI tmp) %{
+  match(Set dst src); // any float constant (immF operand)
+  ins_cost(DEFAULT_COST * 2);
+  effect(TEMP tmp); // integer temp that carries the bit pattern
+  size(3*4);
+
+  format %{ "MOV_SLOW  $tmp, $src\n\t"
+            "FMSR      $dst, $tmp"%}
+
+  ins_encode %{
+    // FIXME revisit once 6961697 is in
+    union {
+      jfloat f;
+      int i;
+    } v; // used to read the float constant back as an int
+    v.f = $src$$constant;
+    __ mov_slow($tmp$$Register, v.i); // int view of the float into the temp
+    __ fmsr($dst$$FloatRegister, $tmp$$Register); // temp into the float register
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+instruct loadConD_imm8(regD dst, imm8D src) %{
+  match(Set dst src); // double constant accepted by the imm8D operand
+  ins_cost(DEFAULT_COST);
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "FCONSTD      $dst, $src"%}
+
+  ins_encode %{
+    __ fconstd($dst$$FloatRegister, Assembler::double_num($src$$constant).imm8()); // constant is passed in its imm8 form
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+instruct loadConD(regD dst, immD src, iRegP tmp) %{
+  match(Set dst src); // any double constant (immD operand), loaded from the constant table
+  effect(TEMP tmp); // only written when the table offset cannot be used directly
+  ins_cost(MEMORY_REF_COST);
+  format %{ "FLDD  $dst, [$constanttablebase + $constantoffset]\t! load from constant table: double=$src" %}
+
+  ins_encode %{
+    Register r = $constanttablebase;
+    int offset  = $constantoffset($src);
+    if (!is_memoryD(offset)) {                // can't use a predicate
+                                              // in load constant instructs
+      __ add_slow($tmp$$Register, r, offset); // fold the offset into the temp
+      r = $tmp$$Register;
+      offset = 0; // the temp already points at the constant
+    }
+    __ ldr_double($dst$$FloatRegister, Address(r, offset));
+  %}
+  ins_pipe(loadConFD);
+%}
+
+// Prefetch instructions.
+// Must be safe to execute with invalid address (cannot fault).
+
+instruct prefetchAlloc_mp( memoryP mem ) %{
+  predicate(os::is_MP());
+  match( PrefetchAllocation mem ); // selected only when os::is_MP() is true
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+
+  format %{ "PLDW $mem\t! Prefetch allocation" %}
+  ins_encode %{
+#ifdef AARCH64
+    __ prfm(pstl1keep, $mem$$Address); // NOTE(review): the format text above still says PLDW in this build
+#else
+    __ pldw($mem$$Address);
+#endif
+  %}
+  ins_pipe(iload_mem);
+%}
+
+instruct prefetchAlloc_sp( memoryP mem ) %{
+  predicate(!os::is_MP());
+  match( PrefetchAllocation mem ); // selected only when os::is_MP() is false
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+
+  format %{ "PLD $mem\t! Prefetch allocation" %}
+  ins_encode %{
+#ifdef AARCH64
+    __ prfm(pstl1keep, $mem$$Address); // NOTE(review): same encoding as the _mp rule; the format text above still says PLD
+#else
+    __ pld($mem$$Address);
+#endif
+  %}
+  ins_pipe(iload_mem);
+%}
+
+//----------Store Instructions-------------------------------------------------
+// Store Byte
+instruct storeB(memoryB mem, store_RegI src) %{
+  match(Set mem (StoreB mem src)); // byte store of src to mem
+  ins_cost(MEMORY_REF_COST);
+
+  size(4); // encoding is declared as 4 bytes
+  format %{ "STRB    $src,$mem\t! byte" %}
+  ins_encode %{
+    __ strb($src$$Register, $mem$$Address);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+
+instruct storeCM(memoryB mem, store_RegI src) %{
+  match(Set mem (StoreCM mem src)); // card-mark store; same strb encoding as storeB
+  ins_cost(MEMORY_REF_COST);
+
+  size(4); // encoding is declared as 4 bytes
+  format %{ "STRB    $src,$mem\t! CMS card-mark byte" %}
+  ins_encode %{
+    __ strb($src$$Register, $mem$$Address);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+
+// Store Char/Short
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct storeCoff(store_RegI src, memoryScaledS mem, aimmX off, iRegP tmp) %{
+  match(Set mem (StoreC (AddP mem off) src)); // StoreC whose address is mem plus an immediate
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "STRH    $src,$mem+$off\t! short temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ strh($src$$Register, nmem);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif
+
+instruct storeC(memoryS mem, store_RegI src) %{
+  match(Set mem (StoreC mem src)); // char/short store of src to mem
+  ins_cost(MEMORY_REF_COST);
+
+  size(4); // encoding is declared as 4 bytes
+  format %{ "STRH    $src,$mem\t! short" %}
+  ins_encode %{
+    __ strh($src$$Register, $mem$$Address);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+
+// Store Integer
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct storeIoff(store_RegI src, memoryScaledI mem, aimmX off, iRegP tmp) %{
+  match(Set mem (StoreI (AddP mem off) src)); // StoreI whose address is mem plus an immediate
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "str_32 $src,$mem+$off\t! int temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ str_32($src$$Register, nmem);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif
+
+instruct storeI(memoryI mem, store_RegI src) %{
+  match(Set mem (StoreI mem src)); // int store of src to mem
+  ins_cost(MEMORY_REF_COST);
+
+  size(4); // encoding is declared as 4 bytes
+  format %{ "str_32 $src,$mem" %}
+  ins_encode %{
+    __ str_32($src$$Register, $mem$$Address);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+
+// Store Long
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct storeLoff(store_RegLd src, memoryScaledL mem, aimmX off, iRegP tmp) %{
+  match(Set mem (StoreL (AddP mem off) src)); // StoreL whose address is mem plus an immediate
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "str_64 $src,$mem+$off\t! long temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ str_64($src$$Register, nmem);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif
+
+instruct storeL(memoryL mem, store_RegLd src) %{
+#ifdef AARCH64
+  // AArch64: no predicate, the store is already atomic there
+#else
+  predicate(!((StoreLNode*)n)->require_atomic_access());
+#endif
+  match(Set mem (StoreL mem src)); // long store; outside AArch64 only when atomic access is not required
+  ins_cost(MEMORY_REF_COST);
+
+  size(4); // encoding is declared as 4 bytes
+  format %{ "str_64  $src,$mem\t! long" %}
+
+  ins_encode %{
+    __ str_64($src$$Register, $mem$$Address);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+
+#ifndef AARCH64
+instruct storeL_2instr(memorylong mem, iRegL src) %{
+  predicate(!((StoreLNode*)n)->require_atomic_access());
+  match(Set mem (StoreL mem src)); // long store split into two word stores; not for atomic access
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST);
+
+  size(8); // two 4-byte instructions
+  format %{ "STR    $src.lo,$mem\t! long\n\t"
+            "STR    $src.hi,$mem+4" %}
+
+  ins_encode %{
+    Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // first word, at disp
+    Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none); // second word, at disp + 4
+    __ str($src$$Register, Amemlo);
+    __ str($src$$Register->successor(), Amemhi); // second word comes from the successor register
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+
+instruct storeL_volatile(indirect mem, iRegL src) %{
+  predicate(((StoreLNode*)n)->require_atomic_access());
+  match(Set mem (StoreL mem src)); // long store for nodes that require atomic access
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STMIA    $src,$mem\t! long" %}
+  ins_encode %{
+    // FIXME: why is stmia considered atomic?  Should be strexd
+    RegisterSet set($src$$Register);
+    set = set | reg_to_register_object($src$$reg + 1); // add the next register number to the set
+    __ stmia(reg_to_register_object($mem$$base), set); // both registers stored through the base register
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif // !AARCH64
+
+#ifndef AARCH64
+instruct storeL_volatile_fp(memoryD mem, iRegL src) %{
+  predicate(((StoreLNode*)n)->require_atomic_access());
+  match(Set mem (StoreL mem src)); // long store for nodes that require atomic access, routed through S14
+  ins_cost(MEMORY_REF_COST);
+  size(8); // two 4-byte instructions
+  format %{ "FMDRR    S14, $src\t! long \n\t"
+            "FSTD     S14, $mem" %}
+  ins_encode %{
+    __ fmdrr(S14, $src$$Register, $src$$Register->successor()); // both halves of src into S14
+    __ fstd(S14, $mem$$Address); // S14 is hard-coded here and not declared in an effect clause
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif
+
+#ifdef XXX
+// Move SP Pointer
+//instruct movSP(sp_ptr_RegP dst, SPRegP src) %{
+//instruct movSP(iRegP dst, SPRegP src) %{
+instruct movSP(store_ptr_RegP dst, SPRegP src) %{
+  match(Set dst src); // copy of an SPRegP operand into a pointer register
+// Disabled predicate: !_kids[1]->_leaf->is_Proj() || _kids[1]->_leaf->as_Proj()->_con == TypeFunc::FramePtr
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+
+  format %{ "MOV    $dst,$src\t! SP ptr\n\t" %}
+  ins_encode %{
+    assert(false, "XXX1 got here"); // fails unconditionally if this encoding is ever reached
+    __ mov($dst$$Register, SP);
+    __ mov($dst$$Register, $src$$Register); // NOTE(review): second mov overwrites the first; size(4) covers one
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif
+
+#ifdef AARCH64
+// FIXME
+// Store SP Pointer
+instruct storeSP(memoryP mem, SPRegP src, iRegP tmp) %{
+  match(Set mem (StoreP mem src)); // StoreP restricted by the FramePtr predicate below
+  predicate(_kids[1]->_leaf->is_Proj() && _kids[1]->_leaf->as_Proj()->_con == TypeFunc::FramePtr);
+  // Multiple StoreP rules, different only in register mask.
+  // Matcher makes the last always valid.  The others will
+  // only be valid if they cost less than the last valid
+  // rule.  So cost(rule1) < cost(rule2) < cost(last)
+  // Unlike immediates, register constraints are not checked
+  // at match time.
+  ins_cost(MEMORY_REF_COST+DEFAULT_COST+4);
+  effect(TEMP tmp); // SP is copied here before being stored
+  size(8);
+
+  format %{ "MOV    $tmp,$src\t! SP ptr\n\t"
+            "STR    $tmp,$mem\t! SP ptr" %}
+  ins_encode %{
+    assert($src$$Register == SP, "SP expected");
+    __ mov($tmp$$Register, $src$$Register); // copy SP into the temp
+    __ str($tmp$$Register, $mem$$Address); // store the temp, not SP itself
+  %}
+  ins_pipe(istore_mem_spORreg); // FIXME
+%}
+#endif // AARCH64
+
+// Store Pointer
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct storePoff(store_ptr_RegP src, memoryScaledP mem, aimmX off, iRegP tmp) %{
+  predicate(!_kids[1]->_leaf->is_Proj() || _kids[1]->_leaf->as_Proj()->_con != TypeFunc::FramePtr);
+  match(Set mem (StoreP (AddP mem off) src)); // predicate is the negation of the one on storeSP
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "STR    $src,$mem+$off\t! ptr temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ str($src$$Register, nmem);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif
+
+instruct storeP(memoryP mem, store_ptr_RegP src) %{
+  match(Set mem (StoreP mem src)); // pointer store of src to mem
+#ifdef AARCH64
+  predicate(!_kids[1]->_leaf->is_Proj() || _kids[1]->_leaf->as_Proj()->_con != TypeFunc::FramePtr);
+#endif
+  ins_cost(MEMORY_REF_COST); // AArch64 only: the predicate above is the negation of the one on storeSP
+  size(4);
+
+  format %{ "STR    $src,$mem\t! ptr" %}
+  ins_encode %{
+    __ str($src$$Register, $mem$$Address);
+  %}
+  ins_pipe(istore_mem_spORreg);
+%}
+
+#ifdef AARCH64
+// Store NULL Pointer
+instruct storeP0(memoryP mem, immP0 src) %{
+  match(Set mem (StoreP mem src)); // StoreP of a constant accepted by the immP0 operand
+  ins_cost(MEMORY_REF_COST);
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "STR    ZR,$mem\t! ptr" %}
+  ins_encode %{
+    __ str(ZR, $mem$$Address); // ZR is stored directly; no register holds the constant
+  %}
+  ins_pipe(istore_mem_spORreg);
+%}
+#endif // AARCH64
+
+#ifdef _LP64
+// Store Compressed Pointer
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct storeNoff(store_RegN src, memoryScaledI mem, aimmX off, iRegP tmp) %{
+  match(Set mem (StoreN (AddP mem off) src)); // StoreN whose address is mem plus an immediate
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "str_32 $src,$mem+$off\t! compressed ptr temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ str_32($src$$Register, nmem);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif
+
+instruct storeN(memoryI mem, store_RegN src) %{
+  match(Set mem (StoreN mem src)); // compressed pointer store of src to mem
+  ins_cost(MEMORY_REF_COST);
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "str_32 $src,$mem\t! compressed ptr" %}
+  ins_encode %{
+    __ str_32($src$$Register, $mem$$Address);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+
+#ifdef AARCH64
+// Store NULL Compressed Pointer
+instruct storeN0(memoryI mem, immN0 src) %{
+  match(Set mem (StoreN mem src)); // StoreN of a constant accepted by the immN0 operand
+  ins_cost(MEMORY_REF_COST);
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "str_32 ZR,$mem\t! compressed ptr" %}
+  ins_encode %{
+    __ str_32(ZR, $mem$$Address); // ZR is stored directly; no register holds the constant
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif
+
+// Store Compressed Klass Pointer
+instruct storeNKlass(memoryI mem, store_RegN src) %{
+  match(Set mem (StoreNKlass mem src)); // compressed klass store; same str_32 encoding as storeN
+  ins_cost(MEMORY_REF_COST);
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "str_32 $src,$mem\t! compressed klass ptr" %}
+  ins_encode %{
+    __ str_32($src$$Register, $mem$$Address);
+  %}
+  ins_pipe(istore_mem_reg);
+%}
+#endif
+
+// Store Double
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct storeDoff(regD src, memoryScaledD mem, aimmX off, iRegP tmp) %{
+  match(Set mem (StoreD (AddP mem off) src)); // StoreD whose address is mem plus an immediate
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "STR    $src,$mem+$off\t! double temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ str_d($src$$FloatRegister, nmem);
+  %}
+  ins_pipe(fstoreD_mem_reg);
+%}
+#endif
+
+instruct storeD(memoryD mem, regD src) %{
+  match(Set mem (StoreD mem src)); // double store from a floating-point register
+  ins_cost(MEMORY_REF_COST);
+
+  size(4);
+  // FIXME: this store needs to be atomic, but the ARMv7 A.R.M. guarantees
+  // that only LDREXD and STREXD are 64-bit single-copy atomic
+  format %{ "FSTD   $src,$mem" %}
+  ins_encode %{
+    __ str_double($src$$FloatRegister, $mem$$Address);
+  %}
+  ins_pipe(fstoreD_mem_reg);
+%}
+
+#ifdef AARCH64
+instruct movI2F(regF dst, iRegI src) %{
+  match(Set dst src); // integer register to float register, no ideal node named in the rule
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "FMOV_sw $dst,$src\t! movI2F" %}
+  ins_encode %{
+    __ fmov_sw($dst$$FloatRegister, $src$$Register);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+instruct movF2I(iRegI dst, regF src) %{
+  match(Set dst src); // float register to integer register, no ideal node named in the rule
+  size(4); // encoding is declared as 4 bytes
+
+  format %{ "FMOV_ws $dst,$src\t! movF2I" %}
+  ins_encode %{
+    __ fmov_ws($dst$$Register, $src$$FloatRegister);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#endif
+
+// Store Float
+
+#ifdef AARCH64
+// XXX This variant shouldn't be necessary if 6217251 is implemented
+instruct storeFoff(regF src, memoryScaledF mem, aimmX off, iRegP tmp) %{
+  match(Set mem (StoreF (AddP mem off) src)); // StoreF whose address is mem plus an immediate
+  ins_cost(MEMORY_REF_COST + DEFAULT_COST); // assume shift/sign-extend is free
+  effect(TEMP tmp); // tmp receives the rebased address base
+  size(4 * 2);
+
+  format %{ "str_s  $src,$mem+$off\t! float temp=$tmp" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);
+    __ add($tmp$$Register, base, $off$$constant); // tmp = base + off
+    Address nmem = Address::make_raw($tmp$$reg, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); // same index/scale/disp, base replaced by tmp
+    __ str_s($src$$FloatRegister, nmem);
+  %}
+  ins_pipe(fstoreF_mem_reg);
+%}
+#endif
+
+instruct storeF( memoryF mem, regF src) %{
+  match(Set mem (StoreF mem src)); // float store from a floating-point register
+  ins_cost(MEMORY_REF_COST);
+
+  size(4); // encoding is declared as 4 bytes
+  format %{ "FSTS    $src,$mem" %}
+  ins_encode %{
+    __ str_float($src$$FloatRegister, $mem$$Address);
+  %}
+  ins_pipe(fstoreF_mem_reg);
+%}
+
+#ifdef AARCH64
+// Convert oop pointer into compressed form
+instruct encodeHeapOop(iRegN dst, iRegP src, flagsReg ccr) %{
+  predicate(n->bottom_type()->make_ptr()->ptr() != TypePtr::NotNull);
+  match(Set dst (EncodeP src)); // EncodeP when the node type is not NotNull
+  effect(KILL ccr); // the flags register is declared as clobbered
+  format %{ "encode_heap_oop $dst, $src" %}
+  ins_encode %{
+    __ encode_heap_oop($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct encodeHeapOop_not_null(iRegN dst, iRegP src) %{
+  predicate(n->bottom_type()->make_ptr()->ptr() == TypePtr::NotNull);
+  match(Set dst (EncodeP src)); // EncodeP when the node type is NotNull; no flags effect declared
+  format %{ "encode_heap_oop_not_null $dst, $src" %}
+  ins_encode %{
+    __ encode_heap_oop_not_null($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct decodeHeapOop(iRegP dst, iRegN src, flagsReg ccr) %{
+  predicate(n->bottom_type()->is_oopptr()->ptr() != TypePtr::NotNull &&
+            n->bottom_type()->is_oopptr()->ptr() != TypePtr::Constant);
+  match(Set dst (DecodeN src)); // DecodeN when the node type is neither NotNull nor Constant
+  effect(KILL ccr); // the flags register is declared as clobbered
+  format %{ "decode_heap_oop $dst, $src" %}
+  ins_encode %{
+    __ decode_heap_oop($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct decodeHeapOop_not_null(iRegP dst, iRegN src) %{
+  predicate(n->bottom_type()->is_oopptr()->ptr() == TypePtr::NotNull ||
+            n->bottom_type()->is_oopptr()->ptr() == TypePtr::Constant);
+  match(Set dst (DecodeN src)); // DecodeN when the node type is NotNull or Constant; no flags effect declared
+  format %{ "decode_heap_oop_not_null $dst, $src" %}
+  ins_encode %{
+    __ decode_heap_oop_not_null($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct encodeKlass_not_null(iRegN dst, iRegP src) %{
+  match(Set dst (EncodePKlass src)); // the only EncodePKlass rule in this section; no predicate
+  format %{ "encode_klass_not_null $dst, $src" %}
+  ins_encode %{
+    __ encode_klass_not_null($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct decodeKlass_not_null(iRegP dst, iRegN src) %{
+  match(Set dst (DecodeNKlass src)); // the only DecodeNKlass rule in this section; no predicate
+  format %{ "decode_klass_not_null $dst, $src" %}
+  ins_encode %{
+    __ decode_klass_not_null($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif // AARCH64
+
+//----------MemBar Instructions-----------------------------------------------
+// Memory barrier flavors
+
+// TODO: take advantage of AArch64 load-acquire, store-release, etc. and
+// pattern-match out unnecessary membars
+instruct membar_storestore() %{
+  match(MemBarStoreStore);
+  ins_cost(4*MEMORY_REF_COST); // priced at four memory references
+
+  size(4);
+  format %{ "MEMBAR-storestore" %}
+  ins_encode %{
+    __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore), noreg); // StoreStore mask only
+  %}
+  ins_pipe(long_memory_op);
+%}
+
+instruct membar_acquire() %{
+  match(MemBarAcquire);
+  match(LoadFence); // the same rule also covers LoadFence
+  ins_cost(4*MEMORY_REF_COST);
+
+  size(4);
+  format %{ "MEMBAR-acquire" %}
+  ins_encode %{
+    __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::LoadLoad | MacroAssembler::LoadStore), noreg); // LoadLoad and LoadStore masks
+  %}
+  ins_pipe(long_memory_op);
+%}
+
+instruct membar_acquire_lock() %{
+  match(MemBarAcquireLock);
+  ins_cost(0); // free: nothing is emitted
+
+  size(0);
+  format %{ "!MEMBAR-acquire (CAS in prior FastLock so empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
+instruct membar_release() %{
+  match(MemBarRelease);
+  match(StoreFence); // the same rule also covers StoreFence
+  ins_cost(4*MEMORY_REF_COST);
+
+  size(4);
+  format %{ "MEMBAR-release" %}
+  ins_encode %{
+    __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore | MacroAssembler::LoadStore), noreg); // StoreStore and LoadStore masks
+  %}
+  ins_pipe(long_memory_op);
+%}
+
+instruct membar_release_lock() %{
+  match(MemBarReleaseLock);
+  ins_cost(0); // free: nothing is emitted
+
+  size(0);
+  format %{ "!MEMBAR-release (CAS in succeeding FastUnlock so empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
+instruct membar_volatile() %{
+  match(MemBarVolatile);
+  ins_cost(4*MEMORY_REF_COST); // the zero-cost unnecessary_membar_volatile rule wins when its predicate holds
+
+  size(4);
+  format %{ "MEMBAR-volatile" %}
+  ins_encode %{
+    __ membar(MacroAssembler::StoreLoad, noreg); // StoreLoad mask, passed without the Membar_mask_bits cast
+  %}
+  ins_pipe(long_memory_op);
+%}
+
+instruct unnecessary_membar_volatile() %{
+  match(MemBarVolatile);
+  predicate(Matcher::post_store_load_barrier(n));
+  ins_cost(0); // free: nothing is emitted when the predicate above holds
+
+  size(0);
+  format %{ "!MEMBAR-volatile (unnecessary so empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
+//----------Register Move Instructions-----------------------------------------
+// instruct roundDouble_nop(regD dst) %{
+//   match(Set dst (RoundDouble dst));
+//   ins_pipe(empty);
+// %}
+
+
+// instruct roundFloat_nop(regF dst) %{
+//   match(Set dst (RoundFloat dst));
+//   ins_pipe(empty);
+// %}
+
+
+#ifdef AARCH64
+// 0 constant in register
+instruct zrImmI0(ZRRegI dst, immI0 imm) %{
+  match(Set dst imm); // immI0 constant whose destination is a ZRRegI operand
+  size(0); // no bytes emitted
+  ins_cost(0);
+
+  format %{ "! ZR (int 0)" %}
+  ins_encode( /*empty encoding*/ );
+  ins_pipe(ialu_none);
+%}
+
+// 0 constant in register
+instruct zrImmL0(ZRRegL dst, immL0 imm) %{
+  match(Set dst imm); // immL0 constant whose destination is a ZRRegL operand
+  size(0); // no bytes emitted
+  ins_cost(0);
+
+  format %{ "! ZR (long 0)" %}
+  ins_encode( /*empty encoding*/ );
+  ins_pipe(ialu_none);
+%}
+
+#ifdef XXX
+// 0 constant in register
+instruct zrImmN0(ZRRegN dst, immN0 imm) %{
+  match(Set dst imm); // immN0 constant whose destination is a ZRRegN operand
+  size(0); // no bytes emitted
+  ins_cost(0);
+
+  format %{ "! ZR (compressed pointer NULL)" %}
+  ins_encode( /*empty encoding*/ );
+  ins_pipe(ialu_none);
+%}
+
+// 0 constant in register
+instruct zrImmP0(ZRRegP dst, immP0 imm) %{
+  match(Set dst imm); // immP0 constant whose destination is a ZRRegP operand
+  size(0); // no bytes emitted
+  ins_cost(0);
+
+  format %{ "! ZR (NULL)" %}
+  ins_encode( /*empty encoding*/ );
+  ins_pipe(ialu_none);
+%}
+#endif
+#endif // AARCH64
+
+// Cast Index to Pointer for unsafe natives
+instruct castX2P(iRegX src, iRegP dst) %{
+  match(Set dst (CastX2P src)); // no size is declared: the encoding may be empty
+
+  format %{ "MOV    $dst,$src\t! IntX->Ptr if $dst != $src" %}
+  ins_encode %{
+    if ($dst$$Register !=  $src$$Register) { // emit nothing when both operands share a register
+      __ mov($dst$$Register, $src$$Register);
+    }
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+// Cast Pointer to Index for unsafe natives
+instruct castP2X(iRegP src, iRegX dst) %{
+  match(Set dst (CastP2X src)); // no size is declared: the encoding may be empty
+
+  format %{ "MOV    $dst,$src\t! Ptr->IntX if $dst != $src" %}
+  ins_encode %{
+    if ($dst$$Register !=  $src$$Register) { // emit nothing when both operands share a register
+      __ mov($dst$$Register, $src$$Register);
+    }
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+#ifndef AARCH64
+//----------Conditional Move---------------------------------------------------
+// Conditional move
+instruct cmovIP_reg(cmpOpP cmp, flagsRegP pcc, iRegI dst, iRegI src) %{
+  match(Set dst (CMoveI (Binary cmp pcc) (Binary dst src))); // dst is both an input and the result
+  ins_cost(150);
+  size(4); // encoding is declared as 4 bytes
+  format %{ "MOV$cmp  $dst,$src\t! int" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode)); // mov carries the condition code of cmp
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif
+
+#ifdef AARCH64
+instruct cmovI_reg3(cmpOp cmp, flagsReg icc, iRegI dst, iRegI src1, iRegI src2) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary src2 src1))); // int; the tree lists the values as (src2 src1)
+  ins_cost(150);
+  size(4); // encoding is declared as 4 bytes
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! int" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode)); // csel receives src1 before src2
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovL_reg3(cmpOp cmp, flagsReg icc, iRegL dst, iRegL src1, iRegL src2) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary src2 src1))); // long; the tree lists the values as (src2 src1)
+  ins_cost(150);
+  size(4); // encoding is declared as 4 bytes
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! long" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode)); // csel receives src1 before src2
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovP_reg3(cmpOp cmp, flagsReg icc, iRegP dst, iRegP src1, iRegP src2) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary src2 src1))); // pointer; the tree lists the values as (src2 src1)
+  ins_cost(150);
+  size(4); // encoding is declared as 4 bytes
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! ptr" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode)); // csel receives src1 before src2
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovN_reg3(cmpOp cmp, flagsReg icc, iRegN dst, iRegN src1, iRegN src2) %{
+  match(Set dst (CMoveN (Binary cmp icc) (Binary src2 src1))); // compressed pointer; the tree lists the values as (src2 src1)
+  ins_cost(150);
+  size(4); // encoding is declared as 4 bytes
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! compressed ptr" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode)); // csel receives src1 before src2
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovIP_reg3(cmpOpP cmp, flagsRegP icc, iRegI dst, iRegI src1, iRegI src2) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! int" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovLP_reg3(cmpOpP cmp, flagsRegP icc, iRegL dst, iRegL src1, iRegL src2) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! long" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPP_reg3(cmpOpP cmp, flagsRegP icc, iRegP dst, iRegP src1, iRegP src2) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! ptr" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovNP_reg3(cmpOpP cmp, flagsRegP icc, iRegN dst, iRegN src1, iRegN src2) %{
+  match(Set dst (CMoveN (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! compressed ptr" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovIU_reg3(cmpOpU cmp, flagsRegU icc, iRegI dst, iRegI src1, iRegI src2) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! int" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovLU_reg3(cmpOpU cmp, flagsRegU icc, iRegL dst, iRegL src1, iRegL src2) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! long" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPU_reg3(cmpOpU cmp, flagsRegU icc, iRegP dst, iRegP src1, iRegP src2) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! ptr" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovNU_reg3(cmpOpU cmp, flagsRegU icc, iRegN dst, iRegN src1, iRegN src2) %{
+  match(Set dst (CMoveN (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! compressed ptr" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovIZ_reg3(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegI dst, iRegI src1, iRegI src2) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary src2 src1)));  // int select driven by a cmpOp0 / flagsReg_EQNELTGE pair
+  ins_cost(150);  // NOTE(review): no eq/ne/lt/ge predicate here, unlike the two-operand _EQNELTGE rules in this file -- confirm intended
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! int" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));  // CSEL: src1 if the condition holds, else src2
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovLZ_reg3(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegL dst, iRegL src1, iRegL src2) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! long" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPZ_reg3(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegP dst, iRegP src1, iRegP src2) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! ptr" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovNZ_reg3(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegN dst, iRegN src1, iRegN src2) %{
+  match(Set dst (CMoveN (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "CSEL $dst,$src1,$src2,$cmp\t! compressed ptr" %}
+  ins_encode %{
+    __ csel($dst$$Register, $src1$$Register, $src2$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif // AARCH64
+
+#ifndef AARCH64
+instruct cmovIP_immMov(cmpOpP cmp, flagsRegP pcc, iRegI dst, immIMov src) %{
+  match(Set dst (CMoveI (Binary cmp pcc) (Binary dst src)));  // conditionally replace dst with an immIMov constant
+  ins_cost(140);  // lower than the 150 charged by the register-source rule cmovIP_reg
+  size(4);
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // immediate form of the predicated MOV
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovIP_imm16(cmpOpP cmp, flagsRegP pcc, iRegI dst, immI16 src) %{
+  match(Set dst (CMoveI (Binary cmp pcc) (Binary dst src)));  // conditionally replace dst with an immI16 constant
+  ins_cost(140);
+  size(4);
+  format %{ "MOVw$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // predicated MOVW rather than MOV
+  %}
+  ins_pipe(ialu_imm);
+%}
+#endif
+
+instruct cmovI_reg(cmpOp cmp, flagsReg icc, iRegI dst, iRegI src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));  // conditional int move; dst is both an input and the result
+  ins_cost(150);  // NOTE(review): not inside any AARCH64 guard, while its immediate-source siblings are wrapped in #ifndef AARCH64 -- confirm
+  size(4);
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // predicated MOV
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+#ifdef AARCH64
+instruct cmovL_reg(cmpOp cmp, flagsReg icc, iRegL dst, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary dst src)));  // conditional long move held in a single register
+  ins_cost(150);  // NOTE(review): AARCH64-only rule that emits a predicated MOV; the neighbouring AARCH64 rules use CSEL -- confirm the assembler supports this
+  size(4);
+  format %{ "MOV$cmp  $dst,$src\t! long" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // one instruction, unlike the two-MOV 32-bit long rules
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif
+
+#ifndef AARCH64
+instruct cmovI_immMov(cmpOp cmp, flagsReg icc, iRegI dst, immIMov src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));
+  ins_cost(140);
+  size(4);
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovII_imm16(cmpOp cmp, flagsReg icc, iRegI dst, immI16 src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));
+  ins_cost(140);
+  size(4);
+  format %{ "MOVw$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_imm);
+%}
+#endif
+
+instruct cmovII_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegI dst, iRegI src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));  // conditional int move; dst is both an input and the result
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);  // only when the Bool leaf under cmp tests eq, ne, lt or ge
+  ins_cost(150);
+  size(4);
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // predicated MOV
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+#ifndef AARCH64
+instruct cmovII_immMov_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegI dst, immIMov src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);
+  ins_cost(140);
+  size(4);
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovII_imm16_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegI dst, immI16 src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));  // conditionally replace dst with an immI16 constant
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);  // only when the Bool leaf under cmp tests eq, ne, lt or ge
+  ins_cost(140);
+  size(4);
+  format %{ "MOVW$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // predicated MOVW
+  %}
+  ins_pipe(ialu_imm);
+%}
+#endif
+
+instruct cmovIIu_reg(cmpOpU cmp, flagsRegU icc, iRegI dst, iRegI src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));
+  ins_cost(150);
+  size(4);
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+#ifndef AARCH64
+instruct cmovIIu_immMov(cmpOpU cmp, flagsRegU icc, iRegI dst, immIMov src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));
+  ins_cost(140);
+  size(4);
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovIIu_imm16(cmpOpU cmp, flagsRegU icc, iRegI dst, immI16 src) %{
+  match(Set dst (CMoveI (Binary cmp icc) (Binary dst src)));
+  ins_cost(140);
+  size(4);
+  format %{ "MOVW$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_imm);
+%}
+#endif
+
+// Conditional move
+instruct cmovPP_reg(cmpOpP cmp, flagsRegP pcc, iRegP dst, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp pcc) (Binary dst src)));
+  ins_cost(150);
+  size(4);
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPP_imm(cmpOpP cmp, flagsRegP pcc, iRegP dst, immP0 src) %{
+  match(Set dst (CMoveP (Binary cmp pcc) (Binary dst src)));  // conditionally replace pointer dst with the immP0 constant
+  ins_cost(140);
+  size(4);
+#ifdef AARCH64
+  format %{ "MOV$cmp  $dst,ZR" %}
+#else
+  format %{ "MOV$cmp  $dst,$src" %}
+#endif
+  ins_encode %{
+#ifdef AARCH64
+    __ mov($dst$$Register,             ZR, (AsmCondition)($cmp$$cmpcode));  // AArch64 build: the source is the ZR register
+#else
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // 32-bit build: the source is the constant itself
+#endif
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+// This instruction also works with CmpN, so no separate cmovPN_reg rule is needed.
+instruct cmovPI_reg(cmpOp cmp, flagsReg icc, iRegP dst, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary dst src)));  // conditional pointer move selected by flagsReg flags
+  ins_cost(150);
+
+  size(4);
+  format %{ "MOV$cmp  $dst,$src\t! ptr" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // predicated MOV: dst keeps its value when the condition fails
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPI_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegP dst, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary dst src)));
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);
+  ins_cost(150);
+
+  size(4);
+  format %{ "MOV$cmp  $dst,$src\t! ptr" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPIu_reg(cmpOpU cmp, flagsRegU icc, iRegP dst, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary dst src)));
+  ins_cost(150);
+
+  size(4);
+  format %{ "MOV$cmp  $dst,$src\t! ptr" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPI_imm(cmpOp cmp, flagsReg icc, iRegP dst, immP0 src) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary dst src)));
+  ins_cost(140);
+
+  size(4);
+#ifdef AARCH64
+  format %{ "MOV$cmp  $dst,ZR\t! ptr" %}
+#else
+  format %{ "MOV$cmp  $dst,$src\t! ptr" %}
+#endif
+  ins_encode %{
+#ifdef AARCH64
+    __ mov($dst$$Register,             ZR, (AsmCondition)($cmp$$cmpcode));
+#else
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));
+#endif
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovPI_imm_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegP dst, immP0 src) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary dst src)));
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);
+  ins_cost(140);
+
+  size(4);
+#ifdef AARCH64
+  format %{ "MOV$cmp  $dst,ZR\t! ptr" %}
+#else
+  format %{ "MOV$cmp  $dst,$src\t! ptr" %}
+#endif
+  ins_encode %{
+#ifdef AARCH64
+    __ mov($dst$$Register,             ZR, (AsmCondition)($cmp$$cmpcode));
+#else
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));
+#endif
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovPIu_imm(cmpOpU cmp, flagsRegU icc, iRegP dst, immP0 src) %{
+  match(Set dst (CMoveP (Binary cmp icc) (Binary dst src)));
+  ins_cost(140);
+
+  size(4);
+#ifdef AARCH64
+  format %{ "MOV$cmp  $dst,ZR\t! ptr" %}
+#else
+  format %{ "MOV$cmp  $dst,$src\t! ptr" %}
+#endif
+  ins_encode %{
+#ifdef AARCH64
+    __ mov($dst$$Register,             ZR, (AsmCondition)($cmp$$cmpcode));
+#else
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));
+#endif
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+#ifdef AARCH64
+// Conditional select of a single-precision float register (AARCH64 build)
+instruct cmovF_reg(cmpOp cmp, flagsReg icc, regF dst, regF src1, regF src2) %{
+  match(Set dst (CMoveF (Binary cmp icc) (Binary src2 src1)));  // three-operand form: dst is not an input
+  ins_cost(150);
+  size(4);
+  format %{ "FCSEL_s $dst,$src1,$src2,$cmp" %}
+  ins_encode %{
+    __ fcsel_s($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // FCSEL: src1 if the condition holds, else src2
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovD_reg(cmpOp cmp, flagsReg icc, regD dst, regD src1, regD src2) %{
+  match(Set dst (CMoveD (Binary cmp icc) (Binary src2 src1)));  // double-precision counterpart of cmovF_reg
+  ins_cost(150);
+  size(4);
+  format %{ "FCSEL_d $dst,$src1,$src2,$cmp" %}
+  ins_encode %{
+    __ fcsel_d($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // FCSEL: src1 if the condition holds, else src2
+  %}
+  ins_pipe(int_conditional_float_move);  // NOTE(review): same pipe class as the float rule; the 32-bit double rules use int_conditional_double_move -- confirm
+%}
+
+instruct cmovFP_reg(cmpOpP cmp, flagsRegP icc, regF dst, regF src1, regF src2) %{
+  match(Set dst (CMoveF (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "FCSEL_s $dst,$src1,$src2,$cmp" %}
+  ins_encode %{
+    __ fcsel_s($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovDP_reg(cmpOpP cmp, flagsRegP icc, regD dst, regD src1, regD src2) %{
+  match(Set dst (CMoveD (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "FCSEL_d $dst,$src1,$src2,$cmp" %}
+  ins_encode %{
+    __ fcsel_d($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovFU_reg(cmpOpU cmp, flagsRegU icc, regF dst, regF src1, regF src2) %{
+  match(Set dst (CMoveF (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "FCSEL_s $dst,$src1,$src2,$cmp" %}
+  ins_encode %{
+    __ fcsel_s($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovDU_reg(cmpOpU cmp, flagsRegU icc, regD dst, regD src1, regD src2) %{
+  match(Set dst (CMoveD (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "FCSEL_d $dst,$src1,$src2,$cmp" %}
+  ins_encode %{
+    __ fcsel_d($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovFZ_reg(cmpOp0 cmp, flagsReg_EQNELTGE icc, regF dst, regF src1, regF src2) %{
+  match(Set dst (CMoveF (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "FCSEL_s $dst,$src1,$src2,$cmp" %}
+  ins_encode %{
+    __ fcsel_s($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovDZ_reg(cmpOp0 cmp, flagsReg_EQNELTGE icc, regD dst, regD src1, regD src2) %{
+  match(Set dst (CMoveD (Binary cmp icc) (Binary src2 src1)));
+  ins_cost(150);
+  size(4);
+  format %{ "FCSEL_d $dst,$src1,$src2,$cmp" %}
+  ins_encode %{
+    __ fcsel_d($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+#else // !AARCH64
+
+// Conditional move of a single-precision float register (non-AARCH64 build)
+instruct cmovFP_reg(cmpOpP cmp, flagsRegP pcc, regF dst, regF src) %{
+  match(Set dst (CMoveF (Binary cmp pcc) (Binary dst src)));  // dst is both an input and the result
+  ins_cost(150);
+  size(4);
+  format %{ "FCPYS$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpys($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // predicated copy: dst keeps its value when the condition fails
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovFI_reg(cmpOp cmp, flagsReg icc, regF dst, regF src) %{
+  match(Set dst (CMoveF (Binary cmp icc) (Binary dst src)));
+  ins_cost(150);
+
+  size(4);
+  format %{ "FCPYS$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpys($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovFI_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, regF dst, regF src) %{
+  match(Set dst (CMoveF (Binary cmp icc) (Binary dst src)));
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);
+  ins_cost(150);
+
+  size(4);
+  format %{ "FCPYS$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpys($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovFIu_reg(cmpOpU cmp, flagsRegU icc, regF dst, regF src) %{
+  match(Set dst (CMoveF (Binary cmp icc) (Binary dst src)));
+  ins_cost(150);
+
+  size(4);
+  format %{ "FCPYS$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpys($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+// Conditional move
+instruct cmovDP_reg(cmpOpP cmp, flagsRegP pcc, regD dst, regD src) %{
+  match(Set dst (CMoveD (Binary cmp pcc) (Binary dst src)));
+  ins_cost(150);
+  size(4);
+  format %{ "FCPYD$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpyd($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_double_move);
+%}
+
+instruct cmovDI_reg(cmpOp cmp, flagsReg icc, regD dst, regD src) %{
+  match(Set dst (CMoveD (Binary cmp icc) (Binary dst src)));
+  ins_cost(150);
+
+  size(4);
+  format %{ "FCPYD$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpyd($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_double_move);
+%}
+
+instruct cmovDI_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, regD dst, regD src) %{
+  match(Set dst (CMoveD (Binary cmp icc) (Binary dst src)));  // conditional double move; dst is both an input and the result
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);  // same eq/ne/lt/ge restriction as the sibling _EQNELTGE rules, written on one line
+  ins_cost(150);
+
+  size(4);
+  format %{ "FCPYD$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpyd($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // predicated double-precision copy
+  %}
+  ins_pipe(int_conditional_double_move);
+%}
+
+instruct cmovDIu_reg(cmpOpU cmp, flagsRegU icc, regD dst, regD src) %{
+  match(Set dst (CMoveD (Binary cmp icc) (Binary dst src)));
+  ins_cost(150);
+
+  size(4);
+  format %{ "FCPYD$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpyd($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(int_conditional_double_move);
+%}
+
+// Conditional move of a long held in a register pair: one predicated MOV per half
+instruct cmovLP_reg(cmpOpP cmp, flagsRegP pcc, iRegL dst, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp pcc) (Binary dst src)));
+  ins_cost(150);
+  size(8);
+  format %{ "MOV$cmp  $dst.lo,$src.lo\t! long\n\t"
+            "MOV$cmp  $dst.hi,$src.hi" %}
+  ins_encode %{
+    const AsmCondition when = (AsmCondition)($cmp$$cmpcode);
+    __ mov($dst$$Register, $src$$Register, when);                            // .lo half
+    __ mov($dst$$Register->successor(), $src$$Register->successor(), when);  // .hi half lives in the successor register
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+// TODO: try immLRot2 instead, so that (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant)) -- the original note trails off at this point
+instruct cmovLP_immRot(cmpOpP cmp, flagsRegP pcc, iRegL dst, immLlowRot src) %{
+  match(Set dst (CMoveL (Binary cmp pcc) (Binary dst src)));  // conditionally replace long dst with an immLlowRot constant
+  ins_cost(140);
+
+  size(8);
+  format %{ "MOV$cmp  $dst.lo,$src\t! long\n\t"
+            "MOV$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // .lo half takes the constant
+    __ mov($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));  // .hi half is always set to zero
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovLP_imm16(cmpOpP cmp, flagsRegP pcc, iRegL dst, immL16 src) %{
+  match(Set dst (CMoveL (Binary cmp pcc) (Binary dst src)));  // conditionally replace long dst with an immL16 constant
+  ins_cost(140);
+
+  size(8);
+  format %{ "MOVW$cmp  $dst.lo,$src\t! long\n\t"
+            "MOV$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // .lo half via MOVW, which the format text reflects
+    __ mov($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));   // .hi half is set to zero with a plain MOV
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovLI_reg(cmpOp cmp, flagsReg icc, iRegL dst, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary dst src)));
+  ins_cost(150);
+
+  size(8);
+  format %{ "MOV$cmp  $dst.lo,$src.lo\t! long\n\t"
+            "MOV$cmp  $dst.hi,$src.hi" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));
+    __ mov($dst$$Register->successor(), $src$$Register->successor(), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovLI_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegL dst, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary dst src)));
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);
+  ins_cost(150);
+
+  size(8);
+  format %{ "MOV$cmp  $dst.lo,$src.lo\t! long\n\t"
+            "MOV$cmp  $dst.hi,$src.hi" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));
+    __ mov($dst$$Register->successor(), $src$$Register->successor(), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+// TODO: try immLRot2 instead, so that (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant)) -- the original note trails off at this point
+instruct cmovLI_immRot(cmpOp cmp, flagsReg icc, iRegL dst, immLlowRot src) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary dst src)));  // conditionally replace long dst with an immLlowRot constant
+  ins_cost(140);
+
+  size(8);
+  format %{ "MOV$cmp  $dst.lo,$src\t! long\n\t"
+            "MOV$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // .lo half takes the constant
+    __ mov($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));  // .hi half is always set to zero
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+// TODO: try immLRot2 instead, so that (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant)) -- the original note trails off at this point
+instruct cmovLI_immRot_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegL dst, immLlowRot src) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary dst src)));  // conditionally replace long dst with an immLlowRot constant
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);  // only when the Bool leaf under cmp tests eq, ne, lt or ge
+  ins_cost(140);
+
+  size(8);
+  format %{ "MOV$cmp  $dst.lo,$src\t! long\n\t"
+            "MOV$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // .lo half takes the constant
+    __ mov($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));  // .hi half is always set to zero
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovLI_imm16(cmpOp cmp, flagsReg icc, iRegL dst, immL16 src) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary dst src)));  // conditionally replace long dst with an immL16 constant
+  ins_cost(140);
+
+  size(8);
+  format %{ "MOVW$cmp  $dst.lo,$src\t! long\n\t"
+            "MOVW$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // .lo half via MOVW, which the format text reflects
+    __ movw($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));  // .hi half is zeroed with MOVW too (cmovLP_imm16 uses MOV here)
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovLI_imm16_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegL dst, immL16 src) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary dst src)));  // conditionally replace long dst with an immL16 constant
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt ||
+            _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);  // only when the Bool leaf under cmp tests eq, ne, lt or ge
+  ins_cost(140);
+
+  size(8);
+  format %{ "MOVW$cmp  $dst.lo,$src\t! long\n\t"
+            "MOVW$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // .lo half via MOVW, which the format text reflects
+    __ movw($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));  // .hi half is zeroed with MOVW as well
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovLIu_reg(cmpOpU cmp, flagsRegU icc, iRegL dst, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp icc) (Binary dst src)));
+  ins_cost(150);
+
+  size(8);
+  format %{ "MOV$cmp  $dst.lo,$src.lo\t! long\n\t"
+            "MOV$cmp  $dst.hi,$src.hi" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));
+    __ mov($dst$$Register->successor(), $src$$Register->successor(), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif // !AARCH64
+
+
+//----------OS and Locking Instructions----------------------------------------
+
+// This name is KNOWN by the ADLC and cannot be changed.
+// The ADLC forces a 'TypeRawPtr::BOTTOM' output type
+// for this instruction.
+instruct tlsLoadP(RthreadRegP dst) %{
+  match(Set dst (ThreadLocal));  // dst is constrained to the RthreadRegP operand class
+
+  size(0);  // zero-sized: the encoding below is empty
+  ins_cost(0);
+  format %{ "! TLS is in $dst" %}
+  ins_encode( /*empty encoding*/ );
+  ins_pipe(ialu_none);
+%}
+
+instruct checkCastPP( iRegP dst ) %{
+  match(Set dst (CheckCastPP dst));  // input and output are the same register
+
+  size(0);  // zero-sized: the encoding below is empty
+  format %{ "! checkcastPP of $dst" %}
+  ins_encode( /*empty encoding*/ );
+  ins_pipe(empty);
+%}
+
+
+instruct castPP( iRegP dst ) %{
+  match(Set dst (CastPP dst));  // input and output are the same register; nothing is encoded
+  format %{ "! castPP of $dst" %}
+  ins_encode( /*empty encoding*/ );  // NOTE(review): no size(0) or ins_cost(0) here, unlike checkCastPP and castII -- confirm intended
+  ins_pipe(empty);
+%}
+
+instruct castII( iRegI dst ) %{
+  match(Set dst (CastII dst));  // input and output are the same register; nothing is encoded
+  format %{ "! castII of $dst" %}
+  ins_encode( /*empty encoding*/ );
+  ins_cost(0);  // matched at zero cost
+  ins_pipe(empty);
+%}
+
+//----------Arithmetic Instructions--------------------------------------------
+// Addition Instructions
+// Register Addition: int dst = src1 + src2
+instruct addI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{
+  match(Set dst (AddI src1 src2));
+
+  size(4);
+  format %{ "add_32 $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ add_32($dst$$Register, $src1$$Register, $src2$$Register);  // add_32 is a macro-assembler helper defined elsewhere
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct addshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (AddI (LShiftI src1 src2) src3));  // folds a register-amount left shift into the add: dst = src3 + (src1 << src2)
+
+  size(4);
+  format %{ "add_32 $dst,$src3,$src1<<$src2\t! int" %}
+  ins_encode %{
+    __ add_32($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsl, $src2$$Register));  // the shift amount comes from a register
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+#ifdef AARCH64
+#ifdef TODO
+instruct addshlL_reg_imm_reg(iRegL dst, iRegL src1, immU6 src2, iRegL src3) %{
+  match(Set dst (AddL (LShiftL src1 src2) src3));  // long dst = src3 + (src1 << src2); compiled only when AARCH64 and TODO are both defined
+
+  size(4);
+  format %{ "ADD    $dst,$src3,$src1<<$src2\t! long" %}
+  ins_encode %{
+    __ add($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsl, $src2$$constant));  // the shift amount is the immU6 constant
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+#endif
+
+instruct addshlI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{
+  match(Set dst (AddI (LShiftI src1 src2) src3));  // folds a constant left shift into the add: dst = src3 + (src1 << src2)
+
+  size(4);
+  format %{ "add_32 $dst,$src3,$src1<<$src2\t! int" %}
+  ins_encode %{
+    __ add_32($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsl, $src2$$constant));  // the shift amount is the immU5 constant
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct addsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (AddI (RShiftI src1 src2) src3));
+
+  size(4);
+  format %{ "add_32 $dst,$src3,$src1>>$src2\t! int" %}
+  ins_encode %{
+    __ add_32($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, asr, $src2$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct addsarI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{
+  match(Set dst (AddI (RShiftI src1 src2) src3));  // folds a constant arithmetic right shift into the add: dst = src3 + (src1 >> src2)
+
+  size(4);
+  format %{ "add_32 $dst,$src3,$src1>>$src2\t! int" %}
+  ins_encode %{
+    __ add_32($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, asr, $src2$$constant));  // asr selects the arithmetic shift
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct addshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (AddI (URShiftI src1 src2) src3));
+
+  size(4);
+  format %{ "add_32 $dst,$src3,$src1>>>$src2\t! int" %}
+  ins_encode %{
+    __ add_32($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsr, $src2$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct addshrI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{
+  match(Set dst (AddI (URShiftI src1 src2) src3));  // folds a constant logical right shift into the add: dst = src3 + (src1 >>> src2)
+
+  size(4);
+  format %{ "add_32 $dst,$src3,$src1>>>$src2\t! int" %}
+  ins_encode %{
+    __ add_32($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsr, $src2$$constant));  // lsr selects the logical shift
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Immediate Addition: int dst = src1 + constant, for constants accepted by the aimmI operand
+instruct addI_reg_aimmI(iRegI dst, iRegI src1, aimmI src2) %{
+  match(Set dst (AddI src1 src2));
+
+  size(4);
+  format %{ "add_32 $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ add_32($dst$$Register, $src1$$Register, $src2$$constant);  // the constant is passed straight to add_32
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+// Pointer Register Addition: pointer dst = src1 + src2, with src2 an iRegX offset
+instruct addP_reg_reg(iRegP dst, iRegP src1, iRegX src2) %{
+  match(Set dst (AddP src1 src2));
+
+  size(4);
+  format %{ "ADD    $dst,$src1,$src2\t! ptr" %}
+  ins_encode %{
+    __ add($dst$$Register, $src1$$Register, $src2$$Register);  // plain add, not the add_32 used by the int rules
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifdef AARCH64
+// unshifted I2L operand
+operand unshiftedI2L(iRegI src2) %{
+//constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(ConvI2L src2);
+  // An int register converted to long. Users read the register back through the base field; scale is 0.
+  op_cost(1);
+  format %{ "$src2.w" %}
+  interface(MEMORY_INTER) %{
+    base($src2);
+    index(0xff);
+    scale(0x0);
+    disp(0x0);
+  %}
+%}
+
+// shifted I2L operand
+operand shiftedI2L(iRegI src2, immI_0_4 src3) %{
+//constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(LShiftX (ConvI2L src2) src3);
+  // An int register converted to long, then shifted left by a constant; the shift is carried in the scale field.
+  op_cost(1);
+  format %{ "$src2.w << $src3" %}
+  interface(MEMORY_INTER) %{
+    base($src2);
+    index(0xff);
+    scale($src3);
+    disp(0x0);
+  %}
+%}
+
+opclass shiftedRegI(shiftedI2L, unshiftedI2L);  // an int-to-long converted register, with or without a constant left shift
+
+instruct shlL_reg_regI(iRegL dst, iRegI src1, immU6 src2) %{
+  match(Set dst (LShiftL (ConvI2L src1) src2));
+  // dst = ((long) src1) << src2, emitted as one sign-extending bitfield move (SBFM).
+  size(4);
+  format %{ "LSL    $dst,$src1.w,$src2\t! long" %}
+  ins_encode %{
+    int c = $src2$$constant;
+    int r = 64 - c;             // rotate amount; the value 64 (c == 0) is reduced modulo 64 below
+    int s = 31;                 // index of the top source bit: the whole 32-bit int by default
+    if (s >= r) {
+      s = r - 1;                // fewer than 32 source bits survive the shift
+    }
+    __ sbfm($dst$$Register, $src1$$Register, r & 63, s);  // keep the rotate field within 0..63
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct addP_reg_regI(iRegP dst, iRegP src1, shiftedRegI src2) %{
+  match(Set dst (AddP src1 src2));
+  // dst = src1 + the (optionally shifted) int wrapped by src2, applied as an sxtw-extended register operand.
+  ins_cost(DEFAULT_COST * 3/2);
+  size(4);
+  format %{ "ADD    $dst,$src1,$src2, sxtw\t! ptr" %}
+  ins_encode %{
+    Register base = reg_to_register_object($src2$$base);  // the int register stored in the operand's base field
+    __ add($dst$$Register, $src1$$Register, base, ex_sxtw, $src2$$scale);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+// shifted iRegX operand
+operand shiftedX(iRegX src2, shimmX src3) %{
+//constraint(ALLOC_IN_RC(sp_ptr_reg));
+  match(LShiftX src2 src3);
+  // A register shifted left by a constant: the register goes in the base field, the shift amount in scale.
+  op_cost(1);
+  format %{ "$src2 << $src3" %}
+  interface(MEMORY_INTER) %{
+    base($src2);
+    index(0xff);
+    scale($src3);
+    disp(0x0);
+  %}
+%}
+
+instruct addshlP_reg_reg_imm(iRegP dst, iRegP src1, shiftedX src2) %{
+  match(Set dst (AddP src1 src2));
+  // dst = src1 + (reg << shift), with reg and shift both taken from the shiftedX operand.
+  ins_cost(DEFAULT_COST * 3/2);
+  size(4);
+  format %{ "ADD    $dst,$src1,$src2\t! ptr" %}
+  ins_encode %{
+    Register base = reg_to_register_object($src2$$base);  // the register stored in the operand's base field
+    __ add($dst$$Register, $src1$$Register, AsmOperand(base, lsl, $src2$$scale));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Pointer Immediate Addition
+instruct addP_reg_aimmX(iRegP dst, iRegP src1, aimmX src2) %{
+  match(Set dst (AddP src1 src2));
+  // dst = src1 + src2, where src2 is a constant accepted by the aimmX operand.
+  size(4);
+  format %{ "ADD    $dst,$src1,$src2\t! ptr" %}
+  ins_encode %{
+    __ add($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+// Long Addition
+#ifdef AARCH64
+instruct addL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{
+  match(Set dst (AddL src1 src2));  // long add of two registers: dst = src1 + src2
+  size(4);
+  format %{ "ADD     $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ add($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct addL_reg_regI(iRegL dst, iRegL src1, shiftedRegI src2) %{
+  match(Set dst (AddL src1 src2));
+  // dst = src1 + the (optionally shifted) int wrapped by src2, applied as an sxtw-extended register operand.
+  ins_cost(DEFAULT_COST * 3/2);
+  size(4);
+  format %{ "ADD    $dst,$src1,$src2, sxtw\t! long" %}
+  ins_encode %{
+    Register base = reg_to_register_object($src2$$base);  // the int register stored in the operand's base field
+    __ add($dst$$Register, $src1$$Register, base, ex_sxtw, $src2$$scale);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#else
+instruct addL_reg_reg(iRegL dst, iRegL src1, iRegL src2, flagsReg ccr) %{
+  match(Set dst (AddL src1 src2));  // long add carried out on register pairs in two steps
+  effect(KILL ccr);                 // the flag-setting ADDS overwrites the condition codes
+  size(8);
+  format %{ "ADDS    $dst.lo,$src1.lo,$src2.lo\t! long\n\t"
+            "ADC     $dst.hi,$src1.hi,$src2.hi" %}
+  ins_encode %{
+    __ adds($dst$$Register, $src1$$Register, $src2$$Register);  // low words, recording the carry
+    __ adc($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor());  // high words (successor registers) plus carry
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+#ifdef AARCH64
+// Immediate Addition
+instruct addL_reg_aimm(iRegL dst, iRegL src1, aimmL src2) %{
+  match(Set dst (AddL src1 src2));
+  // dst = src1 + src2, where src2 is a long constant accepted by the aimmL operand.
+  size(4);
+  format %{ "ADD    $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ add($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+instruct addL_reg_immLneg(iRegL dst, iRegL src1, aimmLneg src2) %{
+  match(Set dst (SubL src1 src2));
+  // Subtraction of a constant matched by aimmLneg, emitted as an ADD of the negated constant.
+  size(4);
+  format %{ "ADD    $dst,$src1,-($src2)\t! long" %}
+  ins_encode %{
+    __ add($dst$$Register, $src1$$Register, -$src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+// TODO
+#endif
+
+#ifndef AARCH64
+// TODO: try immLRot2 instead, (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant)) becomes
+instruct addL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con, flagsReg ccr) %{
+  match(Set dst (AddL src1 con));  // long add of a constant that is applied to the low word only
+  effect(KILL ccr);                // the flag-setting ADDS overwrites the condition codes
+  size(8);
+  format %{ "ADDS    $dst.lo,$src1.lo,$con\t! long\n\t"
+            "ADC     $dst.hi,$src1.hi,0" %}
+  ins_encode %{
+    __ adds($dst$$Register, $src1$$Register, $con$$constant);  // low word plus constant, recording the carry
+    __ adc($dst$$Register->successor(), $src1$$Register->successor(), 0);  // high word only absorbs the carry
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif
+
+//----------Conditional_store--------------------------------------------------
+// Conditional-store of the updated heap-top.
+// Used during allocation of the shared heap.
+// Sets flags (EQ) on success.
+
+// TODO: optimize out barriers with AArch64 load-acquire/store-release
+// LoadP-locked.
+instruct loadPLocked(iRegP dst, memoryex mem) %{
+  match(Set dst (LoadPLocked mem));  // exclusive load of a pointer (ldxr on AArch64, ldrex otherwise)
+  size(4);
+  format %{ "LDREX  $dst,$mem" %}
+  ins_encode %{
+#ifdef AARCH64
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used here
+    __ ldxr($dst$$Register, base);
+#else
+    __ ldrex($dst$$Register,$mem$$Address);
+#endif
+  %}
+  ins_pipe(iload_mem);
+%}
+
+instruct storePConditional( memoryex heap_top_ptr, iRegP oldval, iRegP newval, iRegI tmp, flagsRegP pcc ) %{
+  predicate(_kids[1]->_kids[0]->_leaf->Opcode() == Op_LoadPLocked); // only works in conjunction with a LoadPLocked node
+  match(Set pcc (StorePConditional heap_top_ptr (Binary oldval newval)));
+  effect( TEMP tmp );  // tmp receives the store-exclusive status; oldval is not referenced by the emitted code
+  size(8);
+  format %{ "STREX  $tmp,$newval,$heap_top_ptr\n\t"
+            "CMP    $tmp, 0" %}
+  ins_encode %{
+#ifdef AARCH64
+    Register base = reg_to_register_object($heap_top_ptr$$base);  // only the base register of the address is used
+    __ stxr($tmp$$Register, $newval$$Register, base);
+#else
+    __ strex($tmp$$Register, $newval$$Register, $heap_top_ptr$$Address);
+#endif
+    __ cmp($tmp$$Register, 0);  // flags compare as EQ when the status in tmp is zero; no retry loop here
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+// Conditional-store of an intx value.
+instruct storeXConditional( memoryex mem, iRegX oldval, iRegX newval, iRegX tmp, flagsReg icc ) %{
+#ifdef AARCH64
+  match(Set icc (StoreLConditional mem (Binary oldval newval)));  // result is delivered in the flags (icc)
+  effect( TEMP tmp );
+  size(28);
+  format %{ "loop:\n\t"
+            "LDXR     $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem], DOESN'T set $newval=[$mem] in any case\n\t"
+            "SUBS     $tmp, $tmp, $oldval\n\t"
+            "B.ne     done\n\t"
+            "STXR     $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop\n\t"
+            "CMP      $tmp, 0\n\t"
+            "done:\n\t"
+            "membar   LoadStore|LoadLoad" %}
+#else
+  match(Set icc (StoreIConditional mem (Binary oldval newval)));  // result is delivered in the flags (icc)
+  effect( TEMP tmp );
+  size(28);
+  format %{ "loop: \n\t"
+            "LDREX    $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem], DOESN'T set $newval=[$mem] in any case\n\t"
+            "EORS     $tmp,$tmp, $oldval\n\t"
+            "STREX.eq $tmp, $newval, $mem\n\t"
+            "CMP.eq   $tmp, 1 \n\t"
+            "B.eq     loop \n\t"
+            "TEQ      $tmp, 0\n\t"
+            "membar   LoadStore|LoadLoad" %}
+#endif
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+#ifdef AARCH64
+// FIXME: use load-acquire/store-release, remove membar?
+    Label done;
+    Register base = reg_to_register_object($mem$$base);
+    __ ldxr($tmp$$Register, base);
+    __ subs($tmp$$Register, $tmp$$Register, $oldval$$Register);  // zero (EQ) only when memory held oldval
+    __ b(done, ne);                                              // mismatch: leave with NE, nothing stored
+    __ stxr($tmp$$Register, $newval$$Register, base);
+    __ cbnz_w($tmp$$Register, loop);                             // non-zero status: retry the whole sequence
+    __ cmp($tmp$$Register, 0);                                   // tmp is zero here, so the flags read EQ
+    __ bind(done);
+#else
+    __ ldrex($tmp$$Register, $mem$$Address);
+    __ eors($tmp$$Register, $tmp$$Register, $oldval$$Register);      // zero (EQ) only when memory held oldval
+    __ strex($tmp$$Register, $newval$$Register, $mem$$Address, eq);  // store attempted only on a match
+    __ cmp($tmp$$Register, 1, eq);                                   // status 1 after an attempted store ...
+    __ b(loop, eq);                                                  // ... means retry
+    __ teq($tmp$$Register, 0);                                       // final flags: EQ iff tmp ended up zero
+#endif
+    // used by biased locking only. Requires a membar.
+    __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::LoadStore | MacroAssembler::LoadLoad), noreg);
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+// No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
+
+#ifdef AARCH64
+// TODO: if combined with membar, elide membar and use
+// load-acquire/store-release if appropriate
+instruct compareAndSwapL_bool(memoryex mem, iRegL oldval, iRegL newval, iRegI res, iRegI tmp, flagsReg ccr) %{
+  match(Set res (CompareAndSwapL mem (Binary oldval newval)));  // res is produced by CSET on the EQ condition
+  effect( KILL ccr, TEMP tmp);  // tmp holds the loaded value, then the store-exclusive status
+  size(24);
+  format %{ "loop:\n\t"
+            "LDXR     $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t"
+            "CMP      $tmp, $oldval\n\t"
+            "B.ne     done\n\t"
+            "STXR     $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop\n\t"
+            "done:\n\t"
+            "CSET_w   $res, eq" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    Label loop, done;
+    __ bind(loop);
+    __ ldxr($tmp$$Register, base);
+    __ cmp($tmp$$Register, $oldval$$Register);
+    __ b(done, ne);                                      // mismatch: skip the store, flags stay NE
+    __ stxr($tmp$$Register, $newval$$Register, base);
+    __ cbnz_w($tmp$$Register, loop);                     // non-zero status: retry from the load
+    __ bind(done);
+    __ cset_w($res$$Register, eq);                       // flags still come from the compare above
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+instruct compareAndSwapI_bool(memoryex mem, iRegI oldval, iRegI newval, iRegI res, iRegI tmp, flagsReg ccr) %{
+  match(Set res (CompareAndSwapI mem (Binary oldval newval)));  // res is produced by CSET on the EQ condition
+  effect( KILL ccr, TEMP tmp);  // tmp holds the loaded value, then the store-exclusive status
+  size(24);
+  format %{ "loop:\n\t"
+            "LDXR_w   $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t"
+            "CMP_w    $tmp, $oldval\n\t"
+            "B.ne     done\n\t"
+            "STXR_w   $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop\n\t"
+            "done:\n\t"
+            "CSET_w   $res, eq" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    Label loop, done;
+    __ bind(loop);
+    __ ldxr_w($tmp$$Register, base);
+    __ cmp_w($tmp$$Register, $oldval$$Register);
+    __ b(done, ne);                                      // mismatch: skip the store, flags stay NE
+    __ stxr_w($tmp$$Register, $newval$$Register,  base);
+    __ cbnz_w($tmp$$Register, loop);                     // non-zero status: retry from the load
+    __ bind(done);
+    __ cset_w($res$$Register, eq);                       // flags still come from the compare above
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+// tmp must use iRegI instead of iRegN until 8051805 is fixed.
+instruct compareAndSwapN_bool(memoryex mem, iRegN oldval, iRegN newval, iRegI res, iRegI tmp, flagsReg ccr) %{
+  match(Set res (CompareAndSwapN mem (Binary oldval newval)));  // same _w sequence as the int variant
+  effect( KILL ccr, TEMP tmp);  // tmp holds the loaded value, then the store-exclusive status
+  size(24);
+  format %{ "loop:\n\t"
+            "LDXR_w   $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t"
+            "CMP_w    $tmp, $oldval\n\t"
+            "B.ne     done\n\t"
+            "STXR_w   $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop\n\t"
+            "done:\n\t"
+            "CSET_w   $res, eq" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    Label loop, done;
+    __ bind(loop);
+    __ ldxr_w($tmp$$Register, base);
+    __ cmp_w($tmp$$Register, $oldval$$Register);
+    __ b(done, ne);                                      // mismatch: skip the store, flags stay NE
+    __ stxr_w($tmp$$Register, $newval$$Register,  base);
+    __ cbnz_w($tmp$$Register, loop);                     // non-zero status: retry from the load
+    __ bind(done);
+    __ cset_w($res$$Register, eq);                       // flags still come from the compare above
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+instruct compareAndSwapP_bool(memoryex mem, iRegP oldval, iRegP newval, iRegI res, iRegI tmp, flagsReg ccr) %{
+  match(Set res (CompareAndSwapP mem (Binary oldval newval)));  // res is produced by CSET on the EQ condition
+  effect( KILL ccr, TEMP tmp);  // tmp holds the loaded value, then the store-exclusive status
+  size(24);
+  format %{ "loop:\n\t"
+            "LDXR     $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t"
+            "CMP      $tmp, $oldval\n\t"
+            "B.ne     done\n\t"
+            "STXR     $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop\n\t"
+            "done:\n\t"
+            "CSET_w   $res, eq" %}
+  ins_encode %{
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    Label loop, done;
+    __ bind(loop);
+    __ ldxr($tmp$$Register, base);
+    __ cmp($tmp$$Register, $oldval$$Register);
+    __ b(done, ne);                                      // mismatch: skip the store, flags stay NE
+    __ stxr($tmp$$Register, $newval$$Register,  base);
+    __ cbnz_w($tmp$$Register, loop);                     // non-zero status: retry from the load
+    __ bind(done);
+    __ cset_w($res$$Register, eq);                       // flags still come from the compare above
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else // !AARCH64
+instruct compareAndSwapL_bool(memoryex mem, iRegL oldval, iRegLd newval, iRegI res, iRegLd tmp, flagsReg ccr ) %{
+  match(Set res (CompareAndSwapL mem (Binary oldval newval)));  // res ends up 1 after a swap, 0 on a mismatch
+  effect( KILL ccr, TEMP tmp);  // tmp pair holds the loaded value; its first register is reused for status/result
+  size(32);
+  format %{ "loop: \n\t"
+            "LDREXD   $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t"
+            "CMP      $tmp.lo, $oldval.lo\n\t"
+            "CMP.eq   $tmp.hi, $oldval.hi\n\t"
+            "STREXD.eq $tmp, $newval, $mem\n\t"
+            "MOV.ne   $tmp, 0 \n\t"
+            "EORS.eq  $tmp,$tmp, 1 \n\t"
+            "B.eq     loop \n\t"
+            "MOV      $res, $tmp" %}
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrexd($tmp$$Register, $mem$$Address);
+    __ cmp($tmp$$Register, $oldval$$Register);                                // low words
+    __ cmp($tmp$$Register->successor(), $oldval$$Register->successor(), eq);  // high words, only if the low words matched
+    __ strexd($tmp$$Register, $newval$$Register, $mem$$Address, eq);          // store attempted only on a full match
+    __ mov($tmp$$Register, 0, ne);                                            // mismatch: result is 0
+    __ eors($tmp$$Register, $tmp$$Register, 1, eq);                           // flip the status; a zero result sets EQ
+    __ b(loop, eq);                                                           // EQ here means the store must be retried
+    __ mov($res$$Register, $tmp$$Register);
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+
+instruct compareAndSwapI_bool(memoryex mem, iRegI oldval, iRegI newval, iRegI res, iRegI tmp, flagsReg ccr ) %{
+  match(Set res (CompareAndSwapI mem (Binary oldval newval)));  // res ends up 1 after a swap, 0 on a mismatch
+  effect( KILL ccr, TEMP tmp);  // tmp holds the loaded value, then the status/result
+  size(28);
+  format %{ "loop: \n\t"
+            "LDREX    $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t"
+            "CMP      $tmp, $oldval\n\t"
+            "STREX.eq $tmp, $newval, $mem\n\t"
+            "MOV.ne   $tmp, 0 \n\t"
+            "EORS.eq  $tmp,$tmp, 1 \n\t"
+            "B.eq     loop \n\t"
+            "MOV      $res, $tmp" %}
+  // Conditional execution keeps the sequence branch-free apart from the retry branch.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrex($tmp$$Register,$mem$$Address);
+    __ cmp($tmp$$Register, $oldval$$Register);
+    __ strex($tmp$$Register, $newval$$Register, $mem$$Address, eq);  // store attempted only on a match
+    __ mov($tmp$$Register, 0, ne);                                   // mismatch: result is 0
+    __ eors($tmp$$Register, $tmp$$Register, 1, eq);                  // flip the status; a zero result sets EQ
+    __ b(loop, eq);                                                  // EQ here means the store must be retried
+    __ mov($res$$Register, $tmp$$Register);
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+instruct compareAndSwapP_bool(memoryex mem, iRegP oldval, iRegP newval, iRegI res, iRegI tmp, flagsReg ccr ) %{
+  match(Set res (CompareAndSwapP mem (Binary oldval newval)));  // res ends up 1 after a swap, 0 on a mismatch
+  effect( KILL ccr, TEMP tmp);  // tmp holds the loaded value, then the status/result
+  size(28);
+  format %{ "loop: \n\t"
+            "LDREX    $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t"
+            "CMP      $tmp, $oldval\n\t"
+            "STREX.eq $tmp, $newval, $mem\n\t"
+            "MOV.ne   $tmp, 0 \n\t"
+            "EORS.eq  $tmp,$tmp, 1 \n\t"
+            "B.eq     loop \n\t"
+            "MOV      $res, $tmp" %}
+  // Conditional execution keeps the sequence branch-free apart from the retry branch.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrex($tmp$$Register,$mem$$Address);
+    __ cmp($tmp$$Register, $oldval$$Register);
+    __ strex($tmp$$Register, $newval$$Register, $mem$$Address, eq);  // store attempted only on a match
+    __ mov($tmp$$Register, 0, ne);                                   // mismatch: result is 0
+    __ eors($tmp$$Register, $tmp$$Register, 1, eq);                  // flip the status; a zero result sets EQ
+    __ b(loop, eq);                                                  // EQ here means the store must be retried
+    __ mov($res$$Register, $tmp$$Register);
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif // !AARCH64
+
+#ifdef AARCH64
+instruct xaddI_aimmI_no_res(memoryex mem, aimmI add, Universe dummy, iRegI tmp1, iRegI tmp2) %{
+  predicate(n->as_LoadStore()->result_not_used());  // selected only when the fetched value is unused
+  match(Set dummy (GetAndAddI mem add));
+  effect(TEMP tmp1, TEMP tmp2);
+  size(16);
+  format %{ "loop:\n\t"
+            "LDXR_w   $tmp1, $mem\n\t"
+            "ADD_w    $tmp1, $tmp1, $add\n\t"
+            "STXR_w   $tmp2, $tmp1, $mem\n\t"
+            "CBNZ_w   $tmp2, loop" %}
+  // Atomic add of a constant: tmp1 is updated in place, tmp2 takes the store-exclusive status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr_w($tmp1$$Register, base);
+    __ add_w($tmp1$$Register, $tmp1$$Register, $add$$constant);
+    __ stxr_w($tmp2$$Register, $tmp1$$Register, base);
+    __ cbnz_w($tmp2$$Register, loop);                    // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+instruct xaddI_aimmI_no_res(memoryex mem, aimmI add, Universe dummy, iRegI tmp1, iRegI tmp2, flagsReg ccr) %{
+  predicate(n->as_LoadStore()->result_not_used());  // selected only when the fetched value is unused
+  match(Set dummy (GetAndAddI mem add));
+  effect(KILL ccr, TEMP tmp1, TEMP tmp2);  // the CMP on the status overwrites the flags
+  size(20);
+  format %{ "loop: \n\t"
+            "LDREX    $tmp1, $mem\n\t"
+            "ADD      $tmp1, $tmp1, $add\n\t"
+            "STREX    $tmp2, $tmp1, $mem\n\t"
+            "CMP      $tmp2, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic add of a constant: tmp1 is updated in place, tmp2 takes the store-exclusive status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrex($tmp1$$Register,$mem$$Address);
+    __ add($tmp1$$Register, $tmp1$$Register, $add$$constant);
+    __ strex($tmp2$$Register, $tmp1$$Register, $mem$$Address);
+    __ cmp($tmp2$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xaddI_reg_no_res(memoryex mem, iRegI add, Universe dummy, iRegI tmp1, iRegI tmp2) %{
+  predicate(n->as_LoadStore()->result_not_used());  // selected only when the fetched value is unused
+  match(Set dummy (GetAndAddI mem add));
+  effect(TEMP tmp1, TEMP tmp2);
+  size(16);
+  format %{ "loop:\n\t"
+            "LDXR_w   $tmp1, $mem\n\t"
+            "ADD_w    $tmp1, $tmp1, $add\n\t"
+            "STXR_w   $tmp2, $tmp1, $mem\n\t"
+            "CBNZ_w   $tmp2, loop" %}
+  // Atomic add of a register: tmp1 is updated in place, tmp2 takes the store-exclusive status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr_w($tmp1$$Register, base);
+    __ add_w($tmp1$$Register, $tmp1$$Register, $add$$Register);
+    __ stxr_w($tmp2$$Register, $tmp1$$Register, base);
+    __ cbnz_w($tmp2$$Register, loop);                    // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+instruct xaddI_reg_no_res(memoryex mem, iRegI add, Universe dummy, iRegI tmp1, iRegI tmp2, flagsReg ccr) %{
+  predicate(n->as_LoadStore()->result_not_used());  // selected only when the fetched value is unused
+  match(Set dummy (GetAndAddI mem add));
+  effect(KILL ccr, TEMP tmp1, TEMP tmp2);  // the CMP on the status overwrites the flags
+  size(20);
+  format %{ "loop: \n\t"
+            "LDREX    $tmp1, $mem\n\t"
+            "ADD      $tmp1, $tmp1, $add\n\t"
+            "STREX    $tmp2, $tmp1, $mem\n\t"
+            "CMP      $tmp2, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic add of a register: tmp1 is updated in place, tmp2 takes the store-exclusive status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrex($tmp1$$Register,$mem$$Address);
+    __ add($tmp1$$Register, $tmp1$$Register, $add$$Register);
+    __ strex($tmp2$$Register, $tmp1$$Register, $mem$$Address);
+    __ cmp($tmp2$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xaddI_aimmI(memoryex mem, aimmI add, iRegI res, iRegI tmp1, iRegI tmp2) %{
+  match(Set res (GetAndAddI mem add));
+  effect(TEMP tmp1, TEMP tmp2, TEMP res);
+  size(16);
+  format %{ "loop:\n\t"
+            "LDXR_w   $res, $mem\n\t"
+            "ADD_w    $tmp1, $res, $add\n\t"
+            "STXR_w   $tmp2, $tmp1, $mem\n\t"
+            "CBNZ_w   $tmp2, loop" %}
+  // Atomic fetch-and-add of a constant: res keeps the loaded value, tmp1 the sum, tmp2 the store status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr_w($res$$Register, base);
+    __ add_w($tmp1$$Register, $res$$Register, $add$$constant);
+    __ stxr_w($tmp2$$Register, $tmp1$$Register, base);
+    __ cbnz_w($tmp2$$Register, loop);                    // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+instruct xaddI_aimmI(memoryex mem, aimmI add, iRegI res, iRegI tmp1, iRegI tmp2, flagsReg ccr) %{
+  match(Set res (GetAndAddI mem add));
+  effect(KILL ccr, TEMP tmp1, TEMP tmp2, TEMP res);  // the CMP on the status overwrites the flags
+  size(20);
+  format %{ "loop: \n\t"
+            "LDREX    $res, $mem\n\t"
+            "ADD      $tmp1, $res, $add\n\t"
+            "STREX    $tmp2, $tmp1, $mem\n\t"
+            "CMP      $tmp2, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic fetch-and-add of a constant: res keeps the loaded value, tmp1 the sum, tmp2 the store status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrex($res$$Register,$mem$$Address);
+    __ add($tmp1$$Register, $res$$Register, $add$$constant);
+    __ strex($tmp2$$Register, $tmp1$$Register, $mem$$Address);
+    __ cmp($tmp2$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xaddI_reg(memoryex mem, iRegI add, iRegI res, iRegI tmp1, iRegI tmp2) %{
+  match(Set res (GetAndAddI mem add));
+  effect(TEMP tmp1, TEMP tmp2, TEMP res);
+  size(16);
+  format %{ "loop:\n\t"
+            "LDXR_w   $res, $mem\n\t"
+            "ADD_w    $tmp1, $res, $add\n\t"
+            "STXR_w   $tmp2, $tmp1, $mem\n\t"
+            "CBNZ_w   $tmp2, loop" %}
+  // Atomic fetch-and-add of a register: res keeps the loaded value, tmp1 the sum, tmp2 the store status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr_w($res$$Register, base);
+    __ add_w($tmp1$$Register, $res$$Register, $add$$Register);
+    __ stxr_w($tmp2$$Register, $tmp1$$Register, base);
+    __ cbnz_w($tmp2$$Register, loop);                    // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+instruct xaddI_reg(memoryex mem, iRegI add, iRegI res, iRegI tmp1, iRegI tmp2, flagsReg ccr) %{
+  match(Set res (GetAndAddI mem add));
+  effect(KILL ccr, TEMP tmp1, TEMP tmp2, TEMP res);  // the CMP on the status overwrites the flags
+  size(20);
+  format %{ "loop: \n\t"
+            "LDREX    $res, $mem\n\t"
+            "ADD      $tmp1, $res, $add\n\t"
+            "STREX    $tmp2, $tmp1, $mem\n\t"
+            "CMP      $tmp2, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic fetch-and-add of a register: res keeps the loaded value, tmp1 the sum, tmp2 the store status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrex($res$$Register,$mem$$Address);
+    __ add($tmp1$$Register, $res$$Register, $add$$Register);
+    __ strex($tmp2$$Register, $tmp1$$Register, $mem$$Address);
+    __ cmp($tmp2$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xaddL_reg_no_res(memoryex mem, iRegL add, Universe dummy, iRegL tmp1, iRegI tmp2) %{
+  predicate(n->as_LoadStore()->result_not_used());  // selected only when the fetched value is unused
+  match(Set dummy (GetAndAddL mem add));
+  effect(TEMP tmp1, TEMP tmp2);
+  size(16);
+  format %{ "loop:\n\t"
+            "LDXR     $tmp1, $mem\n\t"
+            "ADD      $tmp1, $tmp1, $add\n\t"
+            "STXR     $tmp2, $tmp1, $mem\n\t"
+            "CBNZ_w   $tmp2, loop" %}
+  // Atomic long add of a register: tmp1 is updated in place, tmp2 takes the store-exclusive status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr($tmp1$$Register, base);
+    __ add($tmp1$$Register, $tmp1$$Register, $add$$Register);
+    __ stxr($tmp2$$Register, $tmp1$$Register, base);
+    __ cbnz_w($tmp2$$Register, loop);                    // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+instruct xaddL_reg_no_res(memoryex mem, iRegL add, Universe dummy, iRegLd tmp1, iRegI tmp2, flagsReg ccr) %{
+  predicate(n->as_LoadStore()->result_not_used());  // selected only when the fetched value is unused
+  match(Set dummy (GetAndAddL mem add));
+  effect( KILL ccr, TEMP tmp1, TEMP tmp2);  // ADDS and CMP overwrite the flags
+  size(24);
+  format %{ "loop: \n\t"
+            "LDREXD   $tmp1, $mem\n\t"
+            "ADDS     $tmp1.lo, $tmp1.lo, $add.lo\n\t"
+            "ADC      $tmp1.hi, $tmp1.hi, $add.hi\n\t"
+            "STREXD   $tmp2, $tmp1, $mem\n\t"
+            "CMP      $tmp2, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic long add of a register pair: tmp1 is updated in place, tmp2 takes the store-exclusive status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrexd($tmp1$$Register, $mem$$Address);
+    __ adds($tmp1$$Register, $tmp1$$Register, $add$$Register);  // low words, recording the carry
+    __ adc($tmp1$$Register->successor(), $tmp1$$Register->successor(), $add$$Register->successor());  // high words plus carry
+    __ strexd($tmp2$$Register, $tmp1$$Register, $mem$$Address);
+    __ cmp($tmp2$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xaddL_imm_no_res(memoryex mem, aimmL add, Universe dummy, iRegL tmp1, iRegI tmp2) %{
+  predicate(n->as_LoadStore()->result_not_used());  // selected only when the fetched value is unused
+  match(Set dummy (GetAndAddL mem add));
+  effect(TEMP tmp1, TEMP tmp2);
+  size(16);
+  format %{ "loop:\n\t"
+            "LDXR     $tmp1, $mem\n\t"
+            "ADD      $tmp1, $tmp1, $add\n\t"
+            "STXR     $tmp2, $tmp1, $mem\n\t"
+            "CBNZ_w   $tmp2, loop" %}
+  // Atomic long add of a constant: tmp1 is updated in place, tmp2 takes the store-exclusive status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr($tmp1$$Register, base);
+    __ add($tmp1$$Register, $tmp1$$Register, $add$$constant);
+    __ stxr($tmp2$$Register, $tmp1$$Register, base);
+    __ cbnz_w($tmp2$$Register, loop);                    // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+// TODO: try immLRot2 instead, (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant)) becomes
+instruct xaddL_immRot_no_res(memoryex mem, immLlowRot add, Universe dummy, iRegLd tmp1, iRegI tmp2, flagsReg ccr) %{
+  predicate(n->as_LoadStore()->result_not_used());  // selected only when the fetched value is unused
+  match(Set dummy (GetAndAddL mem add));
+  effect( KILL ccr, TEMP tmp1, TEMP tmp2);  // ADDS and CMP overwrite the flags
+  size(24);
+  format %{ "loop: \n\t"
+            "LDREXD   $tmp1, $mem\n\t"
+            "ADDS     $tmp1.lo, $tmp1.lo, $add\n\t"
+            "ADC      $tmp1.hi, $tmp1.hi, 0\n\t"
+            "STREXD   $tmp2, $tmp1, $mem\n\t"
+            "CMP      $tmp2, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic long add of a constant applied to the low word: tmp1 is updated in place, tmp2 takes the store status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrexd($tmp1$$Register, $mem$$Address);
+    __ adds($tmp1$$Register, $tmp1$$Register, $add$$constant);  // low word plus constant, recording the carry
+    __ adc($tmp1$$Register->successor(), $tmp1$$Register->successor(), 0);  // high word only absorbs the carry
+    __ strexd($tmp2$$Register, $tmp1$$Register, $mem$$Address);
+    __ cmp($tmp2$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xaddL_reg(memoryex mem, iRegL add, iRegL res, iRegL tmp1, iRegI tmp2) %{
+  match(Set res (GetAndAddL mem add));
+  effect(TEMP tmp1, TEMP tmp2, TEMP res);
+  size(16);
+  format %{ "loop:\n\t"
+            "LDXR     $res, $mem\n\t"
+            "ADD      $tmp1, $res, $add\n\t"
+            "STXR     $tmp2, $tmp1, $mem\n\t"
+            "CBNZ_w   $tmp2, loop" %}
+  // Atomic long fetch-and-add of a register: res keeps the loaded value, tmp1 the sum, tmp2 the store status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr($res$$Register, base);
+    __ add($tmp1$$Register, $res$$Register, $add$$Register);
+    __ stxr($tmp2$$Register, $tmp1$$Register, base);
+    __ cbnz_w($tmp2$$Register, loop);                    // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+instruct xaddL_reg(memoryex mem, iRegL add, iRegLd res, iRegLd tmp1, iRegI tmp2, flagsReg ccr) %{
+  match(Set res (GetAndAddL mem add));
+  effect( KILL ccr, TEMP tmp1, TEMP tmp2, TEMP res);  // ADDS and CMP overwrite the flags
+  size(24);
+  format %{ "loop: \n\t"
+            "LDREXD   $res, $mem\n\t"
+            "ADDS     $tmp1.lo, $res.lo, $add.lo\n\t"
+            "ADC      $tmp1.hi, $res.hi, $add.hi\n\t"
+            "STREXD   $tmp2, $tmp1, $mem\n\t"
+            "CMP      $tmp2, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic long fetch-and-add on register pairs: res keeps the loaded value, tmp1 the sum, tmp2 the store status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrexd($res$$Register, $mem$$Address);
+    __ adds($tmp1$$Register, $res$$Register, $add$$Register);  // low words, recording the carry
+    __ adc($tmp1$$Register->successor(), $res$$Register->successor(), $add$$Register->successor());  // high words plus carry
+    __ strexd($tmp2$$Register, $tmp1$$Register, $mem$$Address);
+    __ cmp($tmp2$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xaddL_imm(memoryex mem, aimmL add, iRegL res, iRegL tmp1, iRegI tmp2) %{
+  match(Set res (GetAndAddL mem add));
+  effect(TEMP tmp1, TEMP tmp2, TEMP res);
+  size(16);
+  format %{ "loop:\n\t"
+            "LDXR     $res, $mem\n\t"
+            "ADD      $tmp1, $res, $add\n\t"
+            "STXR     $tmp2, $tmp1, $mem\n\t"
+            "CBNZ_w   $tmp2, loop" %}
+  // Atomic long fetch-and-add of a constant: res keeps the loaded value, tmp1 the sum, tmp2 the store status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr($res$$Register, base);
+    __ add($tmp1$$Register, $res$$Register, $add$$constant);
+    __ stxr($tmp2$$Register, $tmp1$$Register, base);
+    __ cbnz_w($tmp2$$Register, loop);                    // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+// TODO: try immLRot2 instead, (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant)) becomes
+instruct xaddL_immRot(memoryex mem, immLlowRot add, iRegLd res, iRegLd tmp1, iRegI tmp2, flagsReg ccr) %{
+  match(Set res (GetAndAddL mem add));
+  effect( KILL ccr, TEMP tmp1, TEMP tmp2, TEMP res);  // ADDS and CMP overwrite the flags
+  size(24);
+  format %{ "loop: \n\t"
+            "LDREXD   $res, $mem\n\t"
+            "ADDS     $tmp1.lo, $res.lo, $add\n\t"
+            "ADC      $tmp1.hi, $res.hi, 0\n\t"
+            "STREXD   $tmp2, $tmp1, $mem\n\t"
+            "CMP      $tmp2, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic long fetch-and-add of a low-word constant: res keeps the loaded value, tmp1 the sum, tmp2 the store status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrexd($res$$Register, $mem$$Address);
+    __ adds($tmp1$$Register, $res$$Register, $add$$constant);  // low word plus constant, recording the carry
+    __ adc($tmp1$$Register->successor(), $res$$Register->successor(), 0);  // high word only absorbs the carry
+    __ strexd($tmp2$$Register, $tmp1$$Register, $mem$$Address);
+    __ cmp($tmp2$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xchgI(memoryex mem, iRegI newval, iRegI res, iRegI tmp) %{
+  match(Set res (GetAndSetI mem newval));
+  effect(TEMP tmp, TEMP res);
+  size(12);
+  format %{ "loop:\n\t"
+            "LDXR_w   $res, $mem\n\t"
+            "STXR_w   $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop" %}
+  // Atomic int exchange: res receives the loaded value, newval is stored, tmp takes the store status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr_w($res$$Register, base);
+    __ stxr_w($tmp$$Register, $newval$$Register, base);
+    __ cbnz_w($tmp$$Register, loop);                     // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+
+#ifdef XXX
+// Disabled until 8051805 is fixed.
+instruct xchgN(memoryex mem, iRegN newval, iRegN res, iRegN tmp) %{
+  match(Set res (GetAndSetN mem newval));
+  effect(TEMP tmp, TEMP res);
+  size(12);
+  format %{ "loop:\n\t"
+            "LDXR_w   $res, $mem\n\t"
+            "STXR_w   $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop" %}
+  // Exchange for iRegN operands using the same _w sequence as xchgI; currently compiled out by the enclosing #ifdef XXX.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr_w($res$$Register, base);
+    __ stxr_w($tmp$$Register, $newval$$Register, base);
+    __ cbnz_w($tmp$$Register, loop);                     // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+#else
+instruct xchgI(memoryex mem, iRegI newval, iRegI res, iRegI tmp, flagsReg ccr) %{
+  match(Set res (GetAndSetI mem newval));
+  effect(KILL ccr, TEMP tmp, TEMP res);  // the CMP on the status overwrites the flags
+  size(16);
+  format %{ "loop: \n\t"
+            "LDREX    $res, $mem\n\t"
+            "STREX    $tmp, $newval, $mem\n\t"
+            "CMP      $tmp, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic int exchange: res receives the loaded value, newval is stored, tmp takes the store status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrex($res$$Register,$mem$$Address);
+    __ strex($tmp$$Register, $newval$$Register, $mem$$Address);
+    __ cmp($tmp$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif
+
+#ifdef AARCH64
+instruct xchgL(memoryex mem, iRegL newval, iRegL res, iRegI tmp) %{
+  match(Set res (GetAndSetL mem newval));
+  effect(TEMP tmp, TEMP res);
+  size(12);
+  format %{ "loop:\n\t"
+            "LDXR     $res, $mem\n\t"
+            "STXR     $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop" %}
+  // Atomic long exchange: res receives the loaded value, newval is stored, tmp takes the store status.
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base);  // only the base register of mem is used
+    __ bind(loop);
+    __ ldxr($res$$Register, base);
+    __ stxr($tmp$$Register, $newval$$Register, base);
+    __ cbnz_w($tmp$$Register, loop);                     // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+instruct xchgL(memoryex mem, iRegLd newval, iRegLd res, iRegI tmp, flagsReg ccr) %{
+  match(Set res (GetAndSetL mem newval));
+  effect( KILL ccr, TEMP tmp, TEMP res);  // the CMP on the status overwrites the flags
+  size(16);
+  format %{ "loop: \n\t"
+            "LDREXD   $res, $mem\n\t"
+            "STREXD   $tmp, $newval, $mem\n\t"
+            "CMP      $tmp, 0 \n\t"
+            "B.ne     loop \n\t" %}
+  // Atomic long exchange on register pairs: res receives the loaded value, newval is stored, tmp takes the store status.
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrexd($res$$Register, $mem$$Address);
+    __ strexd($tmp$$Register, $newval$$Register, $mem$$Address);
+    __ cmp($tmp$$Register, 0);
+    __ b(loop, ne);  // non-zero status: retry from the load
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif // !AARCH64
+
+#ifdef AARCH64
+instruct xchgP(memoryex mem, iRegP newval, iRegP res, iRegI tmp) %{ // AARCH64: pointer atomic exchange (GetAndSetP) as a load/store-exclusive retry loop
+  match(Set res (GetAndSetP mem newval));
+  effect(TEMP tmp, TEMP res);
+  size(12);
+  format %{ "loop:\n\t"
+            "LDREX    $res, $mem\n\t"
+            "STREX    $tmp, $newval, $mem\n\t"
+            "CBNZ_w   $tmp, loop" %}
+
+  ins_encode %{
+    Label loop;
+    Register base = reg_to_register_object($mem$$base); // only the base register of mem is used as the address
+    __ bind(loop);
+    __ ldrex($res$$Register, base); // NOTE(review): the AARCH64 xchgL sibling uses ldxr/stxr; confirm ldrex/strex are the intended pointer-width helpers here
+    __ strex($tmp$$Register, $newval$$Register, base);
+    __ cbnz_w($tmp$$Register, loop); // retry while tmp is non-zero
+  %}
+  ins_pipe( long_memory_op );
+%}
+#else
+instruct xchgP(memoryex mem, iRegP newval, iRegP res, iRegI tmp, flagsReg ccr) %{ // non-AARCH64: pointer atomic exchange (GetAndSetP) as an LDREX/STREX retry loop
+  match(Set res (GetAndSetP mem newval));
+  effect(KILL ccr, TEMP tmp, TEMP res); // the CMP emitted below writes the flags
+  size(16);
+  format %{ "loop: \n\t"
+            "LDREX    $res, $mem\n\t"
+            "STREX    $tmp, $newval, $mem\n\t"
+            "CMP      $tmp, 0 \n\t"
+            "B.ne     loop \n\t" %}
+
+  ins_encode %{
+    Label loop;
+    __ bind(loop);
+    __ ldrex($res$$Register,$mem$$Address);
+    __ strex($tmp$$Register, $newval$$Register, $mem$$Address);
+    __ cmp($tmp$$Register, 0);
+    __ b(loop, ne); // retry until tmp compares equal to zero
+  %}
+  ins_pipe( long_memory_op );
+%}
+#endif // !AARCH64
+
+//---------------------
+// Subtraction Instructions
+// Register Subtraction
+instruct subI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ // int subtract: dst = src1 - src2
+  match(Set dst (SubI src1 src2));
+
+  size(4);
+  format %{ "sub_32 $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ sub_32($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct subshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ // non-AARCH64: dst = src1 - (src2 << src3), shift amount in a register
+  match(Set dst (SubI src1 (LShiftI src2 src3)));
+
+  size(4);
+  format %{ "SUB    $dst,$src1,$src2<<$src3" %}
+  ins_encode %{
+    __ sub($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsl, $src3$$Register)); // shift folded into the second operand
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct subshlI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ // dst = src1 - (src2 << src3), constant shift amount
+  match(Set dst (SubI src1 (LShiftI src2 src3)));
+
+  size(4);
+  format %{ "sub_32 $dst,$src1,$src2<<$src3\t! int" %}
+  ins_encode %{
+    __ sub_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsl, $src3$$constant)); // shift folded into the second operand
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct subsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ // non-AARCH64: dst = src1 - (src2 >> src3), arithmetic shift by a register
+  match(Set dst (SubI src1 (RShiftI src2 src3)));
+
+  size(4);
+  format %{ "SUB    $dst,$src1,$src2>>$src3" %}
+  ins_encode %{
+    __ sub($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, asr, $src3$$Register)); // shift folded into the second operand
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct subsarI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ // dst = src1 - (src2 >> src3), arithmetic shift by a constant
+  match(Set dst (SubI src1 (RShiftI src2 src3)));
+
+  size(4);
+  format %{ "sub_32 $dst,$src1,$src2>>$src3\t! int" %}
+  ins_encode %{
+    __ sub_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, asr, $src3$$constant)); // shift folded into the second operand
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct subshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ // non-AARCH64: dst = src1 - (src2 >>> src3), logical shift by a register
+  match(Set dst (SubI src1 (URShiftI src2 src3)));
+
+  size(4);
+  format %{ "SUB    $dst,$src1,$src2>>>$src3" %}
+  ins_encode %{
+    __ sub($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsr, $src3$$Register)); // shift folded into the second operand
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct subshrI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ // dst = src1 - (src2 >>> src3), logical shift by a constant
+  match(Set dst (SubI src1 (URShiftI src2 src3)));
+
+  size(4);
+  format %{ "sub_32 $dst,$src1,$src2>>>$src3\t! int" %}
+  ins_encode %{
+    __ sub_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsr, $src3$$constant)); // shift folded into the second operand
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct rsbshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ // non-AARCH64: dst = (src1 << src2) - src3, shift amount in a register
+  match(Set dst (SubI (LShiftI src1 src2) src3));
+
+  size(4);
+  format %{ "RSB    $dst,$src3,$src1<<$src2" %}
+  ins_encode %{
+    __ rsb($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsl, $src2$$Register)); // RSB takes the subtrahend (src3) as its first source
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct rsbshlI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ // non-AARCH64: dst = (src1 << src2) - src3, constant shift amount
+  match(Set dst (SubI (LShiftI src1 src2) src3));
+
+  size(4);
+  format %{ "RSB    $dst,$src3,$src1<<$src2" %}
+  ins_encode %{
+    __ rsb($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsl, $src2$$constant)); // RSB takes the subtrahend (src3) as its first source
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct rsbsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ // non-AARCH64: dst = (src1 >> src2) - src3, arithmetic shift by a register
+  match(Set dst (SubI (RShiftI src1 src2) src3));
+
+  size(4);
+  format %{ "RSB    $dst,$src3,$src1>>$src2" %}
+  ins_encode %{
+    __ rsb($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, asr, $src2$$Register)); // RSB takes the subtrahend (src3) as its first source
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct rsbsarI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ // non-AARCH64: dst = (src1 >> src2) - src3, arithmetic shift by a constant
+  match(Set dst (SubI (RShiftI src1 src2) src3));
+
+  size(4);
+  format %{ "RSB    $dst,$src3,$src1>>$src2" %}
+  ins_encode %{
+    __ rsb($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, asr, $src2$$constant)); // RSB takes the subtrahend (src3) as its first source
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct rsbshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ // non-AARCH64: dst = (src1 >>> src2) - src3, logical shift by a register
+  match(Set dst (SubI (URShiftI src1 src2) src3));
+
+  size(4);
+  format %{ "RSB    $dst,$src3,$src1>>>$src2" %}
+  ins_encode %{
+    __ rsb($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsr, $src2$$Register)); // RSB takes the subtrahend (src3) as its first source
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct rsbshrI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ // non-AARCH64: dst = (src1 >>> src2) - src3, logical shift by a constant
+  match(Set dst (SubI (URShiftI src1 src2) src3));
+
+  size(4);
+  format %{ "RSB    $dst,$src3,$src1>>>$src2" %}
+  ins_encode %{
+    __ rsb($dst$$Register, $src3$$Register, AsmOperand($src1$$Register, lsr, $src2$$constant)); // RSB takes the subtrahend (src3) as its first source
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+// Immediate Subtraction
+instruct subI_reg_aimmI(iRegI dst, iRegI src1, aimmI src2) %{ // int subtract of a constant: dst = src1 - src2
+  match(Set dst (SubI src1 src2));
+
+  size(4);
+  format %{ "sub_32 $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ sub_32($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+instruct subI_reg_immRotneg(iRegI dst, iRegI src1, aimmIneg src2) %{ // matches an int ADD of an aimmIneg constant and emits a subtract instead
+  match(Set dst (AddI src1 src2)); // note: AddI, not SubI
+
+  size(4);
+  format %{ "sub_32 $dst,$src1,-($src2)\t! int" %}
+  ins_encode %{
+    __ sub_32($dst$$Register, $src1$$Register, -$src2$$constant); // subtracting the negated constant is the same as adding it
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+#ifndef AARCH64
+instruct subI_immRot_reg(iRegI dst, immIRot src1, iRegI src2) %{ // non-AARCH64: dst = constant src1 - src2, emitted as RSB
+  match(Set dst (SubI src1 src2));
+
+  size(4);
+  format %{ "RSB    $dst,$src2,$src1" %}
+  ins_encode %{
+    __ rsb($dst$$Register, $src2$$Register, $src1$$constant); // RSB takes the subtrahend (src2) as its first source
+  %}
+  ins_pipe(ialu_zero_reg);
+%}
+#endif
+
+// Register Subtraction
+#ifdef AARCH64
+instruct subL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ // AARCH64: long subtract in one instruction: dst = src1 - src2
+  match(Set dst (SubL src1 src2));
+
+  size(4);
+  format %{ "SUB    $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ sub($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#else
+instruct subL_reg_reg(iRegL dst, iRegL src1, iRegL src2, flagsReg icc ) %{ // non-AARCH64: long subtract on register pairs (SUBS on the low words, SBC on the high words)
+  match(Set dst (SubL src1 src2));
+  effect (KILL icc); // declared killed because the SUBS/SBC pair goes through the flags
+
+  size(8);
+  format %{ "SUBS   $dst.lo,$src1.lo,$src2.lo\t! long\n\t"
+            "SBC    $dst.hi,$src1.hi,$src2.hi" %}
+  ins_encode %{
+    __ subs($dst$$Register, $src1$$Register, $src2$$Register);
+    __ sbc($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor()); // successor() is the high word of each pair
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+#ifdef AARCH64
+// Immediate Subtraction
+instruct subL_reg_aimm(iRegL dst, iRegL src1, aimmL src2) %{ // AARCH64: long subtract of a constant: dst = src1 - src2
+  match(Set dst (SubL src1 src2));
+
+  size(4);
+  format %{ "SUB    $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ sub($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+instruct subL_reg_immLneg(iRegL dst, iRegL src1, aimmLneg src2) %{ // AARCH64: matches a long ADD of an aimmLneg constant and emits a subtract instead
+  match(Set dst (AddL src1 src2)); // note: AddL, not SubL
+
+  size(4);
+  format %{ "SUB    $dst,$src1,-($src2)\t! long" %}
+  ins_encode %{
+    __ sub($dst$$Register, $src1$$Register, -$src2$$constant); // subtracting the negated constant is the same as adding it
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+// TODO
+#endif
+
+#ifndef AARCH64
+// Immediate Subtraction
+// TODO: try immLRot2 instead, (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant)) becomes
+instruct subL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con, flagsReg icc) %{ // non-AARCH64: long subtract of an immLlowRot constant from a register pair
+  match(Set dst (SubL src1 con));
+  effect (KILL icc); // declared killed because the SUBS/SBC pair goes through the flags
+
+  size(8);
+  format %{ "SUBS   $dst.lo,$src1.lo,$con\t! long\n\t"
+            "SBC    $dst.hi,$src1.hi,0" %}
+  ins_encode %{
+    __ subs($dst$$Register, $src1$$Register, $con$$constant); // low word; the flag-setting form is what the SBC below consumes
+    __ sbc($dst$$Register->successor(), $src1$$Register->successor(), 0); // high word: only the borrow is subtracted
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+// Long negation
+instruct negL_reg_reg(iRegL dst, immL0 zero, iRegL src2, flagsReg icc) %{ // non-AARCH64: long negation, matched as SubL of an immL0 operand and src2
+  match(Set dst (SubL zero src2));
+  effect (KILL icc); // declared killed because the RSBS/RSC pair goes through the flags
+
+  size(8);
+  format %{ "RSBS   $dst.lo,$src2.lo,0\t! long\n\t"
+            "RSC    $dst.hi,$src2.hi,0" %}
+  ins_encode %{
+    __ rsbs($dst$$Register, $src2$$Register, 0);
+    __ rsc($dst$$Register->successor(), $src2$$Register->successor(), 0); // successor() is the high word of each pair
+  %}
+  ins_pipe(ialu_zero_reg);
+%}
+#endif // !AARCH64
+
+// Multiplication Instructions
+// Integer Multiplication
+// Register Multiplication
+instruct mulI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ // int multiply: dst = src1 * src2
+  match(Set dst (MulI src1 src2));
+
+  size(4);
+  format %{ "mul_32 $dst,$src1,$src2" %}
+  ins_encode %{
+    __ mul_32($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(imul_reg_reg);
+%}
+
+#ifdef AARCH64
+instruct mulL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ // AARCH64: long multiply in one instruction: dst = src1 * src2
+  match(Set dst (MulL src1 src2));
+  size(4);
+  format %{ "MUL  $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ mul($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(imul_reg_reg);
+%}
+#else
+instruct mulL_lo1_hi2(iRegL dst, iRegL src1, iRegL src2) %{ // non-AARCH64 helper with no match rule; first step of the mulL_reg_reg expand: dst.hi = src1.lo * src2.hi
+  effect(DEF dst, USE src1, USE src2);
+  size(4);
+  format %{ "MUL  $dst.hi,$src1.lo,$src2.hi\t! long" %}
+  ins_encode %{
+    __ mul($dst$$Register->successor(), $src1$$Register, $src2$$Register->successor()); // successor() is the high word of a pair
+  %}
+  ins_pipe(imul_reg_reg);
+%}
+
+instruct mulL_hi1_lo2(iRegL dst, iRegL src1, iRegL src2) %{ // non-AARCH64 helper with no match rule; second step of the mulL_reg_reg expand
+  effect(USE_DEF dst, USE src1, USE src2); // dst.hi is both read and written here
+  size(8);
+  format %{ "MLA  $dst.hi,$src1.hi,$src2.lo,$dst.hi\t! long\n\t"
+            "MOV  $dst.lo, 0"%}
+  ins_encode %{
+    __ mla($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register, $dst$$Register->successor()); // dst.hi += src1.hi * src2.lo
+    __ mov($dst$$Register, 0); // clear dst.lo before the following UMLAL step accumulates into the pair
+  %}
+  ins_pipe(imul_reg_reg);
+%}
+
+instruct mulL_lo1_lo2(iRegL dst, iRegL src1, iRegL src2) %{ // non-AARCH64 helper with no match rule; last step of the mulL_reg_reg expand
+  effect(USE_DEF dst, USE src1, USE src2); // the dst pair is both read and written here
+  size(4);
+  format %{ "UMLAL  $dst.lo,$dst.hi,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ umlal($dst$$Register, $dst$$Register->successor(), $src1$$Register, $src2$$Register); // accumulates src1.lo * src2.lo into the dst.hi:dst.lo pair
+  %}
+  ins_pipe(imul_reg_reg);
+%}
+
+instruct mulL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ // non-AARCH64: long multiply, expanded into the three partial-product helpers in this order
+  match(Set dst (MulL src1 src2));
+
+  expand %{
+    mulL_lo1_hi2(dst, src1, src2);
+    mulL_hi1_lo2(dst, src1, src2);
+    mulL_lo1_lo2(dst, src1, src2);
+  %}
+%}
+#endif // !AARCH64
+
+// Integer Division
+// Register Division
+#ifdef AARCH64
+instruct divI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ // AARCH64: int division with a single 32-bit SDIV
+  match(Set dst (DivI src1 src2));
+
+  size(4);
+  format %{ "SDIV    $dst,$src1,$src2\t! 32-bit" %}
+  ins_encode %{
+    __ sdiv_w($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg); // FIXME
+%}
+#else
+instruct divI_reg_reg(R1RegI dst, R0RegI src1, R2RegI src2, LRRegP lr, flagsReg ccr) %{ // non-AARCH64: int division done by calling the idiv_irem stub; operands use fixed register classes
+  match(Set dst (DivI src1 src2));
+  effect( KILL ccr, KILL src1, KILL src2, KILL lr); // everything declared here is treated as clobbered by the call
+  ins_cost((2+71)*DEFAULT_COST);
+
+  format %{ "DIV   $dst,$src1,$src2 ! call to StubRoutines::Arm::idiv_irem_entry()" %}
+  ins_encode %{
+    __ call(StubRoutines::Arm::idiv_irem_entry(), relocInfo::runtime_call_type);
+  %}
+  ins_pipe(sdiv_reg_reg);
+%}
+#endif
+
+// Register Long Division
+#ifdef AARCH64
+instruct divL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ // AARCH64: long division with a single SDIV
+  match(Set dst (DivL src1 src2));
+
+  size(4);
+  format %{ "SDIV    $dst,$src1,$src2" %}
+  ins_encode %{
+    __ sdiv($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg); // FIXME
+%}
+#else
+instruct divL_reg_reg(R0R1RegL dst, R2R3RegL src1, R0R1RegL src2) %{ // non-AARCH64: long division done by calling SharedRuntime::ldiv; operands use fixed register-pair classes
+  match(Set dst (DivL src1 src2));
+  effect(CALL); // marks this instruct as a call
+  ins_cost(DEFAULT_COST*71);
+  format %{ "DIVL  $src1,$src2,$dst\t! long ! call to SharedRuntime::ldiv" %}
+  ins_encode %{
+    address target = CAST_FROM_FN_PTR(address, SharedRuntime::ldiv);
+    __ call(target, relocInfo::runtime_call_type);
+  %}
+  ins_pipe(divL_reg_reg);
+%}
+#endif
+
+// Integer Remainder
+// Register Remainder
+#ifdef AARCH64
+#ifdef TODO
+instruct msubI_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ // AARCH64, only compiled when TODO is defined: matches src1 - (src2 * src3) as one MSUB
+  match(Set dst (SubI src1 (MulI src2 src3)));
+
+  size(4);
+  format %{ "MSUB    $dst,$src2,$src3,$src1\t! 32-bit\n\t" %}
+  ins_encode %{
+    __ msub_w($dst$$Register, $src2$$Register, $src3$$Register, $src1$$Register); // the minuend (src1) is passed last
+  %}
+  ins_pipe(ialu_reg_reg); // FIXME
+%}
+#endif
+
+instruct modI_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI temp) %{ // AARCH64: int remainder as SDIV into temp followed by MSUB
+  match(Set dst (ModI src1 src2));
+  effect(TEMP temp); // temp holds the quotient between the two instructions
+
+  size(8);
+  format %{ "SDIV    $temp,$src1,$src2\t! 32-bit\n\t"
+            "MSUB    $dst,$src2,$temp,$src1\t! 32-bit\n\t" %}
+  ins_encode %{
+    __ sdiv_w($temp$$Register, $src1$$Register, $src2$$Register);
+    __ msub_w($dst$$Register, $src2$$Register, $temp$$Register, $src1$$Register); // combines divisor, quotient and dividend into the remainder
+  %}
+  ins_pipe(ialu_reg_reg); // FIXME
+%}
+#else
+instruct modI_reg_reg(R0RegI dst, R0RegI src1, R2RegI src2, R1RegI temp, LRRegP lr, flagsReg ccr ) %{ // non-AARCH64: int remainder via the same idiv_irem stub used by divI_reg_reg; result class is R0RegI here
+  match(Set dst (ModI src1 src2));
+  effect( KILL ccr, KILL temp, KILL src2, KILL lr); // everything declared here is treated as clobbered by the call
+
+  format %{ "MODI   $dst,$src1,$src2\t ! call to StubRoutines::Arm::idiv_irem_entry" %}
+  ins_encode %{
+    __ call(StubRoutines::Arm::idiv_irem_entry(), relocInfo::runtime_call_type);
+  %}
+  ins_pipe(sdiv_reg_reg);
+%}
+#endif
+
+// Register Long Remainder
+#ifdef AARCH64
+instruct modL_reg_reg(iRegL dst, iRegL src1, iRegL src2, iRegL temp) %{ // AARCH64: long remainder as SDIV into temp followed by MSUB
+  match(Set dst (ModL src1 src2));
+  effect(TEMP temp); // temp holds the quotient between the two instructions
+
+  size(8);
+  format %{ "SDIV    $temp,$src1,$src2\n\t"
+            "MSUB    $dst,$src2,$temp,$src1" %}
+  ins_encode %{
+    __ sdiv($temp$$Register, $src1$$Register, $src2$$Register);
+    __ msub($dst$$Register, $src2$$Register, $temp$$Register, $src1$$Register); // combines divisor, quotient and dividend into the remainder
+  %}
+  ins_pipe(ialu_reg_reg); // FIXME
+%}
+#else
+instruct modL_reg_reg(R0R1RegL dst, R2R3RegL src1, R0R1RegL src2) %{ // non-AARCH64: long remainder done by calling SharedRuntime::lrem; operands use fixed register-pair classes
+  match(Set dst (ModL src1 src2));
+  effect(CALL); // marks this instruct as a call
+  ins_cost(MEMORY_REF_COST); // FIXME
+  format %{ "modL    $dst,$src1,$src2\t ! call to SharedRuntime::lrem" %}
+  ins_encode %{
+    address target = CAST_FROM_FN_PTR(address, SharedRuntime::lrem);
+    __ call(target, relocInfo::runtime_call_type);
+  %}
+  ins_pipe(divL_reg_reg);
+%}
+#endif
+
+// Integer Shift Instructions
+
+// Register Shift Left
+instruct shlI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ // int left shift by a register amount: dst = src1 << src2
+  match(Set dst (LShiftI src1 src2));
+
+  size(4);
+#ifdef AARCH64
+  format %{ "LSLV   $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ lslv_w($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+#else
+  format %{ "LSL  $dst,$src1,$src2 \n\t" %}
+  ins_encode %{
+    __ mov($dst$$Register, AsmOperand($src1$$Register, lsl, $src2$$Register)); // the shift is expressed as a MOV with a shifted operand
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Register Shift Left Immediate
+instruct shlI_reg_imm5(iRegI dst, iRegI src1, immU5 src2) %{ // int left shift by a constant amount: dst = src1 << src2
+  match(Set dst (LShiftI src1 src2));
+
+  size(4);
+#ifdef AARCH64
+  format %{ "LSL_w  $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ _lsl_w($dst$$Register, $src1$$Register, $src2$$constant); // 32-bit form, matching the LSL_w format and the _asr_w/_lsr_w used by the sibling int shifts
+  %}
+#else
+  format %{ "LSL    $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ logical_shift_left($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+#endif
+  ins_pipe(ialu_reg_imm);
+%}
+
+#ifndef AARCH64
+instruct shlL_reg_reg_merge_hi(iRegL dst, iRegL src1, iRegI src2) %{ // non-AARCH64 helper with no match rule; second step of the shlL_reg_reg expand
+  effect(USE_DEF dst, USE src1, USE src2); // dst.hi is both read and written here
+  size(4);
+  format %{"OR  $dst.hi,$dst.hi,($src1.hi << $src2)"  %}
+  ins_encode %{
+    __ orr($dst$$Register->successor(), $dst$$Register->successor(), AsmOperand($src1$$Register->successor(), lsl, $src2$$Register)); // ORs the shifted high word into dst.hi
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct shlL_reg_reg_merge_lo(iRegL dst, iRegL src1, iRegI src2) %{ // non-AARCH64 helper with no match rule; last step of the shlL_reg_reg expand: dst.lo = src1.lo << src2
+  effect(USE_DEF dst, USE src1, USE src2);
+  size(4);
+  format %{ "LSL  $dst.lo,$src1.lo,$src2 \n\t" %}
+  ins_encode %{
+    __ mov($dst$$Register, AsmOperand($src1$$Register, lsl, $src2$$Register)); // the shift is expressed as a MOV with a shifted operand
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct shlL_reg_reg_overlap(iRegL dst, iRegL src1, iRegI src2, flagsReg ccr) %{ // non-AARCH64 helper with no match rule; first step of the shlL_reg_reg expand: bits of src1.lo that land in dst.hi
+  effect(DEF dst, USE src1, USE src2, KILL ccr); // SUBS sets the flags that predicate the next three instructions
+  size(16);
+  format %{ "SUBS  $dst.hi,$src2,32 \n\t"
+            "LSLpl $dst.hi,$src1.lo,$dst.hi \n\t"
+            "RSBmi $dst.hi,$dst.hi,0 \n\t"
+            "LSRmi $dst.hi,$src1.lo,$dst.hi" %}
+
+  ins_encode %{
+    // $src1$$Register and $dst$$Register->successor() can't be the same
+    __ subs($dst$$Register->successor(), $src2$$Register, 32);
+    __ mov($dst$$Register->successor(), AsmOperand($src1$$Register, lsl, $dst$$Register->successor()), pl); // count >= 32: shift src1.lo left by (count - 32)
+    __ rsb($dst$$Register->successor(), $dst$$Register->successor(), 0, mi); // count < 32: turn (count - 32) into (32 - count)
+    __ mov($dst$$Register->successor(), AsmOperand($src1$$Register, lsr, $dst$$Register->successor()), mi); // count < 32: shift src1.lo right by (32 - count)
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif // !AARCH64
+
+instruct shlL_reg_reg(iRegL dst, iRegL src1, iRegI src2) %{ // long left shift by a register amount: one LSLV on AARCH64, otherwise expanded into three helper instructs
+  match(Set dst (LShiftL src1 src2));
+
+#ifdef AARCH64
+  size(4);
+  format %{ "LSLV  $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ lslv($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+#else
+  expand %{
+    flagsReg ccr;
+    shlL_reg_reg_overlap(dst, src1, src2, ccr);
+    shlL_reg_reg_merge_hi(dst, src1, src2);
+    shlL_reg_reg_merge_lo(dst, src1, src2);
+  %}
+#endif
+%}
+
+#ifdef AARCH64
+instruct shlL_reg_imm6(iRegL dst, iRegL src1, immU6 src2) %{ // AARCH64: long left shift by a constant amount
+  match(Set dst (LShiftL src1 src2));
+
+  size(4);
+  format %{ "LSL    $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ logical_shift_left($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+// Register Shift Left Immediate
+instruct shlL_reg_imm6(iRegL dst, iRegL src1, immU6Big src2) %{ // non-AARCH64: long left shift by an immU6Big constant; only src1.lo contributes to the result
+  match(Set dst (LShiftL src1 src2));
+
+  size(8);
+  format %{ "LSL   $dst.hi,$src1.lo,$src2-32\t! or mov if $src2==32\n\t"
+            "MOV   $dst.lo, 0" %}
+  ins_encode %{
+    if ($src2$$constant == 32) {
+      __ mov($dst$$Register->successor(), $src1$$Register); // shift by exactly 32: the low word becomes the high word
+    } else {
+      __ mov($dst$$Register->successor(), AsmOperand($src1$$Register, lsl, $src2$$constant-32));
+    }
+    __ mov($dst$$Register, 0); // the low word of the result is always zero here
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+instruct shlL_reg_imm5(iRegL dst, iRegL src1, immU5 src2) %{ // non-AARCH64: long left shift by an immU5 constant, built from three 32-bit operations
+  match(Set dst (LShiftL src1 src2));
+
+  size(12);
+  format %{ "LSL   $dst.hi,$src1.hi,$src2\n\t"
+            "OR    $dst.hi, $dst.hi, $src1.lo >> 32-$src2\n\t"
+            "LSL   $dst.lo,$src1.lo,$src2" %}
+  ins_encode %{
+    // The order of the following 3 instructions matters: src1.lo and
+    // dst.hi can't overlap but src1.hi and dst.hi can.
+    __ mov($dst$$Register->successor(), AsmOperand($src1$$Register->successor(), lsl, $src2$$constant)); // dst.hi = src1.hi shifted left
+    __ orr($dst$$Register->successor(), $dst$$Register->successor(), AsmOperand($src1$$Register, lsr, 32-$src2$$constant)); // bring in the bits carried out of src1.lo
+    __ mov($dst$$Register, AsmOperand($src1$$Register, lsl, $src2$$constant)); // dst.lo is written last
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif // !AARCH64
+
+// Register Arithmetic Shift Right
+instruct sarI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ // int arithmetic right shift by a register amount: dst = src1 >> src2
+  match(Set dst (RShiftI src1 src2));
+  size(4);
+#ifdef AARCH64
+  format %{ "ASRV   $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ asrv_w($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+#else
+  format %{ "ASR    $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ mov($dst$$Register, AsmOperand($src1$$Register, asr, $src2$$Register)); // the shift is expressed as a MOV with a shifted operand
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Register Arithmetic Shift Right Immediate
+instruct sarI_reg_imm5(iRegI dst, iRegI src1, immU5 src2) %{ // int arithmetic right shift by a constant amount: dst = src1 >> src2
+  match(Set dst (RShiftI src1 src2));
+
+  size(4);
+#ifdef AARCH64
+  format %{ "ASR_w  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ _asr_w($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+#else
+  format %{ "ASR    $dst,$src1,$src2" %}
+  ins_encode %{
+    __ mov($dst$$Register, AsmOperand($src1$$Register, asr, $src2$$constant)); // the shift is expressed as a MOV with a shifted operand
+  %}
+#endif
+  ins_pipe(ialu_reg_imm);
+%}
+
+#ifndef AARCH64
+// Register Shift Right Arithmetic Long
+instruct sarL_reg_reg_merge_lo(iRegL dst, iRegL src1, iRegI src2) %{ // non-AARCH64 helper with no match rule; second step of the sarL_reg_reg expand
+  effect(USE_DEF dst, USE src1, USE src2); // dst.lo is both read and written here
+  size(4);
+  format %{ "OR  $dst.lo,$dst.lo,($src1.lo >> $src2)"  %}
+  ins_encode %{
+    __ orr($dst$$Register, $dst$$Register, AsmOperand($src1$$Register, lsr, $src2$$Register)); // the low word is shifted logically (lsr) even though the long shift is arithmetic
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct sarL_reg_reg_merge_hi(iRegL dst, iRegL src1, iRegI src2) %{ // non-AARCH64 helper with no match rule; last step of the sarL_reg_reg expand: dst.hi = src1.hi >> src2 (arithmetic)
+  effect(USE_DEF dst, USE src1, USE src2);
+  size(4);
+  format %{ "ASR  $dst.hi,$src1.hi,$src2 \n\t" %}
+  ins_encode %{
+    __ mov($dst$$Register->successor(), AsmOperand($src1$$Register->successor(), asr, $src2$$Register)); // successor() is the high word of a pair
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct sarL_reg_reg_overlap(iRegL dst, iRegL src1, iRegI src2, flagsReg ccr) %{ // non-AARCH64 helper with no match rule; first step of the sarL_reg_reg expand: bits of src1.hi that land in dst.lo
+  effect(DEF dst, USE src1, USE src2, KILL ccr); // SUBS sets the flags that predicate the next three instructions
+  size(16);
+  format %{ "SUBS  $dst.lo,$src2,32 \n\t"
+            "ASRpl $dst.lo,$src1.hi,$dst.lo \n\t"
+            "RSBmi $dst.lo,$dst.lo,0 \n\t"
+            "LSLmi $dst.lo,$src1.hi,$dst.lo" %}
+
+  ins_encode %{
+    // $src1$$Register->successor() and $dst$$Register can't be the same
+    __ subs($dst$$Register, $src2$$Register, 32);
+    __ mov($dst$$Register, AsmOperand($src1$$Register->successor(), asr, $dst$$Register), pl); // count >= 32: arithmetic shift of src1.hi by (count - 32)
+    __ rsb($dst$$Register, $dst$$Register, 0, mi); // count < 32: turn (count - 32) into (32 - count)
+    __ mov($dst$$Register, AsmOperand($src1$$Register->successor(), lsl, $dst$$Register), mi); // count < 32: shift src1.hi left by (32 - count)
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif // !AARCH64
+
+instruct sarL_reg_reg(iRegL dst, iRegL src1, iRegI src2) %{ // long arithmetic right shift by a register amount: one ASRV on AARCH64, otherwise expanded into three helper instructs
+  match(Set dst (RShiftL src1 src2));
+
+#ifdef AARCH64
+  size(4);
+  format %{ "ASRV  $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ asrv($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+#else
+  expand %{
+    flagsReg ccr;
+    sarL_reg_reg_overlap(dst, src1, src2, ccr);
+    sarL_reg_reg_merge_lo(dst, src1, src2);
+    sarL_reg_reg_merge_hi(dst, src1, src2);
+  %}
+#endif
+%}
+
+// Register Arithmetic Shift Right Immediate
+#ifdef AARCH64
+instruct sarL_reg_imm6(iRegL dst, iRegL src1, immU6 src2) %{ // AARCH64: long arithmetic right shift by a constant amount
+  match(Set dst (RShiftL src1 src2));
+
+  size(4);
+  format %{ "ASR    $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ _asr($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+instruct sarL_reg_imm6(iRegL dst, iRegL src1, immU6Big src2) %{ // non-AARCH64: long arithmetic right shift by an immU6Big constant; only src1.hi contributes to the result
+  match(Set dst (RShiftL src1 src2));
+
+  size(8);
+  format %{ "ASR   $dst.lo,$src1.hi,$src2-32\t! or mov if $src2==32\n\t"
+            "ASR   $dst.hi,$src1.hi, $src2" %}
+  ins_encode %{
+    if ($src2$$constant == 32) {
+      __ mov($dst$$Register, $src1$$Register->successor()); // shift by exactly 32: the high word becomes the low word
+    } else{
+      __ mov($dst$$Register, AsmOperand($src1$$Register->successor(), asr, $src2$$constant-32));
+    }
+    __ mov($dst$$Register->successor(), AsmOperand($src1$$Register->successor(), asr, 0)); // NOTE(review): shift amount is literal 0 although the format prints the constant; presumably this encodes ASR #32 (sign fill) - confirm against AsmOperand
+  %}
+
+  ins_pipe(ialu_reg_imm);
+%}
+
+instruct sarL_reg_imm5(iRegL dst, iRegL src1, immU5 src2) %{ // non-AARCH64: long arithmetic right shift by an immU5 constant, built from three 32-bit operations
+  match(Set dst (RShiftL src1 src2));
+  size(12);
+  format %{ "LSR   $dst.lo,$src1.lo,$src2\n\t"
+            "OR    $dst.lo, $dst.lo, $src1.hi << 32-$src2\n\t"
+            "ASR   $dst.hi,$src1.hi,$src2" %}
+  ins_encode %{
+    // The order of the following 3 instructions matters: dst.lo is written
+    // first, so it must not be src1.hi (still read below); dst.hi is written last.
+    __ mov($dst$$Register, AsmOperand($src1$$Register, lsr, $src2$$constant)); // dst.lo = src1.lo shifted right logically
+    __ orr($dst$$Register, $dst$$Register, AsmOperand($src1$$Register->successor(), lsl, 32-$src2$$constant)); // bring in the bits shifted out of src1.hi
+    __ mov($dst$$Register->successor(), AsmOperand($src1$$Register->successor(), asr, $src2$$constant)); // dst.hi = src1.hi shifted right arithmetically
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif
+
+// Register Shift Right
+instruct shrI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ // int logical right shift by a register amount: dst = src1 >>> src2
+  match(Set dst (URShiftI src1 src2));
+  size(4);
+#ifdef AARCH64
+  format %{ "LSRV   $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ lsrv_w($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+#else
+  format %{ "LSR    $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ mov($dst$$Register, AsmOperand($src1$$Register, lsr, $src2$$Register)); // the shift is expressed as a MOV with a shifted operand
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Register Shift Right Immediate
+instruct shrI_reg_imm5(iRegI dst, iRegI src1, immU5 src2) %{ // int logical right shift by a constant amount: dst = src1 >>> src2
+  match(Set dst (URShiftI src1 src2));
+
+  size(4);
+#ifdef AARCH64
+  format %{ "LSR_w  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ _lsr_w($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+#else
+  format %{ "LSR    $dst,$src1,$src2" %}
+  ins_encode %{
+    __ mov($dst$$Register, AsmOperand($src1$$Register, lsr, $src2$$constant)); // the shift is expressed as a MOV with a shifted operand
+  %}
+#endif
+  ins_pipe(ialu_reg_imm);
+%}
+
+#ifndef AARCH64
+// Register Shift Right
+instruct shrL_reg_reg_merge_lo(iRegL dst, iRegL src1, iRegI src2) %{ // non-AARCH64 helper with no match rule; second step of the shrL_reg_reg expand
+  effect(USE_DEF dst, USE src1, USE src2); // dst.lo is both read and written here
+  size(4);
+  format %{ "OR   $dst.lo,$dst.lo,($src1.lo >>> $src2)"  %}
+  ins_encode %{
+    __ orr($dst$$Register, $dst$$Register, AsmOperand($src1$$Register, lsr, $src2$$Register)); // ORs the shifted low word into dst.lo
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct shrL_reg_reg_merge_hi(iRegL dst, iRegL src1, iRegI src2) %{ // non-AARCH64 helper with no match rule; last step of the shrL_reg_reg expand: dst.hi = src1.hi >>> src2
+  effect(USE_DEF dst, USE src1, USE src2);
+  size(4);
+  format %{ "LSR  $dst.hi,$src1.hi,$src2 \n\t" %}
+  ins_encode %{
+    __ mov($dst$$Register->successor(), AsmOperand($src1$$Register->successor(), lsr, $src2$$Register)); // successor() is the high word of a pair
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct shrL_reg_reg_overlap(iRegL dst, iRegL src1, iRegI src2, flagsReg ccr) %{ // non-AARCH64 helper with no match rule; first step of the shrL_reg_reg expand: bits of src1.hi that land in dst.lo
+  effect(DEF dst, USE src1, USE src2, KILL ccr); // SUBS sets the flags that predicate the next three instructions
+  size(16);
+  format %{ "SUBS  $dst.lo,$src2,32 \n\t"
+            "LSRpl $dst.lo,$src1.hi,$dst.lo \n\t"
+            "RSBmi $dst.lo,$dst.lo,0 \n\t"
+            "LSLmi $dst.lo,$src1.hi,$dst.lo" %}
+
+  ins_encode %{
+    // $src1$$Register->successor() and $dst$$Register can't be the same
+    __ subs($dst$$Register, $src2$$Register, 32);
+    __ mov($dst$$Register, AsmOperand($src1$$Register->successor(), lsr, $dst$$Register), pl); // count >= 32: logical shift of src1.hi by (count - 32)
+    __ rsb($dst$$Register, $dst$$Register, 0, mi); // count < 32: turn (count - 32) into (32 - count)
+    __ mov($dst$$Register, AsmOperand($src1$$Register->successor(), lsl, $dst$$Register), mi); // count < 32: shift src1.hi left by (32 - count)
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif // !AARCH64
+
+instruct shrL_reg_reg(iRegL dst, iRegL src1, iRegI src2) %{ // long logical right shift by a register amount: one LSRV on AARCH64, otherwise expanded into three helper instructs
+  match(Set dst (URShiftL src1 src2));
+
+#ifdef AARCH64
+  size(4);
+  format %{ "LSRV  $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ lsrv($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+#else
+  expand %{
+    flagsReg ccr;
+    shrL_reg_reg_overlap(dst, src1, src2, ccr);
+    shrL_reg_reg_merge_lo(dst, src1, src2);
+    shrL_reg_reg_merge_hi(dst, src1, src2);
+  %}
+#endif
+%}
+
+// Register Shift Right Immediate
+#ifdef AARCH64
+instruct shrL_reg_imm6(iRegL dst, iRegL src1, immU6 src2) %{ // AARCH64: long logical right shift by a constant amount
+  match(Set dst (URShiftL src1 src2));
+
+  size(4);
+  format %{ "LSR    $dst,$src1,$src2" %}
+  ins_encode %{
+    __ _lsr($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+instruct shrL_reg_imm6(iRegL dst, iRegL src1, immU6Big src2) %{ // non-AARCH64: long logical right shift by an immU6Big constant; only src1.hi contributes to the result
+  match(Set dst (URShiftL src1 src2));
+
+  size(8);
+  format %{ "LSR   $dst.lo,$src1.hi,$src2-32\t! or mov if $src2==32\n\t"
+            "MOV   $dst.hi, 0" %}
+  ins_encode %{
+    if ($src2$$constant == 32) {
+      __ mov($dst$$Register, $src1$$Register->successor()); // shift by exactly 32: the high word becomes the low word
+    } else {
+      __ mov($dst$$Register, AsmOperand($src1$$Register->successor(), lsr, $src2$$constant-32));
+    }
+    __ mov($dst$$Register->successor(), 0); // the high word of the result is always zero here
+  %}
+
+  ins_pipe(ialu_reg_imm);
+%}
+
+instruct shrL_reg_imm5(iRegL dst, iRegL src1, immU5 src2) %{ // non-AARCH64: long logical right shift by an immU5 constant, built from three 32-bit operations
+  match(Set dst (URShiftL src1 src2));
+
+  size(12);
+  format %{ "LSR   $dst.lo,$src1.lo,$src2\n\t"
+            "OR    $dst.lo, $dst.lo, $src1.hi << 32-$src2\n\t"
+            "LSR   $dst.hi,$src1.hi,$src2" %}
+  ins_encode %{
+    // The order of the following 3 instructions matters: dst.lo is written
+    // first, so it must not be src1.hi (still read below); dst.hi is written last.
+    __ mov($dst$$Register, AsmOperand($src1$$Register, lsr, $src2$$constant)); // dst.lo = src1.lo shifted right logically
+    __ orr($dst$$Register, $dst$$Register, AsmOperand($src1$$Register->successor(), lsl, 32-$src2$$constant)); // bring in the bits shifted out of src1.hi
+    __ mov($dst$$Register->successor(), AsmOperand($src1$$Register->successor(), lsr, $src2$$constant)); // dst.hi = src1.hi shifted right logically
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif // !AARCH64
+
+
+instruct shrP_reg_imm5(iRegX dst, iRegP src1, immU5 src2) %{ // logical right shift of a pointer cast to an integer (CastP2X) by a constant amount
+  match(Set dst (URShiftI (CastP2X src1) src2));
+  size(4);
+  format %{ "LSR    $dst,$src1,$src2\t! Cast ptr $src1 to int and shift" %}
+  ins_encode %{
+    __ logical_shift_right($dst$$Register, $src1$$Register, $src2$$constant); // the cast itself emits no code; the pointer register is shifted directly
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+//----------Floating Point Arithmetic Instructions-----------------------------
+
+//  Add float single precision
+instruct addF_reg_reg(regF dst, regF src1, regF src2) %{ // single-precision add: dst = src1 + src2
+  match(Set dst (AddF src1 src2));
+
+  size(4);
+  format %{ "FADDS  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ add_float($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+
+  ins_pipe(faddF_reg_reg);
+%}
+
+//  Add float double precision
+instruct addD_reg_reg(regD dst, regD src1, regD src2) %{ // double-precision add: dst = src1 + src2
+  match(Set dst (AddD src1 src2));
+
+  size(4);
+  format %{ "FADDD  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ add_double($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+
+  ins_pipe(faddD_reg_reg);
+%}
+
+//  Sub float single precision
+instruct subF_reg_reg(regF dst, regF src1, regF src2) %{ // single-precision subtract: dst = src1 - src2
+  match(Set dst (SubF src1 src2));
+
+  size(4);
+  format %{ "FSUBS  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ sub_float($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(faddF_reg_reg); // shares the pipeline class used by the single-precision add
+%}
+
+//  Sub float double precision
+instruct subD_reg_reg(regD dst, regD src1, regD src2) %{ // double-precision subtract: dst = src1 - src2
+  match(Set dst (SubD src1 src2));
+
+  size(4);
+  format %{ "FSUBD  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ sub_double($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(faddD_reg_reg); // shares the pipeline class used by the double-precision add
+%}
+
+//  Mul float single precision
+instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{ // single-precision multiply: dst = src1 * src2
+  match(Set dst (MulF src1 src2));
+
+  size(4);
+  format %{ "FMULS  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ mul_float($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+
+  ins_pipe(fmulF_reg_reg);
+%}
+
+//  Mul float double precision
+instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{ // double-precision multiply: dst = src1 * src2
+  match(Set dst (MulD src1 src2));
+
+  size(4);
+  format %{ "FMULD  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ mul_double($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+
+  ins_pipe(fmulD_reg_reg);
+%}
+
+//  Div float single precision
+instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
+  match(Set dst (DivF src1 src2));
+
+  size(4);
+  format %{ "FDIVS  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ div_float($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+
+  ins_pipe(fdivF_reg_reg);
+%}
+
+//  Div float double precision
+instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
+  match(Set dst (DivD src1 src2));
+
+  size(4);
+  format %{ "FDIVD  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ div_double($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+
+  ins_pipe(fdivD_reg_reg);
+%}
+
+//  Absolute float double precision
+instruct absD_reg(regD dst, regD src) %{
+  match(Set dst (AbsD src));
+
+  size(4);
+  format %{ "FABSd  $dst,$src" %}
+  ins_encode %{
+    __ abs_double($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(faddD_reg);
+%}
+
+//  Absolute value, single precision (matches the AbsF node)
+instruct absF_reg(regF dst, regF src) %{
+  match(Set dst (AbsF src));  // NOTE(review): no size() declared here, unlike absD_reg; confirm this is intentional
+  format %{ "FABSs  $dst,$src" %}
+  ins_encode %{
+    __ abs_float($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(faddF_reg);
+%}
+
+instruct negF_reg(regF dst, regF src) %{
+  match(Set dst (NegF src));
+
+  size(4);
+  format %{ "FNEGs  $dst,$src" %}
+  ins_encode %{
+    __ neg_float($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(faddF_reg);
+%}
+
+instruct negD_reg(regD dst, regD src) %{
+  match(Set dst (NegD src));  // NOTE(review): no size() declared here, unlike negF_reg; confirm this is intentional
+
+  format %{ "FNEGd  $dst,$src" %}
+  ins_encode %{
+    __ neg_double($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(faddD_reg);
+%}
+
+//  Sqrt float single precision
+instruct sqrtF_reg_reg(regF dst, regF src) %{
+  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));  // float -> double -> sqrt -> float, collapsed into one single-precision rule
+
+  size(4);
+  format %{ "FSQRTS $dst,$src" %}
+  ins_encode %{
+    __ sqrt_float($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(fdivF_reg_reg);
+%}
+
+//  Sqrt float double precision
+instruct sqrtD_reg_reg(regD dst, regD src) %{
+  match(Set dst (SqrtD src));
+
+  size(4);
+  format %{ "FSQRTD $dst,$src" %}
+  ins_encode %{
+    __ sqrt_double($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(fdivD_reg_reg);
+%}
+
+//----------Logical Instructions-----------------------------------------------
+// And Instructions
+// Register And
+instruct andI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{
+  match(Set dst (AndI src1 src2));
+
+  size(4);
+  format %{ "and_32 $dst,$src1,$src2" %}
+  ins_encode %{
+    __ and_32($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct andshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (AndI src1 (LShiftI src2 src3)));
+
+  size(4);
+  format %{ "AND    $dst,$src1,$src2<<$src3" %}
+  ins_encode %{
+    __ andr($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsl, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct andshlI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (AndI src1 (LShiftI src2 src3)));
+
+  size(4);
+  format %{ "and_32 $dst,$src1,$src2<<$src3" %}
+  ins_encode %{
+    __ and_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsl, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct andsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (AndI src1 (RShiftI src2 src3)));
+
+  size(4);
+  format %{ "AND    $dst,$src1,$src2>>$src3" %}
+  ins_encode %{
+    __ andr($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, asr, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct andsarI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (AndI src1 (RShiftI src2 src3)));
+
+  size(4);
+  format %{ "and_32 $dst,$src1,$src2>>$src3" %}
+  ins_encode %{
+    __ and_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, asr, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct andshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (AndI src1 (URShiftI src2 src3)));
+
+  size(4);
+  format %{ "AND    $dst,$src1,$src2>>>$src3" %}
+  ins_encode %{
+    __ andr($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsr, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct andshrI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (AndI src1 (URShiftI src2 src3)));
+
+  size(4);
+  format %{ "and_32 $dst,$src1,$src2>>>$src3" %}
+  ins_encode %{
+    __ and_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsr, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Immediate And
+instruct andI_reg_limm(iRegI dst, iRegI src1, limmI src2) %{
+  match(Set dst (AndI src1 src2));
+
+  size(4);
+  format %{ "and_32 $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ and_32($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+#ifndef AARCH64
+instruct andI_reg_limmn(iRegI dst, iRegI src1, limmIn src2) %{
+  match(Set dst (AndI src1 src2));  // AndI with a constant, emitted as BIC of the complemented constant
+
+  size(4);
+  format %{ "bic    $dst,$src1,~$src2\t! int" %}
+  ins_encode %{
+    __ bic($dst$$Register, $src1$$Register, ~$src2$$constant);  // BIC clears the bits set in its operand: src1 & ~(~src2) == src1 & src2
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif
+
+// Register And Long: one 64-bit AND on AARCH64, two 32-bit ANDs (low word, then high word) otherwise
+instruct andL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{
+  match(Set dst (AndL src1 src2));
+
+  ins_cost(DEFAULT_COST);
+#ifdef AARCH64
+  size(4);
+  format %{ "AND    $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ andr($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+#else
+  size(8);
+  format %{ "AND    $dst.lo,$src1.lo,$src2.lo\t! long\n\t" "AND    $dst.hi,$src1.hi,$src2.hi" %}
+  ins_encode %{
+    __ andr($dst$$Register, $src1$$Register, $src2$$Register);  // low words
+    __ andr($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor());  // high words
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifdef AARCH64
+// Immediate And
+instruct andL_reg_limm(iRegL dst, iRegL src1, limmL src2) %{
+  match(Set dst (AndL src1 src2));
+
+  size(4);
+  format %{ "AND    $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ andr($dst$$Register, $src1$$Register, (uintx)$src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+// TODO: try immLRot2 instead, so that (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant))
+instruct andL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con) %{
+  match(Set dst (AndL src1 con));
+  ins_cost(DEFAULT_COST);
+  size(8);
+  format %{ "AND    $dst.lo,$src1.lo,$con\t! long\n\t" "AND    $dst.hi,$src1.hi,0" %}
+  ins_encode %{
+    __ andr($dst$$Register, $src1$$Register, $con$$constant);  // low word: AND with the immediate
+    __ andr($dst$$Register->successor(), $src1$$Register->successor(), 0);  // high word: AND with 0
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif
+
+// Or Instructions
+// Register Or
+instruct orI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{
+  match(Set dst (OrI src1 src2));
+
+  size(4);
+  format %{ "orr_32 $dst,$src1,$src2\t! int" %}
+  ins_encode %{
+    __ orr_32($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct orshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (OrI src1 (LShiftI src2 src3)));
+
+  size(4);
+  format %{ "OR    $dst,$src1,$src2<<$src3" %}
+  ins_encode %{
+    __ orr($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsl, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct orshlI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (OrI src1 (LShiftI src2 src3)));
+
+  size(4);
+  format %{ "orr_32 $dst,$src1,$src2<<$src3" %}
+  ins_encode %{
+    __ orr_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsl, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct orsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (OrI src1 (RShiftI src2 src3)));
+
+  size(4);
+  format %{ "OR    $dst,$src1,$src2>>$src3" %}
+  ins_encode %{
+    __ orr($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, asr, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct orsarI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (OrI src1 (RShiftI src2 src3)));
+
+  size(4);
+  format %{ "orr_32 $dst,$src1,$src2>>$src3" %}
+  ins_encode %{
+    __ orr_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, asr, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct orshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (OrI src1 (URShiftI src2 src3)));
+
+  size(4);
+  format %{ "OR    $dst,$src1,$src2>>>$src3" %}
+  ins_encode %{
+    __ orr($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsr, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct orshrI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (OrI src1 (URShiftI src2 src3)));
+
+  size(4);
+  format %{ "orr_32 $dst,$src1,$src2>>>$src3" %}
+  ins_encode %{
+    __ orr_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsr, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Immediate Or
+instruct orI_reg_limm(iRegI dst, iRegI src1, limmI src2) %{
+  match(Set dst (OrI src1 src2));
+
+  size(4);
+  format %{ "orr_32  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ orr_32($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+// TODO: orn_32 with limmIn
+
+// Register Or Long
+instruct orL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{
+  match(Set dst (OrL src1 src2));
+
+  ins_cost(DEFAULT_COST);
+#ifdef AARCH64
+  size(4);
+  format %{ "OR     $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ orr($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+#else
+  size(8);
+  format %{ "OR     $dst.lo,$src1.lo,$src2.lo\t! long\n\t"
+            "OR     $dst.hi,$src1.hi,$src2.hi" %}
+  ins_encode %{
+    __ orr($dst$$Register, $src1$$Register, $src2$$Register);
+    __ orr($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor());
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifdef AARCH64
+instruct orL_reg_limm(iRegL dst, iRegL src1, limmL src2) %{
+  match(Set dst (OrL src1 src2));
+
+  size(4);
+  format %{ "ORR    $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ orr($dst$$Register, $src1$$Register, (uintx)$src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+// TODO: try immLRot2 instead, so that (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant))
+instruct orL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con) %{
+  match(Set dst (OrL src1 con));
+  ins_cost(DEFAULT_COST);
+  size(8);
+  format %{ "OR     $dst.lo,$src1.lo,$con\t! long\n\t"
+            "OR     $dst.hi,$src1.hi,0" %}
+  ins_encode %{
+    __ orr($dst$$Register, $src1$$Register, $con$$constant);  // low word: OR with the immediate
+    __ orr($dst$$Register->successor(), $src1$$Register->successor(), 0);  // high word: OR with 0
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif
+
+#ifdef TODO
+// Use SPRegP to match Rthread (TLS register) without spilling.
+// Use store_ptr_RegP to match Rthread (TLS register) without spilling.
+// Use sp_ptr_RegP to match Rthread (TLS register) without spilling.
+instruct orI_reg_castP2X(iRegI dst, iRegI src1, sp_ptr_RegP src2) %{
+  match(Set dst (OrI src1 (CastP2X src2)));
+  size(4);
+  format %{ "OR     $dst,$src1,$src2" %}
+  ins_encode %{
+    __ orr($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+// Xor Instructions
+// Register Xor
+instruct xorI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{
+  match(Set dst (XorI src1 src2));
+
+  size(4);
+  format %{ "eor_32 $dst,$src1,$src2" %}
+  ins_encode %{
+    __ eor_32($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct xorshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (XorI src1 (LShiftI src2 src3)));
+
+  size(4);
+  format %{ "XOR    $dst,$src1,$src2<<$src3" %}
+  ins_encode %{
+    __ eor($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsl, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct xorshlI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (XorI src1 (LShiftI src2 src3)));
+
+  size(4);
+  format %{ "eor_32 $dst,$src1,$src2<<$src3" %}
+  ins_encode %{
+    __ eor_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsl, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct xorsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (XorI src1 (RShiftI src2 src3)));
+
+  size(4);
+  format %{ "XOR    $dst,$src1,$src2>>$src3" %}
+  ins_encode %{
+    __ eor($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, asr, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct xorsarI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (XorI src1 (RShiftI src2 src3)));
+
+  size(4);
+  format %{ "eor_32 $dst,$src1,$src2>>$src3" %}
+  ins_encode %{
+    __ eor_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, asr, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifndef AARCH64
+instruct xorshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{
+  match(Set dst (XorI src1 (URShiftI src2 src3)));
+
+  size(4);
+  format %{ "XOR    $dst,$src1,$src2>>>$src3" %}
+  ins_encode %{
+    __ eor($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsr, $src3$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+#endif
+
+instruct xorshrI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{
+  match(Set dst (XorI src1 (URShiftI src2 src3)));
+
+  size(4);
+  format %{ "eor_32 $dst,$src1,$src2>>>$src3" %}
+  ins_encode %{
+    __ eor_32($dst$$Register, $src1$$Register, AsmOperand($src2$$Register, lsr, $src3$$constant));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Immediate Xor
+instruct xorI_reg_imm(iRegI dst, iRegI src1, limmI src2) %{
+  match(Set dst (XorI src1 src2));
+
+  size(4);
+  format %{ "eor_32 $dst,$src1,$src2" %}
+  ins_encode %{
+    __ eor_32($dst$$Register, $src1$$Register, $src2$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+
+// Register Xor Long: one 64-bit EOR on AARCH64, two 32-bit EORs (low word, then high word) otherwise
+instruct xorL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{
+  match(Set dst (XorL src1 src2));
+  ins_cost(DEFAULT_COST);
+#ifdef AARCH64
+  size(4);
+  format %{ "XOR     $dst,$src1,$src2\t! long" %}
+  ins_encode %{
+    __ eor($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+#else
+  size(8);
+  format %{ "XOR     $dst.lo,$src1.lo,$src2.lo\t! long\n\t"
+            "XOR     $dst.hi,$src1.hi,$src2.hi\t! long" %}
+  ins_encode %{
+    __ eor($dst$$Register, $src1$$Register, $src2$$Register);  // low words
+    __ eor($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor());  // high words
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+#ifdef AARCH64
+instruct xorL_reg_limmL(iRegL dst, iRegL src1, limmL con) %{
+  match(Set dst (XorL src1 con));
+  ins_cost(DEFAULT_COST);
+  size(4);
+  format %{ "EOR     $dst,$src1,$con\t! long" %}
+  ins_encode %{
+    __ eor($dst$$Register, $src1$$Register, (uintx)$con$$constant);
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#else
+// TODO: try immLRot2 instead, so that (0, $con$$constant) becomes
+// (hi($con$$constant), lo($con$$constant))
+instruct xorL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con) %{
+  match(Set dst (XorL src1 con));
+  ins_cost(DEFAULT_COST);
+  size(8);
+  format %{ "XOR     $dst.lo,$src1.lo,$con\t! long\n\t"
+            "XOR     $dst.hi,$src1.hi,0\t! long" %}
+  ins_encode %{
+    __ eor($dst$$Register, $src1$$Register, $con$$constant);  // low word: XOR with the immediate
+    __ eor($dst$$Register->successor(), $src1$$Register->successor(), 0);  // high word: XOR with 0
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif // AARCH64
+
+//----------Convert to Boolean-------------------------------------------------
+instruct convI2B( iRegI dst, iRegI src, flagsReg ccr ) %{
+  match(Set dst (Conv2B src));  // int to boolean: dst = (src != 0) ? 1 : 0
+  effect(KILL ccr);  // the compare/test below overwrites the condition flags
+#ifdef AARCH64
+  size(8);
+  ins_cost(DEFAULT_COST*2);
+  format %{ "cmp_32 $src,ZR\n\t"
+            "cset_w $dst, ne" %}
+  ins_encode %{
+    __ cmp_32($src$$Register, ZR);
+    __ cset_w($dst$$Register, ne);
+  %}
+#else
+  size(12);
+  ins_cost(DEFAULT_COST*2);
+  format %{ "TST    $src,$src \n\t"
+            "MOV    $dst, 0   \n\t"
+            "MOV.ne $dst, 1" %}
+  ins_encode %{ // FIXME: can do better?
+    __ tst($src$$Register, $src$$Register);  // test src against itself, so 'ne' means src != 0
+    __ mov($dst$$Register, 0);  // default result
+    __ mov($dst$$Register, 1, ne);  // overwritten with 1 only under 'ne'
+  %}
+#endif
+  ins_pipe(ialu_reg_ialu);
+%}
+
+instruct convP2B( iRegI dst, iRegP src, flagsReg ccr ) %{
+  match(Set dst (Conv2B src));  // pointer to boolean: dst = (src != 0) ? 1 : 0
+  effect(KILL ccr);  // the compare/test below overwrites the condition flags
+#ifdef AARCH64
+  size(8);
+  ins_cost(DEFAULT_COST*2);
+  format %{ "CMP    $src,ZR\n\t"
+            "cset   $dst, ne" %}
+  ins_encode %{
+    __ cmp($src$$Register, ZR);
+    __ cset($dst$$Register, ne);
+  %}
+#else
+  size(12);
+  ins_cost(DEFAULT_COST*2);
+  format %{ "TST    $src,$src \n\t"
+            "MOV    $dst, 0   \n\t"
+            "MOV.ne $dst, 1" %}
+  ins_encode %{
+    __ tst($src$$Register, $src$$Register);  // test src against itself, so 'ne' means src != 0
+    __ mov($dst$$Register, 0);  // default result
+    __ mov($dst$$Register, 1, ne);  // overwritten with 1 only under 'ne'
+  %}
+#endif
+  ins_pipe(ialu_reg_ialu);
+%}
+
+instruct cmpLTMask_reg_reg( iRegI dst, iRegI p, iRegI q, flagsReg ccr ) %{
+  match(Set dst (CmpLTMask p q));  // dst = (p < q) ? -1 : 0
+  effect( KILL ccr );  // the compare below overwrites the condition flags
+#ifdef AARCH64
+  size(8);
+  ins_cost(DEFAULT_COST*2);
+  format %{ "CMP_w   $p,$q\n\t"
+            "CSETM_w $dst, lt" %}
+  ins_encode %{
+    __ cmp_w($p$$Register, $q$$Register);
+    __ csetm_w($dst$$Register, lt);
+  %}
+#else
+  ins_cost(DEFAULT_COST*3);
+  format %{ "CMP    $p,$q\n\t"
+            "MOV    $dst, #0\n\t"
+            "MOV.lt $dst, #-1" %}
+  ins_encode %{
+    __ cmp($p$$Register, $q$$Register);
+    __ mov($dst$$Register, 0);  // default mask: 0
+    __ mvn($dst$$Register, 0, lt);  // MVN of 0 writes ~0 == -1; the format above prints this as "MOV.lt ..., #-1"
+  %}
+#endif
+  ins_pipe(ialu_reg_reg_ialu);
+%}
+
+instruct cmpLTMask_reg_imm( iRegI dst, iRegI p, aimmI q, flagsReg ccr ) %{
+  match(Set dst (CmpLTMask p q));  // dst = (p < q) ? -1 : 0, with q an immediate
+  effect( KILL ccr );  // the compare below overwrites the condition flags
+#ifdef AARCH64
+  size(8);
+  ins_cost(DEFAULT_COST*2);
+  format %{ "CMP_w   $p,$q\n\t"
+            "CSETM_w $dst, lt" %}
+  ins_encode %{
+    __ cmp_w($p$$Register, $q$$constant);
+    __ csetm_w($dst$$Register, lt);
+  %}
+#else
+  ins_cost(DEFAULT_COST*3);
+  format %{ "CMP    $p,$q\n\t"
+            "MOV    $dst, #0\n\t"
+            "MOV.lt $dst, #-1" %}
+  ins_encode %{
+    __ cmp($p$$Register, $q$$constant);
+    __ mov($dst$$Register, 0);  // default mask: 0
+    __ mvn($dst$$Register, 0, lt);  // MVN of 0 writes ~0 == -1; the format above prints this as "MOV.lt ..., #-1"
+  %}
+#endif
+  ins_pipe(ialu_reg_reg_ialu);
+%}
+
+#ifdef AARCH64
+instruct cadd_cmpLTMask3( iRegI dst, iRegI p, iRegI q, iRegI y, iRegI x, flagsReg ccr ) %{
+  match(Set dst (AddI (AndI (CmpLTMask p q) y) x));
+  effect( TEMP dst, KILL ccr );
+  size(12);
+  ins_cost(DEFAULT_COST*3);
+  format %{ "CMP_w  $p,$q\n\t"
+            "ADD_w  $dst,$y,$x\n\t"
+            "CSEL_w $dst,$dst,$x,lt" %}
+  ins_encode %{
+    __ cmp_w($p$$Register, $q$$Register);
+    __ add_w($dst$$Register, $y$$Register, $x$$Register);
+    __ csel_w($dst$$Register, $dst$$Register, $x$$Register, lt);
+  %}
+  ins_pipe( cadd_cmpltmask );
+%}
+#else
+instruct cadd_cmpLTMask3( iRegI p, iRegI q, iRegI y, iRegI z, flagsReg ccr ) %{
+  match(Set z (AddI (AndI (CmpLTMask p q) y) z));
+  effect( KILL ccr );
+  ins_cost(DEFAULT_COST*2);
+  format %{ "CMP    $p,$q\n\t"
+            "ADD.lt $z,$y,$z" %}
+  ins_encode %{
+    __ cmp($p$$Register, $q$$Register);
+    __ add($z$$Register, $y$$Register, $z$$Register, lt);
+  %}
+  ins_pipe( cadd_cmpltmask );
+%}
+#endif
+
+#ifdef AARCH64
+instruct cadd_cmpLTMask4( iRegI dst, iRegI p, aimmI q, iRegI y, iRegI x, flagsReg ccr ) %{
+  match(Set dst (AddI (AndI (CmpLTMask p q) y) x));
+  effect( TEMP dst, KILL ccr );
+  size(12);
+  ins_cost(DEFAULT_COST*3);
+  format %{ "CMP_w  $p,$q\n\t"
+            "ADD_w  $dst,$y,$x\n\t"
+            "CSEL_w $dst,$dst,$x,lt" %}
+  ins_encode %{
+    __ cmp_w($p$$Register, $q$$constant);
+    __ add_w($dst$$Register, $y$$Register, $x$$Register);
+    __ csel_w($dst$$Register, $dst$$Register, $x$$Register, lt);
+  %}
+  ins_pipe( cadd_cmpltmask );
+%}
+#else
+// FIXME: remove unused "dst"
+instruct cadd_cmpLTMask4( iRegI dst, iRegI p, aimmI q, iRegI y, iRegI z, flagsReg ccr ) %{
+  match(Set z (AddI (AndI (CmpLTMask p q) y) z));
+  effect( KILL ccr );
+  ins_cost(DEFAULT_COST*2);
+  format %{ "CMP    $p,$q\n\t"
+            "ADD.lt $z,$y,$z" %}
+  ins_encode %{
+    __ cmp($p$$Register, $q$$constant);
+    __ add($z$$Register, $y$$Register, $z$$Register, lt);
+  %}
+  ins_pipe( cadd_cmpltmask );
+%}
+#endif // !AARCH64
+
+#ifdef AARCH64
+instruct cadd_cmpLTMask( iRegI dst, iRegI p, iRegI q, iRegI y, flagsReg ccr ) %{
+  match(Set dst (AddI (AndI (CmpLTMask p q) y) (SubI p q)));  // dst = (p - q) + ((p < q) ? y : 0)
+  effect( TEMP dst, KILL ccr );
+  size(12);
+  ins_cost(DEFAULT_COST*3);
+  format %{ "SUBS_w $p,$p,$q\n\t"
+            "ADD_w  $dst,$y,$p\n\t"
+            "CSEL_w $dst,$dst,$p,lt" %}
+  ins_encode %{
+    __ subs_w($p$$Register, $p$$Register, $q$$Register);  // NOTE(review): overwrites input $p, which effect() neither defines nor kills - confirm this is safe
+    __ add_w($dst$$Register, $y$$Register, $p$$Register);  // dst = y + (p - q)
+    __ csel_w($dst$$Register, $dst$$Register, $p$$Register, lt);  // keep the sum under 'lt', otherwise take (p - q) alone
+  %}
+  ins_pipe( cadd_cmpltmask ); // FIXME
+%}
+#else
+instruct cadd_cmpLTMask( iRegI p, iRegI q, iRegI y, flagsReg ccr ) %{
+  match(Set p (AddI (AndI (CmpLTMask p q) y) (SubI p q)));
+  effect( KILL ccr );
+  ins_cost(DEFAULT_COST*2);
+  format %{ "SUBS   $p,$p,$q\n\t"
+            "ADD.lt $p,$y,$p" %}
+  ins_encode %{
+    __ subs($p$$Register, $p$$Register, $q$$Register);
+    __ add($p$$Register, $y$$Register, $p$$Register, lt);
+  %}
+  ins_pipe( cadd_cmpltmask );
+%}
+#endif
+
+//----------Arithmetic Conversion Instructions---------------------------------
+// The conversion operations are all sorted alphabetically.  Please keep it that way!
+
+instruct convD2F_reg(regF dst, regD src) %{
+  match(Set dst (ConvD2F src));
+  size(4);
+  format %{ "FCVTSD  $dst,$src" %}
+  ins_encode %{
+    __ convert_d2f($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(fcvtD2F);
+%}
+
+// Convert a double to an int in a float register.
+// If the double is a NAN, stuff a zero in instead.
+
+#ifdef AARCH64
+instruct convD2I_reg_reg(iRegI dst, regD src) %{
+  match(Set dst (ConvD2I src));
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  format %{ "FCVTZS_wd $dst, $src" %}
+  ins_encode %{
+    __ fcvtzs_wd($dst$$Register, $src$$FloatRegister);
+  %}
+  ins_pipe(fcvtD2I);
+%}
+
+instruct convD2L_reg_reg(iRegL dst, regD src) %{
+  match(Set dst (ConvD2L src));
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  format %{ "FCVTZS_xd $dst, $src" %}
+  ins_encode %{
+    __ fcvtzs_xd($dst$$Register, $src$$FloatRegister);
+  %}
+  ins_pipe(fcvtD2L);
+%}
+#else
+instruct convD2I_reg_reg(iRegI dst, regD src, regF tmp) %{
+  match(Set dst (ConvD2I src));
+  effect( TEMP tmp );  // tmp is a scratch float register that receives the converted value
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  format %{ "FTOSIZD  $tmp,$src\n\t"
+            "FMRS     $dst, $tmp" %}
+  ins_encode %{
+    __ ftosizd($tmp$$FloatRegister, $src$$FloatRegister);  // convert inside the FP register file
+    __ fmrs($dst$$Register, $tmp$$FloatRegister);  // then move the result into the integer register
+  %}
+  ins_pipe(fcvtD2I);
+%}
+#endif
+
+// Convert a double to a long.  The 32-bit rule below calls SharedRuntime::d2l and
+// returns the result in an R0R1RegL operand.  If the double is a NAN, stuff a zero in instead.
+
+#ifndef AARCH64
+// Double to Long conversion, implemented as a runtime call to SharedRuntime::d2l
+instruct convD2L_reg(R0R1RegL dst, regD src) %{
+  match(Set dst (ConvD2L src));
+  effect(CALL);
+  ins_cost(MEMORY_REF_COST); // FIXME
+  format %{ "convD2L    $dst,$src\t ! call to SharedRuntime::d2l" %}
+  ins_encode %{
+#ifndef __ABI_HARD__
+    __ fmrrd($dst$$Register, $dst$$Register->successor(), $src$$FloatRegister);  // without __ABI_HARD__: pass the argument in the $dst core register pair
+#else
+    if ($src$$FloatRegister != D0) {  // with __ABI_HARD__: pass the argument in D0, skipping the copy if it is already there
+      __ mov_double(D0, $src$$FloatRegister);
+    }
+#endif
+    address target = CAST_FROM_FN_PTR(address, SharedRuntime::d2l);
+    __ call(target, relocInfo::runtime_call_type);
+  %}
+  ins_pipe(fcvtD2L);
+%}
+#endif
+
+instruct convF2D_reg(regD dst, regF src) %{
+  match(Set dst (ConvF2D src));
+  size(4);
+  format %{ "FCVTDS  $dst,$src" %}
+  ins_encode %{
+    __ convert_f2d($dst$$FloatRegister, $src$$FloatRegister);
+  %}
+  ins_pipe(fcvtF2D);
+%}
+
+#ifdef AARCH64
+instruct convF2I_reg_reg(iRegI dst, regF src) %{
+  match(Set dst (ConvF2I src));
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  size(4);
+  format %{ "FCVTZS_ws $dst, $src" %}
+  ins_encode %{
+    __ fcvtzs_ws($dst$$Register, $src$$FloatRegister);
+  %}
+  ins_pipe(fcvtF2I);
+%}
+
+instruct convF2L_reg_reg(iRegL dst, regF src) %{
+  match(Set dst (ConvF2L src));
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  size(4);
+  format %{ "FCVTZS_xs $dst, $src" %}
+  ins_encode %{
+    __ fcvtzs_xs($dst$$Register, $src$$FloatRegister);
+  %}
+  ins_pipe(fcvtF2L);
+%}
+#else
+instruct convF2I_reg_reg(iRegI dst, regF src, regF tmp) %{
+  match(Set dst (ConvF2I src));
+  effect( TEMP tmp );
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  size(8);
+  format %{ "FTOSIZS  $tmp,$src\n\t"
+            "FMRS     $dst, $tmp" %}
+  ins_encode %{
+    __ ftosizs($tmp$$FloatRegister, $src$$FloatRegister);
+    __ fmrs($dst$$Register, $tmp$$FloatRegister);
+  %}
+  ins_pipe(fcvtF2I);
+%}
+
+// Float to Long conversion, implemented as a runtime call to SharedRuntime::f2l
+instruct convF2L_reg(R0R1RegL dst, regF src, R0RegI arg1) %{
+  match(Set dst (ConvF2L src));
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  effect(CALL);
+  format %{ "convF2L  $dst,$src\t! call to SharedRuntime::f2l" %}
+  ins_encode %{
+#ifndef __ABI_HARD__
+    __ fmrs($arg1$$Register, $src$$FloatRegister);  // without __ABI_HARD__: pass the argument in the $arg1 core register
+#else
+    if($src$$FloatRegister != S0) {  // with __ABI_HARD__: pass the argument in S0, skipping the copy if it is already there
+      __ mov_float(S0, $src$$FloatRegister);
+    }
+#endif
+    address target = CAST_FROM_FN_PTR(address, SharedRuntime::f2l);
+    __ call(target, relocInfo::runtime_call_type);
+  %}
+  ins_pipe(fcvtF2L);
+%}
+#endif
+
+#ifdef AARCH64
+instruct convI2D_reg_reg(iRegI src, regD dst) %{
+  match(Set dst (ConvI2D src));
+  ins_cost(DEFAULT_COST + MEMORY_REF_COST); // FIXME
+  size(4);
+  format %{ "SCVTF_dw $dst,$src" %}
+  ins_encode %{
+      __ scvtf_dw($dst$$FloatRegister, $src$$Register);
+  %}
+  ins_pipe(fcvtI2D);
+%}
+#else
+instruct convI2D_reg_reg(iRegI src, regD_low dst) %{
+  match(Set dst (ConvI2D src));  // int to double; dst doubles as the staging register for the raw int
+  ins_cost(DEFAULT_COST + MEMORY_REF_COST); // FIXME
+  size(8);
+  format %{ "FMSR     $dst,$src \n\t"
+            "FSITOD   $dst, $dst"%}
+  ins_encode %{
+      __ fmsr($dst$$FloatRegister, $src$$Register);  // move the int bits into the FP register
+      __ fsitod($dst$$FloatRegister, $dst$$FloatRegister);  // convert in place to double
+  %}
+  ins_pipe(fcvtI2D);
+%}
+#endif
+
+instruct convI2F_reg_reg( regF dst, iRegI src ) %{
+  match(Set dst (ConvI2F src));
+  ins_cost(DEFAULT_COST + MEMORY_REF_COST); // FIXME
+#ifdef AARCH64
+  size(4);
+  format %{ "SCVTF_sw $dst,$src" %}
+  ins_encode %{
+      __ scvtf_sw($dst$$FloatRegister, $src$$Register);
+  %}
+#else
+  size(8);
+  format %{ "FMSR     $dst,$src \n\t"
+            "FSITOS   $dst, $dst"%}
+  ins_encode %{
+      __ fmsr($dst$$FloatRegister, $src$$Register);
+      __ fsitos($dst$$FloatRegister, $dst$$FloatRegister);
+  %}
+#endif
+  ins_pipe(fcvtI2F);
+%}
+
+instruct convI2L_reg(iRegL dst, iRegI src) %{
+  match(Set dst (ConvI2L src));  // int to long with sign extension
+#ifdef AARCH64
+  size(4);
+  format %{ "SXTW   $dst,$src\t! int->long" %}
+  ins_encode %{
+    __ sxtw($dst$$Register, $src$$Register);
+  %}
+#else
+  size(8);
+  format %{ "MOV    $dst.lo, $src \n\t"
+            "ASR    $dst.hi,$src,31\t! int->long" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register);  // low word = src
+    __ mov($dst$$Register->successor(), AsmOperand($src$$Register, asr, 31));  // high word = src >> 31 (arithmetic), i.e. the sign bit replicated
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Zero-extend convert int to long: matches (ConvI2L src) ANDed with an immL_32bits mask
+instruct convI2L_reg_zex(iRegL dst, iRegI src, immL_32bits mask ) %{
+  match(Set dst (AndL (ConvI2L src) mask) );  // mask is presumably 0xFFFFFFFF - confirm against the immL_32bits operand definition
+#ifdef AARCH64
+  size(4);
+  format %{ "mov_w  $dst,$src\t! zero-extend int to long"  %}
+  ins_encode %{
+    __ mov_w($dst$$Register, $src$$Register);
+  %}
+#else
+  size(8);
+  format %{ "MOV    $dst.lo,$src.lo\t! zero-extend int to long\n\t"
+            "MOV    $dst.hi, 0"%}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register);  // low word = src
+    __ mov($dst$$Register->successor(), 0);  // high word = 0
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Zero-extend long: matches a long ANDed with an immL_32bits mask
+instruct zerox_long(iRegL dst, iRegL src, immL_32bits mask ) %{
+  match(Set dst (AndL src mask) );  // mask is presumably 0xFFFFFFFF - confirm against the immL_32bits operand definition
+#ifdef AARCH64
+  size(4);
+  format %{ "mov_w  $dst,$src\t! zero-extend long"  %}
+  ins_encode %{
+    __ mov_w($dst$$Register, $src$$Register);
+  %}
+#else
+  size(8);
+  format %{ "MOV    $dst.lo,$src.lo\t! zero-extend long\n\t"
+            "MOV    $dst.hi, 0"%}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register);  // low word = low word of src
+    __ mov($dst$$Register->successor(), 0);  // high word = 0
+  %}
+#endif
+  ins_pipe(ialu_reg_reg);
+%}
+
+instruct MoveF2I_reg_reg(iRegI dst, regF src) %{
+  match(Set dst (MoveF2I src));
+  effect(DEF dst, USE src);
+  ins_cost(MEMORY_REF_COST); // FIXME
+
+  size(4);
+  format %{ "FMRS   $dst,$src\t! MoveF2I" %}
+  ins_encode %{
+    __ fmrs($dst$$Register, $src$$FloatRegister);
+  %}
+  ins_pipe(iload_mem); // FIXME
+%}
+
+instruct MoveI2F_reg_reg(regF dst, iRegI src) %{
+  match(Set dst (MoveI2F src));
+  ins_cost(MEMORY_REF_COST); // FIXME
+
+  size(4);
+  format %{ "FMSR   $dst,$src\t! MoveI2F" %}
+  ins_encode %{
+    __ fmsr($dst$$FloatRegister, $src$$Register);
+  %}
+  ins_pipe(iload_mem); // FIXME
+%}
+
+instruct MoveD2L_reg_reg(iRegL dst, regD src) %{
+  match(Set dst (MoveD2L src));  // register-to-register move from a double register to a long register; one instruction either way
+  effect(DEF dst, USE src);
+  ins_cost(MEMORY_REF_COST); // FIXME
+
+  size(4);
+#ifdef AARCH64
+  format %{ "FMOV_xd  $dst,$src\t! MoveD2L" %}
+  ins_encode %{
+    __ fmov_xd($dst$$Register, $src$$FloatRegister);
+  %}
+#else
+  format %{ "FMRRD    $dst,$src\t! MoveD2L" %}
+  ins_encode %{
+    __ fmrrd($dst$$Register, $dst$$Register->successor(), $src$$FloatRegister);  // destination is the $dst register and its successor
+  %}
+#endif
+  ins_pipe(iload_mem); // FIXME
+%}
+
+instruct MoveL2D_reg_reg(regD dst, iRegL src) %{
+  match(Set dst (MoveL2D src));  // register-to-register move from a long register to a double register; one instruction either way
+  effect(DEF dst, USE src);
+  ins_cost(MEMORY_REF_COST); // FIXME
+
+  size(4);
+#ifdef AARCH64
+  format %{ "FMOV_dx $dst,$src\t! MoveL2D" %}
+  ins_encode %{
+    __ fmov_dx($dst$$FloatRegister, $src$$Register);
+  %}
+#else
+  format %{ "FMDRR   $dst,$src\t! MoveL2D" %}
+  ins_encode %{
+    __ fmdrr($dst$$FloatRegister, $src$$Register, $src$$Register->successor());  // source is the $src register and its successor
+  %}
+#endif
+  ins_pipe(ialu_reg_reg); // FIXME
+%}
+
+//-----------
+// Long to Double conversion
+
+#ifdef AARCH64
+instruct convL2D(regD dst, iRegL src) %{
+  match(Set dst (ConvL2D src));
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  size(4);
+  format %{ "SCVTF_dx $dst, $src" %}
+  ins_encode %{
+    __ scvtf_dx($dst$$FloatRegister, $src$$Register);
+  %}
+  ins_pipe(fcvtL2D);
+%}
+
+instruct convL2F(regF dst, iRegL src) %{
+  match(Set dst (ConvL2F src));
+  ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME
+  size(4);
+  format %{ "SCVTF_sx $dst, $src" %}
+  ins_encode %{
+    __ scvtf_sx($dst$$FloatRegister, $src$$Register);
+  %}
+  ins_pipe(fcvtL2F);
+%}
+#else
+// Magic constant, 0x43300000 (the high 32 bits of the IEEE-754 double 2^52)
+instruct loadConI_x43300000(iRegI dst) %{
+  effect(DEF dst);  // no match rule: only declares that dst is written
+  size(8);
+  format %{ "MOV_SLOW  $dst,0x43300000\t! 2^52" %}
+  ins_encode %{
+    __ mov_slow($dst$$Register, 0x43300000);
+  %}
+  ins_pipe(ialu_none);
+%}
+
+// Magic constant, 0x41f00000 (the high 32 bits of the IEEE-754 double 2^32)
+instruct loadConI_x41f00000(iRegI dst) %{
+  effect(DEF dst);  // no match rule: only declares that dst is written
+  size(8);
+  format %{ "MOV_SLOW  $dst, 0x41f00000\t! 2^32" %}
+  ins_encode %{
+    __ mov_slow($dst$$Register, 0x41f00000);
+  %}
+  ins_pipe(ialu_none);
+%}
+
+instruct loadConI_x0(iRegI dst) %{
+  effect(DEF dst);  // no match rule: only declares that dst is written
+  size(4);
+  format %{ "MOV  $dst, 0x0\t! 0" %}
+  ins_encode %{
+    __ mov($dst$$Register, 0);  // load the integer constant 0
+  %}
+  ins_pipe(ialu_none);
+%}
+
+// Construct a double from two float halves: the .hi half is copied from src1 and the .lo half from src2
+instruct regDHi_regDLo_to_regD(regD_low dst, regD_low src1, regD_low src2) %{
+  effect(DEF dst, USE src1, USE src2); // no match rule: only instantiated from an expand block
+  size(8); // declared encoding size in bytes (two fcpys calls below)
+  format %{ "FCPYS  $dst.hi,$src1.hi\n\t"
+            "FCPYS  $dst.lo,$src2.lo" %}
+  ins_encode %{
+    __ fcpys($dst$$FloatRegister->successor(), $src1$$FloatRegister->successor()); // successor() is the half printed as .hi
+    __ fcpys($dst$$FloatRegister, $src2$$FloatRegister); // the register itself is the half printed as .lo
+  %}
+  ins_pipe(faddD_reg_reg);
+%}
+
+#ifndef AARCH64
+// Convert the integer held in the high half of a double register (one in the
+// lower half of the double register file, i.e. regD_low) to a double in dst
+instruct convI2D_regDHi_regD(regD dst, regD_low src) %{
+  effect(DEF dst, USE src); // no match rule: only instantiated from an expand block
+  size(4); // declared encoding size in bytes
+  format %{ "FSITOD  $dst,$src" %}
+  ins_encode %{
+    __ fsitod($dst$$FloatRegister, $src$$FloatRegister->successor()); // reads only the successor (high) half of src
+  %}
+  ins_pipe(fcvtLHi2D);
+%}
+#endif
+
+// Add float double precision: dst = src1 + src2 (expand-only helper)
+instruct addD_regD_regD(regD dst, regD src1, regD src2) %{
+  effect(DEF dst, USE src1, USE src2); // no match rule: only instantiated from an expand block
+  size(4); // declared encoding size in bytes
+  format %{ "FADDD  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ add_double($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(faddD_reg_reg);
+%}
+
+// Sub float double precision: dst = src1 - src2 (expand-only helper)
+instruct subD_regD_regD(regD dst, regD src1, regD src2) %{
+  effect(DEF dst, USE src1, USE src2); // no match rule: only instantiated from an expand block
+  size(4); // declared encoding size in bytes
+  format %{ "FSUBD  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ sub_double($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(faddD_reg_reg);
+%}
+
+// Mul float double precision: dst = src1 * src2 (expand-only helper)
+instruct mulD_regD_regD(regD dst, regD src1, regD src2) %{
+  effect(DEF dst, USE src1, USE src2); // no match rule: only instantiated from an expand block
+  size(4); // declared encoding size in bytes
+  format %{ "FMULD  $dst,$src1,$src2" %}
+  ins_encode %{
+    __ mul_double($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(fmulD_reg_reg);
+%}
+
+instruct regL_to_regD(regD dst, iRegL src) %{
+  // No match rule to avoid chain rule match; the instruct is used from convL2D_reg_slow_fxtof's expand.
+  effect(DEF dst, USE src);
+  ins_cost(MEMORY_REF_COST);
+  size(4); // declared encoding size in bytes
+  format %{ "FMDRR   $dst,$src\t! regL to regD" %}
+  ins_encode %{
+    __ fmdrr($dst$$FloatRegister, $src$$Register, $src$$Register->successor()); // src and its successor register supply the two words
+  %}
+  ins_pipe(ialu_reg_reg); // FIXME
+%}
+
+instruct regI_regI_to_regD(regD dst, iRegI src1, iRegI src2) %{
+  // No match rule to avoid chain rule match; the instruct is used from convL2D_reg_slow_fxtof's expand.
+  effect(DEF dst, USE src1, USE src2);
+  ins_cost(MEMORY_REF_COST);
+  size(4); // declared encoding size in bytes
+  format %{ "FMDRR   $dst,$src1,$src2\t! regI,regI to regD" %}
+  ins_encode %{
+    __ fmdrr($dst$$FloatRegister, $src1$$Register, $src2$$Register); // same operand positions as the register/successor pair in regL_to_regD
+  %}
+  ins_pipe(ialu_reg_reg); // FIXME
+%}
+
+instruct convL2D_reg_slow_fxtof(regD dst, iRegL src) %{
+  match(Set dst (ConvL2D src)); // non-AARCH64 long -> double, expanded into the helper instructs defined above
+  ins_cost(DEFAULT_COST*8 + MEMORY_REF_COST*6); // FIXME
+
+  expand %{
+    regD_low   tmpsrc;      // the long source moved into a double register
+    iRegI      ix43300000;  // integer constant 0x43300000
+    iRegI      ix41f00000;  // integer constant 0x41f00000
+    iRegI      ix0;         // integer constant 0
+    regD_low   dx43300000;  // double built from (ix0, ix43300000); annotated 2^52 by the loader's format
+    regD       dx41f00000;  // double built from (ix0, ix41f00000); annotated 2^32 by the loader's format
+    regD       tmp1;        // high word of src converted to double
+    regD_low   tmp2;        // high half of dx43300000 combined with low half of tmpsrc
+    regD       tmp3;        // tmp2 - dx43300000
+    regD       tmp4;        // tmp1 * dx41f00000
+
+    regL_to_regD(tmpsrc, src);
+
+    loadConI_x43300000(ix43300000);
+    loadConI_x41f00000(ix41f00000);
+    loadConI_x0(ix0);
+
+    regI_regI_to_regD(dx43300000, ix0, ix43300000);
+    regI_regI_to_regD(dx41f00000, ix0, ix41f00000);
+
+    convI2D_regDHi_regD(tmp1, tmpsrc);                // tmp1 = (double) integer in the high half of tmpsrc
+    regDHi_regDLo_to_regD(tmp2, dx43300000, tmpsrc);  // tmp2 = { hi: dx43300000.hi, lo: tmpsrc.lo }
+    subD_regD_regD(tmp3, tmp2, dx43300000);           // presumably yields the low word as an unsigned value in double form
+    mulD_regD_regD(tmp4, tmp1, dx41f00000);           // scale the converted high word by the 2^32 constant
+    addD_regD_regD(dst, tmp3, tmp4);                  // dst = low-word contribution + scaled high-word contribution
+  %}
+%}
+#endif // !AARCH64
+
+instruct convL2I_reg(iRegI dst, iRegL src) %{
+  match(Set dst (ConvL2I src)); // long -> int narrowing with a single register move
+  size(4); // declared encoding size in bytes, shared by both variants below
+#ifdef AARCH64
+  format %{ "MOV_w  $dst,$src\t! long->int" %}
+  ins_encode %{
+    __ mov_w($dst$$Register, $src$$Register);
+  %}
+#else
+  format %{ "MOV    $dst,$src.lo\t! long->int" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register); // the first register of the pair is the one printed as .lo; the successor is not read
+  %}
+#endif
+  ins_pipe(ialu_move_reg_I_to_L);
+%}
+
+#ifndef AARCH64
+// Register Shift Right Immediate fused with long->int; the encoding reads only the high (successor) register of src
+instruct shrL_reg_imm6_L2I(iRegI dst, iRegL src, immI_32_63 cnt) %{
+  match(Set dst (ConvL2I (RShiftL src cnt)));
+  size(4); // declared encoding size in bytes (one mov in either branch)
+  format %{ "ASR    $dst,$src.hi,($cnt - 32)\t! long->int or mov if $cnt==32" %}
+  ins_encode %{
+    if ($cnt$$constant == 32) {
+      __ mov($dst$$Register, $src$$Register->successor()); // count of exactly 32: plain copy of the high register
+    } else {
+      __ mov($dst$$Register, AsmOperand($src$$Register->successor(), asr, $cnt$$constant - 32)); // otherwise asr the high register by (count - 32)
+    }
+  %}
+  ins_pipe(ialu_reg_imm);
+%}
+#endif
+
+
+//----------Control Flow Instructions------------------------------------------
+// Compare Instructions
+// Compare Integers
+instruct compI_iReg(flagsReg icc, iRegI op1, iRegI op2) %{
+  match(Set icc (CmpI op1 op2)); // int register-register compare producing flags in icc
+  effect( DEF icc, USE op1, USE op2 );
+
+  size(4); // declared encoding size in bytes
+  format %{ "cmp_32 $op1,$op2\t! int" %}
+  ins_encode %{
+    __ cmp_32($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+
+#ifdef _LP64
+// Compare compressed pointers (rule is only compiled under _LP64)
+instruct compN_reg2(flagsRegU icc, iRegN op1, iRegN op2) %{
+  match(Set icc (CmpN op1 op2)); // narrow-pointer register-register compare producing flags in icc
+  effect( DEF icc, USE op1, USE op2 );
+
+  size(4); // declared encoding size in bytes
+  format %{ "cmp_32 $op1,$op2\t! compressed ptr" %}
+  ins_encode %{
+    __ cmp_32($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+#endif
+
+instruct compU_iReg(flagsRegU icc, iRegI op1, iRegI op2) %{
+  match(Set icc (CmpU op1 op2)); // unsigned int register-register compare; result goes to a flagsRegU
+
+  size(4); // declared encoding size in bytes
+  format %{ "cmp_32 $op1,$op2\t! unsigned int" %}
+  ins_encode %{
+    __ cmp_32($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+
+instruct compI_iReg_immneg(flagsReg icc, iRegI op1, aimmIneg op2) %{
+  match(Set icc (CmpI op1 op2)); // int compare against an aimmIneg immediate, emitted as cmn with the negated constant
+  effect( DEF icc, USE op1 );
+
+  size(4); // declared encoding size in bytes
+  format %{ "cmn_32 $op1,-$op2\t! int" %}
+  ins_encode %{
+    __ cmn_32($op1$$Register, -$op2$$constant); // the constant is negated before being handed to cmn_32
+  %}
+  ins_pipe(ialu_cconly_reg_imm);
+%}
+
+instruct compI_iReg_imm(flagsReg icc, iRegI op1, aimmI op2) %{
+  match(Set icc (CmpI op1 op2)); // int compare against an aimmI immediate
+  effect( DEF icc, USE op1 );
+
+  size(4); // declared encoding size in bytes
+  format %{ "cmp_32 $op1,$op2\t! int" %}
+  ins_encode %{
+    __ cmp_32($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_cconly_reg_imm);
+%}
+
+instruct testI_reg_reg( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, immI0 zero ) %{
+  match(Set icc (CmpI (AndI op1 op2) zero)); // (op1 & op2) compared with zero, folded into one tst_32
+  size(4); // declared encoding size in bytes
+  format %{ "tst_32 $op2,$op1" %}
+
+  ins_encode %{
+    __ tst_32($op1$$Register, $op2$$Register); // operands are passed as (op1, op2); the format prints them in the opposite order
+  %}
+  ins_pipe(ialu_cconly_reg_reg_zero);
+%}
+
+#ifndef AARCH64
+instruct testshlI_reg_reg_reg( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, iRegI op3, immI0 zero ) %{
+  match(Set icc (CmpI (AndI op1 (LShiftI op2 op3)) zero)); // (op1 & (op2 << op3)) compared with zero, folded into one tst
+  size(4); // declared encoding size in bytes
+  format %{ "TST   $op1,$op2<<$op3" %}
+
+  ins_encode %{
+    __ tst($op1$$Register, AsmOperand($op2$$Register, lsl, $op3$$Register)); // op2 is the shifted operand, shift amount taken from register op3
+  %}
+  ins_pipe(ialu_cconly_reg_reg_zero);
+%}
+#endif
+
+instruct testshlI_reg_reg_imm( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, immU5 op3, immI0 zero ) %{
+  match(Set icc (CmpI (AndI op1 (LShiftI op2 op3)) zero)); // (op1 & (op2 << imm)) compared with zero, folded into one tst_32
+  size(4); // declared encoding size in bytes
+  format %{ "tst_32 $op1,$op2<<$op3" %}
+
+  ins_encode %{
+    __ tst_32($op1$$Register, AsmOperand($op2$$Register, lsl, $op3$$constant)); // op2 is the shifted operand, shift amount is the immU5 constant
+  %}
+  ins_pipe(ialu_cconly_reg_reg_zero);
+%}
+
+#ifndef AARCH64
+instruct testsarI_reg_reg_reg( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, iRegI op3, immI0 zero ) %{
+  match(Set icc (CmpI (AndI op1 (RShiftI op2 op3)) zero)); // (op1 & (op2 >> op3)) compared with zero, folded into one tst
+  size(4); // declared encoding size in bytes
+  format %{ "TST   $op1,$op2>>$op3" %}
+
+  ins_encode %{
+    __ tst($op1$$Register, AsmOperand($op2$$Register, asr, $op3$$Register)); // op2 is shifted with asr, shift amount taken from register op3
+  %}
+  ins_pipe(ialu_cconly_reg_reg_zero);
+%}
+#endif
+
+instruct testsarI_reg_reg_imm( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, immU5 op3, immI0 zero ) %{
+  match(Set icc (CmpI (AndI op1 (RShiftI op2 op3)) zero)); // (op1 & (op2 >> imm)) compared with zero, folded into one tst_32
+  size(4); // declared encoding size in bytes
+  format %{ "tst_32 $op1,$op2>>$op3" %}
+
+  ins_encode %{
+    __ tst_32($op1$$Register, AsmOperand($op2$$Register, asr, $op3$$constant)); // op2 is shifted with asr, shift amount is the immU5 constant
+  %}
+  ins_pipe(ialu_cconly_reg_reg_zero);
+%}
+
+#ifndef AARCH64
+instruct testshrI_reg_reg_reg( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, iRegI op3, immI0 zero ) %{
+  match(Set icc (CmpI (AndI op1 (URShiftI op2 op3)) zero)); // (op1 & (op2 >>> op3)) compared with zero, folded into one tst
+  size(4); // declared encoding size in bytes
+  format %{ "TST   $op1,$op2>>>$op3" %}
+
+  ins_encode %{
+    __ tst($op1$$Register, AsmOperand($op2$$Register, lsr, $op3$$Register)); // op2 is shifted with lsr, shift amount taken from register op3
+  %}
+  ins_pipe(ialu_cconly_reg_reg_zero);
+%}
+#endif
+
+instruct testshrI_reg_reg_imm( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, immU5 op3, immI0 zero ) %{
+  match(Set icc (CmpI (AndI op1 (URShiftI op2 op3)) zero)); // (op1 & (op2 >>> imm)) compared with zero, folded into one tst_32
+  size(4); // declared encoding size in bytes
+  format %{ "tst_32 $op1,$op2>>>$op3" %}
+
+  ins_encode %{
+    __ tst_32($op1$$Register, AsmOperand($op2$$Register, lsr, $op3$$constant)); // op2 is shifted with lsr, shift amount is the immU5 constant
+  %}
+  ins_pipe(ialu_cconly_reg_reg_zero);
+%}
+
+instruct testI_reg_imm( flagsReg_EQNELTGE icc, iRegI op1, limmI op2, immI0 zero ) %{
+  match(Set icc (CmpI (AndI op1 op2) zero)); // (op1 & limmI constant) compared with zero, folded into one tst_32
+  size(4); // declared encoding size in bytes
+  format %{ "tst_32 $op2,$op1" %}
+
+  ins_encode %{
+    __ tst_32($op1$$Register, $op2$$constant); // register first, constant second; the format prints them in the opposite order
+  %}
+  ins_pipe(ialu_cconly_reg_imm_zero);
+%}
+
+#ifdef AARCH64
+instruct compL_reg_reg(flagsReg xcc, iRegL op1, iRegL op2)
+%{
+  match(Set xcc (CmpL op1 op2)); // AARCH64-only rule: long register-register compare with a single cmp
+  effect( DEF xcc, USE op1, USE op2 );
+
+  size(4); // declared encoding size in bytes
+  format %{ "CMP     $op1,$op2\t! long" %}
+  ins_encode %{
+    __ cmp($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+#else
+instruct compL_reg_reg_LTGE(flagsRegL_LTGE xcc, iRegL op1, iRegL op2, iRegL tmp) %{
+  match(Set xcc (CmpL op1 op2)); // non-AARCH64 long compare producing a flagsRegL_LTGE result
+  effect( DEF xcc, USE op1, USE op2, TEMP tmp ); // tmp only receives the subtraction results; the flags are the real output
+
+  size(8); // declared encoding size in bytes (subs + sbcs)
+  format %{ "SUBS    $tmp,$op1.low,$op2.low\t\t! long\n\t"
+            "SBCS    $tmp,$op1.hi,$op2.hi" %}
+  ins_encode %{
+    __ subs($tmp$$Register, $op1$$Register, $op2$$Register); // op1.low - op2.low
+    __ sbcs($tmp$$Register->successor(), $op1$$Register->successor(), $op2$$Register->successor()); // op1.hi - op2.hi via sbcs on the successor registers
+  %}
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+#endif
+
+#ifdef AARCH64
+instruct compL_reg_con(flagsReg xcc, iRegL op1, aimmL con) %{
+  match(Set xcc (CmpL op1 con)); // AARCH64-only rule: long compare against an aimmL immediate
+  effect( DEF xcc, USE op1, USE con );
+
+  size(8); // NOTE(review): only one cmp call is visible below while compL_reg_reg declares size(4) - confirm this size
+  format %{ "CMP     $op1,$con\t\t! long"  %}
+  ins_encode %{
+    __ cmp($op1$$Register, $con$$constant);
+  %}
+
+  ins_pipe(ialu_cconly_reg_imm);
+%}
+#else
+instruct compL_reg_reg_EQNE(flagsRegL_EQNE xcc, iRegL op1, iRegL op2) %{
+  match(Set xcc (CmpL op1 op2)); // non-AARCH64 long compare producing a flagsRegL_EQNE result
+  effect( DEF xcc, USE op1, USE op2 );
+
+  size(8); // declared encoding size in bytes (teq + conditional teq)
+  format %{ "TEQ    $op1.hi,$op2.hi\t\t! long\n\t"
+            "TEQ.eq $op1.lo,$op2.lo" %}
+  ins_encode %{
+    __ teq($op1$$Register->successor(), $op2$$Register->successor()); // high words first
+    __ teq($op1$$Register, $op2$$Register, eq); // low words, issued with condition eq
+  %}
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+
+instruct compL_reg_reg_LEGT(flagsRegL_LEGT xcc, iRegL op1, iRegL op2, iRegL tmp) %{
+  match(Set xcc (CmpL op1 op2)); // non-AARCH64 long compare producing a flagsRegL_LEGT result
+  effect( DEF xcc, USE op1, USE op2, TEMP tmp ); // tmp only receives the subtraction results; the flags are the real output
+
+  size(8); // declared encoding size in bytes (subs + sbcs)
+  format %{ "SUBS    $tmp,$op2.low,$op1.low\t\t! long\n\t"
+            "SBCS    $tmp,$op2.hi,$op1.hi" %}
+  ins_encode %{
+    __ subs($tmp$$Register, $op2$$Register, $op1$$Register); // operands commuted relative to compL_reg_reg_LTGE: op2.low - op1.low
+    __ sbcs($tmp$$Register->successor(), $op2$$Register->successor(), $op1$$Register->successor()); // op2.hi - op1.hi via sbcs on the successor registers
+  %}
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+
+// TODO: try immLRot2 instead, so that the (0, $con$$constant) pair becomes
+// (hi($con$$constant), lo($con$$constant))
+instruct compL_reg_con_LTGE(flagsRegL_LTGE xcc, iRegL op1, immLlowRot con, iRegL tmp) %{
+  match(Set xcc (CmpL op1 con)); // non-AARCH64 long compare against an immLlowRot constant, flagsRegL_LTGE result
+  effect( DEF xcc, USE op1, USE con, TEMP tmp ); // tmp only receives the subtraction results
+
+  size(8); // declared encoding size in bytes (subs + sbcs)
+  format %{ "SUBS    $tmp,$op1.low,$con\t\t! long\n\t"
+            "SBCS    $tmp,$op1.hi,0" %}
+  ins_encode %{
+    __ subs($tmp$$Register, $op1$$Register, $con$$constant); // low word minus the constant
+    __ sbcs($tmp$$Register->successor(), $op1$$Register->successor(), 0); // high word minus 0 via sbcs: the constant has no high part here
+  %}
+
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+
+// TODO: try immLRot2 instead, so that the (0, $con$$constant) pair becomes
+// (hi($con$$constant), lo($con$$constant))
+instruct compL_reg_con_EQNE(flagsRegL_EQNE xcc, iRegL op1, immLlowRot con) %{
+  match(Set xcc (CmpL op1 con)); // non-AARCH64 long compare against an immLlowRot constant, flagsRegL_EQNE result
+  effect( DEF xcc, USE op1, USE con );
+
+  size(8); // declared encoding size in bytes (teq + conditional teq)
+  format %{ "TEQ    $op1.hi,0\t\t! long\n\t"
+            "TEQ.eq $op1.lo,$con" %}
+  ins_encode %{
+    __ teq($op1$$Register->successor(), 0); // high word against 0
+    __ teq($op1$$Register, $con$$constant, eq); // low word against the constant, issued with condition eq
+  %}
+
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+
+// TODO: try immLRot2 instead, so that the (0, $con$$constant) pair becomes
+// (hi($con$$constant), lo($con$$constant))
+instruct compL_reg_con_LEGT(flagsRegL_LEGT xcc, iRegL op1, immLlowRot con, iRegL tmp) %{
+  match(Set xcc (CmpL op1 con)); // non-AARCH64 long compare against an immLlowRot constant, flagsRegL_LEGT result
+  effect( DEF xcc, USE op1, USE con, TEMP tmp ); // tmp only receives the subtraction results
+
+  size(8); // declared encoding size in bytes (rsbs + rscs)
+  format %{ "RSBS    $tmp,$op1.low,$con\t\t! long\n\t"
+            "RSCS    $tmp,$op1.hi,0" %}
+  ins_encode %{
+    __ rsbs($tmp$$Register, $op1$$Register, $con$$constant); // rsbs/rscs take the place of the operand swap done in compL_reg_reg_LEGT
+    __ rscs($tmp$$Register->successor(), $op1$$Register->successor(), 0); // high word paired with 0: the constant has no high part here
+  %}
+
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+#endif
+
+/* instruct testL_reg_reg(flagsRegL xcc, iRegL op1, iRegL op2, immL0 zero) %{ */
+/*   match(Set xcc (CmpL (AndL op1 op2) zero)); */
+/*   ins_encode %{ */
+/*     __ stop("testL_reg_reg unimplemented"); */
+/*   %} */
+/*   ins_pipe(ialu_cconly_reg_reg); */
+/* %} */
+
+/* // useful for checking the alignment of a pointer: */
+/* instruct testL_reg_con(flagsRegL xcc, iRegL op1, immLlowRot con, immL0 zero) %{ */
+/*   match(Set xcc (CmpL (AndL op1 con) zero)); */
+/*   ins_encode %{ */
+/*     __ stop("testL_reg_con unimplemented"); */
+/*   %} */
+/*   ins_pipe(ialu_cconly_reg_reg); */
+/* %} */
+
+instruct compU_iReg_imm(flagsRegU icc, iRegI op1, aimmU31 op2 ) %{
+  match(Set icc (CmpU op1 op2)); // unsigned int compare against an aimmU31 immediate
+
+  size(4); // declared encoding size in bytes
+  format %{ "cmp_32 $op1,$op2\t! unsigned" %}
+  ins_encode %{
+    __ cmp_32($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_cconly_reg_imm);
+%}
+
+// Compare Pointers (register-register), result in a flagsRegP
+instruct compP_iRegP(flagsRegP pcc, iRegP op1, iRegP op2 ) %{
+  match(Set pcc (CmpP op1 op2));
+
+  size(4); // declared encoding size in bytes
+  format %{ "CMP    $op1,$op2\t! ptr" %}
+  ins_encode %{
+    __ cmp($op1$$Register, $op2$$Register);
+  %}
+  ins_pipe(ialu_cconly_reg_reg);
+%}
+
+instruct compP_iRegP_imm(flagsRegP pcc, iRegP op1, aimmP op2 ) %{
+  match(Set pcc (CmpP op1 op2)); // pointer compare against an aimmP immediate
+
+  size(4); // declared encoding size in bytes
+  format %{ "CMP    $op1,$op2\t! ptr" %}
+  ins_encode %{
+    assert($op2$$constant == 0 || _opnds[2]->constant_reloc() == relocInfo::none, "reloc in cmp?"); // a non-zero immediate must not carry a relocation
+    __ cmp($op1$$Register, $op2$$constant);
+  %}
+  ins_pipe(ialu_cconly_reg_imm);
+%}
+
+//----------Max and Min--------------------------------------------------------
+// Min Instructions
+// Conditional move for min: op2 = op1 when the lt condition holds (expand-only helper used by minI_eReg)
+instruct cmovI_reg_lt( iRegI op2, iRegI op1, flagsReg icc ) %{
+  effect( USE_DEF op2, USE op1, USE icc ); // op2 is both read and written, so it keeps its value when the move is not taken
+
+  size(4); // declared encoding size in bytes
+  format %{ "MOV.lt  $op2,$op1\t! min" %}
+  ins_encode %{
+    __ mov($op2$$Register, $op1$$Register, lt);
+  %}
+  ins_pipe(ialu_reg_flags);
+%}
+
+// Min Register with Register: op2 = min(op1, op2), expanded into a compare plus a conditional move
+instruct minI_eReg(iRegI op1, iRegI op2) %{
+  match(Set op2 (MinI op1 op2));
+  ins_cost(DEFAULT_COST*2);
+  expand %{
+    flagsReg icc;               // flags produced by the compare and consumed by the cmov
+    compI_iReg(icc,op1,op2);    // compare op1 with op2
+    cmovI_reg_lt(op2,op1,icc);  // if op1 < op2, overwrite op2 with op1
+  %}
+%}
+
+// Max Instructions
+// Conditional move for max: op2 = op1 when the gt condition holds (expand-only helper used by maxI_eReg)
+instruct cmovI_reg_gt( iRegI op2, iRegI op1, flagsReg icc ) %{
+  effect( USE_DEF op2, USE op1, USE icc ); // NOTE(review): unlike cmovI_reg_lt, no size() is declared here - confirm that is intentional
+  format %{ "MOV.gt  $op2,$op1\t! max" %}
+  ins_encode %{
+    __ mov($op2$$Register, $op1$$Register, gt);
+  %}
+  ins_pipe(ialu_reg_flags);
+%}
+
+// Max Register with Register: op2 = max(op1, op2), expanded into a compare plus a conditional move
+instruct maxI_eReg(iRegI op1, iRegI op2) %{
+  match(Set op2 (MaxI op1 op2));
+  ins_cost(DEFAULT_COST*2);
+  expand %{
+    flagsReg icc;               // flags produced by the compare and consumed by the cmov
+    compI_iReg(icc,op1,op2);    // compare op1 with op2
+    cmovI_reg_gt(op2,op1,icc);  // if op1 > op2, overwrite op2 with op1
+  %}
+%}
+
+
+//----------Float Compares----------------------------------------------------
+// Compare floating (single precision), generate condition code in icc
+instruct cmpF_cc(flagsRegF fcc, flagsReg icc, regF src1, regF src2) %{
+  match(Set icc (CmpF src1 src2));
+  effect(KILL fcc); // the FP flags operand is declared as clobbered
+
+#ifdef AARCH64
+  size(4);
+  format %{ "FCMP_s  $src1,$src2" %}
+  ins_encode %{
+    __ fcmp_s($src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+#else
+  size(8); // two calls below: the compare and fmstat
+  format %{ "FCMPs  $src1,$src2\n\t"
+            "FMSTAT" %}
+  ins_encode %{
+    __ fcmps($src1$$FloatRegister, $src2$$FloatRegister);
+    __ fmstat(); // presumably transfers the FP status flags to the integer flags - confirm against the assembler
+  %}
+#endif
+  ins_pipe(faddF_fcc_reg_reg_zero);
+%}
+
+instruct cmpF0_cc(flagsRegF fcc, flagsReg icc, regF src1, immF0 src2) %{
+  match(Set icc (CmpF src1 src2)); // single-precision compare against the immF0 constant, condition code in icc
+  effect(KILL fcc); // the FP flags operand is declared as clobbered
+
+#ifdef AARCH64
+  size(4);
+  format %{ "FCMP0_s $src1" %}
+  ins_encode %{
+    __ fcmp0_s($src1$$FloatRegister);
+  %}
+#else
+  size(8); // two calls below: the compare-with-zero and fmstat
+  format %{ "FCMPZs $src1,$src2\n\t"
+            "FMSTAT" %}
+  ins_encode %{
+    __ fcmpzs($src1$$FloatRegister); // compare-with-zero form: src2 is not passed to the assembler
+    __ fmstat();
+  %}
+#endif
+  ins_pipe(faddF_fcc_reg_reg_zero);
+%}
+
+instruct cmpD_cc(flagsRegF fcc, flagsReg icc, regD src1, regD src2) %{
+  match(Set icc (CmpD src1 src2)); // double-precision compare, condition code in icc
+  effect(KILL fcc); // the FP flags operand is declared as clobbered
+
+#ifdef AARCH64
+  size(4);
+  format %{ "FCMP_d $src1,$src2" %}
+  ins_encode %{
+    __ fcmp_d($src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+#else
+  size(8); // two calls below: the compare and fmstat
+  format %{ "FCMPd  $src1,$src2 \n\t"
+            "FMSTAT" %}
+  ins_encode %{
+    __ fcmpd($src1$$FloatRegister, $src2$$FloatRegister);
+    __ fmstat();
+  %}
+#endif
+  ins_pipe(faddD_fcc_reg_reg_zero);
+%}
+
+instruct cmpD0_cc(flagsRegF fcc, flagsReg icc, regD src1, immD0 src2) %{
+  match(Set icc (CmpD src1 src2)); // double-precision compare against the immD0 constant, condition code in icc
+  effect(KILL fcc); // the FP flags operand is declared as clobbered
+
+#ifdef AARCH64
+  size(8); // NOTE(review): one fcmp0_d call is visible below and cmpF0_cc declares size(4) for its AARCH64 form - confirm this size
+  format %{ "FCMP0_d $src1" %}
+  ins_encode %{
+    __ fcmp0_d($src1$$FloatRegister);
+  %}
+#else
+  size(8); // two calls below: the compare-with-zero and fmstat
+  format %{ "FCMPZd  $src1,$src2 \n\t"
+            "FMSTAT" %}
+  ins_encode %{
+    __ fcmpzd($src1$$FloatRegister); // compare-with-zero form: src2 is not passed to the assembler
+    __ fmstat();
+  %}
+#endif
+  ins_pipe(faddD_fcc_reg_reg_zero);
+%}
+
+#ifdef AARCH64
+// Compare floating (single precision, AARCH64), generate -1,0,1 in an integer register
+instruct cmpF_reg(iRegI dst, regF src1, regF src2, flagsReg icc) %{
+  match(Set dst (CmpF3 src1 src2));
+  // effect(KILL fcc); // nobody cares if flagsRegF is killed
+  effect(KILL icc); // the compare below overwrites the integer flags
+  ins_cost(DEFAULT_COST*3); // FIXME
+  size(12); // three calls below: compare, cset, csinv
+  format %{ "FCMP_s $src1,$src2\n\t"
+            "CSET   $dst, gt\n\t"
+            "CSINV  $dst, $dst, ZR, ge" %}
+  ins_encode %{
+    Register dst = $dst$$Register;
+    __ fcmp_s($src1$$FloatRegister, $src2$$FloatRegister);
+    __ cset(dst, gt);            // 1 if '>', else 0
+    __ csinv(dst, dst, ZR, ge);  // previous value if '>=', else -1
+  %}
+  ins_pipe( floating_cmp ); // FIXME
+%}
+
+// Compare floating (double precision, AARCH64), generate -1,0,1 in an integer register
+instruct cmpD_reg(iRegI dst, regD src1, regD src2, flagsReg icc) %{
+  match(Set dst (CmpD3 src1 src2));
+  // effect(KILL fcc); // nobody cares if flagsRegF is killed
+  effect(KILL icc); // the compare below overwrites the integer flags
+  ins_cost(DEFAULT_COST*3); // FIXME
+  size(12); // three calls below: compare, cset, csinv
+  format %{ "FCMP_d $src1,$src2\n\t"
+            "CSET   $dst, gt\n\t"
+            "CSINV  $dst, $dst, ZR, ge" %}
+  ins_encode %{
+    Register dst = $dst$$Register;
+    __ fcmp_d($src1$$FloatRegister, $src2$$FloatRegister);
+    __ cset(dst, gt);            // 1 if '>', else 0
+    __ csinv(dst, dst, ZR, ge);  // previous value if '>=', else -1
+  %}
+  ins_pipe( floating_cmp ); // FIXME
+%}
+
+// Compare floating (single precision vs. the immF0 constant, AARCH64), generate -1,0,1 in an integer register
+instruct cmpF0_reg(iRegI dst, regF src1, immF0 src2, flagsReg icc) %{
+  match(Set dst (CmpF3 src1 src2));
+  // effect(KILL fcc); // nobody cares if flagsRegF is killed
+  effect(KILL icc); // the compare below overwrites the integer flags
+  ins_cost(DEFAULT_COST*3); // FIXME
+  size(12); // three calls below: compare, cset, csinv
+  format %{ "FCMP0_s $src1\n\t"
+            "CSET   $dst, gt\n\t"
+            "CSINV  $dst, $dst, ZR, ge" %}
+  ins_encode %{
+    Register dst = $dst$$Register;
+    __ fcmp0_s($src1$$FloatRegister);
+    __ cset(dst, gt);            // 1 if '>', else 0
+    __ csinv(dst, dst, ZR, ge);  // previous value if '>=', else -1
+  %}
+  ins_pipe( floating_cmp ); // FIXME
+%}
+
+// Compare floating (double precision vs. the immD0 constant, AARCH64), generate -1,0,1 in an integer register
+instruct cmpD0_reg(iRegI dst, regD src1, immD0 src2, flagsReg icc) %{
+  match(Set dst (CmpD3 src1 src2));
+  // effect(KILL fcc); // nobody cares if flagsRegF is killed
+  effect(KILL icc); // the compare below overwrites the integer flags
+  ins_cost(DEFAULT_COST*3); // FIXME
+  size(12); // three calls below: compare, cset, csinv
+  format %{ "FCMP0_d $src1\n\t"
+            "CSET   $dst, gt\n\t"
+            "CSINV  $dst, $dst, ZR, ge" %}
+  ins_encode %{
+    Register dst = $dst$$Register;
+    __ fcmp0_d($src1$$FloatRegister);
+    __ cset(dst, gt);            // 1 if '>', else 0
+    __ csinv(dst, dst, ZR, ge);  // previous value if '>=', else -1
+  %}
+  ins_pipe( floating_cmp ); // FIXME
+%}
+#else
+// Compare floating (single precision, non-AARCH64), generate -1,0,1 in an integer register
+instruct cmpF_reg(iRegI dst, regF src1, regF src2, flagsRegF fcc) %{
+  match(Set dst (CmpF3 src1 src2));
+  effect(KILL fcc); // only the FP flags are declared as clobbered
+  ins_cost(DEFAULT_COST*3+BRANCH_COST*3); // FIXME
+  size(20); // declared encoding size in bytes for the compare plus floating_cmp
+  // same number of instructions as code using conditional moves but
+  // doesn't kill integer condition register
+  format %{ "FCMPs  $src1,$src2 \n\t"
+            "VMRS   $dst, FPSCR \n\t"
+            "OR     $dst, $dst, 0x08000000 \n\t"
+            "EOR    $dst, $dst, $dst << 3 \n\t"
+            "MOV    $dst, $dst >> 30" %}
+  ins_encode %{
+    __ fcmps($src1$$FloatRegister, $src2$$FloatRegister); // the compare itself takes only the two sources
+    __ floating_cmp($dst$$Register); // presumably emits the remaining sequence shown in the format - confirm in the macro assembler
+  %}
+  ins_pipe( floating_cmp );
+%}
+
+instruct cmpF0_reg(iRegI dst, regF src1, immF0 src2, flagsRegF fcc) %{
+  match(Set dst (CmpF3 src1 src2)); // non-AARCH64: single-precision three-way compare against the immF0 constant
+  effect(KILL fcc); // only the FP flags are declared as clobbered
+  ins_cost(DEFAULT_COST*3+BRANCH_COST*3); // FIXME
+  size(20); // declared encoding size in bytes for the compare plus floating_cmp
+  // same number of instructions as code using conditional moves but
+  // doesn't kill integer condition register
+  format %{ "FCMPZs $src1,$src2 \n\t"
+            "VMRS   $dst, FPSCR \n\t"
+            "OR     $dst, $dst, 0x08000000 \n\t"
+            "EOR    $dst, $dst, $dst << 3 \n\t"
+            "MOV    $dst, $dst >> 30" %}
+  ins_encode %{
+    __ fcmpzs($src1$$FloatRegister); // compare-with-zero form: src2 is not passed to the assembler
+    __ floating_cmp($dst$$Register); // presumably emits the remaining sequence shown in the format - confirm in the macro assembler
+  %}
+  ins_pipe( floating_cmp );
+%}
+
+instruct cmpD_reg(iRegI dst, regD src1, regD src2, flagsRegF fcc) %{
+  match(Set dst (CmpD3 src1 src2)); // non-AARCH64: double-precision three-way compare
+  effect(KILL fcc); // only the FP flags are declared as clobbered
+  ins_cost(DEFAULT_COST*3+BRANCH_COST*3); // FIXME
+  size(20); // declared encoding size in bytes for the compare plus floating_cmp
+  // same number of instructions as code using conditional moves but
+  // doesn't kill integer condition register
+  format %{ "FCMPd  $src1,$src2 \n\t"
+            "VMRS   $dst, FPSCR \n\t"
+            "OR     $dst, $dst, 0x08000000 \n\t"
+            "EOR    $dst, $dst, $dst << 3 \n\t"
+            "MOV    $dst, $dst >> 30" %}
+  ins_encode %{
+    __ fcmpd($src1$$FloatRegister, $src2$$FloatRegister); // the compare itself takes only the two sources
+    __ floating_cmp($dst$$Register); // presumably emits the remaining sequence shown in the format - confirm in the macro assembler
+  %}
+  ins_pipe( floating_cmp );
+%}
+
+instruct cmpD0_reg(iRegI dst, regD src1, immD0 src2, flagsRegF fcc) %{
+  match(Set dst (CmpD3 src1 src2)); // non-AARCH64: double-precision three-way compare against the immD0 constant
+  effect(KILL fcc); // only the FP flags are declared as clobbered
+  ins_cost(DEFAULT_COST*3+BRANCH_COST*3); // FIXME
+  size(20); // declared encoding size in bytes for the compare plus floating_cmp
+  // same number of instructions as code using conditional moves but
+  // doesn't kill integer condition register
+  format %{ "FCMPZd $src1,$src2 \n\t"
+            "VMRS   $dst, FPSCR \n\t"
+            "OR     $dst, $dst, 0x08000000 \n\t"
+            "EOR    $dst, $dst, $dst << 3 \n\t"
+            "MOV    $dst, $dst >> 30" %}
+  ins_encode %{
+    __ fcmpzd($src1$$FloatRegister); // compare-with-zero form: src2 is not passed to the assembler
+    __ floating_cmp($dst$$Register); // presumably emits the remaining sequence shown in the format - confirm in the macro assembler
+  %}
+  ins_pipe( floating_cmp );
+%}
+#endif // !AARCH64
+
+//----------Branches---------------------------------------------------------
+// Jump
+// (compare 'operand indIndex' and 'instruct addP_reg_reg' above)
+// FIXME
+instruct jumpXtnd(iRegX switch_val, iRegP tmp) %{
+  match(Jump switch_val); // table jump: load the target from the constant table at switch_val and jump to it
+  effect(TEMP tmp);
+  ins_cost(350);
+  format %{  "ADD    $tmp, $constanttablebase, $switch_val\n\t"
+             "LDR    $tmp,[$tmp + $constantoffset]\n\t"
+             "BX     $tmp" %}
+  size(20); // NOTE(review): must cover the longest of the three paths below - confirm against mov_slow's worst case
+  ins_encode %{
+    Register table_reg;
+    Register label_reg = $tmp$$Register; // tmp ends up holding the loaded jump target
+    if (constant_offset() == 0) { // table starts at the constant table base: index it directly by switch_val
+      table_reg = $constanttablebase;
+      __ ldr(label_reg, Address(table_reg, $switch_val$$Register));
+    } else {
+      table_reg = $tmp$$Register; // tmp doubles as the scratch register for the table address
+      int offset = $constantoffset;
+      if (is_memoryP(offset)) { // offset accepted by is_memoryP: add the index first, then load with the immediate offset
+        __ add(table_reg, $constanttablebase, $switch_val$$Register);
+        __ ldr(label_reg, Address(table_reg, offset));
+      } else { // otherwise materialize the offset, add the base, and index by switch_val
+        __ mov_slow(table_reg, $constantoffset);
+        __ add(table_reg, $constanttablebase, table_reg);
+        __ ldr(label_reg, Address(table_reg, $switch_val$$Register));
+      }
+    }
+    __ jump(label_reg); // ldr + b better than ldr to PC for branch predictor?
+    //    __ ldr(PC, Address($table$$Register, $switch_val$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+// Direct Branch: unconditional jump to labl (matches Goto).
+instruct branch(label labl) %{
+  match(Goto);
+  effect(USE labl);
+
+  size(4); // declared encoding size in bytes
+  ins_cost(BRANCH_COST);
+  format %{ "B     $labl" %}
+  ins_encode %{
+    __ b(*($labl$$label));
+  %}
+  ins_pipe(br);
+%}
+
+// Conditional Direct Branch on an integer flags register
+instruct branchCon(cmpOp cmp, flagsReg icc, label labl) %{
+  match(If cmp icc);
+  effect(USE labl);
+
+  size(4); // declared encoding size in bytes
+  ins_cost(BRANCH_COST);
+  format %{ "B$cmp   $icc,$labl" %}
+  ins_encode %{
+    __ b(*($labl$$label), (AsmCondition)($cmp$$cmpcode)); // the operand's cmpcode is used directly as the branch condition
+  %}
+  ins_pipe(br_cc);
+%}
+
+#ifdef ARM
+instruct branchCon_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, label labl) %{
+  match(If cmp icc); // conditional branch on flagsReg_EQNELTGE flags (the kind the tst-based rules above produce)
+  effect(USE labl);
+  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge);
+
+  size(4); // declared encoding size in bytes; the predicate above restricts the rule to eq, ne, lt and ge tests
+  ins_cost(BRANCH_COST);
+  format %{ "B$cmp   $icc,$labl" %}
+  ins_encode %{
+    __ b(*($labl$$label), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(br_cc);
+%}
+#endif
+
+#ifdef AARCH64
+instruct cbzI(cmpOp cmp, iRegI op1, immI0 op2, label labl) %{
+  match(If cmp (CmpI op1 op2)); // AARCH64-only rule: int compare with zero fused with the branch
+  effect(USE labl);
+  predicate(_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne);
+  size(4); // declared encoding size in bytes; the predicate above restricts the rule to eq and ne tests
+  ins_cost(BRANCH_COST);
+  format %{ "CB{N}Z $op1, $labl\t! int $cmp" %}
+  ins_encode %{
+    if ($cmp$$cmpcode == eq) {
+      __ cbz_w($op1$$Register, *($labl$$label)); // eq test: the cbz form
+    } else {
+      __ cbnz_w($op1$$Register, *($labl$$label)); // ne test: the cbnz form
+    }
+  %}
+  ins_pipe(br_cc); // FIXME
+%}
+
+instruct cbzP(cmpOpP cmp, iRegP op1, immP0 op2, label labl) %{
+  match(If cmp (CmpP op1 op2)); // AARCH64-only rule: pointer compare with the immP0 constant fused with the branch
+  effect(USE labl);
+  predicate(_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne);
+  size(4); // declared encoding size in bytes; the predicate above restricts the rule to eq and ne tests
+  ins_cost(BRANCH_COST);
+  format %{ "CB{N}Z $op1, $labl\t! ptr $cmp" %}
+  ins_encode %{
+    if ($cmp$$cmpcode == eq) {
+      __ cbz($op1$$Register, *($labl$$label)); // eq test: the cbz form
+    } else {
+      __ cbnz($op1$$Register, *($labl$$label)); // ne test: the cbnz form
+    }
+  %}
+  ins_pipe(br_cc); // FIXME
+%}
+
+instruct cbzL(cmpOpL cmp, iRegL op1, immL0 op2, label labl) %{
+  match(If cmp (CmpL op1 op2)); // AARCH64-only rule: long compare with the immL0 constant fused with the branch
+  effect(USE labl);
+  predicate(_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq ||
+            _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne);
+  size(4); // declared encoding size in bytes; the predicate above restricts the rule to eq and ne tests
+  ins_cost(BRANCH_COST);
+  format %{ "CB{N}Z $op1, $labl\t! long $cmp" %}
+  ins_encode %{
+    if ($cmp$$cmpcode == eq) {
+      __ cbz($op1$$Register, *($labl$$label)); // eq test: the cbz form
+    } else {
+      __ cbnz($op1$$Register, *($labl$$label)); // ne test: the cbnz form
+    }
+  %}
+  ins_pipe(br_cc); // FIXME
+%}
+#endif
+
+instruct branchConU(cmpOpU cmp, flagsRegU icc, label labl) %{
+  match(If cmp icc); // conditional branch on flags held in a flagsRegU (see the CmpU/CmpN rules above)
+  effect(USE labl);
+
+  size(4); // declared encoding size in bytes
+  ins_cost(BRANCH_COST);
+  format %{ "B$cmp  $icc,$labl" %}
+  ins_encode %{
+    __ b(*($labl$$label), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(br_cc);
+%}
+
+instruct branchConP(cmpOpP cmp, flagsRegP pcc, label labl) %{
+  match(If cmp pcc); // conditional branch on flags held in a flagsRegP (see the CmpP rules above)
+  effect(USE labl);
+
+  size(4); // declared encoding size in bytes
+  ins_cost(BRANCH_COST);
+  format %{ "B$cmp  $pcc,$labl" %}
+  ins_encode %{
+    __ b(*($labl$$label), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(br_cc);
+%}
+
+#ifndef AARCH64
+instruct branchConL_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, label labl) %{
+  match(If cmp xcc); // non-AARCH64 branch on the flags of a long compare, lt and ge tests only
+  effect(USE labl);
+  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
+
+  size(4); // declared encoding size in bytes
+  ins_cost(BRANCH_COST);
+  format %{ "B$cmp  $xcc,$labl" %}
+  ins_encode %{
+    __ b(*($labl$$label), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(br_cc);
+%}
+
+instruct branchConL_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, label labl) %{
+  match(If cmp xcc); // non-AARCH64 branch on the flags of a long compare, eq and ne tests only
+  effect(USE labl);
+  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
+
+  size(4); // declared encoding size in bytes
+  ins_cost(BRANCH_COST);
+  format %{ "B$cmp  $xcc,$labl" %}
+  ins_encode %{
+    __ b(*($labl$$label), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(br_cc);
+%}
+
+instruct branchConL_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, label labl) %{
+  match(If cmp xcc); // non-AARCH64 branch on the flags of a long compare, gt and le tests only
+  effect(USE labl);
+  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le );
+
+  size(4); // declared encoding size in bytes
+  ins_cost(BRANCH_COST); // note the cmpOpL_commute operand: the LEGT compare rules above swap their operands
+  format %{ "B$cmp  $xcc,$labl" %}
+  ins_encode %{
+    __ b(*($labl$$label), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(br_cc);
+%}
+#endif
+
+instruct branchLoopEnd(cmpOp cmp, flagsReg icc, label labl) %{
+  match(CountedLoopEnd cmp icc); // same encoding as branchCon, but matches the CountedLoopEnd node
+  effect(USE labl);
+
+  size(4); // declared encoding size in bytes
+  ins_cost(BRANCH_COST);
+  format %{ "B$cmp   $icc,$labl\t! Loop end" %}
+  ins_encode %{
+    __ b(*($labl$$label), (AsmCondition)($cmp$$cmpcode));
+  %}
+  ins_pipe(br_cc);
+%}
+
+// instruct branchLoopEndU(cmpOpU cmp, flagsRegU icc, label labl) %{
+//   match(CountedLoopEnd cmp icc);
+//   ins_pipe(br_cc);
+// %}
+
+// ============================================================================
+// Long Compare
+//
+// Currently we hold longs in 2 registers.  Comparing such values efficiently
+// is tricky.  The flavor of compare used depends on whether we are testing
+// for LT, LE, or EQ.  For a simple LT test we can check just the sign bit.
+// The GE test is the negated LT test.  The LE test can be had by commuting
+// the operands (yielding a GE test) and then negating; negate again for the
+// GT test.  The EQ test is done by TEQ'ing the high halves and then (only if
+// they are equal) the low halves; the NE test is negated from that.
+
+// Due to a shortcoming in the ADLC, it mixes up expressions like:
+// (foo (CmpI (CmpL X Y) 0)) and (bar (CmpI (CmpL X 0L) 0)).  Note the
+// difference between 'Y' and '0L'.  The tree-matches for the CmpI sections
+// are collapsed internally in the ADLC's dfa-gen code.  The match for
+// (CmpI (CmpL X Y) 0) is silently replaced with (CmpI (CmpL X 0L) 0) and the
+// foo match ends up with the wrong leaf.  One fix is to not match both
+// reg-reg and reg-zero forms of long-compare.  This is unfortunate because
+// both forms beat the trinary form of long-compare and both are very useful
+// on Intel which has so few registers.
+
+// instruct branchCon_long(cmpOp cmp, flagsRegL xcc, label labl) %{
+//   match(If cmp xcc);
+//   ins_pipe(br_cc);
+// %}
+
+// Manifest a CmpL3 result in an integer register.  Very painful.
+// This is the test to avoid.
+#ifdef AARCH64
+instruct cmpL3_reg_reg(iRegI dst, iRegL src1, iRegL src2, flagsReg ccr) %{
+  match(Set dst (CmpL3 src1 src2)); // AARCH64 variant: three-way long compare yielding -1, 0 or 1 in dst
+  // effect(KILL fcc); // nobody cares if flagsRegF is killed
+  effect(KILL ccr); // the compare below overwrites the integer flags
+  ins_cost(DEFAULT_COST*3); // FIXME
+  size(12); // three calls below: cmp, cset, csinv
+  format %{ "CMP    $src1,$src2\n\t"
+            "CSET   $dst, gt\n\t"
+            "CSINV  $dst, $dst, ZR, ge" %}
+  ins_encode %{
+    Register dst = $dst$$Register;
+    __ cmp($src1$$Register, $src2$$Register);
+    __ cset(dst, gt);            // 1 if '>', else 0
+    __ csinv(dst, dst, ZR, ge);  // previous value if '>=', else -1
+  %}
+  ins_pipe( ialu_cconly_reg_reg ); // FIXME
+%}
+// TODO cmpL3_reg_imm
+#else
+instruct cmpL3_reg_reg(iRegI dst, iRegL src1, iRegL src2, flagsReg ccr ) %{
+  match(Set dst (CmpL3 src1 src2) ); // non-AARCH64 variant: three-way long compare yielding -1, 0 or 1 in dst
+  effect( KILL ccr ); // both the cmp and the subs below overwrite the integer flags
+  ins_cost(6*DEFAULT_COST); // FIXME
+  size(32); // NOTE(review): seven assembler calls are visible below - confirm that 32 bytes is the intended size
+  format %{
+      "CMP    $src1.hi, $src2.hi\t\t! long\n"
+    "\tMOV.gt $dst, 1\n"
+    "\tmvn.lt $dst, 0\n"
+    "\tB.ne   done\n"
+    "\tSUBS   $dst, $src1.lo, $src2.lo\n"
+    "\tMOV.hi $dst, 1\n"
+    "\tmvn.lo $dst, 0\n"
+    "done:"     %}
+  ins_encode %{
+    Label done;
+    __ cmp($src1$$Register->successor(), $src2$$Register->successor()); // signed compare of the high words
+    __ mov($dst$$Register, 1, gt);   // high words decide: result 1
+    __ mvn($dst$$Register, 0, lt);   // high words decide: result is mvn of 0, i.e. -1
+    __ b(done, ne);                  // high words differ, so the result is already final
+    __ subs($dst$$Register, $src1$$Register, $src2$$Register); // high words equal: low-word difference, 0 when they match
+    __ mov($dst$$Register, 1, hi);   // low words compared with the unsigned conditions hi and lo
+    __ mvn($dst$$Register, 0, lo);
+    __ bind(done);
+  %}
+  ins_pipe(cmpL_reg);
+%}
+#endif
+
+#ifndef AARCH64
+// Conditional move
+instruct cmovLL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegL dst, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src)));  // long cmove, register source, flags operand flagsRegL_LTGE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );  // only lt / ge Bool tests
+
+  ins_cost(150);
+  size(8);  // two conditional MOVs, one per register half
+  format %{ "MOV$cmp  $dst.lo,$src.lo\t! long\n\t"
+            "MOV$cmp  $dst.hi,$src.hi" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // half printed as .lo
+    __ mov($dst$$Register->successor(), $src$$Register->successor(), (AsmCondition)($cmp$$cmpcode));  // half printed as .hi
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovLL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegL dst, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src)));  // long cmove, register source, flags operand flagsRegL_EQNE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );  // only eq / ne Bool tests
+
+  ins_cost(150);
+  size(8);  // two conditional MOVs, one per register half
+  format %{ "MOV$cmp  $dst.lo,$src.lo\t! long\n\t"
+            "MOV$cmp  $dst.hi,$src.hi" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // half printed as .lo
+    __ mov($dst$$Register->successor(), $src$$Register->successor(), (AsmCondition)($cmp$$cmpcode));  // half printed as .hi
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovLL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegL dst, iRegL src) %{
+  match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src)));  // long cmove, register source; note cmp is cmpOpL_commute here
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );  // only le / gt Bool tests
+
+  ins_cost(150);
+  size(8);  // two conditional MOVs, one per register half
+  format %{ "MOV$cmp  $dst.lo,$src.lo\t! long\n\t"
+            "MOV$cmp  $dst.hi,$src.hi" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // half printed as .lo
+    __ mov($dst$$Register->successor(), $src$$Register->successor(), (AsmCondition)($cmp$$cmpcode));  // half printed as .hi
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovLL_imm_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegL dst, immL0 src) %{
+  match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src)));  // long cmove of the immL0 operand; the encoding writes literal 0
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );  // only lt / ge Bool tests
+  ins_cost(140);
+  size(8);  // two conditional MOVs, one per register half
+  format %{ "MOV$cmp  $dst.lo,0\t! long\n\t"
+            "MOV$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ mov($dst$$Register, 0, (AsmCondition)($cmp$$cmpcode));  // half printed as .lo
+    __ mov($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));  // half printed as .hi
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovLL_imm_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegL dst, immL0 src) %{
+  match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src)));  // long cmove of the immL0 operand; the encoding writes literal 0
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );  // only eq / ne Bool tests
+  ins_cost(140);
+  size(8);  // two conditional MOVs, one per register half
+  format %{ "MOV$cmp  $dst.lo,0\t! long\n\t"
+            "MOV$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ mov($dst$$Register, 0, (AsmCondition)($cmp$$cmpcode));  // half printed as .lo
+    __ mov($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));  // half printed as .hi
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovLL_imm_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegL dst, immL0 src) %{
+  match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src)));  // long cmove of the immL0 operand; note cmp is cmpOpL_commute here
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );  // only le / gt Bool tests
+  ins_cost(140);
+  size(8);  // two conditional MOVs, one per register half
+  format %{ "MOV$cmp  $dst.lo,0\t! long\n\t"
+            "MOV$cmp  $dst.hi,0" %}
+  ins_encode %{
+    __ mov($dst$$Register, 0, (AsmCondition)($cmp$$cmpcode));  // half printed as .lo
+    __ mov($dst$$Register->successor(), 0, (AsmCondition)($cmp$$cmpcode));  // half printed as .hi
+  %}
+  ins_pipe(ialu_imm);
+%}
+#endif // !AARCH64
+
+#ifndef AARCH64
+instruct cmovIL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegI dst, iRegI src) %{
+  match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src)));  // int cmove, register source, flags operand flagsRegL_LTGE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );  // only lt / ge Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional MOV
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovIL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegI dst, iRegI src) %{
+  match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src)));  // int cmove, register source, flags operand flagsRegL_EQNE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );  // only eq / ne Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional MOV
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovIL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegI dst, iRegI src) %{
+  match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src)));  // int cmove, register source; note cmp is cmpOpL_commute here
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );  // only le / gt Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional MOV
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif // !AARCH64
+
+#ifndef AARCH64
+instruct cmovIL_imm_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegI dst, immI16 src) %{
+  match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src)));  // int cmove of an immI16 constant, flags operand flagsRegL_LTGE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );  // only lt / ge Bool tests
+
+  ins_cost(140);  // no size() is declared for this form
+  format %{ "MOVW$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // conditional MOVW of the constant
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovIL_imm_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegI dst, immI16 src) %{
+  match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src)));  // int cmove of an immI16 constant, flags operand flagsRegL_EQNE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );  // only eq / ne Bool tests
+
+  ins_cost(140);  // no size() is declared for this form
+  format %{ "MOVW$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // conditional MOVW of the constant
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovIL_imm_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegI dst, immI16 src) %{
+  match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src)));  // int cmove of an immI16 constant; note cmp is cmpOpL_commute here
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );  // only le / gt Bool tests
+
+  ins_cost(140);  // no size() is declared for this form
+  format %{ "MOVW$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // conditional MOVW of the constant
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovPL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegP dst, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src)));  // pointer cmove, register source, flags operand flagsRegL_LTGE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );  // only lt / ge Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional MOV
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegP dst, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src)));  // pointer cmove, register source, flags operand flagsRegL_EQNE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );  // only eq / ne Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional MOV
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegP dst, iRegP src) %{
+  match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src)));  // pointer cmove, register source; note cmp is cmpOpL_commute here
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );  // only le / gt Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional MOV
+  format %{ "MOV$cmp  $dst,$src" %}
+  ins_encode %{
+    __ mov($dst$$Register, $src$$Register, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct cmovPL_imm_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegP dst, immP0 src) %{
+  match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src)));  // pointer cmove of an immP0 constant (presumably the null pointer -- confirm)
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );  // only lt / ge Bool tests
+
+  ins_cost(140);  // no size() is declared for this form
+  format %{ "MOVW$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // conditional MOVW of the constant
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovPL_imm_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegP dst, immP0 src) %{
+  match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src)));  // pointer cmove of an immP0 constant (presumably the null pointer -- confirm)
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );  // only eq / ne Bool tests
+
+  ins_cost(140);  // no size() is declared for this form
+  format %{ "MOVW$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // conditional MOVW of the constant
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovPL_imm_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegP dst, immP0 src) %{
+  match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src)));  // pointer cmove of an immP0 constant; note cmp is cmpOpL_commute here
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );  // only le / gt Bool tests
+
+  ins_cost(140);  // no size() is declared for this form
+  format %{ "MOVW$cmp  $dst,$src" %}
+  ins_encode %{
+    __ movw($dst$$Register, $src$$constant, (AsmCondition)($cmp$$cmpcode));  // conditional MOVW of the constant
+  %}
+  ins_pipe(ialu_imm);
+%}
+
+instruct cmovFL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, regF dst, regF src) %{
+  match(Set dst (CMoveF (Binary cmp xcc) (Binary dst src)));  // float cmove, flags operand flagsRegL_LTGE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );  // only lt / ge Bool tests
+  ins_cost(150);
+  size(4);  // a single conditional FCPYS
+  format %{ "FCPYS$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpys($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovFL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, regF dst, regF src) %{
+  match(Set dst (CMoveF (Binary cmp xcc) (Binary dst src)));  // float cmove, flags operand flagsRegL_EQNE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );  // only eq / ne Bool tests
+  ins_cost(150);
+  size(4);  // a single conditional FCPYS
+  format %{ "FCPYS$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpys($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovFL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, regF dst, regF src) %{
+  match(Set dst (CMoveF (Binary cmp xcc) (Binary dst src)));  // float cmove; note cmp is cmpOpL_commute here
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );  // only le / gt Bool tests
+  ins_cost(150);
+  size(4);  // a single conditional FCPYS
+  format %{ "FCPYS$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpys($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovDL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, regD dst, regD src) %{
+  match(Set dst (CMoveD (Binary cmp xcc) (Binary dst src)));  // double cmove, flags operand flagsRegL_LTGE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );  // only lt / ge Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional FCPYD
+  format %{ "FCPYD$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpyd($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovDL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, regD dst, regD src) %{
+  match(Set dst (CMoveD (Binary cmp xcc) (Binary dst src)));  // double cmove, flags operand flagsRegL_EQNE
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );  // only eq / ne Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional FCPYD
+  format %{ "FCPYD$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpyd($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+
+instruct cmovDL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, regD dst, regD src) %{
+  match(Set dst (CMoveD (Binary cmp xcc) (Binary dst src)));  // double cmove; note cmp is cmpOpL_commute here
+  predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );  // only le / gt Bool tests
+
+  ins_cost(150);
+  size(4);  // a single conditional FCPYD
+  format %{ "FCPYD$cmp $dst,$src" %}
+  ins_encode %{
+    __ fcpyd($dst$$FloatRegister, $src$$FloatRegister, (AsmCondition)($cmp$$cmpcode));  // condition code is taken from $cmp
+  %}
+  ins_pipe(int_conditional_float_move);
+%}
+#endif // !AARCH64
+
+// ============================================================================
+// Safepoint Instruction
+#ifdef AARCH64
+instruct safePoint_poll(iRegP poll, flagsReg icc, RtempRegP tmp) %{
+  match(SafePoint poll);  // AArch64 variant of the safepoint poll
+  // The handler stub kills Rtemp
+  effect(USE poll, KILL tmp, KILL icc);
+
+  size(4);  // a single LDR
+  format %{ "LDR   ZR,[$poll]\t! Safepoint: poll for GC" %}
+  ins_encode %{
+    __ relocate(relocInfo::poll_type);  // attach a poll_type relocation at this position
+    __ ldr(ZR, Address($poll$$Register));  // destination is ZR, so no allocatable register receives the value
+  %}
+  ins_pipe(loadPollP);
+%}
+#else
+// rather than KILL R12, it would be better to use any reg as
+// TEMP. Can't do that at this point because it crashes the compiler
+instruct safePoint_poll(iRegP poll, R12RegI tmp, flagsReg icc) %{
+  match(SafePoint poll);  // 32-bit ARM variant of the safepoint poll
+  effect(USE poll, KILL tmp, KILL icc);  // tmp is the fixed R12RegI operand that receives the load
+
+  size(4);  // a single LDR
+  format %{ "LDR   $tmp,[$poll]\t! Safepoint: poll for GC" %}
+  ins_encode %{
+    __ relocate(relocInfo::poll_type);  // attach a poll_type relocation at this position
+    __ ldr($tmp$$Register, Address($poll$$Register));  // load from the address held in $poll
+  %}
+  ins_pipe(loadPollP);
+%}
+#endif
+
+
+// ============================================================================
+// Call Instructions
+// Call Java Static Instruction
+instruct CallStaticJavaDirect( method meth ) %{
+  match(CallStaticJava);
+  predicate(! ((CallStaticJavaNode*)n)->is_method_handle_invoke());  // method-handle invokes are matched by CallStaticJavaHandle instead
+  effect(USE meth);
+
+  ins_cost(CALL_COST);
+  format %{ "CALL,static ==> " %}
+  ins_encode( Java_Static_Call( meth ), call_epilog );  // named encodings; their definitions are outside this section
+  ins_pipe(simple_call);
+%}
+
+// Call Java Static Instruction (method handle version)
+instruct CallStaticJavaHandle( method meth ) %{
+  match(CallStaticJava);
+  predicate(((CallStaticJavaNode*)n)->is_method_handle_invoke());  // complement of the CallStaticJavaDirect predicate
+  effect(USE meth);
+  // FP is saved by all callees (for interpreter stack correction).
+  // We use it here for a similar purpose, in {preserve,restore}_FP. NOTE(review): the encodings used below are named preserve_SP / restore_SP.
+
+  ins_cost(CALL_COST);
+  format %{ "CALL,static/MethodHandle ==> " %}
+  ins_encode( preserve_SP, Java_Static_Call( meth ), restore_SP, call_epilog );  // the call is bracketed by preserve_SP / restore_SP
+  ins_pipe(simple_call);
+%}
+
+// Call Java Dynamic Instruction
+instruct CallDynamicJavaDirect( method meth ) %{
+  match(CallDynamicJava);  // no predicate: every CallDynamicJava node is eligible
+  effect(USE meth);
+
+  ins_cost(CALL_COST);
+  format %{ "MOV_OOP    (empty),R_R8\n\t"
+            "CALL,dynamic  ; NOP ==> " %}
+  ins_encode( Java_Dynamic_Call( meth ), call_epilog );  // named encodings; their definitions are outside this section
+  ins_pipe(call);
+%}
+
+// Call Runtime Instruction
+instruct CallRuntimeDirect(method meth) %{
+  match(CallRuntime);
+  effect(USE meth);
+  ins_cost(CALL_COST);
+  format %{ "CALL,runtime" %}
+#ifdef AARCH64
+  ins_encode( save_last_PC, Java_To_Runtime( meth ),  // AArch64 only: save_last_PC runs before the call encoding
+              call_epilog );
+#else
+  ins_encode( Java_To_Runtime( meth ),  // 32-bit ARM: no save_last_PC step
+              call_epilog );
+#endif
+  ins_pipe(simple_call);
+%}
+
+// Call runtime without safepoint - same as CallRuntime
+instruct CallLeafDirect(method meth) %{
+  match(CallLeaf);
+  effect(USE meth);
+  ins_cost(CALL_COST);
+  format %{ "CALL,runtime leaf" %}
+  // TODO: need save_last_PC here?
+  ins_encode( Java_To_Runtime( meth ),  // same encodings as the non-AArch64 CallRuntimeDirect
+              call_epilog );
+  ins_pipe(simple_call);
+%}
+
+// Call runtime without safepoint - same as CallLeaf
+instruct CallLeafNoFPDirect(method meth) %{
+  match(CallLeafNoFP);
+  effect(USE meth);
+  ins_cost(CALL_COST);
+  format %{ "CALL,runtime leaf nofp" %}
+  // TODO: need save_last_PC here?
+  ins_encode( Java_To_Runtime( meth ),  // same encodings as CallLeafDirect
+              call_epilog );
+  ins_pipe(simple_call);
+%}
+
+// Tail Call; Jump from runtime stub to Java code.
+// Also known as an 'interprocedural jump'.
+// Target of jump will eventually return to caller.
+// TailJump below removes the return address.
+instruct TailCalljmpInd(IPRegP jump_target, inline_cache_regP method_oop) %{
+  match(TailCall jump_target method_oop );  // method_oop is constrained to inline_cache_regP; the encoding does not touch it
+
+  ins_cost(CALL_COST);
+  format %{ "MOV    Rexception_pc, LR\n\t"
+            "jump   $jump_target  \t! $method_oop holds method oop" %}
+  ins_encode %{
+    __ mov(Rexception_pc, LR);   // this is used only to call
+                                 // StubRoutines::forward_exception_entry()
+                                 // which expects PC of exception in
+                                 // R5. FIXME?
+    __ jump($jump_target$$Register);  // register-indirect jump to the target
+  %}
+  ins_pipe(tail_call);
+%}
+
+
+// Return Instruction
+instruct Ret() %{
+  match(Return);  // no operands: matches every Return node
+
+  format %{ "ret LR" %}
+
+  ins_encode %{
+    __ ret(LR);  // return through the link register
+  %}
+
+  ins_pipe(br);
+%}
+
+
+// Tail Jump; remove the return address; jump to target.
+// TailCall above leaves the return address around.
+// TailJump is used in only one place, the rethrow_Java stub (fancy_jump=2).
+// ex_oop (Exception Oop) is needed in %o0 at the jump. As there would be a
+// "restore" before this instruction (in Epilogue), we need to materialize it
+// in %i0.  NOTE(review): %o0/%i0 and "restore" are SPARC terms; here ex_oop is constrained to RExceptionRegP.
+instruct tailjmpInd(IPRegP jump_target, RExceptionRegP ex_oop) %{
+  match( TailJump jump_target ex_oop );
+  ins_cost(CALL_COST);
+  format %{ "MOV    Rexception_pc, LR\n\t"
+            "jump   $jump_target \t! $ex_oop holds exc. oop" %}
+  ins_encode %{
+    __ mov(Rexception_pc, LR);  // copy LR into Rexception_pc before jumping
+    __ jump($jump_target$$Register);  // ex_oop is not referenced by the encoding; only its register class constrains it
+  %}
+  ins_pipe(tail_call);
+%}
+
+// Create exception oop: created by stack-crawling runtime code.
+// Created exception is now available to this handler, and is setup
+// just prior to jumping to this handler.  No code emitted.
+instruct CreateException( RExceptionRegP ex_oop )
+%{
+  match(Set ex_oop (CreateEx));  // binds the CreateEx result to the RExceptionRegP operand
+  ins_cost(0);
+
+  size(0);  // zero bytes: the encoding list below is empty
+  // use the following format syntax
+  format %{ "! exception oop is in Rexception_obj; no code emitted" %}
+  ins_encode();
+  ins_pipe(empty);
+%}
+
+
+// Rethrow exception:
+// The exception oop will come in the first argument position.
+// Then JUMP (not call) to the rethrow stub code.
+instruct RethrowException()
+%{
+  match(Rethrow);
+  ins_cost(CALL_COST);
+
+  // use the following format syntax
+  format %{ "b    rethrow_stub" %}
+  ins_encode %{
+    Register scratch = R1_tmp;  // scratch register handed to jump() below
+    assert_different_registers(scratch, c_rarg0, LR);  // scratch must not alias c_rarg0 or LR
+    __ jump(OptoRuntime::rethrow_stub(), relocInfo::runtime_call_type, scratch);  // jump (not call) to the rethrow stub
+  %}
+  ins_pipe(tail_call);
+%}
+
+
+// Die now
+instruct ShouldNotReachHere( )
+%{
+  match(Halt);  // emitted for Halt nodes
+  ins_cost(CALL_COST);
+
+  size(4);  // a single breakpoint instruction
+  // Use the following format syntax
+  format %{ "breakpoint   ; ShouldNotReachHere" %}
+  ins_encode %{
+    __ breakpoint();
+  %}
+  ins_pipe(tail_call);
+%}
+
+// ============================================================================
+// The 2nd slow-half of a subtype check.  Scan the subklass's 2ndary superklass
+// array for an instance of the superklass.  Set a hidden internal cache on a
+// hit (cache is checked with exposed code in gen_subtype_check()).  Return
+// not zero for a miss or zero for a hit.  The encoding ALSO sets flags.
+instruct partialSubtypeCheck( R0RegP index, R1RegP sub, R2RegP super, flagsRegP pcc, LRRegP lr ) %{
+  match(Set index (PartialSubtypeCheck sub super));  // operands are pinned to the R0RegP / R1RegP / R2RegP classes
+  effect( KILL pcc, KILL lr );  // flags and LR are declared killed across the stub call
+  ins_cost(DEFAULT_COST*10);
+  format %{ "CALL   PartialSubtypeCheck" %}
+  ins_encode %{
+    __ call(StubRoutines::Arm::partial_subtype_check(), relocInfo::runtime_call_type);  // all the work happens in the stub
+  %}
+  ins_pipe(partial_subtype_check_pipe);
+%}
+
+/* instruct partialSubtypeCheck_vs_zero( flagsRegP pcc, o1RegP sub, o2RegP super, immP0 zero, o0RegP idx, o7RegP o7 ) %{ */
+/*   match(Set pcc (CmpP (PartialSubtypeCheck sub super) zero)); */
+/*   ins_pipe(partial_subtype_check_pipe); */
+/* %} */
+
+
+// ============================================================================
+// inlined locking and unlocking
+
+#ifdef AARCH64
+instruct cmpFastLock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, iRegP scratch, iRegP scratch3 )
+#else
+instruct cmpFastLock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, iRegP scratch )
+#endif
+%{
+  match(Set pcc (FastLock object box));  // the FastLock outcome is delivered in the flags operand pcc
+
+#ifdef AARCH64
+  effect(TEMP scratch, TEMP scratch2, TEMP scratch3);  // AArch64 needs a third temporary
+#else
+  effect(TEMP scratch, TEMP scratch2);
+#endif
+  ins_cost(100);
+
+#ifdef AARCH64
+  format %{ "FASTLOCK  $object, $box; KILL $scratch, $scratch2, $scratch3" %}
+  ins_encode %{
+    __ fast_lock($object$$Register, $box$$Register, $scratch$$Register, $scratch2$$Register, $scratch3$$Register);  // five-register form
+  %}
+#else
+  format %{ "FASTLOCK  $object, $box; KILL $scratch, $scratch2" %}
+  ins_encode %{
+    __ fast_lock($object$$Register, $box$$Register, $scratch$$Register, $scratch2$$Register);  // four-register form
+  %}
+#endif
+  ins_pipe(long_memory_op);
+%}
+
+
+#ifdef AARCH64
+instruct cmpFastUnlock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, iRegP scratch, iRegP scratch3 ) %{
+  match(Set pcc (FastUnlock object box));  // AArch64 variant; the FastUnlock outcome is delivered in pcc
+  effect(TEMP scratch, TEMP scratch2, TEMP scratch3);  // three temporaries on AArch64
+  ins_cost(100);
+
+  format %{ "FASTUNLOCK  $object, $box; KILL $scratch, $scratch2, $scratch3" %}
+  ins_encode %{
+    __ fast_unlock($object$$Register, $box$$Register, $scratch$$Register, $scratch2$$Register, $scratch3$$Register);  // five-register form
+  %}
+  ins_pipe(long_memory_op);
+%}
+#else
+instruct cmpFastUnlock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, iRegP scratch ) %{
+  match(Set pcc (FastUnlock object box));  // 32-bit ARM variant; the FastUnlock outcome is delivered in pcc
+  effect(TEMP scratch, TEMP scratch2);  // two temporaries on 32-bit ARM
+  ins_cost(100);
+
+  format %{ "FASTUNLOCK  $object, $box; KILL $scratch, $scratch2" %}
+  ins_encode %{
+    __ fast_unlock($object$$Register, $box$$Register, $scratch$$Register, $scratch2$$Register);  // four-register form
+  %}
+  ins_pipe(long_memory_op);
+%}
+#endif
+
+#ifdef AARCH64
+// TODO: add version that takes immI cnt?
+instruct clear_array(iRegX cnt, iRegP base, iRegP ptr, iRegX temp, Universe dummy, flagsReg cpsr) %{
+  match(Set dummy (ClearArray cnt base));  // AArch64 variant: zeroes $cnt bytes at $base, working from the end downwards
+  effect(TEMP temp, TEMP ptr, KILL cpsr);  // SUBS/ADDS below overwrite the flags
+  ins_cost(300);
+  format %{
+      "        MOV    $temp,$cnt\n"
+      "        ADD    $ptr,$base,$cnt\n"
+      "        SUBS   $temp,$temp,16\t! Count down dword pair in bytes\n"
+      "        B.lt   done16\n"
+      "loop:   STP    ZR,ZR,[$ptr,-16]!\n"
+      "        SUBS   $temp,$temp,16\t! Count down dword pair in bytes\n"
+      "        B.ge   loop\t! Clearing loop\n"
+      "done16: ADDS   $temp,$temp,8\t! Room for 1 more long?\n"
+      "        B.lt   done\n"
+      "        STR    ZR,[$base+$temp]\n"
+      "done:"
+  %}
+  ins_encode %{
+    // TODO: preload?
+    __ mov($temp$$Register, $cnt$$Register);  // temp = bytes still to clear
+    __ add($ptr$$Register, $base$$Register, $cnt$$Register);  // ptr = one past the end of the region
+    Label loop, done, done16;
+    __ subs($temp$$Register, $temp$$Register, 16);
+    __ b(done16, lt);  // fewer than 16 bytes: skip the pair loop
+    __ bind(loop);
+    __ stp(ZR, ZR, Address($ptr$$Register, -16, pre_indexed));  // step ptr back 16 bytes and store two zero words
+    __ subs($temp$$Register, $temp$$Register, 16);
+    __ b(loop, ge);
+    __ bind(done16);
+    __ adds($temp$$Register, $temp$$Register, 8);  // temp is negative here; adding 8 tests for one remaining 8-byte word
+    __ b(done, lt);
+    // $temp should be 0 here
+    __ str(ZR, Address($base$$Register, $temp$$Register));  // clear the one remaining word at base+temp
+    __ bind(done);
+  %}
+  ins_pipe(long_memory_op);
+%}
+#else
+// Count and Base registers are fixed because the allocator cannot
+// kill unknown registers.  The encodings are generic.
+instruct clear_array(iRegX cnt, iRegP base, iRegI temp, iRegX zero, Universe dummy, flagsReg cpsr) %{
+  match(Set dummy (ClearArray cnt base));  // 32-bit ARM variant: zeroes $cnt bytes at $base, 4 bytes per iteration, from the end downwards
+  effect(TEMP temp, TEMP zero, KILL cpsr);  // SUBS below overwrites the flags
+  ins_cost(300);
+  format %{ "MOV    $zero,0\n"
+      "        MOV    $temp,$cnt\n"
+      "loop:   SUBS   $temp,$temp,4\t! Count down a dword of bytes\n"
+      "        STR.ge $zero,[$base+$temp]\t! delay slot\n"
+      "        B.gt   loop\t\t! Clearing loop\n" %}
+  ins_encode %{
+    __ mov($zero$$Register, 0);
+    __ mov($temp$$Register, $cnt$$Register);  // temp = byte offset of the next word to clear, plus 4
+    Label loop;
+    __ bind(loop);
+    __ subs($temp$$Register, $temp$$Register, 4);
+    __ str($zero$$Register, Address($base$$Register, $temp$$Register), ge);  // store only while the offset is still >= 0
+    __ b(loop, gt);  // offset 0 was the last word
+  %}
+  ins_pipe(long_memory_op);
+%}
+#endif
+
+#ifdef XXX
+// FIXME: Why R0/R1/R2/R3?
+instruct string_compare(R0RegP str1, R1RegP str2, R2RegI cnt1, R3RegI cnt2, iRegI result,
+                        iRegI tmp1, iRegI tmp2, flagsReg ccr) %{
+  predicate(!CompactStrings);  // NOTE(review): this instruct sits in an #ifdef XXX region, so it is built only if XXX is defined
+  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL ccr, TEMP tmp1, TEMP tmp2);  // all four inputs are consumed and clobbered
+  ins_cost(300);
+  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   // TEMP $tmp1, $tmp2" %}
+  ins_encode( enc_String_Compare(str1, str2, cnt1, cnt2, result, tmp1, tmp2) );  // named encoding defined outside this section
+
+  ins_pipe(long_memory_op);
+%}
+
+// FIXME: Why R0/R1/R2?
+instruct string_equals(R0RegP str1, R1RegP str2, R2RegI cnt, iRegI result, iRegI tmp1, iRegI tmp2,
+                       flagsReg ccr) %{
+  predicate(!CompactStrings);  // NOTE(review): this instruct sits in an #ifdef XXX region, so it is built only if XXX is defined
+  match(Set result (StrEquals (Binary str1 str2) cnt));
+  effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP tmp1, TEMP tmp2, TEMP result, KILL ccr);  // inputs are clobbered; result is also declared TEMP
+
+  ins_cost(300);
+  format %{ "String Equals $str1,$str2,$cnt -> $result   // TEMP $tmp1, $tmp2" %}
+  ins_encode( enc_String_Equals(str1, str2, cnt, result, tmp1, tmp2) );  // named encoding defined outside this section
+  ins_pipe(long_memory_op);
+%}
+
+// FIXME: Why R0/R1?
+instruct array_equals(R0RegP ary1, R1RegP ary2, iRegI tmp1, iRegI tmp2, iRegI tmp3, iRegI result,
+                      flagsReg ccr) %{
+  predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU);  // only the UU encoding; also inside the #ifdef XXX region
+  match(Set result (AryEq ary1 ary2));
+  effect(USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP result, KILL ccr);  // inputs are clobbered; result is also declared TEMP
+
+  ins_cost(300);
+  format %{ "Array Equals $ary1,$ary2 -> $result   // TEMP $tmp1,$tmp2,$tmp3" %}
+  ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, tmp3, result));  // named encoding defined outside this section
+  ins_pipe(long_memory_op);
+%}
+#endif
+
+//---------- Zeros Count Instructions ------------------------------------------
+
+instruct countLeadingZerosI(iRegI dst, iRegI src) %{
+  match(Set dst (CountLeadingZerosI src));  // shared by both ARM builds (no #ifdef around it)
+  size(4);  // a single CLZ_32
+  format %{ "CLZ_32 $dst,$src" %}
+  ins_encode %{
+    __ clz_32($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+#ifdef AARCH64
+instruct countLeadingZerosL(iRegI dst, iRegL src) %{
+  match(Set dst (CountLeadingZerosL src));  // AArch64 variant: the long source is a single register
+  size(4);  // a single CLZ
+  format %{ "CLZ $dst,$src" %}
+  ins_encode %{
+    __ clz($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+#else
+instruct countLeadingZerosL(iRegI dst, iRegL src, iRegI tmp, flagsReg ccr) %{
+  match(Set dst (CountLeadingZerosL src));  // 32-bit ARM variant: the long source is a register pair
+  effect(TEMP tmp, TEMP dst, KILL ccr);  // dst is written before the low half of src is read; TEQ overwrites the flags
+  size(16);  // four instructions
+  format %{ "CLZ    $dst,$src.hi\n\t"
+            "TEQ    $dst,32\n\t"
+            "CLZ.eq $tmp,$src.lo\n\t"
+            "ADD.eq $dst, $dst, $tmp\n\t" %}
+  ins_encode %{
+    __ clz($dst$$Register, $src$$Register->successor());  // count in the high half first
+    __ teq($dst$$Register, 32);  // EQ when the high-half count is 32
+    __ clz($tmp$$Register, $src$$Register, eq);  // only then count the low half ...
+    __ add($dst$$Register, $dst$$Register, $tmp$$Register, eq);  // ... and add it to the 32 already in dst
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif
+
+instruct countTrailingZerosI(iRegI dst, iRegI src, iRegI tmp) %{
+  match(Set dst (CountTrailingZerosI src));  // trailing zeros computed as leading zeros of the bit-reversed value
+  effect(TEMP tmp);
+  size(8);  // two instructions
+  format %{ "RBIT_32 $tmp, $src\n\t"
+            "CLZ_32  $dst,$tmp" %}
+  ins_encode %{
+    __ rbit_32($tmp$$Register, $src$$Register);  // reverse the bit order into tmp
+    __ clz_32($dst$$Register, $tmp$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+#ifdef AARCH64
+instruct countTrailingZerosL(iRegI dst, iRegL src, iRegL tmp) %{
+  match(Set dst (CountTrailingZerosL src));  // AArch64 variant: bit-reverse, then count leading zeros
+  effect(TEMP tmp);
+  size(8);  // two instructions
+  format %{ "RBIT $tmp, $src\n\t"
+            "CLZ  $dst,$tmp" %}
+  ins_encode %{
+    __ rbit($tmp$$Register, $src$$Register);  // reverse the bit order into tmp
+    __ clz($dst$$Register, $tmp$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+#else
+instruct countTrailingZerosL(iRegI dst, iRegL src, iRegI tmp, flagsReg ccr) %{
+  match(Set dst (CountTrailingZerosL src));  // 32-bit ARM variant: the long source is a register pair
+  effect(TEMP tmp, TEMP dst, KILL ccr);  // dst is written before the high half of src is read; TEQ overwrites the flags
+  size(24);  // six instructions
+  format %{ "RBIT   $tmp,$src.lo\n\t"
+            "CLZ    $dst,$tmp\n\t"
+            "TEQ    $dst,32\n\t"
+            "RBIT   $tmp,$src.hi\n\t"
+            "CLZ.eq $tmp,$tmp\n\t"
+            "ADD.eq $dst,$dst,$tmp\n\t" %}
+  ins_encode %{
+    __ rbit($tmp$$Register, $src$$Register);  // low half first
+    __ clz($dst$$Register, $tmp$$Register);
+    __ teq($dst$$Register, 32);  // EQ when the low-half count is 32
+    __ rbit($tmp$$Register, $src$$Register->successor());  // unconditional: bit-reverse the high half
+    __ clz($tmp$$Register, $tmp$$Register, eq);  // only then count in the high half ...
+    __ add($dst$$Register, $dst$$Register, $tmp$$Register, eq);  // ... and add it to the 32 already in dst
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif
+
+
+//---------- Population Count Instructions -------------------------------------
+
+#ifdef AARCH64
+instruct popCountI(iRegI dst, iRegI src, regD_low tmp) %{
+  predicate(UsePopCountInstruction);  // selected only when UsePopCountInstruction is set
+  match(Set dst (PopCountI src));  // AArch64 variant: the count is done in a floating-point/SIMD register
+  effect(TEMP tmp);
+  size(20);  // five instructions
+
+  format %{ "MOV_W      $dst,$src\n\t"
+            "FMOV_dx    $tmp,$dst\n\t"
+            "VCNT       $tmp.8B,$tmp.8B\n\t"
+            "ADDV       $tmp.B,$tmp.8B\n\t"
+            "FMRS       $dst,$tmp" %}
+
+  ins_encode %{
+    __ mov_w($dst$$Register, $src$$Register);  // NOTE(review): presumably clears the upper 32 bits before the FMOV_dx -- confirm
+    __ fmov_dx($tmp$$FloatRegister, $dst$$Register);  // move the value into tmp
+    int quad = 0;
+    int cnt_size = 0; // VELEM_SIZE_8
+    __ vcnt($tmp$$FloatRegister, $tmp$$FloatRegister, quad, cnt_size);  // printed as VCNT on .8B lanes
+    int add_size = 0; // VELEM_SIZE_8
+    __ addv($tmp$$FloatRegister, $tmp$$FloatRegister, quad, add_size);  // printed as ADDV reducing the .8B lanes
+    __ fmrs($dst$$Register, $tmp$$FloatRegister);  // move the total back to the integer register
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#else
+instruct popCountI(iRegI dst, iRegI src, regD_low tmp) %{
+  predicate(UsePopCountInstruction);  // selected only when UsePopCountInstruction is set
+  match(Set dst (PopCountI src));  // 32-bit ARM variant: the count is done in a floating-point/SIMD register
+  effect(TEMP tmp);
+
+  format %{ "FMSR       $tmp,$src\n\t"
+            "VCNT.8     $tmp,$tmp\n\t"
+            "VPADDL.U8  $tmp,$tmp\n\t"
+            "VPADDL.U16 $tmp,$tmp\n\t"
+            "FMRS       $dst,$tmp" %}
+  size(20);  // five instructions
+
+  ins_encode %{
+    __ fmsr($tmp$$FloatRegister, $src$$Register);  // move the value into tmp
+    __ vcnt($tmp$$FloatRegister, $tmp$$FloatRegister);  // printed as VCNT.8
+    __ vpaddl($tmp$$FloatRegister, $tmp$$FloatRegister, 8, 0);  // printed as VPADDL.U8
+    __ vpaddl($tmp$$FloatRegister, $tmp$$FloatRegister, 16, 0);  // printed as VPADDL.U16
+    __ fmrs($dst$$Register, $tmp$$FloatRegister);  // move the total back to the integer register
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#endif
+
+#ifdef AARCH64
+instruct popCountL(iRegI dst, iRegL src, regD tmp) %{
+  predicate(UsePopCountInstruction);  // selected only when UsePopCountInstruction is set
+  match(Set dst (PopCountL src));  // AArch64 variant; the result operand is an int register
+  effect(TEMP tmp);
+  size(16);  // four instructions
+
+  format %{ "FMOV_dx    $tmp,$src\n\t"
+            "VCNT       $tmp.8B,$tmp.8B\n\t"
+            "ADDV       $tmp.B,$tmp.8B\n\t"
+            "FMOV_ws    $dst,$tmp" %}
+
+  ins_encode %{
+    __ fmov_dx($tmp$$FloatRegister, $src$$Register);  // move the long into tmp
+    int quad = 0;
+    int cnt_size = 0;  // same value the AArch64 popCountI labels VELEM_SIZE_8
+    __ vcnt($tmp$$FloatRegister, $tmp$$FloatRegister, quad, cnt_size);  // printed as VCNT on .8B lanes
+    int add_size = 0;  // same value the AArch64 popCountI labels VELEM_SIZE_8
+    __ addv($tmp$$FloatRegister, $tmp$$FloatRegister, quad, add_size);  // printed as ADDV reducing the .8B lanes
+    __ fmov_ws($dst$$Register, $tmp$$FloatRegister);  // move the total back to the integer register
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#else
+// Note: Long.bitCount(long) returns an int, hence the iRegI dst for an iRegL src.
+instruct popCountL(iRegI dst, iRegL src, regD_low tmp) %{
+  predicate(UsePopCountInstruction);
+  match(Set dst (PopCountL src));
+  effect(TEMP tmp); // tmp is scratch for the whole sequence
+
+  format %{ "FMDRR       $tmp,$src.lo,$src.hi\n\t"
+            "VCNT.8      $tmp,$tmp\n\t"
+            "VPADDL.U8   $tmp,$tmp\n\t"
+            "VPADDL.U16  $tmp,$tmp\n\t"
+            "VPADDL.U32  $tmp,$tmp\n\t"
+            "FMRS        $dst,$tmp" %}
+
+  size(32); // NOTE(review): only six assembler calls follow; 32 looks like an over-estimate if each is 4 bytes - confirm
+
+  ins_encode %{
+    __ fmdrr($tmp$$FloatRegister, $src$$Register, $src$$Register->successor()); // src and its successor are printed as src.lo and src.hi
+    __ vcnt($tmp$$FloatRegister, $tmp$$FloatRegister); // printed as VCNT.8 in the format
+    __ vpaddl($tmp$$FloatRegister, $tmp$$FloatRegister, 8, 0); // printed as VPADDL.U8
+    __ vpaddl($tmp$$FloatRegister, $tmp$$FloatRegister, 16, 0); // printed as VPADDL.U16
+    __ vpaddl($tmp$$FloatRegister, $tmp$$FloatRegister, 32, 0); // printed as VPADDL.U32
+    __ fmrs($dst$$Register, $tmp$$FloatRegister); // move the result from tmp to dst
+  %}
+  ins_pipe(ialu_reg);
+%}
+#endif
+
+
+// ============================================================================
+//------------Bytes reverse--------------------------------------------------
+
+instruct bytes_reverse_int(iRegI dst, iRegI src) %{
+  match(Set dst (ReverseBytesI src));
+
+  size(4); // exactly one of the two calls below is compiled in, depending on AARCH64
+  format %{ "REV32 $dst,$src" %}
+  ins_encode %{
+#ifdef AARCH64
+    __ rev_w($dst$$Register, $src$$Register);
+    // high 32 bits zeroed, not sign extended
+#else
+    __ rev($dst$$Register, $src$$Register); // NOTE(review): the format prints REV32 but the calls are rev_w and rev - confirm the mnemonic
+#endif
+  %}
+  ins_pipe( iload_mem ); // FIXME
+%}
+
+instruct bytes_reverse_long(iRegL dst, iRegL src) %{
+  match(Set dst (ReverseBytesL src));
+#ifdef AARCH64
+//size(4);
+  format %{ "REV $dst,$src"  %}
+  ins_encode %{
+    __ rev($dst$$Register, $src$$Register); // one call reverses the whole register
+  %}
+  ins_pipe(ialu_reg_reg); // FIXME
+#else
+  effect(TEMP dst); // dst.lo is written before src.lo is read, so dst must not share registers with src
+  size(8); // two rev calls in the encoding below
+  format %{ "REV $dst.lo,$src.hi\n\t"
+            "REV $dst.hi,$src.lo" %}
+  ins_encode %{
+    __ rev($dst$$Register, $src$$Register->successor()); // low word of dst gets the reversed high word of src
+    __ rev($dst$$Register->successor(), $src$$Register); // high word of dst gets the reversed low word of src
+  %}
+  ins_pipe( iload_mem ); // FIXME
+#endif
+%}
+
+instruct bytes_reverse_unsigned_short(iRegI dst, iRegI src) %{
+  match(Set dst (ReverseBytesUS src));
+#ifdef AARCH64
+  size(4); // single rev16_w call below
+  format %{ "REV16_W $dst,$src" %}
+  ins_encode %{
+    __ rev16_w($dst$$Register, $src$$Register);
+    // high 32 bits zeroed
+  %}
+#else
+  size(4); // single rev16 call below
+  format %{ "REV16 $dst,$src" %}
+  ins_encode %{
+    __ rev16($dst$$Register, $src$$Register); // no sign-extension step, unlike bytes_reverse_short
+  %}
+#endif
+  ins_pipe( iload_mem ); // FIXME
+%}
+
+instruct bytes_reverse_short(iRegI dst, iRegI src) %{
+  match(Set dst (ReverseBytesS src));
+#ifdef AARCH64
+  size(8); // rev16_w followed by sign_extend
+  format %{ "REV16_W $dst,$src\n\t"
+            "SIGN_EXT16 $dst" %}
+  ins_encode %{
+    __ rev16_w($dst$$Register, $src$$Register);
+    __ sign_extend($dst$$Register, $dst$$Register, 16); // printed as SIGN_EXT16; operates on dst in place
+  %}
+#else
+  size(4); // single revsh call below
+  format %{ "REVSH $dst,$src" %}
+  ins_encode %{
+    __ revsh($dst$$Register, $src$$Register);
+  %}
+#endif
+  ins_pipe( iload_mem ); // FIXME
+%}
+
+
+// ====================VECTOR INSTRUCTIONS=====================================
+
+// Load an 8-byte vector of packed values from memory into a Double register (vecD)
+instruct loadV8(vecD dst, memoryD mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 8); // only 8-byte vector loads
+  match(Set dst (LoadVector mem));
+  ins_cost(MEMORY_REF_COST);
+  size(4); // single ldr_double call below
+  format %{ "FLDD   $mem,$dst\t! load vector (8 bytes)" %}
+  ins_encode %{
+    __ ldr_double($dst$$FloatRegister, $mem$$Address);
+  %}
+  ins_pipe(floadD_mem);
+%}
+
+// Load a 16-byte vector of packed values from memory into a Double register pair (vecX)
+instruct loadV16(vecX dst, memoryvld mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 16); // only 16-byte vector loads
+  match(Set dst (LoadVector mem));
+  ins_cost(MEMORY_REF_COST);
+  size(4); // single vld1 call below
+  format %{ "VLD1   $mem,$dst.Q\t! load vector (16 bytes)" %}
+  ins_encode %{
+    __ vld1($dst$$FloatRegister, $mem$$Address, MacroAssembler::VELEM_SIZE_16, 128); // 128 presumably is the vector width in bits (16 bytes) - confirm
+  %}
+  ins_pipe(floadD_mem); // FIXME
+%}
+
+// Store an 8-byte vector held in a Double register (vecD) to memory
+instruct storeV8(memoryD mem, vecD src) %{
+  predicate(n->as_StoreVector()->memory_size() == 8); // only 8-byte vector stores
+  match(Set mem (StoreVector mem src));
+  ins_cost(MEMORY_REF_COST);
+  size(4); // single str_double call below
+  format %{ "FSTD   $src,$mem\t! store vector (8 bytes)" %}
+  ins_encode %{
+    __ str_double($src$$FloatRegister, $mem$$Address);
+  %}
+  ins_pipe(fstoreD_mem_reg);
+%}
+
+// Store a 16-byte vector held in a Double register pair (vecX) to memory
+instruct storeV16(memoryvld mem, vecX src) %{
+  predicate(n->as_StoreVector()->memory_size() == 16); // only 16-byte vector stores
+  match(Set mem (StoreVector mem src));
+  ins_cost(MEMORY_REF_COST);
+  size(4); // single vst1 call below
+  format %{ "VST1   $src,$mem\t! store vector (16 bytes)" %}
+  ins_encode %{
+    __ vst1($src$$FloatRegister, $mem$$Address, MacroAssembler::VELEM_SIZE_16, 128); // 128 presumably is the vector width in bits (16 bytes) - confirm
+  %}
+  ins_pipe(fstoreD_mem_reg); // FIXME
+%}
+
+#ifndef AARCH64
+// Replicate scalar to packed byte values in Double register (core-register shift/or sequence)
+instruct Repl8B_reg(vecD dst, iRegI src, iRegI tmp) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB src));
+  ins_cost(DEFAULT_COST*4);
+  effect(TEMP tmp); // the byte pattern is built in tmp
+  size(16); // four assembler calls in the encoding below
+
+  // FIXME: could use PKH instruction instead?
+  format %{ "LSL      $tmp, $src, 24 \n\t"
+            "OR       $tmp, $tmp, ($tmp >> 8) \n\t"
+            "OR       $tmp, $tmp, ($tmp >> 16) \n\t"
+            "FMDRR    $dst,$tmp,$tmp\t" %}
+  ins_encode %{
+    __ mov($tmp$$Register, AsmOperand($src$$Register, lsl, 24)); // low byte of src lands in bits 31..24
+    __ orr($tmp$$Register, $tmp$$Register, AsmOperand($tmp$$Register, lsr, 8)); // bits 31..16 now hold the byte twice
+    __ orr($tmp$$Register, $tmp$$Register, AsmOperand($tmp$$Register, lsr, 16)); // all four bytes of tmp now hold the byte
+    __ fmdrr($dst$$FloatRegister, $tmp$$Register, $tmp$$Register); // tmp is passed for both source registers
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar to packed byte values in Double register (SIMD form, one vdupI call)
+instruct Repl8B_reg_simd(vecD dst, iRegI src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (ReplicateB src));
+  size(4); // single vdupI call below
+
+  format %{ "VDUP.8 $dst,$src\t" %}
+  ins_encode %{
+    bool quad = false; // vecD destination, so the non-quad form is requested
+    __ vdupI($dst$$FloatRegister, $src$$Register,
+             MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+// Replicate scalar to packed byte values in Double register pair (vecX)
+instruct Repl16B_reg(vecX dst, iRegI src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16); // NOTE(review): no has_simd() test here, unlike the other vdupI rules - confirm
+  match(Set dst (ReplicateB src));
+  size(4); // single vdupI call below
+
+  format %{ "VDUP.8 $dst.Q,$src\t" %}
+  ins_encode %{
+    bool quad = true; // vecX destination, printed as .Q in the format
+    __ vdupI($dst$$FloatRegister, $src$$Register,
+             MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+#ifndef AARCH64
+// Replicate scalar constant to packed byte values in Double register (via core register tmp)
+instruct Repl8B_immI(vecD dst, immI src, iRegI tmp) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB src));
+  ins_cost(DEFAULT_COST*2);
+  effect(TEMP tmp); // tmp receives the replicated constant, as the format shows
+  size(12);
+
+  format %{ "MOV      $tmp, Repl4($src))\n\t"
+            "FMDRR    $dst,$tmp,$tmp\t" %}
+  ins_encode( LdReplImmI(src, dst, tmp, (4), (1)) ); // (4),(1) presumably mean four copies of a 1-byte element, cf. Repl4 in the format - confirm against LdReplImmI
+  ins_pipe(loadConFD); // FIXME
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar constant (immU8 operand) to packed byte values in Double register
+// TODO: support negative constants with MVNI?
+instruct Repl8B_immU8(vecD dst, immU8 src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (ReplicateB src));
+  size(4); // single vmovI call below
+
+  format %{ "VMOV.U8  $dst,$src" %}
+  ins_encode %{
+    bool quad = false; // vecD destination, so the non-quad form is requested
+    __ vmovI($dst$$FloatRegister, $src$$constant,
+             MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+// Replicate scalar constant (immU8 operand) to packed byte values in Double register pair
+instruct Repl16B_immU8(vecX dst, immU8 src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16 && VM_Version::has_simd());
+  match(Set dst (ReplicateB src));
+  size(4); // single vmovI call below
+
+  format %{ "VMOV.U8  $dst.Q,$src" %}
+  ins_encode %{
+    bool quad = true; // vecX destination, printed as .Q in the format
+    __ vmovI($dst$$FloatRegister, $src$$constant,
+             MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+#ifndef AARCH64
+// Replicate scalar to packed short/char values into Double register (core-register shift/or sequence)
+instruct Repl4S_reg(vecD dst, iRegI src, iRegI tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS src));
+  ins_cost(DEFAULT_COST*3);
+  effect(TEMP tmp); // the halfword pattern is built in tmp
+  size(12); // three assembler calls in the encoding below
+
+  // FIXME: could use PKH instruction instead?
+  format %{ "LSL      $tmp, $src, 16 \n\t"
+            "OR       $tmp, $tmp, ($tmp >> 16) \n\t"
+            "FMDRR    $dst,$tmp,$tmp\t" %}
+  ins_encode %{
+    __ mov($tmp$$Register, AsmOperand($src$$Register, lsl, 16)); // low halfword of src lands in bits 31..16
+    __ orr($tmp$$Register, $tmp$$Register, AsmOperand($tmp$$Register, lsr, 16)); // both halfwords of tmp now hold the value
+    __ fmdrr($dst$$FloatRegister, $tmp$$Register, $tmp$$Register); // tmp is passed for both source registers
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar to packed short/char values in Double register (SIMD form)
+instruct Repl4S_reg_simd(vecD dst, iRegI src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (ReplicateS src));
+  size(4); // single vdupI call below
+
+  format %{ "VDUP.16 $dst,$src\t" %}
+  ins_encode %{
+    bool quad = false; // vecD destination, so the non-quad form is requested
+    __ vdupI($dst$$FloatRegister, $src$$Register,
+             MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+// Replicate scalar to packed short/char values in Double register pair
+instruct Repl8S_reg(vecX dst, iRegI src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16 && VM_Version::has_simd());
+  match(Set dst (ReplicateS src));
+  size(4); // single vdupI call below
+
+  format %{ "VDUP.16 $dst.Q,$src\t" %}
+  ins_encode %{
+    bool quad = true; // vecX destination, printed as .Q in the format
+    __ vdupI($dst$$FloatRegister, $src$$Register,
+             MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+
+#ifndef AARCH64
+// Replicate scalar constant to packed short/char values in Double register (via core register tmp)
+instruct Repl4S_immI(vecD dst, immI src, iRegP tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS src));
+  effect(TEMP tmp); // NOTE(review): tmp is iRegP here but iRegI in Repl8B_immI and Repl2I_immI - confirm this is intended
+  size(12);
+  ins_cost(DEFAULT_COST*4); // FIXME
+
+  format %{ "MOV      $tmp, Repl2($src))\n\t"
+            "FMDRR    $dst,$tmp,$tmp\t" %}
+  ins_encode( LdReplImmI(src, dst, tmp, (2), (2)) ); // (2),(2) presumably mean two copies of a 2-byte element, cf. Repl2 in the format - confirm against LdReplImmI
+  ins_pipe(loadConFD); // FIXME
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar constant (immU8 operand) to packed short/char values in Double register
+instruct Repl4S_immU8(vecD dst, immU8 src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (ReplicateS src));
+  size(4); // single vmovI call below
+
+  format %{ "VMOV.U16  $dst,$src" %}
+  ins_encode %{
+    bool quad = false; // vecD destination, so the non-quad form is requested
+    __ vmovI($dst$$FloatRegister, $src$$constant,
+             MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+// Replicate scalar constant (immU8 operand) to packed short/char values in Double register pair
+instruct Repl8S_immU8(vecX dst, immU8 src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16 && VM_Version::has_simd());
+  match(Set dst (ReplicateS src));
+  size(4); // single vmovI call below
+
+  format %{ "VMOV.U16  $dst.Q,$src" %}
+  ins_encode %{
+    bool quad = true; // vecX destination, printed as .Q in the format
+    __ vmovI($dst$$FloatRegister, $src$$constant,
+             MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+#ifndef AARCH64
+// Replicate scalar to packed int values in Double register (core-register move)
+instruct Repl2I_reg(vecD dst, iRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI src));
+  size(4); // single fmdrr call below
+
+  format %{ "FMDRR    $dst,$src,$src\t" %}
+  ins_encode %{
+    __ fmdrr($dst$$FloatRegister, $src$$Register, $src$$Register); // src is passed for both source registers
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+// Replicate scalar to packed int values in Double register pair (two core-register moves)
+instruct Repl4I_reg(vecX dst, iRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI src));
+  ins_cost(DEFAULT_COST*2);
+  size(8); // two fmdrr calls in the encoding below
+
+  format %{ "FMDRR    $dst.lo,$src,$src\n\t"
+            "FMDRR    $dst.hi,$src,$src" %}
+
+  ins_encode %{
+    __ fmdrr($dst$$FloatRegister, $src$$Register, $src$$Register); // printed as dst.lo
+    __ fmdrr($dst$$FloatRegister->successor()->successor(),
+             $src$$Register, $src$$Register); // two successor() steps reach the half printed as dst.hi
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar to packed int values in Double register (SIMD form)
+instruct Repl2I_reg_simd(vecD dst, iRegI src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (ReplicateI src));
+  size(4); // single vdupI call below
+
+  format %{ "VDUP.32 $dst.D,$src\t" %}
+  ins_encode %{
+    bool quad = false; // vecD destination, so the non-quad form is requested
+    __ vdupI($dst$$FloatRegister, $src$$Register,
+             MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+// Replicate scalar to packed int values in Double register pair (SIMD form)
+instruct Repl4I_reg_simd(vecX dst, iRegI src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16 && VM_Version::has_simd());
+  match(Set dst (ReplicateI src));
+  size(4); // single vdupI call below
+
+  format %{ "VDUP.32 $dst.Q,$src\t" %}
+  ins_encode %{
+    bool quad = true; // vecX destination, printed as .Q in the format
+    __ vdupI($dst$$FloatRegister, $src$$Register,
+             MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+
+#ifndef AARCH64
+// Replicate scalar constant (any immI operand) to packed int values in Double register
+instruct Repl2I_immI(vecD dst, immI src, iRegI tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI src));
+  effect(TEMP tmp); // tmp receives the constant, as the format shows
+  size(12);
+  ins_cost(DEFAULT_COST*4); // FIXME
+
+  format %{ "MOV      $tmp, Repl1($src))\n\t"
+            "FMDRR    $dst,$tmp,$tmp\t" %}
+  ins_encode( LdReplImmI(src, dst, tmp, (1), (4)) ); // (1),(4) presumably mean one copy of a 4-byte element, cf. Repl1 in the format - confirm against LdReplImmI
+  ins_pipe(loadConFD); // FIXME
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar constant (immU8 operand) to packed int values in Double register
+instruct Repl2I_immU8(vecD dst, immU8 src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (ReplicateI src));
+  size(4); // single vmovI call below
+
+  format %{ "VMOV.I32  $dst.D,$src" %}
+  ins_encode %{
+    bool quad = false; // vecD destination, so the non-quad form is requested
+    __ vmovI($dst$$FloatRegister, $src$$constant,
+             MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+// Replicate scalar constant (immU8 operand) to packed int values in Double register pair
+instruct Repl4I_immU8(vecX dst, immU8 src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16 && VM_Version::has_simd());
+  match(Set dst (ReplicateI src));
+  size(4); // single vmovI call below
+
+  format %{ "VMOV.I32  $dst.Q,$src" %}
+  ins_encode %{
+    bool quad = true; // vecX destination, printed as .Q in the format
+    __ vmovI($dst$$FloatRegister, $src$$constant,
+             MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe(loadConFD); // FIXME
+%}
+
+#ifdef AARCH64
+// Replicate scalar to packed long values in Double register pair (AArch64, one vdupI call)
+instruct Repl2L_reg(vecX dst, iRegL src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  size(4*1); // single vdupI call below
+  ins_cost(DEFAULT_COST*1); // FIXME
+
+  format %{ "VDUP.2D $dst.Q,$src\t" %}
+  ins_encode %{
+    bool quad = true; // vecX destination, printed as .Q in the format
+    __ vdupI($dst$$FloatRegister, $src$$Register,
+             MacroAssembler::VELEM_SIZE_64, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#else /* !AARCH64 */
+// Replicate scalar to packed long values in Double register pair (32-bit ARM, two fmdrr calls)
+instruct Repl2L_reg(vecX dst, iRegL src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  size(8); // two fmdrr calls in the encoding below
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FMDRR $dst.D,$src.lo,$src.hi\t\n"
+            "FMDRR $dst.D.next,$src.lo,$src.hi" %}
+  ins_encode %{
+    __ fmdrr($dst$$FloatRegister, $src$$Register, $src$$Register->successor()); // src and its successor are printed as src.lo and src.hi
+    __ fmdrr($dst$$FloatRegister->successor()->successor(),
+             $src$$Register, $src$$Register->successor()); // two successor() steps reach the register printed as dst.D.next
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+
+// Replicate scalar (float bits held in a core register) to packed float values in Double register
+instruct Repl2F_regI(vecD dst, iRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  size(4); // single fmdrr call below
+
+  format %{ "FMDRR    $dst.D,$src,$src\t" %}
+  ins_encode %{
+    __ fmdrr($dst$$FloatRegister, $src$$Register, $src$$Register); // src is passed for both source registers
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+// Replicate scalar to packed float values in Double register (expands to MoveF2I_reg_reg + Repl2F_regI)
+instruct Repl2F_reg_vfp(vecD dst, regF src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  size(4*2);
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  expand %{
+    iRegI tmp;
+    MoveF2I_reg_reg(tmp, src);
+    Repl2F_regI(dst,tmp);
+  %}
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar to packed float values in Double register (SIMD form, one vdupF call)
+instruct Repl2F_reg_simd(vecD dst, regF src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (ReplicateF src));
+  size(4); // single vdupF call below
+  ins_cost(DEFAULT_COST); // FIXME
+
+  format %{ "VDUP.32  $dst.D,$src.D\t" %}
+  ins_encode %{
+    bool quad = false; // vecD destination, so the non-quad form is requested
+    __ vdupF($dst$$FloatRegister, $src$$FloatRegister, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+#ifndef AARCH64
+// Replicate scalar to packed float values in Double register pair (via core register tmp)
+instruct Repl4F_reg(vecX dst, regF src, iRegI tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF src));
+  effect(TEMP tmp); // tmp carries the float bits between the register files
+  size(4*3); // three assembler calls in the encoding below
+  ins_cost(DEFAULT_COST*3); // FIXME
+
+  format %{ "FMRS     $tmp,$src\n\t"
+            "FMDRR    $dst.D,$tmp,$tmp\n\t"
+            "FMDRR    $dst.D.next,$tmp,$tmp\t" %}
+  ins_encode %{
+    __ fmrs($tmp$$Register, $src$$FloatRegister); // move src into tmp
+    __ fmdrr($dst$$FloatRegister, $tmp$$Register, $tmp$$Register); // tmp is passed for both source registers
+    __ fmdrr($dst$$FloatRegister->successor()->successor(),
+             $tmp$$Register, $tmp$$Register); // two successor() steps reach the register printed as dst.D.next
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar to packed float values in Double register pair (SIMD form, one vdupF call)
+instruct Repl4F_reg_simd(vecX dst, regF src) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16 && VM_Version::has_simd());
+  match(Set dst (ReplicateF src));
+  size(4); // single vdupF call below
+  ins_cost(DEFAULT_COST); // FIXME
+
+  format %{ "VDUP.32  $dst.Q,$src.D\t" %}
+  ins_encode %{
+    bool quad = true; // vecX destination, printed as .Q in the format
+    __ vdupF($dst$$FloatRegister, $src$$FloatRegister, quad);
+  %}
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+#ifndef AARCH64
+// Replicate scalar float constant (any immF operand) to packed float values in Double register
+instruct Repl2F_immI(vecD dst, immF src, iRegI tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  effect(TEMP tmp); // tmp receives the constant, as the format shows
+  size(12);
+  ins_cost(DEFAULT_COST*4); // FIXME
+
+  format %{ "MOV      $tmp, Repl1($src))\n\t"
+            "FMDRR    $dst,$tmp,$tmp\t" %}
+  ins_encode( LdReplImmF(src, dst, tmp) ); // encoding class defined elsewhere in the file
+  ins_pipe(loadConFD); // FIXME
+%}
+#endif /* !AARCH64 */
+
+// Replicate scalar to packed double float values in Double register pair
+instruct Repl2D_reg(vecX dst, regD src) %{
+#ifdef AARCH64
+  predicate(n->as_Vector()->length() == 2 && VM_Version::has_simd());
+  match(Set dst (ReplicateD src));
+  size(4*1); // single vdupD call below
+  ins_cost(DEFAULT_COST*1); // FIXME
+
+  format %{ "VDUP     $dst.2D,$src\t" %}
+  ins_encode %{
+    bool quad = true; // vecX destination
+    __ vdupD($dst$$FloatRegister, $src$$FloatRegister, quad);
+  %}
+#else
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateD src));
+  size(4*2); // two fcpyd calls in the encoding below
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FCPYD    $dst.D.a,$src\n\t"
+            "FCPYD    $dst.D.b,$src\t" %}
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister;
+    FloatRegister src = $src$$FloatRegister;
+    __ fcpyd(dsta, src); // first copy, printed as dst.D.a
+    FloatRegister dstb = dsta->successor()->successor(); // two successor() steps reach the register printed as dst.D.b
+    __ fcpyd(dstb, src);
+  %}
+#endif
+  ins_pipe(ialu_reg); // FIXME
+%}
+
+// ====================VECTOR ARITHMETIC=======================================
+
+// --------------------------------- ADD --------------------------------------
+
+// Bytes vector add: 8 bytes in a Double register
+instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVB src1 src2));
+  format %{ "VADD.I8 $dst,$src1,$src2\t! add packed8B" %}
+  size(4); // single vaddI call below
+  ins_encode %{
+    bool quad = false; // vecD operands, so the non-quad form is requested
+    __ vaddI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 16); // 16 bytes in a Double register pair
+  match(Set dst (AddVB src1 src2));
+  size(4); // single vaddI call below
+  format %{ "VADD.I8 $dst.Q,$src1.Q,$src2.Q\t! add packed16B" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vaddI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Shorts/Chars vector add: 4 elements in a Double register
+instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVS src1 src2));
+  size(4); // single vaddI call below
+  format %{ "VADD.I16 $dst,$src1,$src2\t! add packed4S" %}
+  ins_encode %{
+    bool quad = false; // vecD operands, so the non-quad form is requested
+    __ vaddI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 8); // 8 shorts/chars in a Double register pair
+  match(Set dst (AddVS src1 src2));
+  size(4); // single vaddI call below
+  format %{ "VADD.I16 $dst.Q,$src1.Q,$src2.Q\t! add packed8S" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vaddI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Integers vector add: 2 ints in a Double register
+instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVI src1 src2));
+  size(4); // single vaddI call below
+  format %{ "VADD.I32 $dst.D,$src1.D,$src2.D\t! add packed2I" %}
+  ins_encode %{
+    bool quad = false; // vecD operands, so the non-quad form is requested
+    __ vaddI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 4); // 4 ints in a Double register pair
+  match(Set dst (AddVI src1 src2));
+  size(4); // single vaddI call below
+  format %{ "VADD.I32 $dst.Q,$src1.Q,$src2.Q\t! add packed4I" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vaddI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Longs vector add: 2 longs in a Double register pair
+instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVL src1 src2));
+  size(4); // single vaddI call below
+  format %{ "VADD.I64 $dst.Q,$src1.Q,$src2.Q\t! add packed2L" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vaddI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_64, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Floats vector add: 2 floats in a Double register, only when simd_math_is_compliant()
+instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 2 && VM_Version::simd_math_is_compliant());
+  match(Set dst (AddVF src1 src2));
+  size(4); // single vaddF call below
+  format %{ "VADD.F32 $dst,$src1,$src2\t! add packed2F" %}
+  ins_encode %{
+    bool quad = false; // vecD operands, so the non-quad form is requested
+    __ vaddF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F32, quad);
+  %}
+  ins_pipe( faddD_reg_reg ); // FIXME
+%}
+
+#ifndef AARCH64
+instruct vadd2F_reg_vfp(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 2 && !VM_Version::simd_math_is_compliant()); // complement of the vadd2F_reg predicate
+  match(Set dst (AddVF src1 src2));
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  size(4*2); // two add_float calls in the encoding below
+  format %{ "FADDS  $dst.a,$src1.a,$src2.a\n\t"
+            "FADDS  $dst.b,$src1.b,$src2.b" %}
+  ins_encode %{
+    __ add_float($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); // element printed as .a
+    __ add_float($dst$$FloatRegister->successor(),
+             $src1$$FloatRegister->successor(),
+             $src2$$FloatRegister->successor()); // element printed as .b, one successor() step on
+  %}
+
+  ins_pipe(faddF_reg_reg); // FIXME
+%}
+#endif
+
+instruct vadd4F_reg_simd(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::simd_math_is_compliant()); // 4 floats in a Double register pair
+  match(Set dst (AddVF src1 src2));
+  size(4); // single vaddF call below
+  format %{ "VADD.F32 $dst.Q,$src1.Q,$src2.Q\t! add packed4F" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vaddF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F32, quad);
+  %}
+  ins_pipe( faddD_reg_reg ); // FIXME
+%}
+
+#ifdef AARCH64
+instruct vadd2D_reg_simd(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 2 && VM_Version::simd_math_is_compliant()); // 2 doubles in a Double register pair
+  match(Set dst (AddVD src1 src2));
+  size(4); // single vaddF call below
+  format %{ "VADD.F64 $dst.Q,$src1.Q,$src2.Q\t! add packed2D" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vaddF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F64, quad);
+  %}
+  ins_pipe( faddD_reg_reg ); // FIXME
+%}
+#else
+instruct vadd4F_reg_vfp(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::simd_math_is_compliant()); // complement of the vadd4F_reg_simd predicate
+  match(Set dst (AddVF src1 src2));
+  size(4*4); // four add_float calls in the encoding below
+  ins_cost(DEFAULT_COST*4); // FIXME
+
+  format %{ "FADDS  $dst.a,$src1.a,$src2.a\n\t"
+            "FADDS  $dst.b,$src1.b,$src2.b\n\t"
+            "FADDS  $dst.c,$src1.c,$src2.c\n\t"
+            "FADDS  $dst.d,$src1.d,$src2.d" %}
+
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister;
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ add_float(dsta, src1a, src2a); // element .a
+    FloatRegister dstb = dsta->successor(); // each later element is one successor() step further on
+    FloatRegister src1b = src1a->successor();
+    FloatRegister src2b = src2a->successor();
+    __ add_float(dstb, src1b, src2b); // element .b
+    FloatRegister dstc = dstb->successor();
+    FloatRegister src1c = src1b->successor();
+    FloatRegister src2c = src2b->successor();
+    __ add_float(dstc, src1c, src2c); // element .c
+    FloatRegister dstd = dstc->successor();
+    FloatRegister src1d = src1c->successor();
+    FloatRegister src2d = src2c->successor();
+    __ add_float(dstd, src1d, src2d); // element .d
+  %}
+
+  ins_pipe(faddF_reg_reg); // FIXME
+%}
+
+instruct vadd2D_reg_vfp(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 2); // 2 doubles in a Double register pair
+  match(Set dst (AddVD src1 src2));
+  size(4*2); // two add_double calls in the encoding below
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FADDD  $dst.a,$src1.a,$src2.a\n\t"
+            "FADDD  $dst.b,$src1.b,$src2.b" %}
+
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister;
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ add_double(dsta, src1a, src2a); // element .a
+    FloatRegister dstb = dsta->successor()->successor(); // two successor() steps reach the second double
+    FloatRegister src1b = src1a->successor()->successor();
+    FloatRegister src2b = src2a->successor()->successor();
+    __ add_double(dstb, src1b, src2b); // element .b
+  %}
+
+  ins_pipe(faddF_reg_reg); // FIXME
+%}
+#endif
+
+
+// Bytes vector sub: 8 bytes in a Double register
+instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (SubVB src1 src2));
+  size(4); // single vsubI call below
+  format %{ "VSUB.I8 $dst,$src1,$src2\t! sub packed8B" %}
+  ins_encode %{
+    bool quad = false; // vecD operands, so the non-quad form is requested
+    __ vsubI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 16); // 16 bytes in a Double register pair
+  match(Set dst (SubVB src1 src2));
+  size(4); // single vsubI call below
+  format %{ "VSUB.I8 $dst.Q,$src1.Q,$src2.Q\t! sub packed16B" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vsubI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Shorts/Chars vector sub: 4 elements in a Double register
+instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (SubVS src1 src2));
+  size(4); // single vsubI call below
+  format %{ "VSUB.I16 $dst,$src1,$src2\t! sub packed4S" %}
+  ins_encode %{
+    bool quad = false; // vecD operands, so the non-quad form is requested
+    __ vsubI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsub16S_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 8); // NOTE(review): matches 8 shorts (format says packed8S) although the name says 16S; the add counterpart is vadd8S_reg
+  match(Set dst (SubVS src1 src2));
+  size(4); // single vsubI call below
+  format %{ "VSUB.I16 $dst.Q,$src1.Q,$src2.Q\t! sub packed8S" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vsubI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Integers vector sub: 2 ints in a Double register
+instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVI src1 src2));
+  size(4); // single vsubI call below
+  format %{ "VSUB.I32 $dst,$src1,$src2\t! sub packed2I" %}
+  ins_encode %{
+    bool quad = false; // vecD operands, so the non-quad form is requested
+    __ vsubI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 4); // 4 ints in a Double register pair
+  match(Set dst (SubVI src1 src2));
+  size(4); // single vsubI call below
+  format %{ "VSUB.I32 $dst.Q,$src1.Q,$src2.Q\t! sub packed4I" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vsubI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Longs vector sub: 2 longs in a Double register pair
+instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVL src1 src2));
+  size(4); // single vsubI call below
+  format %{ "VSUB.I64 $dst.Q,$src1.Q,$src2.Q\t! sub packed2L" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vsubI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_64, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Floats vector sub: 2 floats in a Double register, only when simd_math_is_compliant()
+instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 2 && VM_Version::simd_math_is_compliant());
+  match(Set dst (SubVF src1 src2));
+  size(4); // single vsubF call below
+  format %{ "VSUB.F32 $dst,$src1,$src2\t! sub packed2F" %}
+  ins_encode %{
+    bool quad = false; // vecD operands, so the non-quad form is requested
+    __ vsubF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F32, quad);
+  %}
+  ins_pipe( faddF_reg_reg ); // FIXME
+%}
+
+#ifndef AARCH64
+instruct vsub2F_reg_vfp(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length() == 2 && !VM_Version::simd_math_is_compliant()); // complement of the vsub2F_reg predicate
+  match(Set dst (SubVF src1 src2));
+  size(4*2); // two sub_float calls in the encoding below
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FSUBS  $dst.a,$src1.a,$src2.a\n\t"
+            "FSUBS  $dst.b,$src1.b,$src2.b" %}
+
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister;
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ sub_float(dsta, src1a, src2a); // element .a
+    FloatRegister dstb = dsta->successor(); // second element is one successor() step on
+    FloatRegister src1b = src1a->successor();
+    FloatRegister src2b = src2a->successor();
+    __ sub_float(dstb, src1b, src2b); // element .b
+  %}
+
+  ins_pipe(faddF_reg_reg); // FIXME
+%}
+#endif
+
+
+instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length() == 4 && VM_Version::simd_math_is_compliant()); // 4 floats in a Double register pair
+  match(Set dst (SubVF src1 src2));
+  size(4); // single vsubF call below
+  format %{ "VSUB.F32 $dst.Q,$src1.Q,$src2.Q\t! sub packed4F" %}
+  ins_encode %{
+    bool quad = true; // vecX operands, printed as .Q in the format
+    __ vsubF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F32, quad);
+  %}
+  ins_pipe( faddF_reg_reg ); // FIXME
+%}
+
+#ifdef AARCH64
+instruct vsub2D_reg_simd(vecX dst, vecX src1, vecX src2) %{ // packed 2 x double subtract: dst = src1 - src2, single VSUB.F64
+  predicate(n->as_Vector()->length() == 2 && VM_Version::simd_math_is_compliant());
+  match(Set dst (SubVD src1 src2));
+  size(4);
+  format %{ "VSUB.F64 $dst.Q,$src1.Q,$src2.Q\t! sub packed2D" %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on (format prints .Q)
+    __ vsubF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F64, quad);
+  %}
+  ins_pipe( faddD_reg_reg ); // FIXME
+%}
+#else
+instruct vsub4F_reg_vfp(vecX dst, vecX src1, vecX src2) %{ // packed 4 x float subtract as four scalar sub_float calls; selected when !simd_math_is_compliant()
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::simd_math_is_compliant());
+  match(Set dst (SubVF src1 src2));
+  size(4*4);
+  ins_cost(DEFAULT_COST*4); // FIXME
+
+  format %{ "FSUBS  $dst.a,$src1.a,$src2.a\n\t"
+            "FSUBS  $dst.b,$src1.b,$src2.b\n\t"
+            "FSUBS  $dst.c,$src1.c,$src2.c\n\t"
+            "FSUBS  $dst.d,$src1.d,$src2.d" %}
+
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister; // lane a: the operands' own registers
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ sub_float(dsta, src1a, src2a);
+    FloatRegister dstb = dsta->successor(); // lanes b, c, d: each is the successor of the previous lane's register
+    FloatRegister src1b = src1a->successor();
+    FloatRegister src2b = src2a->successor();
+    __ sub_float(dstb, src1b, src2b);
+    FloatRegister dstc = dstb->successor();
+    FloatRegister src1c = src1b->successor();
+    FloatRegister src2c = src2b->successor();
+    __ sub_float(dstc, src1c, src2c);
+    FloatRegister dstd = dstc->successor();
+    FloatRegister src1d = src1c->successor();
+    FloatRegister src2d = src2c->successor();
+    __ sub_float(dstd, src1d, src2d);
+  %}
+
+  ins_pipe(faddF_reg_reg); // FIXME
+%}
+
+instruct vsub2D_reg_vfp(vecX dst, vecX src1, vecX src2) %{ // packed 2 x double subtract as two scalar sub_double calls
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (SubVD src1 src2));
+  size(4*2);
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FSUBD  $dst.a,$src1.a,$src2.a\n\t"
+            "FSUBD  $dst.b,$src1.b,$src2.b" %}
+
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister; // lane a: the operands' own registers
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ sub_double(dsta, src1a, src2a);
+    FloatRegister dstb = dsta->successor()->successor(); // lane b: two successor steps on; presumably a double spans two register slots - TODO confirm
+    FloatRegister src1b = src1a->successor()->successor();
+    FloatRegister src2b = src2a->successor()->successor();
+    __ sub_double(dstb, src1b, src2b);
+  %}
+
+  ins_pipe(faddF_reg_reg); // FIXME: float pipe class on a double subtract (vsub2D_reg_simd uses faddD_reg_reg)
+%}
+#endif
+
+// Shorts/Chars vector mul
+instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{ // packed 4 x short multiply, single VMUL.I16
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (MulVS src1 src2));
+  size(4);
+  format %{ "VMUL.I16 $dst,$src1,$src2\t! mul packed4S" %}
+  ins_encode %{
+    __ vmulI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_16, 0); // last argument 0: presumably the quad flag, off for vecD - TODO confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{ // packed 8 x short multiply, single VMUL.I16
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (MulVS src1 src2));
+  size(4);
+  format %{ "VMUL.I16 $dst.Q,$src1.Q,$src2.Q\t! mul packed8S" %}
+  ins_encode %{
+    __ vmulI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_16, 1); // last argument 1: presumably the quad flag, on for vecX - TODO confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Integers vector mul
+instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{ // packed 2 x int multiply, single VMUL.I32
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVI src1 src2));
+  size(4);
+  format %{ "VMUL.I32 $dst,$src1,$src2\t! mul packed2I" %}
+  ins_encode %{
+    __ vmulI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_32, 0); // last argument 0: presumably the quad flag, off for vecD - TODO confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{ // packed 4 x int multiply, single VMUL.I32
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (MulVI src1 src2));
+  size(4);
+  format %{ "VMUL.I32 $dst.Q,$src1.Q,$src2.Q\t! mul packed4I" %}
+  ins_encode %{
+    __ vmulI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VELEM_SIZE_32, 1); // last argument 1: presumably the quad flag, on for vecX - TODO confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Floats vector mul
+instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{ // packed 2 x float multiply, single VMUL.F32; only when simd_math_is_compliant()
+  predicate(n->as_Vector()->length() == 2 && VM_Version::simd_math_is_compliant());
+  match(Set dst (MulVF src1 src2));
+  size(4);
+  format %{ "VMUL.F32 $dst,$src1,$src2\t! mul packed2F" %}
+  ins_encode %{
+    __ vmulF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F32, 0); // last argument 0: presumably the quad flag, off for vecD - TODO confirm
+  %}
+  ins_pipe( fmulF_reg_reg ); // FIXME
+%}
+
+#ifndef AARCH64
+instruct vmul2F_reg_vfp(vecD dst, vecD src1, vecD src2) %{ // packed 2 x float multiply as two scalar mul_float calls; selected when !simd_math_is_compliant()
+  predicate(n->as_Vector()->length() == 2 && !VM_Version::simd_math_is_compliant());
+  match(Set dst (MulVF src1 src2));
+  size(4*2);
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FMULS  $dst.a,$src1.a,$src2.a\n\t"
+            "FMULS  $dst.b,$src1.b,$src2.b" %}
+  ins_encode %{
+    __ mul_float($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); // lane a
+    __ mul_float($dst$$FloatRegister->successor(), // lane b: successor of each lane-a register
+             $src1$$FloatRegister->successor(),
+             $src2$$FloatRegister->successor());
+  %}
+
+  ins_pipe(fmulF_reg_reg); // FIXME
+%}
+#endif
+
+instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{ // packed 4 x float multiply, single VMUL.F32; only when simd_math_is_compliant()
+  predicate(n->as_Vector()->length() == 4 && VM_Version::simd_math_is_compliant());
+  match(Set dst (MulVF src1 src2));
+  size(4);
+  format %{ "VMUL.F32 $dst.Q,$src1.Q,$src2.Q\t! mul packed4F" %}
+  ins_encode %{
+    __ vmulF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F32, 1); // last argument 1: presumably the quad flag, on for vecX - TODO confirm
+  %}
+  ins_pipe( fmulF_reg_reg ); // FIXME
+%}
+
+#ifndef AARCH64
+instruct vmul4F_reg_vfp(vecX dst, vecX src1, vecX src2) %{ // packed 4 x float multiply as four scalar mul_float calls; selected when !simd_math_is_compliant()
+  predicate(n->as_Vector()->length() == 4 && !VM_Version::simd_math_is_compliant());
+  match(Set dst (MulVF src1 src2));
+  size(4*4);
+  ins_cost(DEFAULT_COST*4); // FIXME
+
+  format %{ "FMULS  $dst.a,$src1.a,$src2.a\n\t"
+            "FMULS  $dst.b,$src1.b,$src2.b\n\t"
+            "FMULS  $dst.c,$src1.c,$src2.c\n\t"
+            "FMULS  $dst.d,$src1.d,$src2.d" %}
+
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister; // lane a: the operands' own registers
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ mul_float(dsta, src1a, src2a);
+    FloatRegister dstb = dsta->successor(); // lanes b, c, d: each is the successor of the previous lane's register
+    FloatRegister src1b = src1a->successor();
+    FloatRegister src2b = src2a->successor();
+    __ mul_float(dstb, src1b, src2b);
+    FloatRegister dstc = dstb->successor();
+    FloatRegister src1c = src1b->successor();
+    FloatRegister src2c = src2b->successor();
+    __ mul_float(dstc, src1c, src2c);
+    FloatRegister dstd = dstc->successor();
+    FloatRegister src1d = src1c->successor();
+    FloatRegister src2d = src2c->successor();
+    __ mul_float(dstd, src1d, src2d);
+  %}
+
+  ins_pipe(fmulF_reg_reg); // FIXME
+%}
+#endif
+
+#ifdef AARCH64
+instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{ // packed 2 x double multiply via a single vmulF call; requires has_simd()
+  predicate(n->as_Vector()->length() == 2 && VM_Version::has_simd());
+  match(Set dst (MulVD src1 src2));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+
+  format %{ "FMUL.2D $dst,$src1,$src2\t! double[2]" %}
+  ins_encode %{
+    int quad = 1; // passed as vmulF's quad argument
+    __ vmulF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F64, quad);
+  %}
+
+  ins_pipe(fdivF_reg_reg); // FIXME: float-divide pipe class on a double multiply (vmul2D_reg_vfp uses fmulD_reg_reg)
+%}
+#else
+instruct vmul2D_reg_vfp(vecX dst, vecX src1, vecX src2) %{ // packed 2 x double multiply as two scalar mul_double calls
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (MulVD src1 src2));
+  size(4*2);
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FMULD  $dst.D.a,$src1.D.a,$src2.D.a\n\t"
+            "FMULD  $dst.D.b,$src1.D.b,$src2.D.b" %}
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister; // lane a: the operands' own registers
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ mul_double(dsta, src1a, src2a);
+    FloatRegister dstb = dsta->successor()->successor(); // lane b: two successor steps on; presumably a double spans two register slots - TODO confirm
+    FloatRegister src1b = src1a->successor()->successor();
+    FloatRegister src2b = src2a->successor()->successor();
+    __ mul_double(dstb, src1b, src2b);
+  %}
+
+  ins_pipe(fmulD_reg_reg); // FIXME
+%}
+#endif
+
+
+// Floats vector div
+instruct vdiv2F_reg_vfp(vecD dst, vecD src1, vecD src2) %{ // packed 2 x float divide; body differs per AARCH64 build flag
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVF src1 src2));
+#ifdef AARCH64
+  size(4*1); // AARCH64 build: a single vdivF call
+  ins_cost(DEFAULT_COST*1); // FIXME
+
+  format %{ "FDIV.2S $dst,$src1,$src2\t! float[2]" %}
+  ins_encode %{
+    int quad = 0; // vecD operands: quad argument is off
+    __ vdivF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F32, quad);
+  %}
+
+  ins_pipe(fdivF_reg_reg); // FIXME
+#else
+  size(4*2); // non-AARCH64 build: two scalar div_float calls, one per lane
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FDIVS  $dst.a,$src1.a,$src2.a\n\t"
+            "FDIVS  $dst.b,$src1.b,$src2.b" %}
+  ins_encode %{
+    __ div_float($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); // lane a
+    __ div_float($dst$$FloatRegister->successor(), // lane b: successor of each lane-a register
+             $src1$$FloatRegister->successor(),
+             $src2$$FloatRegister->successor());
+  %}
+
+  ins_pipe(fdivF_reg_reg); // FIXME
+#endif
+%}
+
+instruct vdiv4F_reg_vfp(vecX dst, vecX src1, vecX src2) %{ // packed 4 x float divide; body differs per AARCH64 build flag
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (DivVF src1 src2));
+#ifdef AARCH64
+  size(4*1); // AARCH64 build: a single vdivF call
+  ins_cost(DEFAULT_COST*1); // FIXME
+
+  format %{ "FDIV.4S $dst,$src1,$src2\t! float[4]" %}
+  ins_encode %{
+    int quad = 1; // vecX operands: quad argument is on
+    __ vdivF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F32, quad);
+  %}
+
+  ins_pipe(fdivF_reg_reg); // FIXME
+#else
+  size(4*4); // non-AARCH64 build: four scalar div_float calls, one per lane
+  ins_cost(DEFAULT_COST*4); // FIXME
+
+  format %{ "FDIVS  $dst.a,$src1.a,$src2.a\n\t"
+            "FDIVS  $dst.b,$src1.b,$src2.b\n\t"
+            "FDIVS  $dst.c,$src1.c,$src2.c\n\t"
+            "FDIVS  $dst.d,$src1.d,$src2.d" %}
+
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister; // lane a: the operands' own registers
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ div_float(dsta, src1a, src2a);
+    FloatRegister dstb = dsta->successor(); // lanes b, c, d: each is the successor of the previous lane's register
+    FloatRegister src1b = src1a->successor();
+    FloatRegister src2b = src2a->successor();
+    __ div_float(dstb, src1b, src2b);
+    FloatRegister dstc = dstb->successor();
+    FloatRegister src1c = src1b->successor();
+    FloatRegister src2c = src2b->successor();
+    __ div_float(dstc, src1c, src2c);
+    FloatRegister dstd = dstc->successor();
+    FloatRegister src1d = src1c->successor();
+    FloatRegister src2d = src2c->successor();
+    __ div_float(dstd, src1d, src2d);
+  %}
+
+  ins_pipe(fdivF_reg_reg); // FIXME
+#endif
+%}
+
+#ifdef AARCH64
+instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{ // packed 2 x double divide via a single vdivF call; requires has_simd()
+  predicate(n->as_Vector()->length() == 2 && VM_Version::has_simd());
+  match(Set dst (DivVD src1 src2));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+
+  format %{ "FDIV.2D $dst,$src1,$src2\t! double[2]" %}
+  ins_encode %{
+    int quad = 1; // passed as vdivF's quad argument
+    __ vdivF($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             MacroAssembler::VFA_SIZE_F64, quad);
+  %}
+
+  ins_pipe(fdivF_reg_reg); // FIXME: float pipe class on a double divide (vdiv2D_reg_vfp uses fdivD_reg_reg)
+%}
+#else
+instruct vdiv2D_reg_vfp(vecX dst, vecX src1, vecX src2) %{ // packed 2 x double divide as two scalar div_double calls
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (DivVD src1 src2));
+  size(4*2);
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "FDIVD  $dst.D.a,$src1.D.a,$src2.D.a\n\t"
+            "FDIVD  $dst.D.b,$src1.D.b,$src2.D.b" %}
+  ins_encode %{
+    FloatRegister dsta = $dst$$FloatRegister; // lane a: the operands' own registers
+    FloatRegister src1a = $src1$$FloatRegister;
+    FloatRegister src2a = $src2$$FloatRegister;
+    __ div_double(dsta, src1a, src2a);
+    FloatRegister dstb = dsta->successor()->successor(); // lane b: two successor steps on; presumably a double spans two register slots - TODO confirm
+    FloatRegister src1b = src1a->successor()->successor();
+    FloatRegister src2b = src2a->successor()->successor();
+    __ div_double(dstb, src1b, src2b);
+  %}
+
+  ins_pipe(fdivD_reg_reg); // FIXME
+%}
+#endif
+
+// --------------------------------- NEG --------------------------------------
+
+instruct vneg8B_reg(vecD dst, vecD src) %{ // byte-wise negate of an 8-byte vector; has no match rule, only an effect()
+  predicate(n->as_Vector()->length_in_bytes() == 8);
+  effect(DEF dst, USE src);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{ "VNEG.S8 $dst.D,$src.D\t! neg packed8B" %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vnegI($dst$$FloatRegister, $src$$FloatRegister,
+              MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vneg16B_reg(vecX dst, vecX src) %{ // byte-wise negate of a 16-byte vector; has no match rule, only an effect()
+  predicate(n->as_Vector()->length_in_bytes() == 16);
+  effect(DEF dst, USE src);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{ "VNEG.S8 $dst.Q,$src.Q\t! neg packed16B" %}
+  ins_encode %{
+    // vecX operands: quad argument is on (format prints .Q)
+    bool quad = true;
+    __ vnegI($dst$$FloatRegister, $src$$FloatRegister,
+              MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// ------------------------------ Shift ---------------------------------------
+
+instruct vslcntD(vecD dst, iRegI cnt) %{ // left-shift count vector (8 bytes): expands to Repl8B_reg_simd(dst, cnt)
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (LShiftCntV cnt));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    Repl8B_reg_simd(dst, cnt);
+  %}
+%}
+
+instruct vslcntX(vecX dst, iRegI cnt) %{ // left-shift count vector (16 bytes): expands to Repl16B_reg(dst, cnt)
+  predicate(n->as_Vector()->length_in_bytes() == 16 && VM_Version::has_simd());
+  match(Set dst (LShiftCntV cnt));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    Repl16B_reg(dst, cnt);
+  %}
+%}
+
+// Low bits of vector "shift" elements are used, so it
+// doesn't matter if we treat it as ints or bytes here.
+instruct vsrcntD(vecD dst, iRegI cnt) %{ // right-shift count vector (8 bytes): vdupI of cnt into dst, then vnegI of dst in place
+  predicate(n->as_Vector()->length_in_bytes() == 8 && VM_Version::has_simd());
+  match(Set dst (RShiftCntV cnt));
+  size(4*2);
+  ins_cost(DEFAULT_COST*2); // FIXME
+
+  format %{ "VDUP.8 $dst.D,$cnt\n\t"
+            "VNEG.S8 $dst.D,$dst.D\t! neg packed8B" %}
+  ins_encode %{
+    bool quad = false; // vecD operand: quad argument is off for both calls
+    __ vdupI($dst$$FloatRegister, $cnt$$Register,
+             MacroAssembler::VELEM_SIZE_8, quad);
+    __ vnegI($dst$$FloatRegister, $dst$$FloatRegister, // negated count: presumably what makes the sign-based VSHL helpers shift right - TODO confirm
+              MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsrcntX(vecX dst, iRegI cnt) %{ // right-shift count vector (16 bytes): vdupI of cnt into dst, then vnegI of dst in place
+  predicate(n->as_Vector()->length_in_bytes() == 16 && VM_Version::has_simd());
+  match(Set dst (RShiftCntV cnt));
+  size(4*2);
+  ins_cost(DEFAULT_COST*2); // FIXME
+  format %{ "VDUP.8 $dst.Q,$cnt\n\t"
+            "VNEG.S8 $dst.Q,$dst.Q\t! neg packed16B" %}
+  ins_encode %{
+    bool quad = true; // vecX operand: quad argument is on for both calls
+    __ vdupI($dst$$FloatRegister, $cnt$$Register,
+             MacroAssembler::VELEM_SIZE_8, quad);
+    __ vnegI($dst$$FloatRegister, $dst$$FloatRegister, // negated count: presumably what makes the sign-based VSHL helpers shift right - TODO confirm
+              MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Byte vector logical left/right shift based on sign
+instruct vsh8B_reg(vecD dst, vecD src, vecD shift) %{ // 8 x byte VSHL.U8 by a shift vector; no match rule, used from vsl8B_reg's expand
+  predicate(n->as_Vector()->length() == 8);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.U8 $dst.D,$src.D,$shift.D\t! logical left/right shift packed8B"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshlUI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsh16B_reg(vecX dst, vecX src, vecX shift) %{ // 16 x byte VSHL.U8 by a shift vector; no match rule, used from vsl16B_reg's expand
+  predicate(n->as_Vector()->length() == 16);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.U8 $dst.Q,$src.Q,$shift.Q\t! logical left/right shift packed16B"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshlUI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Shorts/Char vector logical left/right shift based on sign
+instruct vsh4S_reg(vecD dst, vecD src, vecD shift) %{ // 4 x short VSHL.U16 by a shift vector; no match rule, used from vsl4S_reg's expand
+  predicate(n->as_Vector()->length() == 4);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.U16 $dst.D,$src.D,$shift.D\t! logical left/right shift packed4S"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshlUI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsh8S_reg(vecX dst, vecX src, vecX shift) %{ // 8 x short VSHL.U16 by a shift vector; no match rule, used from vsl8S_reg's expand
+  predicate(n->as_Vector()->length() == 8);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.U16 $dst.Q,$src.Q,$shift.Q\t! logical left/right shift packed8S"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshlUI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Integers vector logical left/right shift based on sign
+instruct vsh2I_reg(vecD dst, vecD src, vecD shift) %{ // 2 x int VSHL.U32 by a shift vector; no match rule, used from vsl2I_reg's expand
+  predicate(n->as_Vector()->length() == 2);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.U32 $dst.D,$src.D,$shift.D\t! logical left/right shift packed2I"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshlUI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsh4I_reg(vecX dst, vecX src, vecX shift) %{ // 4 x int VSHL.U32 by a shift vector; no match rule, used from vsl4I_reg's expand
+  predicate(n->as_Vector()->length() == 4);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.U32 $dst.Q,$src.Q,$shift.Q\t! logical left/right shift packed4I"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshlUI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Longs vector logical left/right shift based on sign
+instruct vsh2L_reg(vecX dst, vecX src, vecX shift) %{ // 2 x long VSHL.U64 by a shift vector; no match rule, used from vsl2L_reg's expand
+  predicate(n->as_Vector()->length() == 2);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.U64 $dst.Q,$src.Q,$shift.Q\t! logical left/right shift packed2L"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshlUI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_64, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// ------------------------------ LeftShift -----------------------------------
+
+// Byte vector left shift
+instruct vsl8B_reg(vecD dst, vecD src, vecD shift) %{ // 8 x byte left shift by a shift vector: matches LShiftVB and expands to vsh8B_reg
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVB src shift));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+  expand %{
+    vsh8B_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsl16B_reg(vecX dst, vecX src, vecX shift) %{ // 16 x byte left shift by a shift vector: matches LShiftVB and expands to vsh16B_reg
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVB src shift));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+  expand %{
+    vsh16B_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsl8B_immI(vecD dst, vecD src, immI shift) %{ // 8 x byte left shift by an immediate count (LShiftVB)
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVB src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.I8 $dst.D,$src.D,$shift\t! logical left shift packed8B"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshli($dst$$FloatRegister, $src$$FloatRegister, 8, $shift$$constant, // 8: presumably the lane width in bits (format: VSHL.I8) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsl16B_immI(vecX dst, vecX src, immI shift) %{ // 16 x byte left shift by an immediate count (LShiftVB)
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (LShiftVB src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.I8 $dst.Q,$src.Q,$shift\t! logical left shift packed16B"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshli($dst$$FloatRegister, $src$$FloatRegister, 8, $shift$$constant, // 8: presumably the lane width in bits (format: VSHL.I8) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Shorts/Chars vector logical left/right shift
+instruct vsl4S_reg(vecD dst, vecD src, vecD shift) %{ // 4 x short shift by a shift vector: matches both LShiftVS and URShiftVS, expands to vsh4S_reg
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  match(Set dst (URShiftVS src shift));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+  expand %{
+    vsh4S_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsl8S_reg(vecX dst, vecX src, vecX shift) %{ // 8 x short shift by a shift vector: matches both LShiftVS and URShiftVS, expands to vsh8S_reg
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS src shift));
+  match(Set dst (URShiftVS src shift));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+  expand %{
+    vsh8S_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsl4S_immI(vecD dst, vecD src, immI shift) %{ // 4 x short left shift by an immediate count (LShiftVS)
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (LShiftVS src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.I16 $dst.D,$src.D,$shift\t! logical left shift packed4S"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshli($dst$$FloatRegister, $src$$FloatRegister, 16, $shift$$constant, // 16: presumably the lane width in bits (format: VSHL.I16) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsl8S_immI(vecX dst, vecX src, immI shift) %{ // 8 x short left shift by an immediate count (LShiftVS)
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (LShiftVS src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.I16 $dst.Q,$src.Q,$shift\t! logical left shift packed8S"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshli($dst$$FloatRegister, $src$$FloatRegister, 16, $shift$$constant, // 16: presumably the lane width in bits (format: VSHL.I16) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Integers vector logical left/right shift
+instruct vsl2I_reg(vecD dst, vecD src, vecD shift) %{ // 2 x int shift by a shift vector: matches both LShiftVI and URShiftVI, expands to vsh2I_reg
+  predicate(n->as_Vector()->length() == 2 && VM_Version::has_simd());
+  match(Set dst (LShiftVI src shift));
+  match(Set dst (URShiftVI src shift));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+  expand %{
+    vsh2I_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsl4I_reg(vecX dst, vecX src, vecX shift) %{ // 4 x int shift by a shift vector: matches both LShiftVI and URShiftVI, expands to vsh4I_reg
+  predicate(n->as_Vector()->length() == 4 && VM_Version::has_simd());
+  match(Set dst (LShiftVI src shift));
+  match(Set dst (URShiftVI src shift));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+  expand %{
+    vsh4I_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsl2I_immI(vecD dst, vecD src, immI shift) %{ // 2 x int left shift by an immediate count (LShiftVI); requires has_simd()
+  predicate(n->as_Vector()->length() == 2 && VM_Version::has_simd());
+  match(Set dst (LShiftVI src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.I32 $dst.D,$src.D,$shift\t! logical left shift packed2I"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshli($dst$$FloatRegister, $src$$FloatRegister, 32, $shift$$constant, // 32: presumably the lane width in bits (format: VSHL.I32) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsl4I_immI(vecX dst, vecX src, immI shift) %{ // 4 x int left shift by an immediate count (LShiftVI); requires has_simd()
+  predicate(n->as_Vector()->length() == 4 && VM_Version::has_simd());
+  match(Set dst (LShiftVI src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.I32 $dst.Q,$src.Q,$shift\t! logical left shift packed4I"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshli($dst$$FloatRegister, $src$$FloatRegister, 32, $shift$$constant, // 32: presumably the lane width in bits (format: VSHL.I32) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Longs vector logical left/right shift
+instruct vsl2L_reg(vecX dst, vecX src, vecX shift) %{ // 2 x long shift by a shift vector: matches both LShiftVL and URShiftVL, expands to vsh2L_reg
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL src shift));
+  match(Set dst (URShiftVL src shift));
+  size(4*1);
+  ins_cost(DEFAULT_COST*1); // FIXME
+  expand %{
+    vsh2L_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsl2L_immI(vecX dst, vecX src, immI shift) %{ // 2 x long left shift by an immediate count (LShiftVL)
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (LShiftVL src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.I64 $dst.Q,$src.Q,$shift\t! logical left shift packed2L"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshli($dst$$FloatRegister, $src$$FloatRegister, 64, $shift$$constant, // 64: presumably the lane width in bits (format: VSHL.I64) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// ----------------------- LogicalRightShift -----------------------------------
+
+// Bytes/Shorts vector logical right shift produces an incorrect Java result
+// for negative data because Java code converts the short value into an int
+// with sign extension before the shift.
+
+// Chars vector logical right shift
+instruct vsrl4S_immI(vecD dst, vecD src, immI shift) %{ // 4 x 16-bit logical right shift by an immediate count (URShiftVS)
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (URShiftVS src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.U16 $dst.D,$src.D,$shift\t! logical right shift packed4S"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshrUI($dst$$FloatRegister, $src$$FloatRegister, 16, $shift$$constant, // 16: presumably the lane width in bits (format: VSHR.U16) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsrl8S_immI(vecX dst, vecX src, immI shift) %{ // 8 x 16-bit logical right shift by an immediate count (URShiftVS)
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (URShiftVS src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.U16 $dst.Q,$src.Q,$shift\t! logical right shift packed8S"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshrUI($dst$$FloatRegister, $src$$FloatRegister, 16, $shift$$constant, // 16: presumably the lane width in bits (format: VSHR.U16) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Integers vector logical right shift
+instruct vsrl2I_immI(vecD dst, vecD src, immI shift) %{ // 2 x int logical right shift by an immediate count (URShiftVI); requires has_simd()
+  predicate(n->as_Vector()->length() == 2 && VM_Version::has_simd());
+  match(Set dst (URShiftVI src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.U32 $dst.D,$src.D,$shift\t! logical right shift packed2I"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshrUI($dst$$FloatRegister, $src$$FloatRegister, 32, $shift$$constant, // 32: presumably the lane width in bits (format: VSHR.U32) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsrl4I_immI(vecX dst, vecX src, immI shift) %{ // 4 x int logical right shift by an immediate count (URShiftVI); requires has_simd()
+  predicate(n->as_Vector()->length() == 4 && VM_Version::has_simd());
+  match(Set dst (URShiftVI src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.U32 $dst.Q,$src.Q,$shift\t! logical right shift packed4I"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshrUI($dst$$FloatRegister, $src$$FloatRegister, 32, $shift$$constant, // 32: presumably the lane width in bits (format: VSHR.U32) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Longs vector logical right shift
+instruct vsrl2L_immI(vecX dst, vecX src, immI shift) %{ // 2 x long logical right shift by an immediate count (URShiftVL)
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (URShiftVL src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.U64 $dst.Q,$src.Q,$shift\t! logical right shift packed2L"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshrUI($dst$$FloatRegister, $src$$FloatRegister, 64, $shift$$constant, // 64: presumably the lane width in bits (format: VSHR.U64) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// ------------------- ArithmeticRightShift -----------------------------------
+
+// Bytes vector arithmetic left/right shift based on sign
+instruct vsha8B_reg(vecD dst, vecD src, vecD shift) %{ // 8 x byte signed VSHL.S8 by a shift vector; no match rule, used from vsra8B_reg's expand
+  predicate(n->as_Vector()->length() == 8);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.S8 $dst.D,$src.D,$shift.D\t! arithmetic right shift packed8B"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshlSI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsha16B_reg(vecX dst, vecX src, vecX shift) %{ // 16 x byte signed VSHL.S8 by a shift vector; no match rule, used from vsrl16B_reg's expand
+  predicate(n->as_Vector()->length() == 16);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.S8 $dst.Q,$src.Q,$shift.Q\t! arithmetic right shift packed16B"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshlSI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_8, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Shorts vector arithmetic left/right shift based on sign
+instruct vsha4S_reg(vecD dst, vecD src, vecD shift) %{ // 4 x short signed VSHL.S16 by a shift vector; no match rule, used from vsra4S_reg's expand
+  predicate(n->as_Vector()->length() == 4);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.S16 $dst.D,$src.D,$shift.D\t! arithmetic right shift packed4S"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshlSI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsha8S_reg(vecX dst, vecX src, vecX shift) %{ // 8 x short signed VSHL.S16 by a shift vector; no match rule, used from vsra8S_reg's expand
+  predicate(n->as_Vector()->length() == 8);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.S16 $dst.Q,$src.Q,$shift.Q\t! arithmetic right shift packed8S"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshlSI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_16, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Integers vector arithmetic left/right shift based on sign
+instruct vsha2I_reg(vecD dst, vecD src, vecD shift) %{ // 2 x int signed VSHL.S32 by a shift vector; no match rule, only an effect()
+  predicate(n->as_Vector()->length() == 2);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.S32 $dst.D,$src.D,$shift.D\t! arithmetic right shift packed2I"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshlSI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsha4I_reg(vecX dst, vecX src, vecX shift) %{ // 4 x int signed VSHL.S32 by a shift vector; no match rule, only an effect()
+  predicate(n->as_Vector()->length() == 4);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.S32 $dst.Q,$src.Q,$shift.Q\t! arithmetic right shift packed4I"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshlSI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_32, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Longs vector arithmetic left/right shift based on sign
+instruct vsha2L_reg(vecX dst, vecX src, vecX shift) %{ // 2 x long signed VSHL.S64 by a shift vector; no match rule, only an effect()
+  predicate(n->as_Vector()->length() == 2);
+  effect(DEF dst, USE src, USE shift);
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHL.S64 $dst.Q,$src.Q,$shift.Q\t! arithmetic right shift packed2L"
+  %}
+  ins_encode %{
+    bool quad = true; // vecX operands: quad argument is on
+    __ vshlSI($dst$$FloatRegister, $shift$$FloatRegister, $src$$FloatRegister, // note: shift is passed before src, unlike the format string order
+              MacroAssembler::VELEM_SIZE_64, quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Byte vector arithmetic right shift
+
+instruct vsra8B_reg(vecD dst, vecD src, vecD shift) %{ // 8 x byte right shift by a shift vector: matches RShiftVB and expands to vsha8B_reg
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVB src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    vsha8B_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsrl16B_reg(vecX dst, vecX src, vecX shift) %{ // 16 x byte right shift: matches RShiftVB, expands to vsha16B_reg. NOTE(review): name says "srl" but the 8B sibling is vsra8B_reg
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (RShiftVB src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    vsha16B_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsrl8B_immI(vecD dst, vecD src, immI shift) %{ // 8 x byte arithmetic right shift by an immediate count (RShiftVB); the "srl" in the name is historical
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (RShiftVB src shift));
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.S8 $dst.D,$src.D,$shift\t! arithmetic right shift packed8B"
+  %}
+  ins_encode %{
+    bool quad = false; // vecD operands: quad argument is off
+    __ vshrSI($dst$$FloatRegister, $src$$FloatRegister, 8, $shift$$constant, // 8: presumably the lane width in bits (format: VSHR.S8) - TODO confirm
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsrl16B_immI(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16); // 16-element vectors only
+  match(Set dst (RShiftVB src shift)); // NOTE(review): named "vsrl" but this is the arithmetic (signed) shift rule
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.S8 $dst.Q,$src.Q,$shift\t! arithmetic right shift packed16B"
+  %}
+  ins_encode %{
+    bool quad = true; // Q-register form
+    __ vshrSI($dst$$FloatRegister, $src$$FloatRegister, 8, $shift$$constant,
+             quad); // 8: presumably the element size in bits (matches .S8) -- confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Shorts vector arithmetic right shift
+instruct vsra4S_reg(vecD dst, vecD src, vecD shift) %{
+  predicate(n->as_Vector()->length() == 4); // 4-element vectors only
+  match(Set dst (RShiftVS src shift)); // short-vector right shift with the count in a vector register
+  size(4); // no encoding of its own: the rule expands to vsha4S_reg
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    vsha4S_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsra8S_reg(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 8); // 8-element vectors only
+  match(Set dst (RShiftVS src shift)); // short-vector right shift with the count in a vector register
+  size(4); // no encoding of its own: the rule expands to vsha8S_reg
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    vsha8S_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsra4S_immI(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4); // 4-element vectors only
+  match(Set dst (RShiftVS src shift)); // arithmetic right shift of shorts by an immediate count
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.S16 $dst.D,$src.D,$shift\t! arithmetic right shift packed4S"
+  %}
+  ins_encode %{
+    bool quad = false; // D-register form
+    __ vshrSI($dst$$FloatRegister, $src$$FloatRegister, 16, $shift$$constant,
+             quad); // 16: presumably the element size in bits (matches .S16) -- confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsra8S_immI(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8); // 8-element vectors only
+  match(Set dst (RShiftVS src shift)); // arithmetic right shift of shorts by an immediate count
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.S16 $dst.Q,$src.Q,$shift\t! arithmetic right shift packed8S"
+  %}
+  ins_encode %{
+    bool quad = true; // Q-register form
+    __ vshrSI($dst$$FloatRegister, $src$$FloatRegister, 16, $shift$$constant,
+             quad); // 16: presumably the element size in bits (matches .S16) -- confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Integers vector arithmetic right shift
+instruct vsra2I_reg(vecD dst, vecD src, vecD shift) %{
+  predicate(n->as_Vector()->length() == 2); // 2-element vectors only
+  match(Set dst (RShiftVI src shift)); // int-vector right shift with the count in a vector register
+  size(4); // no encoding of its own: the rule expands to vsha2I_reg
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    vsha2I_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsra4I_reg(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 4); // 4-element vectors only
+  match(Set dst (RShiftVI src shift)); // int-vector right shift with the count in a vector register
+  size(4); // no encoding of its own: the rule expands to vsha4I_reg
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    vsha4I_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsra2I_immI(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2); // 2-element vectors only
+  match(Set dst (RShiftVI src shift)); // arithmetic right shift of ints by an immediate count
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.S32 $dst.D,$src.D,$shift\t! arithmetic right shift packed2I"
+  %}
+  ins_encode %{
+    bool quad = false; // D-register form
+    __ vshrSI($dst$$FloatRegister, $src$$FloatRegister, 32, $shift$$constant,
+             quad); // 32: presumably the element size in bits (matches .S32) -- confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vsra4I_immI(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4); // 4-element vectors only
+  match(Set dst (RShiftVI src shift)); // arithmetic right shift of ints by an immediate count
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.S32 $dst.Q,$src.Q,$shift\t! arithmetic right shift packed4I"
+  %}
+  ins_encode %{
+    bool quad = true; // Q-register form
+    __ vshrSI($dst$$FloatRegister, $src$$FloatRegister, 32, $shift$$constant,
+             quad); // 32: presumably the element size in bits (matches .S32) -- confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// Longs vector arithmetic right shift
+instruct vsra2L_reg(vecX dst, vecX src, vecX shift) %{
+  predicate(n->as_Vector()->length() == 2); // 2-element vectors only
+  match(Set dst (RShiftVL src shift)); // long-vector right shift with the count in a vector register
+  size(4); // no encoding of its own: the rule expands to vsha2L_reg
+  ins_cost(DEFAULT_COST); // FIXME
+  expand %{
+    vsha2L_reg(dst, src, shift);
+  %}
+%}
+
+instruct vsra2L_immI(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2); // 2-element vectors only
+  match(Set dst (RShiftVL src shift)); // arithmetic right shift of longs by an immediate count
+  size(4);
+  ins_cost(DEFAULT_COST); // FIXME
+  format %{
+    "VSHR.S64 $dst.Q,$src.Q,$shift\t! arithmetic right shift packed2L"
+  %}
+  ins_encode %{
+    bool quad = true; // Q-register form
+    __ vshrSI($dst$$FloatRegister, $src$$FloatRegister, 64, $shift$$constant,
+             quad); // 64: presumably the element size in bits (matches .S64) -- confirm
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// --------------------------------- AND --------------------------------------
+
+instruct vandD(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8); // 8-byte vectors, any element type
+  match(Set dst (AndV src1 src2));
+  format %{ "VAND    $dst.D,$src1.D,$src2.D\t! and vectors (8 bytes)" %}
+  ins_encode %{
+    // D-register form: the trailing argument is the quad flag.
+    __ vandI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             /* quad */ false);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vandX(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16); // 16-byte vectors, any element type
+  match(Set dst (AndV src1 src2));
+  format %{ "VAND    $dst.Q,$src1.Q,$src2.Q\t! and vectors (16 bytes)" %}
+  ins_encode %{
+    // Q-register form: the trailing argument is the quad flag.
+    __ vandI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             /* quad */ true);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// --------------------------------- OR ---------------------------------------
+
+instruct vorD(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8); // 8-byte vectors, any element type
+  match(Set dst (OrV src1 src2)); // bitwise OR of two vectors
+  format %{ "VOR     $dst.D,$src1.D,$src2.D\t! or vectors (8 bytes)" %}
+  ins_encode %{
+    bool quad = false; // D-register form
+    __ vorI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+            quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vorX(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16); // 16-byte vectors, any element type
+  match(Set dst (OrV src1 src2)); // bitwise OR of two vectors
+  format %{ "VOR     $dst.Q,$src1.Q,$src2.Q\t! or vectors (16 bytes)" %}
+  ins_encode %{
+    bool quad = true; // Q-register form
+    __ vorI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+            quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+// --------------------------------- XOR --------------------------------------
+
+instruct vxorD(vecD dst, vecD src1, vecD src2) %{
+  predicate(n->as_Vector()->length_in_bytes() == 8); // 8-byte vectors, any element type
+  match(Set dst (XorV src1 src2)); // bitwise XOR of two vectors
+  format %{ "VXOR    $dst.D,$src1.D,$src2.D\t! xor vectors (8 bytes)" %}
+  ins_encode %{
+    bool quad = false; // D-register form
+    __ vxorI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+instruct vxorX(vecX dst, vecX src1, vecX src2) %{
+  predicate(n->as_Vector()->length_in_bytes() == 16); // 16-byte vectors, any element type
+  match(Set dst (XorV src1 src2)); // bitwise XOR of two vectors
+  format %{ "VXOR    $dst.Q,$src1.Q,$src2.Q\t! xor vectors (16 bytes)" %}
+  ins_encode %{
+    bool quad = true; // Q-register form
+    __ vxorI($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister,
+             quad);
+  %}
+  ins_pipe( ialu_reg_reg ); // FIXME
+%}
+
+
+//----------PEEPHOLE RULES-----------------------------------------------------
+// These must follow all instruction definitions as they use the names
+// defined in the instructions definitions.
+//
+// peepmatch ( root_instr_name [preceding_instruction]* );
+//
+// peepconstraint %{
+// (instruction_number.operand_name relational_op instruction_number.operand_name
+//  [, ...] );
+// // instruction numbers are zero-based using left to right order in peepmatch
+//
+// peepreplace ( instr_name  ( [instruction_number.operand_name]* ) );
+// // provide an instruction_number.operand_name for each operand that appears
+// // in the replacement instruction's match rule
+//
+// ---------VM FLAGS---------------------------------------------------------
+//
+// All peephole optimizations can be turned off using -XX:-OptoPeephole
+//
+// Each peephole rule is given an identifying number starting with zero and
+// increasing by one in the order seen by the parser.  An individual peephole
+// can be enabled, and all others disabled, by using -XX:OptoPeepholeAt=#
+// on the command-line.
+//
+// ---------CURRENT LIMITATIONS----------------------------------------------
+//
+// Only match adjacent instructions in same basic block
+// Only equality constraints
+// Only constraints between operands, not (0.dest_reg == EAX_enc)
+// Only one replacement instruction
+//
+// ---------EXAMPLE----------------------------------------------------------
+//
+// // pertinent parts of existing instructions in architecture description
+// instruct movI(eRegI dst, eRegI src) %{
+//   match(Set dst (CopyI src));
+// %}
+//
+// instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
+//   match(Set dst (AddI dst src));
+//   effect(KILL cr);
+// %}
+//
+// // Change (inc mov) to lea
+// peephole %{
+//   // increment preceded by register-register move
+//   peepmatch ( incI_eReg movI );
+//   // require that the destination register of the increment
+//   // match the destination register of the move
+//   peepconstraint ( 0.dst == 1.dst );
+//   // construct a replacement instruction that sets
+//   // the destination to ( move's source register + one )
+//   peepreplace ( incI_eReg_immI1( 0.dst 1.src 0.src ) );
+// %}
+//
+
+// // Change load of spilled value to only a spill
+// instruct storeI(memory mem, eRegI src) %{
+//   match(Set mem (StoreI mem src));
+// %}
+//
+// instruct loadI(eRegI dst, memory mem) %{
+//   match(Set dst (LoadI mem));
+// %}
+//
+// peephole %{
+//   peepmatch ( loadI storeI );
+//   peepconstraint ( 1.src == 0.dst, 1.mem == 0.mem );
+//   peepreplace ( storeI( 1.mem 1.mem 1.src ) );
+// %}
+
+//----------SMARTSPILL RULES---------------------------------------------------
+// These must follow all instruction definitions as they use the names
+// defined in the instructions definitions.
+//
+// ARM will probably not have any of these rules due to RISC instruction set.
+
+//----------PIPELINE-----------------------------------------------------------
+// Rules which define the behavior of the target architectures pipeline.
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/arm_32.ad	2016-12-02 11:16:58.144993691 -0500
@@ -0,0 +1,586 @@
+//
+// Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This code is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License version 2 only, as
+// published by the Free Software Foundation.
+//
+// This code is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+//
+
+// ARM Architecture Description File
+
+//----------REGISTER DEFINITION BLOCK------------------------------------------
+// This information is used by the matcher and the register allocator to
+// describe individual registers and classes of registers within the target
+// architecture.
+register %{
+//----------Architecture Description Register Definitions----------------------
+// General Registers
+// "reg_def"  name ( register save type, C convention save type,
+//                   ideal register type, encoding, vm name );
+// Register Save Types:
+//
+// NS  = No-Save:       The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method, &
+//                      that they do not need to be saved at call sites.
+//
+// SOC = Save-On-Call:  The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method,
+//                      but that they must be saved at call sites.
+//
+// SOE = Save-On-Entry: The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, but they do not need to be saved at call
+//                      sites.
+//
+// AS  = Always-Save:   The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, & that they must be saved at call sites.
+//
+// Ideal Register Type is used to determine how to save & restore a
+// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
+// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
+//
+// The encoding number is the actual bit-pattern placed into the opcodes.
+
+
+// ----------------------------
+// Integer/Long Registers
+// ----------------------------
+
+reg_def R_R0 (SOC, SOC, Op_RegI,  0,  R(0)->as_VMReg());
+reg_def R_R1 (SOC, SOC, Op_RegI,  1,  R(1)->as_VMReg());
+reg_def R_R2 (SOC, SOC, Op_RegI,  2,  R(2)->as_VMReg());
+reg_def R_R3 (SOC, SOC, Op_RegI,  3,  R(3)->as_VMReg());
+reg_def R_R4 (SOC, SOE, Op_RegI,  4,  R(4)->as_VMReg());
+reg_def R_R5 (SOC, SOE, Op_RegI,  5,  R(5)->as_VMReg());
+reg_def R_R6 (SOC, SOE, Op_RegI,  6,  R(6)->as_VMReg());
+reg_def R_R7 (SOC, SOE, Op_RegI,  7,  R(7)->as_VMReg());
+reg_def R_R8 (SOC, SOE, Op_RegI,  8,  R(8)->as_VMReg());
+reg_def R_R9 (SOC, SOE, Op_RegI,  9,  R(9)->as_VMReg());
+reg_def R_R10(NS,  SOE, Op_RegI, 10, R(10)->as_VMReg());
+reg_def R_R11(NS,  SOE, Op_RegI, 11, R(11)->as_VMReg());
+reg_def R_R12(SOC, SOC, Op_RegI, 12, R(12)->as_VMReg());
+reg_def R_R13(NS,  NS,  Op_RegI, 13, R(13)->as_VMReg());
+reg_def R_R14(SOC, SOC, Op_RegI, 14, R(14)->as_VMReg());
+reg_def R_R15(NS,  NS,  Op_RegI, 15, R(15)->as_VMReg());
+
+// ----------------------------
+// Float/Double Registers
+// ----------------------------
+
+// Float Registers
+
+reg_def R_S0 ( SOC, SOC, Op_RegF,  0, S0->as_VMReg());
+reg_def R_S1 ( SOC, SOC, Op_RegF,  1, S1_reg->as_VMReg());
+reg_def R_S2 ( SOC, SOC, Op_RegF,  2, S2_reg->as_VMReg());
+reg_def R_S3 ( SOC, SOC, Op_RegF,  3, S3_reg->as_VMReg());
+reg_def R_S4 ( SOC, SOC, Op_RegF,  4, S4_reg->as_VMReg());
+reg_def R_S5 ( SOC, SOC, Op_RegF,  5, S5_reg->as_VMReg());
+reg_def R_S6 ( SOC, SOC, Op_RegF,  6, S6_reg->as_VMReg());
+reg_def R_S7 ( SOC, SOC, Op_RegF,  7, S7->as_VMReg());
+reg_def R_S8 ( SOC, SOC, Op_RegF,  8, S8->as_VMReg());
+reg_def R_S9 ( SOC, SOC, Op_RegF,  9, S9->as_VMReg());
+reg_def R_S10( SOC, SOC, Op_RegF, 10,S10->as_VMReg());
+reg_def R_S11( SOC, SOC, Op_RegF, 11,S11->as_VMReg());
+reg_def R_S12( SOC, SOC, Op_RegF, 12,S12->as_VMReg());
+reg_def R_S13( SOC, SOC, Op_RegF, 13,S13->as_VMReg());
+reg_def R_S14( SOC, SOC, Op_RegF, 14,S14->as_VMReg());
+reg_def R_S15( SOC, SOC, Op_RegF, 15,S15->as_VMReg());
+reg_def R_S16( SOC, SOE, Op_RegF, 16,S16->as_VMReg());
+reg_def R_S17( SOC, SOE, Op_RegF, 17,S17->as_VMReg());
+reg_def R_S18( SOC, SOE, Op_RegF, 18,S18->as_VMReg());
+reg_def R_S19( SOC, SOE, Op_RegF, 19,S19->as_VMReg());
+reg_def R_S20( SOC, SOE, Op_RegF, 20,S20->as_VMReg());
+reg_def R_S21( SOC, SOE, Op_RegF, 21,S21->as_VMReg());
+reg_def R_S22( SOC, SOE, Op_RegF, 22,S22->as_VMReg());
+reg_def R_S23( SOC, SOE, Op_RegF, 23,S23->as_VMReg());
+reg_def R_S24( SOC, SOE, Op_RegF, 24,S24->as_VMReg());
+reg_def R_S25( SOC, SOE, Op_RegF, 25,S25->as_VMReg());
+reg_def R_S26( SOC, SOE, Op_RegF, 26,S26->as_VMReg());
+reg_def R_S27( SOC, SOE, Op_RegF, 27,S27->as_VMReg());
+reg_def R_S28( SOC, SOE, Op_RegF, 28,S28->as_VMReg());
+reg_def R_S29( SOC, SOE, Op_RegF, 29,S29->as_VMReg());
+reg_def R_S30( SOC, SOE, Op_RegF, 30,S30->as_VMReg());
+reg_def R_S31( SOC, SOE, Op_RegF, 31,S31->as_VMReg());
+
+// Double Registers
+// The rules of ADL require that double registers be defined in pairs.
+// Each pair must be two 32-bit values, but not necessarily a pair of
+// single float registers.  In each pair, ADLC-assigned register numbers
+// must be adjacent, with the lower number even.  Finally, when the
+// CPU stores such a register pair to memory, the word associated with
+// the lower ADLC-assigned number must be stored to the lower address.
+
+reg_def R_D16 (SOC, SOC, Op_RegD, 32, D16->as_VMReg());
+reg_def R_D16x(SOC, SOC, Op_RegD,255, D16->as_VMReg()->next());
+reg_def R_D17 (SOC, SOC, Op_RegD, 34, D17->as_VMReg());
+reg_def R_D17x(SOC, SOC, Op_RegD,255, D17->as_VMReg()->next());
+reg_def R_D18 (SOC, SOC, Op_RegD, 36, D18->as_VMReg());
+reg_def R_D18x(SOC, SOC, Op_RegD,255, D18->as_VMReg()->next());
+reg_def R_D19 (SOC, SOC, Op_RegD, 38, D19->as_VMReg());
+reg_def R_D19x(SOC, SOC, Op_RegD,255, D19->as_VMReg()->next());
+reg_def R_D20 (SOC, SOC, Op_RegD, 40, D20->as_VMReg());
+reg_def R_D20x(SOC, SOC, Op_RegD,255, D20->as_VMReg()->next());
+reg_def R_D21 (SOC, SOC, Op_RegD, 42, D21->as_VMReg());
+reg_def R_D21x(SOC, SOC, Op_RegD,255, D21->as_VMReg()->next());
+reg_def R_D22 (SOC, SOC, Op_RegD, 44, D22->as_VMReg());
+reg_def R_D22x(SOC, SOC, Op_RegD,255, D22->as_VMReg()->next());
+reg_def R_D23 (SOC, SOC, Op_RegD, 46, D23->as_VMReg());
+reg_def R_D23x(SOC, SOC, Op_RegD,255, D23->as_VMReg()->next());
+reg_def R_D24 (SOC, SOC, Op_RegD, 48, D24->as_VMReg());
+reg_def R_D24x(SOC, SOC, Op_RegD,255, D24->as_VMReg()->next());
+reg_def R_D25 (SOC, SOC, Op_RegD, 50, D25->as_VMReg());
+reg_def R_D25x(SOC, SOC, Op_RegD,255, D25->as_VMReg()->next());
+reg_def R_D26 (SOC, SOC, Op_RegD, 52, D26->as_VMReg());
+reg_def R_D26x(SOC, SOC, Op_RegD,255, D26->as_VMReg()->next());
+reg_def R_D27 (SOC, SOC, Op_RegD, 54, D27->as_VMReg());
+reg_def R_D27x(SOC, SOC, Op_RegD,255, D27->as_VMReg()->next());
+reg_def R_D28 (SOC, SOC, Op_RegD, 56, D28->as_VMReg());
+reg_def R_D28x(SOC, SOC, Op_RegD,255, D28->as_VMReg()->next());
+reg_def R_D29 (SOC, SOC, Op_RegD, 58, D29->as_VMReg());
+reg_def R_D29x(SOC, SOC, Op_RegD,255, D29->as_VMReg()->next());
+reg_def R_D30 (SOC, SOC, Op_RegD, 60, D30->as_VMReg());
+reg_def R_D30x(SOC, SOC, Op_RegD,255, D30->as_VMReg()->next());
+reg_def R_D31 (SOC, SOC, Op_RegD, 62, D31->as_VMReg());
+reg_def R_D31x(SOC, SOC, Op_RegD,255, D31->as_VMReg()->next());
+
+// ----------------------------
+// Special Registers
+// Condition Codes Flag Registers
+reg_def APSR (SOC, SOC,  Op_RegFlags, 0, VMRegImpl::Bad());
+reg_def FPSCR(SOC, SOC,  Op_RegFlags, 0, VMRegImpl::Bad());
+
+// ----------------------------
+// Specify the enum values for the registers.  These enums are only used by the
+// OptoReg "class". We can convert these enum values at will to VMReg when needed
+// for visibility to the rest of the vm. The order of this enum influences the
+// register allocator so having the freedom to set this order and not be stuck
+// with the order that is natural for the rest of the vm is worth it.
+
+// registers in that order so that R11/R12 is an aligned pair that can be used for longs
+alloc_class chunk0(
+                   R_R4, R_R5, R_R6, R_R7, R_R8, R_R9, R_R11, R_R12, R_R10, R_R13, R_R14, R_R15, R_R0, R_R1, R_R2, R_R3);
+
+// Note that a register is not allocatable unless it is also mentioned
+// in a widely-used reg_class below.
+
+alloc_class chunk1(
+                   R_S16, R_S17, R_S18, R_S19, R_S20, R_S21, R_S22, R_S23,
+                   R_S24, R_S25, R_S26, R_S27, R_S28, R_S29, R_S30, R_S31,
+                   R_S0,  R_S1,  R_S2,  R_S3,  R_S4,  R_S5,  R_S6,  R_S7, 
+                   R_S8,  R_S9,  R_S10, R_S11, R_S12, R_S13, R_S14, R_S15,
+                   R_D16, R_D16x,R_D17, R_D17x,R_D18, R_D18x,R_D19, R_D19x, 
+                   R_D20, R_D20x,R_D21, R_D21x,R_D22, R_D22x,R_D23, R_D23x, 
+                   R_D24, R_D24x,R_D25, R_D25x,R_D26, R_D26x,R_D27, R_D27x, 
+                   R_D28, R_D28x,R_D29, R_D29x,R_D30, R_D30x,R_D31, R_D31x
+);
+
+alloc_class chunk2(APSR, FPSCR);
+
+//----------Architecture Description Register Classes--------------------------
+// Several register classes are automatically defined based upon information in
+// this architecture description.
+// 1) reg_class inline_cache_reg           ( as defined in frame section )
+// 2) reg_class interpreter_method_oop_reg ( as defined in frame section )
+// 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ )
+//
+
+// ----------------------------
+// Integer Register Classes
+// ----------------------------
+// Exclusions from int_reg:
+// SP (R13), PC (R15)
+// R10: reserved by HotSpot to the TLS register (invariant within Java)
+reg_class int_reg(R_R0, R_R1, R_R2, R_R3, R_R4, R_R5, R_R6, R_R7, R_R8, R_R9, R_R11, R_R12, R_R14);
+
+reg_class R0_regI(R_R0);
+reg_class R1_regI(R_R1);
+reg_class R2_regI(R_R2);
+reg_class R3_regI(R_R3);
+reg_class R12_regI(R_R12);
+
+// ----------------------------
+// Pointer Register Classes
+// ----------------------------
+reg_class ptr_reg(R_R0, R_R1, R_R2, R_R3, R_R4, R_R5, R_R6, R_R7, R_R8, R_R9, R_R11, R_R12, R_R14);
+// Special class for storeP instructions, which can store SP or RPC to TLS.
+// It is also used for memory addressing, allowing direct TLS addressing.
+reg_class sp_ptr_reg(R_R0, R_R1, R_R2, R_R3, R_R4, R_R5, R_R6, R_R7, R_R8, R_R9, R_R11, R_R12, R_R14, R_R10 /* TLS*/, R_R13 /* SP*/);
+
+#define R_Ricklass R_R8
+#define R_Rmethod  R_R9
+#define R_Rthread  R_R10
+#define R_Rexception_obj R_R4
+
+// Other special pointer regs
+reg_class R0_regP(R_R0);
+reg_class R1_regP(R_R1);
+reg_class R2_regP(R_R2);
+reg_class R4_regP(R_R4);
+reg_class Rexception_regP(R_Rexception_obj);
+reg_class Ricklass_regP(R_Ricklass);
+reg_class Rmethod_regP(R_Rmethod);
+reg_class Rthread_regP(R_Rthread);
+reg_class IP_regP(R_R12);
+reg_class LR_regP(R_R14);
+
+reg_class FP_regP(R_R11);
+
+// ----------------------------
+// Long Register Classes
+// ----------------------------
+reg_class long_reg (             R_R0,R_R1, R_R2,R_R3, R_R4,R_R5, R_R6,R_R7, R_R8,R_R9, R_R11,R_R12);
+// for ldrexd, strexd: first reg of pair must be even
+reg_class long_reg_align (       R_R0,R_R1, R_R2,R_R3, R_R4,R_R5, R_R6,R_R7, R_R8,R_R9);
+
+reg_class R0R1_regL(R_R0,R_R1);
+reg_class R2R3_regL(R_R2,R_R3);
+
+// ----------------------------
+// Special Class for Condition Code Flags Register
+reg_class int_flags(APSR);
+reg_class float_flags(FPSCR);
+
+
+// ----------------------------
+// Float Point Register Classes
+// ----------------------------
+// Skip S14/S15, they are reserved for mem-mem copies
+reg_class sflt_reg(R_S0, R_S1, R_S2, R_S3, R_S4, R_S5, R_S6, R_S7, R_S8, R_S9, R_S10, R_S11, R_S12, R_S13,
+                   R_S16, R_S17, R_S18, R_S19, R_S20, R_S21, R_S22, R_S23, R_S24, R_S25, R_S26, R_S27, R_S28, R_S29, R_S30, R_S31);
+
+// Paired floating point registers--they show up in the same order as the floats,
+// but they are used with the "Op_RegD" type, and always occur in even/odd pairs.
+reg_class dflt_reg(R_S0,R_S1, R_S2,R_S3, R_S4,R_S5, R_S6,R_S7, R_S8,R_S9, R_S10,R_S11, R_S12,R_S13,
+                   R_S16,R_S17, R_S18,R_S19, R_S20,R_S21, R_S22,R_S23, R_S24,R_S25, R_S26,R_S27, R_S28,R_S29, R_S30,R_S31,
+                   R_D16,R_D16x, R_D17,R_D17x, R_D18,R_D18x, R_D19,R_D19x, R_D20,R_D20x, R_D21,R_D21x, R_D22,R_D22x,
+                   R_D23,R_D23x, R_D24,R_D24x, R_D25,R_D25x, R_D26,R_D26x, R_D27,R_D27x, R_D28,R_D28x, R_D29,R_D29x,
+                   R_D30,R_D30x, R_D31,R_D31x);
+
+reg_class dflt_low_reg(R_S0,R_S1, R_S2,R_S3, R_S4,R_S5, R_S6,R_S7, R_S8,R_S9, R_S10,R_S11, R_S12,R_S13,
+                       R_S16,R_S17, R_S18,R_S19, R_S20,R_S21, R_S22,R_S23, R_S24,R_S25, R_S26,R_S27, R_S28,R_S29, R_S30,R_S31);
+
+
+reg_class actual_dflt_reg %{
+  // Dynamic register class: the mask is picked when the code runs.
+  // dflt_reg adds D16-D31 to the registers listed in dflt_low_reg.
+  // NOTE(review): has_vfp3_32() presumably reports a VFP with 32 D registers -- confirm.
+  const bool wide = VM_Version::has_vfp3_32();
+  return wide ? DFLT_REG_mask() : DFLT_LOW_REG_mask();
+%}
+
+reg_class S0_regF(R_S0);
+reg_class D0_regD(R_S0,R_S1);
+reg_class D1_regD(R_S2,R_S3);
+reg_class D2_regD(R_S4,R_S5);
+reg_class D3_regD(R_S6,R_S7);
+reg_class D4_regD(R_S8,R_S9);
+reg_class D5_regD(R_S10,R_S11);
+reg_class D6_regD(R_S12,R_S13);
+reg_class D7_regD(R_S14,R_S15);
+
+reg_class D16_regD(R_D16,R_D16x);
+reg_class D17_regD(R_D17,R_D17x);
+reg_class D18_regD(R_D18,R_D18x);
+reg_class D19_regD(R_D19,R_D19x);
+reg_class D20_regD(R_D20,R_D20x);
+reg_class D21_regD(R_D21,R_D21x);
+reg_class D22_regD(R_D22,R_D22x);
+reg_class D23_regD(R_D23,R_D23x);
+reg_class D24_regD(R_D24,R_D24x);
+reg_class D25_regD(R_D25,R_D25x);
+reg_class D26_regD(R_D26,R_D26x);
+reg_class D27_regD(R_D27,R_D27x);
+reg_class D28_regD(R_D28,R_D28x);
+reg_class D29_regD(R_D29,R_D29x);
+reg_class D30_regD(R_D30,R_D30x);
+reg_class D31_regD(R_D31,R_D31x);
+
+reg_class vectorx_reg(R_S0,R_S1,R_S2,R_S3, R_S4,R_S5,R_S6,R_S7,
+                      R_S8,R_S9,R_S10,R_S11, /* skip S12-S15: the group of four holding reserved S14/S15 */
+                      R_S16,R_S17,R_S18,R_S19, R_S20,R_S21,R_S22,R_S23,
+                      R_S24,R_S25,R_S26,R_S27, R_S28,R_S29,R_S30,R_S31,
+                      R_D16,R_D16x,R_D17,R_D17x, R_D18,R_D18x,R_D19,R_D19x,
+                      R_D20,R_D20x,R_D21,R_D21x, R_D22,R_D22x,R_D23,R_D23x,
+                      R_D24,R_D24x,R_D25,R_D25x, R_D26,R_D26x,R_D27,R_D27x,
+                      R_D28,R_D28x,R_D29,R_D29x, R_D30,R_D30x,R_D31,R_D31x);
+
+%}
+
+source_hpp %{
+// FIXME
+const MachRegisterNumbers R_mem_copy_lo_num = R_S14_num;
+const MachRegisterNumbers R_mem_copy_hi_num = R_S15_num;
+const FloatRegister Rmemcopy = S14;
+const MachRegisterNumbers R_hf_ret_lo_num = R_S0_num;
+const MachRegisterNumbers R_hf_ret_hi_num = R_S1_num;
+
+const MachRegisterNumbers R_Ricklass_num = R_R8_num;
+const MachRegisterNumbers R_Rmethod_num  = R_R9_num;
+
+#define LDR_DOUBLE "FLDD"
+#define LDR_FLOAT  "FLDS"
+#define STR_DOUBLE "FSTD"
+#define STR_FLOAT  "FSTS"
+#define LDR_64     "LDRD"
+#define STR_64     "STRD"
+#define LDR_32     "LDR"
+#define STR_32     "STR"
+#define MOV_DOUBLE "FCPYD"
+#define MOV_FLOAT  "FCPYS"
+#define FMSR       "FMSR"
+#define FMRS       "FMRS"
+#define LDREX      "ldrex "
+#define STREX      "strex "
+
+#define str_64     strd
+#define ldr_64     ldrd
+#define ldr_32     ldr
+#define ldrex      ldrex
+#define strex      strex
+
+static inline bool is_memoryD(int offset) {
+  return -1024 < offset && offset < 1024;  // open interval (-1024, 1024)
+}
+
+static inline bool is_memoryfp(int offset) {
+  return -1024 < offset && offset < 1024;  // open interval (-1024, 1024)
+}
+
+static inline bool is_memoryI(int offset) {
+  return -4096 < offset && offset < 4096;  // open interval (-4096, 4096)
+}
+
+static inline bool is_memoryP(int offset) {
+  return -4096 < offset && offset < 4096;  // open interval (-4096, 4096)
+}
+
+static inline bool is_memoryHD(int offset) {
+  return -256 < offset && offset < 256;  // open interval (-256, 256)
+}
+
+static inline bool is_aimm(int imm) {
+  return AsmOperand::is_rotated_imm(imm); // same check as is_limmI
+}
+
+static inline bool is_limmI(jint imm) {
+  return AsmOperand::is_rotated_imm(imm); // thin wrapper; also used by is_limmI_low and limmI_low
+}
+
+static inline bool is_limmI_low(jint imm, int n) {
+  const int low_bits = imm & right_n_bits(n);  // imm masked with right_n_bits(n)
+  return is_limmI(low_bits) ? true : is_limmI(imm);
+}
+
+static inline int limmI_low(jint imm, int n) {
+  const int low_bits = imm & right_n_bits(n);  // imm masked with right_n_bits(n)
+  return is_limmI(low_bits) ? low_bits : imm;  // prefer the masked value when it passes is_limmI
+}
+
+%}
+
+source %{
+
+// Given a register encoding, produce a Integer Register object
+static Register reg_to_register_object(int register_encoding) {
+  assert(R0->encoding() == R_R0_enc && R15->encoding() == R_R15_enc, "right coding"); // ADL and assembler encodings must agree
+  return as_Register(register_encoding);
+}
+
+// Given a register encoding, produce a single-precision Float Register object
+static FloatRegister reg_to_FloatRegister_object(int register_encoding) {
+  assert(S0->encoding() == R_S0_enc && S31->encoding() == R_S31_enc, "right coding"); // ADL and assembler encodings must agree
+  return as_FloatRegister(register_encoding);
+}
+
+void Compile::pd_compiler2_init() {
+  // Unimplemented: intentionally empty in this file
+}
+
+// Location of compiled Java return values.  Same as C
+OptoRegPair c2::return_value(int ideal_reg) {
+  assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" ); // the tables below are indexed by ideal_reg
+#ifndef __ABI_HARD__
+  static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, R_R0_num,     R_R0_num,     R_R0_num,     R_R0_num, R_R0_num }; // low half: R0 for every valid entry
+  static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, R_R1_num, R_R1_num }; // high half: R1 for the last two entries
+#else
+  static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, R_R0_num,     R_R0_num,     R_hf_ret_lo_num,  R_hf_ret_lo_num, R_R0_num }; // two entries use R_hf_ret_lo_num (R_S0_num) instead of R0
+  static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad,     R_hf_ret_hi_num, R_R1_num }; // one entry uses R_hf_ret_hi_num (R_S1_num) instead of R1
+#endif
+  return OptoRegPair( hi[ideal_reg], lo[ideal_reg]); // arguments are passed as (hi, lo)
+}
+
+// !!!!! Special hack to get all type of calls to specify the byte offset
+//       from the start of the call to the point where the return address
+//       will point.
+
+int MachCallStaticJavaNode::ret_addr_offset() {
+  const bool is_far = (_method != NULL) ? !cache_reachable() : maybe_far_call(this);
+  const int call_words = is_far ? 3 : 1;  // a far call counts 3 instructions, a near call 1
+  return (call_words + (_method_handle_invoke ? 1 : 0)) * NativeInstruction::instruction_size;
+}
+
+int MachCallDynamicJavaNode::ret_addr_offset() {
+  // mov_oop is always 2 words; the call adds 1 word when the cache is reachable, else 3
+  const int call_words = cache_reachable() ? 1 : 3;
+  return (2 + call_words) * NativeInstruction::instruction_size;
+}
+
+int MachCallRuntimeNode::ret_addr_offset() {
+  // bl (1 instruction) or movw; movt; blx (3 instructions)
+  const int call_words = maybe_far_call(this) ? 3 : 1;
+  return call_words * NativeInstruction::instruction_size;
+}
+%}
+
+// The intptr_t operand types, defined by textual substitution.
+// (Cf. opto/type.hpp.  This lets us avoid many, many other ifdefs.)
+#define immX      immI
+#define immXRot   immIRot
+#define iRegX     iRegI
+#define aimmX     aimmI
+#define limmX     limmI
+#define immX10x2  immI10x2
+#define LShiftX   LShiftI
+#define shimmX    immU5
+
+// Compatibility interface
+#define aimmP     immPRot
+#define immIMov   immIRot
+
+#define store_RegL     iRegL
+#define store_RegLd    iRegLd
+#define store_RegI     iRegI
+#define store_ptr_RegP iRegP
+
+//----------ATTRIBUTES---------------------------------------------------------
+//----------Operand Attributes-------------------------------------------------
+op_attrib op_cost(1);          // Required cost attribute
+
+//----------OPERANDS-----------------------------------------------------------
+// Operand definitions must precede instruction definitions for correct parsing
+// in the ADLC because operands constitute user defined types which are used in
+// instruction definitions.
+
+//----------Simple Operands----------------------------------------------------
+// Immediate Operands
+
+operand immIRot() %{
+  predicate(AsmOperand::is_rotated_imm(n->get_int())); // int constants accepted by is_rotated_imm()
+  match(ConI);
+
+  op_cost(0);
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immIRotn() %{
+  predicate(n->get_int() != 0 && AsmOperand::is_rotated_imm(~n->get_int())); // non-zero ints whose bitwise complement passes is_rotated_imm()
+  match(ConI);
+
+  op_cost(0);
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immIRotneg() %{
+  // Accept a constant only when it is not itself a rotated immediate but its
+  // negation is.  A constant that passes is_rotated_imm() directly is an
+  // immIRot, for which an optimal instruction combination already exists.
+  predicate(!AsmOperand::is_rotated_imm(n->get_int()) && AsmOperand::is_rotated_imm(-n->get_int()));
+  match(ConI);
+
+  op_cost(0);
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Non-negative integer immediate that is encodable using the rotation scheme,
+// and that when expanded fits in 31 bits.
+operand immU31Rot() %{
+  predicate((0 <= n->get_int()) && AsmOperand::is_rotated_imm(n->get_int())); // non-negative and accepted by is_rotated_imm()
+  match(ConI);
+
+  op_cost(0);
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immPRot() %{
+  predicate(n->get_ptr() == 0 || (AsmOperand::is_rotated_imm(n->get_ptr()) && ((ConPNode*)n)->type()->reloc() == relocInfo::none)); // null, or a rotated-immediate pointer with no relocation
+
+  match(ConP);
+
+  op_cost(0);
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immLlowRot() %{
+  predicate(n->get_long() >> 32 == 0 && AsmOperand::is_rotated_imm((int)n->get_long())); // upper 32 bits zero, lower word passes is_rotated_imm()
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immLRot2() %{
+  predicate(AsmOperand::is_rotated_imm((int)(n->get_long() >> 32)) && // upper 32-bit word
+            AsmOperand::is_rotated_imm((int)(n->get_long()))); // lower 32-bit word; both must pass
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: 12-bit - for addressing mode
+operand immI12() %{
+  predicate((-4096 < n->get_int()) && (n->get_int() < 4096)); // open interval (-4096, 4096), same bounds as is_memoryI()
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: 10-bit disp and disp+4 - for addressing float pair
+operand immI10x2() %{
+  predicate((-1024 < n->get_int()) && (n->get_int() < 1024 - 4)); // upper bound lowered by 4 so that disp+4 also stays below 1024
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Integer Immediate: 12-bit disp and disp+4 - for addressing word pair
+operand immI12x2() %{
+  predicate((-4096 < n->get_int()) && (n->get_int() < 4096 - 4)); // upper bound lowered by 4 so that disp+4 also stays below 4096
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/arm_64.ad	2016-12-02 11:17:03.689308094 -0500
@@ -0,0 +1,998 @@
+//
+// Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This code is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License version 2 only, as
+// published by the Free Software Foundation.
+//
+// This code is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+//
+
+// ARM Architecture Description File
+
+//----------REGISTER DEFINITION BLOCK------------------------------------------
+// This information is used by the matcher and the register allocator to
+// describe individual registers and classes of registers within the target
+// architecture.
+register %{
+//----------Architecture Description Register Definitions----------------------
+// General Registers
+// "reg_def"  name ( register save type, C convention save type,
+//                   ideal register type, encoding, vm name );
+// Register Save Types:
+//
+// NS  = No-Save:       The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method, &
+//                      that they do not need to be saved at call sites.
+//
+// SOC = Save-On-Call:  The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method,
+//                      but that they must be saved at call sites.
+//
+// SOE = Save-On-Entry: The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, but they do not need to be saved at call
+//                      sites.
+//
+// AS  = Always-Save:   The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, & that they must be saved at call sites.
+//
+// Ideal Register Type is used to determine how to save & restore a
+// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
+// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
+// FIXME: above comment seems wrong.  Spill done through MachSpillCopyNode
+//
+// The encoding number is the actual bit-pattern placed into the opcodes.
+
+
+// ----------------------------
+// Integer/Long Registers
+// ----------------------------
+
+// TODO: would be nice to keep track of high-word state:
+// zeroRegI --> RegL
+// signedRegI --> RegL
+// junkRegI --> RegL
+// how to tell C2 to treat RegI as RegL, or RegL as RegI?
+reg_def R_R0  (SOC, SOC, Op_RegI,   0, R0->as_VMReg());
+reg_def R_R0x (SOC, SOC, Op_RegI, 255, R0->as_VMReg()->next());
+reg_def R_R1  (SOC, SOC, Op_RegI,   1, R1->as_VMReg());
+reg_def R_R1x (SOC, SOC, Op_RegI, 255, R1->as_VMReg()->next());
+reg_def R_R2  (SOC, SOC, Op_RegI,   2, R2->as_VMReg());
+reg_def R_R2x (SOC, SOC, Op_RegI, 255, R2->as_VMReg()->next());
+reg_def R_R3  (SOC, SOC, Op_RegI,   3, R3->as_VMReg());
+reg_def R_R3x (SOC, SOC, Op_RegI, 255, R3->as_VMReg()->next());
+reg_def R_R4  (SOC, SOC, Op_RegI,   4, R4->as_VMReg());
+reg_def R_R4x (SOC, SOC, Op_RegI, 255, R4->as_VMReg()->next());
+reg_def R_R5  (SOC, SOC, Op_RegI,   5, R5->as_VMReg());
+reg_def R_R5x (SOC, SOC, Op_RegI, 255, R5->as_VMReg()->next());
+reg_def R_R6  (SOC, SOC, Op_RegI,   6, R6->as_VMReg());
+reg_def R_R6x (SOC, SOC, Op_RegI, 255, R6->as_VMReg()->next());
+reg_def R_R7  (SOC, SOC, Op_RegI,   7, R7->as_VMReg());
+reg_def R_R7x (SOC, SOC, Op_RegI, 255, R7->as_VMReg()->next());
+
+reg_def R_R8  (SOC, SOC, Op_RegI,   8, R8->as_VMReg());
+reg_def R_R8x (SOC, SOC, Op_RegI, 255, R8->as_VMReg()->next());
+reg_def R_R9  (SOC, SOC, Op_RegI,   9, R9->as_VMReg());
+reg_def R_R9x (SOC, SOC, Op_RegI, 255, R9->as_VMReg()->next());
+reg_def R_R10 (SOC, SOC, Op_RegI,  10, R10->as_VMReg());
+reg_def R_R10x(SOC, SOC, Op_RegI, 255, R10->as_VMReg()->next());
+reg_def R_R11 (SOC, SOC, Op_RegI,  11, R11->as_VMReg());
+reg_def R_R11x(SOC, SOC, Op_RegI, 255, R11->as_VMReg()->next());
+reg_def R_R12 (SOC, SOC, Op_RegI,  12, R12->as_VMReg());
+reg_def R_R12x(SOC, SOC, Op_RegI, 255, R12->as_VMReg()->next());
+reg_def R_R13 (SOC, SOC, Op_RegI,  13, R13->as_VMReg());
+reg_def R_R13x(SOC, SOC, Op_RegI, 255, R13->as_VMReg()->next());
+reg_def R_R14 (SOC, SOC, Op_RegI,  14, R14->as_VMReg());
+reg_def R_R14x(SOC, SOC, Op_RegI, 255, R14->as_VMReg()->next());
+reg_def R_R15 (SOC, SOC, Op_RegI,  15, R15->as_VMReg());
+reg_def R_R15x(SOC, SOC, Op_RegI, 255, R15->as_VMReg()->next());
+
+reg_def R_R16 (SOC, SOC, Op_RegI,  16, R16->as_VMReg()); // IP0
+reg_def R_R16x(SOC, SOC, Op_RegI, 255, R16->as_VMReg()->next());
+reg_def R_R17 (SOC, SOC, Op_RegI,  17, R17->as_VMReg()); // IP1
+reg_def R_R17x(SOC, SOC, Op_RegI, 255, R17->as_VMReg()->next());
+reg_def R_R18 (SOC, SOC, Op_RegI,  18, R18->as_VMReg()); // Platform Register
+reg_def R_R18x(SOC, SOC, Op_RegI, 255, R18->as_VMReg()->next());
+
+reg_def R_R19 (SOC, SOE, Op_RegI,  19, R19->as_VMReg());
+reg_def R_R19x(SOC, SOE, Op_RegI, 255, R19->as_VMReg()->next());
+reg_def R_R20 (SOC, SOE, Op_RegI,  20, R20->as_VMReg());
+reg_def R_R20x(SOC, SOE, Op_RegI, 255, R20->as_VMReg()->next());
+reg_def R_R21 (SOC, SOE, Op_RegI,  21, R21->as_VMReg());
+reg_def R_R21x(SOC, SOE, Op_RegI, 255, R21->as_VMReg()->next());
+reg_def R_R22 (SOC, SOE, Op_RegI,  22, R22->as_VMReg());
+reg_def R_R22x(SOC, SOE, Op_RegI, 255, R22->as_VMReg()->next());
+reg_def R_R23 (SOC, SOE, Op_RegI,  23, R23->as_VMReg());
+reg_def R_R23x(SOC, SOE, Op_RegI, 255, R23->as_VMReg()->next());
+reg_def R_R24 (SOC, SOE, Op_RegI,  24, R24->as_VMReg());
+reg_def R_R24x(SOC, SOE, Op_RegI, 255, R24->as_VMReg()->next());
+reg_def R_R25 (SOC, SOE, Op_RegI,  25, R25->as_VMReg());
+reg_def R_R25x(SOC, SOE, Op_RegI, 255, R25->as_VMReg()->next());
+reg_def R_R26 (SOC, SOE, Op_RegI,  26, R26->as_VMReg());
+reg_def R_R26x(SOC, SOE, Op_RegI, 255, R26->as_VMReg()->next());
+reg_def R_R27 (SOC, SOE, Op_RegI,  27, R27->as_VMReg());         // Rheap_base
+reg_def R_R27x(SOC, SOE, Op_RegI, 255, R27->as_VMReg()->next()); // Rheap_base
+reg_def R_R28 ( NS, SOE, Op_RegI,  28, R28->as_VMReg());         // TLS
+reg_def R_R28x( NS, SOE, Op_RegI, 255, R28->as_VMReg()->next()); // TLS
+
+reg_def R_R29 ( NS, SOE, Op_RegI,  29, R29->as_VMReg());         // FP
+reg_def R_R29x( NS, SOE, Op_RegI, 255, R29->as_VMReg()->next()); // FP
+reg_def R_R30 (SOC, SOC, Op_RegI,  30, R30->as_VMReg());         // LR
+reg_def R_R30x(SOC, SOC, Op_RegI, 255, R30->as_VMReg()->next()); // LR
+
+reg_def R_ZR ( NS,  NS, Op_RegI,  31, ZR->as_VMReg());  // ZR
+reg_def R_ZRx( NS,  NS, Op_RegI, 255, ZR->as_VMReg()->next()); // ZR
+
+// FIXME
+//reg_def R_SP ( NS,  NS, Op_RegP,  32, SP->as_VMReg());
+reg_def R_SP ( NS,  NS, Op_RegI,  32, SP->as_VMReg());
+//reg_def R_SPx( NS, NS, Op_RegP, 255, SP->as_VMReg()->next());
+reg_def R_SPx( NS,  NS, Op_RegI, 255, SP->as_VMReg()->next());
+
+// ----------------------------
+// Float/Double/Vector Registers
+// ----------------------------
+
+reg_def  R_V0(SOC, SOC, Op_RegF,  0,  V0->as_VMReg());
+reg_def  R_V1(SOC, SOC, Op_RegF,  1,  V1->as_VMReg());
+reg_def  R_V2(SOC, SOC, Op_RegF,  2,  V2->as_VMReg());
+reg_def  R_V3(SOC, SOC, Op_RegF,  3,  V3->as_VMReg());
+reg_def  R_V4(SOC, SOC, Op_RegF,  4,  V4->as_VMReg());
+reg_def  R_V5(SOC, SOC, Op_RegF,  5,  V5->as_VMReg());
+reg_def  R_V6(SOC, SOC, Op_RegF,  6,  V6->as_VMReg());
+reg_def  R_V7(SOC, SOC, Op_RegF,  7,  V7->as_VMReg());
+reg_def  R_V8(SOC, SOC, Op_RegF,  8,  V8->as_VMReg());
+reg_def  R_V9(SOC, SOC, Op_RegF,  9,  V9->as_VMReg());
+reg_def R_V10(SOC, SOC, Op_RegF, 10, V10->as_VMReg());
+reg_def R_V11(SOC, SOC, Op_RegF, 11, V11->as_VMReg());
+reg_def R_V12(SOC, SOC, Op_RegF, 12, V12->as_VMReg());
+reg_def R_V13(SOC, SOC, Op_RegF, 13, V13->as_VMReg());
+reg_def R_V14(SOC, SOC, Op_RegF, 14, V14->as_VMReg());
+reg_def R_V15(SOC, SOC, Op_RegF, 15, V15->as_VMReg());
+reg_def R_V16(SOC, SOC, Op_RegF, 16, V16->as_VMReg());
+reg_def R_V17(SOC, SOC, Op_RegF, 17, V17->as_VMReg());
+reg_def R_V18(SOC, SOC, Op_RegF, 18, V18->as_VMReg());
+reg_def R_V19(SOC, SOC, Op_RegF, 19, V19->as_VMReg());
+reg_def R_V20(SOC, SOC, Op_RegF, 20, V20->as_VMReg());
+reg_def R_V21(SOC, SOC, Op_RegF, 21, V21->as_VMReg());
+reg_def R_V22(SOC, SOC, Op_RegF, 22, V22->as_VMReg());
+reg_def R_V23(SOC, SOC, Op_RegF, 23, V23->as_VMReg());
+reg_def R_V24(SOC, SOC, Op_RegF, 24, V24->as_VMReg());
+reg_def R_V25(SOC, SOC, Op_RegF, 25, V25->as_VMReg());
+reg_def R_V26(SOC, SOC, Op_RegF, 26, V26->as_VMReg());
+reg_def R_V27(SOC, SOC, Op_RegF, 27, V27->as_VMReg());
+reg_def R_V28(SOC, SOC, Op_RegF, 28, V28->as_VMReg());
+reg_def R_V29(SOC, SOC, Op_RegF, 29, V29->as_VMReg());
+reg_def R_V30(SOC, SOC, Op_RegF, 30, V30->as_VMReg());
+reg_def R_V31(SOC, SOC, Op_RegF, 31, V31->as_VMReg());
+
+reg_def  R_V0b(SOC, SOC, Op_RegF, 255, V0->as_VMReg()->next(1)); // NOTE(review): encoding 255 here, register number on most other 'b' slots -- confirm intent
+reg_def  R_V1b(SOC, SOC, Op_RegF, 255, V1->as_VMReg()->next(1)); // encoding 255, see NOTE(review) on R_V0b
+reg_def  R_V2b(SOC, SOC, Op_RegF, 255, V2->as_VMReg()->next(1)); // encoding 255, see NOTE(review) on R_V0b
+reg_def  R_V3b(SOC, SOC, Op_RegF,  3,  V3->as_VMReg()->next(1));
+reg_def  R_V4b(SOC, SOC, Op_RegF,  4,  V4->as_VMReg()->next(1));
+reg_def  R_V5b(SOC, SOC, Op_RegF,  5,  V5->as_VMReg()->next(1));
+reg_def  R_V6b(SOC, SOC, Op_RegF,  6,  V6->as_VMReg()->next(1));
+reg_def  R_V7b(SOC, SOC, Op_RegF,  7,  V7->as_VMReg()->next(1));
+reg_def  R_V8b(SOC, SOC, Op_RegF, 255, V8->as_VMReg()->next(1)); // encoding 255, see NOTE(review) on R_V0b
+reg_def  R_V9b(SOC, SOC, Op_RegF,  9,  V9->as_VMReg()->next(1));
+reg_def R_V10b(SOC, SOC, Op_RegF, 10, V10->as_VMReg()->next(1));
+reg_def R_V11b(SOC, SOC, Op_RegF, 11, V11->as_VMReg()->next(1));
+reg_def R_V12b(SOC, SOC, Op_RegF, 12, V12->as_VMReg()->next(1));
+reg_def R_V13b(SOC, SOC, Op_RegF, 13, V13->as_VMReg()->next(1));
+reg_def R_V14b(SOC, SOC, Op_RegF, 14, V14->as_VMReg()->next(1));
+reg_def R_V15b(SOC, SOC, Op_RegF, 15, V15->as_VMReg()->next(1));
+reg_def R_V16b(SOC, SOC, Op_RegF, 16, V16->as_VMReg()->next(1));
+reg_def R_V17b(SOC, SOC, Op_RegF, 17, V17->as_VMReg()->next(1));
+reg_def R_V18b(SOC, SOC, Op_RegF, 18, V18->as_VMReg()->next(1));
+reg_def R_V19b(SOC, SOC, Op_RegF, 19, V19->as_VMReg()->next(1));
+reg_def R_V20b(SOC, SOC, Op_RegF, 20, V20->as_VMReg()->next(1));
+reg_def R_V21b(SOC, SOC, Op_RegF, 21, V21->as_VMReg()->next(1));
+reg_def R_V22b(SOC, SOC, Op_RegF, 22, V22->as_VMReg()->next(1));
+reg_def R_V23b(SOC, SOC, Op_RegF, 23, V23->as_VMReg()->next(1));
+reg_def R_V24b(SOC, SOC, Op_RegF, 24, V24->as_VMReg()->next(1));
+reg_def R_V25b(SOC, SOC, Op_RegF, 25, V25->as_VMReg()->next(1));
+reg_def R_V26b(SOC, SOC, Op_RegF, 26, V26->as_VMReg()->next(1));
+reg_def R_V27b(SOC, SOC, Op_RegF, 27, V27->as_VMReg()->next(1));
+reg_def R_V28b(SOC, SOC, Op_RegF, 28, V28->as_VMReg()->next(1));
+reg_def R_V29b(SOC, SOC, Op_RegF, 29, V29->as_VMReg()->next(1));
+reg_def R_V30b(SOC, SOC, Op_RegD, 30, V30->as_VMReg()->next(1)); // NOTE(review): only Op_RegD entry in this table, neighbours are Op_RegF -- confirm
+reg_def R_V31b(SOC, SOC, Op_RegF, 31, V31->as_VMReg()->next(1));
+
+reg_def  R_V0c(SOC, SOC, Op_RegF,  0,  V0->as_VMReg()->next(2));
+reg_def  R_V1c(SOC, SOC, Op_RegF,  1,  V1->as_VMReg()->next(2));
+reg_def  R_V2c(SOC, SOC, Op_RegF,  2,  V2->as_VMReg()->next(2));
+reg_def  R_V3c(SOC, SOC, Op_RegF,  3,  V3->as_VMReg()->next(2));
+reg_def  R_V4c(SOC, SOC, Op_RegF,  4,  V4->as_VMReg()->next(2));
+reg_def  R_V5c(SOC, SOC, Op_RegF,  5,  V5->as_VMReg()->next(2));
+reg_def  R_V6c(SOC, SOC, Op_RegF,  6,  V6->as_VMReg()->next(2));
+reg_def  R_V7c(SOC, SOC, Op_RegF,  7,  V7->as_VMReg()->next(2));
+reg_def  R_V8c(SOC, SOC, Op_RegF,  8,  V8->as_VMReg()->next(2));
+reg_def  R_V9c(SOC, SOC, Op_RegF,  9,  V9->as_VMReg()->next(2));
+reg_def R_V10c(SOC, SOC, Op_RegF, 10, V10->as_VMReg()->next(2));
+reg_def R_V11c(SOC, SOC, Op_RegF, 11, V11->as_VMReg()->next(2));
+reg_def R_V12c(SOC, SOC, Op_RegF, 12, V12->as_VMReg()->next(2));
+reg_def R_V13c(SOC, SOC, Op_RegF, 13, V13->as_VMReg()->next(2));
+reg_def R_V14c(SOC, SOC, Op_RegF, 14, V14->as_VMReg()->next(2));
+reg_def R_V15c(SOC, SOC, Op_RegF, 15, V15->as_VMReg()->next(2));
+reg_def R_V16c(SOC, SOC, Op_RegF, 16, V16->as_VMReg()->next(2));
+reg_def R_V17c(SOC, SOC, Op_RegF, 17, V17->as_VMReg()->next(2));
+reg_def R_V18c(SOC, SOC, Op_RegF, 18, V18->as_VMReg()->next(2));
+reg_def R_V19c(SOC, SOC, Op_RegF, 19, V19->as_VMReg()->next(2));
+reg_def R_V20c(SOC, SOC, Op_RegF, 20, V20->as_VMReg()->next(2));
+reg_def R_V21c(SOC, SOC, Op_RegF, 21, V21->as_VMReg()->next(2));
+reg_def R_V22c(SOC, SOC, Op_RegF, 22, V22->as_VMReg()->next(2));
+reg_def R_V23c(SOC, SOC, Op_RegF, 23, V23->as_VMReg()->next(2));
+reg_def R_V24c(SOC, SOC, Op_RegF, 24, V24->as_VMReg()->next(2));
+reg_def R_V25c(SOC, SOC, Op_RegF, 25, V25->as_VMReg()->next(2));
+reg_def R_V26c(SOC, SOC, Op_RegF, 26, V26->as_VMReg()->next(2));
+reg_def R_V27c(SOC, SOC, Op_RegF, 27, V27->as_VMReg()->next(2));
+reg_def R_V28c(SOC, SOC, Op_RegF, 28, V28->as_VMReg()->next(2));
+reg_def R_V29c(SOC, SOC, Op_RegF, 29, V29->as_VMReg()->next(2));
+reg_def R_V30c(SOC, SOC, Op_RegF, 30, V30->as_VMReg()->next(2));
+reg_def R_V31c(SOC, SOC, Op_RegF, 31, V31->as_VMReg()->next(2));
+
+reg_def  R_V0d(SOC, SOC, Op_RegF,  0,  V0->as_VMReg()->next(3));
+reg_def  R_V1d(SOC, SOC, Op_RegF,  1,  V1->as_VMReg()->next(3));
+reg_def  R_V2d(SOC, SOC, Op_RegF,  2,  V2->as_VMReg()->next(3));
+reg_def  R_V3d(SOC, SOC, Op_RegF,  3,  V3->as_VMReg()->next(3));
+reg_def  R_V4d(SOC, SOC, Op_RegF,  4,  V4->as_VMReg()->next(3));
+reg_def  R_V5d(SOC, SOC, Op_RegF,  5,  V5->as_VMReg()->next(3));
+reg_def  R_V6d(SOC, SOC, Op_RegF,  6,  V6->as_VMReg()->next(3));
+reg_def  R_V7d(SOC, SOC, Op_RegF,  7,  V7->as_VMReg()->next(3));
+reg_def  R_V8d(SOC, SOC, Op_RegF,  8,  V8->as_VMReg()->next(3));
+reg_def  R_V9d(SOC, SOC, Op_RegF,  9,  V9->as_VMReg()->next(3));
+reg_def R_V10d(SOC, SOC, Op_RegF, 10, V10->as_VMReg()->next(3));
+reg_def R_V11d(SOC, SOC, Op_RegF, 11, V11->as_VMReg()->next(3));
+reg_def R_V12d(SOC, SOC, Op_RegF, 12, V12->as_VMReg()->next(3));
+reg_def R_V13d(SOC, SOC, Op_RegF, 13, V13->as_VMReg()->next(3));
+reg_def R_V14d(SOC, SOC, Op_RegF, 14, V14->as_VMReg()->next(3));
+reg_def R_V15d(SOC, SOC, Op_RegF, 15, V15->as_VMReg()->next(3));
+reg_def R_V16d(SOC, SOC, Op_RegF, 16, V16->as_VMReg()->next(3));
+reg_def R_V17d(SOC, SOC, Op_RegF, 17, V17->as_VMReg()->next(3));
+reg_def R_V18d(SOC, SOC, Op_RegF, 18, V18->as_VMReg()->next(3));
+reg_def R_V19d(SOC, SOC, Op_RegF, 19, V19->as_VMReg()->next(3));
+reg_def R_V20d(SOC, SOC, Op_RegF, 20, V20->as_VMReg()->next(3));
+reg_def R_V21d(SOC, SOC, Op_RegF, 21, V21->as_VMReg()->next(3));
+reg_def R_V22d(SOC, SOC, Op_RegF, 22, V22->as_VMReg()->next(3));
+reg_def R_V23d(SOC, SOC, Op_RegF, 23, V23->as_VMReg()->next(3));
+reg_def R_V24d(SOC, SOC, Op_RegF, 24, V24->as_VMReg()->next(3));
+reg_def R_V25d(SOC, SOC, Op_RegF, 25, V25->as_VMReg()->next(3));
+reg_def R_V26d(SOC, SOC, Op_RegF, 26, V26->as_VMReg()->next(3));
+reg_def R_V27d(SOC, SOC, Op_RegF, 27, V27->as_VMReg()->next(3));
+reg_def R_V28d(SOC, SOC, Op_RegF, 28, V28->as_VMReg()->next(3));
+reg_def R_V29d(SOC, SOC, Op_RegF, 29, V29->as_VMReg()->next(3));
+reg_def R_V30d(SOC, SOC, Op_RegF, 30, V30->as_VMReg()->next(3));
+reg_def R_V31d(SOC, SOC, Op_RegF, 31, V31->as_VMReg()->next(3));
+
+// ----------------------------
+// Special Registers
+// Condition Codes Flag Registers
+reg_def APSR (SOC, SOC,  Op_RegFlags, 255, VMRegImpl::Bad());
+reg_def FPSCR(SOC, SOC,  Op_RegFlags, 255, VMRegImpl::Bad());
+
+// ----------------------------
+// Specify the enum values for the registers.  These enums are only used by the
+// OptoReg "class". We can convert these enum values at will to VMReg when needed
+// for visibility to the rest of the vm. The order of this enum influences the
+// register allocator so having the freedom to set this order and not be stuck
+// with the order that is natural for the rest of the vm is worth it.
+
+// Quad vector must be aligned here, so list them first.
+alloc_class fprs(
+    R_V8,  R_V8b,  R_V8c,  R_V8d,  R_V9,  R_V9b,  R_V9c,  R_V9d,
+    R_V10, R_V10b, R_V10c, R_V10d, R_V11, R_V11b, R_V11c, R_V11d,
+    R_V12, R_V12b, R_V12c, R_V12d, R_V13, R_V13b, R_V13c, R_V13d,
+    R_V14, R_V14b, R_V14c, R_V14d, R_V15, R_V15b, R_V15c, R_V15d,
+    R_V16, R_V16b, R_V16c, R_V16d, R_V17, R_V17b, R_V17c, R_V17d,
+    R_V18, R_V18b, R_V18c, R_V18d, R_V19, R_V19b, R_V19c, R_V19d,
+    R_V20, R_V20b, R_V20c, R_V20d, R_V21, R_V21b, R_V21c, R_V21d,
+    R_V22, R_V22b, R_V22c, R_V22d, R_V23, R_V23b, R_V23c, R_V23d,
+    R_V24, R_V24b, R_V24c, R_V24d, R_V25, R_V25b, R_V25c, R_V25d,
+    R_V26, R_V26b, R_V26c, R_V26d, R_V27, R_V27b, R_V27c, R_V27d,
+    R_V28, R_V28b, R_V28c, R_V28d, R_V29, R_V29b, R_V29c, R_V29d,
+    R_V30, R_V30b, R_V30c, R_V30d, R_V31, R_V31b, R_V31c, R_V31d,
+    R_V0,  R_V0b,  R_V0c,  R_V0d,  R_V1,  R_V1b,  R_V1c,  R_V1d,
+    R_V2,  R_V2b,  R_V2c,  R_V2d,  R_V3,  R_V3b,  R_V3c,  R_V3d,
+    R_V4,  R_V4b,  R_V4c,  R_V4d,  R_V5,  R_V5b,  R_V5c,  R_V5d,
+    R_V6,  R_V6b,  R_V6c,  R_V6d,  R_V7,  R_V7b,  R_V7c,  R_V7d
+);
+
+// Need double-register alignment here.
+// We are already quad-register aligned because of vectors above.
+alloc_class gprs(
+    R_R0,  R_R0x,  R_R1,  R_R1x,  R_R2,  R_R2x,  R_R3,  R_R3x,
+    R_R4,  R_R4x,  R_R5,  R_R5x,  R_R6,  R_R6x,  R_R7,  R_R7x,
+    R_R8,  R_R8x,  R_R9,  R_R9x,  R_R10, R_R10x, R_R11, R_R11x,
+    R_R12, R_R12x, R_R13, R_R13x, R_R14, R_R14x, R_R15, R_R15x,
+    R_R16, R_R16x, R_R17, R_R17x, R_R18, R_R18x, R_R19, R_R19x,
+    R_R20, R_R20x, R_R21, R_R21x, R_R22, R_R22x, R_R23, R_R23x,
+    R_R24, R_R24x, R_R25, R_R25x, R_R26, R_R26x, R_R27, R_R27x,
+    R_R28, R_R28x, R_R29, R_R29x, R_R30, R_R30x
+);
+// Continuing with double-register alignment...
+alloc_class chunk2(APSR, FPSCR);
+alloc_class chunk3(R_SP, R_SPx);
+alloc_class chunk4(R_ZR, R_ZRx);
+
+//----------Architecture Description Register Classes--------------------------
+// Several register classes are automatically defined based upon information in
+// this architecture description.
+// 1) reg_class inline_cache_reg           ( as defined in frame section )
+// 2) reg_class interpreter_method_oop_reg ( as defined in frame section )
+// 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ )
+//
+
+// ----------------------------
+// Integer Register Classes
+// ----------------------------
+reg_class int_reg_all(R_R0,  R_R1,  R_R2,  R_R3,  R_R4,  R_R5,  R_R6,  R_R7,
+                      R_R8,  R_R9,  R_R10, R_R11, R_R12, R_R13, R_R14, R_R15,
+                      R_R16, R_R17, R_R18, R_R19, R_R20, R_R21, R_R22, R_R23,
+                      R_R24, R_R25, R_R26, R_R27, R_R28, R_R29, R_R30
+);
+
+// Exclusions from i_reg:
+// SP (R31)
+// Rthread/R28: reserved by HotSpot to the TLS register (invariant within Java)
+reg_class int_reg %{
+    return _INT_REG_mask;
+%}
+reg_class ptr_reg %{
+    return _PTR_REG_mask;
+%}
+reg_class vectorx_reg %{
+    return _VECTORX_REG_mask;
+%}
+
+reg_class R0_regI(R_R0);
+reg_class R1_regI(R_R1);
+reg_class R2_regI(R_R2);
+reg_class R3_regI(R_R3);
+//reg_class R12_regI(R_R12);
+
+// ----------------------------
+// Pointer Register Classes
+// ----------------------------
+
+// Special class for storeP instructions, which can store SP or RPC to TLS.
+// It is also used for memory addressing, allowing direct TLS addressing.
+
+reg_class sp_ptr_reg %{
+    return _SP_PTR_REG_mask;
+%}
+
+reg_class store_reg %{
+    return _STR_REG_mask;
+%}
+
+reg_class store_ptr_reg %{
+    return _STR_PTR_REG_mask;
+%}
+
+reg_class spillP_reg %{
+    return _SPILLP_REG_mask;
+%}
+
+// Other special pointer regs
+reg_class R0_regP(R_R0, R_R0x);
+reg_class R1_regP(R_R1, R_R1x);
+reg_class R2_regP(R_R2, R_R2x);
+reg_class Rexception_regP(R_R19, R_R19x);
+reg_class Ricklass_regP(R_R8, R_R8x);
+reg_class Rmethod_regP(R_R27, R_R27x);
+
+reg_class Rthread_regP(R_R28, R_R28x);
+reg_class IP_regP(R_R16, R_R16x);
+#define RtempRegP IPRegP
+reg_class LR_regP(R_R30, R_R30x);
+
+reg_class SP_regP(R_SP,  R_SPx);
+reg_class FP_regP(R_R29, R_R29x);
+
+reg_class ZR_regP(R_ZR, R_ZRx);
+reg_class ZR_regI(R_ZR);
+
+// ----------------------------
+// Long Register Classes
+// ----------------------------
+reg_class long_reg %{ return _PTR_REG_mask; %}
+// for ldrexd, strexd: first reg of pair must be even (NOTE(review): this file maps ldrex/strex to ldxr/stxr -- confirm this still applies)
+reg_class long_reg_align %{ return LONG_REG_mask(); %}
+
+reg_class R0_regL(R_R0,R_R0x); // arg 1 or return value
+
+// ----------------------------
+// Special Class for Condition Code Flags Register
+reg_class int_flags(APSR);
+reg_class float_flags(FPSCR);
+
+
+// ----------------------------
+// Float Point Register Classes
+// ----------------------------
+reg_class sflt_reg_0(
+  R_V0,  R_V1,  R_V2,  R_V3,  R_V4,  R_V5,  R_V6,  R_V7,
+  R_V8,  R_V9,  R_V10, R_V11, R_V12, R_V13, R_V14, R_V15,
+  R_V16, R_V17, R_V18, R_V19, R_V20, R_V21, R_V22, R_V23,
+  R_V24, R_V25, R_V26, R_V27, R_V28, R_V29, R_V30, R_V31);
+
+reg_class sflt_reg %{
+    return _SFLT_REG_mask;
+%}
+
+reg_class dflt_low_reg %{
+    return _DFLT_REG_mask;
+%}
+
+reg_class actual_dflt_reg %{
+    return _DFLT_REG_mask;
+%}
+
+reg_class vectorx_reg_0(
+  R_V0,  R_V1,  R_V2,  R_V3,  R_V4,  R_V5, R_V6, R_V7,
+  R_V8,  R_V9,  R_V10, R_V11, R_V12, R_V13, R_V14, R_V15,
+  R_V16, R_V17, R_V18, R_V19, R_V20, R_V21, R_V22, R_V23,
+  R_V24, R_V25, R_V26, R_V27, R_V28, R_V29, R_V30, /*R_V31,*/
+  R_V0b,  R_V1b,  R_V2b,  R_V3b,  R_V4b,  R_V5b,  R_V6b,  R_V7b,
+  R_V8b,  R_V9b,  R_V10b, R_V11b, R_V12b, R_V13b, R_V14b, R_V15b,
+  R_V16b, R_V17b, R_V18b, R_V19b, R_V20b, R_V21b, R_V22b, R_V23b,
+  R_V24b, R_V25b, R_V26b, R_V27b, R_V28b, R_V29b, R_V30b, /*R_V31b,*/
+  R_V0c,  R_V1c,  R_V2c,  R_V3c,  R_V4c,  R_V5c,  R_V6c,  R_V7c,
+  R_V8c,  R_V9c,  R_V10c, R_V11c, R_V12c, R_V13c, R_V14c, R_V15c,
+  R_V16c, R_V17c, R_V18c, R_V19c, R_V20c, R_V21c, R_V22c, R_V23c,
+  R_V24c, R_V25c, R_V26c, R_V27c, R_V28c, R_V29c, R_V30c, /*R_V31c,*/
+  R_V0d,  R_V1d,  R_V2d,  R_V3d,  R_V4d,  R_V5d,  R_V6d,  R_V7d,
+  R_V8d,  R_V9d,  R_V10d, R_V11d, R_V12d, R_V13d, R_V14d, R_V15d,
+  R_V16d, R_V17d, R_V18d, R_V19d, R_V20d, R_V21d, R_V22d, R_V23d,
+  R_V24d, R_V25d, R_V26d, R_V27d, R_V28d, R_V29d, R_V30d, /*R_V31d*/);
+
+reg_class Rmemcopy_reg %{
+    return _RMEMCOPY_REG_mask;
+%}
+
+%}
+
+source_hpp %{
+
+const MachRegisterNumbers R_mem_copy_lo_num = R_V31_num;
+const MachRegisterNumbers R_mem_copy_hi_num = R_V31b_num;
+const FloatRegister Rmemcopy = V31;
+
+const MachRegisterNumbers R_hf_ret_lo_num = R_V0_num;
+const MachRegisterNumbers R_hf_ret_hi_num = R_V0b_num;
+const FloatRegister Rhfret = V0;
+
+extern OptoReg::Name R_Ricklass_num;
+extern OptoReg::Name R_Rmethod_num;
+extern OptoReg::Name R_tls_num;
+extern OptoReg::Name R_Rheap_base_num;
+
+extern RegMask _INT_REG_mask;
+extern RegMask _PTR_REG_mask;
+extern RegMask _SFLT_REG_mask;
+extern RegMask _DFLT_REG_mask;
+extern RegMask _VECTORX_REG_mask;
+extern RegMask _RMEMCOPY_REG_mask;
+extern RegMask _SP_PTR_REG_mask;
+extern RegMask _SPILLP_REG_mask;
+extern RegMask _STR_REG_mask;
+extern RegMask _STR_PTR_REG_mask;
+
+#define LDR_DOUBLE "LDR_D"
+#define LDR_FLOAT  "LDR_S"
+#define STR_DOUBLE "STR_D"
+#define STR_FLOAT  "STR_S"
+#define STR_64     "STR"
+#define LDR_64     "LDR"
+#define STR_32     "STR_W"
+#define LDR_32     "LDR_W"
+#define MOV_DOUBLE "FMOV_D"
+#define MOV_FLOAT  "FMOV_S"
+#define FMSR       "FMOV_SW"
+#define FMRS       "FMOV_WS"
+#define LDREX      "ldxr  "
+#define STREX      "stxr  "
+
+#define str_64     str
+#define ldr_64     ldr
+#define ldr_32     ldr_w
+#define ldrex      ldxr
+#define strex      stxr
+
+#define fmsr       fmov_sw
+#define fmrs       fmov_ws
+#define fconsts    fmov_s
+#define fconstd    fmov_d
+
+static inline bool is_uimm12(jlong imm, int shift) {
+  return Assembler::is_unsigned_imm_in_range(imm, 12, shift);  // range check with 12 bits and the caller's shift
+}
+
+static inline bool is_memoryD(int offset) {
+  // Offsets for double accesses are checked with a shift of 3 (LogBytesPerDouble).
+  return is_uimm12(offset, 3);
+}
+
+static inline bool is_memoryfp(int offset) {
+  // Checked with the int shift so that 32-bit word accesses are included.
+  return is_uimm12(offset, LogBytesPerInt);
+}
+
+static inline bool is_memoryI(int offset) {
+  // Int accesses: offset is checked with the LogBytesPerInt shift.
+  return is_uimm12(offset, LogBytesPerInt);
+}
+
+static inline bool is_memoryP(int offset) {
+  // Pointer accesses: offset is checked with the LogBytesPerWord shift.
+  return is_uimm12(offset, LogBytesPerWord);
+}
+
+static inline bool is_memoryHD(int offset) {
+  // Checked with the int shift so that 32-bit word accesses are included.
+  return is_uimm12(offset, LogBytesPerInt);
+}
+
+uintx limmL_low(uintx imm, int n);
+
+static inline bool Xis_aimm(int imm) {
+  return Assembler::ArithmeticImmediate(imm).is_encoded();  // same check as is_aimm(), but takes an int argument
+}
+
+static inline bool is_aimm(intptr_t imm) {
+  return Assembler::ArithmeticImmediate(imm).is_encoded();  // true when imm can be built as an ArithmeticImmediate
+}
+
+static inline bool is_limmL(uintptr_t imm) {
+  return Assembler::LogicalImmediate(imm).is_encoded();  // true when imm can be built as a LogicalImmediate
+}
+
+static inline bool is_limmL_low(intptr_t imm, int n) {
+  return is_limmL(limmL_low(imm, n));  // limmL_low() returns imm itself when it finds no encodable variant
+}
+
+static inline bool is_limmI(jint imm) {
+  return Assembler::LogicalImmediate(imm, true).is_encoded();  // NOTE(review): 'true' presumably selects the 32-bit form -- confirm
+}
+
+static inline uintx limmI_low(jint imm, int n) {
+  return limmL_low(imm, n);  // jint argument is implicitly converted to uintx
+}
+
+static inline bool is_limmI_low(jint imm, int n) {
+  return is_limmL_low(imm, n);  // jint argument is implicitly converted to intptr_t
+}
+
+%}
+
+source %{
+
+// Given a register encoding, produce an integer Register object; the asserts check that the ADL encodings match the assembler's.
+static Register reg_to_register_object(int register_encoding) {
+  assert(R0->encoding() == R_R0_enc && R30->encoding() == R_R30_enc, "right coding");
+  assert(Rthread->encoding() == R_R28_enc, "right coding");
+  assert(SP->encoding() == R_SP_enc, "right coding");
+  return as_Register(register_encoding);
+}
+
+// Given a register encoding, produce a FloatRegister object; the assert checks V0/V31 against the ADL encodings.
+static FloatRegister reg_to_FloatRegister_object(int register_encoding) {
+  assert(V0->encoding() == R_V0_enc && V31->encoding() == R_V31_enc, "right coding");
+  return as_FloatRegister(register_encoding);
+}
+
+RegMask _INT_REG_mask;
+RegMask _PTR_REG_mask;
+RegMask _SFLT_REG_mask;
+RegMask _DFLT_REG_mask;
+RegMask _VECTORX_REG_mask;
+RegMask _RMEMCOPY_REG_mask;
+RegMask _SP_PTR_REG_mask;
+RegMask _SPILLP_REG_mask;
+RegMask _STR_REG_mask;
+RegMask _STR_PTR_REG_mask;
+
+OptoReg::Name R_Ricklass_num = -1;
+OptoReg::Name R_Rmethod_num  = -1;
+OptoReg::Name R_tls_num      = -1;
+OptoReg::Name R_Rtemp_num    = -1;
+OptoReg::Name R_Rheap_base_num = -1;
+
+static int mov_oop_size = -1;
+
+#ifdef ASSERT
+static bool same_mask(const RegMask &a, const RegMask &b) {
+    RegMask a_sub_b = a; a_sub_b.SUBTRACT(b);  // what remains of a after removing b
+    RegMask b_sub_a = b; b_sub_a.SUBTRACT(a);  // what remains of b after removing a
+    return a_sub_b.Size() == 0 && b_sub_a.Size() == 0;  // both differences empty
+}
+#endif
+
+void Compile::pd_compiler2_init() {
+    // Fills in the R_*_num names, the register masks returned by the dynamic reg_class bodies, and mov_oop_size.
+    R_Ricklass_num = OptoReg::as_OptoReg(Ricklass->as_VMReg());
+    R_Rmethod_num  = OptoReg::as_OptoReg(Rmethod->as_VMReg());
+    R_tls_num      = OptoReg::as_OptoReg(Rthread->as_VMReg());
+    R_Rtemp_num    = OptoReg::as_OptoReg(Rtemp->as_VMReg());
+    R_Rheap_base_num = OptoReg::as_OptoReg(Rheap_base->as_VMReg());
+
+    _INT_REG_mask = _INT_REG_ALL_mask;  // start from every register listed in int_reg_all
+    _INT_REG_mask.Remove(R_tls_num);
+    _INT_REG_mask.Remove(R_SP_num);
+    if (UseCompressedOops) {
+      _INT_REG_mask.Remove(R_Rheap_base_num);  // heap base register is excluded only with compressed oops
+    }
+    // Remove Rtemp because safepoint poll can trash it
+    // (see SharedRuntime::generate_handler_blob)
+    _INT_REG_mask.Remove(R_Rtemp_num);
+
+    _PTR_REG_mask = _INT_REG_mask;
+    _PTR_REG_mask.smear_to_sets(2);  // presumably widens each entry to its 2-slot pair (Rn, Rnx) -- confirm
+
+    // STR_REG    = INT_REG+ZR
+    // SPILLP_REG = INT_REG+SP
+    // SP_PTR_REG = INT_REG+SP+TLS
+    _STR_REG_mask = _INT_REG_mask;
+    _SP_PTR_REG_mask = _STR_REG_mask;  // copied before ZR is inserted, so this is still plain INT_REG
+    _STR_REG_mask.Insert(R_ZR_num);
+    _SP_PTR_REG_mask.Insert(R_SP_num);
+    _SPILLP_REG_mask = _SP_PTR_REG_mask;  // copied before TLS is inserted: INT_REG+SP
+    _SP_PTR_REG_mask.Insert(R_tls_num);
+    _STR_PTR_REG_mask = _STR_REG_mask;
+    _STR_PTR_REG_mask.smear_to_sets(2);
+    _SP_PTR_REG_mask.smear_to_sets(2);
+    _SPILLP_REG_mask.smear_to_sets(2);
+
+    _RMEMCOPY_REG_mask = RegMask(R_mem_copy_lo_num);
+assert(OptoReg::as_OptoReg(Rmemcopy->as_VMReg()) == R_mem_copy_lo_num, "!");
+
+    _SFLT_REG_mask = _SFLT_REG_0_mask;
+    _SFLT_REG_mask.SUBTRACT(_RMEMCOPY_REG_mask);  // the mem-copy register is kept out of the float masks
+    _DFLT_REG_mask = _SFLT_REG_mask;
+    _DFLT_REG_mask.smear_to_sets(2);
+    _VECTORX_REG_mask = _SFLT_REG_mask;
+    _VECTORX_REG_mask.smear_to_sets(4);
+    assert(same_mask(_VECTORX_REG_mask, _VECTORX_REG_0_mask), "!");  // must agree with the static vectorx_reg_0 class
+
+#ifdef ASSERT
+    RegMask r((RegMask *)&SFLT_REG_mask());
+    r.smear_to_sets(2);
+    assert(same_mask(r, _DFLT_REG_mask), "!");
+#endif
+
+    if (VM_Version::prefer_moves_over_load_literal()) {
+      mov_oop_size = 4;  // instruction count, added in MachCallDynamicJavaNode::ret_addr_offset()
+    } else {
+      mov_oop_size = 1;
+    }
+
+    assert(Matcher::interpreter_method_oop_reg_encode() == Rmethod->encoding(), "should be");
+}
+
+uintx limmL_low(uintx imm, int n) {
+  // Look for a value is_limmL() accepts, derived from imm.  1: try as is
+  if (is_limmL(imm)) {
+    return imm;
+  }
+  // 2: try low bits + all 0's
+  uintx imm0 = imm & right_n_bits(n);
+  if (is_limmL(imm0)) {
+    return imm0;
+  }
+  // 3: try low bits + all 1's
+  uintx imm1 = imm0 | left_n_bits(BitsPerWord - n);
+  if (is_limmL(imm1)) {
+    return imm1;
+  }
+#if 0
+  // 4: try low bits replicated (disabled; 'immx' and 'intrptr_t' below are not declared anywhere in this function)
+  int field = 1 << log2_intptr(n + n - 1);
+  assert(field >= n, "!");
+  assert(field / n == 1, "!");
+  intptr_t immr = immx;
+  while (field < BitsPerWord) {
+    intrptr_t bits = immr & right_n_bits(field);
+    immr = bits | (bits << field);
+    field = field << 1;
+  }
+  // replicate at power-of-2 boundary
+  if (is_limmL(immr)) {
+    return immr;
+  }
+#endif
+  return imm;  // no encodable variant found: the input is returned unchanged
+}
+
+// Convert the raw encoding form (0xff as base means SP, 0xff as index means
+// no index register) into the form expected by the constructor for Address.
+Address Address::make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc) {
+  RelocationHolder rspec;
+  if (disp_reloc != relocInfo::none) {
+    rspec = Relocation::spec_simple(disp_reloc);
+  }
+
+  Register rbase = (base == 0xff) ? SP : as_Register(base);  // base 0xff selects SP
+  if (index != 0xff) {
+    Register rindex = as_Register(index);
+    if (disp == 0x7fffffff) { // special value to indicate sign-extend
+      Address madr(rbase, rindex, ex_sxtw, scale);
+      madr._rspec = rspec;
+      return madr;
+    } else {
+      assert(disp == 0, "unsupported");  // an index register is not combined with a real displacement
+      Address madr(rbase, rindex, ex_lsl, scale);
+      madr._rspec = rspec;
+      return madr;
+    }
+  } else {
+    assert(scale == 0, "not supported");  // base + displacement form, no index
+    Address madr(rbase, disp);
+    madr._rspec = rspec;
+    return madr;
+  }
+}
+
+// Location of compiled Java return values.  Same as C.  lo[]/hi[] are indexed by the ideal register opcode.
+OptoRegPair c2::return_value(int ideal_reg) {
+  assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
+  static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, R_R0_num,     R_R0_num,  R_hf_ret_lo_num,  R_hf_ret_lo_num, R_R0_num };
+  static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, R_R0x_num, OptoReg::Bad,     R_hf_ret_hi_num, R_R0x_num };
+  return OptoRegPair( hi[ideal_reg], lo[ideal_reg]);
+}
+
+// !!!!! Special hack to get all type of calls to specify the byte offset
+//       from the start of the call to the point where the return address
+//       will point.
+
+int MachCallStaticJavaNode::ret_addr_offset() {
+  bool far = (_method == NULL) ? maybe_far_call(this) : !cache_reachable();
+  bool patchable = _method != NULL;  // patchable flag is set only when _method is non-NULL
+  int call_size = MacroAssembler::call_size(entry_point(), far, patchable);
+  return (call_size + (_method_handle_invoke ? 1 : 0)) * NativeInstruction::instruction_size;  // +1 instruction for method-handle invokes
+}
+
+int MachCallDynamicJavaNode::ret_addr_offset() {
+  const bool far = !cache_reachable();
+  const int insts = mov_oop_size + MacroAssembler::call_size(entry_point(), far, /* patchable */ true);
+  return insts * NativeInstruction::instruction_size;  // instruction count -> byte offset
+}
+
+int MachCallRuntimeNode::ret_addr_offset() {
+  // TODO: check if Leaf nodes also need this
+  // Calls other than leaf calls are preceded by two instructions:
+  //   adr $temp, ret_addr
+  //   str $temp, [SP + last_java_pc]
+  const int prefix_insts = is_MachCallLeaf() ? 0 : 2;
+
+  // The call itself: bl or mov_slow; blr
+  const bool far = maybe_far_call(this);
+  const int call_insts = MacroAssembler::call_size(entry_point(), far, false);
+
+  return (prefix_insts + call_insts) * NativeInstruction::instruction_size;
+}
+
+%}
+
+// The intptr_t operand types, defined by textual substitution.
+// (Cf. opto/type.hpp.  This lets us avoid many, many other ifdefs.)
+#define immX      immL
+#define iRegX     iRegL
+#define aimmX     aimmL
+#define limmX     limmL
+#define immX9     immL9
+#define LShiftX   LShiftL
+#define shimmX    immU6
+
+#define store_RegLd store_RegL
+
+//----------ATTRIBUTES---------------------------------------------------------
+//----------Operand Attributes-------------------------------------------------
+op_attrib op_cost(1);          // Required cost attribute
+
+//----------OPERANDS-----------------------------------------------------------
+// Operand definitions must precede instruction definitions for correct parsing
+// in the ADLC because operands constitute user defined types which are used in
+// instruction definitions.
+
+//----------Simple Operands----------------------------------------------------
+// Immediate Operands
+
+// Integer Immediate: 9-bit (including sign bit), so same as immI8?
+// FIXME: simm9 allows -256, but immI8 doesn't...
+operand simm9() %{
+  predicate(Assembler::is_imm_in_range(n->get_int(), 9, 0)); // NOTE(review): last argument is presumably a scale of 0 - confirm
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+
+operand uimm12() %{
+  predicate(Assembler::is_unsigned_imm_in_range(n->get_int(), 12, 0)); // int constant; presumably unsigned 12-bit, unscaled - confirm
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand aimmP() %{
+  predicate(n->get_ptr() == 0 || (is_aimm(n->get_ptr()) && ((ConPNode*)n)->type()->reloc() == relocInfo::none)); // NULL, or an is_aimm() value that carries no relocation
+  match(ConP);
+
+  op_cost(0);
+  // formats are generated automatically for constants and base registers
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Long Immediate: magnitude fits in 12 bits (-4095..4095) - for addressing mode
+operand immL12() %{
+  predicate((-4096 < n->get_long()) && (n->get_long() < 4096));
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Long Immediate: signed 9-bit (-256..255) - for addressing mode
+operand immL9() %{
+  predicate((-256 <= n->get_long()) && (n->get_long() < 256));
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immIMov() %{
+  predicate(n->get_int() >> 16 == 0); // int constant with nothing above the low 16 bits
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immLMov() %{
+  predicate(n->get_long() >> 16 == 0); // long constant with nothing above the low 16 bits
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immUL12() %{
+  predicate(is_uimm12(n->get_long(), 0)); // second argument 0: presumably an unscaled unsigned 12-bit offset - confirm against is_uimm12
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immUL12x2() %{
+  predicate(is_uimm12(n->get_long(), 1)); // second argument 1: presumably log2 of the x2 scale - confirm against is_uimm12
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immUL12x4() %{
+  predicate(is_uimm12(n->get_long(), 2)); // second argument 2: presumably log2 of the x4 scale - confirm against is_uimm12
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immUL12x8() %{
+  predicate(is_uimm12(n->get_long(), 3)); // second argument 3: presumably log2 of the x8 scale - confirm against is_uimm12
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+operand immUL12x16() %{
+  predicate(is_uimm12(n->get_long(), 4)); // second argument 4: presumably log2 of the x16 scale - confirm against is_uimm12
+  match(ConL);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Used for long shift: int constant in 0..63
+operand immU6() %{
+  predicate(0 <= n->get_int() && (n->get_int() <= 63));
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Used for register extended shift: int constant in 0..4
+operand immI_0_4() %{
+  predicate(0 <= n->get_int() && (n->get_int() <= 4));
+  match(ConI);
+  op_cost(0);
+
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
+// Compressed Pointer Register (allocated from the int_reg class)
+operand iRegN() %{
+  constraint(ALLOC_IN_RC(int_reg));
+  match(RegN);
+  match(ZRRegN); // the ZRRegN operand is accepted wherever an iRegN is expected
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand SPRegP() %{
+  constraint(ALLOC_IN_RC(SP_regP)); // pointer operand restricted to the SP_regP class (presumably just SP - confirm)
+  match(RegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand ZRRegP() %{
+  constraint(ALLOC_IN_RC(ZR_regP)); // pointer operand restricted to the ZR_regP class (presumably the zero register - confirm)
+  match(RegP);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand ZRRegL() %{
+  constraint(ALLOC_IN_RC(ZR_regP)); // NOTE(review): long operand uses the pointer class ZR_regP - presumably intentional, confirm
+  match(RegL);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand ZRRegI() %{
+  constraint(ALLOC_IN_RC(ZR_regI)); // int operand restricted to the ZR_regI class (presumably the zero register - confirm)
+  match(RegI);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand ZRRegN() %{
+  constraint(ALLOC_IN_RC(ZR_regI)); // narrow-oop operand sharing the ZR_regI class with ZRRegI
+  match(RegN);
+
+  format %{ %}
+  interface(REG_INTER);
+%}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/assembler_arm.cpp	2016-12-02 11:17:09.337628395 -0500
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "ci/ciEnv.hpp"
+#include "gc/shared/cardTableModRefBS.hpp"
+#include "gc/shared/collectedHeap.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "interpreter/templateInterpreterGenerator.hpp"
+#include "memory/resourceArea.hpp"
+#include "prims/jvm_misc.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/objectMonitor.hpp"
+#include "runtime/os.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "utilities/hashtable.hpp"
+#include "utilities/macros.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc/g1/g1CollectedHeap.inline.hpp"
+#include "gc/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc/g1/heapRegion.hpp"
+#endif // INCLUDE_ALL_GCS
+
+int AbstractAssembler::code_fill_byte() {
+  return 0xff; // four such fill bytes read back as 0xffffffff, an illegal instruction
+}
+
+#ifdef ASSERT
+bool AbstractAssembler::pd_check_instruction_mark() { return false; }  // unconditionally false in this port (debug builds only)
+#endif
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/assembler_arm.hpp	2016-12-02 11:17:14.429917164 -0500
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_ASSEMBLER_ARM_HPP
+#define CPU_ARM_VM_ASSEMBLER_ARM_HPP
+
+#include "utilities/macros.hpp"
+
+enum AsmCondition {
+  eq, ne, cs, cc, mi, pl, vs, vc,  // 0..7; order matters, the value is used as the condition encoding
+  hi, ls, ge, lt, gt, le, al, nv,  // 8..15; each even/odd pair holds a condition and its inverse
+  number_of_conditions,            // 16
+  // alternative names
+  hs = cs,
+  lo = cc
+};
+
+enum AsmShift {
+  lsl, lsr, asr, ror  // 0..3; the 32-bit encoders place the value as-is into the shift-type field (shift << 5)
+};
+
+#ifdef AARCH64
+enum AsmExtendOp {
+  ex_uxtb, ex_uxth, ex_uxtw, ex_uxtx,  // 0..3
+  ex_sxtb, ex_sxth, ex_sxtw, ex_sxtx,  // 4..7
+
+  ex_lsl = ex_uxtx                     // alias: same value as ex_uxtx
+};
+#endif
+
+enum AsmOffset {
+#ifdef AARCH64
+  basic_offset = 0b00,
+  pre_indexed  = 0b11,
+  post_indexed = 0b01
+#else
+  basic_offset = 1 << 24,            // 32-bit values sit at their instruction bit positions (24 and 21),
+  pre_indexed  = 1 << 24 | 1 << 21,  // so Address::encoding2()/encoding3() OR _mode straight into the word
+  post_indexed = 0
+#endif
+};
+
+
+#ifndef AARCH64
+enum AsmWriteback {
+  no_writeback,  // 0
+  writeback      // 1
+};
+
+enum AsmOffsetOp {
+  sub_offset = 0,  // the value is shifted to bit 23 by Address::encoding2()/encoding3()
+  add_offset = 1
+};
+#endif
+
+
+// ARM Addressing Modes 2 and 3 - Load and store
+class Address VALUE_OBJ_CLASS_SPEC {
+ private:
+  Register  _base;
+  Register  _index;
+  int       _disp;
+  AsmOffset _mode;
+  RelocationHolder   _rspec;
+  int       _shift_imm;
+#ifdef AARCH64
+  AsmExtendOp _extend;
+#else
+  AsmShift  _shift;
+  AsmOffsetOp _offset_op;
+
+  static inline int abs(int x) { return x < 0 ? -x : x; }
+  static inline int up (int x) { return x < 0 ?  0 : 1; }
+#endif
+
+#ifdef AARCH64
+  static const AsmExtendOp LSL = ex_lsl;
+#else
+  static const AsmShift LSL = lsl;
+#endif
+
+ public:
+  Address() : _base(noreg) {}  // only _base is set here; the other scalar fields are left uninitialized
+
+  // Base register rn plus an immediate displacement, using addressing mode 'mode'.
+  Address(Register rn, int offset = 0, AsmOffset mode = basic_offset)
+    : _base(rn), _index(noreg), _disp(offset), _mode(mode), _shift_imm(0)
+  {
+    // No index register: the remaining index-related fields get the same
+    // values the other constructors use as defaults.
+#ifdef AARCH64
+    _extend = ex_lsl;
+#else
+    _shift = lsl;
+    _offset_op = add_offset;
+#endif
+  }
+
+#ifdef ASSERT
+  // Same as the int-offset constructor, taking the displacement as a ByteSize.
+  Address(Register rn, ByteSize offset, AsmOffset mode = basic_offset)
+    : _base(rn), _index(noreg), _disp(in_bytes(offset)), _mode(mode), _shift_imm(0)
+  {
+    // No index register: the remaining index-related fields get the same
+    // values the other constructors use as defaults.
+#ifdef AARCH64
+    _extend = ex_lsl;
+#else
+    _shift = lsl;
+    _offset_op = add_offset;
+#endif
+  }
+#endif
+
+#ifdef AARCH64
+  // Register-indexed address: base rn, index rm with the given extend operation and shift amount.
+  Address(Register rn, Register rm, AsmExtendOp extend = ex_lsl, int shift_imm = 0)
+    : _base(rn), _index(rm), _disp(0), _mode(basic_offset),
+      _shift_imm(shift_imm), _extend(extend)
+  {
+    // Only ex_uxtw, ex_lsl (== ex_uxtx), ex_sxtw and ex_sxtx are accepted.
+    assert ((extend == ex_uxtw) || (extend == ex_lsl) || (extend == ex_sxtw) || (extend == ex_sxtx), "invalid extend for address mode");
+    // The shift amount is limited to 0..4.
+    assert ((0 <= shift_imm) && (shift_imm <= 4), "shift amount is out of range");
+  }
+#else
+  // Register-indexed address: base rn and index rm, where rm is shifted by
+  // 'shift'/'shift_imm' and offset_op selects add_offset or sub_offset.
+  Address(Register rn, Register rm, AsmShift shift = lsl,
+          int shift_imm = 0, AsmOffset mode = basic_offset,
+          AsmOffsetOp offset_op = add_offset)
+    : _base(rn),
+      _index(rm),
+      _disp(0),
+      _mode(mode),
+      _shift_imm(shift_imm), _shift(shift), _offset_op(offset_op)
+  {}
+
+  // Base rn plus a constant or register offset (RegisterOrConstant).
+  Address(Register rn, RegisterOrConstant offset, AsmShift shift = lsl,
+          int shift_imm = 0) {
+    _base = rn;
+    _mode = basic_offset;
+    _offset_op = add_offset;
+    if (!offset.is_constant()) {
+      _index = offset.as_register();
+      _disp = 0;
+      _shift = shift;
+      _shift_imm = shift_imm;
+      return;
+    }
+    // Constant offset: fold the shift into the displacement.
+    int scaled = (int) offset.as_constant();
+    if (shift_imm != 0) {
+      assert(shift == lsl,"shift not yet encoded");
+      scaled = scaled << shift_imm;
+    }
+    _index = noreg;
+    _disp = scaled;
+    _shift = lsl;
+    _shift_imm = 0;
+  }
+#endif // AARCH64
+
+  // [base + index * wordSize], built as index LSL LogBytesPerWord
+  static Address indexed_ptr(Register base, Register index) {
+    return Address(base, index, LSL, LogBytesPerWord);
+  }
+
+  // [base + index * BytesPerInt], built as index LSL LogBytesPerInt
+  static Address indexed_32(Register base, Register index) {
+    return Address(base, index, LSL, LogBytesPerInt);
+  }
+
+  // [base + index * BytesPerHeapOop], built as index LSL LogBytesPerHeapOop
+  static Address indexed_oop(Register base, Register index) {
+    return Address(base, index, LSL, LogBytesPerHeapOop);
+  }
+
+  Address plus_disp(int disp) const {
+    assert((disp == 0) || (_index == noreg),"can't apply an offset to a register indexed address");
+    Address shifted(*this);  // copy; only the displacement changes
+    shifted._disp = _disp + disp;
+    return shifted;
+  }
+
+  Address rebase(Register new_base) const {
+    Address moved(*this);  // copy; only the base register changes
+    moved._base = new_base;
+    return moved;
+  }
+
+#ifdef AARCH64
+  int encoding_simd() const {
+    assert(_index != SP, "encoding constraint");
+    assert(_disp == 0 || _mode == post_indexed,  "encoding constraint");
+    assert(_index == noreg || _mode == basic_offset, "encoding constraint");
+    assert(_mode == basic_offset || _mode == post_indexed, "encoding constraint");
+    assert(_extend == ex_lsl, "encoding constraint");
+    // Field placed at bit 16 and up: 0 for a plain base address; otherwise
+    // 0b100 << 5 combined with the index register number, or with 31 when
+    // post-indexing without an index register.
+    int index_field = 0;
+    if (_index != noreg) {
+      index_field = 0b100 << 5 | _index->encoding();
+    } else if (_mode == post_indexed) {
+      index_field = 0b100 << 5 | 31;
+    }
+    return index_field << 16 | _base->encoding_with_sp() << 5;
+  }
+#else /* !AARCH64 */
+  int encoding2() const {
+    assert(_mode == basic_offset || _base != PC, "unpredictable instruction");
+    if (_index != noreg) {
+      assert(_index != PC && (_mode == basic_offset || _index != _base), "unpredictable instruction");
+      assert(_disp == 0 && (_shift_imm >> 5) == 0, "encoding constraint");
+      return 1 << 25 | _offset_op << 23 | _mode | _base->encoding() << 16 |
+             _shift_imm << 7 | _shift << 5 | _index->encoding();
+    }
+    // Immediate form: bit 23 carries the sign, the low bits the magnitude.
+    assert(-4096 < _disp && _disp < 4096, "encoding constraint");
+    return _mode | up(_disp) << 23 | _base->encoding() << 16 | abs(_disp);
+  }
+
+  int encoding3() const {
+    assert(_mode == basic_offset || _base != PC, "unpredictable instruction");
+    if (_index != noreg) {
+      assert(_index != PC && (_mode == basic_offset || _index != _base), "unpredictable instruction");
+      assert(_disp == 0 && _shift == lsl && _shift_imm == 0, "encoding constraint");
+      return _mode | _offset_op << 23 | _base->encoding() << 16 | _index->encoding();
+    }
+    // Immediate form: the 8-bit magnitude is split into two nibbles (bits 11:8 and 3:0).
+    assert(-256 < _disp && _disp < 256, "encoding constraint");
+    return _mode | up(_disp) << 23 | 1 << 22 | _base->encoding() << 16 |
+           (abs(_disp) & 0xf0) << 4 | (abs(_disp) & 0x0f);
+  }
+
+  int encoding_ex() const {
+    assert(_index == noreg && _disp == 0 && _mode == basic_offset &&
+           _base != PC, "encoding constraint");
+    return _base->encoding() << 16;  // plain base address only: just the base register number at bit 16
+  }
+
+  int encoding_vfp() const {
+    assert(_index == noreg && _mode == basic_offset, "encoding constraint");
+    assert(-1024 < _disp && _disp < 1024 && (_disp & 3) == 0, "encoding constraint");
+    return up(_disp) << 23 | _base->encoding() << 16 | abs(_disp) >> 2;  // magnitude is stored in 4-byte units
+  }
+
+  int encoding_simd() const {
+    assert(_base != PC, "encoding constraint");
+    assert(_index != PC && _index != SP, "encoding constraint");
+    assert(_disp == 0, "encoding constraint");
+    assert(_shift == 0, "encoding constraint");
+    assert(_index == noreg || _mode == basic_offset, "encoding constraint");
+    assert(_mode == basic_offset || _mode == post_indexed, "encoding constraint");
+
+    // Low field: the index register number, or one of two special values
+    // when there is no index register (13 = post-indexed, 15 = basic offset).
+    int low_field;
+    if (_index != noreg) {
+      low_field = _index->encoding();
+    } else {
+      low_field = (_mode == post_indexed) ? 13 : 15;
+    }
+
+    return _base->encoding() << 16 | low_field;
+  }
+#endif // !AARCH64
+
+  Register base() const {  // base register
+    return _base;
+  }
+
+  Register index() const {  // index register; noreg for addresses built without one
+    return _index;
+  }
+
+  int disp() const {  // immediate displacement; the register-indexed constructors set it to 0
+    return _disp;
+  }
+
+  AsmOffset mode() const {  // basic_offset, pre_indexed or post_indexed
+    return _mode;
+  }
+
+  int shift_imm() const {  // immediate shift amount stored with the index register
+    return _shift_imm;
+  }
+
+#ifdef AARCH64
+  AsmExtendOp extend() const {  // extend operation stored with the index register
+    return _extend;
+  }
+#else
+  AsmShift shift() const {  // shift type stored with the index register
+    return _shift;
+  }
+
+  AsmOffsetOp offset_op() const {  // add_offset or sub_offset
+    return _offset_op;
+  }
+#endif
+
+  bool uses(Register reg) const { return _base == reg || _index == reg; }  // true if reg is the base or the index register
+
+  relocInfo::relocType    rtype() const { return _rspec.type(); }  // relocation type attached to this address; const so it works on const Address
+  const RelocationHolder& rspec() const { return _rspec; }         // the attached relocation spec, by const reference
+
+  // Convert the raw encoding form into the form expected by the
+  // constructor for Address.
+  static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);
+};
+
+#ifdef COMPILER2
+class VFP VALUE_OBJ_CLASS_SPEC {
+  // Helper classes to detect whether a floating point constant can be
+  // encoded in a fconstd or fconsts instruction
+  // The conversion from the imm8, 8 bit constant, to the floating
+  // point value encoding is done with either:
+  // for single precision: imm8<7>:NOT(imm8<6>):Replicate(imm8<6>,5):imm8<5:0>:Zeros(19)
+  // or
+  // for double precision: imm8<7>:NOT(imm8<6>):Replicate(imm8<6>,8):imm8<5:0>:Zeros(48)
+
+ private:
+  class fpnum {
+   public:
+    virtual unsigned int f_hi4() const = 0;      // top 4 bits of the fraction field
+    virtual bool f_lo_is_null() const = 0;       // true when every fraction bit below those 4 is zero
+    virtual int e() const = 0;                   // exponent field with the bias subtracted
+    virtual unsigned int s() const = 0;          // sign bit
+
+    inline bool can_be_imm8() const { return e() >= -3 && e() <= 4 && f_lo_is_null(); }  // exponent in -3..4 and no low fraction bits
+    inline unsigned char imm8() const { int v = (s() << 7) | (((e() - 1) & 0x7) << 4) | f_hi4(); assert((v >> 8) == 0, "overflow"); return v; }
+  };
+
+ public:
+  class float_num : public fpnum {
+   public:
+    float_num(float v) {
+      _num.val = v;
+    }
+
+    virtual unsigned int f_hi4() const { return (_num.bits << 9) >> (19+9); }              // bits 22..19
+    virtual bool f_lo_is_null() const { return (_num.bits & ((1 << 19) - 1)) == 0; }       // bits 18..0
+    virtual int e() const { return ((_num.bits << 1) >> (23+1)) - 127; }                   // bits 30..23, minus 127
+    virtual unsigned int s() const { return _num.bits >> 31; }                             // bit 31
+
+   private:
+    union {  // the value and its raw bit pattern share storage
+      float val;
+      unsigned int bits;
+    } _num;
+  };
+
+  class double_num : public fpnum {
+   public:
+    double_num(double v) {
+      _num.val = v;
+    }
+
+    virtual unsigned int f_hi4() const { return (_num.bits << 12) >> (48+12); }            // bits 51..48
+    virtual bool f_lo_is_null() const { return (_num.bits & ((1LL << 48) - 1)) == 0; }     // bits 47..0
+    virtual int e() const { return ((_num.bits << 1) >> (52+1)) - 1023; }                  // bits 62..52, minus 1023
+    virtual unsigned int s() const { return _num.bits >> 63; }                             // bit 63
+
+   private:
+    union {  // the value and its raw bit pattern share storage
+      double val;
+      unsigned long long bits;
+    } _num;
+  };
+};
+#endif
+
+#ifdef AARCH64
+#include "assembler_arm_64.hpp"
+#else
+#include "assembler_arm_32.hpp"
+#endif
+
+
+#endif // CPU_ARM_VM_ASSEMBLER_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/assembler_arm.inline.hpp	2016-12-02 11:17:19.250190509 -0500
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_ASSEMBLER_ARM_INLINE_HPP
+#define CPU_ARM_VM_ASSEMBLER_ARM_INLINE_HPP
+
+
+#endif // CPU_ARM_VM_ASSEMBLER_ARM_INLINE_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/assembler_arm_32.cpp	2016-12-02 11:17:24.370480866 -0500
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "ci/ciEnv.hpp"
+#include "gc/shared/cardTableModRefBS.hpp"
+#include "gc/shared/collectedHeap.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "interpreter/templateInterpreterGenerator.hpp"
+#include "memory/resourceArea.hpp"
+#include "prims/jvm_misc.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/objectMonitor.hpp"
+#include "runtime/os.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "utilities/hashtable.hpp"
+#include "utilities/macros.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc/g1/g1CollectedHeap.inline.hpp"
+#include "gc/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc/g1/heapRegion.hpp"
+#endif // INCLUDE_ALL_GCS
+
+#ifdef COMPILER2
+// Convert the raw encoding form into the form expected by the
+// constructor for Address.  An index that decodes to PC means "no index".
+Address Address::make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc) {
+  RelocationHolder reloc;
+  if (disp_reloc != relocInfo::none) {
+    reloc = Relocation::spec_simple(disp_reloc);
+  }
+
+  const Register index_reg = as_Register(index);
+  if (index_reg == PC) {
+    assert(scale == 0, "not supported");
+    Address result(as_Register(base), disp);
+    result._rspec = reloc;
+    return result;
+  }
+  // Register-indexed form: no displacement allowed, index shifted left by scale.
+  assert(disp == 0, "unsupported");
+  Address result(as_Register(base), index_reg, lsl, scale);
+  result._rspec = reloc;
+  return result;
+}
+#endif
+
+void AsmOperand::initialize_rotated_imm(unsigned int imm) {  // slow path of encode(int): imm does not fit in 8 bits
+  for (int shift = 2; shift <= 24; shift += 2) {
+    if ((imm & ~(0xffu << shift)) == 0) {  // unsigned mask: 0xff << 24 would shift a signed int into its sign bit
+      _encoding = 1 << 25 | (32 - shift) << 7 | imm >> shift;  // (32 - shift) << 7 == ((32 - shift) / 2) << 8, as shift is even
+      return;
+    }
+  }
+  assert((imm & 0x0ffffff0) == 0, "too complicated constant: %u (%x)", imm, imm);
+  _encoding = 1 << 25 | 4 << 7 | imm >> 28 | imm << 4;  // bits only in the top and bottom nibbles; the top nibble shifts out of imm << 4
+}
+
+bool AsmOperand::is_rotated_imm(unsigned int imm) {  // accepts exactly the values encode(int)/initialize_rotated_imm() handle
+  if ((imm >> 8) == 0) {
+    return true;  // plain 8-bit value
+  }
+  for (int shift = 2; shift <= 24; shift += 2) {
+    if ((imm & ~(0xffu << shift)) == 0) {  // unsigned mask: 0xff << 24 would shift a signed int into its sign bit
+      return true;  // 8-bit value shifted left by an even amount
+    }
+  }
+  if ((imm & 0x0ffffff0) == 0) {
+    return true;  // bits only in the top and bottom nibbles
+  }
+  return false;
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/assembler_arm_32.hpp	2016-12-02 11:17:29.858792094 -0500
@@ -0,0 +1,1245 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_ASSEMBLER_ARM_32_HPP
+#define CPU_ARM_VM_ASSEMBLER_ARM_32_HPP
+
+// ARM Addressing Mode 1 - Data processing operands
+class AsmOperand VALUE_OBJ_CLASS_SPEC {
+ private:
+  int _encoding;
+
+  void initialize_rotated_imm(unsigned int imm);
+
+  void encode(int imm_8) {
+    if ((imm_8 >> 8) != 0) {
+      initialize_rotated_imm((unsigned int)imm_8);  // slow case
+    } else {
+      _encoding = 1 << 25 | imm_8;  // the most common case
+    }
+  }
+
+  void encode(Register rm, AsmShift shift, int shift_imm) {
+    assert((shift_imm >> 5) == 0, "encoding constraint");
+    _encoding = rm->encoding() | shift << 5 | shift_imm << 7;
+  }
+
+ public:
+
+  // Plain register operand.
+  AsmOperand(Register reg) : _encoding(reg->encoding()) {
+  }
+
+  AsmOperand(int imm_8) {
+    encode(imm_8);
+  }
+
+#ifdef ASSERT
+  // Same as the int constructor, for a ByteSize value.
+  AsmOperand(ByteSize bytesize_8) {
+    encode(in_bytes(bytesize_8));
+  }
+#endif // ASSERT
+
+  AsmOperand(Register rm, AsmShift shift, int shift_imm) {
+    encode(rm, shift, shift_imm);
+  }
+
+  AsmOperand(Register rm, AsmShift shift, Register rs) {
+    assert(rm != PC && rs != PC, "unpredictable instruction");
+    _encoding = rm->encoding() | 1 << 4 | shift << 5 | rs->encoding() << 8;
+  }
+
+  AsmOperand(RegisterOrConstant offset, AsmShift shift = lsl, int shift_imm = 0) {
+    if (!offset.is_register()) {
+      assert(shift == lsl,"shift type not yet encoded");
+      const int shifted = ((int)offset.as_constant()) << shift_imm;
+      encode(shifted);
+    } else {
+      encode(offset.as_register(), shift, shift_imm);
+    }
+  }
+
+  int encoding() const {
+    return _encoding;
+  }
+
+  bool is_immediate() const {
+    return (_encoding & (1 << 25)) != 0;  // bit 25 is set by the immediate encoders
+  }
+
+  Register base_register() const {
+    assert(!is_immediate(), "is_immediate, no base reg");
+    return as_Register(_encoding & 15);
+  }
+
+  static bool is_rotated_imm(unsigned int imm);
+};
+
+
+// ARM Addressing Mode 4 - Load and store multiple
+class RegisterSet VALUE_OBJ_CLASS_SPEC {
+ private:
+  int _encoding;  // bit n is set when the register with encoding n is in the set
+
+  // Wraps an already computed bit mask (used by operator |).
+  RegisterSet(int encoding) : _encoding(encoding) {
+  }
+
+ public:
+
+  // Set holding the single register reg.
+  RegisterSet(Register reg) : _encoding(1 << reg->encoding()) {
+  }
+
+  // Empty set.
+  RegisterSet() : _encoding(0) {
+  }
+
+  // Contiguous range first..last, both included (asserts first < last).
+  RegisterSet(Register first, Register last) {
+    assert(first < last, "encoding constraint");
+    _encoding = (1 << (last->encoding() + 1)) - (1 << first->encoding());
+  }
+
+  friend RegisterSet operator | (const RegisterSet set1, const RegisterSet set2) {
+    assert((set1._encoding & set2._encoding) == 0, "encoding constraint");
+    return RegisterSet(set1._encoding | set2._encoding);
+  }
+
+  int encoding() const {
+    return _encoding;
+  }
+
+  bool contains(Register reg) const {
+    return (_encoding & (1 << reg->encoding())) != 0;
+  }
+
+  // number of registers in the set
+  int size() const {
+    int count = 0;
+    // Count the set bits by walking the mask from bit 0 upwards; the mask
+    // is viewed as unsigned so the right shift always terminates.
+    for (unsigned int bits = (unsigned int) _encoding; bits != 0; bits >>= 1) {
+      count += (int) (bits & 1);
+    }
+    return count;
+  }
+};
+
+#if R9_IS_SCRATCHED
+#define R9ifScratched RegisterSet(R9)
+#else
+#define R9ifScratched RegisterSet()
+#endif
+
+// ARM Addressing Mode 5 - Load and store multiple VFP registers
+class FloatRegisterSet VALUE_OBJ_CLASS_SPEC {
+ private:
+  int _encoding;
+
+  // Bits naming the first register: hi_bits() at bit 12 plus lo_bit() or hi_bit() at bit 22.
+  static int first_register_bits(FloatRegister reg) {
+    if (reg->hi_bit() == 0) {
+      return reg->hi_bits() << 12 | reg->lo_bit() << 22;
+    }
+    assert (reg->lo_bit() == 0, "impossible encoding");
+    return reg->hi_bits() << 12 | reg->hi_bit() << 22;
+  }
+
+ public:
+
+  // Single register: same layout as below with a count of 1 in the low bits.
+  FloatRegisterSet(FloatRegister reg) {
+    _encoding = first_register_bits(reg) | 1;
+  }
+
+  FloatRegisterSet(FloatRegister first, int count) {
+    assert(count >= 1, "encoding constraint");
+    _encoding = first_register_bits(first) | count;
+  }
+
+  int encoding_s() const {
+    return _encoding;
+  }
+
+  int encoding_d() const {
+    assert((_encoding & 0xFF) <= 16, "no more than 16 double registers" );
+    return (_encoding & 0xFFFFFF00) | ((_encoding & 0xFF) << 1);
+  }
+
+};
+
+
+class Assembler : public AbstractAssembler  {
+
+ public:
+
+  static const int LogInstructionSize = 2;
+  static const int InstructionSize    = 1 << LogInstructionSize;
+
+  static inline AsmCondition inverse(AsmCondition cond) {
+    assert ((cond != al) && (cond != nv), "AL and NV conditions cannot be inversed");
+    return static_cast<AsmCondition>(static_cast<int>(cond) ^ 1);  // a condition and its inverse differ only in bit 0
+  }
+
+  // Returns true if given value can be used as immediate in arithmetic (add/sub/cmp/cmn) instructions.
+  static inline bool is_arith_imm_in_range(intx value) {
+    return AsmOperand::is_rotated_imm(value);  // value is converted from intx to unsigned int for the check
+  }
+
+  // Arithmetic instructions
+
+#define F(mnemonic, opcode) \
+  void mnemonic(Register rd, Register rn, AsmOperand operand, AsmCondition cond = al) {    \
+    emit_int32(cond << 28 | opcode << 21 | rn->encoding() << 16 |                          \
+               rd->encoding() << 12 | operand.encoding());                                 \
+  }                                                                                        \
+  void mnemonic##s(Register rd, Register rn, AsmOperand operand, AsmCondition cond = al) { \
+    emit_int32(cond << 28 | opcode << 21 | 1 << 20 | rn->encoding() << 16 |                \
+               rd->encoding() << 12 | operand.encoding());                                 \
+  }
+
+  F(andr, 0)
+  F(eor,  1)
+  F(sub,  2)
+  F(rsb,  3)
+  F(add,  4)
+  F(adc,  5)
+  F(sbc,  6)
+  F(rsc,  7)
+  F(orr,  12)
+  F(bic,  14)
+#undef F
+
+#define F(mnemonic, opcode) \
+  void mnemonic(Register rn, AsmOperand operand, AsmCondition cond = al) {  \
+    emit_int32(cond << 28 | opcode << 21 | 1 << 20 | rn->encoding() << 16 | \
+              operand.encoding());                                          \
+  }
+
+  F(tst, 8)
+  F(teq, 9)
+  F(cmp, 10)
+  F(cmn, 11)
+#undef F
+
+#define F(mnemonic, opcode) \
+  void mnemonic(Register rd, AsmOperand operand, AsmCondition cond = al) {    \
+    emit_int32(cond << 28 | opcode << 21 | rd->encoding() << 12 |             \
+              operand.encoding());                                            \
+  }                                                                           \
+  void mnemonic##s(Register rd, AsmOperand operand, AsmCondition cond = al) { \
+    emit_int32(cond << 28 | opcode << 21 | 1 << 20 | rd->encoding() << 12 |   \
+              operand.encoding());                                            \
+  }
+
+  F(mov, 13)
+  F(mvn, 15)
+#undef F
+
+  void msr(uint fields, AsmOperand operand, AsmCondition cond = al) {  // 'fields' is placed at bit 16 (see the CPSR_*/SPSR_* values)
+    assert(((operand.encoding() & 0xff0) == 0) || (operand.encoding() & (1<<25)), "invalid addressing mode");
+    emit_int32(operand.encoding() | 0xf << 12 | fields << 16 | 1 << 21 | 1 << 24 | cond << 28);
+  }
+
+  void mrs(uint fields, Register Rd, AsmCondition cond = al) {  // the low four bits of 'fields' are forced to ones
+    emit_int32((Rd->encoding() << 12) | (fields|0xf) << 16 | 1 << 24 | cond << 28);
+  }
+
+
+  enum {  // field-mask bits: c = 0x1, x = 0x2, s = 0x4, f = 0x8; the SPSR variants add 0x40
+    CPSR = 0x00, CPSR_c = 0x01, CPSR_x = 0x02, CPSR_xc = CPSR_x | CPSR_c,
+    CPSR_s = 0x04, CPSR_sc = CPSR_s | CPSR_c, CPSR_sx = CPSR_s | CPSR_x, CPSR_sxc = CPSR_s | CPSR_xc,
+    CPSR_f = 0x08, CPSR_fc = CPSR_f | CPSR_c, CPSR_fx = CPSR_f | CPSR_x, CPSR_fxc = CPSR_f | CPSR_xc,
+    CPSR_fs = CPSR_f | CPSR_s, CPSR_fsc = CPSR_fs | CPSR_c, CPSR_fsx = CPSR_fs | CPSR_x, CPSR_fsxc = CPSR_fs | CPSR_xc,
+    SPSR = 0x40, SPSR_c = SPSR | CPSR_c, SPSR_x = SPSR | CPSR_x, SPSR_xc = SPSR | CPSR_xc,
+    SPSR_s = SPSR | CPSR_s, SPSR_sc = SPSR | CPSR_sc, SPSR_sx = SPSR | CPSR_sx, SPSR_sxc = SPSR | CPSR_sxc,
+    SPSR_f = SPSR | CPSR_f, SPSR_fc = SPSR | CPSR_fc, SPSR_fx = SPSR | CPSR_fx, SPSR_fxc = SPSR | CPSR_fxc,
+    SPSR_fs = SPSR | CPSR_fs, SPSR_fsc = SPSR | CPSR_fsc, SPSR_fsx = SPSR | CPSR_fsx, SPSR_fsxc = SPSR | CPSR_fsxc
+  };
+
+#define F(mnemonic, opcode) /* fields: rdhi at bit 16, rdlo at bit 12, rs at bit 8, rm at bit 0 */ \
+  void mnemonic(Register rdlo, Register rdhi, Register rm, Register rs,                  \
+                AsmCondition cond = al) {                                                \
+    emit_int32(cond << 28 | opcode << 21 | rdhi->encoding() << 16 |                      \
+              rdlo->encoding() << 12 | rs->encoding() << 8 | 0x9 << 4 | rm->encoding()); \
+  }                                                                                      \
+  void mnemonic##s(Register rdlo, Register rdhi, Register rm, Register rs,               \
+                   AsmCondition cond = al) {                                             \
+    emit_int32(cond << 28 | opcode << 21 | 1 << 20 | rdhi->encoding() << 16 |            \
+              rdlo->encoding() << 12 | rs->encoding() << 8 | 0x9 << 4 | rm->encoding()); \
+  }
+
+  F(umull, 4)  // each line generates the plain form and an 's' form with bit 20 set
+  F(umlal, 5)
+  F(smull, 6)
+  F(smlal, 7)
+#undef F
+
+  void mul(Register rd, Register rm, Register rs, AsmCondition cond = al) {
+    // Field layout: rd in bits 19..16, rs in bits 11..8, rm in bits 3..0; bit 20 stays clear.
+    emit_int32(rm->encoding() | 0x9 << 4 | rs->encoding() << 8 | rd->encoding() << 16 | cond << 28);
+  }
+
+  void muls(Register rd, Register rm, Register rs, AsmCondition cond = al) {
+    // Same layout as mul, with bit 20 set.
+    emit_int32(rm->encoding() | 0x9 << 4 | rs->encoding() << 8 | rd->encoding() << 16 | 1 << 20 | cond << 28);
+  }
+
+  void mla(Register rd, Register rm, Register rs, Register rn, AsmCondition cond = al) {
+    // Field layout: rd in bits 19..16, rn in bits 15..12, rs in bits 11..8, rm in bits 3..0; bit 21 marks the accumulate form.
+    emit_int32(rm->encoding() | 0x9 << 4 | rs->encoding() << 8 | rn->encoding() << 12 | rd->encoding() << 16 | 1 << 21 | cond << 28);
+  }
+
+  void mlas(Register rd, Register rm, Register rs, Register rn, AsmCondition cond = al) {
+    // Same layout as mla, with bit 20 set in addition to bit 21.
+    emit_int32(rm->encoding() | 0x9 << 4 | rs->encoding() << 8 | rn->encoding() << 12 | rd->encoding() << 16 | 1 << 20 | 1 << 21 | cond << 28);
+  }
+
+  // Loads and stores
+
+  // Word and byte transfers: l (bit 20) is 1 for loads, b (bit 22) is 1 for byte accesses.
+#define F(mnemonic, l, b) \
+  void mnemonic(Register rd, Address addr, AsmCondition cond = al) { \
+    emit_int32(cond << 28 | 1 << 26 | b << 22 | l << 20 |            \
+              rd->encoding() << 12 | addr.encoding2());              \
+  }
+
+  F(ldr,  1, 0)
+  F(ldrb, 1, 1)
+  F(str,  0, 0)
+  F(strb, 0, 1)
+#undef F
+
+
+#define F(mnemonic, l, sh, even) /* l: bit 20; sh: bits 6..5; even: rd must be an even-numbered register */ \
+  void mnemonic(Register rd, Address addr, AsmCondition cond = al) { \
+    assert(!even || (rd->encoding() & 1) == 0, "must be even");      \
+    emit_int32(cond << 28 | l << 20 | rd->encoding() << 12 |         \
+              1 << 7 | sh << 5 | 1 << 4 | addr.encoding3());         \
+  }
+
+  F(strh,  0, 1, false)
+  F(ldrh,  1, 1, false)
+  F(ldrsb, 1, 2, false)
+  F(ldrsh, 1, 3, false)
+  F(strd,  0, 3, true)   // the only instance that enforces an even rd; ldrd is written out separately
+
+#undef F
+
+  void ldrd(Register rd, Address addr, AsmCondition cond = al) {
+    // rd must be even-numbered, and a register index may not be rd or rd+1.
+    assert((rd->encoding() & 1) == 0, "must be even");
+    assert(!addr.index()->is_valid() || (addr.index()->encoding() != rd->encoding() &&
+           addr.index()->encoding() != (rd->encoding()+1)), "encoding constraint");
+    emit_int32(addr.encoding3() | 0xD /* 0b1101 */ << 4 | rd->encoding() << 12 | cond << 28);
+  }
+
+#define F(mnemonic, l, pu) /* l: bit 20; pu: bits 24..23; w: bit 21; register list in the low bits */ \
+  void mnemonic(Register rn, RegisterSet reg_set,                        \
+                AsmWriteback w = no_writeback, AsmCondition cond = al) { \
+    assert(reg_set.encoding() != 0 && (w == no_writeback ||              \
+           (reg_set.encoding() & (1 << rn->encoding())) == 0),           \
+           "unpredictable instruction");                                 \
+    emit_int32(cond << 28 | 4 << 25 | pu << 23 | w << 21 | l << 20 |     \
+              rn->encoding() << 16 | reg_set.encoding());                \
+  }
+
+  F(ldmda, 1, 0)    F(ldmfa, 1, 0)  // each row defines two names with identical encodings
+  F(ldmia, 1, 1)    F(ldmfd, 1, 1)
+  F(ldmdb, 1, 2)    F(ldmea, 1, 2)
+  F(ldmib, 1, 3)    F(ldmed, 1, 3)
+  F(stmda, 0, 0)    F(stmed, 0, 0)
+  F(stmia, 0, 1)    F(stmea, 0, 1)
+  F(stmdb, 0, 2)    F(stmfd, 0, 2)
+  F(stmib, 0, 3)    F(stmfa, 0, 3)
+#undef F
+
+  void ldrex(Register rd, Address addr, AsmCondition cond = al) {
+    // rd goes in bits 15..12; bits 11..0 carry the fixed pattern 0xf9f.
+    assert(rd != PC, "unpredictable instruction");
+    emit_int32(cond << 28 | 0x19 << 20 | addr.encoding_ex() | rd->encoding() << 12 | 0xf9f);
+  }
+
+  void strex(Register rs, Register rd, Address addr, AsmCondition cond = al) {
+    // rs is encoded in bits 15..12 and rd in bits 3..0; rs may alias neither rd nor the base register.
+    assert(rd != PC && rs != PC && rs != rd && rs != addr.base(), "unpredictable instruction");
+    emit_int32(cond << 28 | 0x18 << 20 | addr.encoding_ex() |
+              rs->encoding()  << 12 | 0xf90 | rd->encoding());
+  }
+
+  void ldrexd(Register rd, Address addr, AsmCondition cond = al) {
+    // Encoded like ldrex except for the 0x1B value at bit 20.
+    assert(rd != PC, "unpredictable instruction");
+    emit_int32(cond << 28 | 0x1B << 20 | addr.encoding_ex() | rd->encoding() << 12 | 0xf9f);
+  }
+
+  void strexd(Register rs, Register rd, Address addr, AsmCondition cond = al) {
+    // Encoded like strex except for the 0x1A value at bit 20.
+    assert(rd != PC && rs != PC && rs != rd && rs != addr.base(), "unpredictable instruction");
+    emit_int32(cond << 28 | 0x1A << 20 | addr.encoding_ex() |
+              rs->encoding()  << 12 | 0xf90 | rd->encoding());
+  }
+
+  void clrex() {  // takes no condition; the fields below combine to the bit pattern 0xF57FF01F
+    emit_int32(0x01f | 0xFF  << 12 | 0x57 << 20 | 0xF << 28);
+  }
+
+  // Miscellaneous instructions
+
+  void clz(Register rd, Register rm, AsmCondition cond = al) {  // rd in bits 15..12, rm in bits 3..0
+    emit_int32(0x016f0f10 | cond << 28 | rd->encoding() << 12 | rm->encoding());
+  }
+
+  void rev(Register rd, Register rm, AsmCondition cond = al) {  // rd in bits 15..12, rm in bits 3..0
+    emit_int32(0x06bf0f30 | cond << 28 | rd->encoding() << 12 | rm->encoding());
+  }
+
+  void rev16(Register rd, Register rm, AsmCondition cond = al) {  // rd in bits 15..12, rm in bits 3..0
+    emit_int32(0x06bf0fb0 | cond << 28 | rd->encoding() << 12 | rm->encoding());
+  }
+
+  void revsh(Register rd, Register rm, AsmCondition cond = al) {  // rd in bits 15..12, rm in bits 3..0
+    emit_int32(0x06ff0fb0 | cond << 28 | rd->encoding() << 12 | rm->encoding());
+  }
+
+  void rbit(Register rd, Register rm, AsmCondition cond = al) {  // rd in bits 15..12, rm in bits 3..0
+    emit_int32(0x06ff0f30 | cond << 28 | rd->encoding() << 12 | rm->encoding());
+  }
+
+  void pld(Address addr) {  // no condition field: the top nibble is fixed at 0xf
+    emit_int32(addr.encoding2() | 0xf550f000);
+  }
+
+  void pldw(Address addr) {  // same as pld except that bit 22 is clear
+    assert(VM_Version::arm_arch() >= 7 && os::is_MP(), "no pldw on this processor");
+    emit_int32(addr.encoding2() | 0xf510f000);
+  }
+
+  void svc(int imm_24, AsmCondition cond = al) {  // imm_24 fills bits 23..0
+    assert((imm_24 >> 24) == 0, "encoding constraint");
+    emit_int32(imm_24 | 0xf << 24 | cond << 28);
+  }
+
+  void ubfx(Register rd, Register rn, unsigned int lsb, unsigned int width, AsmCondition cond = al) {
+    assert(VM_Version::arm_arch() >= 7, "no ubfx on this processor");
+    // The field [lsb, lsb + width) must lie inside the 32-bit source: (width - 1) is a 5-bit value
+    // placed at bit 16, so an oversized width would spill into the opcode bits.
+    assert(width > 0 && lsb < 32 && width <= 32 - lsb, "must be");
+    emit_int32(cond << 28 | 0x3f << 21 | (width - 1)  << 16 | rd->encoding() << 12 | lsb << 7 | 0x5 << 4 | rn->encoding());
+  }
+
+  void uxtb(Register rd, Register rm, unsigned int rotation = 0, AsmCondition cond = al) {
+    assert(VM_Version::arm_arch() >= 7, "no uxtb on this processor");
+    assert((rotation <= 24) && (rotation % 8) == 0, "encoding constraint");
+    // rotation (0, 8, 16 or 24) is stored divided by 8 in bits 11..10
+    emit_int32(cond << 28 | 0x6e << 20 | 0xf << 16 | rd->encoding() << 12 | (rotation >> 3) << 10 | 0x7 << 4 | rm->encoding());
+  }
+
+  // ARM Memory Barriers
+  //
+  // There are two types of memory barriers defined for the ARM processor
+  // DataSynchronizationBarrier and DataMemoryBarrier
+  //
+  // The Linux kernel uses the DataMemoryBarrier for all of its
+  // memory barrier operations (smp_mb, smp_rmb, smp_wmb)
+  //
+  // There are two forms of each barrier instruction.
+  // The mcr forms are supported on armv5 and newer architectures
+  //
+  // The dmb, dsb instructions were added in armv7
+  // architectures and are compatible with their mcr
+  // predecessors.
+  //
+  // Here are the encodings for future reference:
+  //
+  // DataSynchronizationBarrier (dsb)
+  // on ARMv7 - emit_int32(0xF57FF04F)
+  //
+  // on ARMv5+ - mcr p15, 0, Rtmp, c7, c10, 4  on earlier processors
+  //             emit_int32(0xe << 28 | 0xe << 24 | 0x7 << 16 | Rtmp->encoding() << 12  |
+  //                       0xf << 8  | 0x9 << 4  | 0xa);
+  //
+  // DataMemoryBarrier (dmb)
+  // on ARMv7 - emit_int32(0xF57FF05F)
+  //
+  // on ARMv5+ - mcr p15, 0, Rtmp, c7, c10, 5 on earlier processors
+  //             emit_int32(0xe << 28 | 0xe << 24 | 0x7 << 16 | Rtmp->encoding() << 12  |
+  //                       0xf << 8  | 0xb << 4  | 0xa);
+  //
+
+  enum DMB_Opt {     // value is OR-ed into the low nibble of the ARMv7 dmb instruction by dmb()
+    DMB_all = 0xf,
+    DMB_st  = 0xe,
+  };
+
+  void dmb(DMB_Opt opt, Register reg) {
+    if (VM_Version::arm_arch() >= 7) {
+      // ARMv7+: dedicated dmb instruction, option in the low nibble.
+      emit_int32(0xF57FF050 | opt);
+      return;
+    }
+
+    // Pre-ARMv7: mcr p15, 0, reg, c7, c10, 5 with reg zeroed ('opt' is not used here).
+    // If the caller passed noreg, Rtemp is borrowed and saved/restored on the stack.
+    const bool borrow_rtemp = (reg == noreg);
+    if (borrow_rtemp) {
+      reg = Rtemp;
+      str(reg, Address(SP, -wordSize, pre_indexed));
+    }
+    mov(reg, 0);
+    // DataMemoryBarrier
+    emit_int32(0xe << 28 | 0xe << 24 | 0x7 << 16 |
+              reg->encoding() << 12 |
+              0xf << 8 | 0xb << 4 | 0xa);
+    if (borrow_rtemp) {
+      ldr(reg, Address(SP, wordSize, post_indexed));
+    }
+  }
+
+  void dsb(Register reg) {
+    if (VM_Version::arm_arch() >= 7) {
+      // ARMv7+: dedicated dsb instruction.
+      emit_int32(0xF57FF04F);
+      return;
+    }
+
+    // Pre-ARMv7: mcr p15, 0, reg, c7, c10, 4 with reg zeroed.
+    // If the caller passed noreg, Rtemp is borrowed and saved/restored on the stack.
+    const bool borrow_rtemp = (reg == noreg);
+    if (borrow_rtemp) {
+      reg = Rtemp;
+      str(reg, Address(SP, -wordSize, pre_indexed));
+    }
+    mov(reg, 0);
+    // DataSynchronizationBarrier
+    emit_int32(0xe << 28 | 0xe << 24 | 0x7 << 16 |
+              reg->encoding() << 12 |
+              0xf << 8 | 0x9 << 4 | 0xa);
+    if (borrow_rtemp) {
+      ldr(reg, Address(SP, wordSize, post_indexed));
+    }
+  }
+
+
+#define F(mnemonic, b) /* b: bit 22; rn in bits 19..16 must differ from both rm and rd */ \
+  void mnemonic(Register rd, Register rm, Register rn, AsmCondition cond = al) { \
+    assert(rn != rm && rn != rd, "unpredictable instruction");                   \
+    emit_int32(cond << 28 | 0x2 << 23 | b << 22 | rn->encoding() << 16 |         \
+              rd->encoding() << 12 | 9 << 4 | rm->encoding());                   \
+  }
+
+  F(swp,  0)
+  F(swpb, 1)
+#undef F
+
+  // Branches
+
+#define F(mnemonic, l) /* register branch; l is placed at bit 5, target register rm in bits 3..0 */ \
+  void mnemonic(Register rm, AsmCondition cond = al) {            \
+    emit_int32(cond << 28 | 0x012fff10 | l << 5 | rm->encoding()); \
+  }
+
+  F(bx,  0)
+  F(blx, 1)
+#undef F
+
+#define F(mnemonic, l)  /* target must be word aligned and within +/-32MB of pc() + 8 */ \
+  void mnemonic(address target, AsmCondition cond = al) {               \
+    unsigned int offset = (unsigned int)(target - pc() - 8);            \
+    assert((offset & 3) == 0, "bad alignment");                         \
+    assert((offset >> 25) == 0 || ((int)offset >> 25) == -1, "offset is too large"); \
+    emit_int32(cond << 28 | l << 24 | offset << 6 >> 8);                \
+  }
+
+  F(b,  0xa)  // the shift pair above keeps offset bits 25..2 as the 24-bit immediate
+  F(bl, 0xb)
+#undef F
+
+  // ARMv7 instructions
+
+#define F(mnemonic, wt) /* imm_16 is split: top nibble to bits 19..16, low 12 bits to bits 11..0 */ \
+  void mnemonic(Register rd, int imm_16, AsmCondition cond = al) { \
+    assert((imm_16 >> 16) == 0, "encoding constraint");            \
+    emit_int32(cond << 28 | wt << 20 | rd->encoding() << 12 |      \
+              (imm_16 & 0xf000) << 4 | (imm_16 & 0xfff));          \
+  }
+
+  F(movw, 0x30)
+  F(movt, 0x34)
+#undef F
+
+  // VFP Support
+
+// Checks that VFP instructions are not used in SOFTFP mode.
+#ifdef __SOFTFP__
+#define CHECK_VFP_PRESENT ShouldNotReachHere()
+#else
+#define CHECK_VFP_PRESENT
+#endif // __SOFTFP__
+
+  static const int single_cp_num = 0xa00;  // OR-ed into bits 11..8 by the single-precision emitters
+  static const int double_cp_num = 0xb00;  // OR-ed into bits 11..8 by the double-precision emitters
+
+  // Bits P, Q, R, S collectively form the opcode
+#define F(mnemonic, P, Q, R, S) /* P: bit 23, Q: bit 21, R: bit 20, S: bit 6 */ \
+  void mnemonic##d(FloatRegister fd, FloatRegister fn, FloatRegister fm, \
+                   AsmCondition cond = al) {                             \
+    CHECK_VFP_PRESENT;                                                   \
+    assert(fn->lo_bit() == 0 && fd->lo_bit() == 0 && fm->lo_bit() == 0, "single precision register?"); \
+    emit_int32(cond << 28 | 0x7 << 25 | double_cp_num |                  \
+              P << 23 | Q << 21 | R << 20 | S << 6 |                     \
+              fn->hi_bits() << 16 | fn->hi_bit() << 7 |                  \
+              fd->hi_bits() << 12 | fd->hi_bit() << 22 |                 \
+              fm->hi_bits()       | fm->hi_bit() << 5);                  \
+  }                                                                      \
+  void mnemonic##s(FloatRegister fd, FloatRegister fn, FloatRegister fm, \
+                   AsmCondition cond = al) {                             \
+    CHECK_VFP_PRESENT;                                                   \
+    assert(fn->hi_bit() == 0 && fd->hi_bit() == 0 && fm->hi_bit() == 0, "double precision register?"); \
+    emit_int32(cond << 28 | 0x7 << 25 | single_cp_num |                  \
+              P << 23 | Q << 21 | R << 20 | S << 6 |                     \
+              fn->hi_bits() << 16 | fn->lo_bit() << 7 |                  \
+              fd->hi_bits() << 12 | fd->lo_bit() << 22 |                 \
+              fm->hi_bits()       | fm->lo_bit() << 5);                  \
+  }
+
+  F(fmac,  0, 0, 0, 0)  // Fd = Fd + (Fn * Fm)
+  F(fnmac, 0, 0, 0, 1)  // Fd = Fd - (Fn * Fm)
+  F(fmsc,  0, 0, 1, 0)  // Fd = -Fd + (Fn * Fm)
+  F(fnmsc, 0, 0, 1, 1)  // Fd = -Fd - (Fn * Fm)
+
+  F(fmul,  0, 1, 0, 0)  // Fd = Fn * Fm
+  F(fnmul, 0, 1, 0, 1)  // Fd = -(Fn * Fm)
+  F(fadd,  0, 1, 1, 0)  // Fd = Fn + Fm
+  F(fsub,  0, 1, 1, 1)  // Fd = Fn - Fm
+  F(fdiv,  1, 0, 0, 0)  // Fd = Fn / Fm
+#undef F
+
+  enum VElem_Size {  // NOTE(review): presumably the 'size' argument of the SIMD emitters - confirm at call sites
+    VELEM_SIZE_8  = 0x00,
+    VELEM_SIZE_16 = 0x01,
+    VELEM_SIZE_32 = 0x02,
+    VELEM_SIZE_64 = 0x03
+  };
+
+  enum VLD_Type {  // NOTE(review): not referenced in this part of the file; presumably consumed by a vld1 emitter - confirm
+    VLD1_TYPE_1_REG  = 0x7 /* 0b0111 */,
+    VLD1_TYPE_2_REGS = 0xA /* 0b1010 */,
+    VLD1_TYPE_3_REGS = 0x6 /* 0b0110 */,
+    VLD1_TYPE_4_REGS = 0x2 /* 0b0010 */
+  };
+
+  enum VFloat_Arith_Size {  // NOTE(review): presumably the 'size' argument for vaddF/vsubF/vmulF - confirm at call sites
+    VFA_SIZE_F32 = 0x0 /* 0b0 */,
+  };
+
+  // Bits P, Q, R, S collectively form the opcode
+#define F(mnemonic, P, Q, R, S) /* P at bit 23, Q at bit 8, R at bit 4, S at bit 21; size at bit 20, quad at bit 6 */ \
+  void mnemonic(FloatRegister fd, FloatRegister fn, FloatRegister fm,    \
+                int size, int quad) {                                    \
+    CHECK_VFP_PRESENT;                                                   \
+    assert(VM_Version::has_simd(), "simd instruction");                  \
+    assert(fn->lo_bit() == 0 && fd->lo_bit() == 0 && fm->lo_bit() == 0,  \
+           "single precision register?");                                \
+    assert(!quad || ((fn->hi_bits() | fd->hi_bits() | fm->hi_bits()) & 1) == 0, \
+           "quad precision register?");                                  \
+    emit_int32(0xf << 28 | P << 23 | Q << 8 | R << 4 |                   \
+              S << 21 | size << 20 | quad << 6 |                         \
+              fn->hi_bits() << 16 | fn->hi_bit() << 7 |                  \
+              fd->hi_bits() << 12 | fd->hi_bit() << 22 |                 \
+              fm->hi_bits()       | fm->hi_bit() << 5);                  \
+  }
+
+  F(vmulI,  0x4 /* 0b0100 */, 0x9 /* 0b1001 */, 1, 0)  // Vd = Vn * Vm (int)
+  F(vaddI,  0x4 /* 0b0100 */, 0x8 /* 0b1000 */, 0, 0)  // Vd = Vn + Vm (int)
+  F(vsubI,  0x6 /* 0b0110 */, 0x8 /* 0b1000 */, 0, 0)  // Vd = Vn - Vm (int)
+  F(vaddF,  0x4 /* 0b0100 */, 0xD /* 0b1101 */, 0, 0)  // Vd = Vn + Vm (float)
+  F(vsubF,  0x4 /* 0b0100 */, 0xD /* 0b1101 */, 0, 1)  // Vd = Vn - Vm (float)
+  F(vmulF,  0x6 /* 0b0110 */, 0xD /* 0b1101 */, 1, 0)  // Vd = Vn * Vm (float)
+  F(vshlSI, 0x4 /* 0b0100 */, 0x4 /* 0b0100 */, 0, 0)  // Vd = ashift(Vm,Vn) (int)
+  F(vshlUI, 0x6 /* 0b0110 */, 0x4 /* 0b0100 */, 0, 0)  // Vd = lshift(Vm,Vn) (int)
+  F(_vandI, 0x4 /* 0b0100 */, 0x1 /* 0b0001 */, 1, 0)  // Vd = Vn & Vm (int); wrapped by vandI with size 0
+  F(_vorI,  0x4 /* 0b0100 */, 0x1 /* 0b0001 */, 1, 1)  // Vd = Vn | Vm (int); wrapped by vorI with size 0
+  F(_vxorI, 0x6 /* 0b0110 */, 0x1 /* 0b0001 */, 1, 0)  // Vd = Vn ^ Vm (int); wrapped by vxorI with size 0
+#undef F
+
+  void vandI(FloatRegister fd, FloatRegister fn, FloatRegister fm, int quad) {
+    _vandI(fd, fn, fm, 0, quad);  // the size argument is fixed at 0
+  }
+  void vorI(FloatRegister fd, FloatRegister fn, FloatRegister fm, int quad) {
+    _vorI(fd, fn, fm, 0, quad);  // the size argument is fixed at 0
+  }
+  void vxorI(FloatRegister fd, FloatRegister fn, FloatRegister fm, int quad) {
+    _vxorI(fd, fn, fm, 0, quad);  // the size argument is fixed at 0
+  }
+
+  void vneg(FloatRegister fd, FloatRegister fm, int size, int flt, int quad) {
+    // size is placed at bit 18, flt at bit 10 and quad at bit 6 of an
+    // encoding whose top nibble is fixed at 0xf (no condition field).
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(fd->lo_bit() == 0 && fm->lo_bit() == 0, "single precision register?");
+    assert(!quad || ((fd->hi_bits() | fm->hi_bits()) & 1) == 0, "quad precision register?");
+    emit_int32(0xf << 28 | 0x3B /* 0b00111011 */ << 20 | 0x1 /* 0b01 */ << 16 | 0x7 /* 0b111 */ << 7 |
+               size << 18 | flt << 10 | quad << 6 |
+               fd->hi_bit() << 22 | fd->hi_bits() << 12 |
+               fm->hi_bit() << 5  | fm->hi_bits() <<  0);
+  }
+
+  void vnegI(FloatRegister fd, FloatRegister fm, int size, int quad) {
+    // Integer form: forwards to vneg() with the flt flag cleared.
+    vneg(fd, fm, size, 0 /* flt */, quad);
+  }
+
+  void vshli(FloatRegister fd, FloatRegister fm, int size, int imm, int quad) {
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(fd->lo_bit() == 0 && fm->lo_bit() == 0,
+           "single precision register?");
+    assert(!quad || ((fd->hi_bits() | fm->hi_bits()) & 1) == 0,
+           "quad precision register?");
+
+    // Shifting left by the element width or more clears every bit, but that
+    // count has no left-shift encoding; emit the equivalent unsigned right
+    // shift by 'size' instead.
+    if (imm >= size) {
+      vshri(fd, fm, size, size, true /* unsigned */, quad);
+      return;
+    }
+    assert(imm >= 0 && imm < size, "out of range");
+
+    // 8/16/32-bit elements: imm6 = size + imm, L clear.
+    // 64-bit elements:      imm6 = imm, L set.
+    int L = 0;
+    int imm6 = 0;
+    if (size == 64) {
+      L = 1;
+      imm6 = imm;
+    } else if (size == 8 || size == 16 || size == 32) {
+      imm6 = size + imm;
+    } else {
+      ShouldNotReachHere();
+    }
+
+    // imm6 is placed at bit 16, L at bit 7 and quad at bit 6.
+    emit_int32(0xf << 28 | 0x5 /* 0b00101 */ << 23 | 0x51 /* 0b01010001 */ << 4 |
+               imm6 << 16 | L << 7 | quad << 6 |
+               fd->hi_bits() << 12 | fd->hi_bit() << 22 |
+               fm->hi_bits() <<  0 | fm->hi_bit() << 5);
+  }
+
+  void vshri(FloatRegister fd, FloatRegister fm, int size, int imm,
+             bool U /* unsigned */, int quad) {
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(fd->lo_bit() == 0 && fm->lo_bit() == 0,
+           "single precision register?");
+    assert(!quad || ((fd->hi_bits() | fm->hi_bits()) & 1) == 0,
+           "quad precision register?");
+    assert(imm > 0, "out of range");
+
+    // Counts at or above the element width are clamped to the width, which
+    // is the largest right shift emitted here.
+    const int shift = (imm >= size) ? size : imm;
+
+    // 8/16/32-bit elements: imm6 = 2 * size - shift, L clear.
+    // 64-bit elements:      imm6 = 64 - shift, L set.
+    int L = 0;
+    int imm6 = 0;
+    if (size == 64) {
+      L = 1;
+      imm6 = 64 - shift;
+    } else if (size == 8 || size == 16 || size == 32) {
+      imm6 = 2 * size - shift;
+    } else {
+      ShouldNotReachHere();
+    }
+
+    // imm6 is placed at bit 16, L at bit 7, quad at bit 6 and U at bit 24.
+    emit_int32(0xf << 28 | 0x5 /* 0b00101 */ << 23 | 0x1 /* 0b00000001 */ << 4 |
+               imm6 << 16 | L << 7 | quad << 6 | U << 24 |
+               fd->hi_bits() << 12 | fd->hi_bit() << 22 |
+               fm->hi_bits() <<  0 | fm->hi_bit() << 5);
+  }
+  void vshrUI(FloatRegister fd, FloatRegister fm, int size, int imm, int quad) {  // vshri with the U flag set
+    vshri(fd, fm, size, imm, true /* unsigned */, quad);
+  }
+  void vshrSI(FloatRegister fd, FloatRegister fm, int size, int imm, int quad) {  // vshri with the U flag clear
+    vshri(fd, fm, size, imm, false /* signed */, quad);
+  }
+
+  // Extension opcodes: bits P, Q, R and S are all 1 and the actual opcode is carried in the Fn field
+#define F(mnemonic, N, opcode) /* the source fm is encoded as a single-precision register in both forms */ \
+  void mnemonic##d(FloatRegister fd, FloatRegister fm, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                            \
+    assert(fd->lo_bit() == 0 && fm->hi_bit() == 0, "incorrect register?");        \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |         \
+              double_cp_num |                                                     \
+              fd->hi_bits() << 12 | fd->hi_bit() << 22 |                          \
+              fm->hi_bits()       | fm->lo_bit() << 5);                           \
+  }                                                                               \
+  void mnemonic##s(FloatRegister fd, FloatRegister fm, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                            \
+    assert(fd->hi_bit() == 0 && fm->hi_bit() == 0, "double precision register?"); \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |         \
+              single_cp_num |                                                     \
+              fd->hi_bits() << 12 | fd->lo_bit() << 22 |                          \
+              fm->hi_bits()       | fm->lo_bit() << 5);                           \
+  }
+
+  F(fuito,  0, 0x8)  // Unsigned integer to floating point conversion (N = 0)
+  F(fsito,  1, 0x8)  // Signed integer to floating point conversion (N = 1)
+#undef F
+
+#define F(mnemonic, N, opcode) /* the destination fd is encoded as a single-precision register in both forms */ \
+  void mnemonic##d(FloatRegister fd, FloatRegister fm, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                            \
+    assert(fd->hi_bit() == 0 && fm->lo_bit() == 0, "incorrect register?");        \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |         \
+              double_cp_num |                                                     \
+              fd->hi_bits() << 12 | fd->lo_bit() << 22 |                          \
+              fm->hi_bits()       | fm->hi_bit() << 5);                           \
+  }                                                                               \
+  void mnemonic##s(FloatRegister fd, FloatRegister fm, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                            \
+    assert(fd->hi_bit() == 0 && fm->hi_bit() == 0, "double precision register?"); \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |         \
+              single_cp_num |                                                     \
+              fd->hi_bits() << 12 | fd->lo_bit() << 22 |                          \
+              fm->hi_bits()       | fm->lo_bit() << 5);                           \
+  }
+
+  F(ftoui,  0, 0xc)  // Float to unsigned int conversion
+  F(ftouiz, 1, 0xc)  // Float to unsigned int conversion, RZ mode
+  F(ftosi,  0, 0xd)  // Float to signed int conversion
+  F(ftosiz, 1, 0xd)  // Float to signed int conversion, RZ mode
+#undef F
+
+#define F(mnemonic, N, opcode) /* 'd' form: double fm to single fd; 's' form: single fm to double fd */ \
+  void mnemonic##d(FloatRegister fd, FloatRegister fm, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                            \
+    assert(fd->hi_bit() == 0 && fm->lo_bit() == 0, "incorrect register?");        \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |         \
+              double_cp_num |                                                     \
+              fd->hi_bits() << 12 | fd->lo_bit() << 22 |                          \
+              fm->hi_bits()       | fm->hi_bit() << 5);                           \
+  }                                                                               \
+  void mnemonic##s(FloatRegister fd, FloatRegister fm, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                            \
+    assert(fd->lo_bit() == 0 && fm->hi_bit() == 0, "incorrect register?");        \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |         \
+              single_cp_num |                                                     \
+              fd->hi_bits() << 12 | fd->hi_bit() << 22 |                          \
+              fm->hi_bits()       | fm->lo_bit() << 5);                           \
+  }
+
+  F(fcvtd,  1, 0x7)  // generates fcvtds (Single->Double) and fcvtdd (same encoding as fcvtsd)
+  F(fcvts,  1, 0x7)  // generates fcvtsd (Double->Single) and fcvtss (same encoding as fcvtds)
+#undef F
+
+#define F(mnemonic, N, opcode) /* both operands share the precision of the chosen form; opcode at bit 16, N at bit 7 */ \
+  void mnemonic##d(FloatRegister fd, FloatRegister fm, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                            \
+    assert(fd->lo_bit() == 0 && fm->lo_bit() == 0, "single precision register?"); \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |         \
+              double_cp_num |                                                     \
+              fd->hi_bits() << 12 | fd->hi_bit() << 22 |                          \
+              fm->hi_bits()       | fm->hi_bit() << 5);                           \
+  }                                                                               \
+  void mnemonic##s(FloatRegister fd, FloatRegister fm, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                            \
+    assert(fd->hi_bit() == 0 && fm->hi_bit() == 0, "double precision register?"); \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |         \
+              single_cp_num |                                                     \
+              fd->hi_bits() << 12 | fd->lo_bit() << 22 |                          \
+              fm->hi_bits()       | fm->lo_bit() << 5);                           \
+  }
+
+  F(fcpy,   0, 0x0)  // Fd = Fm
+  F(fabs,   1, 0x0)  // Fd = abs(Fm)
+  F(fneg,   0, 0x1)  // Fd = -Fm
+  F(fsqrt,  1, 0x1)  // Fd = sqrt(Fm)
+  F(fcmp,   0, 0x4)  // Compare Fd with Fm no exceptions on quiet NANs
+  F(fcmpe,  1, 0x4)  // Compare Fd with Fm with exceptions on quiet NANs
+#undef F
+
+  // Opcodes with one operand only
+#define F(mnemonic, N, opcode) /* same layout as the two-operand forms, with the Fm field left zero */ \
+  void mnemonic##d(FloatRegister fd, AsmCondition cond = al) {               \
+    CHECK_VFP_PRESENT;                                                       \
+    assert(fd->lo_bit() == 0, "single precision register?");                 \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |    \
+              double_cp_num | fd->hi_bits() << 12 | fd->hi_bit() << 22);     \
+  }                                                                          \
+  void mnemonic##s(FloatRegister fd, AsmCondition cond = al) {               \
+    CHECK_VFP_PRESENT;                                                       \
+    assert(fd->hi_bit() == 0, "double precision register?");                 \
+    emit_int32(cond << 28 | 0xeb << 20 | opcode << 16 | N << 7 | 1 << 6 |    \
+              single_cp_num | fd->hi_bits() << 12 | fd->lo_bit() << 22);     \
+  }
+
+  F(fcmpz,  0, 0x5)  // Compare Fd with 0, no exceptions quiet NANs
+  F(fcmpez, 1, 0x5)  // Compare Fd with 0, with exceptions quiet NANs
+#undef F
+
+  // Float loads (L==1) and stores (L==0)
+#define F(mnemonic, L) /* L is placed at bit 20; address bits come from Address::encoding_vfp() */ \
+  void mnemonic##d(FloatRegister fd, Address addr, AsmCondition cond = al) { \
+    CHECK_VFP_PRESENT;                                                       \
+    assert(fd->lo_bit() == 0, "single precision register?");                 \
+    emit_int32(cond << 28 | 0xd << 24 | L << 20 |                            \
+              fd->hi_bits() << 12 | fd->hi_bit() << 22 |                     \
+              double_cp_num | addr.encoding_vfp());                          \
+  }                                                                          \
+  void mnemonic##s(FloatRegister fd, Address addr, AsmCondition cond = al) { \
+    CHECK_VFP_PRESENT;                                                       \
+    assert(fd->hi_bit() == 0, "double precision register?");                 \
+    emit_int32(cond << 28 | 0xd << 24 | L << 20 |                            \
+              fd->hi_bits() << 12 | fd->lo_bit() << 22 |                     \
+              single_cp_num | addr.encoding_vfp());                          \
+  }
+
+  F(fst, 0)  // Store 1 register (generates fstd and fsts)
+  F(fld, 1)  // Load 1 register (generates fldd and flds)
+#undef F
+
+  // Float load and store multiple
+#define F(mnemonic, l, pu) /* l: bit 20; pu: bits 24..23; pu == 2 is rejected unless writeback is requested */ \
+  void mnemonic##d(Register rn, FloatRegisterSet reg_set,                    \
+                   AsmWriteback w = no_writeback, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                       \
+    assert(w == no_writeback || rn != PC, "unpredictable instruction");      \
+    assert(!(w == no_writeback && pu == 2), "encoding constraint");          \
+    assert((reg_set.encoding_d() & 1) == 0, "encoding constraint");          \
+    emit_int32(cond << 28 | 6 << 25 | pu << 23 | w << 21 | l << 20 |         \
+              rn->encoding() << 16 | reg_set.encoding_d() | double_cp_num);  \
+  }                                                                          \
+  void mnemonic##s(Register rn, FloatRegisterSet reg_set,                    \
+                   AsmWriteback w = no_writeback, AsmCondition cond = al) {  \
+    CHECK_VFP_PRESENT;                                                       \
+    assert(w == no_writeback || rn != PC, "unpredictable instruction");      \
+    assert(!(w == no_writeback && pu == 2), "encoding constraint");          \
+    emit_int32(cond << 28 | 6 << 25 | pu << 23 | w << 21 | l << 20 |         \
+              rn->encoding() << 16 | reg_set.encoding_s() | single_cp_num);  \
+  }
+
+  F(fldmia, 1, 1)    F(fldmfd, 1, 1)  // each row defines two names with identical encodings
+  F(fldmdb, 1, 2)    F(fldmea, 1, 2)
+  F(fstmia, 0, 1)    F(fstmfd, 0, 1)
+  F(fstmdb, 0, 2)    F(fstmea, 0, 2)
+#undef F
+
+  // fconst{s,d} encoding:
+  //  31  28 27   23 22  21 20 19   16 15 12 10  9  8   7    4 3     0
+  // | cond | 11101 | D | 11  | imm4H | Vd  | 101 | sz | 0000 | imm4L |
+  // sz = 0 for single precision, 1 otherwise
+  // Register number is Vd:D for single precision, D:Vd otherwise
+  // immediate value is imm4H:imm4L
+
+  void fconsts(FloatRegister fd, unsigned char imm_8, AsmCondition cond = al) {
+    CHECK_VFP_PRESENT;
+    assert(fd->hi_bit() == 0, "double precision register?");
+    emit_int32(cond << 28 | 0xeb << 20 | (imm_8 >> 4) << 16 | (imm_8 & 0xf) |  // imm4H : imm4L
+              single_cp_num | fd->lo_bit() << 22 | fd->hi_bits() << 12);
+  }
+
+  void fconstd(FloatRegister fd, unsigned char imm_8, AsmCondition cond = al) {
+    CHECK_VFP_PRESENT;
+    assert(fd->lo_bit() == 0, "single precision register?");  // a set lo_bit marks a single-precision register
+    emit_int32(cond << 28 | 0xeb << 20 | double_cp_num |      // imm_8 is split into imm4H (bit 16) and imm4L (bit 0)
+              fd->hi_bits() << 12 | fd->hi_bit() << 22 | (imm_8 & 0xf) | (imm_8 >> 4) << 16);
+  }
+
+  // GPR <-> FPR transfers
+  void fmsr(FloatRegister fd, Register rd, AsmCondition cond = al) {
+    CHECK_VFP_PRESENT;
+    assert(fd->hi_bit() == 0, "double precision register?");
+    // fd is encoded at bit 16 plus bit 7; the core register rd at bit 12
+    emit_int32(cond << 28 | 0xe0 << 20 | single_cp_num | 1 << 4 | rd->encoding() << 12 | fd->hi_bits() << 16 | fd->lo_bit() << 7);
+  }
+
+  void fmrs(Register rd, FloatRegister fd, AsmCondition cond = al) {
+    CHECK_VFP_PRESENT;
+    assert(fd->hi_bit() == 0, "double precision register?");
+    // Same layout as fmsr, with 0xe1 instead of 0xe0 at bit 20
+    emit_int32(cond << 28 | 0xe1 << 20 | single_cp_num | 1 << 4 | rd->encoding() << 12 | fd->hi_bits() << 16 | fd->lo_bit() << 7);
+  }
+
+  void fmdrr(FloatRegister fd, Register rd, Register rn, AsmCondition cond = al) {
+    CHECK_VFP_PRESENT;
+    assert(fd->lo_bit() == 0, "single precision register?");
+    const int dreg = fd->hi_bit() << 5 | fd->hi_bits();  // FP register: top bit at bit 5, remaining bits at 3:0
+    emit_int32(cond << 28 | 0xc4 << 20 | rn->encoding() << 16 | rd->encoding() << 12 |
+              double_cp_num | 1 << 4 | dreg);
+  }
+
+  void fmrrd(Register rd, Register rn, FloatRegister fd, AsmCondition cond = al) {
+    CHECK_VFP_PRESENT;
+    assert(fd->lo_bit() == 0, "single precision register?");
+    const int dreg = fd->hi_bit() << 5 | fd->hi_bits();  // FP register: top bit at bit 5, remaining bits at 3:0
+    emit_int32(cond << 28 | 0xc5 << 20 | rn->encoding() << 16 | rd->encoding() << 12 |
+              double_cp_num | 1 << 4 | dreg);
+  }
+
+  void fmstat(AsmCondition cond = al) {
+    CHECK_VFP_PRESENT;
+    emit_int32(0xef1fa10 | cond << 28);  // fixed instruction bits plus the condition field
+  }
+
+  void vmrs(Register rt, VFPSystemRegister sr, AsmCondition cond = al) {
+    assert((sr->encoding() & (~0xf)) == 0, "what system register is that?");  // must fit the 4-bit field at 19:16
+    emit_int32(0xef00a10 | cond << 28 | sr->encoding() << 16 | rt->encoding() << 12);
+  }
+
+  void vmsr(VFPSystemRegister sr, Register rt, AsmCondition cond = al) {
+    assert((sr->encoding() & (~0xf)) == 0, "what system register is that?");  // must fit the 4-bit field at 19:16
+    emit_int32(0xee00a10 | cond << 28 | sr->encoding() << 16 | rt->encoding() << 12);
+  }
+
+  void vcnt(FloatRegister Dd, FloatRegister Dm) {
+    CHECK_VFP_PRESENT;
+    assert(!VM_Version::is_initialized() || VM_Version::has_simd(), "simd instruction");  // also emitted at VM startup to probe availability
+    assert(Dd->lo_bit() == 0 && Dm->lo_bit() == 0, "single precision registers?");
+    const int regs = Dd->hi_bit() << 22 | Dd->hi_bits() << 12 | Dm->hi_bit() << 5 | Dm->hi_bits();
+    emit_int32(0xf3b00500 | regs);
+  }
+
+  void vpaddl(FloatRegister Dd, FloatRegister Dm, int size, bool s) {  // size: element width in bits; s: true clears bit 7, false sets it
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Dd->lo_bit() == 0 && Dm->lo_bit() == 0, "single precision registers?");
+    assert(size == 8 || size == 16 || size == 32, "unexpected size");  // so (size >> 4) below is 0, 1 or 2
+    emit_int32(0xf3b00200 | Dd->hi_bit() << 22 | (size >> 4) << 18 | Dd->hi_bits() << 12 | (s ? 0 : 1) << 7 | Dm->hi_bit() << 5 | Dm->hi_bits());
+  }
+
+  void vld1(FloatRegister Dd, Address addr, VElem_Size size, int bits) {
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Dd->lo_bit() == 0, "single precision registers?");
+    assert(bits == 128, "code assumption");
+    const VLD_Type type = VLD1_TYPE_2_REGS;  // 2x64: the only layout emitted, matching bits == 128
+    const int align = 0;                     // alignment field (bits 5:4) is always encoded as zero
+    emit_int32(0xf4200000 | Dd->hi_bit() << 22 | Dd->hi_bits() << 12 | type << 8 | size << 6 | align << 4 | addr.encoding_simd());
+  }
+
+  void vst1(FloatRegister Dd, Address addr, VElem_Size size, int bits) {
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Dd->lo_bit() == 0, "single precision registers?");
+    assert(bits == 128, "code assumption");
+    const VLD_Type type = VLD1_TYPE_2_REGS;  // 2x64: the only layout emitted, matching bits == 128
+    const int align = 0;                     // alignment field (bits 5:4) is always encoded as zero
+    emit_int32(0xf4000000 | Dd->hi_bit() << 22 | Dd->hi_bits() << 12 | type << 8 | size << 6 | align << 4 | addr.encoding_simd());
+  }
+
+  void vmovI(FloatRegister Dd, int imm8, VElem_Size size, int quad) {  // imm8 must be in [0, 255]; quad selects bit 6
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Dd->lo_bit() == 0, "single precision register?");
+    assert(!quad || (Dd->hi_bits() & 1) == 0, "quad precision register?");
+    assert(imm8 >= 0 && imm8 < 256, "out of range");
+    int op = 0;     // zero-initialized (as vrev does) so the default: path never reads an indeterminate value
+    int cmode = 0;  // overwritten by every supported element size below
+    switch (size) {
+    case VELEM_SIZE_8:
+      op = 0;
+      cmode = 0xE /* 0b1110 */;
+      break;
+    case VELEM_SIZE_16:
+      op = 0;
+      cmode = 0x8 /* 0b1000 */;
+      break;
+    case VELEM_SIZE_32:
+      op = 0;
+      cmode = 0x0 /* 0b0000 */;
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    emit_int32(0xf << 28 | 0x1 << 25 | 0x1 << 23 | 0x1 << 4 |
+              (imm8 >> 7) << 24 | ((imm8 & 0x70) >> 4) << 16 | (imm8 & 0xf) |  // imm8 bit 7 -> 24, bits 6:4 -> 18:16, bits 3:0 -> 3:0
+              quad << 6 | op << 5 | cmode << 8 |
+              Dd->hi_bits() << 12 | Dd->hi_bit() << 22);
+  }
+
+  void vdupI(FloatRegister Dd, Register Rs, VElem_Size size, int quad,
+             AsmCondition cond = al) {
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Dd->lo_bit() == 0, "single precision register?");
+    assert(!quad || (Dd->hi_bits() & 1) == 0, "quad precision register?");
+    int b = 0;  // zero-initialized (as vrev does) so the default: path never reads an indeterminate value
+    int e = 0;  // b (bit 22) and e (bit 5) together select the element size
+    switch (size) {
+    case VELEM_SIZE_8:
+      b = 1;
+      e = 0;
+      break;
+    case VELEM_SIZE_16:
+      b = 0;
+      e = 1;
+      break;
+    case VELEM_SIZE_32:
+      b = 0;
+      e = 0;
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    emit_int32(cond << 28 | 0x1D /* 0b11101 */ << 23 | 0xB /* 0b1011 */ << 8 | 0x1 << 4 |
+              quad << 21 | b << 22 |  e << 5 | Rs->encoding() << 12 |
+              Dd->hi_bits() << 16 | Dd->hi_bit() << 7);
+  }
+
+  void vdup(FloatRegister Dd, FloatRegister Ds, int index, int size, int quad) {  // size: element width in bits (8, 16 or 32)
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Dd->lo_bit() == 0, "single precision register?");
+    assert(Ds->lo_bit() == 0, "single precision register?");
+    assert(!quad || (Dd->hi_bits() & 1) == 0, "quad precision register?");
+    int range = 64 / size;
+    assert(index < range, "overflow");
+    int imm4 = 0;  // zero-initialized (as vrev does) so the default: path never reads an indeterminate value
+    switch (size) {
+    case 8:
+      assert((index & 0x7 /* 0b111 */) == index, "overflow");
+      imm4 = index << 1 | 0x1 /* 0b0001 */;
+      break;
+    case 16:
+      assert((index & 0x3 /* 0b11 */) == index, "overflow");
+      imm4 = index << 2 | 0x2 /* 0b0010 */;
+      break;
+    case 32:
+      assert((index & 0x1 /* 0b1 */) == index, "overflow");
+      imm4 = index << 3 | 0x4 /* 0b0100 */;
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    emit_int32(0xF /* 0b1111 */ << 28 | 0x3B /* 0b00111011 */ << 20 | 0x6 /* 0b110 */ << 9 |
+               quad << 6 | imm4 << 16 |
+               Dd->hi_bits() << 12 | Dd->hi_bit() << 22 |
+               Ds->hi_bits() << 00 | Ds->hi_bit() << 5);
+  }
+
+  void vdupF(FloatRegister Dd, FloatRegister Ss, int quad) {
+    // Ds is Ss with encoding bit 0 cleared; Ss is element 0 (even) or element 1 (odd) of it.
+    FloatRegister Ds = as_FloatRegister(Ss->encoding() & ~1);
+    const bool odd = (Ss->lo_bit() != 0);
+    if (odd) {
+      assert(Ds->successor() == Ss, "bad reg");
+    } else {
+      assert(Ds == Ss, "bad reg");
+    }
+    // Forward to the indexed duplicate with 32-bit elements.
+    const int index = odd ? 1 : 0;
+    vdup(Dd, Ds, index, 32, quad);
+  }
+
+  void vrev(FloatRegister Dd, FloatRegister Dm, int quad, int region_size, VElem_Size size) {
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Dd->lo_bit() == 0, "single precision register?");
+    assert(Dm->lo_bit() == 0, "single precision register?");
+    assert(!quad || ((Dd->hi_bits() | Dm->hi_bits()) & 1) == 0,
+           "quad precision register?");
+    unsigned int op = 0;  // region_size 64 keeps 0b00
+    if (region_size == 16) {
+      op = 0x2; /*0b10*/
+    } else if (region_size == 32) {
+      op = 0x1; /*0b01*/
+    } else if (region_size != 64) {
+      assert(false, "encoding constraint");
+    }
+    emit_int32(0xf << 28 | 0x7 << 23 | 0x3 << 20 | size << 18 | op << 7 | quad << 6 |
+               Dd->hi_bit() << 22 | Dd->hi_bits() << 12 | Dm->hi_bit() << 5 | Dm->hi_bits());
+  }
+
+  void veor(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, int quad) {
+    CHECK_VFP_PRESENT;
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Dd->lo_bit() == 0, "single precision register?");
+    assert(Dm->lo_bit() == 0, "single precision register?");
+    assert(Dn->lo_bit() == 0, "single precision register?");
+    assert(!quad || ((Dd->hi_bits() | Dm->hi_bits() | Dn->hi_bits()) & 1) == 0,
+           "quad precision register?");
+    emit_int32(0xf << 28 | 0x3 << 24 | 0x1 << 8 | 0x1 << 4 | quad << 6 |  // fixed bits and Q, then one line per register
+               Dd->hi_bit() << 22 | Dd->hi_bits() << 12 |
+               Dn->hi_bit() << 7  | Dn->hi_bits() << 16 |
+               Dm->hi_bit() << 5  | Dm->hi_bits());
+  }
+
+
+  Assembler(CodeBuffer* code) : AbstractAssembler(code) {}  // forwards the code buffer to AbstractAssembler
+
+#ifdef COMPILER2
+  typedef VFP::double_num double_num;  // makes VFP::double_num available under the Assembler scope
+  typedef VFP::float_num  float_num;   // likewise for VFP::float_num
+#endif
+};
+
+#ifdef __SOFTFP__
+// Soft float function declarations (C linkage; only visible when __SOFTFP__ is defined)
+extern "C" {
+extern float  __aeabi_fadd(float, float);   // float x float -> float helpers
+extern float  __aeabi_fmul(float, float);
+extern float  __aeabi_fsub(float, float);
+extern float  __aeabi_fdiv(float, float);
+
+extern double __aeabi_dadd(double, double); // double x double -> double helpers
+extern double __aeabi_dmul(double, double);
+extern double __aeabi_dsub(double, double);
+extern double __aeabi_ddiv(double, double);
+
+extern double __aeabi_f2d(float);           // conversion helpers
+extern float  __aeabi_d2f(double);
+extern float  __aeabi_i2f(int);
+extern double __aeabi_i2d(int);
+extern int    __aeabi_f2iz(float);
+
+extern int  __aeabi_fcmpeq(float, float);   // float comparisons returning int
+extern int  __aeabi_fcmplt(float, float);
+extern int  __aeabi_fcmple(float, float);
+extern int  __aeabi_fcmpge(float, float);
+extern int  __aeabi_fcmpgt(float, float);
+
+extern int  __aeabi_dcmpeq(double, double); // double comparisons returning int
+extern int  __aeabi_dcmplt(double, double);
+extern int  __aeabi_dcmple(double, double);
+extern int  __aeabi_dcmpge(double, double);
+extern int  __aeabi_dcmpgt(double, double);
+
+// Imported code from glibc soft-fp bundle for
+// calculation accuracy improvement. See CR 6757269.
+extern double __aeabi_fadd_glibc(float, float);   // NOTE(review): float operands but double result - confirm intended
+extern double __aeabi_fsub_glibc(float, float);   // NOTE(review): same return-type question as above
+extern double __aeabi_dadd_glibc(double, double);
+extern double __aeabi_dsub_glibc(double, double);
+};
+#endif // __SOFTFP__
+
+
+#endif // CPU_ARM_VM_ASSEMBLER_ARM_32_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/assembler_arm_64.cpp	2016-12-02 11:17:35.247097647 -0500
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "ci/ciEnv.hpp"
+#include "gc/shared/cardTableModRefBS.hpp"
+#include "gc/shared/collectedHeap.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "interpreter/templateInterpreterGenerator.hpp"
+#include "memory/resourceArea.hpp"
+#include "prims/jvm_misc.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/objectMonitor.hpp"
+#include "runtime/os.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "utilities/hashtable.hpp"
+#include "utilities/macros.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc/g1/g1CollectedHeap.inline.hpp"
+#include "gc/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc/g1/heapRegion.hpp"
+#endif // INCLUDE_ALL_GCS
+
+// Compares bit field <0:size-1> of imm with bit field <size:2*size-1>; true when both hold the same value.
+inline bool Assembler::LogicalImmediate::has_equal_subpatterns(uintx imm, int size) {
+  const uintx field_mask = right_n_bits(size);
+  const uintx low_field  = mask_bits(imm, field_mask);
+  const uintx next_field = mask_bits(imm >> size, field_mask);
+  return low_field == next_field;
+}
+
+// Returns the smallest power-of-two width (from 2 to 64) such that the given imm
+// is made of one pattern of that width repeated; halves the width while both halves match.
+inline int Assembler::LogicalImmediate::least_pattern_size(uintx imm) {
+  int width = BitsPerWord;
+  for (int half = width >> 1; width > 2 && has_equal_subpatterns(imm, half); half = width >> 1) {
+    width = half;
+  }
+  return width;
+}
+
+// Counts the set bits of x by summing adjacent bit groups of doubling width (SWAR).
+inline int Assembler::LogicalImmediate::population_count(uintx x) {
+  x = x - ((x >> 1) & 0x5555555555555555L);
+  x = (x & 0x3333333333333333L) + ((x >> 2) & 0x3333333333333333L);
+  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fL;
+  x = x + (x >> 8);
+  x = x + (x >> 16);
+  x = x + (x >> 32);
+  return (x & 0x7f);
+}
+
+// For x = <A:B> with B all zeroes and the lowest bit of A set, returns <A:C> where C is B's width in ones.
+inline uintx Assembler::LogicalImmediate::set_least_zeroes(uintx x) {
+  return (x - 1) | x;
+}
+
+
+#ifdef ASSERT
+
+// Rebuilds the immediate value from the stored _immN/_immr/_imms fields (ASSERT builds only).
+uintx Assembler::LogicalImmediate::decode() {
+  assert (_encoded, "should be");
+  // The highest set bit of immN:NOT(imms) gives log2 of the element size.
+  int len_code = (_immN << 6) | ((~_imms) & 0x3f);
+  assert (len_code != 0, "should be");
+
+  int len = 6;
+  for (; !is_set_nth_bit(len_code, len); len--) { }
+  int esize = 1 << len;
+  assert (len > 0, "should be");
+  assert ((_is32bit ? 32 : 64) >= esize, "should be");
+
+  int levels = right_n_bits(len);
+  int S = _imms & levels;
+  int R = _immr & levels;
+  // S == levels would describe an element consisting only of ones.
+  assert (S != levels, "should be");
+  // Element with S + 1 low bits set, rotated right by R (element width esize).
+  uintx welem = right_n_bits(S + 1);
+  uintx wmask = welem;
+  if (R != 0) wmask = (welem >> R) | (welem << (esize - R));
+  // Replicate the element until all 64 bits are covered.
+  for (int size = esize; size < 64; size <<= 1) {
+    wmask |= (wmask << size);
+  }
+  return wmask;
+}
+
+#endif
+
+
+// Initializes this LogicalImmediate from imm (is32 selects the 32-bit form). Sets _encoded to tell whether imm
+// can be used in AArch64 logical instructions (AND, ANDS, EOR, ORR); if so, stores its _immN/_immr/_imms encoding.
+void Assembler::LogicalImmediate::construct(uintx imm, bool is32) {
+  _is32bit = is32;
+
+  if (is32) {
+    assert(((imm >> 32) == 0) || (((intx)imm >> 31) == -1), "32-bit immediate is out of range");
+
+    // Replicate low 32 bits so the rest of the algorithm can treat imm as a 64-bit pattern.
+    imm &= 0xffffffff;
+    imm |= imm << 32;
+  }
+
+  // All-zeroes and all-ones cannot be encoded.
+  if (imm != 0 && (~imm != 0)) {
+
+    // Let LPS (least pattern size) be the least size (power of two from 2 to 64) of repeating
+    // patterns in the immediate. If immediate value can be encoded, it is encoded by pattern
+    // of exactly LPS size (due to structure of valid patterns). In order to verify
+    // that immediate value can be encoded, LPS is calculated and <LPS-1:0> bits of immediate
+    // are verified to be valid pattern.
+    int lps = least_pattern_size(imm);
+    uintx lps_mask = right_n_bits(lps);
+
+    // A valid pattern has one of the following forms:
+    //  | 0 x A | 1 x B | 0 x C |, where B > 0 and C > 0, or
+    //  | 1 x A | 0 x B | 1 x C |, where B > 0 and C > 0.
+    // For simplicity, the second form of the pattern is inverted into the first form.
+    bool inverted = imm & 0x1;  // lowest bit set => second form
+    uintx pattern = (inverted ? ~imm : imm) & lps_mask;
+
+    //  | 0 x A | 1 x (B + C)   |
+    uintx without_least_zeroes = set_least_zeroes(pattern);
+
+    // Pattern is valid iff without least zeroes it is a power of two - 1.
+    if ((without_least_zeroes & (without_least_zeroes + 1)) == 0) {
+
+      // Count B as population count of pattern.
+      int bits_count = population_count(pattern);
+
+      // Count B+C as population count of pattern without least zeroes
+      int left_range = population_count(without_least_zeroes);
+
+      // S-prefix is a part of imms field which encodes LPS.
+      //  LPS  |  S prefix
+      //   64  |     not defined
+      //   32  |     0b0
+      //   16  |     0b10
+      //    8  |     0b110
+      //    4  |     0b1110
+      //    2  |     0b11110
+      int s_prefix = (lps == 64) ? 0 : ~set_least_zeroes(lps) & 0x3f;
+
+      // immN bit is set iff LPS == 64.
+      _immN = (lps == 64) ? 1 : 0;
+      assert (!is32 || (_immN == 0), "32-bit immediate should be encoded with zero N-bit");
+
+      // immr is the rotation size: lps - C for the first form, A for the inverted form.
+      _immr = lps + (inverted ? 0 : bits_count) - left_range;
+
+      // imms is the field that encodes bits count (number of ones in the original pattern, minus one) and S-prefix.
+      _imms = ((inverted ? (lps - bits_count) : bits_count) - 1) | s_prefix;
+
+      _encoded = true;
+      assert (decode() == imm, "illegal encoding");  // round-trip check in ASSERT builds
+
+      return;
+    }
+  }
+
+  _encoded = false;
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/assembler_arm_64.hpp	2016-12-02 11:17:41.103429759 -0500
@@ -0,0 +1,1717 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_ASSEMBLER_ARM_64_HPP
+#define CPU_ARM_VM_ASSEMBLER_ARM_64_HPP
+
+enum AsmShift12 {
+  lsl0 = 0, lsl12 = 1   // lsl12 marks an immediate stored as (value >> 12), see ArithmeticImmediate
+};
+
+enum AsmPrefetchOp {
+    pldl1keep = 0b00000,
+    pldl1strm = 0b00001,
+    pldl2keep = 0b00010,
+    pldl2strm = 0b00011,
+    pldl3keep = 0b00100,
+    pldl3strm = 0b00101,
+
+    plil1keep = 0b01000,
+    plil1strm = 0b01001,
+    plil2keep = 0b01010,
+    plil2strm = 0b01011,
+    plil3keep = 0b01100,
+    plil3strm = 0b01101,
+
+    pstl1keep = 0b10000,
+    pstl1strm = 0b10001,
+    pstl2keep = 0b10010,
+    pstl2strm = 0b10011,
+    pstl3keep = 0b10100,
+    pstl3strm = 0b10101,
+};
+
+// Shifted register operand for data processing instructions.
+// Bundles a register with a shift kind and a shift amount.
+class AsmOperand VALUE_OBJ_CLASS_SPEC {
+ private:
+  Register _reg;       // register being shifted; SP is rejected by both constructors
+  AsmShift _shift;     // shift kind applied to _reg
+  int _shift_imm;      // shift amount, asserted non-negative
+
+ public:
+  // Plain register operand: same as (reg, lsl, 0).
+  AsmOperand(Register reg)
+    : _reg(reg), _shift(lsl), _shift_imm(0) {
+    assert(reg != SP, "SP is not allowed in shifted register operand");
+  }
+
+  // Register operand with an explicit shift kind and amount.
+  AsmOperand(Register reg, AsmShift shift, int shift_imm)
+    : _reg(reg), _shift(shift), _shift_imm(shift_imm) {
+    assert(reg != SP, "SP is not allowed in shifted register operand");
+    assert(shift_imm >= 0, "shift amount should be non-negative");
+  }
+
+  // Read-only accessors.
+  Register reg() const {
+    return _reg;
+  }
+
+  AsmShift shift() const {
+    return _shift;
+  }
+
+  int shift_imm() const {
+    return _shift_imm;
+  }
+};
+
+
+class Assembler : public AbstractAssembler  {
+
+ public:
+
+  static const int LogInstructionSize = 2;                          // log2 of InstructionSize
+  static const int InstructionSize    = 1 << LogInstructionSize;  // evaluates to 4
+
+  Assembler(CodeBuffer* code) : AbstractAssembler(code) {}         // forwards the code buffer to AbstractAssembler
+
+  static inline AsmCondition inverse(AsmCondition cond) {
+    assert ((cond != al) && (cond != nv), "AL and NV conditions cannot be inversed");
+    return static_cast<AsmCondition>(static_cast<int>(cond) ^ 1);  // toggles bit 0 of the condition code
+  }
+
+  // Returns the nzcv flags value this assembler associates with the given condition.
+  static inline int flags_for_condition(AsmCondition cond) {
+    // Bit order of the returned value: N Z C V.
+    if (cond == mi || cond == lt) { return 0b1000; }
+    if (cond == eq || cond == le) { return 0b0100; }
+    if (cond == hs || cond == hi) { return 0b0010; }
+    if (cond == vs)               { return 0b0001; }
+    // Every remaining condition maps to all flags clear.
+    return 0b0000;
+  }
+
+  // Immediate, encoded into logical instructions.
+  class LogicalImmediate {
+   private:
+    bool _encoded;   // set by construct(): true when the value has a valid encoding
+    bool _is32bit;   // copy of the is32 constructor argument
+    int _immN;       // encoding fields below are only meaningful when _encoded is true
+    int _immr;
+    int _imms;
+
+    static inline bool has_equal_subpatterns(uintx imm, int size);
+    static inline int least_pattern_size(uintx imm);
+    static inline int population_count(uintx x);
+    static inline uintx set_least_zeroes(uintx x);
+
+#ifdef ASSERT
+    uintx decode();  // debug-only inverse of construct(), used to verify the encoding
+#endif
+
+    void construct(uintx imm, bool is32);
+
+   public:
+    LogicalImmediate(uintx imm, bool is32 = false) { construct(imm, is32); }  // never fails; check is_encoded() afterwards
+
+    // Returns true if given immediate can be used in AArch64 logical instruction.
+    bool is_encoded() const { return _encoded; }
+
+    bool is32bit() const { return _is32bit; }
+    int immN() const { assert(_encoded, "should be"); return _immN; }  // field accessors assert that an encoding exists
+    int immr() const { assert(_encoded, "should be"); return _immr; }
+    int imms() const { assert(_encoded, "should be"); return _imms; }
+  };
+
+  // Immediate, encoded into arithmetic add/sub instructions.
+  class ArithmeticImmediate {
+   private:
+    bool _encoded;        // false when the value fits neither supported form
+    int _imm;             // 12-bit payload (already shifted down for lsl12)
+    AsmShift12 _shift;
+
+   public:
+    // Accepts x either as a plain unsigned 12-bit value or as an unsigned 12-bit value
+    // shifted left by 12 (low 12 bits zero); any other value is left unencoded.
+    ArithmeticImmediate(intx x) {
+      const bool fits_plain   = is_unsigned_imm_in_range(x, 12, 0);
+      const bool fits_shifted = !fits_plain && is_unsigned_imm_in_range(x, 12, 12);
+      _encoded = fits_plain || fits_shifted;
+      if (fits_plain) {
+        _imm = x;
+        _shift = lsl0;
+      } else if (fits_shifted) {
+        _imm = x >> 12;
+        _shift = lsl12;
+      }
+    }
+
+    // Pairs an unsigned 12-bit value with a caller-chosen shift.
+    ArithmeticImmediate(intx x, AsmShift12 sh) {
+      _encoded = is_unsigned_imm_in_range(x, 12, 0);
+      if (_encoded) {
+        _imm = x;
+        _shift = sh;
+      }
+    }
+
+    // Returns true if this immediate can be used in AArch64 arithmetic (add/sub/cmp/cmn) instructions.
+    bool is_encoded() const  { return _encoded; }
+
+    int imm() const          { assert(_encoded, "should be"); return _imm; }
+    AsmShift12 shift() const { assert(_encoded, "should be"); return _shift; }
+  };
+
+  static inline bool is_imm_in_range(intx value, int bits, int align_bits) {
+    const intx upper = value >> (bits + align_bits - 1);  // sign bit of the field and everything above it
+    return (upper == 0 || upper == -1) && (value & right_n_bits(align_bits)) == 0;
+  }
+
+  static inline int encode_imm(intx value, int bits, int align_bits, int low_bit_in_encoding) {  // signed field encoder
+    assert (is_imm_in_range(value, bits, align_bits), "immediate value is out of range");
+    return ((value >> align_bits) & right_n_bits(bits)) << low_bit_in_encoding;  // drop alignment bits, keep `bits` bits, move into position
+  }
+
+  static inline bool is_unsigned_imm_in_range(intx value, int bits, int align_bits) {
+    return value >= 0 && (value >> (bits + align_bits)) == 0 && (value & right_n_bits(align_bits)) == 0;  // non-negative, fits, aligned
+  }
+
+  static inline int encode_unsigned_imm(intx value, int bits, int align_bits, int low_bit_in_encoding) {  // unsigned field encoder
+    assert (is_unsigned_imm_in_range(value, bits, align_bits), "immediate value is out of range");
+    return (value >> align_bits) << low_bit_in_encoding;  // no masking: the range is only checked by the assert above
+  }
+
+  static inline bool is_offset_in_range(intx offset, int bits) {  // offset: byte offset; bits: width of the offset field
+    assert (bits == 14 || bits == 19 || bits == 26, "wrong bits number");
+    return is_imm_in_range(offset, bits, 2);  // signed field, offset must be a multiple of 4
+  }
+
+  static inline int encode_offset(intx offset, int bits, int low_bit_in_encoding) {
+    return encode_imm(offset, bits, 2, low_bit_in_encoding);  // signed offset with the two low bits dropped
+  }
+
+  // Returns true if given value can be used as immediate in arithmetic (add/sub/cmp/cmn) instructions.
+  static inline bool is_arith_imm_in_range(intx value) {
+    return ArithmeticImmediate(value).is_encoded();  // builds a temporary ArithmeticImmediate just to ask whether it encodes
+  }
+
+
+  // Load/store instructions
+
+#define F(mnemonic, opc) /* pc()-relative literal load into a general register; offset goes into the 19-bit field at bit 5 */ \
+  void mnemonic(Register rd, address literal_addr) {                                                       \
+    intx offset = literal_addr - pc();                                                                     \
+    assert (opc != 0b01 || offset == 0 || ((uintx)literal_addr & 7) == 0, "ldr target should be aligned"); \
+    assert (is_offset_in_range(offset, 19), "offset is out of range");                                     \
+    emit_int32(opc << 30 | 0b011 << 27 | encode_offset(offset, 19, 5) | rd->encoding_with_zr());           \
+  }
+
+  F(ldr_w, 0b00)
+  F(ldr,   0b01)
+  F(ldrsw, 0b10)
+#undef F
+
+#define F(mnemonic, opc) /* literal load into an FP/SIMD register; target must be aligned to 4 << opc bytes unless offset == 0 */ \
+  void mnemonic(FloatRegister rt, address literal_addr) {                                                  \
+    intx offset = literal_addr - pc();                                                                     \
+    assert (offset == 0 || ((uintx)literal_addr & right_n_bits(2 + opc)) == 0, "ldr target should be aligned"); \
+    assert (is_offset_in_range(offset, 19), "offset is out of range");                                     \
+    emit_int32(opc << 30 | 0b011100 << 24 | encode_offset(offset, 19, 5) | rt->encoding());                \
+  }
+
+  F(ldr_s, 0b00)
+  F(ldr_d, 0b01)
+  F(ldr_q, 0b10)
+#undef F
+
+#define F(mnemonic, size, o2, L, o1, o0) /* (rt, [rn]) forms: bits 20:16 and 14:10 are fixed to 0b11111 */ \
+  void mnemonic(Register rt, Register rn) {                                                                \
+    emit_int32(size << 30 | 0b001000 << 24 | o2 << 23 | L << 22 | o1 << 21 | 0b11111 << 16 |               \
+        o0 << 15 | 0b11111 << 10 | rn->encoding_with_sp() << 5 | rt->encoding_with_zr());                  \
+  }
+
+  F(ldxrb,   0b00, 0, 1, 0, 0)
+  F(ldaxrb,  0b00, 0, 1, 0, 1)
+  F(ldarb,   0b00, 1, 1, 0, 1)
+  F(ldxrh,   0b01, 0, 1, 0, 0)
+  F(ldaxrh,  0b01, 0, 1, 0, 1)
+  F(ldarh,   0b01, 1, 1, 0, 1)
+  F(ldxr_w,  0b10, 0, 1, 0, 0)
+  F(ldaxr_w, 0b10, 0, 1, 0, 1)
+  F(ldar_w,  0b10, 1, 1, 0, 1)
+  F(ldxr,    0b11, 0, 1, 0, 0)
+  F(ldaxr,   0b11, 0, 1, 0, 1)
+  F(ldar,    0b11, 1, 1, 0, 1)
+
+  F(stlrb,   0b00, 1, 0, 0, 1)
+  F(stlrh,   0b01, 1, 0, 0, 1)
+  F(stlr_w,  0b10, 1, 0, 0, 1)
+  F(stlr,    0b11, 1, 0, 0, 1)
+#undef F
+
+#define F(mnemonic, size, o2, L, o1, o0) /* (rs, rt, [rn]) forms: rs goes into bits 20:16 and must differ from rt and rn */ \
+  void mnemonic(Register rs, Register rt, Register rn) {                                                     \
+    assert (rs != rt, "should be different");                                                                \
+    assert (rs != rn, "should be different");                                                                \
+    emit_int32(size << 30 | 0b001000 << 24 | o2 << 23 | L << 22 | o1 << 21 | rs->encoding_with_zr() << 16 |  \
+        o0 << 15 | 0b11111 << 10 | rn->encoding_with_sp() << 5 | rt->encoding_with_zr());                    \
+  }
+
+  F(stxrb,   0b00, 0, 0, 0, 0)
+  F(stlxrb,  0b00, 0, 0, 0, 1)
+  F(stxrh,   0b01, 0, 0, 0, 0)
+  F(stlxrh,  0b01, 0, 0, 0, 1)
+  F(stxr_w,  0b10, 0, 0, 0, 0)
+  F(stlxr_w, 0b10, 0, 0, 0, 1)
+  F(stxr,    0b11, 0, 0, 0, 0)
+  F(stlxr,   0b11, 0, 0, 0, 1)
+#undef F
+
+#define F(mnemonic, size, o2, L, o1, o0) /* (rt, rt2, [rn]) forms: rt2 goes into bits 14:10 and must differ from rt */ \
+  void mnemonic(Register rt, Register rt2, Register rn) {                                                  \
+    assert (rt != rt2, "should be different");                                                             \
+    emit_int32(size << 30 | 0b001000 << 24 | o2 << 23 | L << 22 | o1 << 21 | 0b11111 << 16 |               \
+        o0 << 15 | rt2->encoding_with_zr() << 10 | rn->encoding_with_sp() << 5 | rt->encoding_with_zr());  \
+  }
+
+  F(ldxp_w,  0b10, 0, 1, 1, 0)
+  F(ldaxp_w, 0b10, 0, 1, 1, 1)
+  F(ldxp,    0b11, 0, 1, 1, 0)
+  F(ldaxp,   0b11, 0, 1, 1, 1)
+#undef F
+
+#define F(mnemonic, size, o2, L, o1, o0) /* (rs, rt, rt2, [rn]) forms: rs must differ from rt, rt2 and rn */ \
+  void mnemonic(Register rs, Register rt, Register rt2, Register rn) {                                       \
+    assert (rs != rt, "should be different");                                                                \
+    assert (rs != rt2, "should be different");                                                               \
+    assert (rs != rn, "should be different");                                                                \
+    emit_int32(size << 30 | 0b001000 << 24 | o2 << 23 | L << 22 | o1 << 21 | rs->encoding_with_zr() << 16 |  \
+        o0 << 15 | rt2->encoding_with_zr() << 10 | rn->encoding_with_sp() << 5 | rt->encoding_with_zr());    \
+  }
+
+  F(stxp_w,  0b10, 0, 0, 1, 0)
+  F(stlxp_w, 0b10, 0, 0, 1, 1)
+  F(stxp,    0b11, 0, 0, 1, 0)
+  F(stlxp,   0b11, 0, 0, 1, 1)
+#undef F
+
+#define F(mnemonic, opc, V, L) /* general-register pair; signed 7-bit offset scaled by 1 << (2 + (opc >> 1)); forms with L set need rt != rt2 */ \
+  void mnemonic(Register rt, Register rt2, Register rn, int offset = 0) {                                  \
+    assert (!L || rt != rt2, "should be different");                                                       \
+    int align_bits = 2 + (opc >> 1);                                                                       \
+    assert (is_imm_in_range(offset, 7, align_bits), "offset is out of range");                             \
+    emit_int32(opc << 30 | 0b101 << 27 | V << 26 | L << 22 | encode_imm(offset, 7, align_bits, 15) |       \
+        rt2->encoding_with_zr() << 10 | rn->encoding_with_sp() << 5 | rt->encoding_with_zr());             \
+  }
+
+  F(stnp_w,  0b00, 0, 0)
+  F(ldnp_w,  0b00, 0, 1)
+  F(stnp,    0b10, 0, 0)
+  F(ldnp,    0b10, 0, 1)
+#undef F
+
+#define F(mnemonic, opc, V, L) /* FP/SIMD register pair; signed 7-bit offset scaled by 1 << (2 + opc); forms with L set need rt != rt2 */ \
+  void mnemonic(FloatRegister rt, FloatRegister rt2, Register rn, int offset = 0) {                        \
+    assert (!L || (rt != rt2), "should be different");                                                     \
+    int align_bits = 2 + opc;                                                                              \
+    assert (is_imm_in_range(offset, 7, align_bits), "offset is out of range");                             \
+    emit_int32(opc << 30 | 0b101 << 27 | V << 26 | L << 22 | encode_imm(offset, 7, align_bits, 15) |       \
+        rt2->encoding() << 10 | rn->encoding_with_sp() << 5 | rt->encoding());                             \
+  }
+
+  F(stnp_s,  0b00, 1, 0)
+  F(stnp_d,  0b01, 1, 0)
+  F(stnp_q,  0b10, 1, 0)
+  F(ldnp_s,  0b00, 1, 1)
+  F(ldnp_d,  0b01, 1, 1)
+  F(ldnp_q,  0b10, 1, 1)
+#undef F
+
+#define F(mnemonic, size, V, opc) /* load/store via Address: scaled unsigned 12-bit offset, signed 9-bit offset with addr.mode(), or register index */ \
+  void mnemonic(Register rt, Address addr) { \
+    assert((addr.mode() == basic_offset) || (rt != addr.base()), "should be different");                    \
+    if (addr.index() == noreg) {                                                                            \
+      if ((addr.mode() == basic_offset) && is_unsigned_imm_in_range(addr.disp(), 12, size)) {               \
+        emit_int32(size << 30 | 0b111 << 27 | V << 26 | 0b01 << 24 | opc << 22 |                            \
+           encode_unsigned_imm(addr.disp(), 12, size, 10) |                                                 \
+           addr.base()->encoding_with_sp() << 5 | rt->encoding_with_zr());                                  \
+      } else {                                                                                              \
+        assert(is_imm_in_range(addr.disp(), 9, 0), "offset is out of range");                               \
+        emit_int32(size << 30 | 0b111 << 27 | V << 26 | opc << 22 | encode_imm(addr.disp(), 9, 0, 12) |     \
+           addr.mode() << 10 | addr.base()->encoding_with_sp() << 5 | rt->encoding_with_zr());              \
+      }                                                                                                     \
+    } else {                                                                                                \
+      assert (addr.disp() == 0, "non-zero displacement for [reg + reg] address mode");                      \
+      assert ((addr.shift_imm() == 0) || (addr.shift_imm() == size), "invalid shift amount");               \
+      emit_int32(size << 30 | 0b111 << 27 | V << 26 | opc << 22 | 1 << 21 |                                 \
+         addr.index()->encoding_with_zr() << 16 | addr.extend() << 13 | (addr.shift_imm() != 0) << 12 |     \
+         0b10 << 10 | addr.base()->encoding_with_sp() << 5 | rt->encoding_with_zr());                       \
+    }                                                                                                       \
+  }
+
+  F(strb,    0b00, 0, 0b00)
+  F(ldrb,    0b00, 0, 0b01)
+  F(ldrsb,   0b00, 0, 0b10)
+  F(ldrsb_w, 0b00, 0, 0b11)
+
+  F(strh,    0b01, 0, 0b00)
+  F(ldrh,    0b01, 0, 0b01)
+  F(ldrsh,   0b01, 0, 0b10)
+  F(ldrsh_w, 0b01, 0, 0b11)
+
+  F(str_w,   0b10, 0, 0b00)
+  F(ldr_w,   0b10, 0, 0b01)
+  F(ldrsw,   0b10, 0, 0b10)
+
+  F(str,     0b11, 0, 0b00)
+  F(ldr,     0b11, 0, 0b01)
+#undef F
+
+#define F(mnemonic, size, V, opc) /* Prefetch emitter: ORs prfop into the low bits of the word; asserts basic_offset mode, then picks the 12-bit unsigned, 9-bit or [base + index] form */ \
+  void mnemonic(AsmPrefetchOp prfop, Address addr) { \
+    assert (addr.mode() == basic_offset, #mnemonic " supports only basic_offset address mode");             \
+    if (addr.index() == noreg) {                                                                            \
+      if (is_unsigned_imm_in_range(addr.disp(), 12, size)) {                                                \
+        emit_int32(size << 30 | 0b111 << 27 | V << 26 | 0b01 << 24 | opc << 22 |                            \
+           encode_unsigned_imm(addr.disp(), 12, size, 10) |                                                 \
+           addr.base()->encoding_with_sp() << 5 | prfop);                                                   \
+      } else {                                                                                              \
+        assert(is_imm_in_range(addr.disp(), 9, 0), "offset is out of range");                               \
+        emit_int32(size << 30 | 0b111 << 27 | V << 26 | opc << 22 | encode_imm(addr.disp(), 9, 0, 12) |     \
+           addr.base()->encoding_with_sp() << 5 | prfop);                                                   \
+      }                                                                                                     \
+    } else {                                                                                                \
+      assert (addr.disp() == 0, "non-zero displacement for [reg + reg] address mode");                      \
+      assert ((addr.shift_imm() == 0) || (addr.shift_imm() == size), "invalid shift amount");               \
+      emit_int32(size << 30 | 0b111 << 27 | V << 26 | opc << 22 | 1 << 21 |                                 \
+         addr.index()->encoding_with_zr() << 16 | addr.extend() << 13 | (addr.shift_imm() != 0) << 12 |     \
+         0b10 << 10 | addr.base()->encoding_with_sp() << 5 | prfop);                                        \
+    }                                                                                                       \
+  }
+
+  F(prfm, 0b11, 0, 0b10)
+#undef F
+
+#define F(mnemonic, size, V, opc) /* FloatRegister load/store emitters. align_bits puts opc bit 1 at bit 2 above size, so the _q variants (size 0b00, opc 0b1x) get 4 */ \
+  void mnemonic(FloatRegister rt, Address addr) { \
+    int align_bits = (((opc & 0b10) >> 1) << 2) | size;                                                     \
+    if (addr.index() == noreg) {                                                                            \
+      if ((addr.mode() == basic_offset) && is_unsigned_imm_in_range(addr.disp(), 12, align_bits)) {         \
+        emit_int32(size << 30 | 0b111 << 27 | V << 26 | 0b01 << 24 | opc << 22 |                            \
+           encode_unsigned_imm(addr.disp(), 12, align_bits, 10) |                                           \
+           addr.base()->encoding_with_sp() << 5 | rt->encoding());                                          \
+      } else {                                                                                              \
+        assert(is_imm_in_range(addr.disp(), 9, 0), "offset is out of range");                               \
+        emit_int32(size << 30 | 0b111 << 27 | V << 26 | opc << 22 | encode_imm(addr.disp(), 9, 0, 12) |     \
+           addr.mode() << 10 | addr.base()->encoding_with_sp() << 5 | rt->encoding());                      \
+      }                                                                                                     \
+    } else {                                                                                                \
+      assert (addr.disp() == 0, "non-zero displacement for [reg + reg] address mode");                      \
+      assert ((addr.shift_imm() == 0) || (addr.shift_imm() == align_bits), "invalid shift amount");         \
+      emit_int32(size << 30 | 0b111 << 27 | V << 26 | opc << 22 | 1 << 21 |                                 \
+         addr.index()->encoding_with_zr() << 16 | addr.extend() << 13 | (addr.shift_imm() != 0) << 12 |     \
+         0b10 << 10 | addr.base()->encoding_with_sp() << 5 | rt->encoding());                               \
+    }                                                                                                       \
+  }
+
+  F(str_b, 0b00, 1, 0b00)
+  F(ldr_b, 0b00, 1, 0b01)
+  F(str_h, 0b01, 1, 0b00)
+  F(ldr_h, 0b01, 1, 0b01)
+  F(str_s, 0b10, 1, 0b00)
+  F(ldr_s, 0b10, 1, 0b01)
+  F(str_d, 0b11, 1, 0b00)
+  F(ldr_d, 0b11, 1, 0b01)
+  F(str_q, 0b00, 1, 0b10)
+  F(ldr_q, 0b00, 1, 0b11)
+#undef F
+
+#define F(mnemonic, opc, V, L) /* Register-pair load/store with a 7-bit immediate offset; align_bits is 2 or 3 depending on opc; basic_offset is encoded as mode 0b10; loads (L == 1) require rt != rt2 */ \
+  void mnemonic(Register rt, Register rt2, Address addr) {                                                         \
+    assert((addr.mode() == basic_offset) || ((rt != addr.base()) && (rt2 != addr.base())), "should be different"); \
+    assert(!L || (rt != rt2), "should be different");                                                              \
+    assert(addr.index() == noreg, "[reg + reg] address mode is not available for load/store pair");                \
+    int align_bits = 2 + (opc >> 1);                                                                               \
+    int mode_encoding = (addr.mode() == basic_offset) ? 0b10 : addr.mode();                                        \
+    assert(is_imm_in_range(addr.disp(), 7, align_bits), "offset is out of range");                                 \
+    emit_int32(opc << 30 | 0b101 << 27 | V << 26 | mode_encoding << 23 | L << 22 |                                 \
+       encode_imm(addr.disp(), 7, align_bits, 15) | rt2->encoding_with_zr() << 10 |                                \
+       addr.base()->encoding_with_sp() << 5 | rt->encoding_with_zr());                                             \
+  }
+
+  F(stp_w, 0b00, 0, 0)
+  F(ldp_w, 0b00, 0, 1)
+  F(ldpsw, 0b01, 0, 1)
+  F(stp,   0b10, 0, 0)
+  F(ldp,   0b10, 0, 1)
+#undef F
+
+#define F(mnemonic, opc, V, L) /* FloatRegister-pair load/store with a 7-bit immediate offset; align_bits = 2 + opc, i.e. 2, 3 or 4 for the _s, _d and _q variants */ \
+  void mnemonic(FloatRegister rt, FloatRegister rt2, Address addr) {                                                         \
+    assert(!L || (rt != rt2), "should be different");                                                              \
+    assert(addr.index() == noreg, "[reg + reg] address mode is not available for load/store pair");                \
+    int align_bits = 2 + opc;                                                                                      \
+    int mode_encoding = (addr.mode() == basic_offset) ? 0b10 : addr.mode();                                        \
+    assert(is_imm_in_range(addr.disp(), 7, align_bits), "offset is out of range");                                 \
+    emit_int32(opc << 30 | 0b101 << 27 | V << 26 | mode_encoding << 23 | L << 22 |                                 \
+       encode_imm(addr.disp(), 7, align_bits, 15) | rt2->encoding() << 10 |                                        \
+       addr.base()->encoding_with_sp() << 5 | rt->encoding());                                                     \
+  }
+
+  F(stp_s, 0b00, 1, 0)
+  F(ldp_s, 0b00, 1, 1)
+  F(stp_d, 0b01, 1, 0)
+  F(ldp_d, 0b01, 1, 1)
+  F(stp_q, 0b10, 1, 0)
+  F(ldp_q, 0b10, 1, 1)
+#undef F
+
+  // Data processing instructions
+
+#define F(mnemonic, sf, opc) /* Logical-immediate emitters with three overloads: LogicalImmediate, uintx, and unsigned int (widened to uintx). rd uses the ZR encoding when opc == 0b11, the SP encoding otherwise */ \
+  void mnemonic(Register rd, Register rn, const LogicalImmediate& imm) {                      \
+    assert (imm.is_encoded(), "illegal immediate for logical instruction");                   \
+    assert (imm.is32bit() == (sf == 0), "immediate size does not match instruction size");    \
+    emit_int32(sf << 31 | opc << 29 | 0b100100 << 23 | imm.immN() << 22 | imm.immr() << 16 |  \
+        imm.imms() << 10 | rn->encoding_with_zr() << 5 |                                      \
+        ((opc == 0b11) ? rd->encoding_with_zr() : rd->encoding_with_sp()));                   \
+  }                                                                                           \
+  void mnemonic(Register rd, Register rn, uintx imm) {                                        \
+    LogicalImmediate limm(imm, (sf == 0));                                                    \
+    mnemonic(rd, rn, limm);                                                                   \
+  }                                                                                           \
+  void mnemonic(Register rd, Register rn, unsigned int imm) {                                 \
+    mnemonic(rd, rn, (uintx)imm);                                                             \
+  }
+
+  F(andr_w, 0, 0b00)
+  F(orr_w,  0, 0b01)
+  F(eor_w,  0, 0b10)
+  F(ands_w, 0, 0b11)
+
+  F(andr, 1, 0b00)
+  F(orr,  1, 0b01)
+  F(eor,  1, 0b10)
+  F(ands, 1, 0b11)
+#undef F
+
+  void tst(Register rn, unsigned int imm) {  // Alias: ands of rn and imm with ZR as the destination
+    ands(ZR, rn, imm);
+  }
+
+  void tst_w(Register rn, unsigned int imm) {  // Alias: ands_w of rn and imm with ZR as the destination
+    ands_w(ZR, rn, imm);
+  }
+
+#define F(mnemonic, sf, opc, N) /* Logical emitters taking an AsmOperand; N == 1 gives bic/orn/eon/bics, which share opc with and/orr/eor/ands. Shift amount must fit in 5 bits (6 when sf == 1) */ \
+  void mnemonic(Register rd, Register rn, AsmOperand operand) { \
+    assert (operand.shift_imm() >> (5 + sf) == 0, "shift amount is too large");          \
+    emit_int32(sf << 31 | opc << 29 | 0b01010 << 24 | operand.shift() << 22 | N << 21 |  \
+        operand.reg()->encoding_with_zr() << 16 | operand.shift_imm() << 10 |            \
+        rn->encoding_with_zr() << 5 | rd->encoding_with_zr());                           \
+  }
+
+  F(andr_w, 0, 0b00, 0)
+  F(bic_w,  0, 0b00, 1)
+  F(orr_w,  0, 0b01, 0)
+  F(orn_w,  0, 0b01, 1)
+  F(eor_w,  0, 0b10, 0)
+  F(eon_w,  0, 0b10, 1)
+  F(ands_w, 0, 0b11, 0)
+  F(bics_w, 0, 0b11, 1)
+
+  F(andr, 1, 0b00, 0)
+  F(bic,  1, 0b00, 1)
+  F(orr,  1, 0b01, 0)
+  F(orn,  1, 0b01, 1)
+  F(eor,  1, 0b10, 0)
+  F(eon,  1, 0b10, 1)
+  F(ands, 1, 0b11, 0)
+  F(bics, 1, 0b11, 1)
+#undef F
+
+  void tst(Register rn, AsmOperand operand) {  // Alias: ands of rn and operand with ZR as the destination
+    ands(ZR, rn, operand);
+  }
+
+  void tst_w(Register rn, AsmOperand operand) {  // Alias: ands_w of rn and operand with ZR as the destination
+    ands_w(ZR, rn, operand);
+  }
+
+  void mvn(Register rd, AsmOperand operand) {  // Alias: orn with ZR as the first source
+    orn(rd, ZR, operand);
+  }
+
+  void mvn_w(Register rd, AsmOperand operand) {  // Alias: orn_w with ZR as the first source
+    orn_w(rd, ZR, operand);
+  }
+
+#define F(mnemonic, sf, op, S) /* add/sub family, five overloads each: ArithmeticImmediate, int, int with AsmShift12, extended register (shift 0..4), and AsmOperand (ror rejected). In the immediate and extended forms rd uses the ZR encoding when S == 1, the SP encoding otherwise */ \
+  void mnemonic(Register rd, Register rn, const ArithmeticImmediate& imm) {                       \
+    assert(imm.is_encoded(), "immediate is out of range");                                        \
+    emit_int32(sf << 31 | op << 30 | S << 29 | 0b10001 << 24 | imm.shift() << 22 |                \
+        imm.imm() << 10 | rn->encoding_with_sp() << 5 |                                           \
+        (S == 1 ? rd->encoding_with_zr() : rd->encoding_with_sp()));                              \
+  }                                                                                               \
+  void mnemonic(Register rd, Register rn, int imm) {                                              \
+    mnemonic(rd, rn, ArithmeticImmediate(imm));                                                   \
+  }                                                                                               \
+  void mnemonic(Register rd, Register rn, int imm, AsmShift12 shift) {                            \
+    mnemonic(rd, rn, ArithmeticImmediate(imm, shift));                                            \
+  }                                                                                               \
+  void mnemonic(Register rd, Register rn, Register rm, AsmExtendOp extend, int shift_imm = 0) {   \
+    assert ((0 <= shift_imm) && (shift_imm <= 4), "shift amount is out of range");                \
+    emit_int32(sf << 31 | op << 30 | S << 29 | 0b01011001 << 21 | rm->encoding_with_zr() << 16 |  \
+        extend << 13 | shift_imm << 10 | rn->encoding_with_sp() << 5 |                            \
+        (S == 1 ? rd->encoding_with_zr() : rd->encoding_with_sp()));                              \
+  }                                                                                               \
+  void mnemonic(Register rd, Register rn, AsmOperand operand) {                                   \
+    assert (operand.shift() != ror, "illegal shift type");                                        \
+    assert (operand.shift_imm() >> (5 + sf) == 0, "shift amount is too large");                   \
+    emit_int32(sf << 31 | op << 30 | S << 29 | 0b01011 << 24 | operand.shift() << 22 |            \
+        operand.reg()->encoding_with_zr() << 16 | operand.shift_imm() << 10 |                     \
+        rn->encoding_with_zr() << 5 | rd->encoding_with_zr());                                    \
+  }
+
+  F(add_w,  0, 0, 0)
+  F(adds_w, 0, 0, 1)
+  F(sub_w,  0, 1, 0)
+  F(subs_w, 0, 1, 1)
+
+  F(add,    1, 0, 0)
+  F(adds,   1, 0, 1)
+  F(sub,    1, 1, 0)
+  F(subs,   1, 1, 1)
+#undef F
+
+  // Register move: add with immediate 0 when either register is SP,
+  // otherwise orr with ZR as the first source.
+  void mov(Register rd, Register rm) {
+    const bool uses_sp = (rd == SP) || (rm == SP);
+    if (uses_sp) { add(rd, rm, 0); return; }
+    orr(rd, ZR, rm);
+  }
+
+  // 32-bit register move: add_w with immediate 0 when either register is SP,
+  // otherwise orr_w with ZR as the first source.
+  void mov_w(Register rd, Register rm) {
+    const bool uses_sp = (rd == SP) || (rm == SP);
+    if (uses_sp) { add_w(rd, rm, 0); return; }
+    orr_w(rd, ZR, rm);
+  }
+
+  void cmp(Register rn, int imm) {  // Alias: subs of rn and imm with ZR as the destination
+    subs(ZR, rn, imm);
+  }
+
+  void cmp_w(Register rn, int imm) {  // Alias: subs_w of rn and imm with ZR as the destination
+    subs_w(ZR, rn, imm);
+  }
+
+  // Compares rn with rm via subs into ZR. An SP first operand goes through the
+  // four-argument overload with ex_uxtx; any other rn uses the three-argument call.
+  void cmp(Register rn, Register rm) {
+    assert (rm != SP, "SP should not be used as the 2nd operand of cmp");
+    const bool sp_lhs = (rn == SP);
+    if (!sp_lhs) { subs(ZR, rn, rm); return; }
+    subs(ZR, rn, rm, ex_uxtx);
+  }
+
+  void cmp_w(Register rn, Register rm) {  // Alias: subs_w into ZR; SP is rejected for both operands
+    assert ((rn != SP) && (rm != SP), "SP should not be used in 32-bit cmp");
+    subs_w(ZR, rn, rm);
+  }
+
+  void cmp(Register rn, AsmOperand operand) {  // Alias: subs of rn and operand into ZR; rn must not be SP
+    assert (rn != SP, "SP is not allowed in cmp with shifted register (AsmOperand)");
+    subs(ZR, rn, operand);
+  }
+
+  void cmn(Register rn, int imm) {  // Alias: adds of rn and imm with ZR as the destination
+    adds(ZR, rn, imm);
+  }
+
+  void cmn_w(Register rn, int imm) {  // Alias: adds_w of rn and imm with ZR as the destination
+    adds_w(ZR, rn, imm);
+  }
+
+  void cmn(Register rn, Register rm) {  // adds into ZR; an SP first operand takes the ex_uxtx overload
+    assert (rm != SP, "SP should not be used as the 2nd operand of cmn");
+    if (rn == SP) {
+      adds(ZR, rn, rm, ex_uxtx);
+    } else {
+      adds(ZR, rn, rm);
+    }
+  }
+
+  void cmn_w(Register rn, Register rm) {  // adds_w into ZR; SP is rejected for both operands
+    assert ((rn != SP) && (rm != SP), "SP should not be used in 32-bit cmn");
+    adds_w(ZR, rn, rm);
+  }
+
+  void neg(Register rd, Register rm) {  // Alias: sub with ZR as the first source
+    sub(rd, ZR, rm);
+  }
+
+  void neg_w(Register rd, Register rm) {  // Alias: sub_w with ZR as the first source
+    sub_w(rd, ZR, rm);
+  }
+
+#define F(mnemonic, sf, op, S) /* adc/sbc family emitters (rd, rn, rm); every register field uses the ZR encoding */ \
+  void mnemonic(Register rd, Register rn, Register rm) { \
+    emit_int32(sf << 31 | op << 30 | S << 29 | 0b11010000 << 21 | rm->encoding_with_zr() << 16 |   \
+        rn->encoding_with_zr() << 5 | rd->encoding_with_zr());                                     \
+  }
+
+  F(adc_w,  0, 0, 0)
+  F(adcs_w, 0, 0, 1)
+  F(sbc_w,  0, 1, 0)
+  F(sbcs_w, 0, 1, 1)
+
+  F(adc,    1, 0, 0)
+  F(adcs,   1, 0, 1)
+  F(sbc,    1, 1, 0)
+  F(sbcs,   1, 1, 1)
+#undef F
+
+#define F(mnemonic, sf, N) /* extr emitters: lsb is placed at bit 10 and must fit in 5 bits (6 when sf == 1) */ \
+  void mnemonic(Register rd, Register rn, Register rm, int lsb) { \
+    assert ((lsb >> (5 + sf)) == 0, "illegal least significant bit position");        \
+    emit_int32(sf << 31 | 0b100111 << 23 | N << 22 | rm->encoding_with_zr() << 16 |   \
+        lsb << 10 | rn->encoding_with_zr() << 5 | rd->encoding_with_zr());            \
+  }
+
+  F(extr_w,  0, 0)
+  F(extr,    1, 1)
+#undef F
+
+#define F(mnemonic, sf, opc) /* movn/movz/movk emitters: imm must fit in 16 bits; shift must be a multiple of 16 below 32 (sf == 0) or 64 (sf == 1) and is encoded as shift >> 4 */ \
+  void mnemonic(Register rd, int imm, int shift) { \
+    assert ((imm >> 16) == 0, "immediate is out of range");                       \
+    assert (((shift & 0xf) == 0) && ((shift >> (5 + sf)) == 0), "invalid shift"); \
+    emit_int32(sf << 31 | opc << 29 | 0b100101 << 23 | (shift >> 4) << 21 |       \
+        imm << 5 | rd->encoding_with_zr());                                       \
+  }
+
+  F(movn_w,  0, 0b00)
+  F(movz_w,  0, 0b10)
+  F(movk_w,  0, 0b11)
+  F(movn,    1, 0b00)
+  F(movz,    1, 0b10)
+  F(movk,    1, 0b11)
+#undef F
+
+  void mov(Register rd, int imm) {  // movz with shift 0; imm must fit in 16 bits
+    assert ((imm >> 16) == 0, "immediate is out of range");
+    movz(rd, imm, 0);
+  }
+
+  void mov_w(Register rd, int imm) {  // movz_w with shift 0; imm must fit in 16 bits
+    assert ((imm >> 16) == 0, "immediate is out of range");
+    movz_w(rd, imm, 0);
+  }
+
+#define F(mnemonic, sf, op, S) /* ccmn/ccmp against an immediate that must fit in 5 bits; nzcv must fit in 4 bits and fills the low bits of the word */ \
+  void mnemonic(Register rn, int imm, int nzcv, AsmCondition cond) { \
+    assert ((imm >> 5) == 0, "immediate is out of range");                      \
+    assert ((nzcv >> 4) == 0, "illegal nzcv");                                  \
+    emit_int32(sf << 31 | op << 30 | S << 29 | 0b11010010 << 21 | imm << 16 |   \
+         cond << 12 | 1 << 11 | rn->encoding_with_zr() << 5 | nzcv);            \
+  }
+
+  F(ccmn_w, 0, 0, 1)
+  F(ccmp_w, 0, 1, 1)
+  F(ccmn,   1, 0, 1)
+  F(ccmp,   1, 1, 1)
+#undef F
+
+#define F(mnemonic, sf, op, S) /* ccmn/ccmp of rn against register rm; nzcv must fit in 4 bits and fills the low bits of the word */ \
+  void mnemonic(Register rn, Register rm, int nzcv, AsmCondition cond) { \
+    assert ((nzcv >> 4) == 0, "illegal nzcv");                                                    \
+    emit_int32(sf << 31 | op << 30 | S << 29 | 0b11010010 << 21 | rm->encoding_with_zr() << 16 |  \
+        cond << 12 | rn->encoding_with_zr() << 5 | nzcv);                                         \
+  }
+
+  F(ccmn_w, 0, 0, 1)
+  F(ccmp_w, 0, 1, 1)
+  F(ccmn,   1, 0, 1)
+  F(ccmp,   1, 1, 1)
+#undef F
+
+#define F(mnemonic, sf, op, S, op2) /* csel family: the (op, op2) pair distinguishes csel/csinc/csinv/csneg; cond is placed at bit 12 */ \
+  void mnemonic(Register rd, Register rn, Register rm, AsmCondition cond) { \
+    emit_int32(sf << 31 | op << 30 | S << 29 | 0b11010100 << 21 | rm->encoding_with_zr() << 16 |  \
+        cond << 12 | op2 << 10 | rn->encoding_with_zr() << 5 | rd->encoding_with_zr());           \
+  }
+
+  F(csel_w,  0, 0, 0, 0b00)
+  F(csinc_w, 0, 0, 0, 0b01)
+  F(csinv_w, 0, 1, 0, 0b00)
+  F(csneg_w, 0, 1, 0, 0b01)
+
+  F(csel,    1, 0, 0, 0b00)
+  F(csinc,   1, 0, 0, 0b01)
+  F(csinv,   1, 1, 0, 0b00)
+  F(csneg,   1, 1, 0, 0b01)
+#undef F
+
+  void cset(Register rd, AsmCondition cond) {  // Alias: csinc of ZR, ZR under the inverted condition
+    csinc(rd, ZR, ZR, inverse(cond));
+  }
+
+  void cset_w(Register rd, AsmCondition cond) {  // Alias: csinc_w of ZR, ZR under the inverted condition
+    csinc_w(rd, ZR, ZR, inverse(cond));
+  }
+
+  void csetm(Register rd, AsmCondition cond) {  // Alias: csinv of ZR, ZR under the inverted condition
+    csinv(rd, ZR, ZR, inverse(cond));
+  }
+
+  void csetm_w(Register rd, AsmCondition cond) {  // Alias: csinv_w of ZR, ZR under the inverted condition
+    csinv_w(rd, ZR, ZR, inverse(cond));
+  }
+
+  void cinc(Register rd, Register rn, AsmCondition cond) {  // Alias: csinc with rn as both sources, under the inverted condition
+    csinc(rd, rn, rn, inverse(cond));
+  }
+
+  void cinc_w(Register rd, Register rn, AsmCondition cond) {  // Alias: csinc_w with rn as both sources, under the inverted condition
+    csinc_w(rd, rn, rn, inverse(cond));
+  }
+
+  void cinv(Register rd, Register rn, AsmCondition cond) {  // Alias: csinv with rn as both sources, under the inverted condition
+    csinv(rd, rn, rn, inverse(cond));
+  }
+
+  void cinv_w(Register rd, Register rn, AsmCondition cond) {  // Alias: csinv_w with rn as both sources, under the inverted condition
+    csinv_w(rd, rn, rn, inverse(cond));
+  }
+
+#define F(mnemonic, sf, S, opcode) /* Two-operand (rd, rn) emitters for rbit/rev16/rev32/rev/clz/cls; opcode is placed at bit 10 and bit 30 is always set */ \
+  void mnemonic(Register rd, Register rn) { \
+    emit_int32(sf << 31 | 1 << 30 | S << 29 | 0b11010110 << 21 | opcode << 10 |  \
+        rn->encoding_with_zr() << 5 | rd->encoding_with_zr());                   \
+  }
+
+  F(rbit_w,  0, 0, 0b000000)
+  F(rev16_w, 0, 0, 0b000001)
+  F(rev_w,   0, 0, 0b000010)
+  F(clz_w,   0, 0, 0b000100)
+  F(cls_w,   0, 0, 0b000101)
+
+  F(rbit,    1, 0, 0b000000)
+  F(rev16,   1, 0, 0b000001)
+  F(rev32,   1, 0, 0b000010)
+  F(rev,     1, 0, 0b000011)
+  F(clz,     1, 0, 0b000100)
+  F(cls,     1, 0, 0b000101)
+#undef F
+
+#define F(mnemonic, sf, S, opcode) /* Three-operand (rd, rn, rm) emitters for udiv/sdiv and lslv/lsrv/asrv/rorv; opcode is placed at bit 10 */ \
+  void mnemonic(Register rd, Register rn, Register rm) { \
+    emit_int32(sf << 31 | S << 29 | 0b11010110 << 21 | rm->encoding_with_zr() << 16 |  \
+        opcode << 10 | rn->encoding_with_zr() << 5 | rd->encoding_with_zr());          \
+  }
+
+  F(udiv_w,  0, 0, 0b000010)
+  F(sdiv_w,  0, 0, 0b000011)
+  F(lslv_w,  0, 0, 0b001000)
+  F(lsrv_w,  0, 0, 0b001001)
+  F(asrv_w,  0, 0, 0b001010)
+  F(rorv_w,  0, 0, 0b001011)
+
+  F(udiv,    1, 0, 0b000010)
+  F(sdiv,    1, 0, 0b000011)
+  F(lslv,    1, 0, 0b001000)
+  F(lsrv,    1, 0, 0b001001)
+  F(asrv,    1, 0, 0b001010)
+  F(rorv,    1, 0, 0b001011)
+#undef F
+
+#define F(mnemonic, sf, op31, o0) /* Four-operand (rd, rn, rm, ra) emitters for the madd/msub family; op31 is placed at bit 21 and o0 at bit 15 */ \
+  void mnemonic(Register rd, Register rn, Register rm, Register ra) { \
+    emit_int32(sf << 31 | 0b11011 << 24 | op31 << 21 | rm->encoding_with_zr() << 16 |                     \
+        o0 << 15 | ra->encoding_with_zr() << 10 | rn->encoding_with_zr() << 5 | rd->encoding_with_zr());  \
+  }
+
+  F(madd_w,  0, 0b000, 0)
+  F(msub_w,  0, 0b000, 1)
+  F(madd,    1, 0b000, 0)
+  F(msub,    1, 0b000, 1)
+
+  F(smaddl,  1, 0b001, 0)
+  F(smsubl,  1, 0b001, 1)
+  F(umaddl,  1, 0b101, 0)
+  F(umsubl,  1, 0b101, 1)
+#undef F
+
+  void mul(Register rd, Register rn, Register rm) {  // Alias: madd with ZR passed as ra
+      madd(rd, rn, rm, ZR);
+  }
+
+  void mul_w(Register rd, Register rn, Register rm) {  // Alias: madd_w with ZR passed as ra
+      madd_w(rd, rn, rm, ZR);
+  }
+
+#define F(mnemonic, sf, op31, o0) /* smulh/umulh emitters (rd, rn, rm); bits 14:10 of the word are fixed to 0b11111 */ \
+  void mnemonic(Register rd, Register rn, Register rm) { \
+    emit_int32(sf << 31 | 0b11011 << 24 | op31 << 21 | rm->encoding_with_zr() << 16 |      \
+        o0 << 15 | 0b11111 << 10 | rn->encoding_with_zr() << 5 | rd->encoding_with_zr());  \
+  }
+
+  F(smulh,   1, 0b010, 0)
+  F(umulh,   1, 0b110, 0)
+#undef F
+
+#define F(mnemonic, op) /* adr/adrp emitters. op == 0: offset is addr - pc(); op == 1: (addr - (pc() with its low 12 bits cleared)) >> 12. The offset must pass the 21-bit range check; its low 2 bits go to bit 29, the rest to bit 5 */ \
+  void mnemonic(Register rd, address addr) { \
+    intx offset;                                                        \
+    if (op == 0) {                                                      \
+      offset = addr - pc();                                             \
+    } else {                                                            \
+      offset = (((intx)addr) - (((intx)pc()) & ~0xfff)) >> 12;          \
+    }                                                                   \
+    assert (is_imm_in_range(offset, 21, 0), "offset is out of range");  \
+    emit_int32(op << 31 | (offset & 3) << 29 | 0b10000 << 24 |          \
+        encode_imm(offset >> 2, 19, 0, 5) | rd->encoding_with_zr());    \
+  }                                                                     \
+
+  F(adr,   0)
+  F(adrp,  1)
+#undef F
+
+  void adr(Register rd, Label& L) {  // Resolves L through target() and delegates to the address overload
+    adr(rd, target(L));
+  }
+
+#define F(mnemonic, sf, opc, N) /* sbfm/bfm/ubfm emitters; immr and imms must each fit in 5 bits (6 when sf == 1) */ \
+  void mnemonic(Register rd, Register rn, int immr, int imms) {                \
+    assert ((immr >> (5 + sf)) == 0, "immr is out of range");                  \
+    assert ((imms >> (5 + sf)) == 0, "imms is out of range");                  \
+    emit_int32(sf << 31 | opc << 29 | 0b100110 << 23 | N << 22 | immr << 16 |  \
+        imms << 10 | rn->encoding_with_zr() << 5 | rd->encoding_with_zr());    \
+  }
+
+  F(sbfm_w, 0, 0b00, 0)
+  F(bfm_w,  0, 0b01, 0)
+  F(ubfm_w, 0, 0b10, 0)
+
+  F(sbfm, 1, 0b00, 1)
+  F(bfm,  1, 0b01, 1)
+  F(ubfm, 1, 0b10, 1)
+#undef F
+
+#define F(alias, mnemonic, sf, immr, imms) /* Bitfield aliases taking (lsb, width); the immr/imms arguments are expressions over lsb and width that are evaluated inside the generated method */ \
+  void alias(Register rd, Register rn, int lsb, int width) {                        \
+    assert ((lsb >> (5 + sf)) == 0, "lsb is out of range");                         \
+    assert ((1 <= width) && (width <= (32 << sf) - lsb), "width is out of range");  \
+    mnemonic(rd, rn, immr, imms);                                                   \
+  }
+
+  F(bfi_w,   bfm_w,  0, (-lsb) & 0x1f, width - 1)
+  F(bfi,     bfm,    1, (-lsb) & 0x3f, width - 1)
+  F(bfxil_w, bfm_w,  0, lsb,           lsb + width - 1)
+  F(bfxil,   bfm,    1, lsb,           lsb + width - 1)
+  F(sbfiz_w, sbfm_w, 0, (-lsb) & 0x1f, width - 1)
+  F(sbfiz,   sbfm,   1, (-lsb) & 0x3f, width - 1)
+  F(sbfx_w,  sbfm_w, 0, lsb,           lsb + width - 1)
+  F(sbfx,    sbfm,   1, lsb,           lsb + width - 1)
+  F(ubfiz_w, ubfm_w, 0, (-lsb) & 0x1f, width - 1)
+  F(ubfiz,   ubfm,   1, (-lsb) & 0x3f, width - 1)
+  F(ubfx_w,  ubfm_w, 0, lsb,           lsb + width - 1)
+  F(ubfx,    ubfm,   1, lsb,           lsb + width - 1)
+#undef F
+
+#define F(alias, mnemonic, sf, immr, imms) /* Immediate-shift aliases over sbfm/ubfm; shift must fit in 5 bits (6 when sf == 1), and immr/imms are expressions over shift evaluated inside the generated method */ \
+  void alias(Register rd, Register rn, int shift) {              \
+    assert ((shift >> (5 + sf)) == 0, "shift is out of range");  \
+    mnemonic(rd, rn, immr, imms);                                \
+  }
+
+  F(_asr_w, sbfm_w, 0, shift, 31)
+  F(_asr,   sbfm,   1, shift, 63)
+  F(_lsl_w, ubfm_w, 0, (-shift) & 0x1f, 31 - shift)
+  F(_lsl,   ubfm,   1, (-shift) & 0x3f, 63 - shift)
+  F(_lsr_w, ubfm_w, 0, shift, 31)
+  F(_lsr,   ubfm,   1, shift, 63)
+#undef F
+
+#define F(alias, mnemonic, immr, imms) /* sxt*/uxt* aliases: sbfm/ubfm with immr 0 and imms 7, 15 or 31 */ \
+  void alias(Register rd, Register rn) {   \
+    mnemonic(rd, rn, immr, imms);          \
+  }
+
+  F(sxtb_w, sbfm_w, 0, 7)
+  F(sxtb,   sbfm,   0, 7)
+  F(sxth_w, sbfm_w, 0, 15)
+  F(sxth,   sbfm,   0, 15)
+  F(sxtw,   sbfm,   0, 31)
+  F(uxtb_w, ubfm_w, 0, 7)
+  F(uxtb,   ubfm,   0, 7)
+  F(uxth_w, ubfm_w, 0, 15)
+  F(uxth,   ubfm,   0, 15)
+#undef F
+
+  // Branch instructions
+
+#define F(mnemonic, op) /* br/blr/ret emitters taking the target in rn; op is placed at bit 21 */ \
+  void mnemonic(Register rn) {                                                             \
+    emit_int32(0b1101011 << 25 | op << 21 | 0b11111 << 16 | rn->encoding_with_zr() << 5);  \
+  }
+
+  F(br,  0b00)
+  F(blr, 0b01)
+  F(ret, 0b10)
+#undef F
+
+  void ret() {  // ret through LR
+    ret(LR);
+  }
+
+#define F(mnemonic, op) /* b/bl emitters: offset is target - pc() and must pass is_offset_in_range(offset, 26) */ \
+  void mnemonic(address target) {                                         \
+    intx offset = target - pc();                                          \
+    assert (is_offset_in_range(offset, 26), "offset is out of range");    \
+    emit_int32(op << 31 | 0b00101 << 26 | encode_offset(offset, 26, 0));  \
+  }
+
+  F(b,  0)
+  F(bl, 1)
+#undef F
+
+  void b(address target, AsmCondition cond) {  // `al` is routed to the unconditional b(target)
+    if (cond == al) {
+      b(target);
+      return;
+    }
+    intx offset = target - pc();  // pc-relative distance, checked against the 19-bit range below
+    assert (is_offset_in_range(offset, 19), "offset is out of range");
+    emit_int32(0b0101010 << 25 | encode_offset(offset, 19, 5) | cond);
+  }
+
+
+#define F(mnemonic, sf, op) /* cbz/cbnz emitters: branch on rt to target; the offset must pass is_offset_in_range(offset, 19) */ \
+  void mnemonic(Register rt, address target) {                          \
+    intx offset = target - pc();                                        \
+    assert (is_offset_in_range(offset, 19), "offset is out of range");  \
+    emit_int32(sf << 31 | 0b011010 << 25 | op << 24 | encode_offset(offset, 19, 5) | rt->encoding_with_zr()); \
+  }                                                                     \
+
+  F(cbz_w,  0, 0)
+  F(cbnz_w, 0, 1)
+  F(cbz,    1, 0)
+  F(cbnz,   1, 1)
+#undef F
+
+#define F(mnemonic, op) /* tbz/tbnz emitters: bit (0..63) is split so that bit 5 goes to word bit 31 and bits 4:0 go to word bit 19; the offset must pass is_offset_in_range(offset, 14) */ \
+  void mnemonic(Register rt, int bit, address target) {                 \
+    intx offset = target - pc();                                        \
+    assert (is_offset_in_range(offset, 14), "offset is out of range");  \
+    assert (0 <= bit && bit < 64, "bit number is out of range");        \
+    emit_int32((bit >> 5) << 31 | 0b011011 << 25 | op << 24 | (bit & 0x1f) << 19 | \
+        encode_offset(offset, 14, 5) | rt->encoding_with_zr());         \
+  }                                                                     \
+
+  F(tbz,  0)
+  F(tbnz, 1)
+#undef F
+
+  // System instructions
+
+  enum DMB_Opt {  // 4-bit option values accepted by the dsb/dmb emitters
+    DMB_ld  = 0b1101,
+    DMB_st  = 0b1110,
+    DMB_all = 0b1111
+  };
+
+#define F(mnemonic, L, op0, op1, CRn, op2, Rt) /* dsb/dmb emitters; the DMB_Opt value is placed at bit 8, the other fields are fixed per mnemonic */ \
+  void mnemonic(DMB_Opt option) {                                       \
+    emit_int32(0b1101010100 << 22 | L << 21 | op0 << 19 | op1 << 16 |   \
+        CRn << 12 | option << 8 | op2 << 5 | Rt);                       \
+  }
+
+  F(dsb,  0, 0b00, 0b011, 0b0011, 0b100, 0b11111)
+  F(dmb,  0, 0b00, 0b011, 0b0011, 0b101, 0b11111)
+#undef F
+
+#define F(mnemonic, L, op0, op1, CRn, Rt) /* hint emitter; imm must fit in 7 bits and is placed at bit 5 */ \
+  void mnemonic(int imm) {                                              \
+    assert ((imm >> 7) == 0, "immediate is out of range");              \
+    emit_int32(0b1101010100 << 22 | L << 21 | op0 << 19 | op1 << 16 |   \
+        CRn << 12 | imm << 5 | Rt);                                     \
+  }
+
+  F(hint, 0, 0b00, 0b011, 0b0010, 0b11111)
+#undef F
+
+  void nop() {  // hint with immediate 0
+    hint(0);
+  }
+
+  void yield() {  // hint with immediate 1
+    hint(1);
+  }
+
+#define F(mnemonic, opc, op2, LL) /* brk/hlt emitters; imm defaults to 0, must fit in 16 bits and is placed at bit 5 */ \
+  void mnemonic(int imm = 0) {                                           \
+    assert ((imm >> 16) == 0, "immediate is out of range");              \
+    emit_int32(0b11010100 << 24 | opc << 21 | imm << 5 | op2 << 2 | LL); \
+  }
+
+  F(brk, 0b001, 0b000, 0b00)
+  F(hlt, 0b010, 0b000, 0b00)
+#undef F
+
+  enum SystemRegister { // 15-bit selector used by mrs/msr, laid out as o0<1> op1<3> CRn<4> CRm<4> op2<3>
+    SysReg_NZCV = 0b101101000010000,
+    SysReg_FPCR = 0b101101000100000,
+  };
+
+  void mrs(Register rt, SystemRegister systemReg) {  // systemReg must fit in 15 bits; it is placed at bit 5 above rt
+    assert ((systemReg >> 15) == 0, "systemReg is out of range");
+    emit_int32(0b110101010011 << 20 | systemReg << 5 | rt->encoding_with_zr());
+  }
+
+  void msr(SystemRegister systemReg, Register rt) {  // systemReg must fit in 15 bits; it is placed at bit 5 above rt
+    assert ((systemReg >> 15) == 0, "systemReg is out of range");
+    emit_int32(0b110101010001 << 20 | systemReg << 5 | rt->encoding_with_zr());
+  }
+
+  // Floating-point instructions
+
+#define F(mnemonic, M, S, type, opcode2) /* FP compare of rn with rm; type is placed at bit 22 and opcode2 fills the low five bits of the word */ \
+  void mnemonic(FloatRegister rn, FloatRegister rm) {                         \
+    emit_int32(M << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |     \
+        rm->encoding() << 16 | 0b1000 << 10 | rn->encoding() << 5 | opcode2); \
+  }
+
+  F(fcmp_s,   0, 0, 0b00, 0b00000)
+  F(fcmpe_s,  0, 0, 0b00, 0b10000)  // same opcode2 as fcmpe_d; 0b01000 is the compare-with-zero value used by fcmp0_s
+  F(fcmp_d,   0, 0, 0b01, 0b00000)
+  F(fcmpe_d,  0, 0, 0b01, 0b10000)
+#undef F
+
+#define F(mnemonic, M, S, type, opcode2) /* fcmp0/fcmpe0 emitters: compare forms taking only rn; the rm field of the word stays zero */ \
+  void mnemonic(FloatRegister rn) {                                           \
+    emit_int32(M << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |     \
+        0b1000 << 10 | rn->encoding() << 5 | opcode2);                        \
+  }
+
+  F(fcmp0_s,   0, 0, 0b00, 0b01000)
+  F(fcmpe0_s,  0, 0, 0b00, 0b11000)
+  F(fcmp0_d,   0, 0, 0b01, 0b01000)
+  F(fcmpe0_d,  0, 0, 0b01, 0b11000)
+#undef F
+
+#define F(mnemonic, M, S, type, op) /* fccmp/fccmpe emitters; op (bit 4) is 1 for the fccmpe variants, and nzcv must fit in 4 bits */ \
+  void mnemonic(FloatRegister rn, FloatRegister rm, int nzcv, AsmCondition cond) { \
+    assert ((nzcv >> 4) == 0, "illegal nzcv");                                                  \
+    emit_int32(M << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |                       \
+        rm->encoding() << 16 | cond << 12 | 0b01 << 10 | rn->encoding() << 5 | op << 4 | nzcv); \
+  }
+
+  F(fccmp_s,   0, 0, 0b00, 0)
+  F(fccmpe_s,  0, 0, 0b00, 1)
+  F(fccmp_d,   0, 0, 0b01, 0)
+  F(fccmpe_d,  0, 0, 0b01, 1)
+#undef F
+
+#define F(mnemonic, M, S, type) /* fcsel emitters over FloatRegister rd, rn, rm; cond is placed at bit 12 */ \
+  void mnemonic(FloatRegister rd, FloatRegister rn, FloatRegister rm, AsmCondition cond) { \
+    emit_int32(M << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |                       \
+        rm->encoding() << 16 | cond << 12 | 0b11 << 10 | rn->encoding() << 5 | rd->encoding()); \
+  }
+
+  F(fcsel_s,   0, 0, 0b00)
+  F(fcsel_d,   0, 0, 0b01)
+#undef F
+
+#define F(mnemonic, M, S, type, opcode) /* FP one-source emitters (rd, rn); type is placed at bit 22 and opcode at bit 15 */ \
+  void mnemonic(FloatRegister rd, FloatRegister rn) { \
+    emit_int32(M << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |      \
+        opcode << 15 | 0b10000 << 10 | rn->encoding() << 5 | rd->encoding());  \
+  }
+
+  F(fmov_s,   0, 0, 0b00, 0b000000)
+  F(fabs_s,   0, 0, 0b00, 0b000001)
+  F(fneg_s,   0, 0, 0b00, 0b000010)
+  F(fsqrt_s,  0, 0, 0b00, 0b000011)
+  F(fcvt_ds,  0, 0, 0b00, 0b000101)
+  F(fcvt_hs,  0, 0, 0b00, 0b000111)
+  F(frintn_s, 0, 0, 0b00, 0b001000)
+  F(frintp_s, 0, 0, 0b00, 0b001001)
+  F(frintm_s, 0, 0, 0b00, 0b001010)
+  F(frintz_s, 0, 0, 0b00, 0b001011)
+  F(frinta_s, 0, 0, 0b00, 0b001100)
+  F(frintx_s, 0, 0, 0b00, 0b001110)
+  F(frinti_s, 0, 0, 0b00, 0b001111)
+
+  F(fmov_d,   0, 0, 0b01, 0b000000)
+  F(fabs_d,   0, 0, 0b01, 0b000001)
+  F(fneg_d,   0, 0, 0b01, 0b000010)
+  F(fsqrt_d,  0, 0, 0b01, 0b000011)
+  F(fcvt_sd,  0, 0, 0b01, 0b000100)
+  F(fcvt_hd,  0, 0, 0b01, 0b000111)
+  F(frintn_d, 0, 0, 0b01, 0b001000)
+  F(frintp_d, 0, 0, 0b01, 0b001001)
+  F(frintm_d, 0, 0, 0b01, 0b001010)
+  F(frintz_d, 0, 0, 0b01, 0b001011)
+  F(frinta_d, 0, 0, 0b01, 0b001100)
+  F(frintx_d, 0, 0, 0b01, 0b001110)
+  F(frinti_d, 0, 0, 0b01, 0b001111)
+
+  F(fcvt_sh,  0, 0, 0b11, 0b000100)
+  F(fcvt_dh,  0, 0, 0b11, 0b000101)
+#undef F
+
+#define F(mnemonic, M, S, type, opcode) /* two-source emitter (rd, rn, rm); the 4-bit opcode is placed at bit 12 */ \
+  void mnemonic(FloatRegister rd, FloatRegister rn, FloatRegister rm) { \
+    emit_int32(M << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |                          \
+        rm->encoding() << 16 | opcode << 12 | 0b10 << 10 | rn->encoding() << 5 | rd->encoding());  \
+  }
+
+  F(fmul_s,   0, 0, 0b00, 0b0000)  // rows with type 0b00
+  F(fdiv_s,   0, 0, 0b00, 0b0001)
+  F(fadd_s,   0, 0, 0b00, 0b0010)
+  F(fsub_s,   0, 0, 0b00, 0b0011)
+  F(fmax_s,   0, 0, 0b00, 0b0100)
+  F(fmin_s,   0, 0, 0b00, 0b0101)
+  F(fmaxnm_s, 0, 0, 0b00, 0b0110)
+  F(fminnm_s, 0, 0, 0b00, 0b0111)
+  F(fnmul_s,  0, 0, 0b00, 0b1000)
+
+  F(fmul_d,   0, 0, 0b01, 0b0000)  // rows with type 0b01; opcodes mirror the rows above
+  F(fdiv_d,   0, 0, 0b01, 0b0001)
+  F(fadd_d,   0, 0, 0b01, 0b0010)
+  F(fsub_d,   0, 0, 0b01, 0b0011)
+  F(fmax_d,   0, 0, 0b01, 0b0100)
+  F(fmin_d,   0, 0, 0b01, 0b0101)
+  F(fmaxnm_d, 0, 0, 0b01, 0b0110)
+  F(fminnm_d, 0, 0, 0b01, 0b0111)
+  F(fnmul_d,  0, 0, 0b01, 0b1000)
+#undef F
+
+#define F(mnemonic, M, S, type, o1, o0) /* three-source emitter (rd, rn, rm, ra); o1 at bit 21, o0 at bit 15, ra at bit 10 */ \
+  void mnemonic(FloatRegister rd, FloatRegister rn, FloatRegister rm, FloatRegister ra) { \
+    emit_int32(M << 31 | S << 29 | 0b11111 << 24 | type << 22 | o1 << 21 | rm->encoding() << 16 |  \
+         o0 << 15 | ra->encoding() << 10 | rn->encoding() << 5 | rd->encoding());                  \
+  }
+
+  F(fmadd_s,  0, 0, 0b00, 0, 0)  // rows with type 0b00; (o1, o0) selects among the four mnemonics
+  F(fmsub_s,  0, 0, 0b00, 0, 1)
+  F(fnmadd_s, 0, 0, 0b00, 1, 0)
+  F(fnmsub_s, 0, 0, 0b00, 1, 1)
+
+  F(fmadd_d,  0, 0, 0b01, 0, 0)  // rows with type 0b01
+  F(fmsub_d,  0, 0, 0b01, 0, 1)
+  F(fnmadd_d, 0, 0, 0b01, 1, 0)
+  F(fnmsub_d, 0, 0, 0b01, 1, 1)
+#undef F
+
+#define F(mnemonic, M, S, type) /* emitter taking rd and an 8-bit immediate field that is inserted verbatim at bit 13 */ \
+  void mnemonic(FloatRegister rd, int imm8) { \
+    assert ((imm8 >> 8) == 0, "immediate is out of range"); /* imm8 must fit in 8 bits */ \
+    emit_int32(M << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |  \
+         imm8 << 13 | 0b100 << 10 | rd->encoding());                       \
+  }
+
+  F(fmov_s, 0, 0, 0b00)  // (FloatRegister, int) overloads; no float-to-imm8 conversion is done here
+  F(fmov_d, 0, 0, 0b01)
+#undef F
+
+#define F(mnemonic, sf, S, type, rmode, opcode) /* FP source, general-register destination; rmode at bit 19, opcode at bit 16 */ \
+  void mnemonic(Register rd, FloatRegister rn) {                                     \
+    emit_int32(sf << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |           \
+         rmode << 19 | opcode << 16 | rn->encoding() << 5 | rd->encoding_with_zr()); \
+  }
+
+  F(fcvtns_ws, 0, 0, 0b00, 0b00, 0b000)  // *_ws rows: sf = 0, type 0b00
+  F(fcvtnu_ws, 0, 0, 0b00, 0b00, 0b001)
+  F(fcvtas_ws, 0, 0, 0b00, 0b00, 0b100)
+  F(fcvtau_ws, 0, 0, 0b00, 0b00, 0b101)
+  F(fmov_ws,   0, 0, 0b00, 0b00, 0b110)
+  F(fcvtps_ws, 0, 0, 0b00, 0b01, 0b000)
+  F(fcvtpu_ws, 0, 0, 0b00, 0b01, 0b001)
+  F(fcvtms_ws, 0, 0, 0b00, 0b10, 0b000)
+  F(fcvtmu_ws, 0, 0, 0b00, 0b10, 0b001)
+  F(fcvtzs_ws, 0, 0, 0b00, 0b11, 0b000)
+  F(fcvtzu_ws, 0, 0, 0b00, 0b11, 0b001)
+
+  F(fcvtns_wd, 0, 0, 0b01, 0b00, 0b000)  // *_wd rows: sf = 0, type 0b01
+  F(fcvtnu_wd, 0, 0, 0b01, 0b00, 0b001)
+  F(fcvtas_wd, 0, 0, 0b01, 0b00, 0b100)
+  F(fcvtau_wd, 0, 0, 0b01, 0b00, 0b101)
+  F(fcvtps_wd, 0, 0, 0b01, 0b01, 0b000)
+  F(fcvtpu_wd, 0, 0, 0b01, 0b01, 0b001)
+  F(fcvtms_wd, 0, 0, 0b01, 0b10, 0b000)
+  F(fcvtmu_wd, 0, 0, 0b01, 0b10, 0b001)
+  F(fcvtzs_wd, 0, 0, 0b01, 0b11, 0b000)
+  F(fcvtzu_wd, 0, 0, 0b01, 0b11, 0b001)
+
+  F(fcvtns_xs, 1, 0, 0b00, 0b00, 0b000)  // *_xs rows: sf = 1, type 0b00
+  F(fcvtnu_xs, 1, 0, 0b00, 0b00, 0b001)
+  F(fcvtas_xs, 1, 0, 0b00, 0b00, 0b100)
+  F(fcvtau_xs, 1, 0, 0b00, 0b00, 0b101)
+  F(fcvtps_xs, 1, 0, 0b00, 0b01, 0b000)
+  F(fcvtpu_xs, 1, 0, 0b00, 0b01, 0b001)
+  F(fcvtms_xs, 1, 0, 0b00, 0b10, 0b000)
+  F(fcvtmu_xs, 1, 0, 0b00, 0b10, 0b001)
+  F(fcvtzs_xs, 1, 0, 0b00, 0b11, 0b000)
+  F(fcvtzu_xs, 1, 0, 0b00, 0b11, 0b001)
+
+  F(fcvtns_xd, 1, 0, 0b01, 0b00, 0b000)  // *_xd rows: sf = 1, type 0b01
+  F(fcvtnu_xd, 1, 0, 0b01, 0b00, 0b001)
+  F(fcvtas_xd, 1, 0, 0b01, 0b00, 0b100)
+  F(fcvtau_xd, 1, 0, 0b01, 0b00, 0b101)
+  F(fmov_xd,   1, 0, 0b01, 0b00, 0b110)
+  F(fcvtps_xd, 1, 0, 0b01, 0b01, 0b000)
+  F(fcvtpu_xd, 1, 0, 0b01, 0b01, 0b001)
+  F(fcvtms_xd, 1, 0, 0b01, 0b10, 0b000)
+  F(fcvtmu_xd, 1, 0, 0b01, 0b10, 0b001)
+  F(fcvtzs_xd, 1, 0, 0b01, 0b11, 0b000)
+  F(fcvtzu_xd, 1, 0, 0b01, 0b11, 0b001)
+
+  F(fmov_xq,   1, 0, 0b10, 0b01, 0b110)  // the only row with type 0b10 (rmode 0b01)
+#undef F
+
+#define F(mnemonic, sf, S, type, rmode, opcode) /* general-register source, FP destination; same field layout as the table above */ \
+  void mnemonic(FloatRegister rd, Register rn) {                                     \
+    emit_int32(sf << 31 | S << 29 | 0b11110 << 24 | type << 22 | 1 << 21 |           \
+         rmode << 19 | opcode << 16 | rn->encoding_with_zr() << 5 | rd->encoding()); \
+  }
+
+  F(scvtf_sw,  0, 0, 0b00, 0b00, 0b010)  // *w rows: sf = 0
+  F(ucvtf_sw,  0, 0, 0b00, 0b00, 0b011)
+  F(fmov_sw,   0, 0, 0b00, 0b00, 0b111)
+  F(scvtf_dw,  0, 0, 0b01, 0b00, 0b010)
+  F(ucvtf_dw,  0, 0, 0b01, 0b00, 0b011)
+
+  F(scvtf_sx,  1, 0, 0b00, 0b00, 0b010)  // *x rows: sf = 1
+  F(ucvtf_sx,  1, 0, 0b00, 0b00, 0b011)
+  F(scvtf_dx,  1, 0, 0b01, 0b00, 0b010)
+  F(ucvtf_dx,  1, 0, 0b01, 0b00, 0b011)
+  F(fmov_dx,   1, 0, 0b01, 0b00, 0b111)
+
+  F(fmov_qx,   1, 0, 0b10, 0b01, 0b111)  // the only row with type 0b10 (rmode 0b01)
+#undef F
+
+#define F(mnemonic, opcode) /* two-register emitter; opcode supplies all 22 bits above the register fields */ \
+  void mnemonic(FloatRegister Vd, FloatRegister Vn) {                             \
+    emit_int32(opcode << 10 | Vn->encoding() << 5 | Vd->encoding());              \
+  }
+
+  F(aese,   0b0100111000101000010010)  // no trailing ';': each row expands to a complete member function
+  F(aesd,   0b0100111000101000010110)
+  F(aesmc,  0b0100111000101000011010)
+  F(aesimc, 0b0100111000101000011110)
+#undef F
+
+#ifdef COMPILER2
+  typedef VFP::double_num double_num;  // unqualified aliases for the VFP types, only when COMPILER2 is defined
+  typedef VFP::float_num  float_num;
+#endif
+
+  void vcnt(FloatRegister Dd, FloatRegister Dn, int quad = 0, int size = 0) {
+    // Also emitted during VM startup as an availability probe, so SIMD is only required once VM_Version is initialized.
+    assert(!VM_Version::is_initialized() || VM_Version::has_simd(), "simd instruction");
+    assert(size == 0, "illegal size value");
+    emit_int32(quad << 30 | size << 22 | 0x0e205800 | Dn->encoding() << 5 | Dd->encoding() << 0);
+  }
+
+#ifdef COMPILER2
+  void addv(FloatRegister Dd, FloatRegister Dm, int quad, int size) {
+    // NOTE(review): original comment said "emitted at VM startup" (as for vcnt), but the assert below is not relaxed before init - confirm.
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert((quad & ~1) == 0, "illegal value");  // quad must be 0 or 1
+    assert(size >= 0 && size < 3, "illegal value");  // sizes 0, 1 and 2 only
+    assert(((size << 1) | quad) != 4, "illegal values (size 2, quad 0)");
+    emit_int32(0x0e31b800 | quad << 30 | size << 22 | Dm->encoding() << 5 | Dd->encoding());
+  }
+
+  enum VElem_Size {  // vector element size selector; (8 << value) is the element width in bits
+    VELEM_SIZE_8  = 0x00,
+    VELEM_SIZE_16 = 0x01,
+    VELEM_SIZE_32 = 0x02,
+    VELEM_SIZE_64 = 0x03
+  };
+
+  enum VLD_Type {  // opcode field values (shifted to bit 12) used by both the vld1 and vst1 overloads
+    VLD1_TYPE_1_REG  = 0b0111,
+    VLD1_TYPE_2_REGS = 0b1010,
+    VLD1_TYPE_3_REGS = 0b0110,
+    VLD1_TYPE_4_REGS = 0b0010
+  };
+
+  enum VFloat_Arith_Size {  // one-bit size selector for the vector FP arithmetic emitters (vaddF, vsubF, ...)
+    VFA_SIZE_F32 = 0b0,
+    VFA_SIZE_F64 = 0b1
+  };
+
+#define F(mnemonic, U, S, P) /* vector FP three-register emitter; U at bit 29, S at bit 23, P at bit 11 */ \
+  void mnemonic(FloatRegister fd, FloatRegister fn, FloatRegister fm,    \
+                int size, int quad) {                                    \
+    assert(VM_Version::has_simd(), "simd instruction");                  \
+    assert(!(size == VFA_SIZE_F64 && !quad), "reserved");                \
+    assert((size & 1) == size, "overflow");                              \
+    emit_int32(quad << 30 | U << 29 | 0b01110 << 24 |                    \
+               S << 23 | size << 22 | 1 << 21 | P << 11 | 1 << 10 |      \
+               fm->encoding() << 16 |                                    \
+               fn->encoding() <<  5 |                                    \
+               fd->encoding());                                          \
+  }
+
+  F(vaddF, 0, 0, 0b11010)  // Vd = Vn + Vm (float)
+  F(vsubF, 0, 1, 0b11010)  // Vd = Vn - Vm (float)
+  F(vmulF, 1, 0, 0b11011)  // Vd = Vn * Vm (float)
+  F(vdivF, 1, 0, 0b11111)  // Vd = Vn / Vm (float)
+#undef F
+
+#define F(mnemonic, U) /* NOTE(review): parameters are (fd, fm, fn), the reverse of the sibling emitters - confirm callers rely on this */ \
+  void mnemonic(FloatRegister fd, FloatRegister fm, FloatRegister fn,    \
+                int size, int quad) {                                    \
+    assert(VM_Version::has_simd(), "simd instruction");                  \
+    assert(!(size == VELEM_SIZE_64 && !quad), "reserved");               \
+    assert((size & 0b11) == size, "overflow");                           \
+    int R = 0; /* rounding */                                            \
+    int S = 0; /* saturating */                                          \
+    emit_int32(quad << 30 | U << 29 | 0b01110 << 24 | size << 22 |       \
+               1 << 21 | R << 12 | S << 11 | 0b10001 << 10 |             \
+               fm->encoding() << 16 |                                    \
+               fn->encoding() <<  5 |                                    \
+               fd->encoding());                                          \
+  }
+
+  F(vshlSI, 0)  // Vd = ashift(Vn,Vm) (int)
+  F(vshlUI, 1)  // Vd = lshift(Vn,Vm) (int)
+#undef F
+
+#define F(mnemonic, U, P, M) /* vector integer three-register emitter; M is an assert-only flag that forbids 64-bit elements */ \
+  void mnemonic(FloatRegister fd, FloatRegister fn, FloatRegister fm,    \
+                int size, int quad) {                                    \
+    assert(VM_Version::has_simd(), "simd instruction");                  \
+    assert(!(size == VELEM_SIZE_64 && !quad), "reserved");               \
+    assert(!(size == VELEM_SIZE_64 && M), "reserved");                   \
+    assert((size & 0b11) == size, "overflow");                           \
+    emit_int32(quad << 30 | U << 29 | 0b01110 << 24 | size << 22 |       \
+               1 << 21 | P << 11 | 1 << 10 |                             \
+               fm->encoding() << 16 |                                    \
+               fn->encoding() <<  5 |                                    \
+               fd->encoding());                                          \
+  }
+
+  F(vmulI, 0, 0b10011,  true)  // Vd = Vn * Vm (int)
+  F(vaddI, 0, 0b10000, false)  // Vd = Vn + Vm (int)
+  F(vsubI, 1, 0b10000, false)  // Vd = Vn - Vm (int)
+#undef F
+
+#define F(mnemonic, U, O) /* vector bitwise three-register emitter; O occupies the bit-22 field, so no element size is taken */ \
+  void mnemonic(FloatRegister fd, FloatRegister fn, FloatRegister fm,    \
+                int quad) {                                              \
+    assert(VM_Version::has_simd(), "simd instruction");                  \
+    emit_int32(quad << 30 | U << 29 | 0b01110 << 24 | O << 22 |          \
+               1 << 21 | 0b00011 << 11 | 1 << 10 |                       \
+               fm->encoding() << 16 |                                    \
+               fn->encoding() <<  5 |                                    \
+               fd->encoding());                                          \
+  }
+
+  F(vandI, 0, 0b00)  // Vd = Vn & Vm (int)
+  F(vorI,  0, 0b10)  // Vd = Vn | Vm (int)
+  F(vxorI, 1, 0b00)  // Vd = Vn ^ Vm (int)
+#undef F
+
+  void vnegI(FloatRegister fd, FloatRegister fn, int size, int quad) {
+    // Emits a single vector instruction word for fd/fn with the given element size.
+    // 64-bit elements are rejected unless quad is set.
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(quad || size != VELEM_SIZE_64, "reserved");
+    const int fixed_bits = 1 << 29 | 0b01110 << 24 | 0b100000101110 << 10;  // U = 1 plus the opcode bits
+    const int operands   = fn->encoding() << 5 | fd->encoding();
+    emit_int32(quad << 30 | size << 22 | fixed_bits | operands);
+  }
+
+  void vshli(FloatRegister fd, FloatRegister fn, int esize, int imm, int quad) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    // Vector shift left by immediate; esize + imm forms the 7-bit field placed at bit 16.
+    if (imm >= esize) {
+      // maximum shift gives all zeroes, direction doesn't matter,
+      // but only available for shift right
+      vshri(fd, fn, esize, esize, true /* unsigned */, quad);
+      return;
+    }
+    assert(imm >= 0 && imm < esize, "out of range");
+
+    int imm7 = esize + imm;
+    int immh = imm7 >> 3;  // upper four bits of the 7-bit field
+    assert(immh != 0, "encoding constraint");
+    assert((uint)immh < 16, "sanity");
+    assert(((immh >> 3) << 1 | quad) != 0b10, "reserved");  // immh<3>:Q == 0b10, i.e. top immh bit set with quad == 0
+    emit_int32(quad << 30 | 0b011110 << 23 | imm7 << 16 |
+               0b010101 << 10 | fn->encoding() << 5 | fd->encoding() << 0);
+  }
+
+  void vshri(FloatRegister fd, FloatRegister fn, int esize, int imm,
+             bool U /* unsigned */, int quad) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(imm > 0, "out of range");
+    if (imm >= esize) {
+      // maximum shift (all zeroes)
+      imm = esize;
+    }
+    int imm7 = 2 * esize - imm;  // 7-bit field placed at bit 16; shift amounts are clamped to esize above
+    int immh = imm7 >> 3;        // upper four bits of that field
+    assert(immh != 0, "encoding constraint");
+    assert((uint)immh < 16, "sanity");
+    assert(((immh >> 3) << 1 | quad) != 0b10, "reserved");  // immh<3>:Q == 0b10, i.e. top immh bit set with quad == 0
+    emit_int32(quad << 30 | U << 29 | 0b011110 << 23 | imm7 << 16 |
+               0b000001 << 10 | fn->encoding() << 5 | fd->encoding() << 0);
+  }
+  void vshrUI(FloatRegister fd, FloatRegister fm, int size, int imm, int quad) {  // vshri with U = true
+    vshri(fd, fm, size, imm, true /* unsigned */, quad);  // size is forwarded as vshri's esize argument
+  }
+  void vshrSI(FloatRegister fd, FloatRegister fm, int size, int imm, int quad) {  // vshri with U = false
+    vshri(fd, fm, size, imm, false /* signed */, quad);  // size is forwarded as vshri's esize argument
+  }
+
+  void vld1(FloatRegister Vt, Address addr, VElem_Size size, int bits) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(bits == 128, "unsupported");
+    assert(addr.disp() == 0 || addr.disp() == 16, "must be");  // displacement: none, or 16 bytes
+    // One-register form. Only bits == 128 is accepted, so quad is always 1.
+    int quad = 1;
+    int L = 1;  // 1 in the vld1 overloads, 0 in the vst1 overloads
+    int opcode = VLD1_TYPE_1_REG;
+    emit_int32(quad << 30 | 0b11 << 26 | L << 22 | opcode << 12 | size << 10 |
+               Vt->encoding() << 0 | addr.encoding_simd());
+  }
+
+  void vst1(FloatRegister Vt, Address addr, VElem_Size size, int bits) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(bits == 128, "unsupported");
+    assert(addr.disp() == 0 || addr.disp() == 16, "must be");  // displacement: none, or 16 bytes
+    // One-register form. Only bits == 128 is accepted, so quad is always 1.
+    int quad = 1;
+    int L = 0;  // 0 in the vst1 overloads, 1 in the vld1 overloads
+    int opcode = VLD1_TYPE_1_REG;  // the VLD1_TYPE_* values are shared by loads and stores
+    emit_int32(quad << 30 | 0b11 << 26 | L << 22 | opcode << 12 | size << 10 |
+               Vt->encoding() << 0 | addr.encoding_simd());
+  }
+
+  void vld1(FloatRegister Vt, FloatRegister Vt2, Address addr, VElem_Size size, int bits) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(bits == 128, "unsupported");
+    assert(Vt->successor() == Vt2, "Registers must be ordered");  // only Vt is encoded; Vt2 is implied
+    assert(addr.disp() == 0 || addr.disp() == 32, "must be");     // displacement: none, or 32 bytes
+    // Two-register form. Only bits == 128 is accepted, so quad is always 1.
+    int quad = 1;
+    int L = 1;  // 1 in the vld1 overloads, 0 in the vst1 overloads
+    int opcode = VLD1_TYPE_2_REGS;
+    emit_int32(quad << 30 | 0b11 << 26 | L << 22 | opcode << 12 | size << 10 |
+               Vt->encoding() << 0 | addr.encoding_simd());
+  }
+
+  void vst1(FloatRegister Vt, FloatRegister Vt2, Address addr, VElem_Size size, int bits) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(Vt->successor() == Vt2, "Registers must be ordered");  // only Vt is encoded; Vt2 is implied
+    assert(bits == 128, "unsupported");
+    assert(addr.disp() == 0 || addr.disp() == 32, "must be");     // displacement: none, or 32 bytes
+    // Two-register form. Only bits == 128 is accepted, so quad is always 1.
+    int quad = 1;
+    int L = 0;  // 0 in the vst1 overloads, 1 in the vld1 overloads
+    int opcode = VLD1_TYPE_2_REGS;  // the VLD1_TYPE_* values are shared by loads and stores
+    emit_int32(quad << 30 | 0b11 << 26 | L << 22 | opcode << 12 | size << 10 |
+               Vt->encoding() << 0 | addr.encoding_simd());
+  }
+
+  void vld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,
+            Address addr, VElem_Size size, int bits) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(bits == 128, "unsupported");
+    assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3,
+          "Registers must be ordered");
+    assert(addr.disp() == 0 || addr.disp() == 48, "must be");  // displacement: none, or 48 bytes
+    // Three-register form; only Vt is encoded. Only bits == 128 is accepted, so quad is always 1.
+    int quad = 1;
+    int L = 1;  // 1 in the vld1 overloads, 0 in the vst1 overloads
+    int opcode = VLD1_TYPE_3_REGS;
+    emit_int32(quad << 30 | 0b11 << 26 | L << 22 | opcode << 12 | size << 10 |
+               Vt->encoding() << 0 | addr.encoding_simd());
+  }
+
+  void vst1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,
+            Address addr, VElem_Size size, int bits) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(bits == 128, "unsupported");
+    assert(Vt->successor() == Vt2 &&  Vt2->successor() == Vt3,
+           "Registers must be ordered");
+    assert(addr.disp() == 0 || addr.disp() == 48, "must be");  // displacement: none, or 48 bytes
+    // Three-register form; only Vt is encoded. Only bits == 128 is accepted, so quad is always 1.
+    int quad = 1;
+    int L = 0;  // 0 in the vst1 overloads, 1 in the vld1 overloads
+    int opcode = VLD1_TYPE_3_REGS;  // the VLD1_TYPE_* values are shared by loads and stores
+    emit_int32(quad << 30 | 0b11 << 26 | L << 22 | opcode << 12 | size << 10 |
+               Vt->encoding() << 0 | addr.encoding_simd());
+  }
+
+  void vld1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,
+            FloatRegister Vt4, Address addr, VElem_Size size, int bits) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(bits == 128, "unsupported");
+    assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3 &&
+           Vt3->successor() == Vt4, "Registers must be ordered");
+    assert(addr.disp() == 0 || addr.disp() == 64, "must be");  // displacement: none, or 64 bytes
+    // Four-register form; only Vt is encoded. Only bits == 128 is accepted, so quad is always 1.
+    int quad = 1;
+    int L = 1;  // 1 in the vld1 overloads, 0 in the vst1 overloads
+    int opcode = VLD1_TYPE_4_REGS;
+    emit_int32(quad << 30 | 0b11 << 26 | L << 22 | opcode << 12 | size << 10 |
+               Vt->encoding() << 0 | addr.encoding_simd());
+  }
+
+  void vst1(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3,
+            FloatRegister Vt4,  Address addr, VElem_Size size, int bits) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(bits == 128, "unsupported");
+    assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3 &&
+           Vt3->successor() == Vt4, "Registers must be ordered");
+    assert(addr.disp() == 0 || addr.disp() == 64, "must be");  // displacement: none, or 64 bytes
+    // Four-register form; only Vt is encoded. Only bits == 128 is accepted, so quad is always 1.
+    int quad = 1;
+    int L = 0;  // 0 in the vst1 overloads, 1 in the vld1 overloads
+    int opcode = VLD1_TYPE_4_REGS;  // the VLD1_TYPE_* values are shared by loads and stores
+    emit_int32(quad << 30 | 0b11 << 26 | L << 22 | opcode << 12 | size << 10 |
+               Vt->encoding() << 0 | addr.encoding_simd());
+  }
+
+  void rev32(FloatRegister Vd, FloatRegister Vn, VElem_Size size, int quad) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(size == VELEM_SIZE_8 || size == VELEM_SIZE_16, "must be");  // only 8- and 16-bit elements accepted
+    const int opcode_bits = 0b101110 << 24 | 0b100000000010 << 10;
+    emit_int32(opcode_bits | quad << 30 | size << 22 | Vn->encoding() << 5 | Vd->encoding());
+  }
+
+  void eor(FloatRegister Vd, FloatRegister Vn,  FloatRegister Vm, VElem_Size size, int quad) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(size == VELEM_SIZE_8, "must be");  // size is validated only; it is not part of the emitted word
+    const int opcode_bits = 0b101110001 << 21 | 0b000111 << 10;
+    emit_int32(opcode_bits | quad << 30 | Vm->encoding() << 16 | Vn->encoding() << 5 | Vd->encoding());
+  }
+
+  void orr(FloatRegister Vd, FloatRegister Vn,  FloatRegister Vm, VElem_Size size, int quad) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(size == VELEM_SIZE_8, "must be");  // size is validated only; it is not part of the emitted word
+    const int opcode_bits = 0b001110101 << 21 | 0b000111 << 10;
+    emit_int32(opcode_bits | quad << 30 | Vm->encoding() << 16 | Vn->encoding() << 5 | Vd->encoding());
+  }
+
+  void vmovI(FloatRegister Dd, int imm8, VElem_Size size, int quad) {
+    // Emits a vector move of the 8-bit immediate imm8 into Dd. Only 8-, 16- and
+    // 32-bit element sizes are handled; any other size hits ShouldNotReachHere().
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(imm8 >= 0 && imm8 < 256, "out of range");
+    // Both fields are initialized up front so neither is read uninitialized on
+    // the default path; op is 0 for every supported size.
+    int op = 0;
+    int cmode = 0;
+    switch (size) {
+    case VELEM_SIZE_8:
+      cmode = 0b1110;
+      break;
+    case VELEM_SIZE_16:
+      cmode = 0b1000;
+      break;
+    case VELEM_SIZE_32:
+      cmode = 0b0000;
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    int abc = imm8 >> 5;          // top three immediate bits, placed at bit 16
+    int defgh = imm8 & 0b11111;   // low five immediate bits, placed at bit 5
+    emit_int32(quad << 30 | op << 29 | 0b1111 << 24 |
+               abc << 16 | cmode << 12 | 0b01 << 10 |
+               defgh << 5 | Dd->encoding() << 0);
+  }
+
+  void vdupI(FloatRegister Dd, Register Rn, VElem_Size size, int quad) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    assert(size <= 3, "unallocated encoding");
+    assert(size != 3 || quad == 1, "reserved");
+    const int imm5 = 1 << size;  // one-hot size marker placed at bit 16; cross-checked below in debug builds
+#ifdef ASSERT
+    switch (size) {
+    case VELEM_SIZE_8:
+      assert(imm5 == 0b00001, "sanity");
+      break;
+    case VELEM_SIZE_16:
+      assert(imm5 == 0b00010, "sanity");
+      break;
+    case VELEM_SIZE_32:
+      assert(imm5 == 0b00100, "sanity");
+      break;
+    case VELEM_SIZE_64:
+      assert(imm5 == 0b01000, "sanity");
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+#endif
+    const int opcode_bits = 0b111 << 25 | 0b11 << 10;
+    const int operands    = Rn->encoding() << 5 | Dd->encoding();
+    emit_int32(opcode_bits | quad << 30 | imm5 << 16 | operands);
+  }
+
+  void vdup(FloatRegister Vd, FloatRegister Vn, VElem_Size size, int quad) {
+    assert(VM_Version::has_simd(), "simd instruction");
+    int index = 0;           // the element index is fixed at 0
+    int bytes = 1 << size;   // element width in bytes
+    int range = 16 / bytes;  // number of elements of that width in 16 bytes
+    assert(index < range, "overflow");
+    assert(size != VELEM_SIZE_64 || quad, "reserved");
+    // The VElem_Size constants must be log2 of the element width in bytes.
+    assert(8 << VELEM_SIZE_8  ==  8, "sanity");
+    assert(8 << VELEM_SIZE_16 == 16, "sanity");
+    assert(8 << VELEM_SIZE_32 == 32, "sanity");
+    assert(8 << VELEM_SIZE_64 == 64, "sanity");
+
+    const int imm5 = (index << (size + 1)) | bytes;
+    const int opcode_bits = 0b001110000 << 21 | 0b000001 << 10;
+    const int operands    = Vn->encoding() << 5 | Vd->encoding();
+    emit_int32(opcode_bits | quad << 30 | imm5 << 16 | operands);
+  }
+
+  void vdupF(FloatRegister Vd, FloatRegister Vn, int quad) {  // vdup with 32-bit elements
+    vdup(Vd, Vn, VELEM_SIZE_32, quad);
+  }
+
+  void vdupD(FloatRegister Vd, FloatRegister Vn, int quad) {  // vdup with 64-bit elements; vdup asserts quad != 0 for this size
+    vdup(Vd, Vn, VELEM_SIZE_64, quad);
+  }
+#endif
+};
+
+
+#endif // CPU_ARM_VM_ASSEMBLER_ARM_64_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/bytes_arm.hpp	2016-12-02 11:17:46.451733060 -0500
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_BYTES_ARM_HPP
+#define CPU_ARM_VM_BYTES_ARM_HPP
+
+#include "memory/allocation.hpp"
+#include "utilities/macros.hpp"
+
+#ifndef VM_LITTLE_ENDIAN
+#define VM_LITTLE_ENDIAN  1
+#endif
+
+class Bytes: AllStatic {
+
+ public:
+  // Returns true if the byte ordering used by Java is different from the native byte ordering
+  // of the underlying machine.
+  static inline bool is_Java_byte_ordering_different() {
+    return VM_LITTLE_ENDIAN != 0;
+  }
+
+  static inline u2 get_Java_u2(address p) {  // Java-order reads: most significant byte first, one byte at a time
+    return (u2(p[0]) << 8) | u2(p[1]);
+  }
+
+  static inline u4 get_Java_u4(address p) {
+    return u4(p[0]) << 24 |
+           u4(p[1]) << 16 |
+           u4(p[2]) <<  8 |
+           u4(p[3]);
+  }
+
+  static inline u8 get_Java_u8(address p) {
+    return u8(p[0]) << 56 |
+           u8(p[1]) << 48 |
+           u8(p[2]) << 40 |
+           u8(p[3]) << 32 |
+           u8(p[4]) << 24 |
+           u8(p[5]) << 16 |
+           u8(p[6]) <<  8 |
+           u8(p[7]);
+  }
+
+  static inline void put_Java_u2(address p, u2 x) {  // Java-order writes: most significant byte first, one byte at a time
+    p[0] = x >> 8;
+    p[1] = x;
+  }
+
+  static inline void put_Java_u4(address p, u4 x) {
+    ((u1*)p)[0] = x >> 24;
+    ((u1*)p)[1] = x >> 16;
+    ((u1*)p)[2] = x >>  8;
+    ((u1*)p)[3] = x;
+  }
+
+  static inline void put_Java_u8(address p, u8 x) {
+    ((u1*)p)[0] = x >> 56;
+    ((u1*)p)[1] = x >> 48;
+    ((u1*)p)[2] = x >> 40;
+    ((u1*)p)[3] = x >> 32;
+    ((u1*)p)[4] = x >> 24;
+    ((u1*)p)[5] = x >> 16;
+    ((u1*)p)[6] = x >>  8;
+    ((u1*)p)[7] = x;
+  }
+
+#if VM_LITTLE_ENDIAN  // always defined by this point (defaulted above), so test its value, as is_Java_byte_ordering_different() does
+
+  static inline u2 get_native_u2(address p) {  // one 2-byte load when p is 2-byte aligned, otherwise assembled from bytes
+    return (intptr_t(p) & 1) == 0 ? *(u2*)p : u2(p[0]) | (u2(p[1]) << 8);
+  }
+
+  static inline u4 get_native_u4(address p) {  // uses the widest access the alignment of p permits
+    switch (intptr_t(p) & 3) {
+      case 0:  return *(u4*)p;
+      case 2:  return u4(((u2*)p)[0]) |
+                      u4(((u2*)p)[1]) << 16;
+      default: return u4(p[0])       |
+                      u4(p[1]) <<  8 |
+                      u4(p[2]) << 16 |
+                      u4(p[3]) << 24;
+    }
+  }
+
+  static inline u8 get_native_u8(address p) {  // uses the widest access the alignment of p permits
+    switch (intptr_t(p) & 7) {
+      case 0:  return *(u8*)p;
+      case 4:  return u8(((u4*)p)[0]) |
+                      u8(((u4*)p)[1]) << 32;
+      case 2:  return u8(((u2*)p)[0])       |
+                      u8(((u2*)p)[1]) << 16 |
+                      u8(((u2*)p)[2]) << 32 |
+                      u8(((u2*)p)[3]) << 48;
+      default: return u8(p[0])       |
+                      u8(p[1]) <<  8 |
+                      u8(p[2]) << 16 |
+                      u8(p[3]) << 24 |
+                      u8(p[4]) << 32 |
+                      u8(p[5]) << 40 |
+                      u8(p[6]) << 48 |
+                      u8(p[7]) << 56;
+    }
+  }
+
+  static inline void put_native_u2(address p, u2 x) {  // one 2-byte store when p is 2-byte aligned, otherwise two byte stores
+    if ((intptr_t(p) & 1) == 0) {
+      *(u2*)p = x;
+    } else {
+      p[0] = x;
+      p[1] = x >> 8;
+    }
+  }
+
+  static inline void put_native_u4(address p, u4 x) {  // uses the widest store the alignment of p permits
+    switch (intptr_t(p) & 3) {
+      case 0:  *(u4*)p = x;
+               break;
+      case 2:  ((u2*)p)[0] = x;
+               ((u2*)p)[1] = x >> 16;
+               break;
+      default: ((u1*)p)[0] = x;
+               ((u1*)p)[1] = x >>  8;
+               ((u1*)p)[2] = x >> 16;
+               ((u1*)p)[3] = x >> 24;
+               break;
+    }
+  }
+
+  static inline void put_native_u8(address p, u8 x) {  // uses the widest store the alignment of p permits
+    switch (intptr_t(p) & 7) {
+      case 0:  *(u8*)p = x;
+               break;
+      case 4:  ((u4*)p)[0] = x;
+               ((u4*)p)[1] = x >> 32;
+               break;
+      case 2:  ((u2*)p)[0] = x;
+               ((u2*)p)[1] = x >> 16;
+               ((u2*)p)[2] = x >> 32;
+               ((u2*)p)[3] = x >> 48;
+               break;
+      default: ((u1*)p)[0] = x;
+               ((u1*)p)[1] = x >>  8;
+               ((u1*)p)[2] = x >> 16;
+               ((u1*)p)[3] = x >> 24;
+               ((u1*)p)[4] = x >> 32;
+               ((u1*)p)[5] = x >> 40;
+               ((u1*)p)[6] = x >> 48;
+               ((u1*)p)[7] = x >> 56;
+    }
+  }
+
+#else
+
+  static inline u2 get_native_u2(address p) { return get_Java_u2(p); }
+  static inline u4 get_native_u4(address p) { return get_Java_u4(p); }
+  static inline u8 get_native_u8(address p) { return get_Java_u8(p); }
+  static inline void put_native_u2(address p, u2 x) { put_Java_u2(p, x); }
+  static inline void put_native_u4(address p, u4 x) { put_Java_u4(p, x); }
+  static inline void put_native_u8(address p, u8 x) { put_Java_u8(p, x); }
+
+#endif // VM_LITTLE_ENDIAN
+
+  // Efficient swapping of byte ordering
+  static inline u2 swap_u2(u2 x);
+  static inline u4 swap_u4(u4 x);
+  static inline u8 swap_u8(u8 x);
+};
+
+
+// The following header contains the implementations of swap_u2, swap_u4, and swap_u8
+#include OS_CPU_HEADER_INLINE(bytes)
+
+#endif // CPU_ARM_VM_BYTES_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_CodeStubs_arm.cpp	2016-12-02 11:17:51.460017084 -0500
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "c1/c1_CodeStubs.hpp"
+#include "c1/c1_FrameMap.hpp"
+#include "c1/c1_LIRAssembler.hpp"
+#include "c1/c1_MacroAssembler.hpp"
+#include "c1/c1_Runtime1.hpp"
+#include "nativeInst_arm.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "utilities/macros.hpp"
+#include "vmreg_arm.inline.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc/g1/g1SATBCardTableModRefBS.hpp"
+#endif // INCLUDE_ALL_GCS
+
+#define __ ce->masm()->
+
+void CounterOverflowStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line call to the counter_overflow runtime stub
+  __ bind(_entry);
+  ce->store_parameter(_bci, 0);  // stub parameter 0: _bci
+  ce->store_parameter(_method->as_constant_ptr()->as_metadata(), 1);  // stub parameter 1: metadata of _method
+  __ call(Runtime1::entry_for(Runtime1::counter_overflow_id), relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  ce->verify_oop_map(_info);
+
+  __ b(_continuation);  // branch to _continuation once the runtime call has returned
+}
+
+
+// TODO: ARM - is it possible to inline these stubs into the main code stream?
+
+RangeCheckStub::RangeCheckStub(CodeEmitInfo* info, LIR_Opr index,
+                               bool throw_index_out_of_bounds_exception)
+  : _throw_index_out_of_bounds_exception(throw_index_out_of_bounds_exception),
+    _index(index) {
+  // Build a fresh CodeEmitInfo from info; a NULL info is stored as NULL.
+  _info = (info != NULL) ? new CodeEmitInfo(info) : NULL;
+}
+
+
+void RangeCheckStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line path taken when a range check fails
+  __ bind(_entry);
+
+  if (_info->deoptimize_on_exception()) {  // NOTE(review): the constructor stores NULL for a NULL info, yet _info is dereferenced here unconditionally - confirm callers never pass NULL
+#ifdef AARCH64
+    __ NOT_TESTED();
+#endif
+    __ call(Runtime1::entry_for(Runtime1::predicate_failed_trap_id), relocInfo::runtime_call_type);  // call the predicate-failed trap instead of a throw stub
+    ce->add_call_info_here(_info);
+    ce->verify_oop_map(_info);
+    debug_only(__ should_not_reach_here());
+    return;
+  }
+  // Pass the array index on stack because all registers must be preserved
+  ce->verify_reserved_argument_area_size(1);
+  if (_index->is_cpu_register()) {
+    __ str_32(_index->as_register(), Address(SP));
+  } else {
+    __ mov_slow(Rtemp, _index->as_jint()); // Rtemp should be OK in C1; index is not in a cpu register, so its jint value is materialized first
+    __ str_32(Rtemp, Address(SP));
+  }
+
+  if (_throw_index_out_of_bounds_exception) {  // selects between the two throwing runtime stubs
+#ifdef AARCH64
+    __ NOT_TESTED();
+#endif
+    __ call(Runtime1::entry_for(Runtime1::throw_index_exception_id), relocInfo::runtime_call_type);
+  } else {
+    __ call(Runtime1::entry_for(Runtime1::throw_range_check_failed_id), relocInfo::runtime_call_type);
+  }
+  ce->add_call_info_here(_info);
+  ce->verify_oop_map(_info);
+  DEBUG_ONLY(STOP("RangeCheck");)
+}
+
+PredicateFailedStub::PredicateFailedStub(CodeEmitInfo* info) {  // the stub holds its own CodeEmitInfo built from info
+  _info = new CodeEmitInfo(info);
+}
+
+void PredicateFailedStub::emit_code(LIR_Assembler* ce) {  // emits a call to the predicate_failed_trap runtime stub
+  __ bind(_entry);
+  __ call(Runtime1::entry_for(Runtime1::predicate_failed_trap_id), relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  ce->verify_oop_map(_info);
+  debug_only(__ should_not_reach_here());  // debug builds only: emit a trap after the call
+}
+
+void DivByZeroStub::emit_code(LIR_Assembler* ce) {  // emits a call to the throw_div0_exception runtime stub
+  if (_offset != -1) {  // an _offset of -1 is skipped: no implicit-exception entry is recorded for it
+    ce->compilation()->implicit_exception_table()->append(_offset, __ offset());
+  }
+  __ bind(_entry);
+  __ call(Runtime1::entry_for(Runtime1::throw_div0_exception_id),
+          relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  DEBUG_ONLY(STOP("DivByZero");)
+}
+
+
+// Implementation of NewInstanceStub
+
+NewInstanceStub::NewInstanceStub(LIR_Opr klass_reg, LIR_Opr result, ciInstanceKlass* klass, CodeEmitInfo* info, Runtime1::StubID stub_id) {
+  assert(stub_id == Runtime1::new_instance_id                 ||
+         stub_id == Runtime1::fast_new_instance_id            ||
+         stub_id == Runtime1::fast_new_instance_init_check_id,
+         "need new_instance id");  // only these three stub ids are accepted
+  _stub_id   = stub_id;
+  _klass_reg = klass_reg;
+  _klass     = klass;
+  _result    = result;
+  _info      = new CodeEmitInfo(info);  // the stub holds its own CodeEmitInfo built from info
+}
+
+
+void NewInstanceStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line call to the runtime stub selected by _stub_id
+  assert(_result->as_register() == R0, "runtime call setup");  // the operands must already sit in fixed registers: result in R0 ...
+  assert(_klass_reg->as_register() == R1, "runtime call setup");  // ... and klass in R1
+  __ bind(_entry);
+  __ call(Runtime1::entry_for(_stub_id), relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  ce->verify_oop_map(_info);
+  __ b(_continuation);  // branch to _continuation once the runtime call has returned
+}
+
+
+// Implementation of NewTypeArrayStub
+
+NewTypeArrayStub::NewTypeArrayStub(LIR_Opr klass_reg, LIR_Opr length, LIR_Opr result, CodeEmitInfo* info) {
+  _info      = new CodeEmitInfo(info);  // the stub holds its own CodeEmitInfo built from info
+  _result    = result;
+  _length    = length;
+  _klass_reg = klass_reg;
+}
+
+
+void NewTypeArrayStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line call to the new_type_array runtime stub
+  assert(_result->as_register() == R0, "runtime call setup");  // operands must already sit in fixed registers: result in R0,
+  assert(_klass_reg->as_register() == R1, "runtime call setup");  // klass in R1,
+  assert(_length->as_register() == R2, "runtime call setup");  // and length in R2
+  __ bind(_entry);
+  __ call(Runtime1::entry_for(Runtime1::new_type_array_id), relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  ce->verify_oop_map(_info);
+  __ b(_continuation);  // branch to _continuation once the runtime call has returned
+}
+
+
+// Implementation of NewObjectArrayStub
+
+NewObjectArrayStub::NewObjectArrayStub(LIR_Opr klass_reg, LIR_Opr length, LIR_Opr result, CodeEmitInfo* info) {
+  _info      = new CodeEmitInfo(info);  // the stub holds its own CodeEmitInfo built from info
+  _length    = length;
+  _result    = result;
+  _klass_reg = klass_reg;
+}
+
+
+void NewObjectArrayStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line call to the new_object_array runtime stub
+  assert(_result->as_register() == R0, "runtime call setup");  // operands must already sit in fixed registers: result in R0,
+  assert(_klass_reg->as_register() == R1, "runtime call setup");  // klass in R1,
+  assert(_length->as_register() == R2, "runtime call setup");  // and length in R2
+  __ bind(_entry);
+  __ call(Runtime1::entry_for(Runtime1::new_object_array_id), relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  ce->verify_oop_map(_info);
+  __ b(_continuation);  // branch to _continuation once the runtime call has returned
+}
+
+
+// Implementation of MonitorAccessStubs
+
+MonitorEnterStub::MonitorEnterStub(LIR_Opr obj_reg, LIR_Opr lock_reg, CodeEmitInfo* info)
+  : MonitorAccessStub(obj_reg, lock_reg) {
+  // Both operands go to the base class; the stub holds its own CodeEmitInfo built from info.
+  _info = new CodeEmitInfo(info);
+}
+
+
+void MonitorEnterStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line call to the monitorenter runtime stub
+  __ bind(_entry);
+  const Register obj_reg = _obj_reg->as_pointer_register();
+  const Register lock_reg = _lock_reg->as_pointer_register();
+
+  ce->verify_reserved_argument_area_size(2);  // two words at SP are written below: object first, lock second
+#ifdef AARCH64
+  __ stp(obj_reg, lock_reg, Address(SP));
+#else
+  if (obj_reg < lock_reg) {  // NOTE(review): presumably stmia stores in ascending register order, so it yields object-then-lock only in this case - confirm
+    __ stmia(SP, RegisterSet(obj_reg) | RegisterSet(lock_reg));
+  } else {
+    __ str(obj_reg, Address(SP));
+    __ str(lock_reg, Address(SP, BytesPerWord));
+  }
+#endif // AARCH64
+
+  Runtime1::StubID enter_id = ce->compilation()->has_fpu_code() ?  // stub variant depends on whether this compilation has FPU code
+                              Runtime1::monitorenter_id :
+                              Runtime1::monitorenter_nofpu_id;
+  __ call(Runtime1::entry_for(enter_id), relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  ce->verify_oop_map(_info);
+  __ b(_continuation);  // branch to _continuation once the runtime call has returned
+}
+
+
+void MonitorExitStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line call to the monitorexit runtime stub
+  __ bind(_entry);
+  if (_compute_lock) {  // when requested, first compute the address of monitor _monitor_ix into _lock_reg
+    ce->monitor_address(_monitor_ix, _lock_reg);
+  }
+  const Register lock_reg = _lock_reg->as_pointer_register();
+
+  ce->verify_reserved_argument_area_size(1);
+  __ str(lock_reg, Address(SP));  // the lock address is the single argument, stored at [SP]
+
+  // Non-blocking leaf routine - no call info needed
+  Runtime1::StubID exit_id = ce->compilation()->has_fpu_code() ?  // stub variant depends on whether this compilation has FPU code
+                             Runtime1::monitorexit_id :
+                             Runtime1::monitorexit_nofpu_id;
+  __ call(Runtime1::entry_for(exit_id), relocInfo::runtime_call_type);
+  __ b(_continuation);
+}
+
+
+// Call return is directly after patch word
+int PatchingStub::_patch_info_offset = 0;
+
+void PatchingStub::align_patch_site(MacroAssembler* masm) {  // currently a no-op: the body below is compiled out by #if 0
+#if 0
+  // TODO: investigate whether we are required to implement this
+    ShouldNotReachHere();
+#endif
+}
+
+void PatchingStub::emit_code(LIR_Assembler* ce) {  // emits the patch template, the patch record word and the call into the patching runtime stub
+  const int patchable_instruction_offset = AARCH64_ONLY(NativeInstruction::instruction_size) NOT_AARCH64(0);
+
+  assert(NativeCall::instruction_size <= _bytes_to_copy && _bytes_to_copy <= 0xFF,
+         "not enough room for call");
+  assert((_bytes_to_copy & 3) == 0, "must copy a multiple of four bytes");
+  Label call_patch;
+  bool is_load = (_id == load_klass_id) || (_id == load_mirror_id) || (_id == load_appendix_id);  // true for the three load_* patch kinds, false for access_field_id
+
+#ifdef AARCH64
+  assert(nativeInstruction_at(_pc_start)->is_nop(), "required for MT safe patching");
+
+  // Same alignment of reg2mem code and PatchingStub code. Required to make copied bind_literal() code properly aligned.
+  __ align(wordSize);
+#endif // AARCH64
+
+  if (is_load NOT_AARCH64(&& !VM_Version::supports_movw())) {
+    address start = __ pc();
+
+    // The following sequence duplicates code provided in MacroAssembler::patchable_mov_oop()
+    // without creating relocation info entry.
+#ifdef AARCH64
+    // Extra nop for MT safe patching
+    __ nop();
+#endif // AARCH64
+
+    assert((__ pc() - start) == patchable_instruction_offset, "should be");
+#ifdef AARCH64
+    __ ldr(_obj, __ pc());
+#else
+    __ ldr(_obj, Address(PC));
+    // Extra nop to handle case of large offset of oop placeholder (see NativeMovConstReg::set_data).
+    __ nop();
+#endif // AARCH64
+
+#ifdef ASSERT
+    for (int i = 0; i < _bytes_to_copy; i++) {  // debug check: the code just emitted must match the bytes at _pc_start
+      assert(((address)_pc_start)[i] == start[i], "should be the same code");
+    }
+#endif // ASSERT
+  }
+
+  address being_initialized_entry = __ pc();  // start of the patch template emitted below
+  if (CommentedAssembly) {
+    __ block_comment(" patch template");
+  }
+  if (is_load) {
+    address start = __ pc();
+    if (_id == load_mirror_id || _id == load_appendix_id) {  // oop loads use patchable_mov_oop, klass loads use patchable_mov_metadata
+      __ patchable_mov_oop(_obj, (jobject)Universe::non_oop_word(), _index);
+    } else {
+      __ patchable_mov_metadata(_obj, (Metadata*)Universe::non_oop_word(), _index);
+    }
+#ifdef ASSERT
+    for (int i = 0; i < _bytes_to_copy; i++) {  // debug check: the template must match the bytes at _pc_start
+      assert(((address)_pc_start)[i] == start[i], "should be the same code");
+    }
+#endif // ASSERT
+  } else {
+    int* start = (int*)_pc_start;  // not a load: copy _bytes_to_copy bytes from _pc_start, one 32-bit word at a time
+    int* end = start + (_bytes_to_copy / BytesPerInt);
+    while (start < end) {
+      __ emit_int32(*start++);
+    }
+  }
+  address end_of_patch = __ pc();
+
+  int bytes_to_skip = 0;  // accumulates the size of everything emitted between end_of_patch and the end of the patch record
+  if (_id == load_mirror_id) {
+    int offset = __ offset();
+    if (CommentedAssembly) {
+      __ block_comment(" being_initialized check");
+    }
+
+    assert(_obj != noreg, "must be a valid register");
+    // Rtemp should be OK in C1
+    __ ldr(Rtemp, Address(_obj, java_lang_Class::klass_offset_in_bytes()));
+    __ ldr(Rtemp, Address(Rtemp, InstanceKlass::init_thread_offset()));
+    __ cmp(Rtemp, Rthread);  // compare the klass's init thread with the current thread
+    __ b(call_patch, ne);  // different thread: go to the runtime patching call
+    __ b(_patch_site_continuation);
+
+    bytes_to_skip += __ offset() - offset;
+  }
+
+  if (CommentedAssembly) {
+    __ block_comment("patch data - 3 high bytes of the word");
+  }
+  const int sizeof_patch_record = 4;
+  bytes_to_skip += sizeof_patch_record;
+  int being_initialized_entry_offset = __ pc() - being_initialized_entry + sizeof_patch_record;
+  __ emit_int32(0xff | being_initialized_entry_offset << 8 | bytes_to_skip << 16 | _bytes_to_copy << 24);  // patch record: 0xff in bits 0-7, then entry offset, bytes_to_skip and _bytes_to_copy in successive bytes; NOTE(review): only _bytes_to_copy is asserted to fit in a byte above
+
+  address patch_info_pc = __ pc();
+  assert(patch_info_pc - end_of_patch == bytes_to_skip, "incorrect patch info");
+
+  // runtime call will return here
+  Label call_return;
+  __ bind(call_return);
+  ce->add_call_info_here(_info);
+  assert(_patch_info_offset == (patch_info_pc - __ pc()), "must not change");
+  __ b(_patch_site_entry);
+
+  address entry = __ pc();
+  NativeGeneralJump::insert_unconditional((address)_pc_start, entry);  // write an unconditional jump at _pc_start that targets this stub's entry
+  address target = NULL;
+  relocInfo::relocType reloc_type = relocInfo::none;
+  switch (_id) {  // pick the patching runtime stub and, for loads, the relocation type being replaced
+    case access_field_id:  target = Runtime1::entry_for(Runtime1::access_field_patching_id); break;
+    case load_klass_id:    target = Runtime1::entry_for(Runtime1::load_klass_patching_id); reloc_type = relocInfo::metadata_type; break;
+    case load_mirror_id:   target = Runtime1::entry_for(Runtime1::load_mirror_patching_id); reloc_type = relocInfo::oop_type; break;
+    case load_appendix_id: target = Runtime1::entry_for(Runtime1::load_appendix_patching_id); reloc_type = relocInfo::oop_type; break;
+    default: ShouldNotReachHere();
+  }
+  __ bind(call_patch);
+
+  if (CommentedAssembly) {
+    __ block_comment("patch entry point");
+  }
+
+  // arrange for call to return just after patch word
+  __ adr(LR, call_return);
+  __ jump(target, relocInfo::runtime_call_type, Rtemp);
+
+  if (is_load) {
+    CodeSection* cs = __ code_section();
+    address pc = (address)_pc_start;
+    RelocIterator iter(cs, pc, pc + 1);
+    relocInfo::change_reloc_info_for_address(&iter, pc, reloc_type, relocInfo::none);  // change the relocation at _pc_start from reloc_type to none
+  }
+}
+
+void DeoptimizeStub::emit_code(LIR_Assembler* ce) {  // emits a call to the deoptimize runtime stub with _trap_request as its argument
+  __ bind(_entry);
+  __ mov_slow(Rtemp, _trap_request);
+  ce->verify_reserved_argument_area_size(1);
+  __ str(Rtemp, Address(SP));  // the trap request is passed in the word at [SP]
+  __ call(Runtime1::entry_for(Runtime1::deoptimize_id), relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  DEBUG_ONLY(__ should_not_reach_here());
+}
+
+
+void ImplicitNullCheckStub::emit_code(LIR_Assembler* ce) {
+  // Choose the runtime entry before emitting anything. When the debug info asks
+  // for deoptimization on exception, deoptimize and do not throw, because it is
+  // probably wrong to throw here.
+  const Runtime1::StubID target_id = _info->deoptimize_on_exception()
+                                     ? Runtime1::predicate_failed_trap_id
+                                     : Runtime1::throw_null_pointer_exception_id;
+  address target = Runtime1::entry_for(target_id);
+  // Record (_offset, current code offset) in the implicit exception table.
+  ce->compilation()->implicit_exception_table()->append(_offset, __ offset());
+  __ bind(_entry);
+  __ call(target, relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  ce->verify_oop_map(_info);
+  DEBUG_ONLY(STOP("ImplicitNullCheck");)
+}
+
+
+void SimpleExceptionStub::emit_code(LIR_Assembler* ce) {  // emits a call to the runtime stub identified by _stub
+  __ bind(_entry);
+  // Pass the object on stack because all registers must be preserved
+  if (_obj->is_cpu_register()) {
+    ce->verify_reserved_argument_area_size(1);
+    __ str(_obj->as_pointer_register(), Address(SP));
+  } else {
+    assert(_obj->is_illegal(), "should be");  // the only other accepted case is an illegal operand, for which nothing is stored
+  }
+  __ call(Runtime1::entry_for(_stub), relocInfo::runtime_call_type);
+  ce->add_call_info_here(_info);
+  DEBUG_ONLY(STOP("SimpleException");)
+}
+
+
+void ArrayCopyStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line path that issues a patchable static call for the array copy
+  __ bind(_entry);
+
+  VMRegPair args[5];
+  BasicType signature[5] = { T_OBJECT, T_INT, T_OBJECT, T_INT, T_INT };  // order matches r[] below: src, src_pos, dst, dst_pos, length
+  SharedRuntime::java_calling_convention(signature, args, 5, true);
+
+  Register r[5];
+  r[0] = src()->as_pointer_register();
+  r[1] = src_pos()->as_register();
+  r[2] = dst()->as_pointer_register();
+  r[3] = dst_pos()->as_register();
+  r[4] = length()->as_register();
+
+  for (int i = 0; i < 5; i++) {
+    VMReg arg = args[i].first();
+    if (arg->is_stack()) {  // stack-passed argument: store it into its slot relative to SP
+      __ str(r[i], Address(SP, arg->reg2stack() * VMRegImpl::stack_slot_size));
+    } else {
+      assert(r[i] == arg->as_Register(), "Calling conventions must match");  // register arguments are expected to be in place already; no move is emitted
+    }
+  }
+
+  ce->emit_static_call_stub();
+  if (ce->compilation()->bailed_out()) {
+    return; // CodeCache is full
+  }
+  int ret_addr_offset = __ patchable_call(SharedRuntime::get_resolve_static_call_stub(), relocInfo::static_call_type);
+  assert(ret_addr_offset == __ offset(), "embedded return address not allowed");
+  ce->add_call_info_here(info());
+  ce->verify_oop_map(info());
+  __ b(_continuation);  // branch to _continuation once the call has returned
+}
+
+/////////////////////////////////////////////////////////////////////////////
+#if INCLUDE_ALL_GCS
+
+void G1PreBarrierStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line part of the G1 pre-barrier
+  // At this point we know that marking is in progress.
+  // If do_load() is true then we have to emit the
+  // load of the previous value; otherwise it has already
+  // been loaded into _pre_val.
+
+  __ bind(_entry);
+  assert(pre_val()->is_register(), "Precondition.");
+
+  Register pre_val_reg = pre_val()->as_register();
+
+  if (do_load()) {
+    ce->mem2reg(addr(), pre_val(), T_OBJECT, patch_code(), info(), false /*wide*/, false /*unaligned*/);
+  }
+
+  __ cbz(pre_val_reg, _continuation);  // a zero previous value needs no barrier work: go straight to _continuation
+  ce->verify_reserved_argument_area_size(1);
+  __ str(pre_val_reg, Address(SP));  // the previous value is passed in the word at [SP]
+  __ call(Runtime1::entry_for(Runtime1::g1_pre_barrier_slow_id), relocInfo::runtime_call_type);
+
+  __ b(_continuation);
+}
+
+void G1PostBarrierStub::emit_code(LIR_Assembler* ce) {  // emits the out-of-line part of the G1 post-barrier
+  __ bind(_entry);
+  assert(addr()->is_register(), "Precondition.");
+  assert(new_val()->is_register(), "Precondition.");
+  Register new_val_reg = new_val()->as_register();
+  __ cbz(new_val_reg, _continuation);  // a zero new value needs no barrier work: go straight to _continuation
+  ce->verify_reserved_argument_area_size(1);
+  __ str(addr()->as_pointer_register(), Address(SP));  // the store address is passed in the word at [SP]
+  __ call(Runtime1::entry_for(Runtime1::g1_post_barrier_slow_id), relocInfo::runtime_call_type);
+  __ b(_continuation);
+}
+
+#endif // INCLUDE_ALL_GCS
+/////////////////////////////////////////////////////////////////////////////
+
+#undef __
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_Defs_arm.hpp	2016-12-02 11:17:57.044333770 -0500
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_C1_DEFS_ARM_HPP
+#define CPU_ARM_VM_C1_DEFS_ARM_HPP
+
+// native word offsets from memory address (little endian)
+enum {
+  pd_lo_word_offset_in_bytes = 0,
+  pd_hi_word_offset_in_bytes = BytesPerWord
+};
+
+// whether explicit rounding operations are required to implement the strictFP mode (set to false here)
+enum {
+  pd_strict_fp_requires_explicit_rounding = false
+};
+
+#ifdef __SOFTFP__
+#define SOFT(n) n        // with __SOFTFP__ defined, SOFT(x) expands to x ...
+#define VFP(n)           // ... and VFP(x) expands to nothing
+#else  // __SOFTFP__
+#define SOFT(n)          // without __SOFTFP__ the roles are swapped: SOFT(x) expands to nothing ...
+#define VFP(n)        n  // ... and VFP(x) expands to x
+#endif // __SOFTFP__
+
+
+// register counts and register-number ranges (core registers first, float registers after them)
+enum {
+  pd_nof_cpu_regs_frame_map             = AARCH64_ONLY(33) NOT_AARCH64(16), // number of registers used during code emission
+  pd_nof_caller_save_cpu_regs_frame_map = AARCH64_ONLY(27) NOT_AARCH64(10), // number of registers killed by calls
+  pd_nof_cpu_regs_reg_alloc             = AARCH64_ONLY(27) NOT_AARCH64(10), // number of registers that are visible to register allocator (including Rheap_base which is visible only if compressed pointers are not enabled)
+  pd_nof_cpu_regs_linearscan = pd_nof_cpu_regs_frame_map,                   // number of registers visible to linear scan
+  pd_nof_cpu_regs_processed_in_linearscan = pd_nof_cpu_regs_reg_alloc + 1,  // number of registers processed in linear scan; includes LR as it is used as temporary register in c1_LIRGenerator_arm
+  pd_first_cpu_reg = 0,
+  pd_last_cpu_reg  = pd_nof_cpu_regs_frame_map - 1,
+
+  pd_nof_fpu_regs_frame_map             = VFP(32) SOFT(0),                               // number of float registers used during code emission
+  pd_nof_caller_save_fpu_regs_frame_map = VFP(32) SOFT(0),                               // number of float registers killed by calls
+  pd_nof_fpu_regs_reg_alloc             = AARCH64_ONLY(32) NOT_AARCH64(VFP(30) SOFT(0)), // number of float registers that are visible to register allocator
+  pd_nof_fpu_regs_linearscan            = pd_nof_fpu_regs_frame_map,                     // number of float registers visible to linear scan
+  pd_first_fpu_reg = pd_nof_cpu_regs_frame_map,                                          // float register numbers start right after the core registers
+  pd_last_fpu_reg  = pd_first_fpu_reg + pd_nof_fpu_regs_frame_map - 1,
+
+  pd_nof_xmm_regs_linearscan = 0,  // NOTE(review): the xmm entries are all 0 / -1 here; presumably placeholders expected by shared C1 code - confirm
+  pd_nof_caller_save_xmm_regs = 0,
+  pd_first_xmm_reg = -1,
+  pd_last_xmm_reg  = -1
+};
+
+
+// encoding of float value in debug info:
+enum {
+  pd_float_saved_as_double = false
+};
+
+#ifdef AARCH64
+#define PATCHED_ADDR 0xff8
+#else
+#define PATCHED_ADDR (204)
+#endif
+#define CARDTABLEMODREF_POST_BARRIER_HELPER
+#define GENERATE_ADDRESS_IS_PREFERRED
+
+#endif // CPU_ARM_VM_C1_DEFS_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_FpuStackSim_arm.cpp	2016-12-02 11:18:02.372635941 -0500
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "c1/c1_FpuStackSim.hpp"
+#include "c1/c1_FrameMap.hpp"
+#include "utilities/array.hpp"
+#include "utilities/ostream.hpp"
+
+// Nothing needed here
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_FpuStackSim_arm.hpp	2016-12-02 11:18:07.816944689 -0500
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_C1_FPUSTACKSIM_ARM_HPP
+#define CPU_ARM_VM_C1_FPUSTACKSIM_ARM_HPP
+
+// Nothing needed here
+
+#endif // CPU_ARM_VM_C1_FPUSTACKSIM_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_FrameMap_arm.cpp	2016-12-02 11:18:13.185249126 -0500
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "c1/c1_FrameMap.hpp"
+#include "c1/c1_LIR.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "vmreg_arm.inline.hpp"
+
+LIR_Opr FrameMap::R0_opr;
+LIR_Opr FrameMap::R1_opr;
+LIR_Opr FrameMap::R2_opr;
+LIR_Opr FrameMap::R3_opr;
+LIR_Opr FrameMap::R4_opr;
+LIR_Opr FrameMap::R5_opr;
+
+LIR_Opr FrameMap::R0_oop_opr;
+LIR_Opr FrameMap::R1_oop_opr;
+LIR_Opr FrameMap::R2_oop_opr;
+LIR_Opr FrameMap::R3_oop_opr;
+LIR_Opr FrameMap::R4_oop_opr;
+LIR_Opr FrameMap::R5_oop_opr;
+
+LIR_Opr FrameMap::R0_metadata_opr;
+LIR_Opr FrameMap::R1_metadata_opr;
+LIR_Opr FrameMap::R2_metadata_opr;
+LIR_Opr FrameMap::R3_metadata_opr;
+LIR_Opr FrameMap::R4_metadata_opr;
+LIR_Opr FrameMap::R5_metadata_opr;
+
+#ifdef AARCH64
+LIR_Opr FrameMap::ZR_opr;
+#endif // AARCH64
+
+LIR_Opr FrameMap::LR_opr;
+LIR_Opr FrameMap::LR_oop_opr;
+LIR_Opr FrameMap::LR_ptr_opr;
+LIR_Opr FrameMap::FP_opr;
+LIR_Opr FrameMap::SP_opr;
+LIR_Opr FrameMap::Rthread_opr;
+
+LIR_Opr FrameMap::Int_result_opr;
+LIR_Opr FrameMap::Long_result_opr;
+LIR_Opr FrameMap::Object_result_opr;
+LIR_Opr FrameMap::Float_result_opr;
+LIR_Opr FrameMap::Double_result_opr;
+
+LIR_Opr FrameMap::Exception_oop_opr;
+LIR_Opr FrameMap::Exception_pc_opr;
+
+LIR_Opr FrameMap::_caller_save_cpu_regs[] = { 0 };  // filled in by FrameMap::initialize()
+LIR_Opr FrameMap::_caller_save_fpu_regs[];  // no initializer: static storage is zero-initialized, so this has the same effect as the line above
+
+LIR_Opr FrameMap::map_to_opr(BasicType type, VMRegPair* reg, bool) {
+  // Translate a calling-convention location (stack slot, core register or
+  // float register) into the matching LIR operand for the given type.
+  const VMReg lo = reg->first();
+  const VMReg hi = reg->second();
+  if (lo->is_stack()) {
+    const int byte_offset = (lo->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
+    return LIR_OprFact::address(new LIR_Address(SP_opr, byte_offset, type));
+  }
+  if (lo->is_Register()) {
+    const Register cpu_reg = lo->as_Register();
+    if (hi->is_Register() && (type == T_LONG || type == T_DOUBLE)) {
+#ifdef AARCH64
+      assert(lo->next() == hi, "should be the same");
+      return as_long_opr(cpu_reg);
+#else
+      return as_long_opr(cpu_reg, hi->as_Register());
+#endif
+    }
+    if (type == T_OBJECT || type == T_ARRAY) { return as_oop_opr(cpu_reg); }
+    if (type == T_METADATA)                  { return as_metadata_opr(cpu_reg); }
+    // PreferInterpreterNativeStubs should ensure we never need to
+    // handle a long opr passed as R3+stack_slot
+    assert(! hi->is_stack(), "missing support for ALIGN_WIDE_ARGUMENTS==0");
+    return as_opr(cpu_reg);
+  }
+  if (lo->is_FloatRegister()) {
+    const FloatRegister freg = lo->as_FloatRegister();
+    return (type == T_FLOAT) ? as_float_opr(freg) : as_double_opr(freg);
+  }
+  // Not a stack slot, core register or float register: nothing to map to.
+  ShouldNotReachHere();
+  return LIR_OprFact::illegalOpr;
+}
+
+
+void FrameMap::initialize() {  // one-time setup of the register numbering and of the predefined LIR operands
+  if (_init_done) return;  // later calls are no-ops
+
+  int i;
+  int rnum = 0;  // next register number to assign; the order of the map_register calls below defines the numbering
+
+  // Registers used for allocation
+#ifdef AARCH64
+  assert(Rthread == R28 && Rheap_base == R27 && Rtemp == R16, "change the code here");
+  for (i = 0; i < 16; i++) {
+    map_register(rnum++, as_Register(i));
+  }
+  for (i = 17; i < 28; i++) {  // register 16 is left out here; per the assert above it is Rtemp, which is mapped further down
+    map_register(rnum++, as_Register(i));
+  }
+#else
+  assert(Rthread == R10 && Rtemp == R12, "change the code here");
+  for (i = 0; i < 10; i++) {
+    map_register(rnum++, as_Register(i));
+  }
+#endif // AARCH64
+  assert(rnum == pd_nof_cpu_regs_reg_alloc, "should be");
+
+  // Registers not used for allocation
+  map_register(rnum++, LR); // LR register should be listed first, see c1_LinearScan_arm.hpp::is_processed_reg_num.
+  assert(rnum == pd_nof_cpu_regs_processed_in_linearscan, "should be");
+
+  map_register(rnum++, Rtemp);
+  map_register(rnum++, Rthread);
+  map_register(rnum++, FP); // ARM32: R7 or R11
+  map_register(rnum++, SP);
+#ifdef AARCH64
+  map_register(rnum++, ZR);
+#else
+  map_register(rnum++, PC);
+#endif
+  assert(rnum == pd_nof_cpu_regs_frame_map, "should be");
+
+  _init_done = true;
+
+  R0_opr  = as_opr(R0);   R0_oop_opr = as_oop_opr(R0);    R0_metadata_opr = as_metadata_opr(R0);
+  R1_opr  = as_opr(R1);   R1_oop_opr = as_oop_opr(R1);    R1_metadata_opr = as_metadata_opr(R1);
+  R2_opr  = as_opr(R2);   R2_oop_opr = as_oop_opr(R2);    R2_metadata_opr = as_metadata_opr(R2);
+  R3_opr  = as_opr(R3);   R3_oop_opr = as_oop_opr(R3);    R3_metadata_opr = as_metadata_opr(R3);
+  R4_opr  = as_opr(R4);   R4_oop_opr = as_oop_opr(R4);    R4_metadata_opr = as_metadata_opr(R4);
+  R5_opr  = as_opr(R5);   R5_oop_opr = as_oop_opr(R5);    R5_metadata_opr = as_metadata_opr(R5);
+
+#ifdef AARCH64
+  ZR_opr = as_opr(ZR);
+#endif // AARCH64
+
+  LR_opr      = as_opr(LR);
+  LR_oop_opr  = as_oop_opr(LR);
+  LR_ptr_opr  = as_pointer_opr(LR);
+  FP_opr      = as_pointer_opr(FP);
+  SP_opr      = as_pointer_opr(SP);
+  Rthread_opr = as_pointer_opr(Rthread);
+
+  // LIR operands for result
+  Int_result_opr = R0_opr;
+  Object_result_opr = R0_oop_opr;
+#ifdef AARCH64
+  Long_result_opr = as_long_opr(R0);
+  Float_result_opr = as_float_opr(S0);
+  Double_result_opr = as_double_opr(D0);
+#else
+  Long_result_opr = as_long_opr(R0, R1);
+#ifdef __ABI_HARD__
+  Float_result_opr = as_float_opr(S0);
+  Double_result_opr = as_double_opr(D0);
+#else
+  Float_result_opr = LIR_OprFact::single_softfp(0);
+  Double_result_opr = LIR_OprFact::double_softfp(0, 1);
+#endif // __ABI_HARD__
+#endif // AARCH64
+
+  Exception_oop_opr = as_oop_opr(Rexception_obj);
+  Exception_pc_opr = as_opr(Rexception_pc);
+
+  for (i = 0; i < nof_caller_save_cpu_regs(); i++) {  // caller-save core register operands are the single_cpu operands numbered 0 .. count-1
+    _caller_save_cpu_regs[i] = LIR_OprFact::single_cpu(i);
+  }
+  for (i = 0; i < nof_caller_save_fpu_regs; i++) {  // same scheme for float registers, using single_fpu operands
+    _caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i);
+  }
+}
+
+
+Address FrameMap::make_new_address(ByteSize sp_offset) const {  // builds the assembler Address for the given offset from SP
+  return Address(SP, sp_offset);
+}
+
+LIR_Opr FrameMap::stack_pointer() {  // returns the predefined SP operand
+  return FrameMap::SP_opr;
+}
+
+LIR_Opr FrameMap::method_handle_invoke_SP_save_opr() {  // returns FP_opr; the assert checks that Rmh_SP_save really is FP
+  assert(Rmh_SP_save == FP, "Fix register used for saving SP for MethodHandle calls");
+  return FP_opr;
+}
+
+bool FrameMap::validate_frame() {
+  // Find the largest offset in play: the frame size itself or the location of
+  // any incoming argument that lives on the stack.
+  int largest = in_bytes(framesize_in_bytes());
+  const int nof_args = _incoming_arguments->length();
+  for (int arg = 0, java_slot = 0; arg < nof_args; arg++) {
+    const LIR_Opr opr = _incoming_arguments->at(arg);
+    if (opr->is_stack()) {
+      const int location = _argument_locations->at(java_slot);
+      if (location > largest) largest = location;
+    }
+    java_slot += type2size[opr->type()];
+  }
+  return largest < AARCH64_ONLY(16384) NOT_AARCH64(4096); // TODO-AARCH64 check that LIRAssembler does not generate load/store of byte and half-word with SP as address base
+}
+
+VMReg FrameMap::fpu_regname(int n) {  // converts float register number n to its VMReg
+  return as_FloatRegister(n)->as_VMReg();
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_FrameMap_arm.hpp	2016-12-02 11:18:18.141530198 -0500
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_C1_FRAMEMAP_ARM_HPP
+#define CPU_ARM_VM_C1_FRAMEMAP_ARM_HPP
+
+ public:
+
+  enum {
+    first_available_sp_in_frame = 0,     // NOTE(review): presumably the first SP offset usable by C1 frames -- confirm against the shared FrameMap code
+    frame_pad_in_bytes = 2*wordSize      // Two words: accounts for FP/LR saved at build_frame().
+  };
+
+  static LIR_Opr R0_opr;             // predefined operands for R0..R5
+  static LIR_Opr R1_opr;
+  static LIR_Opr R2_opr;
+  static LIR_Opr R3_opr;
+  static LIR_Opr R4_opr;
+  static LIR_Opr R5_opr;
+  // add more predefined register oprs as needed
+
+  static LIR_Opr R0_oop_opr;         // NOTE(review): presumably oop-typed operands for R0..R5 -- confirm in the FrameMap initialization
+  static LIR_Opr R1_oop_opr;
+  static LIR_Opr R2_oop_opr;
+  static LIR_Opr R3_oop_opr;
+  static LIR_Opr R4_oop_opr;
+  static LIR_Opr R5_oop_opr;
+
+  static LIR_Opr R0_metadata_opr;    // NOTE(review): presumably metadata-typed operands for R0..R5 -- confirm
+  static LIR_Opr R1_metadata_opr;
+  static LIR_Opr R2_metadata_opr;
+  static LIR_Opr R3_metadata_opr;
+  static LIR_Opr R4_metadata_opr;
+  static LIR_Opr R5_metadata_opr;
+
+#ifdef AARCH64
+  static LIR_Opr ZR_opr;             // declared for AArch64 builds only
+#endif // AARCH64
+
+  static LIR_Opr LR_opr;
+  static LIR_Opr LR_oop_opr;
+  static LIR_Opr LR_ptr_opr;
+
+  static LIR_Opr FP_opr;             // also handed out by method_handle_invoke_SP_save_opr()
+  static LIR_Opr SP_opr;             // handed out by stack_pointer()
+  static LIR_Opr Rthread_opr;
+
+  static LIR_Opr Int_result_opr;
+  static LIR_Opr Long_result_opr;    // R0 on AArch64, the R0/R1 pair on 32-bit ARM
+  static LIR_Opr Object_result_opr;  // initialized to R0_oop_opr
+  static LIR_Opr Float_result_opr;   // S0 with AARCH64 or __ABI_HARD__, single_softfp(0) otherwise
+  static LIR_Opr Double_result_opr;  // D0 with AARCH64 or __ABI_HARD__, double_softfp(0, 1) otherwise
+
+  static LIR_Opr Exception_oop_opr;  // built from Rexception_obj
+  static LIR_Opr Exception_pc_opr;   // built from Rexception_pc
+
+#ifdef AARCH64
+  static LIR_Opr as_long_opr(Register r) {  // AArch64: r is passed for both halves of the double_cpu operand
+    return LIR_OprFact::double_cpu(cpu_reg2rnr(r), cpu_reg2rnr(r));
+  }
+
+  static LIR_Opr as_pointer_opr(Register r) {  // AArch64: same double_cpu encoding as as_long_opr
+    return LIR_OprFact::double_cpu(cpu_reg2rnr(r), cpu_reg2rnr(r));
+  }
+
+  static LIR_Opr as_double_opr(FloatRegister r) {  // AArch64: a single FloatRegister encoding describes the double
+    return LIR_OprFact::double_fpu(r->encoding());
+  }
+#else
+  static LIR_Opr as_long_opr(Register r, Register r2) {  // 32-bit ARM: a long occupies the register pair (r, r2)
+    return LIR_OprFact::double_cpu(cpu_reg2rnr(r), cpu_reg2rnr(r2));
+  }
+
+  static LIR_Opr as_pointer_opr(Register r) {  // 32-bit ARM: a pointer is a single_cpu operand
+    return LIR_OprFact::single_cpu(cpu_reg2rnr(r));
+  }
+
+  static LIR_Opr as_double_opr(FloatRegister r) {  // 32-bit ARM: r and its successor() together describe the double
+    return LIR_OprFact::double_fpu(r->encoding(), r->successor()->encoding());
+  }
+#endif // AARCH64
+
+  static LIR_Opr as_float_opr(FloatRegister r) {  // single_fpu operand for r, on both ARM flavours
+    return LIR_OprFact::single_fpu(r->encoding());
+  }
+
+  static VMReg fpu_regname(int n);  // declaration only; the body lives out of line as FrameMap::fpu_regname
+
+  static bool is_caller_save_register(LIR_Opr opr) {  // opr is ignored: every operand is reported as caller-saved
+    return true;
+  }
+
+  static int adjust_reg_range(int range) {
+    // With compressed oops or class pointers Rheap_base must stay free, so one register fewer is available.
+    const bool heap_base_reserved = UseCompressedOops || UseCompressedClassPointers;
+    return heap_base_reserved ? range - 1 : range;
+  }
+
+  static int nof_caller_save_cpu_regs() {  // platform constant, passed through adjust_reg_range()
+    return adjust_reg_range(pd_nof_caller_save_cpu_regs_frame_map);
+  }
+
+  static int last_cpu_reg() {  // returns the platform constant pd_last_cpu_reg unchanged
+    return pd_last_cpu_reg;
+  }
+
+#endif // CPU_ARM_VM_C1_FRAMEMAP_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_LIRAssembler_arm.cpp	2016-12-02 11:18:23.561837584 -0500
@@ -0,0 +1,3610 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "c1/c1_Compilation.hpp"
+#include "c1/c1_LIRAssembler.hpp"
+#include "c1/c1_MacroAssembler.hpp"
+#include "c1/c1_Runtime1.hpp"
+#include "c1/c1_ValueStack.hpp"
+#include "ci/ciArrayKlass.hpp"
+#include "ci/ciInstance.hpp"
+#include "gc/shared/barrierSet.hpp"
+#include "gc/shared/cardTableModRefBS.hpp"
+#include "gc/shared/collectedHeap.hpp"
+#include "nativeInst_arm.hpp"
+#include "oops/objArrayKlass.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "vmreg_arm.inline.hpp"
+
+#define __ _masm->
+
+// Note: Rtemp usage in this file should not impact C2 and should be
+// correct as long as it is not implicitly used in lower layers (the
+// arm [macro]assembler) and is used with care in the other C1 specific
+// files.
+
+bool LIR_Assembler::is_small_constant(LIR_Opr opr) {
+  ShouldNotCallThis(); // Not used on ARM
+  return false;        // present only so the function has a return value
+}
+
+
+LIR_Opr LIR_Assembler::receiverOpr() {
+  // The receiver is passed in the first register of the Java calling convention (R0, as an oop operand)
+  return FrameMap::R0_oop_opr;
+}
+
+LIR_Opr LIR_Assembler::osrBufferPointer() {  // the OSR buffer pointer is expected in R0, as a pointer operand
+  return FrameMap::as_pointer_opr(R0);
+}
+
+#ifndef PRODUCT
+void LIR_Assembler::verify_reserved_argument_area_size(int args_count) {  // non-PRODUCT check: args_count words must fit the reserved area
+  assert(args_count * wordSize <= frame_map()->reserved_argument_area_size(), "not enough space for arguments");
+}
+#endif // !PRODUCT
+
+void LIR_Assembler::store_parameter(jint c, int offset_from_sp_in_words) {  // stores constant c at SP + offset (given in words)
+  assert(offset_from_sp_in_words >= 0, "invalid offset from sp");
+  int offset_from_sp_in_bytes = offset_from_sp_in_words * BytesPerWord;
+  assert(offset_from_sp_in_bytes < frame_map()->reserved_argument_area_size(), "not enough space");
+  __ mov_slow(Rtemp, c);                                // materialize the constant in the scratch register
+  __ str(Rtemp, Address(SP, offset_from_sp_in_bytes));  // then store it SP-relative
+}
+
+void LIR_Assembler::store_parameter(Metadata* m, int offset_from_sp_in_words) {  // stores metadata pointer m at SP + offset (given in words)
+  assert(offset_from_sp_in_words >= 0, "invalid offset from sp");
+  int offset_from_sp_in_bytes = offset_from_sp_in_words * BytesPerWord;
+  assert(offset_from_sp_in_bytes < frame_map()->reserved_argument_area_size(), "not enough space");
+  __ mov_metadata(Rtemp, m);                            // load the metadata pointer into the scratch register
+  __ str(Rtemp, Address(SP, offset_from_sp_in_bytes));  // then store it SP-relative
+}
+
+//--------------fpu register translations-----------------------
+
+
+void LIR_Assembler::set_24bit_FPU() {  // no ARM implementation; not expected to be reached
+  ShouldNotReachHere();
+}
+
+void LIR_Assembler::reset_FPU() {  // no ARM implementation; not expected to be reached
+  ShouldNotReachHere();
+}
+
+void LIR_Assembler::fpop() {  // not implemented in this port
+  Unimplemented();
+}
+
+void LIR_Assembler::fxch(int i) {  // not implemented in this port; i is unused
+  Unimplemented();
+}
+
+void LIR_Assembler::fld(int i) {  // not implemented in this port; i is unused
+  Unimplemented();
+}
+
+void LIR_Assembler::ffree(int i) {  // not implemented in this port; i is unused
+  Unimplemented();
+}
+
+void LIR_Assembler::breakpoint() {  // forwards to the macro assembler's breakpoint()
+  __ breakpoint();
+}
+
+void LIR_Assembler::push(LIR_Opr opr) {  // not implemented in this port; opr is unused
+  Unimplemented();
+}
+
+void LIR_Assembler::pop(LIR_Opr opr) {  // not implemented in this port; opr is unused
+  Unimplemented();
+}
+
+//-------------------------------------------
+Address LIR_Assembler::as_Address(LIR_Address* addr) {  // LIR_Address -> machine Address; bails out if the displacement cannot be used
+  Register base = addr->base()->as_pointer_register();
+
+#ifdef AARCH64
+  int align = exact_log2(type2aelembytes(addr->type(), true));  // log2 of the access size, used by the range and scale checks below
+#endif
+
+  if (addr->index()->is_illegal() || addr->index()->is_constant()) {  // base + displacement form
+    int offset = addr->disp();
+    if (addr->index()->is_constant()) {
+      offset += addr->index()->as_constant_ptr()->as_jint() << addr->scale();  // fold the scaled constant index into the displacement
+    }
+
+#ifdef AARCH64
+    if (!Assembler::is_unsigned_imm_in_range(offset, 12, align) && !Assembler::is_imm_in_range(offset, 9, 0)) {  // neither immediate form fits
+      BAILOUT_("offset not in range", Address(base));
+    }
+    assert(UseUnalignedAccesses || (offset & right_n_bits(align)) == 0, "offset should be aligned");
+#else
+    if ((offset <= -4096) || (offset >= 4096)) {  // 32-bit ARM: only displacements strictly between -4096 and 4096 are accepted
+      BAILOUT_("offset not in range", Address(base));
+    }
+#endif // AARCH64
+
+    return Address(base, offset);
+
+  } else {  // base + index register form; a displacement is not allowed here
+    assert(addr->disp() == 0, "can't have both");
+    int scale = addr->scale();
+
+#ifdef AARCH64
+    assert((scale == 0) || (scale == align), "scale should be zero or equal to embedded shift");
+
+    bool is_index_extended = (addr->index()->type() == T_INT);  // a T_INT index uses the ex_sxtw form, any other index uses ex_lsl
+    if (is_index_extended) {
+      assert(addr->index()->is_single_cpu(), "should be");
+      return Address(base, addr->index()->as_register(), ex_sxtw, scale);
+    } else {
+      assert(addr->index()->is_double_cpu(), "should be");
+      return Address(base, addr->index()->as_register_lo(), ex_lsl, scale);
+    }
+#else
+    assert(addr->index()->is_single_cpu(), "should be");
+    return scale >= 0 ? Address(base, addr->index()->as_register(), lsl, scale) :   // non-negative scale: shift the index left
+                        Address(base, addr->index()->as_register(), lsr, -scale);   // negative scale: shift right by its magnitude
+#endif // AARCH64
+  }
+}
+
+Address LIR_Assembler::as_Address_hi(LIR_Address* addr) {  // address one word (BytesPerWord) above as_Address(addr); 32-bit ARM only
+#ifdef AARCH64
+  ShouldNotCallThis(); // Not used on AArch64
+  return Address();
+#else
+  Address base = as_Address(addr);
+  assert(base.index() == noreg, "must be");  // only the base + displacement form can be advanced by a word
+  if (base.disp() + BytesPerWord >= 4096) { BAILOUT_("offset not in range", Address(base.base(),0)); }  // same upper limit as as_Address()
+  return Address(base.base(), base.disp() + BytesPerWord);
+#endif // AARCH64
+}
+
+Address LIR_Assembler::as_Address_lo(LIR_Address* addr) {  // counterpart of as_Address_hi: the unmodified address; 32-bit ARM only
+#ifdef AARCH64
+  ShouldNotCallThis(); // Not used on AArch64
+  return Address();
+#else
+  return as_Address(addr);
+#endif // AARCH64
+}
+
+
+void LIR_Assembler::osr_entry() {
+  // Record the OSR entry offset first, then build the frame and copy the monitors over.
+  offsets()->set_value(CodeOffsets::OSR_Entry, code_offset());
+  const int lock_count = compilation()->hir()->osr_entry()->end()->state()->locks_size();
+
+  __ build_frame(initial_frame_size_in_bytes(), bang_size_in_bytes());
+  const Register buf = osrBufferPointer()->as_pointer_register();
+
+  assert(frame::interpreter_frame_monitor_size() == BasicObjectLock::size(), "adjust code below");
+  // Every monitor takes two words in the OSR buffer; lock 0 sits at the highest
+  // offset and each following lock two words below the previous one.
+  int slot = (method()->max_locals() + 2 * (lock_count - 1)) * BytesPerWord;
+  for (int lock = 0; lock < lock_count; lock++, slot -= 2 * BytesPerWord) {
+    __ ldr(R1, Address(buf, slot));
+    __ ldr(R2, Address(buf, slot + BytesPerWord));
+    __ str(R1, frame_map()->address_for_monitor_lock(lock));
+    __ str(R2, frame_map()->address_for_monitor_object(lock));
+  }
+}
+
+
+int LIR_Assembler::check_icache() {
+  // Emits the inline cache check on the receiver and reports the offset where it begins.
+  const int check_start = __ offset();
+  __ inline_cache_check(receiverOpr()->as_register(), Ricklass);
+  return check_start;
+}
+
+
+void LIR_Assembler::jobject2reg_with_patching(Register reg, CodeEmitInfo* info) {
+  // Emit a patchable load of a placeholder oop into reg. The stub is created
+  // before the load is emitted and handed to patching_epilog() afterwards.
+  const jobject placeholder = (jobject)Universe::non_oop_word();
+  const int oop_index = __ oop_recorder()->allocate_oop_index(placeholder);
+  PatchingStub* const stub = new PatchingStub(_masm, patching_id(info), oop_index);
+  __ patchable_mov_oop(reg, placeholder, oop_index);
+  patching_epilog(stub, lir_patch_normal, reg, info);
+}
+
+
+void LIR_Assembler::klass2reg_with_patching(Register reg, CodeEmitInfo* info) {
+  // Emit a patchable load of a placeholder Metadata* into reg, tracked by a load_klass patching stub.
+  Metadata* const placeholder = (Metadata*)Universe::non_oop_word();
+  const int md_index = __ oop_recorder()->allocate_metadata_index(placeholder);
+  PatchingStub* const stub = new PatchingStub(_masm, PatchingStub::load_klass_id, md_index);
+  __ patchable_mov_metadata(reg, placeholder, md_index);
+  patching_epilog(stub, lir_patch_normal, reg, info);
+}
+
+
+int LIR_Assembler::initial_frame_size_in_bytes() const {
+  const int whole_frame = frame_map()->framesize() * VMRegImpl::stack_slot_size;
+  return whole_frame - 2 * wordSize;  // the return address and link words are not counted
+}
+
+
+int LIR_Assembler::emit_exception_handler() {  // emits the exception handler stub; returns its code offset, or -1 after a bailout
+  // TODO: ARM
+  __ nop(); // See comments in other ports
+
+  address handler_base = __ start_a_stub(exception_handler_size);
+  if (handler_base == NULL) {  // no room for the stub
+    bailout("exception handler overflow");
+    return -1;
+  }
+
+  int offset = code_offset();  // start of the handler, returned to the caller
+
+  // check that there is really an exception
+  __ verify_not_null_oop(Rexception_obj);
+
+  __ call(Runtime1::entry_for(Runtime1::handle_exception_from_callee_id), relocInfo::runtime_call_type);
+  __ should_not_reach_here();  // the runtime call above is not expected to return here
+
+  assert(code_offset() - offset <= exception_handler_size, "overflow");
+  __ end_a_stub();
+
+  return offset;
+}
+
+// Emit the code to remove the frame from the stack in the exception
+// unwind path. Returns the code offset at which the handler starts.
+int LIR_Assembler::emit_unwind_handler() {
+#ifndef PRODUCT
+  if (CommentedAssembly) {
+    _masm->block_comment("Unwind handler");
+  }
+#endif
+
+  int offset = code_offset();
+
+  // Fetch the exception from TLS and clear out exception related thread state
+  Register zero = __ zero_register(Rtemp);
+  __ ldr(Rexception_obj, Address(Rthread, JavaThread::exception_oop_offset()));
+  __ str(zero, Address(Rthread, JavaThread::exception_oop_offset()));
+  __ str(zero, Address(Rthread, JavaThread::exception_pc_offset()));
+
+  __ bind(_unwind_handler_entry);  // bound after the TLS fetch above, so entering here skips it
+  __ verify_not_null_oop(Rexception_obj);
+
+  // Perform needed unlocking
+  MonitorExitStub* stub = NULL;
+  if (method()->is_synchronized()) {
+    monitor_address(0, FrameMap::R0_opr);  // address of monitor 0 goes into R0
+    stub = new MonitorExitStub(FrameMap::R0_opr, true, 0);
+    __ unlock_object(R2, R1, R0, Rtemp, *stub->entry());
+    __ bind(*stub->continuation());
+  }
+
+  // remove the activation and dispatch to the unwind handler
+  __ remove_frame(initial_frame_size_in_bytes()); // restores FP and LR
+  __ jump(Runtime1::entry_for(Runtime1::unwind_exception_id), relocInfo::runtime_call_type, Rtemp);
+
+  // Emit the slow path assembly
+  if (stub != NULL) {
+    stub->emit_code(this);
+  }
+
+  return offset;
+}
+
+
+int LIR_Assembler::emit_deopt_handler() {  // emits the deopt handler stub; returns its code offset, or -1 after a bailout
+  address handler_base = __ start_a_stub(deopt_handler_size);
+  if (handler_base == NULL) {  // no room for the stub
+    bailout("deopt handler overflow");
+    return -1;
+  }
+
+  int offset = code_offset();  // start of the handler, returned to the caller
+
+  __ mov_relative_address(LR, __ pc());  // LR receives the current code position
+#ifdef AARCH64
+  __ raw_push(LR, LR);
+  __ jump(SharedRuntime::deopt_blob()->unpack(), relocInfo::runtime_call_type, Rtemp);
+#else
+  __ push(LR); // stub expects LR to be saved
+  __ jump(SharedRuntime::deopt_blob()->unpack(), relocInfo::runtime_call_type, noreg);
+#endif // AARCH64
+
+  assert(code_offset() - offset <= deopt_handler_size, "overflow");
+  __ end_a_stub();
+
+  return offset;
+}
+
+
+void LIR_Assembler::return_op(LIR_Opr result) {  // method return: pop the frame, poll for a safepoint, return; result is not read here
+  // Pop the frame before safepoint polling
+  __ remove_frame(initial_frame_size_in_bytes());
+
+  // mov_slow here is usually one or two instructions
+  // TODO-AARCH64 3 instructions on AArch64, so try to load polling page by ldr_literal
+  __ mov_address(Rtemp, os::get_polling_page(), symbolic_Relocation::polling_page_reference);
+  __ relocate(relocInfo::poll_return_type);  // marks the load below as the return poll
+  __ ldr(Rtemp, Address(Rtemp));             // the poll itself: a load from the polling page
+  __ ret();
+}
+
+
+int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) {  // emits a safepoint poll; returns the offset of the polling load; tmp is unused
+  __ mov_address(Rtemp, os::get_polling_page(), symbolic_Relocation::polling_page_reference);
+  if (info != NULL) {
+    add_debug_info_for_branch(info);
+  }
+  int offset = __ offset();           // taken after the address is materialized, right before the load
+  __ relocate(relocInfo::poll_type);
+  __ ldr(Rtemp, Address(Rtemp));      // the poll itself: a load from the polling page
+  return offset;
+}
+
+
+void LIR_Assembler::move_regs(Register from_reg, Register to_reg) {
+  // Moving a register onto itself needs no instruction.
+  if (from_reg == to_reg) return;
+  __ mov(to_reg, from_reg);
+}
+
+void LIR_Assembler::const2reg(LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info) {  // loads constant src into register dest
+  assert(src->is_constant() && dest->is_register(), "must be");
+  LIR_Const* c = src->as_constant_ptr();
+
+  switch (c->type()) {  // dispatch on the constant's type; only T_OBJECT and T_METADATA support patching
+    case T_ADDRESS:
+    case T_INT:
+      assert(patch_code == lir_patch_none, "no patching handled here");
+      __ mov_slow(dest->as_register(), c->as_jint());
+      break;
+
+    case T_LONG:
+      assert(patch_code == lir_patch_none, "no patching handled here");
+#ifdef AARCH64
+      __ mov_slow(dest->as_pointer_register(), (intptr_t)c->as_jlong());
+#else
+      __ mov_slow(dest->as_register_lo(), c->as_jint_lo());  // 32-bit ARM: the two halves are loaded separately
+      __ mov_slow(dest->as_register_hi(), c->as_jint_hi());
+#endif // AARCH64
+      break;
+
+    case T_OBJECT:
+      if (patch_code == lir_patch_none) {
+        __ mov_oop(dest->as_register(), c->as_jobject());
+      } else {
+        jobject2reg_with_patching(dest->as_register(), info);  // the actual oop is filled in by patching
+      }
+      break;
+
+    case T_METADATA:
+      if (patch_code == lir_patch_none) {
+        __ mov_metadata(dest->as_register(), c->as_metadata());
+      } else {
+        klass2reg_with_patching(dest->as_register(), info);  // the actual metadata is filled in by patching
+      }
+      break;
+
+    case T_FLOAT:
+      if (dest->is_single_fpu()) {
+        __ mov_float(dest->as_float_reg(), c->as_jfloat());
+      } else {
+#ifdef AARCH64
+        ShouldNotReachHere();
+#else
+        // Simple getters can return float constant directly into r0
+        __ mov_slow(dest->as_register(), c->as_jint_bits());
+#endif // AARCH64
+      }
+      break;
+
+    case T_DOUBLE:
+      if (dest->is_double_fpu()) {
+        __ mov_double(dest->as_double_reg(), c->as_jdouble());
+      } else {
+#ifdef AARCH64
+        ShouldNotReachHere();
+#else
+        // Simple getters can return double constant directly into r1r0
+        __ mov_slow(dest->as_register_lo(), c->as_jint_lo_bits());
+        __ mov_slow(dest->as_register_hi(), c->as_jint_hi_bits());
+#endif // AARCH64
+      }
+      break;
+
+    default:
+      ShouldNotReachHere();
+  }
+}
+
+void LIR_Assembler::const2stack(LIR_Opr src, LIR_Opr dest) {  // stores constant src into stack slot dest, always via Rtemp
+  assert(src->is_constant(), "must be");
+  assert(dest->is_stack(), "must be");
+  LIR_Const* c = src->as_constant_ptr();
+
+  switch (c->type()) {
+    case T_INT:  // fall through
+    case T_FLOAT:
+      __ mov_slow(Rtemp, c->as_jint_bits());  // a float is stored as its raw 32-bit pattern
+      __ str_32(Rtemp, frame_map()->address_for_slot(dest->single_stack_ix()));
+      break;
+
+    case T_ADDRESS:
+      __ mov_slow(Rtemp, c->as_jint());
+      __ str(Rtemp, frame_map()->address_for_slot(dest->single_stack_ix()));
+      break;
+
+    case T_OBJECT:
+      __ mov_oop(Rtemp, c->as_jobject());
+      __ str(Rtemp, frame_map()->address_for_slot(dest->single_stack_ix()));
+      break;
+
+    case T_LONG:  // fall through
+    case T_DOUBLE:
+#ifdef AARCH64
+      __ mov_slow(Rtemp, c->as_jlong_bits());
+      __ str(Rtemp, frame_map()->address_for_slot(dest->double_stack_ix()));
+#else
+      __ mov_slow(Rtemp, c->as_jint_lo_bits());
+      __ str(Rtemp, frame_map()->address_for_slot(dest->double_stack_ix(), lo_word_offset_in_bytes));
+      if (c->as_jint_hi_bits() != c->as_jint_lo_bits()) {  // when both halves are equal Rtemp already holds the high word
+        __ mov_slow(Rtemp, c->as_jint_hi_bits());
+      }
+      __ str(Rtemp, frame_map()->address_for_slot(dest->double_stack_ix(), hi_word_offset_in_bytes));
+#endif // AARCH64
+      break;
+
+    default:
+      ShouldNotReachHere();
+  }
+}
+
+void LIR_Assembler::const2mem(LIR_Opr src, LIR_Opr dest, BasicType type,
+                              CodeEmitInfo* info, bool wide) {  // stores a zero/null constant to memory; other constants are rejected by the asserts
+#ifdef AARCH64
+  assert((src->as_constant_ptr()->type() == T_OBJECT && src->as_constant_ptr()->as_jobject() == NULL) ||
+         (src->as_constant_ptr()->type() == T_INT && src->as_constant_ptr()->as_jint() == 0) ||
+         (src->as_constant_ptr()->type() == T_LONG && src->as_constant_ptr()->as_jlong() == 0) ||
+         (src->as_constant_ptr()->type() == T_FLOAT && src->as_constant_ptr()->as_jint_bits() == 0) ||
+         (src->as_constant_ptr()->type() == T_DOUBLE && src->as_constant_ptr()->as_jlong_bits() == 0),
+        "cannot handle otherwise");
+  assert(dest->as_address_ptr()->type() == type, "should be");
+
+  Address addr = as_Address(dest->as_address_ptr());
+  int null_check_offset = code_offset();  // offset of the store, used for the implicit null check below
+  switch (type) {  // every case stores ZR; only the store width differs
+    case T_OBJECT:  // fall through
+    case T_ARRAY:
+        if (UseCompressedOops && !wide) {  // compressed oop field: 32-bit store
+          __ str_w(ZR, addr);
+        } else {
+          __ str(ZR, addr);
+        }
+        break;
+    case T_ADDRESS: // fall through
+    case T_DOUBLE:  // fall through
+    case T_LONG:    __ str(ZR, addr);   break;
+    case T_FLOAT:   // fall through
+    case T_INT:     __ str_w(ZR, addr); break;
+    case T_BOOLEAN: // fall through
+    case T_BYTE:    __ strb(ZR, addr);  break;
+    case T_CHAR:    // fall through
+    case T_SHORT:   __ strh(ZR, addr);  break;
+    default: ShouldNotReachHere();
+  }
+#else
+  assert((src->as_constant_ptr()->type() == T_OBJECT && src->as_constant_ptr()->as_jobject() == NULL),"cannot handle otherwise");
+  __ mov(Rtemp, 0);  // 32-bit ARM: only a null oop is supported; zero is materialized in Rtemp
+
+  int null_check_offset = code_offset();  // offset of the store, used for the implicit null check below
+  __ str(Rtemp, as_Address(dest->as_address_ptr()));
+#endif // AARCH64
+
+  if (info != NULL) {
+#ifndef AARCH64
+    assert(false, "arm32 didn't support this before, investigate if bug");
+#endif
+    add_debug_info_for_null_check(null_check_offset, info);
+  }
+}
+
+void LIR_Assembler::reg2reg(LIR_Opr src, LIR_Opr dest) {  // register-to-register move, dispatched on the kinds of src and dest
+  assert(src->is_register() && dest->is_register(), "must be");
+
+  if (src->is_single_cpu()) {
+    if (dest->is_single_cpu()) {
+      move_regs(src->as_register(), dest->as_register());
+#ifdef AARCH64
+    } else if (dest->is_double_cpu()) {  // AArch64 only: oop/array/address values may move into a double_cpu operand
+      assert ((src->type() == T_OBJECT) || (src->type() == T_ARRAY) || (src->type() == T_ADDRESS), "invalid src type");
+      move_regs(src->as_register(), dest->as_register_lo());
+#else
+    } else if (dest->is_single_fpu()) {  // 32-bit ARM only: core register -> single-precision FP register
+      __ fmsr(dest->as_float_reg(), src->as_register());
+#endif // AARCH64
+    } else {
+      ShouldNotReachHere();
+    }
+  } else if (src->is_double_cpu()) {
+#ifdef AARCH64
+    move_regs(src->as_register_lo(), dest->as_register_lo());
+#else
+    if (dest->is_double_cpu()) {
+      __ long_move(dest->as_register_lo(), dest->as_register_hi(), src->as_register_lo(), src->as_register_hi());
+    } else {  // any other destination is treated as a double-precision FP register
+      __ fmdrr(dest->as_double_reg(), src->as_register_lo(), src->as_register_hi());
+    }
+#endif // AARCH64
+  } else if (src->is_single_fpu()) {
+    if (dest->is_single_fpu()) {
+      __ mov_float(dest->as_float_reg(), src->as_float_reg());
+    } else if (dest->is_single_cpu()) {
+      __ mov_fpr2gpr_float(dest->as_register(), src->as_float_reg());
+    } else {
+      ShouldNotReachHere();
+    }
+  } else if (src->is_double_fpu()) {
+    if (dest->is_double_fpu()) {
+      __ mov_double(dest->as_double_reg(), src->as_double_reg());
+    } else if (dest->is_double_cpu()) {
+#ifdef AARCH64
+      __ fmov_xd(dest->as_register_lo(), src->as_double_reg());
+#else
+      __ fmrrd(dest->as_register_lo(), dest->as_register_hi(), src->as_double_reg());
+#endif // AARCH64
+    } else {
+      ShouldNotReachHere();
+    }
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+void LIR_Assembler::reg2stack(LIR_Opr src, LIR_Opr dest, BasicType type, bool pop_fpu_stack) {  // spills register src to stack slot dest; pop_fpu_stack is unused
+  assert(src->is_register(), "should not call otherwise");
+  assert(dest->is_stack(), "should not call otherwise");
+
+  Address addr = dest->is_single_word() ?
+    frame_map()->address_for_slot(dest->single_stack_ix()) :
+    frame_map()->address_for_slot(dest->double_stack_ix());
+
+#ifndef AARCH64
+  assert(lo_word_offset_in_bytes == 0 && hi_word_offset_in_bytes == 4, "little ending");
+  if (src->is_single_fpu() || src->is_double_fpu()) {
+    if (addr.disp() >= 1024) { BAILOUT("Too exotic case to handle here"); }  // FP stores at a displacement of 1024 or more are not handled
+  }
+#endif // !AARCH64
+
+  if (src->is_single_cpu()) {
+    switch (type) {
+      case T_OBJECT:
+      case T_ARRAY:    __ verify_oop(src->as_register());   // fall through
+      case T_ADDRESS:
+      case T_METADATA: __ str(src->as_register(), addr);    break;
+      case T_FLOAT:    // used in intBitsToFloat intrinsic implementation, fall through
+      case T_INT:      __ str_32(src->as_register(), addr); break;
+      default:
+        ShouldNotReachHere();
+    }
+  } else if (src->is_double_cpu()) {
+    __ str(src->as_register_lo(), addr);
+#ifndef AARCH64
+    __ str(src->as_register_hi(), frame_map()->address_for_slot(dest->double_stack_ix(), hi_word_offset_in_bytes));  // second word of the pair
+#endif // !AARCH64
+  } else if (src->is_single_fpu()) {
+    __ str_float(src->as_float_reg(), addr);
+  } else if (src->is_double_fpu()) {
+    __ str_double(src->as_double_reg(), addr);
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+
+void LIR_Assembler::reg2mem(LIR_Opr src, LIR_Opr dest, BasicType type,
+                            LIR_PatchCode patch_code, CodeEmitInfo* info,
+                            bool pop_fpu_stack, bool wide,
+                            bool unaligned) {  // stores register src to memory operand dest, with optional field patching
+  LIR_Address* to_addr = dest->as_address_ptr();
+  Register base_reg = to_addr->base()->as_pointer_register();
+  const bool needs_patching = (patch_code != lir_patch_none);
+
+  PatchingStub* patch = NULL;
+  if (needs_patching) {
+#ifdef AARCH64
+    // Same alignment of reg2mem code and PatchingStub code. Required to make copied bind_literal() code properly aligned.
+    __ align(wordSize);
+#endif
+    patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+#ifdef AARCH64
+    // Extra nop for MT safe patching
+    __ nop();
+#endif // AARCH64
+  }
+
+  int null_check_offset = code_offset();  // offset recorded for the implicit null check; moved past the encode for compressed oops
+
+  switch (type) {
+    case T_ARRAY:
+    case T_OBJECT:
+      if (UseCompressedOops && !wide) {
+#ifdef AARCH64
+        const Register temp_src = Rtemp;
+        assert_different_registers(temp_src, src->as_register());
+        __ encode_heap_oop(temp_src, src->as_register());
+        null_check_offset = code_offset();  // the store, not the encode, is the instruction that may fault
+        __ str_32(temp_src, as_Address(to_addr));
+#else
+        ShouldNotReachHere();
+#endif // AARCH64
+      } else {
+        __ str(src->as_register(), as_Address(to_addr));
+      }
+      break;
+
+    case T_ADDRESS:
+#ifdef AARCH64
+    case T_LONG:
+#endif // AARCH64
+      __ str(src->as_pointer_register(), as_Address(to_addr));
+      break;
+
+    case T_BYTE:
+    case T_BOOLEAN:
+      __ strb(src->as_register(), as_Address(to_addr));
+      break;
+
+    case T_CHAR:
+    case T_SHORT:
+      __ strh(src->as_register(), as_Address(to_addr));
+      break;
+
+    case T_INT:
+#ifdef __SOFTFP__
+    case T_FLOAT:
+#endif // __SOFTFP__
+      __ str_32(src->as_register(), as_Address(to_addr));
+      break;
+
+#ifdef AARCH64
+
+    case T_FLOAT:
+      __ str_s(src->as_float_reg(), as_Address(to_addr));
+      break;
+
+    case T_DOUBLE:
+      __ str_d(src->as_double_reg(), as_Address(to_addr));
+      break;
+
+#else // AARCH64
+
+#ifdef __SOFTFP__
+    case T_DOUBLE:
+#endif // __SOFTFP__
+    case T_LONG: {  // 32-bit ARM: two word stores; with patching each store gets its own PatchingStub
+      Register from_lo = src->as_register_lo();
+      Register from_hi = src->as_register_hi();
+      if (to_addr->index()->is_register()) {
+        assert(to_addr->scale() == LIR_Address::times_1,"Unexpected scaled register");
+        assert(to_addr->disp() == 0, "Not yet supporting both");
+        __ add(Rtemp, base_reg, to_addr->index()->as_register());  // fold base + index into Rtemp
+        base_reg = Rtemp;
+        __ str(from_lo, Address(Rtemp));
+        if (patch != NULL) {
+          patching_epilog(patch, lir_patch_low, base_reg, info);
+          patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+          patch_code = lir_patch_high;
+        }
+        __ str(from_hi, Address(Rtemp, BytesPerWord));
+      } else if (base_reg == from_lo) {  // base register is also the low source register: the high word is stored first
+        __ str(from_hi, as_Address_hi(to_addr));
+        if (patch != NULL) {
+          patching_epilog(patch, lir_patch_high, base_reg, info);
+          patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+          patch_code = lir_patch_low;
+        }
+        __ str(from_lo, as_Address_lo(to_addr));
+      } else {  // default order: low word first, then high word
+        __ str(from_lo, as_Address_lo(to_addr));
+        if (patch != NULL) {
+          patching_epilog(patch, lir_patch_low, base_reg, info);
+          patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+          patch_code = lir_patch_high;
+        }
+        __ str(from_hi, as_Address_hi(to_addr));
+      }
+      break;
+    }
+
+#ifndef __SOFTFP__
+    case T_FLOAT:
+      if (to_addr->index()->is_register()) {
+        assert(to_addr->scale() == LIR_Address::times_1,"Unexpected scaled register");
+        __ add(Rtemp, base_reg, to_addr->index()->as_register());  // fold base + index into Rtemp, then store at Rtemp + disp
+        if ((to_addr->disp() <= -4096) || (to_addr->disp() >= 4096)) { BAILOUT("offset not in range"); }
+        __ fsts(src->as_float_reg(), Address(Rtemp, to_addr->disp()));
+      } else {
+        __ fsts(src->as_float_reg(), as_Address(to_addr));
+      }
+      break;
+
+    case T_DOUBLE:
+      if (to_addr->index()->is_register()) {
+        assert(to_addr->scale() == LIR_Address::times_1,"Unexpected scaled register");
+        __ add(Rtemp, base_reg, to_addr->index()->as_register());  // fold base + index into Rtemp, then store at Rtemp + disp
+        if ((to_addr->disp() <= -4096) || (to_addr->disp() >= 4096)) { BAILOUT("offset not in range"); }
+        __ fstd(src->as_double_reg(), Address(Rtemp, to_addr->disp()));
+      } else {
+        __ fstd(src->as_double_reg(), as_Address(to_addr));
+      }
+      break;
+#endif // __SOFTFP__
+
+#endif // AARCH64
+
+    default:
+      ShouldNotReachHere();
+  }
+
+  if (info != NULL) {
+    add_debug_info_for_null_check(null_check_offset, info);
+  }
+
+  if (patch != NULL) {
+    // The offset embedded into the LDR/STR instruction may not be large enough
+    // to address a field. So, provide space for one more instruction
+    // that will deal with larger offsets.
+    __ nop();
+    patching_epilog(patch, patch_code, base_reg, info);
+  }
+}
+
+
+void LIR_Assembler::stack2reg(LIR_Opr src, LIR_Opr dest, BasicType type) {  // reloads stack slot src into register dest
+  assert(src->is_stack(), "should not call otherwise");
+  assert(dest->is_register(), "should not call otherwise");
+
+  Address addr = src->is_single_word() ?
+    frame_map()->address_for_slot(src->single_stack_ix()) :
+    frame_map()->address_for_slot(src->double_stack_ix());
+
+#ifndef AARCH64
+  assert(lo_word_offset_in_bytes == 0 && hi_word_offset_in_bytes == 4, "little ending");
+  if (dest->is_single_fpu() || dest->is_double_fpu()) {
+    if (addr.disp() >= 1024) { BAILOUT("Too exotic case to handle here"); }  // FP loads at a displacement of 1024 or more are not handled
+  }
+#endif // !AARCH64
+
+  if (dest->is_single_cpu()) {
+    switch (type) {
+      case T_OBJECT:
+      case T_ARRAY:
+      case T_ADDRESS:
+      case T_METADATA: __ ldr(dest->as_register(), addr); break;
+      case T_FLOAT:    // used in floatToRawIntBits intrinsic implementation
+      case T_INT:      __ ldr_u32(dest->as_register(), addr); break;
+      default:
+        ShouldNotReachHere();
+    }
+    if ((type == T_OBJECT) || (type == T_ARRAY)) {  // oops are verified after the load
+      __ verify_oop(dest->as_register());
+    }
+  } else if (dest->is_double_cpu()) {
+    __ ldr(dest->as_register_lo(), addr);
+#ifndef AARCH64
+    __ ldr(dest->as_register_hi(), frame_map()->address_for_slot(src->double_stack_ix(), hi_word_offset_in_bytes));  // second word of the pair
+#endif // !AARCH64
+  } else if (dest->is_single_fpu()) {
+    __ ldr_float(dest->as_float_reg(), addr);
+  } else if (dest->is_double_fpu()) {
+    __ ldr_double(dest->as_double_reg(), addr);
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+
+void LIR_Assembler::stack2stack(LIR_Opr src, LIR_Opr dest, BasicType type) {  // copies one stack slot to another through Rtemp
+  if (src->is_single_stack()) {
+    switch (src->type()) {  // note: dispatches on src->type(), not on the type parameter
+      case T_OBJECT:
+      case T_ARRAY:
+      case T_ADDRESS:
+      case T_METADATA:
+        __ ldr(Rtemp, frame_map()->address_for_slot(src->single_stack_ix()));
+        __ str(Rtemp, frame_map()->address_for_slot(dest->single_stack_ix()));
+        break;
+
+      case T_INT:
+      case T_FLOAT:
+        __ ldr_u32(Rtemp, frame_map()->address_for_slot(src->single_stack_ix()));
+        __ str_32(Rtemp, frame_map()->address_for_slot(dest->single_stack_ix()));
+        break;
+
+      default:
+        ShouldNotReachHere();
+    }
+  } else {
+    assert(src->is_double_stack(), "must be");
+    __ ldr(Rtemp, frame_map()->address_for_slot(src->double_stack_ix(), lo_word_offset_in_bytes));
+    __ str(Rtemp, frame_map()->address_for_slot(dest->double_stack_ix(), lo_word_offset_in_bytes));
+#ifdef AARCH64
+    assert(lo_word_offset_in_bytes == 0, "adjust this code");  // AArch64: the single ldr/str pair above is the whole copy
+#else
+    __ ldr(Rtemp, frame_map()->address_for_slot(src->double_stack_ix(), hi_word_offset_in_bytes));  // 32-bit ARM: copy the second word too
+    __ str(Rtemp, frame_map()->address_for_slot(dest->double_stack_ix(), hi_word_offset_in_bytes));
+#endif // AARCH64
+  }
+}
+
+// Loads a value of the given type from the address operand src into the register(s) named by dest.
+void LIR_Assembler::mem2reg(LIR_Opr src, LIR_Opr dest, BasicType type,
+                            LIR_PatchCode patch_code, CodeEmitInfo* info,
+                            bool wide, bool unaligned) {
+  assert(src->is_address(), "should not call otherwise");
+  assert(dest->is_register(), "should not call otherwise");
+  LIR_Address* addr = src->as_address_ptr();
+
+  Register base_reg = addr->base()->as_pointer_register();
+
+  PatchingStub* patch = NULL;
+  if (patch_code != lir_patch_none) { // a patching stub is created only when the caller requests patching
+    patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+#ifdef AARCH64
+    // Extra nop for MT safe patching
+    __ nop();
+#endif // AARCH64
+  }
+  if (info != NULL) {
+    add_debug_info_for_null_check_here(info);
+  }
+
+  switch (type) {
+    case T_OBJECT:  // fall through
+    case T_ARRAY:
+      if (UseCompressedOops && !wide) {
+        __ ldr_u32(dest->as_register(), as_Address(addr)); // 32-bit load; on AArch64 it is decoded after the switch
+      } else {
+        __ ldr(dest->as_register(), as_Address(addr));
+      }
+      break;
+
+    case T_ADDRESS:
+      if (UseCompressedClassPointers && addr->disp() == oopDesc::klass_offset_in_bytes()) {
+        __ ldr_u32(dest->as_pointer_register(), as_Address(addr)); // 32-bit load; on AArch64 it is decoded after the switch
+      } else {
+        __ ldr(dest->as_pointer_register(), as_Address(addr));
+      }
+      break;
+
+#ifdef AARCH64
+    case T_LONG:
+#else
+    case T_INT:
+#ifdef __SOFTFP__
+    case T_FLOAT:
+#endif // __SOFTFP__
+#endif // AARCH64
+      __ ldr(dest->as_pointer_register(), as_Address(addr)); // types loaded with one plain ldr on this configuration
+      break;
+
+    case T_BOOLEAN:
+      __ ldrb(dest->as_register(), as_Address(addr));
+      break;
+
+    case T_BYTE:
+      __ ldrsb(dest->as_register(), as_Address(addr));
+      break;
+
+    case T_CHAR:
+      __ ldrh(dest->as_register(), as_Address(addr));
+      break;
+
+    case T_SHORT:
+      __ ldrsh(dest->as_register(), as_Address(addr));
+      break;
+
+#ifdef AARCH64
+
+    case T_INT:
+      __ ldr_w(dest->as_register(), as_Address(addr));
+      break;
+
+    case T_FLOAT:
+      __ ldr_s(dest->as_float_reg(), as_Address(addr));
+      break;
+
+    case T_DOUBLE:
+      __ ldr_d(dest->as_double_reg(), as_Address(addr));
+      break;
+
+#else // AARCH64
+
+#ifdef __SOFTFP__
+    case T_DOUBLE:
+#endif // __SOFTFP__
+    case T_LONG: { // 32-bit ARM: two word loads, each with its own patching stub when patching is requested
+      Register to_lo = dest->as_register_lo();
+      Register to_hi = dest->as_register_hi();
+      if (addr->index()->is_register()) {
+        assert(addr->scale() == LIR_Address::times_1,"Unexpected scaled register");
+        assert(addr->disp() == 0, "Not yet supporting both");
+        __ add(Rtemp, base_reg, addr->index()->as_register());
+        base_reg = Rtemp; // from here on the effective base is base + index, held in Rtemp
+        __ ldr(to_lo, Address(Rtemp));
+        if (patch != NULL) {
+          patching_epilog(patch, lir_patch_low, base_reg, info);
+          patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+          patch_code = lir_patch_high;
+        }
+        __ ldr(to_hi, Address(Rtemp, BytesPerWord));
+      } else if (base_reg == to_lo) { // loading to_lo first would overwrite the base, so the high word goes first
+        __ ldr(to_hi, as_Address_hi(addr));
+        if (patch != NULL) {
+          patching_epilog(patch, lir_patch_high, base_reg, info);
+          patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+          patch_code = lir_patch_low;
+        }
+        __ ldr(to_lo, as_Address_lo(addr));
+      } else {
+        __ ldr(to_lo, as_Address_lo(addr));
+        if (patch != NULL) {
+          patching_epilog(patch, lir_patch_low, base_reg, info);
+          patch = new PatchingStub(_masm, PatchingStub::access_field_id);
+          patch_code = lir_patch_high;
+        }
+        __ ldr(to_hi, as_Address_hi(addr));
+      }
+      break;
+    }
+
+#ifndef __SOFTFP__
+    case T_FLOAT:
+      if (addr->index()->is_register()) {
+        assert(addr->scale() == LIR_Address::times_1,"Unexpected scaled register");
+        __ add(Rtemp, base_reg, addr->index()->as_register());
+        if ((addr->disp() <= -4096) || (addr->disp() >= 4096)) { BAILOUT("offset not in range"); }
+        __ flds(dest->as_float_reg(), Address(Rtemp, addr->disp()));
+      } else {
+        __ flds(dest->as_float_reg(), as_Address(addr));
+      }
+      break;
+
+    case T_DOUBLE:
+      if (addr->index()->is_register()) {
+        assert(addr->scale() == LIR_Address::times_1,"Unexpected scaled register");
+        __ add(Rtemp, base_reg, addr->index()->as_register());
+        if ((addr->disp() <= -4096) || (addr->disp() >= 4096)) { BAILOUT("offset not in range"); }
+        __ fldd(dest->as_double_reg(), Address(Rtemp, addr->disp()));
+      } else {
+        __ fldd(dest->as_double_reg(), as_Address(addr));
+      }
+      break;
+#endif // __SOFTFP__
+
+#endif // AARCH64
+
+    default:
+      ShouldNotReachHere();
+  }
+
+  if (patch != NULL) {
+    // The offset embedded into the LDR/STR instruction may not be large enough
+    // to address a field. So, provide a space for one more instruction
+    // that will deal with larger offsets.
+    __ nop();
+    patching_epilog(patch, patch_code, base_reg, info);
+  }
+
+#ifdef AARCH64
+  switch (type) { // AArch64 only: post-process values that were loaded in compressed form above
+    case T_ARRAY:
+    case T_OBJECT:
+      if (UseCompressedOops && !wide) {
+        __ decode_heap_oop(dest->as_register());
+      }
+      __ verify_oop(dest->as_register());
+      break;
+
+    case T_ADDRESS:
+      if (UseCompressedClassPointers && addr->disp() == oopDesc::klass_offset_in_bytes()) {
+        __ decode_klass_not_null(dest->as_register());
+      }
+      break;
+  }
+#endif // AARCH64
+}
+
+// Emits code for the three-operand integer ops lir_idiv and lir_irem.
+void LIR_Assembler::emit_op3(LIR_Op3* op) {
+  bool is_32 = op->result_opr()->is_single_cpu();
+
+  if (op->code() == lir_idiv && op->in_opr2()->is_constant() && is_32) { // 32-bit division by a constant divisor
+    int c = op->in_opr2()->as_constant_ptr()->as_jint();
+    assert(is_power_of_2(c), "non power-of-2 constant should be put in a register");
+
+    Register left = op->in_opr1()->as_register();
+    Register dest = op->result_opr()->as_register();
+    if (c == 1) {
+      __ mov(dest, left);
+    } else if (c == 2) {
+      __ add_32(dest, left, AsmOperand(left, lsr, 31)); // dest = left + (left lsr 31)
+      __ asr_32(dest, dest, 1);
+    } else if (c != (int) 0x80000000) {
+      int power = log2_intptr(c);
+      __ asr_32(Rtemp, left, 31);
+      __ add_32(dest, left, AsmOperand(Rtemp, lsr, 32-power)); // dest = left + (left < 0 ? 2^power - 1 : 0);
+      __ asr_32(dest, dest, power);                            // dest = dest >> power (arithmetic shift);
+    } else {
+      // x/0x80000000 is a special case, since the divisor's magnitude is a power of two, but the divisor is negative.
+      // The only possible result values are 0 and 1, with 1 only for dividend == divisor == 0x80000000.
+      __ cmp_32(left, c);
+#ifdef AARCH64
+      __ cset(dest, eq);
+#else
+      __ mov(dest, 0, ne);
+      __ mov(dest, 1, eq);
+#endif // AARCH64
+    }
+  } else {
+#ifdef AARCH64
+    Register left  = op->in_opr1()->as_pointer_register();
+    Register right = op->in_opr2()->as_pointer_register();
+    Register dest  = op->result_opr()->as_pointer_register();
+
+    switch (op->code()) {
+      case lir_idiv:
+        if (is_32) {
+          __ sdiv_w(dest, left, right);
+        } else {
+          __ sdiv(dest, left, right);
+        }
+        break;
+      case lir_irem: { // quotient goes to tmp, then msub combines right, tmp and left into dest
+        Register tmp = op->in_opr3()->as_pointer_register();
+        assert_different_registers(left, tmp);
+        assert_different_registers(right, tmp);
+        if (is_32) {
+          __ sdiv_w(tmp, left, right);
+          __ msub_w(dest, right, tmp, left);
+        } else {
+          __ sdiv(tmp, left, right);
+          __ msub(dest, right, tmp, left);
+        }
+        break;
+      }
+      default:
+        ShouldNotReachHere();
+    }
+#else
+    assert(op->code() == lir_idiv || op->code() == lir_irem, "unexpected op3");
+    __ call(StubRoutines::Arm::idiv_irem_entry(), relocInfo::runtime_call_type); // 32-bit ARM: both ops go through the idiv_irem stub
+    add_debug_info_for_div0_here(op->info());
+#endif // AARCH64
+  }
+}
+
+// Emits a branch to op->label() under the ARM condition that corresponds to op->cond().
+void LIR_Assembler::emit_opBranch(LIR_OpBranch* op) {
+#ifdef ASSERT
+  assert(op->block() == NULL || op->block()->label() == op->label(), "wrong label");
+  if (op->block() != NULL)  _branch_target_blocks.append(op->block());
+  if (op->ublock() != NULL) _branch_target_blocks.append(op->ublock());
+  assert(op->info() == NULL, "CodeEmitInfo?");
+#endif // ASSERT
+
+#ifdef __SOFTFP__
+  assert (op->code() != lir_cond_float_branch, "this should be impossible");
+#else
+  if (op->code() == lir_cond_float_branch) {
+#ifndef AARCH64
+    __ fmstat();
+#endif // !AARCH64
+    __ b(*(op->ublock()->label()), vs); // float branch: go to ublock on condition vs before the main branch
+  }
+#endif // __SOFTFP__
+
+  AsmCondition acond = al; // stays "always" unless one of the cases below matches
+  switch (op->cond()) {
+    case lir_cond_equal:        acond = eq; break;
+    case lir_cond_notEqual:     acond = ne; break;
+    case lir_cond_less:         acond = lt; break;
+    case lir_cond_lessEqual:    acond = le; break;
+    case lir_cond_greaterEqual: acond = ge; break;
+    case lir_cond_greater:      acond = gt; break;
+    case lir_cond_aboveEqual:   acond = hs; break;
+    case lir_cond_belowEqual:   acond = ls; break;
+    default: assert(op->cond() == lir_cond_always, "must be");
+  }
+  __ b(*(op->label()), acond);
+}
+
+// Emits the primitive conversion selected by op->bytecode(), from op->in_opr() to op->result_opr().
+void LIR_Assembler::emit_opConvert(LIR_OpConvert* op) {
+  LIR_Opr src  = op->in_opr();
+  LIR_Opr dest = op->result_opr();
+
+  switch (op->bytecode()) {
+    case Bytecodes::_i2l:
+#ifdef AARCH64
+      __ sign_extend(dest->as_register_lo(), src->as_register(), 32);
+#else
+      move_regs(src->as_register(), dest->as_register_lo());
+      __ mov(dest->as_register_hi(), AsmOperand(src->as_register(), asr, 31)); // high word = src asr 31
+#endif // AARCH64
+      break;
+    case Bytecodes::_l2i:
+      move_regs(src->as_register_lo(), dest->as_register()); // only the low register of the source is used
+      break;
+    case Bytecodes::_i2b:
+      __ sign_extend(dest->as_register(), src->as_register(), 8);
+      break;
+    case Bytecodes::_i2s:
+      __ sign_extend(dest->as_register(), src->as_register(), 16);
+      break;
+    case Bytecodes::_i2c:
+      __ zero_extend(dest->as_register(), src->as_register(), 16);
+      break;
+    case Bytecodes::_f2d:
+      __ convert_f2d(dest->as_double_reg(), src->as_float_reg());
+      break;
+    case Bytecodes::_d2f:
+      __ convert_d2f(dest->as_float_reg(), src->as_double_reg());
+      break;
+    case Bytecodes::_i2f:
+#ifdef AARCH64
+      __ scvtf_sw(dest->as_float_reg(), src->as_register());
+#else
+      __ fmsr(Stemp, src->as_register()); // 32-bit ARM: the integer passes through Stemp before converting
+      __ fsitos(dest->as_float_reg(), Stemp);
+#endif // AARCH64
+      break;
+    case Bytecodes::_i2d:
+#ifdef AARCH64
+      __ scvtf_dw(dest->as_double_reg(), src->as_register());
+#else
+      __ fmsr(Stemp, src->as_register());
+      __ fsitod(dest->as_double_reg(), Stemp);
+#endif // AARCH64
+      break;
+    case Bytecodes::_f2i:
+#ifdef AARCH64
+      __ fcvtzs_ws(dest->as_register(), src->as_float_reg());
+#else
+      __ ftosizs(Stemp, src->as_float_reg()); // 32-bit ARM: convert into Stemp, then move to the CPU register
+      __ fmrs(dest->as_register(), Stemp);
+#endif // AARCH64
+      break;
+    case Bytecodes::_d2i:
+#ifdef AARCH64
+      __ fcvtzs_wd(dest->as_register(), src->as_double_reg());
+#else
+      __ ftosizd(Stemp, src->as_double_reg());
+      __ fmrs(dest->as_register(), Stemp);
+#endif // AARCH64
+      break;
+#ifdef AARCH64
+    case Bytecodes::_l2f:
+      __ scvtf_sx(dest->as_float_reg(), src->as_register_lo());
+      break;
+    case Bytecodes::_l2d:
+      __ scvtf_dx(dest->as_double_reg(), src->as_register_lo());
+      break;
+    case Bytecodes::_f2l:
+      __ fcvtzs_xs(dest->as_register_lo(), src->as_float_reg());
+      break;
+    case Bytecodes::_d2l:
+      __ fcvtzs_xd(dest->as_register_lo(), src->as_double_reg());
+      break;
+#endif // AARCH64
+    default: // also reached for l2f/l2d/f2l/d2l on 32-bit ARM, whose cases are compiled only for AARCH64
+      ShouldNotReachHere();
+  }
+}
+
+// Emits object allocation, preceded by a class-initialization check when op->init_check() is set.
+void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) {
+  if (op->init_check()) {
+    Register tmp = op->tmp1()->as_register();
+    __ ldrb(tmp, Address(op->klass()->as_register(), InstanceKlass::init_state_offset()));
+    add_debug_info_for_null_check_here(op->stub()->info());
+    __ cmp(tmp, InstanceKlass::fully_initialized);
+    __ b(*op->stub()->entry(), ne); // any state other than fully_initialized goes to the stub
+  }
+  __ allocate_object(op->obj()->as_register(),
+                     op->tmp1()->as_register(),
+                     op->tmp2()->as_register(),
+                     op->tmp3()->as_register(),
+                     op->header_size(),
+                     op->object_size(),
+                     op->klass()->as_register(),
+                     *op->stub()->entry());
+  __ bind(*op->stub()->continuation()); // the stub's continuation label is bound right after the allocation code
+}
+
+void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) { // emits array allocation, or an unconditional jump to the stub
+  if (UseSlowPath ||
+      (!UseFastNewObjectArray && (op->type() == T_OBJECT || op->type() == T_ARRAY)) ||
+      (!UseFastNewTypeArray   && (op->type() != T_OBJECT && op->type() != T_ARRAY))) {
+    __ b(*op->stub()->entry()); // fast path disabled by flags for this element type: always take the stub
+  } else {
+    __ allocate_array(op->obj()->as_register(),
+                      op->len()->as_register(),
+                      op->tmp1()->as_register(),
+                      op->tmp2()->as_register(),
+                      op->tmp3()->as_register(),
+                      arrayOopDesc::header_size(op->type()),
+                      type2aelembytes(op->type()),
+                      op->klass()->as_register(),
+                      *op->stub()->entry());
+  }
+  __ bind(*op->stub()->continuation()); // bound on both paths
+}
+
+void LIR_Assembler::type_profile_helper(Register mdo, int mdo_offset_bias,
+                                        ciMethodData *md, ciProfileData *data,
+                                        Register recv, Register tmp1, Label* update_done) { // updates the receiver-type rows of data for recv
+  assert_different_registers(mdo, recv, tmp1);
+  uint i;
+  for (i = 0; i < VirtualCallData::row_limit(); i++) { // pass 1: bump the counter of a row that already holds recv
+    Label next_test;
+    // See if the receiver is receiver[n].
+    Address receiver_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) -
+                          mdo_offset_bias);
+    __ ldr(tmp1, receiver_addr);
+    __ verify_klass_ptr(tmp1);
+    __ cmp(recv, tmp1);
+    __ b(next_test, ne);
+    Address data_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) -
+                      mdo_offset_bias);
+    __ ldr(tmp1, data_addr);
+    __ add(tmp1, tmp1, DataLayout::counter_increment);
+    __ str(tmp1, data_addr);
+    __ b(*update_done);
+    __ bind(next_test);
+  }
+
+  // Didn't find receiver; find next empty slot and fill it in
+  for (i = 0; i < VirtualCallData::row_limit(); i++) {
+    Label next_test;
+    Address recv_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)) -
+                      mdo_offset_bias);
+    __ ldr(tmp1, recv_addr);
+    __ cbnz(tmp1, next_test); // a non-zero receiver slot is occupied: try the next row
+    __ str(recv, recv_addr);
+    __ mov(tmp1, DataLayout::counter_increment); // a freshly claimed row starts at one increment
+    __ str(tmp1, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)) -
+                         mdo_offset_bias));
+    __ b(*update_done);
+    __ bind(next_test);
+  } // falls through without branching to update_done when every row holds a different receiver
+}
+
+void LIR_Assembler::setup_md_access(ciMethod* method, int bci,
+                                    ciMethodData*& md, ciProfileData*& data, int& mdo_offset_bias) { // outputs: md, data, and possibly mdo_offset_bias
+  md = method->method_data_or_null();
+  assert(md != NULL, "Sanity");
+  data = md->bci_to_data(bci);
+  assert(data != NULL,       "need data for checkcast");
+  assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check");
+  if (md->byte_offset_of_slot(data, DataLayout::header_offset()) + data->size_in_bytes() >= 4096) {
+    // The offset is large so bias the mdo by the base of the slot so
+    // that the ldr can use an immediate offset to reference the slots of the data
+    mdo_offset_bias = md->byte_offset_of_slot(data, DataLayout::header_offset());
+  } // otherwise mdo_offset_bias is left exactly as the caller passed it in
+}
+
+// On 32-bit ARM, code before this helper should test obj for null (ZF should be set if obj is null).
+void LIR_Assembler::typecheck_profile_helper1(ciMethod* method, int bci,
+                                              ciMethodData*& md, ciProfileData*& data, int& mdo_offset_bias,
+                                              Register obj, Register mdo, Register data_val, Label* obj_is_null) {
+  assert(method != NULL, "Should have method");
+  assert_different_registers(obj, mdo, data_val);
+  setup_md_access(method, bci, md, data, mdo_offset_bias);
+  Label not_null;
+#ifdef AARCH64
+  __ cbnz(obj, not_null);
+#else
+  __ b(not_null, ne); // uses the flags the caller set up; obj itself is not re-tested here
+#endif // AARCH64
+  __ mov_metadata(mdo, md->constant_encoding()); // null path from here: set the null_seen flag in the profile
+  if (mdo_offset_bias > 0) {
+    __ mov_slow(data_val, mdo_offset_bias);
+    __ add(mdo, mdo, data_val);
+  }
+  Address flags_addr(mdo, md->byte_offset_of_slot(data, DataLayout::flags_offset()) - mdo_offset_bias);
+  __ ldrb(data_val, flags_addr);
+  __ orr(data_val, data_val, (uint)BitData::null_seen_byte_constant());
+  __ strb(data_val, flags_addr);
+  __ b(*obj_is_null);
+  __ bind(not_null); // non-null objects continue in the code emitted after this helper
+}
+
+void LIR_Assembler::typecheck_profile_helper2(ciMethodData* md, ciProfileData* data, int mdo_offset_bias,
+                                              Register mdo, Register recv, Register value, Register tmp1,
+                                              Label* profile_cast_success, Label* profile_cast_failure,
+                                              Label* success, Label* failure) { // binds both profile_cast_* labels and emits their profile updates
+  assert_different_registers(mdo, value, tmp1);
+  __ bind(*profile_cast_success);
+  __ mov_metadata(mdo, md->constant_encoding());
+  if (mdo_offset_bias > 0) {
+    __ mov_slow(tmp1, mdo_offset_bias);
+    __ add(mdo, mdo, tmp1);
+  }
+  __ load_klass(recv, value);
+  type_profile_helper(mdo, mdo_offset_bias, md, data, recv, tmp1, success);
+  __ b(*success); // reached when type_profile_helper falls through without taking its update_done branch
+  // Cast failure case
+  __ bind(*profile_cast_failure);
+  __ mov_metadata(mdo, md->constant_encoding());
+  if (mdo_offset_bias > 0) {
+    __ mov_slow(tmp1, mdo_offset_bias);
+    __ add(mdo, mdo, tmp1);
+  }
+  Address data_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias);
+  __ ldr(tmp1, data_addr);
+  __ sub(tmp1, tmp1, DataLayout::counter_increment); // on failure the counter is decremented, not incremented
+  __ str(tmp1, data_addr);
+  __ b(*failure);
+}
+
+// Sets `res` to true, if `cond` holds. On AArch64 also sets `res` to false if `cond` does not hold.
+static void set_instanceof_result(MacroAssembler* _masm, Register res, AsmCondition cond) {
+#ifdef AARCH64
+  __ cset(res, cond);
+#else
+  __ mov(res, 1, cond); // conditional move: 32-bit callers must have set res beforehand for the failing case
+#endif // AARCH64
+}
+
+// Emits the type-check code for lir_store_check, lir_checkcast and lir_instanceof, with optional profiling.
+void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
+  // TODO: ARM - can be more effective with one more register
+  switch (op->code()) {
+    case lir_store_check: {
+      CodeStub* stub = op->stub();
+      Register value = op->object()->as_register();
+      Register array = op->array()->as_register();
+      Register klass_RInfo = op->tmp1()->as_register();
+      Register k_RInfo = op->tmp2()->as_register();
+      assert_different_registers(klass_RInfo, k_RInfo, Rtemp);
+      if (op->should_profile()) {
+        assert_different_registers(value, klass_RInfo, k_RInfo, Rtemp);
+      }
+
+      // check if it needs to be profiled
+      ciMethodData* md;
+      ciProfileData* data;
+      int mdo_offset_bias = 0;
+      Label profile_cast_success, profile_cast_failure, done;
+      Label *success_target = op->should_profile() ? &profile_cast_success : &done;
+      Label *failure_target = op->should_profile() ? &profile_cast_failure : stub->entry();
+
+      if (op->should_profile()) {
+#ifndef AARCH64
+        __ cmp(value, 0); // sets the flags that typecheck_profile_helper1 branches on for 32-bit ARM
+#endif // !AARCH64
+        typecheck_profile_helper1(op->profiled_method(), op->profiled_bci(), md, data, mdo_offset_bias, value, k_RInfo, Rtemp, &done);
+      } else {
+        __ cbz(value, done); // a null value skips the check entirely
+      }
+      assert_different_registers(k_RInfo, value);
+      add_debug_info_for_null_check_here(op->info_for_exception());
+      __ load_klass(k_RInfo, array);
+      __ load_klass(klass_RInfo, value);
+      __ ldr(k_RInfo, Address(k_RInfo, ObjArrayKlass::element_klass_offset()));
+      __ ldr_u32(Rtemp, Address(k_RInfo, Klass::super_check_offset_offset()));
+      // check for immediate positive hit
+      __ ldr(Rtemp, Address(klass_RInfo, Rtemp));
+      __ cmp(klass_RInfo, k_RInfo);
+      __ cond_cmp(Rtemp, k_RInfo, ne);
+      __ b(*success_target, eq);
+      // check for immediate negative hit
+      __ ldr_u32(Rtemp, Address(k_RInfo, Klass::super_check_offset_offset()));
+      __ cmp(Rtemp, in_bytes(Klass::secondary_super_cache_offset()));
+      __ b(*failure_target, ne);
+      // slow case
+      assert(klass_RInfo == R0 && k_RInfo == R1, "runtime call setup");
+      __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
+      __ cbz(R0, *failure_target); // a zero in R0 after the call is treated as failure
+      if (op->should_profile()) {
+        Register mdo  = klass_RInfo, recv = k_RInfo, tmp1 = Rtemp;
+        if (mdo == value) { // swap roles so that mdo differs from value, as typecheck_profile_helper2 asserts
+          mdo = k_RInfo;
+          recv = klass_RInfo;
+        }
+        typecheck_profile_helper2(md, data, mdo_offset_bias, mdo, recv, value, tmp1,
+                                  &profile_cast_success, &profile_cast_failure,
+                                  &done, stub->entry());
+      }
+      __ bind(done);
+      break;
+    }
+
+    case lir_checkcast: {
+      CodeStub* stub = op->stub();
+      Register obj = op->object()->as_register();
+      Register res = op->result_opr()->as_register();
+      Register klass_RInfo = op->tmp1()->as_register();
+      Register k_RInfo = op->tmp2()->as_register();
+      ciKlass* k = op->klass();
+      assert_different_registers(res, k_RInfo, klass_RInfo, Rtemp);
+
+      // TODO: ARM - Late binding is used to prevent confusion of register allocator
+      assert(stub->is_exception_throw_stub(), "must be");
+      ((SimpleExceptionStub*)stub)->set_obj(op->result_opr());
+
+      ciMethodData* md;
+      ciProfileData* data;
+      int mdo_offset_bias = 0;
+
+      Label done;
+
+      Label profile_cast_failure, profile_cast_success;
+      Label *failure_target = op->should_profile() ? &profile_cast_failure : op->stub()->entry();
+      Label *success_target = op->should_profile() ? &profile_cast_success : &done;
+
+#ifdef AARCH64
+      move_regs(obj, res);
+      if (op->should_profile()) {
+        typecheck_profile_helper1(op->profiled_method(), op->profiled_bci(), md, data, mdo_offset_bias, res, klass_RInfo, Rtemp, &done);
+      } else {
+        __ cbz(obj, done);
+      }
+      if (k->is_loaded()) {
+        __ mov_metadata(k_RInfo, k->constant_encoding());
+      } else {
+        if (res != obj) {
+          op->info_for_patch()->add_register_oop(FrameMap::as_oop_opr(res));
+        }
+        klass2reg_with_patching(k_RInfo, op->info_for_patch());
+      }
+      __ load_klass(klass_RInfo, res);
+
+      if (op->fast_check()) {
+        __ cmp(klass_RInfo, k_RInfo);
+        __ b(*failure_target, ne);
+      } else if (k->is_loaded()) {
+        __ ldr(Rtemp, Address(klass_RInfo, k->super_check_offset()));
+        if (in_bytes(Klass::secondary_super_cache_offset()) != (int) k->super_check_offset()) {
+          __ cmp(Rtemp, k_RInfo);
+          __ b(*failure_target, ne);
+        } else {
+          __ cmp(klass_RInfo, k_RInfo);
+          __ cond_cmp(Rtemp, k_RInfo, ne);
+          __ b(*success_target, eq);
+          assert(klass_RInfo == R0 && k_RInfo == R1, "runtime call setup");
+          __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
+          __ cbz(R0, *failure_target);
+        }
+      } else {
+        __ ldr_u32(Rtemp, Address(k_RInfo, Klass::super_check_offset_offset()));
+        // check for immediate positive hit
+        __ ldr(Rtemp, Address(klass_RInfo, Rtemp));
+        __ cmp(klass_RInfo, k_RInfo);
+        __ cond_cmp(Rtemp, k_RInfo, ne);
+        __ b(*success_target, eq);
+        // check for immediate negative hit
+        __ ldr_u32(Rtemp, Address(k_RInfo, Klass::super_check_offset_offset()));
+        __ cmp(Rtemp, in_bytes(Klass::secondary_super_cache_offset()));
+        __ b(*failure_target, ne);
+        // slow case
+        assert(klass_RInfo == R0 && k_RInfo == R1, "runtime call setup");
+        __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
+        __ cbz(R0, *failure_target);
+      }
+
+#else // AARCH64
+
+      __ movs(res, obj); // copies obj to res; the eq/ne tests below rely on the flags it leaves
+      if (op->should_profile()) {
+        typecheck_profile_helper1(op->profiled_method(), op->profiled_bci(), md, data, mdo_offset_bias, res, klass_RInfo, Rtemp, &done);
+      } else {
+        __ b(done, eq);
+      }
+      if (k->is_loaded()) {
+        __ mov_metadata(k_RInfo, k->constant_encoding());
+      } else if (k_RInfo != obj) {
+        klass2reg_with_patching(k_RInfo, op->info_for_patch());
+        __ movs(res, obj); // res is copied from obj again after the patching site
+      } else {
+        // Patching doesn't update "res" register after GC, so do patching first
+        klass2reg_with_patching(Rtemp, op->info_for_patch());
+        __ movs(res, obj);
+        __ mov(k_RInfo, Rtemp);
+      }
+      __ load_klass(klass_RInfo, res, ne);
+
+      if (op->fast_check()) {
+        __ cmp(klass_RInfo, k_RInfo, ne);
+        __ b(*failure_target, ne);
+      } else if (k->is_loaded()) {
+        __ b(*success_target, eq);
+        __ ldr(Rtemp, Address(klass_RInfo, k->super_check_offset()));
+        if (in_bytes(Klass::secondary_super_cache_offset()) != (int) k->super_check_offset()) {
+          __ cmp(Rtemp, k_RInfo);
+          __ b(*failure_target, ne);
+        } else {
+          __ cmp(klass_RInfo, k_RInfo);
+          __ cmp(Rtemp, k_RInfo, ne);
+          __ b(*success_target, eq);
+          assert(klass_RInfo == R0 && k_RInfo == R1, "runtime call setup");
+          __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
+          __ cbz(R0, *failure_target);
+        }
+      } else {
+        __ ldr_u32(Rtemp, Address(k_RInfo, Klass::super_check_offset_offset()));
+        __ b(*success_target, eq);
+        // check for immediate positive hit
+        __ ldr(Rtemp, Address(klass_RInfo, Rtemp));
+        __ cmp(klass_RInfo, k_RInfo);
+        __ cmp(Rtemp, k_RInfo, ne);
+        __ b(*success_target, eq);
+        // check for immediate negative hit
+        __ ldr_u32(Rtemp, Address(k_RInfo, Klass::super_check_offset_offset()));
+        __ cmp(Rtemp, in_bytes(Klass::secondary_super_cache_offset()));
+        __ b(*failure_target, ne);
+        // slow case
+        assert(klass_RInfo == R0 && k_RInfo == R1, "runtime call setup");
+        __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
+        __ cbz(R0, *failure_target);
+      }
+#endif // AARCH64
+
+      if (op->should_profile()) {
+        Register mdo  = klass_RInfo, recv = k_RInfo, tmp1 = Rtemp;
+        typecheck_profile_helper2(md, data, mdo_offset_bias, mdo, recv, res, tmp1,
+                                  &profile_cast_success, &profile_cast_failure,
+                                  &done, stub->entry());
+      }
+      __ bind(done);
+      break;
+    }
+
+    case lir_instanceof: {
+      Register obj = op->object()->as_register();
+      Register res = op->result_opr()->as_register();
+      Register klass_RInfo = op->tmp1()->as_register();
+      Register k_RInfo = op->tmp2()->as_register();
+      ciKlass* k = op->klass();
+      assert_different_registers(res, klass_RInfo, k_RInfo, Rtemp);
+
+      ciMethodData* md;
+      ciProfileData* data;
+      int mdo_offset_bias = 0;
+
+      Label done;
+
+      Label profile_cast_failure, profile_cast_success;
+      Label *failure_target = op->should_profile() ? &profile_cast_failure : &done;
+      Label *success_target = op->should_profile() ? &profile_cast_success : &done;
+
+#ifdef AARCH64
+      move_regs(obj, res);
+#else
+      __ movs(res, obj);
+#endif // AARCH64
+
+      if (op->should_profile()) {
+        typecheck_profile_helper1(op->profiled_method(), op->profiled_bci(), md, data, mdo_offset_bias, res, klass_RInfo, Rtemp, &done);
+      } else {
+#ifdef AARCH64
+        __ cbz(obj, done); // If obj == NULL, res is false
+#else
+        __ b(done, eq);
+#endif // AARCH64
+      }
+
+      if (k->is_loaded()) {
+        __ mov_metadata(k_RInfo, k->constant_encoding());
+      } else {
+        op->info_for_patch()->add_register_oop(FrameMap::as_oop_opr(res));
+        klass2reg_with_patching(k_RInfo, op->info_for_patch());
+      }
+      __ load_klass(klass_RInfo, res);
+
+#ifndef AARCH64
+      if (!op->should_profile()) {
+        __ mov(res, 0); // 32-bit ARM: preset the false result; set_instanceof_result only writes the true case
+      }
+#endif // !AARCH64
+
+      if (op->fast_check()) {
+        __ cmp(klass_RInfo, k_RInfo);
+        if (!op->should_profile()) {
+          set_instanceof_result(_masm, res, eq);
+        } else {
+          __ b(profile_cast_failure, ne);
+        }
+      } else if (k->is_loaded()) {
+        __ ldr(Rtemp, Address(klass_RInfo, k->super_check_offset()));
+        if (in_bytes(Klass::secondary_super_cache_offset()) != (int) k->super_check_offset()) {
+          __ cmp(Rtemp, k_RInfo);
+          if (!op->should_profile()) {
+            set_instanceof_result(_masm, res, eq);
+          } else {
+            __ b(profile_cast_failure, ne);
+          }
+        } else {
+          __ cmp(klass_RInfo, k_RInfo);
+          __ cond_cmp(Rtemp, k_RInfo, ne);
+          if (!op->should_profile()) {
+            set_instanceof_result(_masm, res, eq);
+          }
+          __ b(*success_target, eq);
+          assert(klass_RInfo == R0 && k_RInfo == R1, "runtime call setup");
+          __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
+          if (!op->should_profile()) {
+            move_regs(R0, res); // without profiling, the value left in R0 by the call becomes the result
+          } else {
+            __ cbz(R0, *failure_target);
+          }
+        }
+      } else {
+        __ ldr_u32(Rtemp, Address(k_RInfo, Klass::super_check_offset_offset()));
+        // check for immediate positive hit
+        __ cmp(klass_RInfo, k_RInfo);
+        if (!op->should_profile()) {
+#ifdef AARCH64
+          // TODO-AARCH64 check if separate conditional branch is more efficient than ldr+cond_cmp
+          __ ldr(res, Address(klass_RInfo, Rtemp));
+#else
+          __ ldr(res, Address(klass_RInfo, Rtemp), ne);
+#endif // AARCH64
+          __ cond_cmp(res, k_RInfo, ne);
+          set_instanceof_result(_masm, res, eq);
+        } else {
+#ifdef AARCH64
+          // TODO-AARCH64 check if separate conditional branch is more efficient than ldr+cond_cmp
+          __ ldr(Rtemp, Address(klass_RInfo, Rtemp));
+#else
+          __ ldr(Rtemp, Address(klass_RInfo, Rtemp), ne);
+#endif // AARCH64
+          __ cond_cmp(Rtemp, k_RInfo, ne);
+        }
+        __ b(*success_target, eq);
+        // check for immediate negative hit
+        if (op->should_profile()) { // the profiling path loaded into Rtemp above, so the offset is read again
+          __ ldr_u32(Rtemp, Address(k_RInfo, Klass::super_check_offset_offset()));
+        }
+        __ cmp(Rtemp, in_bytes(Klass::secondary_super_cache_offset()));
+        if (!op->should_profile()) {
+#ifdef AARCH64
+          __ mov(res, 0);
+#else
+          __ mov(res, 0, ne);
+#endif // AARCH64
+        }
+        __ b(*failure_target, ne);
+        // slow case
+        assert(klass_RInfo == R0 && k_RInfo == R1, "runtime call setup");
+        __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
+        if (!op->should_profile()) {
+          move_regs(R0, res);
+        }
+        if (op->should_profile()) {
+          __ cbz(R0, *failure_target);
+        }
+      }
+
+      if (op->should_profile()) {
+        Label done_ok, done_failure; // local landing pads that turn the profiled outcome into 0 / 1 in res
+        Register mdo  = klass_RInfo, recv = k_RInfo, tmp1 = Rtemp;
+        typecheck_profile_helper2(md, data, mdo_offset_bias, mdo, recv, res, tmp1,
+                                  &profile_cast_success, &profile_cast_failure,
+                                  &done_ok, &done_failure);
+        __ bind(done_failure);
+        __ mov(res, 0);
+        __ b(done);
+        __ bind(done_ok);
+        __ mov(res, 1);
+      }
+      __ bind(done);
+      break;
+    }
+    default:
+      ShouldNotReachHere();
+  }
+}
+
+
+void LIR_Assembler::emit_compare_and_swap(LIR_OpCompareAndSwap* op) { // Emits a CAS for lir_cas_int / lir_cas_long / lir_cas_obj; pseudo-code of the emitted sequence:
+  //   if (*addr == cmpval) {
+  //     *addr = newval;
+  //     dest = 1;
+  //   } else {
+  //     dest = 0;
+  //   }
+#ifdef AARCH64
+  Label retry, done;
+  Register addr = op->addr()->as_pointer_register();
+  Register cmpval = op->cmp_value()->as_pointer_register();
+  Register newval = op->new_value()->as_pointer_register();
+  Register dest = op->result_opr()->as_pointer_register();
+  assert_different_registers(dest, addr, cmpval, newval, Rtemp);
+
+  if (UseCompressedOops && op->code() == lir_cas_obj) { // compare and store the encoded forms of both oops, held in tmp1/tmp2
+    Register tmp1 = op->tmp1()->as_pointer_register();
+    Register tmp2 = op->tmp2()->as_pointer_register();
+    assert_different_registers(dest, addr, cmpval, newval, tmp1, tmp2, Rtemp);
+    __ encode_heap_oop(tmp1, cmpval); cmpval = tmp1;
+    __ encode_heap_oop(tmp2, newval); newval = tmp2;
+  }
+
+  __ mov(dest, ZR); // dest stays 0 unless the store below succeeds
+  __ bind(retry);
+  if (((op->code() == lir_cas_obj) && !UseCompressedOops) || op->code() == lir_cas_long) { // long or uncompressed oop: non-_w instruction forms
+    __ ldaxr(Rtemp, addr);
+    __ cmp(Rtemp, cmpval);
+    __ b(done, ne); // value mismatch: leave with dest == 0
+    __ stlxr(Rtemp, newval, addr); // Rtemp now holds the store status checked by cbnz_w below
+  } else if (((op->code() == lir_cas_obj) && UseCompressedOops) || op->code() == lir_cas_int) { // int or compressed oop: _w instruction forms
+    __ ldaxr_w(Rtemp, addr);
+    __ cmp_w(Rtemp, cmpval);
+    __ b(done, ne); // value mismatch: leave with dest == 0
+    __ stlxr_w(Rtemp, newval, addr); // Rtemp now holds the store status checked by cbnz_w below
+  } else {
+    ShouldNotReachHere();
+  }
+  __ cbnz_w(Rtemp, retry); // non-zero store status: start over from the load
+  __ mov(dest, 1); // store went through
+  __ bind(done);
+#else
+  // FIXME: membar_release
+  __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore | MacroAssembler::LoadStore), Rtemp);
+  if (op->code() == lir_cas_int || op->code() == lir_cas_obj) {
+    Register addr = op->addr()->as_register();
+    Register cmpval = op->cmp_value()->as_register();
+    Register newval = op->new_value()->as_register();
+    Register dest = op->result_opr()->as_register();
+    assert_different_registers(dest, addr, cmpval, newval, Rtemp);
+
+    __ atomic_cas_bool(cmpval, newval, addr, 0, Rtemp); // Rtemp free by default at C1 LIR layer
+    __ mov(dest, 1, eq); // the flags left by atomic_cas_bool select the result: eq -> 1
+    __ mov(dest, 0, ne); // ne -> 0
+  } else if (op->code() == lir_cas_long) {
+    assert(VM_Version::supports_cx8(), "wrong machine");
+    Register addr = op->addr()->as_pointer_register();
+    Register cmp_value_lo = op->cmp_value()->as_register_lo();
+    Register cmp_value_hi = op->cmp_value()->as_register_hi();
+    Register new_value_lo = op->new_value()->as_register_lo();
+    Register new_value_hi = op->new_value()->as_register_hi();
+    Register dest = op->result_opr()->as_register();
+    Register tmp_lo = op->tmp1()->as_register_lo();
+    Register tmp_hi = op->tmp1()->as_register_hi();
+
+    assert_different_registers(tmp_lo, tmp_hi, cmp_value_lo, cmp_value_hi, dest, new_value_lo, new_value_hi, addr);
+    assert(tmp_hi->encoding() == tmp_lo->encoding() + 1, "non aligned register pair");
+    assert(new_value_hi->encoding() == new_value_lo->encoding() + 1, "non aligned register pair");
+    assert((tmp_lo->encoding() & 0x1) == 0, "misaligned register pair");
+    assert((new_value_lo->encoding() & 0x1) == 0, "misaligned register pair");
+    __ atomic_cas64(tmp_lo, tmp_hi, dest, cmp_value_lo, cmp_value_hi,
+                    new_value_lo, new_value_hi, addr, 0); // NOTE(review): dest is not written in this branch; presumably atomic_cas64 stores the 0/1 result in it - confirm
+  } else {
+    Unimplemented();
+  }
+#endif // AARCH64
+  // FIXME: is full membar really needed instead of just membar_acquire?
+  __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad | MacroAssembler::StoreStore), Rtemp);
+}
+
+
+void LIR_Assembler::cmove(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Opr result, BasicType type) { // Emits result = condition ? opr1 : opr2; no compare is emitted here, the current flags are used as they stand
+  AsmCondition acond = al; // condition under which opr1 is moved into result
+  AsmCondition ncond = nv; // negated condition, used for opr2
+  if (opr1 != opr2) { // identical operands keep the al/nv defaults
+    switch (condition) {
+      case lir_cond_equal:        acond = eq; ncond = ne; break;
+      case lir_cond_notEqual:     acond = ne; ncond = eq; break;
+      case lir_cond_less:         acond = lt; ncond = ge; break;
+      case lir_cond_lessEqual:    acond = le; ncond = gt; break;
+      case lir_cond_greaterEqual: acond = ge; ncond = lt; break;
+      case lir_cond_greater:      acond = gt; ncond = le; break;
+      case lir_cond_aboveEqual:   acond = hs; ncond = lo; break;
+      case lir_cond_belowEqual:   acond = ls; ncond = hi; break;
+      default: ShouldNotReachHere();
+    }
+  }
+
+#ifdef AARCH64
+
+  // TODO-AARCH64 implement it more efficiently
+
+  if (opr1->is_register()) { // step 1: move opr1 into result unconditionally
+    reg2reg(opr1, result);
+  } else if (opr1->is_stack()) {
+    stack2reg(opr1, result, result->type());
+  } else if (opr1->is_constant()) {
+    const2reg(opr1, result, lir_patch_none, NULL);
+  } else {
+    ShouldNotReachHere();
+  }
+
+  Label skip;
+  __ b(skip, acond); // step 2: when acond holds, keep opr1 and jump over the opr2 move
+
+  if (opr2->is_register()) { // step 3: otherwise overwrite result with opr2
+    reg2reg(opr2, result);
+  } else if (opr2->is_stack()) {
+    stack2reg(opr2, result, result->type());
+  } else if (opr2->is_constant()) {
+    const2reg(opr2, result, lir_patch_none, NULL);
+  } else {
+    ShouldNotReachHere();
+  }
+
+  __ bind(skip);
+
+#else
+  for (;;) {                         // two iterations only
+    if (opr1 == result) {
+      // do nothing
+    } else if (opr1->is_single_cpu()) {
+      __ mov(result->as_register(), opr1->as_register(), acond);
+    } else if (opr1->is_double_cpu()) {
+      __ long_move(result->as_register_lo(), result->as_register_hi(),
+                   opr1->as_register_lo(), opr1->as_register_hi(), acond);
+    } else if (opr1->is_single_stack()) {
+      __ ldr(result->as_register(), frame_map()->address_for_slot(opr1->single_stack_ix()), acond);
+    } else if (opr1->is_double_stack()) {
+      __ ldr(result->as_register_lo(),
+             frame_map()->address_for_slot(opr1->double_stack_ix(), lo_word_offset_in_bytes), acond);
+      __ ldr(result->as_register_hi(),
+             frame_map()->address_for_slot(opr1->double_stack_ix(), hi_word_offset_in_bytes), acond);
+    } else if (opr1->is_illegal()) {
+      // do nothing: this part of the cmove has been optimized away in the peephole optimizer
+    } else {
+      assert(opr1->is_constant(), "must be");
+      LIR_Const* c = opr1->as_constant_ptr();
+
+      switch (c->type()) { // every constant load below is predicated on acond
+        case T_INT:
+          __ mov_slow(result->as_register(), c->as_jint(), acond);
+          break;
+        case T_LONG:
+          __ mov_slow(result->as_register_lo(), c->as_jint_lo(), acond);
+          __ mov_slow(result->as_register_hi(), c->as_jint_hi(), acond);
+          break;
+        case T_OBJECT:
+          __ mov_oop(result->as_register(), c->as_jobject(), 0, acond);
+          break;
+        case T_FLOAT:
+#ifdef __SOFTFP__
+          // not generated now.
+          __ mov_slow(result->as_register(), c->as_jint(), acond);
+#else
+          __ mov_float(result->as_float_reg(), c->as_jfloat(), acond);
+#endif // __SOFTFP__
+          break;
+        case T_DOUBLE:
+#ifdef __SOFTFP__
+          // not generated now.
+          __ mov_slow(result->as_register_lo(), c->as_jint_lo(), acond);
+          __ mov_slow(result->as_register_hi(), c->as_jint_hi(), acond);
+#else
+          __ mov_double(result->as_double_reg(), c->as_jdouble(), acond);
+#endif // __SOFTFP__
+          break;
+        default:
+          ShouldNotReachHere();
+      }
+    }
+
+    // Negate the condition and repeat the algorithm with the second operand
+    if (opr1 == opr2) { break; } // true after the second pass, or at once when both operands are the same
+    opr1 = opr2;
+    acond = ncond;
+  }
+#endif // AARCH64
+}
+
+#if defined(AARCH64) || defined(ASSERT)
+static int reg_size(LIR_Opr op) {
+  // Size in bytes associated with op's basic type.
+  const BasicType bt = op->type();
+  if (bt == T_FLOAT || bt == T_INT) {
+    return BytesPerInt;
+  } else if (bt == T_LONG || bt == T_DOUBLE) {
+    return BytesPerLong;
+  } else if (bt == T_OBJECT || bt == T_ARRAY || bt == T_METADATA) {
+    return BytesPerWord;
+  }
+  // T_ADDRESS, T_ILLEGAL and every other type are unexpected here.
+  ShouldNotReachHere();
+  return -1;
+}
+#endif
+
+void LIR_Assembler::arith_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dest, CodeEmitInfo* info, bool pop_fpu_stack) { // Emits dest = left <code> right for CPU and FPU register destinations (add/sub/mul, plus div for FPU)
+  assert(info == NULL, "unused on this code path");
+  assert(dest->is_register(), "wrong items state");
+
+  if (right->is_address()) {
+    // special case for adding shifted/extended register
+    const Register res = dest->as_pointer_register();
+    const Register lreg = left->as_pointer_register();
+    const LIR_Address* addr = right->as_address_ptr();
+
+    assert(addr->base()->as_pointer_register() == lreg && addr->index()->is_register() && addr->disp() == 0, "must be");
+
+    int scale = addr->scale();
+    AsmShift shift = lsl;
+
+#ifdef AARCH64
+    bool is_index_extended = reg_size(addr->base()) > reg_size(addr->index()); // index is narrower than base
+    if (scale < 0) { // a negative scale stands for a right shift by its magnitude
+      scale = -scale;
+      shift = lsr;
+    }
+    assert(shift == lsl || !is_index_extended, "could not have extend and right shift in one operand");
+    assert(0 <= scale && scale <= 63, "scale is too large");
+
+    if (is_index_extended) {
+      assert(scale <= 4, "scale is too large for add with extended register");
+      assert(addr->index()->is_single_cpu(), "should be");
+      assert(addr->index()->type() == T_INT, "should be");
+      assert(dest->is_double_cpu(), "should be");
+      assert(code == lir_add, "special case of add with extended register");
+
+      __ add(res, lreg, addr->index()->as_register(), ex_sxtw, scale); // add with the T_INT index as an extended-register operand
+      return;
+    } else if (reg_size(dest) == BytesPerInt) { // all operands are int-sized: use the _32 forms
+      assert(reg_size(addr->base()) == reg_size(addr->index()), "should be");
+      assert(reg_size(addr->base()) == reg_size(dest), "should be");
+
+      AsmOperand operand(addr->index()->as_pointer_register(), shift, scale);
+      switch (code) {
+        case lir_add: __ add_32(res, lreg, operand); break;
+        case lir_sub: __ sub_32(res, lreg, operand); break;
+        default: ShouldNotReachHere();
+      }
+      return;
+    }
+#endif // AARCH64
+
+    assert(reg_size(addr->base()) == reg_size(addr->index()), "should be");
+    assert(reg_size(addr->base()) == reg_size(dest), "should be");
+    assert(reg_size(dest) == wordSize, "should be");
+
+    AsmOperand operand(addr->index()->as_pointer_register(), shift, scale); // word-sized base +/- shifted index
+    switch (code) {
+      case lir_add: __ add(res, lreg, operand); break;
+      case lir_sub: __ sub(res, lreg, operand); break;
+      default: ShouldNotReachHere();
+    }
+
+#ifndef AARCH64
+  } else if (left->is_address()) {
+    assert(code == lir_sub && right->is_single_cpu(), "special case used by strength_reduce_multiply()");
+    const LIR_Address* addr = left->as_address_ptr();
+    const Register res = dest->as_register();
+    const Register rreg = right->as_register();
+    assert(addr->base()->as_register() == rreg && addr->index()->is_register() && addr->disp() == 0, "must be");
+    __ rsb(res, rreg, AsmOperand(addr->index()->as_register(), lsl, addr->scale())); // reverse subtract with the shifted index as second operand
+#endif // !AARCH64
+
+  } else if (dest->is_single_cpu()) {
+    assert(left->is_single_cpu(), "unexpected left operand");
+#ifdef AARCH64
+    assert(dest->type() == T_INT, "unexpected dest type");
+    assert(left->type() == T_INT, "unexpected left type");
+    assert(right->type() == T_INT, "unexpected right type");
+#endif // AARCH64
+
+    const Register res = dest->as_register();
+    const Register lreg = left->as_register();
+
+    if (right->is_single_cpu()) {
+      const Register rreg = right->as_register();
+      switch (code) {
+        case lir_add: __ add_32(res, lreg, rreg); break;
+        case lir_sub: __ sub_32(res, lreg, rreg); break;
+        case lir_mul: __ mul_32(res, lreg, rreg); break;
+        default: ShouldNotReachHere();
+      }
+    } else {
+      assert(right->is_constant(), "must be");
+      const jint c = right->as_constant_ptr()->as_jint();
+      if (!Assembler::is_arith_imm_in_range(c)) { // constant fails the arithmetic-immediate range check
+        BAILOUT("illegal arithmetic operand");
+      }
+      switch (code) {
+        case lir_add: __ add_32(res, lreg, c); break;
+        case lir_sub: __ sub_32(res, lreg, c); break;
+        default: ShouldNotReachHere();
+      }
+    }
+
+  } else if (dest->is_double_cpu()) {
+#ifdef AARCH64
+    assert(left->is_double_cpu() ||
+           (left->is_single_cpu() && ((left->type() == T_OBJECT) || (left->type() == T_ARRAY) || (left->type() == T_ADDRESS))),
+           "unexpected left operand");
+
+    const Register res = dest->as_register_lo();
+    const Register lreg = left->as_pointer_register();
+
+    if (right->is_constant()) {
+      assert(right->type() == T_LONG, "unexpected right type");
+      assert((right->as_constant_ptr()->as_jlong() >> 24) == 0, "out of range");
+      jint imm = (jint)right->as_constant_ptr()->as_jlong(); // narrowing is lossless given the range assert above
+      switch (code) {
+        case lir_add: __ add(res, lreg, imm); break;
+        case lir_sub: __ sub(res, lreg, imm); break;
+        default: ShouldNotReachHere();
+      }
+    } else {
+      assert(right->is_double_cpu() ||
+             (right->is_single_cpu() && ((right->type() == T_OBJECT) || (right->type() == T_ARRAY) || (right->type() == T_ADDRESS))),
+             "unexpected right operand");
+      const Register rreg = right->as_pointer_register();
+      switch (code) {
+        case lir_add: __ add(res, lreg, rreg); break;
+        case lir_sub: __ sub(res, lreg, rreg); break;
+        case lir_mul: __ mul(res, lreg, rreg); break;
+        default: ShouldNotReachHere();
+      }
+    }
+#else // AARCH64
+    Register res_lo = dest->as_register_lo();
+    Register res_hi = dest->as_register_hi();
+    Register lreg_lo = left->as_register_lo();
+    Register lreg_hi = left->as_register_hi();
+    if (right->is_double_cpu()) {
+      Register rreg_lo = right->as_register_lo();
+      Register rreg_hi = right->as_register_hi();
+      if (res_lo == lreg_hi || res_lo == rreg_hi) { // the low result would overwrite a high input still needed: compute it in Rtemp
+        res_lo = Rtemp;
+      }
+      switch (code) {
+        case lir_add:
+          __ adds(res_lo, lreg_lo, rreg_lo); // low words first (flag-setting form)
+          __ adc(res_hi, lreg_hi, rreg_hi); // then high words with carry
+          break;
+        case lir_sub:
+          __ subs(res_lo, lreg_lo, rreg_lo); // low words first (flag-setting form)
+          __ sbc(res_hi, lreg_hi, rreg_hi); // then high words with carry
+          break;
+        default:
+          ShouldNotReachHere();
+      }
+    } else {
+      assert(right->is_constant(), "must be");
+      assert((right->as_constant_ptr()->as_jlong() >> 32) == 0, "out of range");
+      const jint c = (jint) right->as_constant_ptr()->as_jlong(); // high half is zero per the assert, so only the low word is needed
+      if (res_lo == lreg_hi) { // same overlap hazard as in the register case
+        res_lo = Rtemp;
+      }
+      switch (code) {
+        case lir_add:
+          __ adds(res_lo, lreg_lo, c);
+          __ adc(res_hi, lreg_hi, 0);
+          break;
+        case lir_sub:
+          __ subs(res_lo, lreg_lo, c);
+          __ sbc(res_hi, lreg_hi, 0);
+          break;
+        default:
+          ShouldNotReachHere();
+      }
+    }
+    move_regs(res_lo, dest->as_register_lo()); // put the low word in place (matters when Rtemp was substituted above)
+#endif // AARCH64
+
+  } else if (dest->is_single_fpu()) {
+    assert(left->is_single_fpu(), "must be");
+    assert(right->is_single_fpu(), "must be");
+    const FloatRegister res = dest->as_float_reg();
+    const FloatRegister lreg = left->as_float_reg();
+    const FloatRegister rreg = right->as_float_reg();
+    switch (code) {
+      case lir_add: __ add_float(res, lreg, rreg); break;
+      case lir_sub: __ sub_float(res, lreg, rreg); break;
+      case lir_mul_strictfp: // fall through
+      case lir_mul: __ mul_float(res, lreg, rreg); break;
+      case lir_div_strictfp: // fall through
+      case lir_div: __ div_float(res, lreg, rreg); break;
+      default: ShouldNotReachHere();
+    }
+  } else if (dest->is_double_fpu()) {
+    assert(left->is_double_fpu(), "must be");
+    assert(right->is_double_fpu(), "must be");
+    const FloatRegister res = dest->as_double_reg();
+    const FloatRegister lreg = left->as_double_reg();
+    const FloatRegister rreg = right->as_double_reg();
+    switch (code) {
+      case lir_add: __ add_double(res, lreg, rreg); break;
+      case lir_sub: __ sub_double(res, lreg, rreg); break;
+      case lir_mul_strictfp: // fall through
+      case lir_mul: __ mul_double(res, lreg, rreg); break;
+      case lir_div_strictfp: // fall through
+      case lir_div: __ div_double(res, lreg, rreg); break;
+      default: ShouldNotReachHere();
+    }
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+
+void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, LIR_Opr dest, LIR_Op* op) {
+  // Emits a double-register math intrinsic: dest = abs(value) or dest = sqrt(value).
+  // The 'unused' and 'op' arguments are not consulted.
+  if (code == lir_abs) {
+    __ abs_double(dest->as_double_reg(), value->as_double_reg());
+  } else if (code == lir_sqrt) {
+    __ sqrt_double(dest->as_double_reg(), value->as_double_reg());
+  } else {
+    // No other LIR code is expected to reach this routine.
+    ShouldNotReachHere();
+  }
+}
+
+
+void LIR_Assembler::logic_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dest) { // Emits dest = left and/or/xor right for single-cpu and double-cpu register destinations
+  assert(dest->is_register(), "wrong items state");
+  assert(left->is_register(), "wrong items state");
+
+  if (dest->is_single_cpu()) {
+#ifdef AARCH64
+    assert (dest->type() == T_INT, "unexpected result type");
+    assert (left->type() == T_INT, "unexpected left type");
+    assert (right->type() == T_INT, "unexpected right type");
+#endif // AARCH64
+
+    const Register res = dest->as_register();
+    const Register lreg = left->as_register();
+
+    if (right->is_single_cpu()) {
+      const Register rreg = right->as_register();
+      switch (code) {
+        case lir_logic_and: __ and_32(res, lreg, rreg); break;
+        case lir_logic_or:  __ orr_32(res, lreg, rreg); break;
+        case lir_logic_xor: __ eor_32(res, lreg, rreg); break;
+        default: ShouldNotReachHere();
+      }
+    } else {
+      assert(right->is_constant(), "must be");
+      const uint c = (uint)right->as_constant_ptr()->as_jint();
+      switch (code) {
+        case lir_logic_and: __ and_32(res, lreg, c); break;
+        case lir_logic_or:  __ orr_32(res, lreg, c); break;
+        case lir_logic_xor: __ eor_32(res, lreg, c); break;
+        default: ShouldNotReachHere();
+      }
+    }
+  } else {
+    assert(dest->is_double_cpu(), "should be");
+    Register res_lo = dest->as_register_lo();
+
+#ifdef AARCH64
+    assert ((left->is_single_cpu() && left->is_oop_register()) || left->is_double_cpu(), "should be");
+    const Register lreg_lo = left->as_pointer_register();
+#else
+    assert (dest->type() == T_LONG, "unexpected result type");
+    assert (left->type() == T_LONG, "unexpected left type");
+    assert (right->type() == T_LONG, "unexpected right type");
+
+    const Register res_hi = dest->as_register_hi();
+    const Register lreg_lo = left->as_register_lo();
+    const Register lreg_hi = left->as_register_hi();
+#endif // AARCH64
+
+    if (right->is_register()) {
+#ifdef AARCH64
+      assert ((right->is_single_cpu() && right->is_oop_register()) || right->is_double_cpu(), "should be");
+      const Register rreg_lo = right->as_pointer_register();
+      switch (code) {
+        case lir_logic_and: __ andr(res_lo, lreg_lo, rreg_lo); break;
+        case lir_logic_or:  __ orr (res_lo, lreg_lo, rreg_lo); break;
+        case lir_logic_xor: __ eor (res_lo, lreg_lo, rreg_lo); break;
+        default: ShouldNotReachHere();
+      }
+#else
+      const Register rreg_lo = right->as_register_lo();
+      const Register rreg_hi = right->as_register_hi();
+      if (res_lo == lreg_hi || res_lo == rreg_hi) {
+        res_lo = Rtemp; // Temp register helps to avoid overlap between result and input
+      }
+      switch (code) { // each operation is applied to the low words, then to the high words
+        case lir_logic_and:
+          __ andr(res_lo, lreg_lo, rreg_lo);
+          __ andr(res_hi, lreg_hi, rreg_hi);
+          break;
+        case lir_logic_or:
+          __ orr(res_lo, lreg_lo, rreg_lo);
+          __ orr(res_hi, lreg_hi, rreg_hi);
+          break;
+        case lir_logic_xor:
+          __ eor(res_lo, lreg_lo, rreg_lo);
+          __ eor(res_hi, lreg_hi, rreg_hi);
+          break;
+        default:
+          ShouldNotReachHere();
+      }
+      move_regs(res_lo, dest->as_register_lo()); // put the low word in place (matters when Rtemp was substituted above)
+#endif // AARCH64
+    } else {
+      assert(right->is_constant(), "must be");
+#ifdef AARCH64
+      const julong c = (julong)right->as_constant_ptr()->as_jlong();
+      Assembler::LogicalImmediate imm(c, false);
+      if (imm.is_encoded()) { // only constants that LogicalImmediate reports as encoded are emitted inline
+        switch (code) {
+          case lir_logic_and: __ andr(res_lo, lreg_lo, imm); break;
+          case lir_logic_or:  __ orr (res_lo, lreg_lo, imm); break;
+          case lir_logic_xor: __ eor (res_lo, lreg_lo, imm); break;
+          default: ShouldNotReachHere();
+        }
+      } else {
+        BAILOUT("64 bit constant cannot be inlined");
+      }
+#else
+      const jint c_lo = (jint) right->as_constant_ptr()->as_jlong();
+      const jint c_hi = (jint) (right->as_constant_ptr()->as_jlong() >> 32);
+      // Case for logic_or from do_ClassIDIntrinsic()
+      if (c_hi == 0 && AsmOperand::is_rotated_imm(c_lo)) {
+        switch (code) {
+          case lir_logic_and:
+            __ andr(res_lo, lreg_lo, c_lo);
+            __ mov(res_hi, 0); // the constant's high word is 0, so the AND clears the high result
+            break;
+          case lir_logic_or:
+            __ orr(res_lo, lreg_lo, c_lo); // NOTE(review): res_hi is not written; correct only if dest and left share the high register - confirm
+            break;
+          case lir_logic_xor:
+            __ eor(res_lo, lreg_lo, c_lo); // NOTE(review): res_hi is not written; correct only if dest and left share the high register - confirm
+            break;
+        default:
+          ShouldNotReachHere();
+        }
+      } else if (code == lir_logic_and &&
+                 c_hi == -1 &&
+                 (AsmOperand::is_rotated_imm(c_lo) ||
+                  AsmOperand::is_rotated_imm(~c_lo))) {
+        // Another case which handles logic_and from do_ClassIDIntrinsic()
+        if (AsmOperand::is_rotated_imm(c_lo)) {
+          __ andr(res_lo, lreg_lo, c_lo);
+        } else {
+          __ bic(res_lo, lreg_lo, ~c_lo); // AND with c_lo expressed as a bit-clear of ~c_lo
+        }
+        if (res_hi != lreg_hi) { // high word of the constant is all ones: the high input passes through unchanged
+          __ mov(res_hi, lreg_hi); // NOTE(review): unlike the register case, no Rtemp guard for res_lo == lreg_hi here - confirm callers avoid that overlap
+        }
+      } else {
+        BAILOUT("64 bit constant cannot be inlined");
+      }
+#endif // AARCH64
+    }
+  }
+}
+
+
+#ifdef AARCH64
+
+void LIR_Assembler::long_compare_helper(LIR_Opr opr1, LIR_Opr opr2) { // Emits a compare of double-cpu register opr1 with a double-cpu register or T_LONG constant opr2
+  assert(opr1->is_double_cpu(), "should be");
+  Register x = opr1->as_register_lo();
+
+  if (opr2->is_double_cpu()) {
+    Register y = opr2->as_register_lo();
+    __ cmp(x, y);
+
+  } else {
+    assert(opr2->is_constant(), "should be");
+    assert(opr2->as_constant_ptr()->type() == T_LONG, "long constant expected");
+    jlong c = opr2->as_jlong();
+    assert(((c >> 31) == 0) || ((c >> 31) == -1), "immediate is out of range"); // i.e. c must fit in a signed 32-bit value
+    if (c >= 0) {
+      __ cmp(x, (jint)c);
+    } else {
+      __ cmn(x, (jint)(-c)); // negative constant: emit cmn with the negated value
+    }
+  }
+}
+
+#endif // AARCH64
+
+void LIR_Assembler::comp_op(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Op2* op) { // Emits the compare of opr1 with opr2; 'condition' is consulted only for 32-bit long compares, 'op' is not used
+  if (opr1->is_single_cpu()) {
+    if (opr2->is_constant()) {
+      switch (opr2->as_constant_ptr()->type()) {
+        case T_INT: {
+          const jint c = opr2->as_constant_ptr()->as_jint();
+          if (Assembler::is_arith_imm_in_range(c)) {
+            __ cmp_32(opr1->as_register(), c);
+          } else if (Assembler::is_arith_imm_in_range(-c)) { // c itself is out of range but -c is not: use cmn with -c
+            __ cmn_32(opr1->as_register(), -c);
+          } else {
+            // This can happen when compiling lookupswitch
+            __ mov_slow(Rtemp, c);
+            __ cmp_32(opr1->as_register(), Rtemp);
+          }
+          break;
+        }
+        case T_OBJECT:
+          assert(opr2->as_constant_ptr()->as_jobject() == NULL, "cannot handle otherwise");
+          __ cmp(opr1->as_register(), 0); // the only object constant accepted is NULL
+          break;
+        default:
+          ShouldNotReachHere();
+      }
+    } else if (opr2->is_single_cpu()) {
+      if (opr1->type() == T_OBJECT || opr1->type() == T_ARRAY || opr1->type() == T_METADATA || opr1->type() == T_ADDRESS) { // pointer-like operands use cmp, all others cmp_32
+        assert(opr2->type() == T_OBJECT || opr2->type() == T_ARRAY || opr2->type() == T_METADATA || opr2->type() == T_ADDRESS, "incompatibe type");
+        __ cmp(opr1->as_register(), opr2->as_register());
+      } else {
+        assert(opr2->type() != T_OBJECT && opr2->type() != T_ARRAY && opr2->type() != T_METADATA && opr2->type() != T_ADDRESS, "incompatibe type");
+        __ cmp_32(opr1->as_register(), opr2->as_register());
+      }
+    } else {
+      ShouldNotReachHere();
+    }
+  } else if (opr1->is_double_cpu()) {
+#ifdef AARCH64
+    long_compare_helper(opr1, opr2);
+#else
+    Register xlo = opr1->as_register_lo();
+    Register xhi = opr1->as_register_hi();
+    if (opr2->is_constant() && opr2->as_jlong() == 0) {
+      assert(condition == lir_cond_equal || condition == lir_cond_notEqual, "cannot handle otherwise");
+      __ orrs(Rtemp, xlo, xhi); // OR of both halves into Rtemp (flag-setting form): zero only if the whole long is zero
+    } else if (opr2->is_register()) {
+      Register ylo = opr2->as_register_lo();
+      Register yhi = opr2->as_register_hi();
+      if (condition == lir_cond_equal || condition == lir_cond_notEqual) {
+        __ teq(xhi, yhi);
+        __ teq(xlo, ylo, eq); // low halves are tested only when the high halves compared equal
+      } else {
+        __ subs(xlo, xlo, ylo); // NOTE: this path overwrites opr1's own registers with the difference
+        __ sbcs(xhi, xhi, yhi);
+      }
+    } else {
+      ShouldNotReachHere();
+    }
+#endif // AARCH64
+  } else if (opr1->is_single_fpu()) {
+    if (opr2->is_constant()) {
+      assert(opr2->as_jfloat() == 0.0f, "cannot handle otherwise");
+      __ cmp_zero_float(opr1->as_float_reg());
+    } else {
+      __ cmp_float(opr1->as_float_reg(), opr2->as_float_reg());
+    }
+  } else if (opr1->is_double_fpu()) {
+    if (opr2->is_constant()) {
+      assert(opr2->as_jdouble() == 0.0, "cannot handle otherwise");
+      __ cmp_zero_double(opr1->as_double_reg());
+    } else {
+      __ cmp_double(opr1->as_double_reg(), opr2->as_double_reg());
+    }
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+void LIR_Assembler::comp_fl2i(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst, LIR_Op2* op) { // Emits a three-way compare leaving -1, 0 or 1 in dst, for lir_cmp_fd2i / lir_ucmp_fd2i / lir_cmp_l2i
+  const Register res = dst->as_register();
+  if (code == lir_cmp_fd2i || code == lir_ucmp_fd2i) {
+    comp_op(lir_cond_unknown, left, right, op); // emits the compare whose outcome is decoded below
+#ifdef AARCH64
+    if (code == lir_ucmp_fd2i) {         // unordered is less
+      __ cset(res, gt);                  // 1 if '>', else 0
+      __ csinv(res, res, ZR, ge);        // previous value if '>=', else -1
+    } else {
+      __ cset(res, hi);                  // 1 if '>' or unordered, else 0
+      __ csinv(res, res, ZR, pl);        // previous value if '>=' or unordered, else -1
+    }
+#else
+    __ fmstat(); // presumably copies the FP compare status into the CPU condition flags - confirm
+    if (code == lir_ucmp_fd2i) {  // unordered is less
+      __ mvn(res, 0, lt);
+      __ mov(res, 1, ge);
+    } else {                      // unordered is greater
+      __ mov(res, 1, cs);
+      __ mvn(res, 0, cc);
+    }
+    __ mov(res, 0, eq); // equality overrides whichever value was chosen above
+#endif // AARCH64
+
+  } else {
+    assert(code == lir_cmp_l2i, "must be");
+
+#ifdef AARCH64
+    long_compare_helper(left, right);
+
+    __ cset(res, gt);            // 1 if '>', else 0
+    __ csinv(res, res, ZR, ge);  // previous value if '>=', else -1
+#else
+    Label done;
+    const Register xlo = left->as_register_lo();
+    const Register xhi = left->as_register_hi();
+    const Register ylo = right->as_register_lo();
+    const Register yhi = right->as_register_hi();
+    __ cmp(xhi, yhi); // high words are compared with signed conditions (gt/lt)
+    __ mov(res, 1, gt);
+    __ mvn(res, 0, lt);
+    __ b(done, ne); // high words differ: result already decided
+    __ subs(res, xlo, ylo); // high words equal: res = 0 when the low words match too
+    __ mov(res, 1, hi); // low words are compared with unsigned conditions (hi/lo)
+    __ mvn(res, 0, lo);
+    __ bind(done);
+#endif // AARCH64
+  }
+}
+
+
+void LIR_Assembler::align_call(LIR_Code code) {
+  // Not needed: nothing is emitted here to align call sites, whatever the LIR code.
+}
+
+
+void LIR_Assembler::call(LIR_OpJavaCall *op, relocInfo::relocType rtype) { // Emits a patchable call to op->addr() with relocation type rtype and attaches op's call info
+  int ret_addr_offset = __ patchable_call(op->addr(), rtype);
+  assert(ret_addr_offset == __ offset(), "embedded return address not allowed"); // the call must end exactly at the current code offset
+  add_call_info_here(op->info());
+}
+
+
+void LIR_Assembler::ic_call(LIR_OpJavaCall *op) { // Emits an inline-cache call: Ricklass is loaded with Universe::non_oop_word(), then control goes to op->addr()
+  bool near_range = __ cache_fully_reachable();
+  address oop_address = pc(); // start of the Ricklass load, recorded in the virtual_call relocation below
+
+  bool use_movw = AARCH64_ONLY(false) NOT_AARCH64(VM_Version::supports_movw());
+
+  // Ricklass may contain something that is not a metadata pointer so
+  // mov_metadata can't be used
+  InlinedAddress value((address)Universe::non_oop_word());
+  InlinedAddress addr(op->addr());
+  if (use_movw) {
+#ifdef AARCH64
+    ShouldNotReachHere();
+#else
+    __ movw(Ricklass, ((unsigned int)Universe::non_oop_word()) & 0xffff); // low 16 bits
+    __ movt(Ricklass, ((unsigned int)Universe::non_oop_word()) >> 16); // high 16 bits
+#endif // AARCH64
+  } else {
+    // No movw/movt, so we must load a PC-relative value, but there is no
+    // relocation and hence no metadata table to load from.
+    // Use a b instruction rather than a bl, inline constant after the
+    // branch, use a PC relative ldr to load the constant, arrange for
+    // the call to return after the constant(s).
+    __ ldr_literal(Ricklass, value);
+  }
+  __ relocate(virtual_call_Relocation::spec(oop_address));
+  if (near_range && use_movw) { // nothing has to be inlined after the call: a plain bl suffices
+    __ bl(op->addr());
+  } else {
+    Label call_return;
+    __ adr(LR, call_return); // set the return address by hand so the callee returns past the inlined literal(s)
+    if (near_range) {
+      __ b(op->addr());
+    } else {
+      __ indirect_jump(addr, Rtemp);
+      __ bind_literal(addr); // the target address literal sits right after the jump
+    }
+    if (!use_movw) {
+      __ bind_literal(value); // the literal read by ldr_literal above
+    }
+    __ bind(call_return);
+  }
+  add_call_info(code_offset(), op->info());
+}
+
+
+/* vtable dispatch is currently enabled only on SPARC platforms, so this entry point must never be reached here. */
+void LIR_Assembler::vtable_call(LIR_OpJavaCall* op) {
+  ShouldNotReachHere();
+}
+
+void LIR_Assembler::emit_static_call_stub() { // Emits the stub for the call at the current pc: an Rmethod literal load (initially NULL) followed by a jump with a placeholder target
+  address call_pc = __ pc();
+  address stub = __ start_a_stub(call_stub_size);
+  if (stub == NULL) { // start_a_stub could not provide a stub
+    BAILOUT("static call stub overflow");
+  }
+
+  DEBUG_ONLY(int offset = code_offset();)
+
+  InlinedMetadata metadata_literal(NULL);
+  __ relocate(static_stub_Relocation::spec(call_pc)); // ties the stub to the call site recorded above
+  // If not a single instruction, NativeMovConstReg::next_instruction_address()
+  // must jump over the whole following ldr_literal.
+  // (See CompiledStaticCall::set_to_interpreted())
+#ifdef ASSERT
+  address ldr_site = __ pc();
+#endif
+  __ ldr_literal(Rmethod, metadata_literal);
+  assert(nativeMovConstReg_at(ldr_site)->next_instruction_address() == __ pc(), "Fix ldr_literal or its parsing");
+  bool near_range = __ cache_fully_reachable();
+  InlinedAddress dest((address)-1);
+  if (near_range) {
+    address branch_site = __ pc();
+    __ b(branch_site); // b to self maps to special NativeJump -1 destination
+  } else {
+    __ indirect_jump(dest, Rtemp);
+  }
+  __ bind_literal(metadata_literal); // includes spec_for_immediate reloc
+  if (!near_range) {
+    __ bind_literal(dest); // special NativeJump -1 destination
+  }
+
+  assert(code_offset() - offset <= call_stub_size, "overflow"); // emitted stub must fit in call_stub_size
+  __ end_a_stub();
+}
+
+void LIR_Assembler::throw_op(LIR_Opr exceptionPC, LIR_Opr exceptionOop, CodeEmitInfo* info) { // Emits a call to the Runtime1 exception handler with Rexception_pc set to the address following the call
+  assert(exceptionOop->as_register() == Rexception_obj, "must match");
+  assert(exceptionPC->as_register()  == Rexception_pc, "must match");
+  info->add_register_oop(exceptionOop);
+
+  Runtime1::StubID handle_id = compilation()->has_fpu_code() ? // the nofpu handler is chosen when the compilation has no FPU code
+                               Runtime1::handle_exception_id :
+                               Runtime1::handle_exception_nofpu_id;
+  Label return_address;
+  __ adr(Rexception_pc, return_address); // Rexception_pc = address bound right after the call below
+  __ call(Runtime1::entry_for(handle_id), relocInfo::runtime_call_type);
+  __ bind(return_address);
+  add_call_info_here(info);  // for exception handler
+}
+
+void LIR_Assembler::unwind_op(LIR_Opr exceptionOop) {  // Emits an unconditional branch to _unwind_handler_entry
+  assert(exceptionOop->as_register() == Rexception_obj, "must match");  // the operand must already be in Rexception_obj
+  __ b(_unwind_handler_entry);
+}
+
+void LIR_Assembler::shift_op(LIR_Code code, LIR_Opr left, LIR_Opr count, LIR_Opr dest, LIR_Opr tmp) {  // Shift of 'left' by a count held in a register, result in 'dest'; 'tmp' is not referenced in this body
+#ifdef AARCH64
+  if (dest->is_single_cpu()) {
+    Register res = dest->as_register();
+    Register x = left->as_register();
+    Register y = count->as_register();
+    assert (dest->type() == T_INT, "unexpected result type");
+    assert (left->type() == T_INT, "unexpected left type");
+
+    switch (code) {  // int case: the _w forms of the variable-shift instructions
+      case lir_shl:  __ lslv_w(res, x, y); break;
+      case lir_shr:  __ asrv_w(res, x, y); break;
+      case lir_ushr: __ lsrv_w(res, x, y); break;
+      default: ShouldNotReachHere();
+    }
+  } else if (dest->is_double_cpu()) {
+    Register res = dest->as_register_lo();  // only the _lo register of the pair is used on AARCH64
+    Register x = left->as_register_lo();
+    Register y = count->as_register();
+
+    switch (code) {  // long case
+      case lir_shl:  __ lslv(res, x, y); break;
+      case lir_shr:  __ asrv(res, x, y); break;
+      case lir_ushr: __ lsrv(res, x, y); break;
+      default: ShouldNotReachHere();
+    }
+  } else {
+    ShouldNotReachHere();
+  }
+#else
+  AsmShift shift = lsl;
+  switch (code) {  // map the LIR shift code to the assembler shift kind
+    case lir_shl:  shift = lsl; break;
+    case lir_shr:  shift = asr; break;
+    case lir_ushr: shift = lsr; break;
+    default: ShouldNotReachHere();
+  }
+
+  if (dest->is_single_cpu()) {
+    __ andr(Rtemp, count->as_register(), 31);  // Rtemp = count & 31
+    __ mov(dest->as_register(), AsmOperand(left->as_register(), shift, Rtemp));
+  } else if (dest->is_double_cpu()) {
+    Register dest_lo = dest->as_register_lo();
+    Register dest_hi = dest->as_register_hi();
+    Register src_lo  = left->as_register_lo();
+    Register src_hi  = left->as_register_hi();
+    Register Rcount  = count->as_register();
+    // Resolve possible register conflicts: at most one of dest_hi, dest_lo or Rcount is redirected to Rtemp
+    if (shift == lsl && dest_hi == src_lo) {
+      dest_hi = Rtemp;
+    } else if (shift != lsl && dest_lo == src_hi) {
+      dest_lo = Rtemp;
+    } else if (dest_lo == src_lo && dest_hi == src_hi) {
+      dest_lo = Rtemp;
+    } else if (dest_lo == Rcount || dest_hi == Rcount) {
+      Rcount = Rtemp;
+    }
+    __ andr(Rcount, count->as_register(), 63);  // Rcount = count & 63 (Rcount is Rtemp if it was redirected above)
+    __ long_shift(dest_lo, dest_hi, src_lo, src_hi, shift, Rcount);
+    move_regs(dest_lo, dest->as_register_lo());  // moves the result to the real destination half if it was redirected to Rtemp
+    move_regs(dest_hi, dest->as_register_hi());
+  } else {
+    ShouldNotReachHere();
+  }
+#endif // AARCH64
+}
+
+
+void LIR_Assembler::shift_op(LIR_Code code, LIR_Opr left, jint count, LIR_Opr dest) {  // Shift of 'left' by a constant count, result in 'dest'; a masked count of 0 becomes a plain move
+#ifdef AARCH64
+  if (dest->is_single_cpu()) {
+    assert (dest->type() == T_INT, "unexpected result type");
+    assert (left->type() == T_INT, "unexpected left type");
+    count &= 31;  // int shifts use only the low 5 bits of the count
+    if (count != 0) {
+      switch (code) {
+        case lir_shl:  __ _lsl_w(dest->as_register(), left->as_register(), count); break;
+        case lir_shr:  __ _asr_w(dest->as_register(), left->as_register(), count); break;
+        case lir_ushr: __ _lsr_w(dest->as_register(), left->as_register(), count); break;
+        default: ShouldNotReachHere();
+      }
+    } else {
+      move_regs(left->as_register(), dest->as_register());
+    }
+  } else if (dest->is_double_cpu()) {
+    count &= 63;  // long shifts use only the low 6 bits of the count
+    if (count != 0) {
+      switch (code) {
+        case lir_shl:  __ _lsl(dest->as_register_lo(), left->as_register_lo(), count); break;
+        case lir_shr:  __ _asr(dest->as_register_lo(), left->as_register_lo(), count); break;
+        case lir_ushr: __ _lsr(dest->as_register_lo(), left->as_register_lo(), count); break;
+        default: ShouldNotReachHere();
+      }
+    } else {
+      move_regs(left->as_register_lo(), dest->as_register_lo());
+    }
+  } else {
+    ShouldNotReachHere();
+  }
+
+#else
+  AsmShift shift = lsl;
+  switch (code) {  // map the LIR shift code to the assembler shift kind
+    case lir_shl:  shift = lsl; break;
+    case lir_shr:  shift = asr; break;
+    case lir_ushr: shift = lsr; break;
+    default: ShouldNotReachHere();
+  }
+
+  if (dest->is_single_cpu()) {
+    count &= 31;  // int shifts use only the low 5 bits of the count
+    if (count != 0) {
+      __ mov(dest->as_register(), AsmOperand(left->as_register(), shift, count));
+    } else {
+      move_regs(left->as_register(), dest->as_register());
+    }
+  } else if (dest->is_double_cpu()) {
+    count &= 63;  // long shifts use only the low 6 bits of the count
+    if (count != 0) {
+      Register dest_lo = dest->as_register_lo();
+      Register dest_hi = dest->as_register_hi();
+      Register src_lo  = left->as_register_lo();
+      Register src_hi  = left->as_register_hi();
+      // Resolve possible register conflicts: at most one destination half is redirected to Rtemp
+      if (shift == lsl && dest_hi == src_lo) {
+        dest_hi = Rtemp;
+      } else if (shift != lsl && dest_lo == src_hi) {
+        dest_lo = Rtemp;
+      }
+      __ long_shift(dest_lo, dest_hi, src_lo, src_hi, shift, count);
+      move_regs(dest_lo, dest->as_register_lo());  // moves the result to the real destination half if it was redirected to Rtemp
+      move_regs(dest_hi, dest->as_register_hi());
+    } else {
+      __ long_move(dest->as_register_lo(), dest->as_register_hi(),
+                   left->as_register_lo(), left->as_register_hi());
+    }
+  } else {
+    ShouldNotReachHere();
+  }
+#endif // AARCH64
+}
+
+
+// Stores the four given registers into the reserved argument area, starting at SP.
+void LIR_Assembler::save_in_reserved_area(Register r1, Register r2, Register r3, Register r4) {
+  verify_reserved_argument_area_size(4);
+#ifndef AARCH64
+  __ stmia(SP, RegisterSet(r1) | RegisterSet(r2) | RegisterSet(r3) | RegisterSet(r4));  // single store-multiple at SP
+#else
+  __ stp(r1, r2, Address(SP, 0));           // first pair at SP
+  __ stp(r3, r4, Address(SP, 2*wordSize));  // second pair two words further
+#endif // !AARCH64
+}
+
+// Loads the four given registers back from the reserved argument area, starting at SP.
+void LIR_Assembler::restore_from_reserved_area(Register r1, Register r2, Register r3, Register r4) {
+#ifndef AARCH64
+  __ ldmia(SP, RegisterSet(r1) | RegisterSet(r2) | RegisterSet(r3) | RegisterSet(r4), no_writeback);
+#else
+  __ ldp(r1, r2, Address(SP, 0));           // first pair at SP
+  __ ldp(r3, r4, Address(SP, 2*wordSize));  // second pair two words further
+#endif // !AARCH64
+}
+
+
+void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {  // Emits inline arraycopy code; cases that fail the emitted checks branch to op->stub()
+  ciArrayKlass* default_type = op->expected_type();
+  Register src = op->src()->as_register();
+  Register src_pos = op->src_pos()->as_register();
+  Register dst = op->dst()->as_register();
+  Register dst_pos = op->dst_pos()->as_register();
+  Register length  = op->length()->as_register();
+  Register tmp = op->tmp()->as_register();
+  Register tmp2 = Rtemp;
+
+  assert(src == R0 && src_pos == R1 && dst == R2 && dst_pos == R3, "code assumption");
+#ifdef AARCH64
+  assert(length == R4, "code assumption");
+#endif // AARCH64
+
+  CodeStub* stub = op->stub();
+
+  int flags = op->flags();
+  BasicType basic_type = default_type != NULL ? default_type->element_type()->basic_type() : T_ILLEGAL;
+  if (basic_type == T_ARRAY) basic_type = T_OBJECT;  // arrays of arrays are handled as object arrays
+
+  // If the expected array type is unknown (default_type == NULL), just go through the generic arraycopy
+  if (default_type == NULL) {
+
+    // save arguments, because they will be killed by a runtime call
+    save_in_reserved_area(R0, R1, R2, R3);
+
+#ifdef AARCH64
+    // save length argument, will be killed by a runtime call
+    __ raw_push(length, ZR);
+#else
+    // pass length argument on SP[0]
+    __ str(length, Address(SP, -2*wordSize, pre_indexed));  // 2 words for a proper stack alignment
+#endif // AARCH64
+
+    address copyfunc_addr = StubRoutines::generic_arraycopy();
+    if (copyfunc_addr == NULL) { // Use C version if stub was not generated
+      __ call(CAST_FROM_FN_PTR(address, Runtime1::arraycopy));
+    } else {
+#ifndef PRODUCT
+      if (PrintC1Statistics) {
+        __ inc_counter((address)&Runtime1::_generic_arraycopystub_cnt, tmp, tmp2);
+      }
+#endif // !PRODUCT
+      // the stub is in the code cache so close enough
+      __ call(copyfunc_addr, relocInfo::runtime_call_type);
+    }
+
+#ifdef AARCH64
+    __ raw_pop(length, ZR);
+#else
+    __ add(SP, SP, 2*wordSize);  // drop the two words pushed for the length argument
+#endif // AARCH64
+
+    __ cbz_32(R0, *stub->continuation());  // zero result in R0: done, skip the slow path
+
+    if (copyfunc_addr != NULL) {
+      __ mvn_32(tmp, R0);  // tmp = ~R0; presumably the number of elements already copied -- TODO confirm against the stub
+      restore_from_reserved_area(R0, R1, R2, R3);  // load saved arguments in slow case only
+      __ sub_32(length, length, tmp);  // adjust length and positions by tmp before entering the slow path
+      __ add_32(src_pos, src_pos, tmp);
+      __ add_32(dst_pos, dst_pos, tmp);
+    } else {
+      restore_from_reserved_area(R0, R1, R2, R3);  // load saved arguments in slow case only
+    }
+
+    __ b(*stub->entry());
+
+    __ bind(*stub->continuation());
+    return;
+  }
+
+  assert(default_type != NULL && default_type->is_array_klass() && default_type->is_loaded(),
+         "must be true at this point");
+  int elem_size = type2aelembytes(basic_type);
+  int shift = exact_log2(elem_size);  // log2 of the element size, used to scale positions into byte offsets
+
+  // Check for NULL
+  if (flags & LIR_OpArrayCopy::src_null_check) {
+    if (flags & LIR_OpArrayCopy::dst_null_check) {
+      __ cmp(src, 0);
+      __ cond_cmp(dst, 0, ne);  // make one instruction shorter if both checks are needed
+      __ b(*stub->entry(), eq);
+    } else {
+      __ cbz(src, *stub->entry());
+    }
+  } else if (flags & LIR_OpArrayCopy::dst_null_check) {
+    __ cbz(dst, *stub->entry());
+  }
+
+  // If the compiler was not able to prove that exact type of the source or the destination
+  // of the arraycopy is an array type, check at runtime if the source or the destination is
+  // an instance type.
+  if (flags & LIR_OpArrayCopy::type_check) {
+    if (!(flags & LIR_OpArrayCopy::LIR_OpArrayCopy::dst_objarray)) {
+      __ load_klass(tmp, dst);
+      __ ldr_u32(tmp2, Address(tmp, in_bytes(Klass::layout_helper_offset())));
+      __ mov_slow(tmp, Klass::_lh_neutral_value);
+      __ cmp_32(tmp2, tmp);
+      __ b(*stub->entry(), ge);  // layout helper >= _lh_neutral_value: take the slow path
+    }
+
+    if (!(flags & LIR_OpArrayCopy::LIR_OpArrayCopy::src_objarray)) {
+      __ load_klass(tmp, src);
+      __ ldr_u32(tmp2, Address(tmp, in_bytes(Klass::layout_helper_offset())));
+      __ mov_slow(tmp, Klass::_lh_neutral_value);
+      __ cmp_32(tmp2, tmp);
+      __ b(*stub->entry(), ge);  // layout helper >= _lh_neutral_value: take the slow path
+    }
+  }
+
+  // Check if negative
+  const int all_positive_checks = LIR_OpArrayCopy::src_pos_positive_check |
+                                  LIR_OpArrayCopy::dst_pos_positive_check |
+                                  LIR_OpArrayCopy::length_positive_check;
+  switch (flags & all_positive_checks) {  // one case per combination of requested checks, so several checks can share one helper call
+    case LIR_OpArrayCopy::src_pos_positive_check:
+      __ branch_if_negative_32(src_pos, *stub->entry());
+      break;
+    case LIR_OpArrayCopy::dst_pos_positive_check:
+      __ branch_if_negative_32(dst_pos, *stub->entry());
+      break;
+    case LIR_OpArrayCopy::length_positive_check:
+      __ branch_if_negative_32(length, *stub->entry());
+      break;
+    case LIR_OpArrayCopy::src_pos_positive_check | LIR_OpArrayCopy::dst_pos_positive_check:
+      __ branch_if_any_negative_32(src_pos, dst_pos, tmp, *stub->entry());
+      break;
+    case LIR_OpArrayCopy::src_pos_positive_check | LIR_OpArrayCopy::length_positive_check:
+      __ branch_if_any_negative_32(src_pos, length, tmp, *stub->entry());
+      break;
+    case LIR_OpArrayCopy::dst_pos_positive_check | LIR_OpArrayCopy::length_positive_check:
+      __ branch_if_any_negative_32(dst_pos, length, tmp, *stub->entry());
+      break;
+    case all_positive_checks:
+      __ branch_if_any_negative_32(src_pos, dst_pos, length, tmp, *stub->entry());
+      break;
+    default:
+      assert((flags & all_positive_checks) == 0, "the last option");
+  }
+
+  // Range checks
+  if (flags & LIR_OpArrayCopy::src_range_check) {
+    __ ldr_s32(tmp2, Address(src, arrayOopDesc::length_offset_in_bytes()));
+    __ add_32(tmp, src_pos, length);
+    __ cmp_32(tmp, tmp2);
+    __ b(*stub->entry(), hi);  // src_pos + length above the array length (unsigned compare): slow path
+  }
+  if (flags & LIR_OpArrayCopy::dst_range_check) {
+    __ ldr_s32(tmp2, Address(dst, arrayOopDesc::length_offset_in_bytes()));
+    __ add_32(tmp, dst_pos, length);
+    __ cmp_32(tmp, tmp2);
+    __ b(*stub->entry(), hi);  // dst_pos + length above the array length (unsigned compare): slow path
+  }
+
+  // Check if src and dst are of the same type
+  if (flags & LIR_OpArrayCopy::type_check) {
+    // We don't know the array types are compatible
+    if (basic_type != T_OBJECT) {
+      // Simple test for basic type arrays
+      if (UseCompressedClassPointers) {
+        // We don't need decode because we just need to compare
+        __ ldr_u32(tmp, Address(src, oopDesc::klass_offset_in_bytes()));
+        __ ldr_u32(tmp2, Address(dst, oopDesc::klass_offset_in_bytes()));
+        __ cmp_32(tmp, tmp2);
+      } else {
+        __ load_klass(tmp, src);
+        __ load_klass(tmp2, dst);
+        __ cmp(tmp, tmp2);
+      }
+      __ b(*stub->entry(), ne);
+    } else {
+      // For object arrays, if src is a sub class of dst then we can
+      // safely do the copy.
+      Label cont, slow;
+
+      address copyfunc_addr = StubRoutines::checkcast_arraycopy();
+
+      __ load_klass(tmp, src);
+      __ load_klass(tmp2, dst);
+
+      // We are at a call so all live registers are saved before we
+      // get here
+      assert_different_registers(tmp, tmp2, R6, altFP_7_11);
+
+      __ check_klass_subtype_fast_path(tmp, tmp2, R6, altFP_7_11, &cont, copyfunc_addr == NULL ? stub->entry() : &slow, NULL);
+
+      __ mov(R6, R0);  // keep R0/R1 (src, src_pos) in R6/altFP_7_11 around the slow subtype check call
+      __ mov(altFP_7_11, R1);
+      __ mov(R0, tmp);
+      __ mov(R1, tmp2);
+      __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type); // does not blow any registers except R0, LR and Rtemp
+      __ cmp_32(R0, 0);  // flags from this compare are consumed by the conditional branches below
+      __ mov(R0, R6);
+      __ mov(R1, altFP_7_11);
+
+      if (copyfunc_addr != NULL) { // use stub if available
+        // src is not a sub class of dst so we have to do a
+        // per-element check.
+
+        __ b(cont, ne);
+
+        __ bind(slow);
+
+        int mask = LIR_OpArrayCopy::src_objarray|LIR_OpArrayCopy::dst_objarray;
+        if ((flags & mask) != mask) {
+          // Only one of them is statically known to be an object array; check the other one at run time.
+          assert(flags & mask, "one of the two should be known to be an object array");
+
+          if (!(flags & LIR_OpArrayCopy::src_objarray)) {
+            __ load_klass(tmp, src);
+          } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
+            __ load_klass(tmp, dst);
+          }
+          int lh_offset = in_bytes(Klass::layout_helper_offset());
+
+          __ ldr_u32(tmp2, Address(tmp, lh_offset));
+
+          jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
+          __ mov_slow(tmp, objArray_lh);
+          __ cmp_32(tmp, tmp2);
+          __ b(*stub->entry(), ne);  // layout helper differs from the object-array layout helper: slow path
+        }
+
+        save_in_reserved_area(R0, R1, R2, R3);
+
+        Register src_ptr = R0;
+        Register dst_ptr = R1;
+        Register len     = R2;
+        Register chk_off = R3;
+        Register super_k = AARCH64_ONLY(R4) NOT_AARCH64(tmp);
+
+        __ add(src_ptr, src, arrayOopDesc::base_offset_in_bytes(basic_type));
+        __ add_ptr_scaled_int32(src_ptr, src_ptr, src_pos, shift);
+
+        __ add(dst_ptr, dst, arrayOopDesc::base_offset_in_bytes(basic_type));
+        __ add_ptr_scaled_int32(dst_ptr, dst_ptr, dst_pos, shift);
+        __ load_klass(tmp, dst);
+
+        int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
+        int sco_offset = in_bytes(Klass::super_check_offset_offset());
+
+#ifdef AARCH64
+        __ raw_push(length, ZR); // Preserve length around *copyfunc_addr call
+
+        __ mov(len, length);
+        __ ldr(super_k, Address(tmp, ek_offset)); // super_k == R4 == length, so this load cannot be performed earlier
+        // TODO-AARCH64: check whether it is faster to load super klass early by using tmp and additional mov.
+        __ ldr_u32(chk_off, Address(super_k, sco_offset));
+#else // AARCH64
+        __ ldr(super_k, Address(tmp, ek_offset));
+
+        __ mov(len, length);
+        __ ldr_u32(chk_off, Address(super_k, sco_offset));
+        __ push(super_k);  // pushed here and dropped again after the call ("Drop super_k argument" below)
+#endif // AARCH64
+
+        __ call(copyfunc_addr, relocInfo::runtime_call_type);
+
+#ifndef PRODUCT
+        if (PrintC1Statistics) {
+          Label failed;
+          __ cbnz_32(R0, failed);
+          __ inc_counter((address)&Runtime1::_arraycopy_checkcast_cnt, tmp, tmp2);
+          __ bind(failed);
+        }
+#endif // !PRODUCT
+
+#ifdef AARCH64
+        __ raw_pop(length, ZR);
+#else
+        __ add(SP, SP, wordSize);  // Drop super_k argument
+#endif // AARCH64
+
+        __ cbz_32(R0, *stub->continuation());  // zero result in R0: done, skip the slow path
+        __ mvn_32(tmp, R0);  // tmp = ~R0; presumably the number of elements already copied -- TODO confirm against the stub
+
+        // load saved arguments in slow case only
+        restore_from_reserved_area(R0, R1, R2, R3);
+
+        __ sub_32(length, length, tmp);  // adjust length and positions by tmp before entering the slow path
+        __ add_32(src_pos, src_pos, tmp);
+        __ add_32(dst_pos, dst_pos, tmp);
+
+#ifndef PRODUCT
+        if (PrintC1Statistics) {
+          __ inc_counter((address)&Runtime1::_arraycopy_checkcast_attempt_cnt, tmp, tmp2);
+        }
+#endif // !PRODUCT
+
+        __ b(*stub->entry());
+
+        __ bind(cont);
+      } else {
+        __ b(*stub->entry(), eq);  // no checkcast stub: R0 == 0 after the slow subtype check goes to the slow path
+        __ bind(cont);
+      }
+    }
+  }
+
+#ifndef PRODUCT
+  if (PrintC1Statistics) {
+    address counter = Runtime1::arraycopy_count_address(basic_type);
+    __ inc_counter(counter, tmp, tmp2);
+  }
+#endif // !PRODUCT
+
+  bool disjoint = (flags & LIR_OpArrayCopy::overlapping) == 0;
+  bool aligned = (flags & LIR_OpArrayCopy::unaligned) == 0;
+  const char *name;  // filled in by select_arraycopy_function(); not used afterwards in this function
+  address entry = StubRoutines::select_arraycopy_function(basic_type, aligned, disjoint, name, false);
+
+  Register src_ptr = R0;
+  Register dst_ptr = R1;
+  Register len     = R2;
+
+  __ add(src_ptr, src, arrayOopDesc::base_offset_in_bytes(basic_type));  // src_ptr = src + array base offset + (src_pos << shift)
+  __ add_ptr_scaled_int32(src_ptr, src_ptr, src_pos, shift);
+
+  __ add(dst_ptr, dst, arrayOopDesc::base_offset_in_bytes(basic_type));  // dst_ptr = dst + array base offset + (dst_pos << shift)
+  __ add_ptr_scaled_int32(dst_ptr, dst_ptr, dst_pos, shift);
+
+  __ mov(len, length);
+
+  __ call(entry, relocInfo::runtime_call_type);
+
+  __ bind(*stub->continuation());
+}
+
+#ifdef ASSERT
+// Emits a run-time assertion for a lir_assert op; only compiled when ASSERT is defined.
+void LIR_Assembler::emit_assert(LIR_OpAssert* op) {
+  assert(op->code() == lir_assert, "must be");
+
+#ifdef AARCH64
+  __ NOT_IMPLEMENTED();
+#else
+  if (op->in_opr1()->is_valid()) {  // operands present: emit the comparison tested by the conditional branch below
+    assert(op->in_opr2()->is_valid(), "both operands must be valid");
+    comp_op(op->condition(), op->in_opr1(), op->in_opr2(), op);
+  } else {
+    assert(op->in_opr2()->is_illegal(), "both operands must be illegal");
+    assert(op->condition() == lir_cond_always, "no other conditions allowed");
+  }
+
+  Label ok;
+  if (op->condition() != lir_cond_always) {
+    AsmCondition acond;
+    switch (op->condition()) {  // translate the LIR condition into the assembler condition code
+      case lir_cond_equal:        acond = eq; break;
+      case lir_cond_notEqual:     acond = ne; break;
+      case lir_cond_less:         acond = lt; break;
+      case lir_cond_lessEqual:    acond = le; break;
+      case lir_cond_greaterEqual: acond = ge; break;
+      case lir_cond_greater:      acond = gt; break;
+      case lir_cond_aboveEqual:   acond = hs; break;
+      case lir_cond_belowEqual:   acond = ls; break;
+      default:                    ShouldNotReachHere();
+    }
+    __ b(ok, acond);  // condition holds: skip the failure code
+  }
+  if (op->halt()) {  // failure code: stop with the op's message, or emit a breakpoint
+    const char* str = __ code_string(op->msg());
+    __ stop(str);
+  } else {
+    breakpoint();
+  }
+  __ bind(ok);
+#endif // AARCH64
+}
+#endif // ASSERT
+
+void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) {  // Not supported in this port; 'op' is not referenced
+  fatal("CRC32 intrinsic is not implemented on this platform");
+}
+
+void LIR_Assembler::emit_lock(LIR_OpLock* op) {
+  Register obj = op->obj_opr()->as_pointer_register();
+  Register hdr = op->hdr_opr()->as_pointer_register();
+  Register lock = op->lock_opr()->as_pointer_register();
+  Register tmp = noreg;  // the scratch operand is optional; tmp stays noreg when it is illegal
+  if (!op->scratch_opr()->is_illegal()) tmp = op->scratch_opr()->as_pointer_register();
+  if (UseFastLocking) {
+    switch (op->code()) {
+      case lir_lock: {
+        assert(BasicLock::displaced_header_offset_in_bytes() == 0, "lock_reg must point to the displaced header");
+        const int null_check_offset = __ lock_object(hdr, obj, lock, tmp, *op->stub()->entry());
+        if (op->info() != NULL) add_debug_info_for_null_check(null_check_offset, op->info());
+        break;
+      }
+      case lir_unlock: __ unlock_object(hdr, obj, lock, tmp, *op->stub()->entry()); break;
+      default:         ShouldNotReachHere();
+    }
+  } else {
+    __ b(*op->stub()->entry());  // fast locking disabled: branch straight to the stub
+  }
+  __ bind(*op->stub()->continuation());
+}
+
+
+void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {  // Emits the MethodData counter (and receiver row) updates for a profiled call site
+  ciMethod* method = op->profiled_method();
+  int bci          = op->profiled_bci();
+  ciMethod* callee = op->profiled_callee();
+
+  // Update counter for all call types
+  ciMethodData* md = method->method_data_or_null();
+  assert(md != NULL, "Sanity");
+  ciProfileData* data = md->bci_to_data(bci);
+  assert(data->is_CounterData(), "need CounterData for calls");
+  assert(op->mdo()->is_single_cpu(),  "mdo must be allocated");
+  Register mdo  = op->mdo()->as_register();
+  assert(op->tmp1()->is_register(), "tmp1 must be allocated");
+  Register tmp1 = op->tmp1()->as_pointer_register();
+  assert_different_registers(mdo, tmp1);
+  __ mov_metadata(mdo, md->constant_encoding());
+  int mdo_offset_bias = 0;
+  int max_offset = AARCH64_ONLY(4096 << LogBytesPerWord) NOT_AARCH64(4096);  // threshold above which the mdo base register is biased
+  if (md->byte_offset_of_slot(data, CounterData::count_offset()) + data->size_in_bytes() >= max_offset) {
+    // The offset is large so bias the mdo by the base of the slot so
+    // that the ldr can use an immediate offset to reference the slots of the data
+    mdo_offset_bias = md->byte_offset_of_slot(data, CounterData::count_offset());
+    __ mov_slow(tmp1, mdo_offset_bias);
+    __ add(mdo, mdo, tmp1);
+  }
+
+  Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()) - mdo_offset_bias);  // every slot offset below is reduced by the same bias
+  Bytecodes::Code bc = method->java_code_at_bci(bci);
+  const bool callee_is_static = callee->is_loaded() && callee->is_static();
+  // Perform additional virtual call profiling for invokevirtual and
+  // invokeinterface bytecodes
+  if ((bc == Bytecodes::_invokevirtual || bc == Bytecodes::_invokeinterface) &&
+      !callee_is_static &&  // required for optimized MH invokes
+      C1ProfileVirtualCalls) {
+
+    assert(op->recv()->is_single_cpu(), "recv must be allocated");
+    Register recv = op->recv()->as_register();
+    assert_different_registers(mdo, tmp1, recv);
+    assert(data->is_VirtualCallData(), "need VirtualCallData for virtual calls");
+    ciKlass* known_klass = op->known_holder();
+    if (C1OptimizeVirtualCallProfiling && known_klass != NULL) {
+      // We know the type that will be seen at this call site; we can
+      // statically update the MethodData* rather than needing to do
+      // dynamic tests on the receiver type
+
+      // NOTE: we should probably put a lock around this search to
+      // avoid collisions by concurrent compilations
+      ciVirtualCallData* vc_data = (ciVirtualCallData*) data;
+      uint i;
+      for (i = 0; i < VirtualCallData::row_limit(); i++) {  // first pass: look for a row that already records known_klass
+        ciKlass* receiver = vc_data->receiver(i);
+        if (known_klass->equals(receiver)) {
+          Address data_addr(mdo, md->byte_offset_of_slot(data,
+                                                         VirtualCallData::receiver_count_offset(i)) -
+                            mdo_offset_bias);
+          __ ldr(tmp1, data_addr);
+          __ add(tmp1, tmp1, DataLayout::counter_increment);
+          __ str(tmp1, data_addr);
+          return;
+        }
+      }
+
+      // Receiver type not found in profile data; select an empty slot
+
+      // Note that this is less efficient than it should be because it
+      // always does a write to the receiver part of the
+      // VirtualCallData rather than just the first time
+      for (i = 0; i < VirtualCallData::row_limit(); i++) {  // second pass: use the first row whose receiver is NULL
+        ciKlass* receiver = vc_data->receiver(i);
+        if (receiver == NULL) {
+          Address recv_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i)) -
+                            mdo_offset_bias);
+          __ mov_metadata(tmp1, known_klass->constant_encoding());
+          __ str(tmp1, recv_addr);
+          Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)) -
+                            mdo_offset_bias);
+          __ ldr(tmp1, data_addr);
+          __ add(tmp1, tmp1, DataLayout::counter_increment);
+          __ str(tmp1, data_addr);
+          return;
+        }
+      }
+    } else {  // NOTE: if neither loop above returns, nothing more is emitted for the known-klass case
+      __ load_klass(recv, recv);  // recv is overwritten with the klass loaded from it
+      Label update_done;
+      type_profile_helper(mdo, mdo_offset_bias, md, data, recv, tmp1, &update_done);
+      // Receiver did not match any saved receiver and there is no empty row for it.
+      // Increment total counter to indicate polymorphic case.
+      __ ldr(tmp1, counter_addr);
+      __ add(tmp1, tmp1, DataLayout::counter_increment);
+      __ str(tmp1, counter_addr);
+
+      __ bind(update_done);
+    }
+  } else {
+    // Static call
+    __ ldr(tmp1, counter_addr);
+    __ add(tmp1, tmp1, DataLayout::counter_increment);
+    __ str(tmp1, counter_addr);
+  }
+}
+
+void LIR_Assembler::emit_profile_type(LIR_OpProfileType* op) {  // Not supported in this port; 'op' is not referenced
+  fatal("Type profiling not implemented on this platform");
+}
+
+void LIR_Assembler::emit_delay(LIR_OpDelay*) {  // No implementation in this port; the parameter is unnamed and unused
+  Unimplemented();
+}
+
+
+void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst) {
+  Address lock_slot = frame_map()->address_for_monitor_lock(monitor_no);  // frame address of the lock for monitor_no
+  __ add_slow(dst->as_pointer_register(), lock_slot.base(), lock_slot.disp());  // combine base and disp into dst
+}
+
+
+void LIR_Assembler::align_backward_branch_target() {  // Requests 8-byte alignment at the current code position
+  // TODO-AARCH64 review it
+  // Some ARM processors do better with 8-byte branch target alignment
+  __ align(8);
+}
+
+
+void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {  // Emits the negation of 'left' into 'dest', dispatching on the operand kind of 'left'
+
+  if (left->is_single_cpu()) {
+    assert (dest->type() == T_INT, "unexpected result type");
+    assert (left->type() == T_INT, "unexpected left type");
+    __ neg_32(dest->as_register(), left->as_register());
+  } else if (left->is_double_cpu()) {
+#ifdef AARCH64
+    __ neg(dest->as_register_lo(), left->as_register_lo());
+#else
+    Register dest_lo = dest->as_register_lo();
+    Register dest_hi = dest->as_register_hi();
+    Register src_lo = left->as_register_lo();
+    Register src_hi = left->as_register_hi();
+    if (dest_lo == src_hi) {  // writing dest_lo first would overwrite src_hi before rsc reads it
+      dest_lo = Rtemp;
+    }
+    __ rsbs(dest_lo, src_lo, 0);  // low word: 0 - src_lo, setting the flags used by rsc
+    __ rsc(dest_hi, src_hi, 0);   // high word: 0 - src_hi with carry
+    move_regs(dest_lo, dest->as_register_lo());  // moves the low word out of Rtemp if it was redirected above
+#endif // AARCH64
+  } else if (left->is_single_fpu()) {
+    __ neg_float(dest->as_float_reg(), left->as_float_reg());
+  } else if (left->is_double_fpu()) {
+    __ neg_double(dest->as_double_reg(), left->as_double_reg());
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+
+void LIR_Assembler::leal(LIR_Opr addr_opr, LIR_Opr dest) {
+  LIR_Address* addr = addr_opr->as_address_ptr();
+  if (!addr->index()->is_illegal()) {  // indexed form: base + (index << scale), displacement must be 0
+    assert(addr->disp() == 0, "cannot handle otherwise");
+#ifdef AARCH64
+    assert(addr->index()->is_double_cpu(), "should be");
+#endif // AARCH64
+    __ add(dest->as_pointer_register(), addr->base()->as_pointer_register(),
+           AsmOperand(addr->index()->as_pointer_register(), lsl, addr->scale()));
+    return;
+  }
+  jint disp = addr->disp();  // displacement form: base + disp, bailing out if disp is not an arithmetic immediate
+  if (!Assembler::is_arith_imm_in_range(disp)) {
+    BAILOUT("illegal arithmetic operand");
+  }
+  __ add(dest->as_pointer_register(), addr->base()->as_pointer_register(), disp);
+}
+
+
+void LIR_Assembler::rt_call(LIR_Opr result, address dest, const LIR_OprList* args, LIR_Opr tmp, CodeEmitInfo* info) {
+  // 'result' and 'args' are not referenced here; only the call to 'dest' is emitted.
+  assert(!tmp->is_valid(), "don't need temporary");
+  __ call(dest);
+  if (info == NULL) return;  // nothing to record for this call
+  add_call_info_here(info);
+}
+
+
+void LIR_Assembler::volatile_move_op(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info) {  // Moves a register pair to/from a simple address with one stmia/ldmia; 'type' is not referenced
+#ifdef AARCH64
+  Unimplemented(); // TODO-AARCH64: Use stlr/ldar instructions for volatile load/store
+#else
+  assert(src->is_double_cpu() && dest->is_address() ||
+         src->is_address() && dest->is_double_cpu(),
+         "Simple move_op is called for all other cases");
+
+  int null_check_offset;  // code offset of the memory-accessing instruction, reported below when 'info' is given
+  if (dest->is_address()) {
+    // Store
+    const LIR_Address* addr = dest->as_address_ptr();
+    const Register src_lo = src->as_register_lo();
+    const Register src_hi = src->as_register_hi();
+    assert(addr->index()->is_illegal() && addr->disp() == 0, "The address is simple already");
+
+    if (src_lo < src_hi) {  // pair already in ascending register order: store it directly
+      null_check_offset = __ offset();
+      __ stmia(addr->base()->as_register(), RegisterSet(src_lo) | RegisterSet(src_hi));
+    } else {
+      assert(src_lo < Rtemp, "Rtemp is higher than any allocatable register");
+      __ mov(Rtemp, src_hi);  // Rtemp stands in for src_hi so that the register set is ascending
+      null_check_offset = __ offset();  // taken after the mov, i.e. at the stmia
+      __ stmia(addr->base()->as_register(), RegisterSet(src_lo) | RegisterSet(Rtemp));
+    }
+  } else {
+    // Load
+    const LIR_Address* addr = src->as_address_ptr();
+    const Register dest_lo = dest->as_register_lo();
+    const Register dest_hi = dest->as_register_hi();
+    assert(addr->index()->is_illegal() && addr->disp() == 0, "The address is simple already");
+
+    null_check_offset = __ offset();  // the ldmia is the first instruction emitted on both paths below
+    if (dest_lo < dest_hi) {
+      __ ldmia(addr->base()->as_register(), RegisterSet(dest_lo) | RegisterSet(dest_hi));
+    } else {
+      assert(dest_lo < Rtemp, "Rtemp is higher than any allocatable register");
+      __ ldmia(addr->base()->as_register(), RegisterSet(dest_lo) | RegisterSet(Rtemp));
+      __ mov(dest_hi, Rtemp);  // high word was loaded into Rtemp; move it to its real destination
+    }
+  }
+
+  if (info != NULL) {
+    add_debug_info_for_null_check(null_check_offset, info);
+  }
+#endif // AARCH64
+}
+
+
+void LIR_Assembler::membar() {  // Full barrier request is emitted as a StoreLoad membar; Rtemp is passed as the second argument
+  __ membar(MacroAssembler::StoreLoad, Rtemp);
+}
+
+void LIR_Assembler::membar_acquire() {  // Acquire is emitted as a membar with the LoadLoad | LoadStore mask
+  __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::LoadLoad | MacroAssembler::LoadStore), Rtemp);
+}
+
+void LIR_Assembler::membar_release() {  // Release is emitted as a membar with the StoreStore | LoadStore mask
+  __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore | MacroAssembler::LoadStore), Rtemp);
+}
+
+void LIR_Assembler::membar_loadload() {  // Emits a membar with the LoadLoad mask
+  __ membar(MacroAssembler::LoadLoad, Rtemp);
+}
+
+void LIR_Assembler::membar_storestore() {  // Emits a membar with the StoreStore mask
+  __ membar(MacroAssembler::StoreStore, Rtemp);
+}
+
+void LIR_Assembler::membar_loadstore() {  // Emits a membar with the LoadStore mask
+  __ membar(MacroAssembler::LoadStore, Rtemp);
+}
+
+void LIR_Assembler::membar_storeload() {  // Emits a membar with the StoreLoad mask (the same call as membar())
+  __ membar(MacroAssembler::StoreLoad, Rtemp);
+}
+
+void LIR_Assembler::on_spin_wait() {  // No implementation in this port
+  Unimplemented();
+}
+
+void LIR_Assembler::get_thread(LIR_Opr result_reg) {  // 'result_reg' is not referenced
+  // Not used on ARM
+  Unimplemented();
+}
+
+void LIR_Assembler::peephole(LIR_List* lir) {
+#ifdef AARCH64
+  return; // TODO-AARCH64 implement peephole optimizations
+#endif
+  // For every lir_cmp that is immediately followed by a lir_cmove, replace:
+  //   cmp rX, y
+  //   cmove [EQ] y, z, rX
+  // with
+  //   cmp rX, y
+  //   cmove [EQ] illegalOpr, z, rX
+  //
+  // or
+  //   cmp rX, y
+  //   cmove [NE] z, y, rX
+  // with
+  //   cmp rX, y
+  //   cmove [NE] z, illegalOpr, rX
+  //
+  // moves from illegalOpr should be removed when converting LIR to native assembly
+  LIR_OpList* ops = lir->instructions_list();
+  const int n_ops = ops->length();
+  for (int idx = 0; idx < n_ops; idx++) {
+    LIR_Op* cur = ops->at(idx);
+    if (cur->code() != lir_cmp) {
+      continue;
+    }
+    LIR_Op2* cmp = cur->as_Op2();
+    assert(cmp != NULL, "cmp LIR instruction is not an op2");
+    if (idx + 1 >= n_ops) {
+      continue;  // nothing follows the compare
+    }
+
+    LIR_Op2* cmove = ops->at(idx + 1)->as_Op2();
+    if (cmove == NULL || cmove->code() != lir_cmove) {
+      continue;
+    }
+
+    // cmp_res: the compare operand that is also the cmove result; cmp_arg: the other compare operand.
+    LIR_Opr cmove_res = cmove->result_opr();
+    LIR_Opr cmp_res = LIR_OprFact::illegalOpr;
+    LIR_Opr cmp_arg = LIR_OprFact::illegalOpr;
+    if (cmove_res == cmp->in_opr1()) {
+      cmp_res = cmp->in_opr1();
+      cmp_arg = cmp->in_opr2();
+    } else if (cmove_res == cmp->in_opr2()) {
+      cmp_res = cmp->in_opr2();
+      cmp_arg = cmp->in_opr1();
+    }
+    if (cmp_res == LIR_OprFact::illegalOpr) {
+      continue;  // no usable match between the cmove result and the compare operands
+    }
+
+    // When the selected cmove input is cmp_arg, the result already holds an equal value: mark that input illegal.
+    switch (cmove->condition()) {
+      case lir_cond_equal:
+        if (cmove->in_opr1() == cmp_arg) cmove->set_in_opr1(LIR_OprFact::illegalOpr);
+        break;
+      case lir_cond_notEqual:
+        if (cmove->in_opr2() == cmp_arg) cmove->set_in_opr2(LIR_OprFact::illegalOpr);
+        break;
+      default:
+        break;
+    }
+  }
+}
+
+void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp) {
+  Register ptr = src->as_pointer_register();
+  // Emits an exclusive-load/exclusive-store retry loop on the memory at ptr for lir_xchg or lir_xadd; dest receives the loaded value.
+  if (code == lir_xchg) {
+#ifdef AARCH64
+    if (UseCompressedOops && data->is_oop()) {
+      __ encode_heap_oop(tmp->as_pointer_register(), data->as_register());  // tmp is what gets stored for a compressed-oop exchange
+    }
+#endif // AARCH64
+  } else {
+    assert (!data->is_oop(), "xadd for oops");
+  }
+
+#ifndef AARCH64
+  __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore | MacroAssembler::LoadStore), Rtemp);
+#endif // !AARCH64
+
+  Label retry;
+  __ bind(retry);
+  // 32-bit path: T_INT data, or an oop (on AArch64 only when compressed oops are in use).
+  if ((data->type() == T_INT) || (data->is_oop() AARCH64_ONLY(&& UseCompressedOops))) {
+    Register dst = dest->as_register();
+    Register new_val = noreg;
+#ifdef AARCH64
+    __ ldaxr_w(dst, ptr);
+#else
+    __ ldrex(dst, Address(ptr));
+#endif
+    if (code == lir_xadd) {
+      Register tmp_reg = tmp->as_register();
+      if (data->is_constant()) {
+        assert_different_registers(dst, ptr, tmp_reg);
+        __ add_32(tmp_reg, dst, data->as_constant_ptr()->as_jint());
+      } else {
+        assert_different_registers(dst, ptr, tmp_reg, data->as_register());
+        __ add_32(tmp_reg, dst, data->as_register());
+      }
+      new_val = tmp_reg;
+    } else {
+      if (UseCompressedOops && data->is_oop()) {
+        new_val = tmp->as_pointer_register();
+      } else {
+        new_val = data->as_register();
+      }
+      assert_different_registers(dst, ptr, new_val);
+    }
+#ifdef AARCH64
+    __ stlxr_w(Rtemp, new_val, ptr);
+#else
+    __ strex(Rtemp, new_val, Address(ptr));
+#endif // AARCH64
+
+#ifdef AARCH64
+  } else if ((data->type() == T_LONG) || (data->is_oop() && !UseCompressedOops)) {
+    Register dst = dest->as_pointer_register();
+    Register new_val = noreg;
+    __ ldaxr(dst, ptr);
+    if (code == lir_xadd) {
+      Register tmp_reg = tmp->as_pointer_register();
+      if (data->is_constant()) {
+        assert_different_registers(dst, ptr, tmp_reg);
+        jlong c = data->as_constant_ptr()->as_jlong();
+        assert((jlong)((jint)c) == c, "overflow");
+        __ add(tmp_reg, dst, (jint)c);
+      } else {
+        assert_different_registers(dst, ptr, tmp_reg, data->as_pointer_register());
+        __ add(tmp_reg, dst, data->as_pointer_register());
+      }
+      new_val = tmp_reg;
+    } else {
+      new_val = data->as_pointer_register();
+      assert_different_registers(dst, ptr, new_val);
+    }
+    __ stlxr(Rtemp, new_val, ptr);
+#else
+  } else if (data->type() == T_LONG) {
+    Register dst_lo = dest->as_register_lo();
+    Register new_val_lo = noreg;
+    Register dst_hi = dest->as_register_hi();
+
+    assert(dst_hi->encoding() == dst_lo->encoding() + 1, "non aligned register pair");
+    assert((dst_lo->encoding() & 0x1) == 0, "misaligned register pair");
+
+    __ bind(retry);  // NOTE(review): retry is already bound above and nothing is emitted in between on this path; this second bind looks redundant - confirm
+    __ ldrexd(dst_lo, Address(ptr));
+    if (code == lir_xadd) {
+      Register tmp_lo = tmp->as_register_lo();
+      Register tmp_hi = tmp->as_register_hi();
+
+      assert(tmp_hi->encoding() == tmp_lo->encoding() + 1, "non aligned register pair");
+      assert((tmp_lo->encoding() & 0x1) == 0, "misaligned register pair");
+
+      if (data->is_constant()) {
+        jlong c = data->as_constant_ptr()->as_jlong();
+        assert((jlong)((jint)c) == c, "overflow");
+        assert_different_registers(dst_lo, dst_hi, ptr, tmp_lo, tmp_hi);
+        __ adds(tmp_lo, dst_lo, (jint)c);
+        __ adc(tmp_hi, dst_hi, 0);  // NOTE(review): high word only gets the carry, i.e. the constant's high word is taken as 0; a negative c would need sign extension - confirm callers never pass one
+      } else {
+        Register new_val_lo = data->as_register_lo();
+        Register new_val_hi = data->as_register_hi();
+        __ adds(tmp_lo, dst_lo, new_val_lo);
+        __ adc(tmp_hi, dst_hi, new_val_hi);
+        assert_different_registers(dst_lo, dst_hi, ptr, tmp_lo, tmp_hi, new_val_lo, new_val_hi);
+      }
+      new_val_lo = tmp_lo;
+    } else {
+      new_val_lo = data->as_register_lo();
+      Register new_val_hi = data->as_register_hi();
+
+      assert_different_registers(dst_lo, dst_hi, ptr, new_val_lo, new_val_hi);
+      assert(new_val_hi->encoding() == new_val_lo->encoding() + 1, "non aligned register pair");
+      assert((new_val_lo->encoding() & 0x1) == 0, "misaligned register pair");
+    }
+    __ strexd(Rtemp, new_val_lo, Address(ptr));
+#endif // AARCH64
+  } else {
+    ShouldNotReachHere();
+  }
+  // Rtemp is the first operand of the store-exclusive emitted above; branch back to retry while it is non-zero.
+  __ cbnz_32(Rtemp, retry);
+  __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad | MacroAssembler::StoreStore), Rtemp);
+
+#ifdef AARCH64
+  if (UseCompressedOops && data->is_oop()) {
+    __ decode_heap_oop(dest->as_register());  // the value loaded into dest was a compressed oop; decode it in place
+  }
+#endif // AARCH64
+}
+
+int LIR_Assembler::exception_handler_size = -1;  // starts at -1; presumably a "not yet computed" marker assigned elsewhere - confirm
+
+#undef __
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_LIRAssembler_arm.hpp	2016-12-02 11:18:29.518175368 -0500
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_C1_LIRASSEMBLER_ARM_HPP
+#define CPU_ARM_VM_C1_LIRASSEMBLER_ARM_HPP
+// NOTE(review): the fragment opens with an access specifier, so it is presumably #included inside the LIR_Assembler class body - confirm.
+ private:
+
+  // Record the type of the receiver in ReceiverTypeData
+  void type_profile_helper(Register mdo, int mdo_offset_bias,
+                           ciMethodData *md, ciProfileData *data,
+                           Register recv, Register tmp1, Label* update_done);
+  // Setup pointers to MDO, MDO slot, also compute offset bias to access the slot.
+  void setup_md_access(ciMethod* method, int bci,
+                       ciMethodData*& md, ciProfileData*& data, int& mdo_offset_bias);
+  // NOTE(review): judging by names and parameters only, the two helpers below are stages of a profiled type check - see their definitions.
+  void typecheck_profile_helper1(ciMethod* method, int bci,
+                                 ciMethodData*& md, ciProfileData*& data, int& mdo_offset_bias,
+                                 Register obj, Register mdo, Register data_val, Label* obj_is_null);
+
+  void typecheck_profile_helper2(ciMethodData* md, ciProfileData* data, int mdo_offset_bias,
+                                 Register mdo, Register recv, Register value, Register tmp1,
+                                 Label* profile_cast_success, Label* profile_cast_failure,
+                                 Label* success, Label* failure);
+
+#ifdef AARCH64
+  void long_compare_helper(LIR_Opr opr1, LIR_Opr opr2);
+#endif // AARCH64
+
+  // Saves 4 given registers in reserved argument area.
+  void save_in_reserved_area(Register r1, Register r2, Register r3, Register r4);
+
+  // Restores 4 given registers from reserved argument area.
+  void restore_from_reserved_area(Register r1, Register r2, Register r3, Register r4);
+
+ public:
+  // Stub/handler sizes: 32 when AARCH64 is defined, 16 otherwise (units are not stated here; presumably bytes - confirm).
+  enum {
+    call_stub_size = AARCH64_ONLY(32) NOT_AARCH64(16),
+    deopt_handler_size = AARCH64_ONLY(32) NOT_AARCH64(16)
+  };
+  // Unlike the sizes above this one is a mutable static rather than an enum constant.
+  static int exception_handler_size;
+
+  void verify_reserved_argument_area_size(int args_count) PRODUCT_RETURN;
+  // Overloads taking a jint constant or a Metadata*; the second parameter is an offset from SP counted in words.
+  void store_parameter(jint c,      int offset_from_sp_in_words);
+  void store_parameter(Metadata* m, int offset_from_sp_in_words);
+
+#endif // CPU_ARM_VM_C1_LIRASSEMBLER_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_LIRGenerator_arm.cpp	2016-12-02 11:18:36.538573496 -0500
@@ -0,0 +1,1767 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "c1/c1_Compilation.hpp"
+#include "c1/c1_FrameMap.hpp"
+#include "c1/c1_Instruction.hpp"
+#include "c1/c1_LIRAssembler.hpp"
+#include "c1/c1_LIRGenerator.hpp"
+#include "c1/c1_Runtime1.hpp"
+#include "c1/c1_ValueStack.hpp"
+#include "ci/ciArray.hpp"
+#include "ci/ciObjArrayKlass.hpp"
+#include "ci/ciTypeArrayKlass.hpp"
+#include "gc/shared/cardTableModRefBS.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "vmreg_arm.inline.hpp"
+
+#ifdef ASSERT  // with ASSERT defined, each emit site passes its C++ file and line to lir()
+#define __ gen()->lir(__FILE__, __LINE__)->
+#else          // otherwise "__" is just shorthand for the generator's LIR list
+#define __ gen()->lir()->
+#endif
+
+void LIRItem::load_byte_item() {  // byte items get no special treatment in this port:
+  load_item();                     // the call is forwarded unchanged to load_item()
+}
+
+void LIRItem::load_nonconstant() {
+  // Values the generator can inline as constants stay in constant form;
+  // everything else goes through load_item().
+  LIR_Opr opr = value()->operand();
+  if (!_gen->can_inline_as_constant(value())) {
+    load_item();
+    return;
+  }
+  // No constant operand attached yet: build one from the value's type.
+  _result = opr->is_constant() ? opr : LIR_OprFact::value_type(value()->type());
+}
+
+//--------------------------------------------------------------
+//               LIRGenerator
+//--------------------------------------------------------------
+
+
+LIR_Opr LIRGenerator::exceptionOopOpr() {  // operand holding the exception oop
+  return FrameMap::Exception_oop_opr;      // fixed operand supplied by FrameMap
+}
+
+LIR_Opr LIRGenerator::exceptionPcOpr()  {  // operand holding the exception pc
+  return FrameMap::Exception_pc_opr;       // fixed operand supplied by FrameMap
+}
+
+LIR_Opr LIRGenerator::syncLockOpr()     {  // each call hands out a new register
+  return new_register(T_INT);              // of type T_INT
+}
+
+LIR_Opr LIRGenerator::syncTempOpr()     {  // each call hands out a new register
+  return new_register(T_OBJECT);           // of type T_OBJECT
+}
+
+LIR_Opr LIRGenerator::getThreadTemp()   {  // no temporary is provided by this port
+  return LIR_OprFact::illegalOpr;          // always the illegal operand
+}
+
+LIR_Opr LIRGenerator::atomicLockOpr() {  // no dedicated operand is provided by this port
+  return LIR_OprFact::illegalOpr;        // always the illegal operand
+}
+
+LIR_Opr LIRGenerator::result_register_for(ValueType* type, bool callee) {
+  // Picks the FrameMap result operand matching the tag of 'type'; the
+  // 'callee' flag is not consulted. Any other tag is a fatal error.
+  const ValueTag tag = type->tag();
+  LIR_Opr res;
+  if      (tag == intTag)    { res = FrameMap::Int_result_opr;    }
+  else if (tag == objectTag) { res = FrameMap::Object_result_opr; }
+  else if (tag == longTag)   { res = FrameMap::Long_result_opr;   }
+  else if (tag == floatTag)  { res = FrameMap::Float_result_opr;  }
+  else if (tag == doubleTag) { res = FrameMap::Double_result_opr; }
+  else { ShouldNotReachHere(); return LIR_OprFact::illegalOpr; }
+  assert(res->type_field() == as_OprType(as_BasicType(type)), "type mismatch");
+  return res;
+}
+
+
+LIR_Opr LIRGenerator::rlock_byte(BasicType type) {  // the 'type' argument is not consulted
+  return new_register(T_INT);                        // always a new T_INT register
+}
+
+
+//--------- loading items into registers --------------------------------
+
+
+bool LIRGenerator::can_store_as_constant(Value v, BasicType type) const {
+  // Only the AARCH64 build ever answers true, and only for constants that
+  // are zero or null. The 'type' argument is not consulted. Every other
+  // configuration (and every non-constant value) reports false.
+#ifdef AARCH64
+  ValueType* vt = v->type();
+  if (IntConstant* ic = vt->as_IntConstant())       return ic->value() == 0;
+  if (LongConstant* lc = vt->as_LongConstant())     return lc->value() == 0;
+  if (ObjectConstant* oc = vt->as_ObjectConstant()) return oc->value()->is_null_object();
+  // Floating-point constants are tested through jint_cast/jlong_cast, not by a floating-point compare.
+  if (FloatConstant* fc = vt->as_FloatConstant())   return jint_cast(fc->value()) == 0;
+  if (DoubleConstant* dc = vt->as_DoubleConstant()) return jlong_cast(dc->value()) == 0;
+#endif // AARCH64
+  return false;
+}
+
+
+
+bool LIRGenerator::can_inline_as_constant(Value v) const {
+  // Int constants must fit an arithmetic immediate; object constants must be null.
+  ValueType* vt = v->type();
+  if (IntConstant* ic = vt->as_IntConstant()) {
+    return Assembler::is_arith_imm_in_range(ic->value());
+  }
+  if (ObjectConstant* oc = vt->as_ObjectConstant()) {
+    return oc->value()->is_null_object();
+  }
+#ifdef AARCH64
+  if (LongConstant* lc = vt->as_LongConstant())     return Assembler::is_arith_imm_in_range(lc->value());
+#else
+  if (FloatConstant* fc = vt->as_FloatConstant())   return fc->value() == 0.0f;
+  if (DoubleConstant* dc = vt->as_DoubleConstant()) return dc->value() == 0.0;
+#endif // AARCH64
+  return false;
+}
+
+
+bool LIRGenerator::can_inline_as_constant(LIR_Const* c) const {  // LIR_Const overload; 'c' is never inspected
+  ShouldNotCallThis(); // Not used on ARM
+  return false;        // value returned after ShouldNotCallThis()
+}
+
+
+#ifdef AARCH64
+
+static bool can_inline_as_constant_in_cmp(Value v) {
+  // Integer constants qualify when either the value or its negation fits an
+  // arithmetic immediate; object constants only when null; float/double
+  // constants only when they compare equal to zero.
+  ValueType* vt = v->type();
+  jlong imm;
+  if (IntConstant* ic = vt->as_IntConstant()) {
+    imm = ic->value();
+  } else if (LongConstant* lc = vt->as_LongConstant()) {
+    imm = lc->value();
+  } else {
+    if (ObjectConstant* oc = vt->as_ObjectConstant()) return oc->value()->is_null_object();
+    if (FloatConstant* fc = vt->as_FloatConstant())   return fc->value() == 0.0f;
+    if (DoubleConstant* dc = vt->as_DoubleConstant()) return dc->value() == 0.0;
+    return false;
+  }
+  return Assembler::is_arith_imm_in_range(imm) || Assembler::is_arith_imm_in_range(-imm);
+}
+
+
+static bool can_inline_as_constant_in_logic(Value v) {  // true iff v is an int/long constant that LogicalImmediate can encode
+  ValueType* vt = v->type();
+  if (IntConstant* ic = vt->as_IntConstant()) {
+    return Assembler::LogicalImmediate(ic->value(), true).is_encoded();
+  }
+  LongConstant* lc = vt->as_LongConstant();
+  return lc != NULL && Assembler::LogicalImmediate(lc->value(), false).is_encoded();
+}
+
+
+#endif // AARCH64
+
+
+LIR_Opr LIRGenerator::safepoint_poll_register() {  // no poll register is provided by this port
+  return LIR_OprFact::illegalOpr;                  // always the illegal operand
+}
+
+
+static LIR_Opr make_constant(BasicType type, jlong c) {
+  // Wraps c in a constant operand of the kind matching 'type'.
+  if (type == T_ADDRESS || type == T_OBJECT) {
+    return LIR_OprFact::intptrConst(c);
+  }
+  if (type == T_LONG) { return LIR_OprFact::longConst(c); }
+  if (type == T_INT)  { return LIR_OprFact::intConst(c); }
+  ShouldNotReachHere(); // only the four types above are handled
+  return LIR_OprFact::intConst(-1);
+}
+
+#ifdef AARCH64
+
+void LIRGenerator::add_constant(LIR_Opr src, jlong c, LIR_Opr dest) {
+  // Emits dest = src + c. Zero is a plain move; magnitudes below 2^24 are
+  // applied as up to two 12-bit pieces (bits 0-11, then bits 12-23), one
+  // add/sub each; larger magnitudes are first materialized in dest.
+  if (c == 0) {
+    __ move(src, dest);
+    return;
+  }
+  const BasicType type = src->type();
+  const bool subtract = (c < 0);
+  c = ABS(c);
+
+  if ((c >> 24) != 0) {
+    __ move(make_constant(type, c), dest);
+    if (subtract) { __ sub(src, dest, dest); } else { __ add(src, dest, dest); }
+    return;
+  }
+
+  for (int shift = 0; shift <= 12; shift += 12) {
+    const int piece = ((int)c) & (right_n_bits(12) << shift);
+    if (piece == 0) {
+      continue;
+    }
+    if (subtract) {
+      __ sub(src, make_constant(type, piece), dest);
+    } else {
+      __ add(src, make_constant(type, piece), dest);
+    }
+    src = dest; // a second piece must build on the partial result
+  }
+}
+
+#endif // AARCH64
+
+
+void LIRGenerator::add_large_constant(LIR_Opr src, int c, LIR_Opr dest) {
+  assert(c != 0, "must be");
+#ifdef AARCH64
+  add_constant(src, c, dest);
+#else
+  // Emits dest = src + c as up to four adds of 8-bit-wide pieces of c. The bit arithmetic is unsigned
+  // because the 8-bit window can reach bit 31, and left-shifting a signed int that far is undefined.
+  const unsigned int bits = (unsigned int)c;
+  // Find first non-zero bit
+  int shift = 0;
+  while ((bits & (3u << shift)) == 0) {
+    shift += 2;
+  }
+  // Add the least significant part of the constant
+  const unsigned int mask = 0xffu << shift;
+  __ add(src, LIR_OprFact::intConst((jint)(bits & mask)), dest);
+  // Add up to 3 other parts of the constant;
+  // each of them can be represented as rotated_imm
+  for (int part = 8; part <= 24; part += 8) {
+    const unsigned int chunk = bits & (mask << part);
+    if (chunk != 0) {
+      __ add(dest, LIR_OprFact::intConst((jint)chunk), dest);
+    }
+  }
+#endif // AARCH64
+}
+
+static LIR_Address* make_address(LIR_Opr base, LIR_Opr index, LIR_Address::Scale scale, BasicType type) {  // base + scaled index
+  return new LIR_Address(base, index, scale, 0, type);  // the displacement argument is always 0
+}
+
+LIR_Address* LIRGenerator::generate_address(LIR_Opr base, LIR_Opr index,
+                                            int shift, int disp, BasicType type) {
+  assert(base->is_register(), "must be");
+  // Returns an address equal to base + (index << shift) + disp, emitting helper adds for whatever a 'type' access cannot encode directly.
+  if (index->is_constant()) {
+    disp += index->as_constant_ptr()->as_jint() << shift;  // fold a constant index into the displacement
+    index = LIR_OprFact::illegalOpr;
+  }
+
+#ifndef AARCH64
+  if (base->type() == T_LONG) {
+    LIR_Opr tmp = new_register(T_INT);
+    __ convert(Bytecodes::_l2i, base, tmp);
+    base = tmp;
+  }
+  if (index != LIR_OprFact::illegalOpr && index->type() == T_LONG) {
+    LIR_Opr tmp = new_register(T_INT);
+    __ convert(Bytecodes::_l2i, index, tmp);
+    index = tmp;
+  }
+  // At this point base and index should be all ints and not constants
+  assert(base->is_single_cpu() && !base->is_constant(), "base should be an non-constant int");
+  assert(index->is_illegal() || (index->type() == T_INT && !index->is_constant()), "index should be an non-constant int");
+#endif
+  // max_disp bounds the displacement kept in the final address; embedded_shift says whether a register index may be used directly.
+  int max_disp;
+  bool disp_is_in_range;
+  bool embedded_shift;
+
+#ifdef AARCH64
+  int align = exact_log2(type2aelembytes(type, true));
+  assert((disp & right_n_bits(align)) == 0, "displacement is not aligned");
+  assert(shift == 0 || shift == align, "shift should be zero or equal to embedded align");
+  max_disp = (1 << 12) << align;
+
+  if (disp >= 0) {
+    disp_is_in_range = Assembler::is_unsigned_imm_in_range(disp, 12, align);
+  } else {
+    disp_is_in_range = Assembler::is_imm_in_range(disp, 9, 0);
+  }
+
+  embedded_shift = true;
+#else
+  switch (type) {
+    case T_BYTE:
+    case T_SHORT:
+    case T_CHAR:
+      max_disp = 256;          // ldrh, ldrsb encoding has 8-bit offset
+      embedded_shift = false;
+      break;
+    case T_FLOAT:
+    case T_DOUBLE:
+      max_disp = 1024;         // flds, fldd have 8-bit offset multiplied by 4
+      embedded_shift = false;
+      break;
+    case T_LONG:
+      max_disp = 4096;
+      embedded_shift = false;
+      break;
+    default:
+      max_disp = 4096;         // ldr, ldrb allow 12-bit offset
+      embedded_shift = true;
+  }
+
+  disp_is_in_range = (-max_disp < disp && disp < max_disp);
+#endif // !AARCH64
+
+  if (index->is_register()) {
+    if (!disp_is_in_range) {
+      LIR_Opr tmp = new_pointer_register();  // temporary is allocated only when the displacement must be folded into the base
+      add_large_constant(base, disp, tmp);
+      base = tmp;
+      disp = 0;
+    }
+    LIR_Address* addr = make_address(base, index, (LIR_Address::Scale)shift, type);
+    if (disp == 0 && embedded_shift) {
+      // can use ldr/str instruction with register index
+      return addr;
+    } else {
+      LIR_Opr sum = new_pointer_register();
+      __ add(base, LIR_OprFact::address(addr), sum); // add with shifted/extended register
+      return new LIR_Address(sum, disp, type);
+    }
+  }
+
+  // If the displacement is too large to be inlined into LDR instruction,
+  // generate large constant with additional sequence of ADD instructions
+  int excess_disp = disp & ~(max_disp - 1);
+  if (excess_disp != 0) {
+    LIR_Opr tmp = new_pointer_register();
+    add_large_constant(base, excess_disp, tmp);
+    base = tmp;
+  }
+  return new LIR_Address(base, disp & (max_disp - 1), type);
+}
+
+
+LIR_Address* LIRGenerator::emit_array_address(LIR_Opr array_opr, LIR_Opr index_opr,
+                                              BasicType type, bool needs_card_mark) {
+  // Address of element index_opr of array_opr. When a card mark is needed the
+  // full element address is computed into a single register up front.
+  const int base_offset = arrayOopDesc::base_offset_in_bytes(type);
+  const int elem_size = type2aelembytes(type);
+
+  if (index_opr->is_constant()) {
+    const int offset = base_offset + index_opr->as_constant_ptr()->as_jint() * elem_size;
+    if (!needs_card_mark) {
+      return generate_address(array_opr, offset, type);
+    }
+    LIR_Opr elem_ptr = new_pointer_register();
+    add_large_constant(array_opr, offset, elem_ptr);
+    return new LIR_Address(elem_ptr, (intx)0, type);
+  }
+
+  assert(index_opr->is_register(), "must be");
+  const int scale = exact_log2(elem_size);
+  if (!needs_card_mark) {
+    return generate_address(array_opr, index_opr, scale, base_offset, type);
+  }
+  LIR_Opr elem_ptr = new_pointer_register();
+  LIR_Address* scaled_index = make_address(elem_ptr, index_opr, (LIR_Address::Scale)scale, type);
+  __ add(array_opr, LIR_OprFact::intptrConst(base_offset), elem_ptr);
+  __ add(elem_ptr, LIR_OprFact::address(scaled_index), elem_ptr); // add with shifted/extended register
+  return new LIR_Address(elem_ptr, type);
+}
+
+
+LIR_Opr LIRGenerator::load_immediate(int x, BasicType type) {
+  // Returns x as a constant operand when the encodability check below
+  // accepts it, otherwise as a new register that x has been moved into.
+  assert(type == T_LONG || type == T_INT, "should be");
+  LIR_Opr constant = make_constant(type, x);
+#ifdef AARCH64
+  const bool encodable = Assembler::LogicalImmediate(x, type == T_INT).is_encoded();
+#else
+  const bool encodable = AsmOperand::is_rotated_imm(x);
+#endif // AARCH64
+  if (encodable) { return constant; }
+  LIR_Opr reg = new_register(type);
+  __ move(constant, reg);
+  return reg;
+}
+
+
+void LIRGenerator::increment_counter(address counter, BasicType type, int step) {
+  LIR_Opr counter_ptr = new_pointer_register();
+  __ move(LIR_OprFact::intptrConst(counter), counter_ptr);
+  // With the counter's address in a register, reuse the LIR_Address overload.
+  increment_counter(new LIR_Address(counter_ptr, type), step);
+}
+
+
+void LIRGenerator::increment_counter(LIR_Address* addr, int step) {  // plain load / add / store sequence on addr
+  LIR_Opr current = new_register(addr->type());
+  __ move(addr, current);                                         // load the counter
+  __ add(current, make_constant(addr->type(), step), current);    // add the step
+  __ move(current, addr);                                         // store it back
+}
+
+
+void LIRGenerator::cmp_mem_int(LIR_Condition condition, LIR_Opr base, int disp, int c, CodeEmitInfo* info) {  // compares the T_INT at [base + disp] with c
+  __ load(new LIR_Address(base, disp, T_INT), FrameMap::LR_opr, info);  // the memory operand is loaded into LR first
+  __ cmp(condition, FrameMap::LR_opr, c);
+}
+
+
+void LIRGenerator::cmp_reg_mem(LIR_Condition condition, LIR_Opr reg, LIR_Opr base, int disp, BasicType type, CodeEmitInfo* info) {  // compares reg with the value at [base + disp]
+  __ load(new LIR_Address(base, disp, type), FrameMap::LR_opr, info);  // the memory operand is loaded into LR first
+  __ cmp(condition, reg, FrameMap::LR_opr);
+}
+
+
+bool LIRGenerator::strength_reduce_multiply(LIR_Opr left, int c, LIR_Opr result, LIR_Opr tmp) {
+  // Handles multipliers that are one below or one above a power of two with a
+  // shift combined with one sub/add. Returns false (emitting nothing) otherwise.
+  assert(left != result, "should be different registers");
+  if (is_power_of_2(c + 1)) {
+    const int k = log2_intptr(c + 1);
+#ifdef AARCH64
+    __ shift_left(left, k, result);
+    __ sub(result, left, result);
+#else
+    LIR_Address* shifted = new LIR_Address(left, left, (LIR_Address::Scale) k, 0, T_INT);
+    __ sub(LIR_OprFact::address(shifted), left, result); // rsb with shifted register
+#endif // AARCH64
+    return true;
+  }
+  if (!is_power_of_2(c - 1)) { return false; }
+  LIR_Address* shifted = new LIR_Address(left, left, (LIR_Address::Scale) log2_intptr(c - 1), 0, T_INT);
+  __ add(left, LIR_OprFact::address(shifted), result); // add with shifted register
+  return true;
+}
+
+
+void LIRGenerator::store_stack_parameter(LIR_Opr item, ByteSize offset_from_sp) {  // stores item at [SP + offset_from_sp]
+  assert(item->type() == T_INT, "other types are not expected");
+  __ store(item, new LIR_Address(FrameMap::SP_opr, in_bytes(offset_from_sp), item->type()));  // offset is converted to a byte count
+}
+
+void LIRGenerator::set_card(LIR_Opr value, LIR_Address* card_addr) {
+  assert(CardTableModRefBS::dirty_card_val() == 0,
+    "Cannot use ZR register (aarch64) or the register containing the card table base address directly (aarch32) otherwise");
+#ifdef AARCH64
+  // AARCH64 has a register that is constant zero. We can use that one to set the
+  // value in the card table to dirty.
+  __ move(FrameMap::ZR_opr, card_addr);
+#else // AARCH64
+  CardTableModRefBS* ct = (CardTableModRefBS*)_bs;
+  const bool low_byte_is_zero = ((intx)ct->byte_map_base & 0xff) == 0;
+  if (!low_byte_is_zero) {
+    // Base not 256-byte aligned: create a register holding the dirty value and store that.
+    LIR_Opr dirty = new_register(T_INT);
+    __ move(LIR_OprFact::intConst(CardTableModRefBS::dirty_card_val()), dirty);
+    __ move(dirty, card_addr);
+  } else {
+    // Base is 256-byte aligned, so the register that contains it can be stored directly.
+    __ move(value, card_addr);
+  }
+#endif // AARCH64
+}
+
+void LIRGenerator::CardTableModRef_post_barrier_helper(LIR_OprDesc* addr, LIR_Const* card_table_base) {
+  assert(addr->is_register(), "must be a register at this point");
+  // Emits the card store for a write to addr; LR serves as the temporary for the card table base throughout.
+  LIR_Opr tmp = FrameMap::LR_ptr_opr;
+
+  // TODO-AARCH64: check performance
+  bool load_card_table_base_const = AARCH64_ONLY(false) NOT_AARCH64(VM_Version::supports_movw());
+  if (load_card_table_base_const) {
+    __ move((LIR_Opr)card_table_base, tmp);  // base is moved in as a constant
+  } else {
+    __ move(new LIR_Address(FrameMap::Rthread_opr, in_bytes(JavaThread::card_table_base_offset()), T_ADDRESS), tmp);  // base is loaded via Rthread
+  }
+
+#ifdef AARCH64
+  LIR_Address* shifted_reg_operand = new LIR_Address(tmp, addr, (LIR_Address::Scale) -CardTableModRefBS::card_shift, 0, T_BYTE);
+  LIR_Opr tmp2 = tmp;
+  __ add(tmp, LIR_OprFact::address(shifted_reg_operand), tmp2); // tmp2 = tmp + (addr >> CardTableModRefBS::card_shift)
+  LIR_Address* card_addr = new LIR_Address(tmp2, T_BYTE);
+#else
+  // Use unsigned type T_BOOLEAN here rather than (signed) T_BYTE since signed load
+  // byte instruction does not support the addressing mode we need.
+  LIR_Address* card_addr = new LIR_Address(tmp, addr, (LIR_Address::Scale) -CardTableModRefBS::card_shift, 0, T_BOOLEAN);
+#endif
+  if (UseCondCardMark) {  // conditional marking: read the card and skip the store if it already holds the dirty value
+    if (UseConcMarkSweepGC) {
+      __ membar_storeload();
+    }
+    LIR_Opr cur_value = new_register(T_INT);
+    __ move(card_addr, cur_value);
+
+    LabelObj* L_already_dirty = new LabelObj();
+    __ cmp(lir_cond_equal, cur_value, LIR_OprFact::intConst(CardTableModRefBS::dirty_card_val()));
+    __ branch(lir_cond_equal, T_BYTE, L_already_dirty->label());
+    set_card(tmp, card_addr);
+    __ branch_destination(L_already_dirty->label());
+  } else {  // unconditional marking
+    if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
+      __ membar_storestore();
+    }
+    set_card(tmp, card_addr);
+  }
+}
+
+//----------------------------------------------------------------------
+//             visitor functions
+//----------------------------------------------------------------------
+
+
+void LIRGenerator::do_StoreIndexed(StoreIndexed* x) {
+  assert(x->is_pinned(),"");
+  bool needs_range_check = x->compute_needs_range_check();
+  bool use_length = x->length() != NULL;
+  bool obj_store = x->elt_type() == T_ARRAY || x->elt_type() == T_OBJECT;
+  bool needs_store_check = obj_store && (x->value()->as_Constant() == NULL ||
+                                         !get_jobject_constant(x->value())->is_null_object() ||
+                                         x->should_profile());
+  // Lowers an array store: operand loading, optional range and store checks, GC barriers around object stores, then the store.
+  LIRItem array(x->array(), this);
+  LIRItem index(x->index(), this);
+  LIRItem value(x->value(), this);
+  LIRItem length(this);
+
+  array.load_item();
+  index.load_nonconstant();
+  // The length item is only populated when it will actually be used by the range check below.
+  if (use_length && needs_range_check) {
+    length.set_instruction(x->length());
+    length.load_item();
+  }
+  if (needs_store_check || x->check_boolean()) {
+    value.load_item();
+  } else {
+    value.load_for_store(x->elt_type());
+  }
+
+  set_no_result(x);
+
+  // the CodeEmitInfo must be duplicated for each different
+  // LIR-instruction because spilling can occur anywhere between two
+  // instructions and so the debug information must be different
+  CodeEmitInfo* range_check_info = state_for(x);
+  CodeEmitInfo* null_check_info = NULL;
+  if (x->needs_null_check()) {
+    null_check_info = new CodeEmitInfo(range_check_info);
+  }
+
+  // emit array address setup early so it schedules better
+  LIR_Address* array_addr = emit_array_address(array.result(), index.result(), x->elt_type(), obj_store);
+
+  if (GenerateRangeChecks && needs_range_check) {
+    if (use_length) {
+      __ cmp(lir_cond_belowEqual, length.result(), index.result());
+      __ branch(lir_cond_belowEqual, T_INT, new RangeCheckStub(range_check_info, index.result()));
+    } else {
+      array_range_check(array.result(), index.result(), null_check_info, range_check_info);
+      // range_check also does the null check
+      null_check_info = NULL;
+    }
+  }
+  // The store check uses the fixed R0/R1 oop operands as its two temporaries.
+  if (GenerateArrayStoreCheck && needs_store_check) {
+    LIR_Opr tmp1 = FrameMap::R0_oop_opr;
+    LIR_Opr tmp2 = FrameMap::R1_oop_opr;
+    CodeEmitInfo* store_check_info = new CodeEmitInfo(range_check_info);
+    __ store_check(value.result(), array.result(), tmp1, tmp2,
+                   LIR_OprFact::illegalOpr, store_check_info,
+                   x->profiled_method(), x->profiled_bci());
+  }
+
+#if INCLUDE_ALL_GCS
+  if (obj_store) {
+    // Needs GC write barriers.
+    pre_barrier(LIR_OprFact::address(array_addr), LIR_OprFact::illegalOpr /* pre_val */,
+                true /* do_load */, false /* patch */, NULL);
+  }
+#endif // INCLUDE_ALL_GCS
+  // The value actually stored is whatever maybe_mask_boolean() returns for this store.
+  LIR_Opr result = maybe_mask_boolean(x, array.result(), value.result(), null_check_info);
+  __ move(result, array_addr, null_check_info);
+  if (obj_store) {
+    post_barrier(LIR_OprFact::address(array_addr), value.result());
+  }
+}
+
+
+void LIRGenerator::do_MonitorEnter(MonitorEnter* x) {
+  // Lowers a MonitorEnter node: loads the object, sets up the lock/header
+  // temporaries and the debug info, then delegates to monitor_enter().
+  assert(x->is_pinned(),"");
+  LIRItem locked_obj(x->obj(), this);
+  locked_obj.load_item();
+  set_no_result(x);
+
+  LIR_Opr lock_reg   = new_pointer_register();
+  LIR_Opr header_reg = new_pointer_register();
+
+  // Need a scratch register for biased locking on arm
+  // (without biased locking the operand comes from atomicLockOpr()).
+  LIR_Opr scratch_reg = UseBiasedLocking ? new_pointer_register() : atomicLockOpr();
+
+  // Exception info is only built when x needs a null check;
+  // otherwise NULL is passed on.
+  CodeEmitInfo* null_check_info = x->needs_null_check() ? state_for(x) : NULL;
+
+  // Info for the monitor-enter itself, built from x's own state
+  // with 'true' as the third state_for argument.
+  CodeEmitInfo* enter_info = state_for(x, x->state(), true);
+
+  monitor_enter(locked_obj.result(), lock_reg, header_reg, scratch_reg,
+                x->monitor_no(), null_check_info, enter_info);
+}
+
+
+void LIRGenerator::do_MonitorExit(MonitorExit* x) {
+  // Lowers a MonitorExit node; the object item is deliberately not loaded,
+  // monitor_exit() is handed three new pointer registers instead.
+  assert(x->is_pinned(),"");
+  LIRItem unlocked_obj(x->obj(), this);
+  unlocked_obj.dont_load_item();
+  set_no_result(x);
+  LIR_Opr obj_reg    = new_pointer_register();
+  LIR_Opr lock_reg   = new_pointer_register();
+  LIR_Opr header_reg = new_pointer_register();
+  monitor_exit(obj_reg, lock_reg, header_reg, atomicLockOpr(), x->monitor_no());
+}
+
+
+// _ineg, _lneg, _fneg, _dneg
+void LIRGenerator::do_NegateOp(NegateOp* x) {
+#ifdef __SOFTFP__
+  // Soft-float builds negate float/double through a SharedRuntime call;
+  // every other type falls through to the inline negate below.
+  const ValueTag tag = x->type()->tag();
+  if (tag == floatTag || tag == doubleTag) {
+    address runtime_func = (tag == floatTag) ? CAST_FROM_FN_PTR(address, SharedRuntime::fneg)
+                                             : CAST_FROM_FN_PTR(address, SharedRuntime::dneg);
+    set_result(x, call_runtime(x->x(), runtime_func, x->type(), NULL));
+    return;
+  }
+#endif // __SOFTFP__
+
+  // Inline path: load the operand and negate it into the result register.
+  LIRItem operand(x->x(), this);
+  operand.load_item();
+  LIR_Opr dst = rlock_result(x);
+  __ negate(operand.result(), dst);
+}
+
+// Emits LIR for float/double arithmetic: a runtime call where a helper is selected, inline FPU code otherwise.
+// for  _fadd, _fmul, _fsub, _fdiv, _frem
+//      _dadd, _dmul, _dsub, _ddiv, _drem
+void LIRGenerator::do_ArithmeticOp_FPU(ArithmeticOp* x) {
+  address runtime_func;
+  switch (x->op()) {
+    case Bytecodes::_frem:  // remainder is a SharedRuntime call with and without __SOFTFP__
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::frem);
+      break;
+    case Bytecodes::_drem:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::drem);
+      break;
+#ifdef __SOFTFP__
+    // Call function compiled with -msoft-float.
+
+    // __aeabi_XXXX_glibc: code imported from the glibc soft-fp bundle to improve calculation accuracy. See CR 6757269.
+
+    case Bytecodes::_fadd:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_fadd_glibc);
+      break;
+    case Bytecodes::_fmul:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_fmul);
+      break;
+    case Bytecodes::_fsub:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_fsub_glibc);
+      break;
+    case Bytecodes::_fdiv:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_fdiv);
+      break;
+    case Bytecodes::_dadd:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_dadd_glibc);
+      break;
+    case Bytecodes::_dmul:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_dmul);
+      break;
+    case Bytecodes::_dsub:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_dsub_glibc);
+      break;
+    case Bytecodes::_ddiv:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_ddiv);
+      break;
+    default:
+      ShouldNotReachHere();
+#else // __SOFTFP__
+    default: {  // without __SOFTFP__: every op other than frem/drem is emitted inline
+      LIRItem left(x->x(), this);
+      LIRItem right(x->y(), this);
+      left.load_item();
+      right.load_item();
+      rlock_result(x);
+      arithmetic_op_fpu(x->op(), x->operand(), left.result(), right.result(), x->is_strictfp());
+      return;
+    }
+#endif // __SOFTFP__
+  }
+
+  LIR_Opr result = call_runtime(x->x(), x->y(), runtime_func, x->type(), NULL);  // helper selected by the switch above
+  set_result(x, result);
+}
+
+// Emits a compare of right_arg with constant 0 and a branch to a new DivByZeroStub taken when they are equal.
+void LIRGenerator::make_div_by_zero_check(LIR_Opr right_arg, BasicType type, CodeEmitInfo* info) {
+  assert(right_arg->is_register(), "must be");
+  __ cmp(lir_cond_equal, right_arg, make_constant(type, 0));
+  __ branch(lir_cond_equal, type, new DivByZeroStub(info));
+}
+
+// Emits LIR for long arithmetic: inline on AARCH64; otherwise mul/div/rem call SharedRuntime, add/sub are inline.
+// for  _ladd, _lmul, _lsub, _ldiv, _lrem
+void LIRGenerator::do_ArithmeticOp_Long(ArithmeticOp* x) {
+  CodeEmitInfo* info = NULL;
+  if (x->op() == Bytecodes::_ldiv || x->op() == Bytecodes::_lrem) {
+    info = state_for(x);  // only div/rem need debug info (used by the divide-by-zero check)
+  }
+
+#ifdef AARCH64
+  LIRItem left(x->x(), this);
+  LIRItem right(x->y(), this);
+  LIRItem* left_arg = &left;
+  LIRItem* right_arg = &right;
+
+  // Test if instr is commutative and if we should swap
+  if (x->is_commutative() && left.is_constant()) {
+    left_arg = &right;
+    right_arg = &left;
+  }
+
+  left_arg->load_item();
+  switch (x->op()) {
+    case Bytecodes::_ldiv:
+      right_arg->load_item();
+      make_div_by_zero_check(right_arg->result(), T_LONG, info);
+      __ idiv(left_arg->result(), right_arg->result(), rlock_result(x), LIR_OprFact::illegalOpr, NULL);
+      break;
+
+    case Bytecodes::_lrem: {
+      right_arg->load_item();
+      make_div_by_zero_check(right_arg->result(), T_LONG, info);
+      // a % b is implemented with 2 instructions:
+      // tmp = a/b       (sdiv)
+      // res = a - b*tmp (msub)
+      LIR_Opr tmp = FrameMap::as_long_opr(Rtemp);
+      __ irem(left_arg->result(), right_arg->result(), rlock_result(x), tmp, NULL);
+      break;
+    }
+
+    case Bytecodes::_lmul:
+      if (right_arg->is_constant() && is_power_of_2_long(right_arg->get_jlong_constant())) {
+        right_arg->dont_load_item();  // multiply by a power of two becomes a left shift by its log2
+        __ shift_left(left_arg->result(), exact_log2_long(right_arg->get_jlong_constant()), rlock_result(x));
+      } else {
+        right_arg->load_item();
+        __ mul(left_arg->result(), right_arg->result(), rlock_result(x));
+      }
+      break;
+
+    case Bytecodes::_ladd:
+    case Bytecodes::_lsub:
+      if (right_arg->is_constant()) {
+        jlong c = right_arg->get_jlong_constant();  // subtraction of a constant is emitted as addition of -c
+        add_constant(left_arg->result(), (x->op() == Bytecodes::_ladd) ? c : -c, rlock_result(x));
+      } else {
+        right_arg->load_item();
+        arithmetic_op_long(x->op(), rlock_result(x), left_arg->result(), right_arg->result(), NULL);
+      }
+      break;
+
+    default:
+      ShouldNotReachHere();
+  }
+#else
+  switch (x->op()) {
+    case Bytecodes::_ldiv:
+    case Bytecodes::_lrem: {
+      LIRItem right(x->y(), this);  // scoped item used only to emit the divide-by-zero check
+      right.load_item();
+      make_div_by_zero_check(right.result(), T_LONG, info);
+    }
+    // Fall through
+    case Bytecodes::_lmul: {
+      address entry;
+      switch (x->op()) {
+      case Bytecodes::_lrem:
+        entry = CAST_FROM_FN_PTR(address, SharedRuntime::lrem);
+        break;
+      case Bytecodes::_ldiv:
+        entry = CAST_FROM_FN_PTR(address, SharedRuntime::ldiv);
+        break;
+      case Bytecodes::_lmul:
+        entry = CAST_FROM_FN_PTR(address, SharedRuntime::lmul);
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+      LIR_Opr result = call_runtime(x->y(), x->x(), entry, x->type(), NULL);  // NOTE(review): y passed before x, presumably the helpers' parameter order - confirm
+      set_result(x, result);
+      break;
+    }
+    case Bytecodes::_ladd:
+    case Bytecodes::_lsub: {
+      LIRItem left(x->x(), this);
+      LIRItem right(x->y(), this);
+      left.load_item();
+      right.load_item();
+      rlock_result(x);
+      arithmetic_op_long(x->op(), x->operand(), left.result(), right.result(), NULL);
+      break;
+    }
+    default:
+      ShouldNotReachHere();
+  }
+#endif // AARCH64
+}
+
+// Emits LIR for int arithmetic, with special cases for div/rem and for constant right-hand operands.
+// for: _iadd, _imul, _isub, _idiv, _irem
+void LIRGenerator::do_ArithmeticOp_Int(ArithmeticOp* x) {
+  bool is_div_rem = x->op() == Bytecodes::_idiv || x->op() == Bytecodes::_irem;
+  LIRItem left(x->x(), this);
+  LIRItem right(x->y(), this);
+  LIRItem* left_arg = &left;
+  LIRItem* right_arg = &right;
+
+  // Test if instr is commutative and if we should swap
+  if (x->is_commutative() && left.is_constant()) {
+    left_arg = &right;
+    right_arg = &left;
+  }
+
+  if (is_div_rem) {
+    CodeEmitInfo* info = state_for(x);
+    if (x->op() == Bytecodes::_idiv && right_arg->is_constant() && is_power_of_2(right_arg->get_jint_constant())) {
+      left_arg->load_item();  // division by a power-of-two constant: the divisor stays a constant operand
+      right_arg->dont_load_item();
+      LIR_Opr tmp = LIR_OprFact::illegalOpr;
+      LIR_Opr result = rlock_result(x);
+      __ idiv(left_arg->result(), right_arg->result(), result, tmp, info);
+    } else {
+#ifdef AARCH64
+      left_arg->load_item();
+      right_arg->load_item();
+      make_div_by_zero_check(right_arg->result(), T_INT, info);  // explicit check; idiv/irem below get NULL info
+      if (x->op() == Bytecodes::_idiv) {
+        __ idiv(left_arg->result(), right_arg->result(), rlock_result(x), LIR_OprFact::illegalOpr, NULL);
+      } else {
+        // a % b is implemented with 2 instructions:
+        // tmp = a/b       (sdiv)
+        // res = a - b*tmp (msub)
+        LIR_Opr tmp = FrameMap::as_opr(Rtemp);
+        __ irem(left_arg->result(), right_arg->result(), rlock_result(x), tmp, NULL);
+      }
+#else
+      left_arg->load_item_force(FrameMap::R0_opr);  // operands are forced into R0/R2 with R1 as temp
+      right_arg->load_item_force(FrameMap::R2_opr);
+      LIR_Opr tmp = FrameMap::R1_opr;
+      LIR_Opr result = rlock_result(x);
+      LIR_Opr out_reg;  // assigned in one of the two branches below (is_div_rem guarantees irem or idiv)
+      if (x->op() == Bytecodes::_irem) {
+        out_reg = FrameMap::R0_opr;  // remainder is produced in R0
+        __ irem(left_arg->result(), right_arg->result(), out_reg, tmp, info);
+      } else if (x->op() == Bytecodes::_idiv) {
+        out_reg = FrameMap::R1_opr;  // quotient is produced in R1
+        __ idiv(left_arg->result(), right_arg->result(), out_reg, tmp, info);
+      }
+      __ move(out_reg, result);
+#endif // AARCH64
+    }
+
+#ifdef AARCH64
+  } else if (((x->op() == Bytecodes::_iadd) || (x->op() == Bytecodes::_isub)) && right_arg->is_constant()) {
+    left_arg->load_item();
+    jint c = right_arg->get_jint_constant();  // subtraction of a constant is emitted as addition of -c
+    right_arg->dont_load_item();
+    add_constant(left_arg->result(), (x->op() == Bytecodes::_iadd) ? c : -c, rlock_result(x));
+#endif // AARCH64
+
+  } else {
+    left_arg->load_item();
+    if (x->op() == Bytecodes::_imul && right_arg->is_constant()) {
+      int c = right_arg->get_jint_constant();  // keep positive constants of the form 2^n, 2^n+1 or 2^n-1 unloaded
+      if (c > 0 && (is_power_of_2(c) || is_power_of_2(c - 1) || is_power_of_2(c + 1))) {
+        right_arg->dont_load_item();
+      } else {
+        right_arg->load_item();
+      }
+    } else {
+      AARCH64_ONLY(assert(!right_arg->is_constant(), "constant right_arg is already handled by this moment");)
+      right_arg->load_nonconstant();
+    }
+    rlock_result(x);
+    assert(right_arg->is_constant() || right_arg->is_register(), "wrong state of right");
+    arithmetic_op_int(x->op(), x->operand(), left_arg->result(), right_arg->result(), NULL);
+  }
+}
+
+// Dispatches an ArithmeticOp to the FPU, long or int generator according to the tag of its result type.
+void LIRGenerator::do_ArithmeticOp(ArithmeticOp* x) {
+  ValueTag tag = x->type()->tag();
+  assert(x->x()->type()->tag() == tag && x->y()->type()->tag() == tag, "wrong parameters");
+  switch (tag) {
+    case floatTag:  // float shares the FPU path with double
+    case doubleTag:  do_ArithmeticOp_FPU(x);  return;
+    case longTag:    do_ArithmeticOp_Long(x); return;
+    case intTag:     do_ArithmeticOp_Int(x);  return;
+  }
+  ShouldNotReachHere();  // any other tag is unexpected
+}
+
+// Emits LIR for a shift; a constant count stays a constant operand, any other count is loaded into a register.
+// _ishl, _lshl, _ishr, _lshr, _iushr, _lushr
+void LIRGenerator::do_ShiftOp(ShiftOp* x) {
+  LIRItem value(x->x(), this);
+  LIRItem count(x->y(), this);
+
+#ifndef AARCH64
+  if (value.type()->is_long()) {
+    count.set_destroys_register();  // non-AARCH64 long shifts: the count item is marked as destroying its register
+  }
+#endif // !AARCH64
+
+  if (count.is_constant()) {
+    assert(count.type()->as_IntConstant() != NULL, "should be");
+    count.dont_load_item();
+  } else {
+    count.load_item();
+  }
+  value.load_item();
+
+  LIR_Opr res = rlock_result(x);
+  shift_op(x->op(), res, value.result(), count.result(), LIR_OprFact::illegalOpr);
+}
+
+// Emits LIR for a bitwise and/or/xor; the left operand is always loaded, the right one may stay a constant.
+// _iand, _land, _ior, _lor, _ixor, _lxor
+void LIRGenerator::do_LogicOp(LogicOp* x) {
+  LIRItem left(x->x(), this);
+  LIRItem right(x->y(), this);
+
+  left.load_item();
+
+#ifdef AARCH64
+  if (right.is_constant() && can_inline_as_constant_in_logic(right.value())) {
+    right.dont_load_item();  // constant accepted by can_inline_as_constant_in_logic() is used directly
+  } else {
+    right.load_item();
+  }
+#else
+  right.load_nonconstant();
+#endif // AARCH64
+
+  logic_op(x->op(), rlock_result(x), left.result(), right.result());
+}
+
+// Emits LIR for a long or floating-point compare; under __SOFTFP__ the floating-point cases are runtime calls.
+// _lcmp, _fcmpl, _fcmpg, _dcmpl, _dcmpg
+void LIRGenerator::do_CompareOp(CompareOp* x) {
+#ifdef __SOFTFP__
+  address runtime_func;
+  switch (x->op()) {
+    case Bytecodes::_fcmpl:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::fcmpl);
+      break;
+    case Bytecodes::_fcmpg:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::fcmpg);
+      break;
+    case Bytecodes::_dcmpl:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dcmpl);
+      break;
+    case Bytecodes::_dcmpg:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dcmpg);
+      break;
+    case Bytecodes::_lcmp: {  // the long compare is emitted inline even under __SOFTFP__
+        LIRItem left(x->x(), this);
+        LIRItem right(x->y(), this);
+        left.load_item();
+        right.load_nonconstant();
+        LIR_Opr reg = rlock_result(x);
+         __ lcmp2int(left.result(), right.result(), reg);
+        return;
+      }
+    default:
+      ShouldNotReachHere();
+  }
+  LIR_Opr result = call_runtime(x->x(), x->y(), runtime_func, x->type(), NULL);  // floating-point cases only
+  set_result(x, result);
+#else // __SOFTFP__
+  LIRItem left(x->x(), this);
+  LIRItem right(x->y(), this);
+  left.load_item();
+
+#ifdef AARCH64
+  if (right.is_constant() && can_inline_as_constant_in_cmp(right.value())) {
+    right.dont_load_item();
+  } else {
+    right.load_item();
+  }
+#else
+  right.load_nonconstant();
+#endif // AARCH64
+
+  LIR_Opr reg = rlock_result(x);
+
+  if (x->x()->type()->is_float_kind()) {
+    Bytecodes::Code code = x->op();  // last fcmp2int argument is true for the "l" variants (_fcmpl/_dcmpl)
+    __ fcmp2int(left.result(), right.result(), reg, (code == Bytecodes::_fcmpl || code == Bytecodes::_dcmpl));
+  } else if (x->x()->type()->tag() == longTag) {
+    __ lcmp2int(left.result(), right.result(), reg);
+  } else {
+    ShouldNotReachHere();
+  }
+#endif // __SOFTFP__
+}
+
+// Emits LIR for a compare-and-swap intrinsic on (object + offset) for object, int or long values.
+void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
+  assert(x->number_of_arguments() == 4, "wrong type");
+  LIRItem obj   (x->argument_at(0), this);  // object
+  LIRItem offset(x->argument_at(1), this);  // offset of field
+  LIRItem cmp   (x->argument_at(2), this);  // value to compare with field
+  LIRItem val   (x->argument_at(3), this);  // replace field with val if matches cmp
+
+  LIR_Opr addr = new_pointer_register();
+  LIR_Opr tmp1 = LIR_OprFact::illegalOpr;
+  LIR_Opr tmp2 = LIR_OprFact::illegalOpr;
+
+  // get address of field
+  obj.load_item();
+  offset.load_item();
+  cmp.load_item();
+  val.load_item();
+
+  __ add(obj.result(), offset.result(), addr);
+  LIR_Opr result = rlock_result(x);
+
+  if (type == objectType) {
+#if INCLUDE_ALL_GCS
+    // Do the pre-write barrier, if any.
+    pre_barrier(addr, LIR_OprFact::illegalOpr /* pre_val */,
+                true /* do_load */, false /* patch */, NULL);
+#endif // INCLUDE_ALL_GCS
+#ifdef AARCH64
+    if (UseCompressedOops) {
+      tmp1 = new_pointer_register();  // temporaries are only allocated when compressed oops are in use
+      tmp2 = new_pointer_register();
+    }
+#endif // AARCH64
+    __ cas_obj(addr, cmp.result(), val.result(), tmp1, tmp2, result);
+    post_barrier(addr, val.result());
+  }
+  else if (type == intType) {
+    __ cas_int(addr, cmp.result(), val.result(), tmp1, tmp1, result);  // tmp1 passed twice; tmp1 and tmp2 are both illegalOpr here
+  }
+  else if (type == longType) {
+#ifndef AARCH64
+    tmp1 = new_register(T_LONG);  // non-AARCH64 only: a long temporary for cas_long
+#endif // !AARCH64
+    __ cas_long(addr, cmp.result(), val.result(), tmp1, tmp2, result);
+  }
+  else {
+    ShouldNotReachHere();
+  }
+}
+
+// Emits LIR for math intrinsics: abs/sqrt inline unless __SOFTFP__, everything else as a SharedRuntime call.
+void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
+  address runtime_func;
+  switch (x->id()) {
+    case vmIntrinsics::_dabs: {
+#ifdef __SOFTFP__
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dabs);
+      break;
+#else
+      assert(x->number_of_arguments() == 1, "wrong type");
+      LIRItem value(x->argument_at(0), this);
+      value.load_item();
+      __ abs(value.result(), rlock_result(x), LIR_OprFact::illegalOpr);
+      return;  // handled inline, no runtime call
+#endif // __SOFTFP__
+    }
+    case vmIntrinsics::_dsqrt: {
+#ifdef __SOFTFP__
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dsqrt);
+      break;
+#else
+      assert(x->number_of_arguments() == 1, "wrong type");
+      LIRItem value(x->argument_at(0), this);
+      value.load_item();
+      __ sqrt(value.result(), rlock_result(x), LIR_OprFact::illegalOpr);
+      return;  // handled inline, no runtime call
+#endif // __SOFTFP__
+    }
+    case vmIntrinsics::_dsin:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dsin);
+      break;
+    case vmIntrinsics::_dcos:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dcos);
+      break;
+    case vmIntrinsics::_dtan:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dtan);
+      break;
+    case vmIntrinsics::_dlog:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dlog);
+      break;
+    case vmIntrinsics::_dlog10:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10);
+      break;
+    case vmIntrinsics::_dexp:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dexp);
+      break;
+    case vmIntrinsics::_dpow:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::dpow);
+      break;
+    default:
+      ShouldNotReachHere();
+      return;
+  }
+
+  LIR_Opr result;
+  if (x->number_of_arguments() == 1) {
+    result = call_runtime(x->argument_at(0), runtime_func, x->type(), NULL);
+  } else {
+    assert(x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow, "unexpected intrinsic");
+    result = call_runtime(x->argument_at(0), x->argument_at(1), runtime_func, x->type(), NULL);  // dpow takes two arguments
+  }
+  set_result(x, result);
+}
+
+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
+  fatal("FMA intrinsic is not implemented on this platform");  // no LIR is generated; reaching this is a fatal error
+}
+
+void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
+  fatal("vectorizedMismatch intrinsic is not implemented on this platform");  // no LIR is generated; reaching this is a fatal error
+}
+
+void LIRGenerator::do_ArrayCopy(Intrinsic* x) {  // emits the arraycopy LIR op for the 5-argument intrinsic
+  CodeEmitInfo* info = state_for(x, x->state());
+  assert(x->number_of_arguments() == 5, "wrong type");
+  LIRItem src(x->argument_at(0), this);
+  LIRItem src_pos(x->argument_at(1), this);
+  LIRItem dst(x->argument_at(2), this);
+  LIRItem dst_pos(x->argument_at(3), this);
+  LIRItem length(x->argument_at(4), this);
+
+  // We put arguments into the same registers which are used for a Java call.
+  // Note: we use fixed registers for all arguments because all registers
+  // are caller-saved, so the register allocator treats them all as used.
+  src.load_item_force    (FrameMap::R0_oop_opr);
+  src_pos.load_item_force(FrameMap::R1_opr);
+  dst.load_item_force    (FrameMap::R2_oop_opr);
+  dst_pos.load_item_force(FrameMap::R3_opr);
+  length.load_item_force (FrameMap::R4_opr);
+  LIR_Opr tmp =          (FrameMap::R5_opr);
+  set_no_result(x);
+
+  int flags;  // flags and expected_type are filled in by arraycopy_helper() below
+  ciArrayKlass* expected_type;
+  arraycopy_helper(x, &flags, &expected_type);
+  __ arraycopy(src.result(), src_pos.result(), dst.result(), dst_pos.result(), length.result(),
+               tmp, expected_type, flags, info);
+}
+
+void LIRGenerator::do_update_CRC32(Intrinsic* x) {
+  fatal("CRC32 intrinsic is not implemented on this platform");  // no LIR is generated; reaching this is a fatal error
+}
+
+void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
+  Unimplemented();  // no LIR is generated for the CRC32C intrinsic here
+}
+
+void LIRGenerator::do_Convert(Convert* x) {  // emits a conversion either as a runtime call or as an inline convert op
+  address runtime_func;
+  switch (x->op()) {
+#ifndef AARCH64
+    case Bytecodes::_l2f:  // non-AARCH64: conversions between long and float/double are runtime calls
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::l2f);
+      break;
+    case Bytecodes::_l2d:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::l2d);
+      break;
+    case Bytecodes::_f2l:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::f2l);
+      break;
+    case Bytecodes::_d2l:
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::d2l);
+      break;
+#ifdef __SOFTFP__
+    case Bytecodes::_f2d:  // __SOFTFP__: the remaining floating-point conversions are runtime calls too
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_f2d);
+      break;
+    case Bytecodes::_d2f:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_d2f);
+      break;
+    case Bytecodes::_i2f:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_i2f);
+      break;
+    case Bytecodes::_i2d:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_i2d);
+      break;
+    case Bytecodes::_f2i:
+      runtime_func = CAST_FROM_FN_PTR(address, __aeabi_f2iz);
+      break;
+    case Bytecodes::_d2i:
+      // This is implemented in hard float in assembler on arm but a call
+      // on other platforms.
+      runtime_func = CAST_FROM_FN_PTR(address, SharedRuntime::d2i);
+      break;
+#endif // __SOFTFP__
+#endif // !AARCH64
+    default: {  // every conversion not listed above is emitted inline
+      LIRItem value(x->value(), this);
+      value.load_item();
+      LIR_Opr reg = rlock_result(x);
+      __ convert(x->op(), value.result(), reg, NULL);
+      return;
+    }
+  }
+
+  LIR_Opr result = call_runtime(x->value(), runtime_func, x->type(), NULL);  // helper selected by the switch above
+  set_result(x, result);
+}
+
+// Emits LIR for object allocation via new_instance() using fixed R0/R1 registers, then moves R0 to the result.
+void LIRGenerator::do_NewInstance(NewInstance* x) {
+  print_if_not_loaded(x);
+
+  CodeEmitInfo* info = state_for(x, x->state());
+  LIR_Opr reg = result_register_for(x->type());  // R0 is required by runtime call in NewInstanceStub::emit_code
+  LIR_Opr klass_reg = FrameMap::R1_metadata_opr; // R1 is required by runtime call in NewInstanceStub::emit_code
+  LIR_Opr tmp1 = new_register(objectType);
+  LIR_Opr tmp2 = new_register(objectType);
+  LIR_Opr tmp3 = FrameMap::LR_oop_opr;
+
+  new_instance(reg, x->klass(), x->is_unresolved(), tmp1, tmp2, tmp3,
+               LIR_OprFact::illegalOpr, klass_reg, info);
+
+  LIR_Opr result = rlock_result(x);  // copy from the fixed register into the instruction's result operand
+  __ move(reg, result);
+}
+
+// Emits LIR for primitive array allocation: allocate_array() with a NewTypeArrayStub as the slow path.
+void LIRGenerator::do_NewTypeArray(NewTypeArray* x) {
+  // Evaluate state_for() first, because it can emit code
+  // with the same fixed registers that are used here (R1, R2)
+  CodeEmitInfo* info = state_for(x, x->state());
+  LIRItem length(x->length(), this);
+
+  length.load_item_force(FrameMap::R2_opr);      // R2 is required by runtime call in NewTypeArrayStub::emit_code
+  LIR_Opr len = length.result();
+
+  LIR_Opr reg = result_register_for(x->type());  // R0 is required by runtime call in NewTypeArrayStub::emit_code
+  LIR_Opr klass_reg = FrameMap::R1_metadata_opr; // R1 is required by runtime call in NewTypeArrayStub::emit_code
+
+  LIR_Opr tmp1 = new_register(objectType);
+  LIR_Opr tmp2 = new_register(objectType);
+  LIR_Opr tmp3 = FrameMap::LR_oop_opr;
+  LIR_Opr tmp4 = LIR_OprFact::illegalOpr;
+
+  BasicType elem_type = x->elt_type();  // the array klass for this element type is loaded into klass_reg
+  __ metadata2reg(ciTypeArrayKlass::make(elem_type)->constant_encoding(), klass_reg);
+
+  CodeStub* slow_path = new NewTypeArrayStub(klass_reg, len, reg, info);
+  __ allocate_array(reg, len, tmp1, tmp2, tmp3, tmp4, elem_type, klass_reg, slow_path);
+
+  LIR_Opr result = rlock_result(x);  // copy from the fixed register into the instruction's result operand
+  __ move(reg, result);
+}
+
+// Emits LIR for object array allocation: allocate_array() with a NewObjectArrayStub as the slow path.
+void LIRGenerator::do_NewObjectArray(NewObjectArray* x) {
+  // Evaluate state_for() first, because it can emit code
+  // with the same fixed registers that are used here (R1, R2)
+  CodeEmitInfo* info = state_for(x, x->state());
+  LIRItem length(x->length(), this);
+
+  length.load_item_force(FrameMap::R2_opr);           // R2 is required by runtime call in NewObjectArrayStub::emit_code
+  LIR_Opr len = length.result();
+
+  CodeEmitInfo* patching_info = NULL;
+  if (!x->klass()->is_loaded() || PatchALot) {
+    patching_info = state_for(x, x->state_before());  // needed when the klass load may have to be patched
+  }
+
+  LIR_Opr reg = result_register_for(x->type());       // R0 is required by runtime call in NewObjectArrayStub::emit_code
+  LIR_Opr klass_reg = FrameMap::R1_metadata_opr;      // R1 is required by runtime call in NewObjectArrayStub::emit_code
+
+  LIR_Opr tmp1 = new_register(objectType);
+  LIR_Opr tmp2 = new_register(objectType);
+  LIR_Opr tmp3 = FrameMap::LR_oop_opr;
+  LIR_Opr tmp4 = LIR_OprFact::illegalOpr;
+
+  CodeStub* slow_path = new NewObjectArrayStub(klass_reg, len, reg, info);
+  ciMetadata* obj = ciObjArrayKlass::make(x->klass());
+  if (obj == ciEnv::unloaded_ciobjarrayklass()) {
+    BAILOUT("encountered unloaded_ciobjarrayklass due to out of memory error");
+  }
+  klass2reg_with_patching(klass_reg, obj, patching_info);
+  __ allocate_array(reg, len, tmp1, tmp2, tmp3, tmp4, T_OBJECT, klass_reg, slow_path);
+
+  LIR_Opr result = rlock_result(x);  // copy from the fixed register into the instruction's result operand
+  __ move(reg, result);
+}
+
+// Emits LIR for multi-dimensional array allocation as a call to the Runtime1 new_multi_array entry.
+void LIRGenerator::do_NewMultiArray(NewMultiArray* x) {
+  Values* dims = x->dims();
+  int i = dims->length();
+  LIRItemList* items = new LIRItemList(i, i, NULL);
+  while (i-- > 0) {
+    LIRItem* size = new LIRItem(dims->at(i), this);  // one LIRItem per dimension, created from last to first
+    items->at_put(i, size);
+  }
+
+  // Need to get the info before, as the items may become invalid through item_free
+  CodeEmitInfo* patching_info = NULL;
+  if (!x->klass()->is_loaded() || PatchALot) {
+    patching_info = state_for(x, x->state_before());
+
+    // Cannot re-use same xhandlers for multiple CodeEmitInfos, so
+    // clone all handlers (NOTE: Usually this is handled transparently
+    // by the CodeEmitInfo cloning logic in CodeStub constructors but
+    // is done explicitly here because a stub isn't being used).
+    x->set_exception_handlers(new XHandlers(x->exception_handlers()));
+  }
+
+  i = dims->length();
+  while (i-- > 0) {
+    LIRItem* size = items->at(i);
+    size->load_item();
+    LIR_Opr sz = size->result();
+    assert(sz->type() == T_INT, "should be");
+    store_stack_parameter(sz, in_ByteSize(i * BytesPerInt));  // dimension i is stored at byte offset i * BytesPerInt
+  }
+
+  CodeEmitInfo* info = state_for(x, x->state());
+  LIR_Opr klass_reg = FrameMap::R0_metadata_opr;
+  klass2reg_with_patching(klass_reg, x->klass(), patching_info);
+
+  LIR_Opr rank = FrameMap::R2_opr;
+  __ move(LIR_OprFact::intConst(x->rank()), rank);
+  LIR_Opr varargs = FrameMap::SP_opr;  // NOTE(review): presumably SP addresses the dimension sizes stored above - confirm
+  LIR_OprList* args = new LIR_OprList(3);
+  args->append(klass_reg);
+  args->append(rank);
+  args->append(varargs);
+  LIR_Opr reg = result_register_for(x->type());
+  __ call_runtime(Runtime1::entry_for(Runtime1::new_multi_array_id),
+                  LIR_OprFact::illegalOpr, reg, args, info);
+
+  LIR_Opr result = rlock_result(x);  // copy from the fixed register into the instruction's result operand
+  __ move(reg, result);
+}
+
+// Visitor hook for BlockBegin; intentionally emits no LIR.
+void LIRGenerator::do_BlockBegin(BlockBegin* x) {
+  // nothing to do for now
+}
+
+// Emits the checkcast LIR op together with the exception stub that is used when the cast fails.
+void LIRGenerator::do_CheckCast(CheckCast* x) {
+  LIRItem obj(x->obj(), this);
+  CodeEmitInfo* patching_info = NULL;
+  if (!x->klass()->is_loaded() || (PatchALot && !x->is_incompatible_class_change_check())) {
+    patching_info = state_for(x, x->state_before());
+  }
+
+  obj.load_item();
+
+  CodeEmitInfo* info_for_exception = state_for(x);
+  CodeStub* stub;  // differs only in which Runtime1 throw entry the stub targets
+  if (x->is_incompatible_class_change_check()) {
+    assert(patching_info == NULL, "can't patch this");
+    stub = new SimpleExceptionStub(Runtime1::throw_incompatible_class_change_error_id,
+                                   LIR_OprFact::illegalOpr, info_for_exception);
+  } else {
+    stub = new SimpleExceptionStub(Runtime1::throw_class_cast_exception_id,
+                                   LIR_OprFact::illegalOpr, info_for_exception);
+  }
+
+  LIR_Opr out_reg = rlock_result(x);
+  LIR_Opr tmp1 = FrameMap::R0_oop_opr;  // fixed R0/R1 serve as temporaries; no third temporary is supplied
+  LIR_Opr tmp2 = FrameMap::R1_oop_opr;
+  LIR_Opr tmp3 = LIR_OprFact::illegalOpr;
+
+  __ checkcast(out_reg, obj.result(), x->klass(), tmp1, tmp2, tmp3, x->direct_compare(),
+               info_for_exception, patching_info, stub, x->profiled_method(), x->profiled_bci());
+}
+
+// Emits the instanceof LIR op, with patching info when the klass is not loaded (or PatchALot is set).
+void LIRGenerator::do_InstanceOf(InstanceOf* x) {
+  LIRItem obj(x->obj(), this);
+  CodeEmitInfo* patching_info = NULL;
+  if (!x->klass()->is_loaded() || PatchALot) {
+    patching_info = state_for(x, x->state_before());
+  }
+
+  obj.load_item();
+  LIR_Opr out_reg = rlock_result(x);
+  LIR_Opr tmp1 = FrameMap::R0_oop_opr;  // fixed R0/R1 serve as temporaries; no third temporary is supplied
+  LIR_Opr tmp2 = FrameMap::R1_oop_opr;
+  LIR_Opr tmp3 = LIR_OprFact::illegalOpr;
+
+  __ instanceof(out_reg, obj.result(), x->klass(), tmp1, tmp2, tmp3,
+                x->direct_compare(), patching_info, x->profiled_method(), x->profiled_bci());
+}
+
+
+#ifdef __SOFTFP__
+// Turn a float/double "if (f <op> g)" into a runtime call:
+//     call __aeabi_{f,d}cmp<op>(f, g)   (or the SharedRuntime::unordered_* variant)
+//     cmp(eq, 1)                        (0 instead of 1 for "!=")
+//     branch(eq, true path).
+void LIRGenerator::do_soft_float_compare(If* x) {
+  assert(x->number_of_sux() == 2, "inconsistency");
+  ValueTag tag = x->x()->type()->tag();
+  If::Condition cond = x->cond();
+  address runtime_func;
+  // The aeabi compare functions return false for unordered operands, so the
+  // SharedRuntime::unordered_* helpers are used when unordered must count as true.
+  bool unordered_is_true = x->unordered_is_true();
+  // "!=" calls the "==" helper and tests its result against 0 instead of 1
+  bool compare_to_zero = false;
+  switch (lir_cond(cond)) {
+    case lir_cond_notEqual:
+      compare_to_zero = true;  // fall through
+    case lir_cond_equal:
+      runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, __aeabi_fcmpeq):
+          CAST_FROM_FN_PTR(address, __aeabi_dcmpeq);
+      break;
+    case lir_cond_less:
+      if (unordered_is_true) {
+        runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, SharedRuntime::unordered_fcmplt):
+          CAST_FROM_FN_PTR(address, SharedRuntime::unordered_dcmplt);
+      } else {
+        runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, __aeabi_fcmplt):
+          CAST_FROM_FN_PTR(address, __aeabi_dcmplt);
+      }
+      break;
+    case lir_cond_lessEqual:
+      if (unordered_is_true) {
+        runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, SharedRuntime::unordered_fcmple):
+          CAST_FROM_FN_PTR(address, SharedRuntime::unordered_dcmple);
+      } else {
+        runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, __aeabi_fcmple):
+          CAST_FROM_FN_PTR(address, __aeabi_dcmple);
+      }
+      break;
+    case lir_cond_greaterEqual:
+      if (unordered_is_true) {
+        runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, SharedRuntime::unordered_fcmpge):
+          CAST_FROM_FN_PTR(address, SharedRuntime::unordered_dcmpge);
+      } else {
+        runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, __aeabi_fcmpge):
+          CAST_FROM_FN_PTR(address, __aeabi_dcmpge);
+      }
+      break;
+    case lir_cond_greater:
+      if (unordered_is_true) {
+        runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, SharedRuntime::unordered_fcmpgt):
+          CAST_FROM_FN_PTR(address, SharedRuntime::unordered_dcmpgt);
+      } else {
+        runtime_func = tag == floatTag ?
+          CAST_FROM_FN_PTR(address, __aeabi_fcmpgt):
+          CAST_FROM_FN_PTR(address, __aeabi_dcmpgt);
+      }
+      break;
+    case lir_cond_aboveEqual:
+    case lir_cond_belowEqual:
+      ShouldNotReachHere();  // We're not going to get these.
+    default:
+      assert(lir_cond(cond) == lir_cond_always, "must be");
+      ShouldNotReachHere();
+  }
+  set_no_result(x);
+
+  // add safepoint before generating condition code so it can be recomputed
+  if (x->is_safepoint()) {
+    increment_backedge_counter(state_for(x, x->state_before()), x->profiled_bci());
+    __ safepoint(LIR_OprFact::illegalOpr, state_for(x, x->state_before()));
+  }
+  // Call float compare function, returns (1,0) if true or false.
+  LIR_Opr result = call_runtime(x->x(), x->y(), runtime_func, intType, NULL);
+  __ cmp(lir_cond_equal, result,
+         compare_to_zero ?
+           LIR_OprFact::intConst(0) : LIR_OprFact::intConst(1));
+  profile_branch(x, cond);
+  move_to_phi(x->state());
+  __ branch(lir_cond_equal, T_INT, x->tsux());  // only the taken branch is emitted here; do_If adds the jump to the default successor
+}
+#endif // __SOFTFP__
+
+void LIRGenerator::do_If(If* x) {  // emits compare + conditional branch to tsux, then a jump to the default successor
+  assert(x->number_of_sux() == 2, "inconsistency");
+  ValueTag tag = x->x()->type()->tag();
+
+#ifdef __SOFTFP__
+  if (tag == floatTag || tag == doubleTag) {
+    do_soft_float_compare(x);  // floating-point conditions are delegated to the runtime-call based helper
+    assert(x->default_sux() == x->fsux(), "wrong destination above");
+    __ jump(x->default_sux());
+    return;
+  }
+#endif // __SOFTFP__
+
+  LIRItem xitem(x->x(), this);
+  LIRItem yitem(x->y(), this);
+  LIRItem* xin = &xitem;
+  LIRItem* yin = &yitem;
+  If::Condition cond = x->cond();
+
+#ifndef AARCH64
+  if (tag == longTag) {
+    if (cond == If::gtr || cond == If::leq) {
+      cond = Instruction::mirror(cond);  // gtr/leq on longs: mirror the condition and swap the operands
+      xin = &yitem;
+      yin = &xitem;
+    }
+    xin->set_destroys_register();
+  }
+#endif // !AARCH64
+
+  xin->load_item();
+  LIR_Opr left = xin->result();
+  LIR_Opr right;
+
+#ifdef AARCH64
+  if (yin->is_constant() && can_inline_as_constant_in_cmp(yin->value())) {
+    yin->dont_load_item();
+  } else {
+    yin->load_item();
+  }
+  right = yin->result();
+#else
+  if (tag == longTag && yin->is_constant() && yin->get_jlong_constant() == 0 &&
+      (cond == If::eql || cond == If::neq)) {
+    // inline long zero
+    right = LIR_OprFact::value_type(yin->value()->type());
+  } else {
+    yin->load_nonconstant();
+    right = yin->result();
+  }
+#endif // AARCH64
+
+  set_no_result(x);
+
+  // add safepoint before generating condition code so it can be recomputed
+  if (x->is_safepoint()) {
+    increment_backedge_counter(state_for(x, x->state_before()), x->profiled_bci());
+    __ safepoint(LIR_OprFact::illegalOpr, state_for(x, x->state_before()));
+  }
+
+  __ cmp(lir_cond(cond), left, right);
+  profile_branch(x, cond);
+  move_to_phi(x->state());
+  if (x->x()->type()->is_float_kind()) {
+    __ branch(lir_cond(cond), right->type(), x->tsux(), x->usux());  // float kinds also pass the unordered successor
+  } else {
+    __ branch(lir_cond(cond), right->type(), x->tsux());
+  }
+  assert(x->default_sux() == x->fsux(), "wrong destination above");
+  __ jump(x->default_sux());
+}
+
+// Returns FrameMap::Rthread_opr as the thread pointer operand.
+LIR_Opr LIRGenerator::getThreadPointer() {
+  return FrameMap::Rthread_opr;
+}
+
+void LIRGenerator::trace_block_entry(BlockBegin* block) {  // emits a leaf call to Runtime1::trace_block_entry
+  __ move(LIR_OprFact::intConst(block->block_id()), FrameMap::R0_opr);  // the block id is the only argument, passed in R0
+  LIR_OprList* args = new LIR_OprList(1);
+  args->append(FrameMap::R0_opr);
+  address func = CAST_FROM_FN_PTR(address, Runtime1::trace_block_entry);
+  __ call_runtime_leaf(func, getThreadTemp(), LIR_OprFact::illegalOpr, args);
+}
+
+// Emits the store of `value` to the volatile field at `address`.
+void LIRGenerator::volatile_field_store(LIR_Opr value, LIR_Address* address,
+                                        CodeEmitInfo* info) {
+#ifndef AARCH64
+  if (value->is_double_cpu()) {  // 32-bit only: double-word CPU operand takes the dedicated path
+    assert(address->index()->is_illegal(), "should have a constant displacement");
+    LIR_Opr tmp = new_pointer_register();
+    add_large_constant(address->base(), address->disp(), tmp);  // presumably tmp = base + disp
+    __ volatile_store_mem_reg(value, new LIR_Address(tmp, (intx)0, address->type()), info);
+    return;
+  }
+#endif // !AARCH64
+  // TODO-AARCH64 implement with stlr instruction
+  __ store(value, address, info, lir_patch_none);  // every other case: ordinary store, no patching
+}
+// Emits the load of the volatile field at `address` into `result`.
+void LIRGenerator::volatile_field_load(LIR_Address* address, LIR_Opr result,
+                                       CodeEmitInfo* info) {
+#ifndef AARCH64
+  if (result->is_double_cpu()) {  // 32-bit only: double-word CPU result takes the dedicated path
+    assert(address->index()->is_illegal(), "should have a constant displacement");
+    LIR_Opr tmp = new_pointer_register();
+    add_large_constant(address->base(), address->disp(), tmp);  // presumably tmp = base + disp
+    __ volatile_load_mem_reg(new LIR_Address(tmp, (intx)0, address->type()), result, info);
+    return;
+  }
+#endif // !AARCH64
+  // TODO-AARCH64 implement with ldar instruction
+  __ load(address, result, info, lir_patch_none);  // every other case: ordinary load, no patching
+}
+// Emits LIR that loads a `type` value from the address src + offset into dst.
+void LIRGenerator::get_Object_unsafe(LIR_Opr dst, LIR_Opr src, LIR_Opr offset,
+                                     BasicType type, bool is_volatile) {
+#ifdef AARCH64
+  __ load(new LIR_Address(src, offset, type), dst);  // is_volatile is not consulted on AArch64
+#else
+  assert(offset->is_single_cpu(), "must be");
+  if (is_volatile && dst->is_double_cpu()) {  // volatile double-word CPU result
+    LIR_Opr tmp = new_pointer_register();
+    __ add(src, offset, tmp);  // tmp = src + offset, so the access needs no index
+    __ volatile_load_mem_reg(new LIR_Address(tmp, (intx)0, type), dst, NULL);
+  } else if (type == T_FLOAT || type == T_DOUBLE) {
+    // fld doesn't have indexed addressing mode
+    LIR_Opr tmp = new_register(T_INT);
+    __ add(src, offset, tmp);  // fold the index into the base first
+    __ load(new LIR_Address(tmp, (intx)0, type), dst);
+  } else {
+    __ load(new LIR_Address(src, offset, type), dst);  // indexed address used directly
+  }
+#endif // AARCH64
+}
+// Emits LIR that stores `data` at src + offset; T_OBJECT/T_ARRAY stores are wrapped in GC barriers.
+void LIRGenerator::put_Object_unsafe(LIR_Opr src, LIR_Opr offset, LIR_Opr data,
+                                     BasicType type, bool is_volatile) {
+#ifdef AARCH64
+  LIR_Address* addr = new LIR_Address(src, offset, type);  // is_volatile is not consulted on AArch64
+  if (type == T_ARRAY || type == T_OBJECT) {
+    pre_barrier(LIR_OprFact::address(addr), LIR_OprFact::illegalOpr /* pre_val */,
+                true /* do_load */, false /* patch */, NULL);
+    __ move(data, addr);
+    assert(src->is_register(), "must be register");
+    post_barrier(LIR_OprFact::address(addr), data);
+  } else {
+    __ move(data, addr);
+  }
+#else
+  assert(offset->is_single_cpu(), "must be");
+  if (is_volatile && data->is_double_cpu()) {  // volatile double-word CPU value
+    LIR_Opr tmp = new_register(T_INT);
+    __ add(src, offset, tmp);  // tmp = src + offset, so the access needs no index
+    __ volatile_store_mem_reg(data, new LIR_Address(tmp, (intx)0, type), NULL);
+  } else if (type == T_FLOAT || type == T_DOUBLE) {
+    // fst doesn't have indexed addressing mode
+    LIR_Opr tmp = new_register(T_INT);
+    __ add(src, offset, tmp);  // fold the index into the base first
+    __ move(data, new LIR_Address(tmp, (intx)0, type));
+  } else {
+    LIR_Address* addr = new LIR_Address(src, offset, type);
+    bool is_obj = (type == T_ARRAY || type == T_OBJECT);
+#if INCLUDE_ALL_GCS
+    if (is_obj) {
+      // Do the pre-write barrier, if any.
+      pre_barrier(LIR_OprFact::address(addr), LIR_OprFact::illegalOpr /* pre_val */,
+                  true /* do_load */, false /* patch */, NULL);
+    }
+#endif // INCLUDE_ALL_GCS
+    __ move(data, addr);
+    if (is_obj) {  // the post barrier is emitted regardless of INCLUDE_ALL_GCS
+      assert(src->is_register(), "must be register");
+      post_barrier(LIR_OprFact::address(addr), data);
+    }
+  }
+#endif // AARCH64
+}
+// Generates LIR for an unsafe get-and-add (x->is_add()) or get-and-set, via the xadd / xchg ops.
+void LIRGenerator::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) {
+  BasicType type = x->basic_type();
+  LIRItem src(x->object(), this);
+  LIRItem off(x->offset(), this);
+  LIRItem value(x->value(), this);
+
+  src.load_item();
+  if (x->is_add()) {
+    value.load_nonconstant();  // the add form uses load_nonconstant() for the value
+  } else {
+    value.load_item();
+  }
+  off.load_nonconstant();
+
+  LIR_Opr dst = rlock_result(x, type);
+  LIR_Opr data = value.result();
+  bool is_obj = (type == T_ARRAY || type == T_OBJECT);
+
+  assert (type == T_INT || type == T_LONG || (!x->is_add() && is_obj), "unexpected type");
+  LIR_Opr addr_ptr = new_pointer_register();
+
+  __ add(src.result(), off.result(), addr_ptr);  // addr_ptr = object + offset
+
+  LIR_Address* addr = new LIR_Address(addr_ptr, (intx)0, type);  // same location, used by the barriers
+
+  if (x->is_add()) {
+    LIR_Opr tmp = new_register(type);
+    __ xadd(addr_ptr, data, dst, tmp);
+  } else {
+    LIR_Opr tmp = (UseCompressedOops && is_obj) ? new_pointer_register() : LIR_OprFact::illegalOpr;
+    if (is_obj) {
+      // Do the pre-write barrier, if any.
+      pre_barrier(LIR_OprFact::address(addr), LIR_OprFact::illegalOpr /* pre_val */,
+                  true /* do_load */, false /* patch */, NULL);
+    }
+    __ xchg(addr_ptr, data, dst, tmp);
+    if (is_obj) {
+      // Seems to be a precise address
+      post_barrier(LIR_OprFact::address(addr), data);
+    }
+  }
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_LIRGenerator_arm.hpp	2016-12-02 11:18:42.490911053 -0500
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+  // Helper to set the card at the given address to the given value.
+  void set_card(LIR_Opr value, LIR_Address* card_addr);
+  // NOTE(review): presumably emits a zero check on right_arg (the divisor) - confirm against the definition.
+  void make_div_by_zero_check(LIR_Opr right_arg, BasicType type, CodeEmitInfo* info);
+
+#ifdef AARCH64
+  // Arithmetic helper, declared for AArch64 only; presumably dest = src + c - confirm against the definition.
+  void add_constant(LIR_Opr src, jlong c, LIR_Opr dest);
+#endif // AARCH64
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_LIR_arm.cpp	2016-12-02 11:18:48.171233184 -0500
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2010, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "c1/c1_LIR.hpp"
+// Converts this operand's fpu_regnr() into the corresponding FloatRegister.
+FloatRegister LIR_OprDesc::as_float_reg() const {
+  return as_FloatRegister(fpu_regnr());
+}
+// Converts this operand's fpu_regnrLo() into the corresponding FloatRegister.
+FloatRegister LIR_OprDesc::as_double_reg() const {
+  return as_FloatRegister(fpu_regnrLo());
+}
+
+// Builds the LIR operand for a double held in FPU registers: double type,
+// FPU register kind, double size. Only the register number placed in the
+// reg2 field differs between the two platforms.
+LIR_Opr LIR_OprFact::double_fpu(int reg1, int reg2) {
+#ifdef AARCH64
+  // Reg2 unused: reg1 is encoded in both register fields.
+  assert(as_FloatRegister(reg2) == fnoreg, "Not used on this platform");
+  const int second_reg = reg1;
+#else
+  // 32-bit ARM: the caller must pass a real second register.
+  assert(as_FloatRegister(reg2) != fnoreg, "Arm32 holds double in two regs.");
+  const int second_reg = reg2;
+#endif
+  const intptr_t bits = (reg1 << LIR_OprDesc::reg1_shift)       |
+                        (second_reg << LIR_OprDesc::reg2_shift) |
+                        LIR_OprDesc::double_type                |
+                        LIR_OprDesc::fpu_register               |
+                        LIR_OprDesc::double_size;
+  return (LIR_Opr)bits;
+}
+// Non-PRODUCT sanity checks (asserts only) on the base and index operands of an ARM LIR address.
+#ifndef PRODUCT
+void LIR_Address::verify() const {
+#ifdef _LP64
+  assert(base()->is_cpu_register(), "wrong base operand");
+#endif
+#ifdef AARCH64
+  if (base()->type() == T_INT) {  // int-typed base: the index must be a single int CPU register
+    assert(index()->is_single_cpu() && (index()->type() == T_INT), "wrong index operand");
+  } else {  // otherwise: no index, a double-word index, or a single oop/int register index
+    assert(index()->is_illegal() || index()->is_double_cpu() ||
+           (index()->is_single_cpu() && (index()->is_oop_register() || index()->type() == T_INT)), "wrong index operand");
+    assert(base()->type() == T_OBJECT || base()->type() == T_LONG || base()->type() == T_METADATA, "wrong type for addresses");
+  }
+#else
+  assert(disp() == 0 || index()->is_illegal(), "can't have both");  // displacement and index are exclusive
+  // Note: offsets higher than 4096 must not be rejected here. They can
+  // be handled by the back-end or will be rejected if not.
+#ifdef _LP64
+  assert(index()->is_illegal() || index()->is_double_cpu(), "wrong index operand");
+  assert(base()->type() == T_OBJECT || base()->type() == T_LONG || base()->type() == T_METADATA,
+         "wrong type for addresses");
+#else
+  assert(base()->is_single_cpu(), "wrong base operand");
+  assert(index()->is_illegal() || index()->is_single_cpu(), "wrong index operand");
+  assert(base()->type() == T_OBJECT || base()->type() == T_INT || base()->type() == T_METADATA,
+         "wrong type for addresses");
+#endif
+#endif // AARCH64
+}
+#endif // PRODUCT
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_LinearScan_arm.cpp	2016-12-02 11:18:53.807552817 -0500
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "c1/c1_Instruction.hpp"
+#include "c1/c1_LinearScan.hpp"
+#include "utilities/bitMap.inline.hpp"
+
+void LinearScan::allocate_fpu_stack() {
+  // No FPU stack on ARM, so this is intentionally empty.
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_LinearScan_arm.hpp	2016-12-02 11:18:59.259862017 -0500
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_C1_LINEARSCAN_ARM_HPP
+#define CPU_ARM_VM_C1_LINEARSCAN_ARM_HPP
+// True when reg_num is below pd_nof_cpu_regs_processed_in_linearscan or at/above pd_nof_cpu_regs_frame_map.
+inline bool LinearScan::is_processed_reg_num(int reg_num) {
+  if (reg_num < pd_nof_cpu_regs_processed_in_linearscan) return true;
+  return reg_num >= pd_nof_cpu_regs_frame_map;
+}
+inline int LinearScan::num_physical_regs(BasicType type) {
+  int regs = 1;  // one register, except long/double on 32-bit ARM which take two
+#ifndef AARCH64
+  if (type == T_DOUBLE || type == T_LONG) regs = 2;
+#endif // !AARCH64
+  return regs;
+}
+
+// Adjacent registers are required for long and double on 32-bit ARM, and never on AArch64.
+inline bool LinearScan::requires_adjacent_regs(BasicType type) {
+#ifndef AARCH64
+  return type == T_LONG || type == T_DOUBLE;
+#else
+  return false;
+#endif // !AARCH64
+}
+// Reports every register as caller-saved: after the range assert it always returns true.
+inline bool LinearScan::is_caller_save(int assigned_reg) {
+  assert(assigned_reg >= 0 && assigned_reg < nof_regs, "should call this only for registers");
+  // TODO-AARCH64 try to add callee-saved registers
+  return true;
+}
+
+// Platform hook for extra temporaries of `op`; intentionally empty on ARM.
+inline void LinearScan::pd_add_temps(LIR_Op* op) {
+  // No extra temporaries on ARM
+}
+
+
+// Implementation of LinearScanWalker
+// Selects the register range [_first_reg, _last_reg] searched for `cur`; always returns true.
+inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) {
+#ifndef __SOFTFP__
+  const BasicType kind = cur->type();
+  if (kind == T_DOUBLE || kind == T_FLOAT) {
+    _first_reg = pd_first_fpu_reg;
+    _last_reg = _first_reg + pd_nof_fpu_regs_reg_alloc - 1;
+    return true;
+  }
+#endif // !__SOFTFP__
+  // Everything else is allocated from the CPU registers.
+  _first_reg = pd_first_cpu_reg;
+  _last_reg = _first_reg + FrameMap::adjust_reg_range(pd_nof_cpu_regs_reg_alloc) - 1;
+  return true;
+}
+
+#endif // CPU_ARM_VM_C1_LINEARSCAN_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_MacroAssembler_arm.cpp	2016-12-02 11:19:05.072191636 -0500
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "c1/c1_MacroAssembler.hpp"
+#include "c1/c1_Runtime1.hpp"
+#include "classfile/systemDictionary.hpp"
+#include "gc/shared/collectedHeap.hpp"
+#include "interpreter/interpreter.hpp"
+#include "oops/arrayOop.hpp"
+#include "oops/markOop.hpp"
+#include "runtime/basicLock.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/os.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+
+// Note: Rtemp usage in this file should not impact C2 and should be
+// correct as long as it is not implicitly used in lower layers (the
+// arm [macro]assembler) and used with care in the other C1 specific
+// files.
+// Compares the receiver's klass (loaded into Rtemp) with iCache and jumps to the IC miss stub on mismatch.
+void C1_MacroAssembler::inline_cache_check(Register receiver, Register iCache) {
+  Label verified;
+  load_klass(Rtemp, receiver);
+  cmp(Rtemp, iCache);
+  b(verified, eq); // jump over alignment no-ops
+#ifdef AARCH64
+  jump(SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type, Rtemp);
+#else
+  jump(SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type);
+#endif
+  align(CodeEntryAlignment);  // `verified` is bound after alignment to CodeEntryAlignment
+  bind(verified);
+}
+// Emits the frame setup: stack overflow check, push of FP and LR, then SP -= frame_size_in_bytes.
+void C1_MacroAssembler::build_frame(int frame_size_in_bytes, int bang_size_in_bytes) {
+  assert(bang_size_in_bytes >= frame_size_in_bytes, "stack bang size incorrect");
+  assert((frame_size_in_bytes % StackAlignmentInBytes) == 0, "frame size should be aligned");
+
+#ifdef AARCH64
+  // Extra nop for MT-safe patching in NativeJump::patch_verified_entry
+  nop();
+#endif // AARCH64
+
+  arm_stack_overflow_check(bang_size_in_bytes, Rtemp);
+
+  // FP can no longer be used to memorize SP. It may be modified
+  // if this method contains a methodHandle call site
+  raw_push(FP, LR);
+  sub_slow(SP, SP, frame_size_in_bytes);
+}
+// Emits the inverse of build_frame's last two steps: SP += frame_size_in_bytes, then pop FP and LR.
+void C1_MacroAssembler::remove_frame(int frame_size_in_bytes) {
+  add_slow(SP, SP, frame_size_in_bytes);
+  raw_pop(FP, LR);
+}
+// Emits a breakpoint at the verified entry when the C1Breakpoint flag is set.
+void C1_MacroAssembler::verified_entry() {
+  // Nothing is emitted unless the flag is on.
+  if (!C1Breakpoint) return;
+  breakpoint();
+}
+
+// Allocates `size_expression` bytes: `obj` receives the object start and `obj_end` the object end.
+void C1_MacroAssembler::try_allocate(Register obj, Register obj_end, Register tmp1, Register tmp2,
+                                     RegisterOrConstant size_expression, Label& slow_case) {
+  if (!UseTLAB) {
+    eden_allocate(obj, obj_end, tmp1, tmp2, size_expression, slow_case);
+    incr_allocated_bytes(size_expression, tmp1);
+    return;
+  }
+  tlab_allocate(obj, obj_end, tmp1, size_expression, slow_case);
+}
+
+// Stores the mark word and klass into obj's header, plus the array length when `len` is a valid register.
+void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register len, Register tmp) {
+  assert_different_registers(obj, klass, len, tmp);
+
+  if(UseBiasedLocking && !len->is_valid()) {  // non-array with biased locking: mark comes from the klass
+    ldr(tmp, Address(klass, Klass::prototype_header_offset()));
+  } else {
+    mov(tmp, (intptr_t)markOopDesc::prototype());
+  }
+
+#ifdef AARCH64
+  if (UseCompressedClassPointers) {
+    str(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
+    encode_klass_not_null(tmp, klass);          // Take care not to kill klass
+    str_w(tmp, Address(obj, oopDesc::klass_offset_in_bytes()));
+  } else {
+    assert(oopDesc::mark_offset_in_bytes() + wordSize == oopDesc::klass_offset_in_bytes(), "adjust this code");
+    stp(tmp, klass, Address(obj, oopDesc::mark_offset_in_bytes()));  // mark and klass are adjacent: one pair store
+  }
+#else
+  str(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
+  str(klass, Address(obj, oopDesc::klass_offset_in_bytes()));
+#endif // AARCH64
+
+  if (len->is_valid()) {
+    str_32(len, Address(obj, arrayOopDesc::length_offset_in_bytes()));
+  }
+#ifdef AARCH64
+  else if (UseCompressedClassPointers) {  // no length field to store: fill the klass gap instead
+    store_klass_gap(obj);
+  }
+#endif // AARCH64
+}
+
+
+// Cleans object body [base..obj_end] by delegating to zero_memory. Clobbers `base` and `tmp` registers.
+void C1_MacroAssembler::initialize_body(Register base, Register obj_end, Register tmp) {
+  zero_memory(base, obj_end, tmp);
+}
+
+// Initializes the header, zeroes the body when needed, then emits a StoreStore barrier. obj_size_in_bytes < 0 means "size not constant".
+void C1_MacroAssembler::initialize_object(Register obj, Register obj_end, Register klass,
+                                          Register len, Register tmp1, Register tmp2,
+                                          RegisterOrConstant header_size, int obj_size_in_bytes,
+                                          bool is_tlab_allocated)
+{
+  assert_different_registers(obj, obj_end, klass, len, tmp1, tmp2);
+  initialize_header(obj, klass, len, tmp1);
+
+  const Register ptr = tmp2;
+
+  if (!(UseTLAB && ZeroTLAB && is_tlab_allocated)) {  // zeroing is skipped only for a TLAB allocation with ZeroTLAB
+#ifdef AARCH64
+    if (obj_size_in_bytes < 0) {  // size unknown here: zero from obj + header_size up to obj_end
+      add_rc(ptr, obj, header_size);
+      initialize_body(ptr, obj_end, tmp1);
+
+    } else {
+      int base = instanceOopDesc::header_size() * HeapWordSize;
+      assert(obj_size_in_bytes >= base, "should be");
+
+      const int zero_bytes = obj_size_in_bytes - base;
+      assert((zero_bytes % wordSize) == 0, "should be");
+
+      if ((zero_bytes % (2*wordSize)) != 0) {  // odd word count: zero one word so the rest is whole pairs
+        str(ZR, Address(obj, base));
+        base += wordSize;
+      }
+
+      const int stp_count = zero_bytes / (2*wordSize);
+
+      if (zero_bytes > 8 * wordSize) {  // larger bodies: counted loop of pair stores
+        Label loop;
+        add(ptr, obj, base);
+        mov(tmp1, stp_count);
+        bind(loop);
+        subs(tmp1, tmp1, 1);
+        stp(ZR, ZR, Address(ptr, 2*wordSize, post_indexed));
+        b(loop, gt);
+      } else {  // small bodies: unrolled pair stores
+        for (int i = 0; i < stp_count; i++) {
+          stp(ZR, ZR, Address(obj, base + i * 2 * wordSize));
+        }
+      }
+    }
+#else
+    if (obj_size_in_bytes >= 0 && obj_size_in_bytes <= 8 * BytesPerWord) {  // small constant size: unrolled stores
+      mov(tmp1, 0);
+      const int base = instanceOopDesc::header_size() * HeapWordSize;
+      for (int i = base; i < obj_size_in_bytes; i += wordSize) {
+        str(tmp1, Address(obj, i));
+      }
+    } else {
+      assert(header_size.is_constant() || header_size.as_register() == ptr, "code assumption");
+      add(ptr, obj, header_size);
+      initialize_body(ptr, obj_end, tmp1);
+    }
+#endif // AARCH64
+  }
+
+  // StoreStore barrier required after complete initialization
+  // (headers + content zeroing), before the object may escape.
+  membar(MacroAssembler::StoreStore, tmp1);
+}
+// Allocates and initializes an instance of `object_size` words; `slow_case` is handed to the allocator.
+void C1_MacroAssembler::allocate_object(Register obj, Register tmp1, Register tmp2, Register tmp3,
+                                        int header_size, int object_size,
+                                        Register klass, Label& slow_case) {
+  assert_different_registers(obj, tmp1, tmp2, tmp3, klass, Rtemp);
+  assert(header_size >= 0 && object_size >= header_size, "illegal sizes");
+  const int object_size_in_bytes = object_size * BytesPerWord;
+
+  const Register obj_end = tmp1;
+  const Register len = noreg;  // no length register: this is not an array
+
+  if (Assembler::is_arith_imm_in_range(object_size_in_bytes)) {  // size usable as an immediate
+    try_allocate(obj, obj_end, tmp2, tmp3, object_size_in_bytes, slow_case);
+  } else {
+    // Rtemp should be free at c1 LIR level
+    mov_slow(Rtemp, object_size_in_bytes);
+    try_allocate(obj, obj_end, tmp2, tmp3, Rtemp, slow_case);
+  }
+  initialize_object(obj, obj_end, klass, len, tmp2, tmp3, instanceOopDesc::header_size() * HeapWordSize, object_size_in_bytes, /* is_tlab_allocated */ UseTLAB);
+}
+// Allocates and initializes an array of `len` elements; branches to slow_case when len >= max_array_allocation_length.
+void C1_MacroAssembler::allocate_array(Register obj, Register len,
+                                       Register tmp1, Register tmp2, Register tmp3,
+                                       int header_size, int element_size,
+                                       Register klass, Label& slow_case) {
+  assert_different_registers(obj, len, tmp1, tmp2, tmp3, klass, Rtemp);
+  const int header_size_in_bytes = header_size * BytesPerWord;
+  const int scale_shift = exact_log2(element_size);
+  const Register obj_size = Rtemp; // Rtemp should be free at c1 LIR level
+
+#ifdef AARCH64
+  mov_slow(Rtemp, max_array_allocation_length);
+  cmp_32(len, Rtemp);
+#else
+  cmp_32(len, max_array_allocation_length);
+#endif // AARCH64
+  b(slow_case, hs);  // taken on the `hs` condition of the length compare above
+
+  bool align_header = ((header_size_in_bytes | element_size) & MinObjAlignmentInBytesMask) != 0;
+  assert(align_header || ((header_size_in_bytes & MinObjAlignmentInBytesMask) == 0), "must be");
+  assert(align_header || ((element_size & MinObjAlignmentInBytesMask) == 0), "must be");
+
+  mov(obj_size, header_size_in_bytes + (align_header ? (MinObjAlignmentInBytes - 1) : 0));  // pre-add the rounding slack
+  add_ptr_scaled_int32(obj_size, obj_size, len, scale_shift);  // presumably obj_size += len << scale_shift
+
+  if (align_header) {
+    align_reg(obj_size, obj_size, MinObjAlignmentInBytes);
+  }
+
+  try_allocate(obj, tmp1, tmp2, tmp3, obj_size, slow_case);
+  initialize_object(obj, tmp1, klass, len, tmp2, tmp3, header_size_in_bytes, -1, /* is_tlab_allocated */ UseTLAB);
+}
+// Emits the fast-path lock sequence for `obj` using the lock record at `disp_hdr`; returns the null-check code offset.
+int C1_MacroAssembler::lock_object(Register hdr, Register obj,
+                                   Register disp_hdr, Register tmp1,
+                                   Label& slow_case) {
+  Label done, fast_lock, fast_lock_done;
+  int null_check_offset = 0;
+
+  const Register tmp2 = Rtemp; // Rtemp should be free at c1 LIR level
+  assert_different_registers(hdr, obj, disp_hdr, tmp1, tmp2);
+
+  assert(BasicObjectLock::lock_offset_in_bytes() == 0, "ajust this code");
+  const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
+  const int mark_offset = BasicLock::displaced_header_offset_in_bytes();
+
+  if (UseBiasedLocking) {
+    // store object into the lock record
+    str(obj, Address(disp_hdr, obj_offset));
+    null_check_offset = biased_locking_enter(obj, hdr/*scratched*/, tmp1, false, tmp2, done, slow_case);
+  }
+
+  assert(oopDesc::mark_offset_in_bytes() == 0, "Required by atomic instructions");
+
+#ifdef AARCH64
+
+  str(obj, Address(disp_hdr, obj_offset));
+
+  if (!UseBiasedLocking) {
+    null_check_offset = offset();
+  }
+  ldr(hdr, obj);
+
+  // Test if object is already locked
+  assert(markOopDesc::unlocked_value == 1, "adjust this code");
+  tbnz(hdr, exact_log2(markOopDesc::unlocked_value), fast_lock);
+
+  // Check for recursive locking
+  // See comments in InterpreterMacroAssembler::lock_object for
+  // explanations on the fast recursive locking check.
+  intptr_t mask = ((intptr_t)3) - ((intptr_t)os::vm_page_size());
+  Assembler::LogicalImmediate imm(mask, false);
+  mov(tmp2, SP);
+  sub(tmp2, hdr, tmp2);
+  ands(tmp2, tmp2, imm);
+  b(slow_case, ne);
+
+  // Recursive locking: store 0 into a lock record
+  str(ZR, Address(disp_hdr, mark_offset));
+  b(fast_lock_done);
+
+#else // AARCH64
+
+  if (!UseBiasedLocking) {
+    null_check_offset = offset();
+  }
+
+  // On MP platforms the next load could return a 'stale' value if the memory location has been modified by another thread.
+  // That would be acceptable as either CAS or slow case path is taken in that case.
+
+  // Must be the first instruction here, because implicit null check relies on it
+  ldr(hdr, Address(obj, oopDesc::mark_offset_in_bytes()));
+
+  str(obj, Address(disp_hdr, obj_offset));
+  tst(hdr, markOopDesc::unlocked_value);
+  b(fast_lock, ne);
+
+  // Check for recursive locking
+  // See comments in InterpreterMacroAssembler::lock_object for
+  // explanations on the fast recursive locking check.
+  // -1- test low 2 bits
+  movs(tmp2, AsmOperand(hdr, lsl, 30));
+  // -2- test (hdr - SP) if the low two bits are 0
+  sub(tmp2, hdr, SP, eq);
+  movs(tmp2, AsmOperand(tmp2, lsr, exact_log2(os::vm_page_size())), eq);
+  // If 'eq' then OK for recursive fast locking: store 0 into a lock record.
+  str(tmp2, Address(disp_hdr, mark_offset), eq);
+  b(fast_lock_done, eq);
+  // else need slow case
+  b(slow_case);
+
+#endif // AARCH64
+
+  bind(fast_lock);
+  // Save previous object header in BasicLock structure and update the header
+  str(hdr, Address(disp_hdr, mark_offset));
+
+  cas_for_lock_acquire(hdr, disp_hdr, obj, tmp2, slow_case);
+
+  bind(fast_lock_done);
+
+#ifndef PRODUCT
+  if (PrintBiasedLockingStatistics) {
+    cond_atomic_inc32(al, BiasedLocking::fast_path_entry_count_addr());
+  }
+#endif // !PRODUCT
+
+  bind(done);
+
+  return null_check_offset;
+}
+// Emits the fast-path unlock sequence for the lock record at `disp_hdr`. A zero displaced header means
+// a recursive lock and nothing is restored; otherwise the header is put back, with `slow_case` as bail-out.
+void C1_MacroAssembler::unlock_object(Register hdr, Register obj,
+                                      Register disp_hdr, Register tmp,
+                                      Label& slow_case) {
+  // Note: this method is not using its 'tmp' argument
+
+  assert_different_registers(hdr, obj, disp_hdr, Rtemp);
+  Register tmp2 = Rtemp;
+
+  assert(BasicObjectLock::lock_offset_in_bytes() == 0, "ajust this code");
+  const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
+  const int mark_offset = BasicLock::displaced_header_offset_in_bytes();
+
+  Label done;
+  if (UseBiasedLocking) {
+    // load object
+    ldr(obj, Address(disp_hdr, obj_offset));
+    biased_locking_exit(obj, hdr, done);
+  }
+
+  assert(oopDesc::mark_offset_in_bytes() == 0, "Required by atomic instructions");
+
+  // Load displaced header from the lock record
+  ldr(hdr, Address(disp_hdr, mark_offset));
+  // If hdr is NULL, we've got recursive locking and there's nothing more to do
+  cbz(hdr, done);
+
+  if(!UseBiasedLocking) {
+    // load object (the biased-locking path above has already loaded it)
+    ldr(obj, Address(disp_hdr, obj_offset));
+  }
+
+  // Restore the object header
+  cas_for_lock_release(disp_hdr, hdr, obj, tmp2, slow_case);
+
+  bind(done);
+}
+
+
+#ifndef PRODUCT
+
+// Verifies the oop stored at SP + stack_offset; emits nothing unless VerifyOops is set.
+void C1_MacroAssembler::verify_stack_oop(int stack_offset) {
+  if (VerifyOops) verify_oop_addr(Address(SP, stack_offset));
+}
+
+// Emits a check that stops when `r` is zero; with VerifyOops set it also verifies the oop in `r`.
+void C1_MacroAssembler::verify_not_null_oop(Register r) {
+  Label is_non_null;
+  cbnz(r, is_non_null);
+  stop("non-null oop required");
+  bind(is_non_null);
+  if (VerifyOops) verify_oop(r);
+}
+
+#endif // !PRODUCT
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_MacroAssembler_arm.hpp	2016-12-02 11:19:10.756513990 -0500
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_C1_MACROASSEMBLER_ARM_HPP
+#define CPU_ARM_VM_C1_MACROASSEMBLER_ARM_HPP
+
+ private:
+
+  void pd_init() { /* not used */ }
+
+ public:
+
+  // Puts address of allocated object into register `obj` and end of allocated object into register `obj_end`.
+  // `size_expression` should be a register or constant which can be used as immediate in "add" instruction.
+  void try_allocate(Register obj, Register obj_end, Register tmp1, Register tmp2,
+                    RegisterOrConstant size_expression, Label& slow_case);
+  // Stores the mark word and klass into obj's header; also stores `len` when it is a valid register.
+  void initialize_header(Register obj, Register klass, Register len, Register tmp);
+
+  // Cleans object body [base..obj_end]. Clobbers `base` and `tmp` registers.
+  void initialize_body(Register base, Register obj_end, Register tmp);
+  // Header init, body zeroing (skipped when UseTLAB && ZeroTLAB && is_tlab_allocated), then a StoreStore barrier.
+  void initialize_object(Register obj, Register obj_end, Register klass,
+                         Register len, Register tmp1, Register tmp2,
+                         RegisterOrConstant header_size_expression, int obj_size_in_bytes,
+                         bool is_tlab_allocated);
+  // Allocates and initializes an instance of `object_size` words.
+  void allocate_object(Register obj, Register tmp1, Register tmp2, Register tmp3,
+                       int header_size, int object_size,
+                       Register klass, Label& slow_case);
+  // Allocates and initializes an array; branches to slow_case when len >= max_array_allocation_length.
+  void allocate_array(Register obj, Register len,
+                      Register tmp1, Register tmp2, Register tmp3,
+                      int header_size, int element_size,
+                      Register klass, Label& slow_case);
+  // Array lengths at or above this value make allocate_array take the slow case.
+  enum {
+    max_array_allocation_length = 0x01000000
+  };
+  // Emits the fast-path lock sequence; returns the code offset recorded for the null check.
+  int lock_object(Register hdr, Register obj, Register disp_hdr, Register tmp, Label& slow_case);
+  // Emits the fast-path unlock sequence; the `tmp` argument is not used.
+  void unlock_object(Register hdr, Register obj, Register disp_hdr, Register tmp, Label& slow_case);
+
+  // This platform only uses signal-based null checks. The Label is not needed.
+  void null_check(Register r, Label *Lnull = NULL) { MacroAssembler::null_check(r); }
+
+#endif // CPU_ARM_VM_C1_MACROASSEMBLER_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_Runtime1_arm.cpp	2016-12-02 11:19:16.700851094 -0500
@@ -0,0 +1,1230 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "c1/c1_Defs.hpp"
+#include "c1/c1_LIRAssembler.hpp"
+#include "c1/c1_MacroAssembler.hpp"
+#include "c1/c1_Runtime1.hpp"
+#include "interpreter/interpreter.hpp"
+#include "nativeInst_arm.hpp"
+#include "oops/compiledICHolder.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/jvmtiExport.hpp"
+#include "register_arm.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/signature.hpp"
+#include "runtime/vframeArray.hpp"
+#include "vmreg_arm.inline.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc/g1/g1SATBCardTableModRefBS.hpp"
+#endif
+
+// Note: Rtemp usage in this file should not impact C2 and should be
+// correct as long as it is not implicitly used in lower layers (the
+// arm [macro]assembler) and used with care in the other C1 specific
+// files.
+
+// Implementation of StubAssembler
+// Emits a runtime call to 'entry' with the thread in R0, fetches the optional results and forwards a pending exception; returns call_offset. args_size is not read here.
+int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, int args_size) {
+  mov(R0, Rthread);
+
+  int call_offset = set_last_Java_frame(SP, FP, false, Rtemp);
+
+  call(entry);
+  if (call_offset == -1) { // PC not saved
+    call_offset = offset();  // use the code offset right after the call instead
+  }
+  reset_last_Java_frame(Rtemp);
+
+  assert(frame_size() != no_frame_size, "frame must be fixed");
+  if (_stub_id != Runtime1::forward_exception_id) {
+    ldr(R3, Address(Rthread, Thread::pending_exception_offset()));  // R3 is tested further down
+  }
+
+  if (oop_result1->is_valid()) {
+    assert_different_registers(oop_result1, R3, Rtemp);  // R3 must survive until the exception check
+    get_vm_result(oop_result1, Rtemp);
+  }
+  if (metadata_result->is_valid()) {
+    assert_different_registers(metadata_result, R3, Rtemp);  // same constraint as for oop_result1
+    get_vm_result_2(metadata_result, Rtemp);
+  }
+
+  // Check for pending exception
+  // unpack_with_exception_in_tls path is taken through
+  // Runtime1::exception_handler_for_pc
+  if (_stub_id != Runtime1::forward_exception_id) {
+    assert(frame_size() != no_frame_size, "cannot directly call forward_exception_id");
+#ifdef AARCH64
+    Label skip;
+    cbz(R3, skip);  // no pending exception: skip the jump to the forward_exception stub
+    jump(Runtime1::entry_for(Runtime1::forward_exception_id), relocInfo::runtime_call_type, Rtemp);
+    bind(skip);
+#else
+    cmp(R3, 0);
+    jump(Runtime1::entry_for(Runtime1::forward_exception_id), relocInfo::runtime_call_type, Rtemp, ne);  // taken only when R3 != 0
+#endif // AARCH64
+  } else {
+#ifdef ASSERT
+    // Should not have pending exception in forward_exception stub
+    ldr(R3, Address(Rthread, Thread::pending_exception_offset()));
+    cmp(R3, 0);
+    breakpoint(ne);
+#endif // ASSERT
+  }
+  return call_offset;
+}
+
+// One-argument variant: copies arg1 into R1 when needed, then delegates with args_size = 1.
+int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, Register arg1) {
+  if (arg1 != R1) {
+    mov(R1, arg1);
+  }
+  return call_RT(oop_result1, metadata_result, entry, 1);
+}
+
+// Two-argument variant: emits no moves; arg1/arg2 are asserted to already be R1/R2.
+int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, Register arg1, Register arg2) {
+  assert(arg1 == R1 && arg2 == R2, "cannot handle otherwise");
+  return call_RT(oop_result1, metadata_result, entry, 2);
+}
+
+// Three-argument variant: emits no moves; arg1/arg2/arg3 are asserted to already be R1/R2/R3.
+int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, Register arg1, Register arg2, Register arg3) {
+  assert(arg1 == R1 && arg2 == R2 && arg3 == R3, "cannot handle otherwise");
+  return call_RT(oop_result1, metadata_result, entry, 3);
+}
+
+
+#define __ sasm->
+
+// TODO: ARM - does this duplicate RegisterSaver in SharedRuntime?
+#ifdef AARCH64
+
+  //
+  // On AArch64 registers save area has the following layout:
+  //
+  // |---------------------|
+  // | return address (LR) |
+  // | FP                  |
+  // |---------------------|
+  // | D31                 |
+  // | ...                 |
+  // | D0                  |
+  // |---------------------|
+  // | padding             |
+  // |---------------------|
+  // | R28                 |
+  // | ...                 |
+  // | R0                  |
+  // |---------------------| <-- SP
+  //
+
+enum RegisterLayout {
+  number_of_saved_gprs = 29,  // R0..R28, as in the layout above
+  number_of_saved_fprs = FloatRegisterImpl::number_of_registers,
+
+  R0_offset  = 0,  // *_offset values below are in words from SP; users scale them by wordSize
+  D0_offset  = R0_offset + number_of_saved_gprs + 1,  // +1 is the padding slot in the layout above
+  FP_offset  = D0_offset + number_of_saved_fprs,
+  LR_offset  = FP_offset + 1,
+
+  reg_save_size = LR_offset + 1,  // total words in the save area
+
+  arg1_offset = reg_save_size * wordSize,  // byte offset of the first slot above the save area
+  arg2_offset = (reg_save_size + 1) * wordSize  // byte offset of the slot after arg1
+};
+
+#else
+
+enum RegisterLayout {
+  fpu_save_size = pd_nof_fpu_regs_reg_alloc,  // word slots reserved for FPU registers, lowest in the save area
+#ifndef __SOFTFP__
+  D0_offset = 0,
+#endif
+  R0_offset = fpu_save_size,  // GPR slots follow the FPU slots; the enumerators below auto-increment
+  R1_offset,
+  R2_offset,
+  R3_offset,
+  R4_offset,
+  R5_offset,
+  R6_offset,
+#if (FP_REG_NUM != 7)
+  R7_offset,  // present only when R7 is not the frame pointer (FP has its own slot below)
+#endif
+  R8_offset,
+  R9_offset,
+  R10_offset,
+#if (FP_REG_NUM != 11)
+  R11_offset,  // present only when R11 is not the frame pointer
+#endif
+  R12_offset,
+  FP_offset,
+  LR_offset,
+  reg_save_size,  // total words in the save area
+  arg1_offset = reg_save_size * wordSize,  // byte offset of the first slot above the save area
+  arg2_offset = (reg_save_size + 1) * wordSize  // byte offset of the slot after arg1
+};
+
+#endif // AARCH64
+// Sets the stub frame size and builds an OopMap recording the stack slot of each saved register; no save instructions are emitted here.
+static OopMap* generate_oop_map(StubAssembler* sasm, bool save_fpu_registers = HaveVFP) {
+  sasm->set_frame_size(reg_save_size /* in words */);
+
+  // Record saved value locations in an OopMap.
+  // Locations are offsets from sp after runtime call.
+  OopMap* map = new OopMap(VMRegImpl::slots_per_word * reg_save_size, 0);
+
+#ifdef AARCH64
+  for (int i = 0; i < number_of_saved_gprs; i++) {
+    map->set_callee_saved(VMRegImpl::stack2reg((R0_offset + i) * VMRegImpl::slots_per_word), as_Register(i)->as_VMReg());
+  }
+  map->set_callee_saved(VMRegImpl::stack2reg(FP_offset * VMRegImpl::slots_per_word), FP->as_VMReg());
+  map->set_callee_saved(VMRegImpl::stack2reg(LR_offset * VMRegImpl::slots_per_word), LR->as_VMReg());
+
+  if (save_fpu_registers) {
+    for (int i = 0; i < number_of_saved_fprs; i++) {
+      map->set_callee_saved(VMRegImpl::stack2reg((D0_offset + i) * VMRegImpl::slots_per_word), as_FloatRegister(i)->as_VMReg());
+    }
+  }
+#else
+  int j=0;  // encoding of the register whose value lives in slot i
+  for (int i = R0_offset; i < R10_offset; i++) {
+    if (j == FP_REG_NUM) {
+      // skip the FP register, saved below
+      j++;
+    }
+    map->set_callee_saved(VMRegImpl::stack2reg(i), as_Register(j)->as_VMReg());
+    j++;
+  }
+  assert(j == R10->encoding(), "must be");  // the loop stops before the R10 slot; R10 and R12 are not recorded in the map
+#if (FP_REG_NUM != 11)
+  // add R11, if not saved as FP
+  map->set_callee_saved(VMRegImpl::stack2reg(R11_offset), R11->as_VMReg());
+#endif
+  map->set_callee_saved(VMRegImpl::stack2reg(FP_offset), FP->as_VMReg());
+  map->set_callee_saved(VMRegImpl::stack2reg(LR_offset), LR->as_VMReg());
+
+  if (save_fpu_registers) {
+    for (int i = 0; i < fpu_save_size; i++) {
+      map->set_callee_saved(VMRegImpl::stack2reg(i), as_FloatRegister(i)->as_VMReg());  // FPU slots start at slot 0, below R0_offset
+    }
+  }
+#endif // AARCH64
+
+  return map;
+}
+// Emits code that pushes FP/LR, the general registers and (when save_fpu_registers) the FPU registers, then returns the matching OopMap.
+static OopMap* save_live_registers(StubAssembler* sasm, bool save_fpu_registers = HaveVFP) {
+  __ block_comment("save_live_registers");
+  sasm->set_frame_size(reg_save_size /* in words */);
+
+#ifdef AARCH64
+  assert((reg_save_size * wordSize) % StackAlignmentInBytes == 0, "SP should be aligned");
+
+  __ raw_push(FP, LR);
+
+  __ sub(SP, SP, (reg_save_size - 2) * wordSize);  // FP/LR were just pushed; reserve the rest of the save area
+
+  for (int i = 0; i < round_down(number_of_saved_gprs, 2); i += 2) {
+    __ stp(as_Register(i), as_Register(i+1), Address(SP, (R0_offset + i) * wordSize));
+  }
+
+  if (is_odd(number_of_saved_gprs)) {
+    int i = number_of_saved_gprs - 1;  // last register has no partner for stp, store it alone
+    __ str(as_Register(i), Address(SP, (R0_offset + i) * wordSize));
+  }
+
+  if (save_fpu_registers) {
+    assert (is_even(number_of_saved_fprs), "adjust this code");
+    for (int i = 0; i < number_of_saved_fprs; i += 2) {
+      __ stp_d(as_FloatRegister(i), as_FloatRegister(i+1), Address(SP, (D0_offset + i) * wordSize));
+    }
+  }
+#else
+  __ push(RegisterSet(FP) | RegisterSet(LR));
+  __ push(RegisterSet(R0, R6) | RegisterSet(R8, R10) | R12 | altFP_7_11);
+  if (save_fpu_registers) {
+    __ fstmdbd(SP, FloatRegisterSet(D0, fpu_save_size / 2), writeback);
+  } else {
+    __ sub(SP, SP, fpu_save_size * wordSize);  // reserve the FPU slots without storing, so the layout stays the same
+  }
+#endif // AARCH64
+
+  return generate_oop_map(sasm, save_fpu_registers);
+}
+
+// Emits the inverse of save_live_registers; the flags choose whether R0 and FP/LR are reloaded and whether the sequence ends by returning.
+static void restore_live_registers(StubAssembler* sasm,
+                                   bool restore_R0,
+                                   bool restore_FP_LR,
+                                   bool do_return,
+                                   bool restore_fpu_registers = HaveVFP) {
+  __ block_comment("restore_live_registers");
+
+#ifdef AARCH64
+  if (restore_R0) {
+    __ ldr(R0, Address(SP, R0_offset * wordSize));
+  }
+
+  assert(is_odd(number_of_saved_gprs), "adjust this code");
+  for (int i = 1; i < number_of_saved_gprs; i += 2) {  // R0 is handled above, so the pairs start at R1
+    __ ldp(as_Register(i), as_Register(i+1), Address(SP, (R0_offset + i) * wordSize));
+  }
+
+  if (restore_fpu_registers) {
+    assert (is_even(number_of_saved_fprs), "adjust this code");
+    for (int i = 0; i < number_of_saved_fprs; i += 2) {
+      __ ldp_d(as_FloatRegister(i), as_FloatRegister(i+1), Address(SP, (D0_offset + i) * wordSize));
+    }
+  }
+
+  __ add(SP, SP, (reg_save_size - 2) * wordSize);  // drop everything except the FP/LR pair
+
+  if (restore_FP_LR) {
+    __ raw_pop(FP, LR);
+    if (do_return) {
+      __ ret();
+    }
+  } else {
+    assert (!do_return, "return without restoring FP/LR");
+  }
+#else
+  if (restore_fpu_registers) {
+    __ fldmiad(SP, FloatRegisterSet(D0, fpu_save_size / 2), writeback);
+    if (!restore_R0) {
+      __ add(SP, SP, (R1_offset - fpu_save_size) * wordSize);  // step over the R0 slot without loading it
+    }
+  } else {
+    __ add(SP, SP, (restore_R0 ? fpu_save_size : R1_offset) * wordSize);  // skip the FPU area, plus the R0 slot unless R0 is restored
+  }
+  __ pop(RegisterSet((restore_R0 ? R0 : R1), R6) | RegisterSet(R8, R10) | R12 | altFP_7_11);
+  if (restore_FP_LR) {
+    __ pop(RegisterSet(FP) | RegisterSet(do_return ? PC : LR));  // popping into PC instead of LR is the return
+  } else {
+    assert (!do_return, "return without restoring FP/LR");
+  }
+#endif // AARCH64
+}
+
+// Convenience wrappers over restore_live_registers(sasm, restore_R0, restore_FP_LR, do_return, restore_fpu_registers).
+static void restore_live_registers_except_R0(StubAssembler* sasm, bool restore_fpu_registers = HaveVFP) {
+  restore_live_registers(sasm, false, true, true, restore_fpu_registers);  // restore_R0=false, restore_FP_LR=true, do_return=true
+}
+
+static void restore_live_registers(StubAssembler* sasm, bool restore_fpu_registers = HaveVFP) {
+  restore_live_registers(sasm, true, true, true, restore_fpu_registers);  // restore_R0=true, restore_FP_LR=true, do_return=true
+}
+
+#ifndef AARCH64
+static void restore_live_registers_except_FP_LR(StubAssembler* sasm, bool restore_fpu_registers = HaveVFP) {
+  restore_live_registers(sasm, true, false, false, restore_fpu_registers);  // restore_R0=true, restore_FP_LR=false, do_return=false
+}
+#endif // !AARCH64
+
+static void restore_live_registers_without_return(StubAssembler* sasm, bool restore_fpu_registers = HaveVFP) {
+  restore_live_registers(sasm, true, true, false, restore_fpu_registers);  // restore_R0=true, restore_FP_LR=true, do_return=false
+}
+
+// Sets LIR_Assembler::exception_handler_size per architecture; non-PRODUCT builds add extra room when VerifyOops is set.
+void Runtime1::initialize_pd() {
+  LIR_Assembler::exception_handler_size = AARCH64_ONLY(256) NOT_AARCH64(68);
+#ifndef PRODUCT
+  if (VerifyOops) LIR_Assembler::exception_handler_size += AARCH64_ONLY(216) NOT_AARCH64(60);
+#endif // !PRODUCT
+}
+
+// Emits a stub that saves all live registers, optionally loads one stack argument into R1 and calls 'target'; the call is not expected to return.
+OopMapSet* Runtime1::generate_exception_throw(StubAssembler* sasm, address target, bool has_argument) {
+  OopMap* oop_map = save_live_registers(sasm);
+
+  if (has_argument) {
+    __ ldr(R1, Address(SP, arg1_offset));  // the argument sits in the first slot above the register save area
+  }
+
+  int call_offset = __ call_RT(noreg, noreg, target);
+  OopMapSet* oop_maps = new OopMapSet();
+  oop_maps->add_gc_map(call_offset, oop_map);
+
+  DEBUG_ONLY(STOP("generate_exception_throw");)  // Should not reach here
+  return oop_maps;
+}
+
+// Emits code that copies Rmh_SP_save into SP when the thread's is_method_handle_return field is non-zero.
+static void restore_sp_for_method_handle(StubAssembler* sasm) {
+  // Restore SP from its saved reg (FP) if the exception PC is a MethodHandle call site.
+  __ ldr_s32(Rtemp, Address(Rthread, JavaThread::is_method_handle_return_offset()));
+#ifdef AARCH64
+  Label skip;
+  __ cbz(Rtemp, skip);  // flag is zero: leave SP untouched
+  __ mov(SP, Rmh_SP_save);
+  __ bind(skip);
+#else
+  __ cmp(Rtemp, 0);
+  __ mov(SP, Rmh_SP_save, ne);  // conditional move, executed only when the flag is non-zero
+#endif // AARCH64
+}
+
+// Emits the exception-handling stub for 'id': saves state as the stub kind requires, publishes exception oop/pc in the thread, calls exception_handler_for_pc and continues at the address it returns.
+OopMapSet* Runtime1::generate_handle_exception(StubID id, StubAssembler* sasm) {
+  __ block_comment("generate_handle_exception");
+
+  bool save_fpu_registers = false;
+
+  // Save registers, if required.
+  OopMapSet* oop_maps = new OopMapSet();
+  OopMap* oop_map = NULL;
+
+  switch (id) {
+  case forward_exception_id: {
+    save_fpu_registers = HaveVFP;
+    oop_map = generate_oop_map(sasm);  // map only, no save emitted here (presumably the frame was already saved by the stub that jumped here -- confirm)
+    __ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset()));
+    __ ldr(Rexception_pc, Address(SP, LR_offset * wordSize));
+    Register zero = __ zero_register(Rtemp);
+    __ str(zero, Address(Rthread, Thread::pending_exception_offset()));  // overwrite the pending-exception field with the zero register
+    break;
+  }
+  case handle_exception_id:
+    save_fpu_registers = HaveVFP;
+    // fall-through
+  case handle_exception_nofpu_id:
+    // At this point all registers MAY be live.
+    oop_map = save_live_registers(sasm, save_fpu_registers);
+    break;
+  case handle_exception_from_callee_id:
+    // At this point all registers except exception oop (R4/R19) and
+    // exception pc (R5/R20) are dead.
+    oop_map = save_live_registers(sasm);  // TODO it's not required to save all registers
+    break;
+  default:  ShouldNotReachHere();
+  }
+
+  __ str(Rexception_obj, Address(Rthread, JavaThread::exception_oop_offset()));
+  __ str(Rexception_pc, Address(Rthread, JavaThread::exception_pc_offset()));
+
+  __ str(Rexception_pc, Address(SP, LR_offset * wordSize)); // patch throwing pc into return address
+
+  int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, exception_handler_for_pc));
+  oop_maps->add_gc_map(call_offset, oop_map);
+
+  // Exception handler found
+  __ str(R0, Address(SP, LR_offset * wordSize)); // patch the return address
+
+  // Restore the registers that were saved at the beginning, remove
+  // frame and jump to the exception handler.
+  switch (id) {
+  case forward_exception_id:
+  case handle_exception_nofpu_id:
+  case handle_exception_id:
+    restore_live_registers(sasm, save_fpu_registers);
+    // Note: the restore live registers includes the jump to LR (patched to R0)
+    break;
+  case handle_exception_from_callee_id:
+    restore_live_registers_without_return(sasm); // must not jump immediately to handler
+    restore_sp_for_method_handle(sasm);
+    __ ret();
+    break;
+  default:  ShouldNotReachHere();
+  }
+
+  DEBUG_ONLY(STOP("generate_handle_exception");)  // Should not reach here
+
+  return oop_maps;
+}
+
+// Emits the unwind stub: asks SharedRuntime::exception_handler_for_return_address for the caller's handler (using LR) and jumps to the result in R0.
+void Runtime1::generate_unwind_exception(StubAssembler* sasm) {
+  // FP no longer used to find the frame start
+  // on entry, remove_frame() has already been called (restoring FP and LR)
+
+  // search the exception handler address of the caller (using the return address)
+  __ mov(c_rarg0, Rthread);
+  __ mov(Rexception_pc, LR);  // the return address doubles as the exception pc
+  __ mov(c_rarg1, LR);
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), c_rarg0, c_rarg1);
+
+  // Exception oop should be still in Rexception_obj and pc in Rexception_pc
+  // Jump to handler
+  __ verify_not_null_oop(Rexception_obj);
+
+  // JSR292 extension
+  restore_sp_for_method_handle(sasm);
+
+  __ jump(R0);  // R0 holds the value returned by the leaf call above
+}
+
+// Emits a patching stub: saves registers, calls 'target', then returns to the caller when R0 is zero or jumps to the deopt blob's unpack_with_reexecution entry otherwise.
+OopMapSet* Runtime1::generate_patching(StubAssembler* sasm, address target) {
+  OopMap* oop_map = save_live_registers(sasm);
+
+  // call the runtime patching routine, returns non-zero if nmethod got deopted.
+  int call_offset = __ call_RT(noreg, noreg, target);
+  OopMapSet* oop_maps = new OopMapSet();
+  oop_maps->add_gc_map(call_offset, oop_map);
+
+  DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob();
+  assert(deopt_blob != NULL, "deoptimization blob must have been created");
+
+  __ cmp_32(R0, 0);  // the flags are consumed after the register restore below
+
+#ifdef AARCH64
+  Label call_deopt;
+
+  restore_live_registers_without_return(sasm);
+  __ b(call_deopt, ne);  // non-zero result: go deoptimize
+  __ ret();
+
+  __ bind(call_deopt);
+#else
+  restore_live_registers_except_FP_LR(sasm);
+  __ pop(RegisterSet(FP) | RegisterSet(PC), eq);  // zero result: pop FP and return in one instruction
+
+  // Deoptimization needed
+  // TODO: ARM - no need to restore FP & LR because unpack_with_reexecution() stores them back
+  __ pop(RegisterSet(FP) | RegisterSet(LR));
+#endif // AARCH64
+
+  __ jump(deopt_blob->unpack_with_reexecution(), relocInfo::runtime_call_type, Rtemp);
+
+  DEBUG_ONLY(STOP("generate_patching");)  // Should not reach here
+  return oop_maps;
+}
+
+
+OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
+  const bool must_gc_arguments = true;
+  const bool dont_gc_arguments = false;
+
+  OopMapSet* oop_maps = NULL;
+  bool save_fpu_registers = HaveVFP;
+
+  switch (id) {
+    case forward_exception_id:
+      {
+        oop_maps = generate_handle_exception(id, sasm);
+        // does not return on ARM
+      }
+      break;
+
+#if INCLUDE_ALL_GCS
+    case g1_pre_barrier_slow_id:
+      {
+        // Input:
+        // - pre_val pushed on the stack
+
+        __ set_info("g1_pre_barrier_slow_id", dont_gc_arguments);
+
+        // save at least the registers that need saving if the runtime is called
+#ifdef AARCH64
+        __ raw_push(R0, R1);
+        __ raw_push(R2, R3);
+        const int nb_saved_regs = 4;
+#else // AARCH64
+        const RegisterSet saved_regs = RegisterSet(R0,R3) | RegisterSet(R12) | RegisterSet(LR);
+        const int nb_saved_regs = 6;
+        assert(nb_saved_regs == saved_regs.size(), "fix nb_saved_regs");
+        __ push(saved_regs);
+#endif // AARCH64
+
+        const Register r_pre_val_0  = R0; // must be R0, to be ready for the runtime call
+        const Register r_index_1    = R1;
+        const Register r_buffer_2   = R2;
+
+        Address queue_index(Rthread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                              SATBMarkQueue::byte_offset_of_index()));
+        Address buffer(Rthread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                         SATBMarkQueue::byte_offset_of_buf()));
+
+        Label done;
+        Label runtime;
+
+        __ ldr(r_index_1, queue_index);
+        __ ldr(r_pre_val_0, Address(SP, nb_saved_regs*wordSize));
+        __ ldr(r_buffer_2, buffer);
+
+        __ subs(r_index_1, r_index_1, wordSize);
+        __ b(runtime, lt);
+
+        __ str(r_index_1, queue_index);
+        __ str(r_pre_val_0, Address(r_buffer_2, r_index_1));
+
+        __ bind(done);
+
+#ifdef AARCH64
+        __ raw_pop(R2, R3);
+        __ raw_pop(R0, R1);
+#else // AARCH64
+        __ pop(saved_regs);
+#endif // AARCH64
+
+        __ ret();
+
+        __ bind(runtime);
+
+        save_live_registers(sasm);
+
+        assert(r_pre_val_0 == c_rarg0, "pre_val should be in R0");
+        __ mov(c_rarg1, Rthread);
+        __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), c_rarg0, c_rarg1);
+
+        restore_live_registers_without_return(sasm);
+
+        __ b(done);
+      }
+      break;
+    case g1_post_barrier_slow_id:
+      {
+        // Input:
+        // - store_addr, pushed on the stack
+
+        __ set_info("g1_post_barrier_slow_id", dont_gc_arguments);
+
+        BarrierSet* bs = Universe::heap()->barrier_set();
+        CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
+        Label done;
+        Label recheck;
+        Label runtime;
+
+        Address queue_index(Rthread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                              DirtyCardQueue::byte_offset_of_index()));
+        Address buffer(Rthread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                         DirtyCardQueue::byte_offset_of_buf()));
+
+        AddressLiteral cardtable((address)ct->byte_map_base);
+        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+        // save at least the registers that need saving if the runtime is called
+#ifdef AARCH64
+        __ raw_push(R0, R1);
+        __ raw_push(R2, R3);
+        const int nb_saved_regs = 4;
+#else // AARCH64
+        const RegisterSet saved_regs = RegisterSet(R0,R3) | RegisterSet(R12) | RegisterSet(LR);
+        const int nb_saved_regs = 6;
+        assert(nb_saved_regs == saved_regs.size(), "fix nb_saved_regs");
+        __ push(saved_regs);
+#endif // AARCH64
+
+        const Register r_card_addr_0 = R0; // must be R0 for the slow case
+        const Register r_obj_0 = R0;
+        const Register r_card_base_1 = R1;
+        const Register r_tmp2 = R2;
+        const Register r_index_2 = R2;
+        const Register r_buffer_3 = R3;
+        const Register tmp1 = Rtemp;
+
+        __ ldr(r_obj_0, Address(SP, nb_saved_regs*wordSize));
+        // Note: there is a comment in x86 code about not using
+        // ExternalAddress / lea, due to relocation not working
+        // properly for that address. Should be OK for arm, where we
+        // explicitly specify that 'cardtable' has a relocInfo::none
+        // type.
+        __ lea(r_card_base_1, cardtable);
+        __ add(r_card_addr_0, r_card_base_1, AsmOperand(r_obj_0, lsr, CardTableModRefBS::card_shift));
+
+        // first quick check without barrier
+        __ ldrb(r_tmp2, Address(r_card_addr_0));
+
+        __ cmp(r_tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
+        __ b(recheck, ne);
+
+        __ bind(done);
+
+#ifdef AARCH64
+        __ raw_pop(R2, R3);
+        __ raw_pop(R0, R1);
+#else // AARCH64
+        __ pop(saved_regs);
+#endif // AARCH64
+
+        __ ret();
+
+        __ bind(recheck);
+
+        __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp1);
+
+        // reload card state after the barrier that ensures the stored oop was visible
+        __ ldrb(r_tmp2, Address(r_card_addr_0));
+
+        assert(CardTableModRefBS::dirty_card_val() == 0, "adjust this code");
+        __ cbz(r_tmp2, done);
+
+        // storing region crossing non-NULL, card is clean.
+        // dirty card and log.
+
+        assert(0 == (int)CardTableModRefBS::dirty_card_val(), "adjust this code");
+        if (((intptr_t)ct->byte_map_base & 0xff) == 0) {
+          // Card table is aligned so the lowest byte of the table address base is zero.
+          __ strb(r_card_base_1, Address(r_card_addr_0));
+        } else {
+          __ strb(__ zero_register(r_tmp2), Address(r_card_addr_0));
+        }
+
+        __ ldr(r_index_2, queue_index);
+        __ ldr(r_buffer_3, buffer);
+
+        __ subs(r_index_2, r_index_2, wordSize);
+        __ b(runtime, lt); // go to runtime if now negative
+
+        __ str(r_index_2, queue_index);
+
+        __ str(r_card_addr_0, Address(r_buffer_3, r_index_2));
+
+        __ b(done);
+
+        __ bind(runtime);
+
+        save_live_registers(sasm);
+
+        assert(r_card_addr_0 == c_rarg0, "card_addr should be in R0");
+        __ mov(c_rarg1, Rthread);
+        __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), c_rarg0, c_rarg1);
+
+        restore_live_registers_without_return(sasm);
+
+        __ b(done);
+      }
+      break;
+#endif // INCLUDE_ALL_GCS
+    case new_instance_id:
+    case fast_new_instance_id:
+    case fast_new_instance_init_check_id:
+      {
+        const Register result = R0;
+        const Register klass  = R1;
+
+        if (UseTLAB && FastTLABRefill && id != new_instance_id) {
+          // We come here when TLAB allocation failed.
+          // In this case we either refill TLAB or allocate directly from eden.
+          Label retry_tlab, try_eden, slow_case, slow_case_no_pop;
+
+          // Make sure the class is fully initialized
+          if (id == fast_new_instance_init_check_id) {
+            __ ldrb(result, Address(klass, InstanceKlass::init_state_offset()));
+            __ cmp(result, InstanceKlass::fully_initialized);
+            __ b(slow_case_no_pop, ne);
+          }
+
+          // Free some temporary registers
+          const Register obj_size = R4;
+          const Register tmp1     = R5;
+          const Register tmp2     = LR;
+          const Register obj_end  = Rtemp;
+
+          __ raw_push(R4, R5, LR);
+
+          __ tlab_refill(result, obj_size, tmp1, tmp2, obj_end, try_eden, slow_case);
+
+          __ bind(retry_tlab);
+          __ ldr_u32(obj_size, Address(klass, Klass::layout_helper_offset()));
+          __ tlab_allocate(result, obj_end, tmp1, obj_size, slow_case);              // initializes result and obj_end
+          __ initialize_object(result, obj_end, klass, noreg /* len */, tmp1, tmp2,
+                               instanceOopDesc::header_size() * HeapWordSize, -1,
+                               /* is_tlab_allocated */ true);
+          __ raw_pop_and_ret(R4, R5);
+
+          __ bind(try_eden);
+          __ ldr_u32(obj_size, Address(klass, Klass::layout_helper_offset()));
+          __ eden_allocate(result, obj_end, tmp1, tmp2, obj_size, slow_case);        // initializes result and obj_end
+          __ incr_allocated_bytes(obj_size, tmp2);
+          __ initialize_object(result, obj_end, klass, noreg /* len */, tmp1, tmp2,
+                               instanceOopDesc::header_size() * HeapWordSize, -1,
+                               /* is_tlab_allocated */ false);
+          __ raw_pop_and_ret(R4, R5);
+
+          __ bind(slow_case);
+          __ raw_pop(R4, R5, LR);
+
+          __ bind(slow_case_no_pop);
+        }
+
+        OopMap* map = save_live_registers(sasm);
+        int call_offset = __ call_RT(result, noreg, CAST_FROM_FN_PTR(address, new_instance), klass);
+        oop_maps = new OopMapSet();
+        oop_maps->add_gc_map(call_offset, map);
+
+        // MacroAssembler::StoreStore useless (included in the runtime exit path)
+
+        restore_live_registers_except_R0(sasm);
+      }
+      break;
+
+    case counter_overflow_id:
+      {
+        OopMap* oop_map = save_live_registers(sasm);
+        __ ldr(R1, Address(SP, arg1_offset));
+        __ ldr(R2, Address(SP, arg2_offset));
+        int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, counter_overflow), R1, R2);
+        oop_maps = new OopMapSet();
+        oop_maps->add_gc_map(call_offset, oop_map);
+        restore_live_registers(sasm);
+      }
+      break;
+
+    case new_type_array_id:
+    case new_object_array_id:
+      {
+        if (id == new_type_array_id) {
+          __ set_info("new_type_array", dont_gc_arguments);
+        } else {
+          __ set_info("new_object_array", dont_gc_arguments);
+        }
+
+        const Register result = R0;
+        const Register klass  = R1;
+        const Register length = R2;
+
+        if (UseTLAB && FastTLABRefill) {
+          // We come here when TLAB allocation failed.
+          // In this case we either refill TLAB or allocate directly from eden.
+          Label retry_tlab, try_eden, slow_case, slow_case_no_pop;
+
+#ifdef AARCH64
+          __ mov_slow(Rtemp, C1_MacroAssembler::max_array_allocation_length);
+          __ cmp_32(length, Rtemp);
+#else
+          __ cmp_32(length, C1_MacroAssembler::max_array_allocation_length);
+#endif // AARCH64
+          __ b(slow_case_no_pop, hs);
+
+          // Free some temporary registers
+          const Register arr_size = R4;
+          const Register tmp1     = R5;
+          const Register tmp2     = LR;
+          const Register tmp3     = Rtemp;
+          const Register obj_end  = tmp3;
+
+          __ raw_push(R4, R5, LR);
+
+          __ tlab_refill(result, arr_size, tmp1, tmp2, tmp3, try_eden, slow_case);
+
+          __ bind(retry_tlab);
+          // Get the allocation size: round_up((length << (layout_helper & 0xff)) + header_size)
+          __ ldr_u32(tmp1, Address(klass, Klass::layout_helper_offset()));
+          __ mov(arr_size, MinObjAlignmentInBytesMask);
+          __ and_32(tmp2, tmp1, (unsigned int)(Klass::_lh_header_size_mask << Klass::_lh_header_size_shift));
+
+#ifdef AARCH64
+          __ lslv_w(tmp3, length, tmp1);
+          __ add(arr_size, arr_size, tmp3);
+#else
+          __ add(arr_size, arr_size, AsmOperand(length, lsl, tmp1));
+#endif // AARCH64
+
+          __ add(arr_size, arr_size, AsmOperand(tmp2, lsr, Klass::_lh_header_size_shift));
+          __ align_reg(arr_size, arr_size, MinObjAlignmentInBytes);
+
+          // tlab_allocate initializes result and obj_end, and preserves tmp2 which contains header_size
+          __ tlab_allocate(result, obj_end, tmp1, arr_size, slow_case);
+
+          assert_different_registers(result, obj_end, klass, length, tmp1, tmp2);
+          __ initialize_header(result, klass, length, tmp1);
+
+          __ add(tmp2, result, AsmOperand(tmp2, lsr, Klass::_lh_header_size_shift));
+          if (!ZeroTLAB) {
+            __ initialize_body(tmp2, obj_end, tmp1);
+          }
+
+          __ membar(MacroAssembler::StoreStore, tmp1);
+
+          __ raw_pop_and_ret(R4, R5);
+
+          __ bind(try_eden);
+          // Get the allocation size: round_up((length << (layout_helper & 0xff)) + header_size)
+          __ ldr_u32(tmp1, Address(klass, Klass::layout_helper_offset()));
+          __ mov(arr_size, MinObjAlignmentInBytesMask);
+          __ and_32(tmp2, tmp1, (unsigned int)(Klass::_lh_header_size_mask << Klass::_lh_header_size_shift));
+
+#ifdef AARCH64
+          __ lslv_w(tmp3, length, tmp1);
+          __ add(arr_size, arr_size, tmp3);
+#else
+          __ add(arr_size, arr_size, AsmOperand(length, lsl, tmp1));
+#endif // AARCH64
+
+          __ add(arr_size, arr_size, AsmOperand(tmp2, lsr, Klass::_lh_header_size_shift));
+          __ align_reg(arr_size, arr_size, MinObjAlignmentInBytes);
+
+          // eden_allocate destroys tmp2, so reload header_size after allocation
+          // eden_allocate initializes result and obj_end
+          __ eden_allocate(result, obj_end, tmp1, tmp2, arr_size, slow_case);
+          __ incr_allocated_bytes(arr_size, tmp2);
+          __ ldrb(tmp2, Address(klass, in_bytes(Klass::layout_helper_offset()) +
+                                       Klass::_lh_header_size_shift / BitsPerByte));
+          __ initialize_object(result, obj_end, klass, length, tmp1, tmp2, tmp2, -1, /* is_tlab_allocated */ false);
+          __ raw_pop_and_ret(R4, R5);
+
+          __ bind(slow_case);
+          __ raw_pop(R4, R5, LR);
+          __ bind(slow_case_no_pop);
+        }
+
+        OopMap* map = save_live_registers(sasm);
+        int call_offset;
+        if (id == new_type_array_id) {
+          call_offset = __ call_RT(result, noreg, CAST_FROM_FN_PTR(address, new_type_array), klass, length);
+        } else {
+          call_offset = __ call_RT(result, noreg, CAST_FROM_FN_PTR(address, new_object_array), klass, length);
+        }
+        oop_maps = new OopMapSet();
+        oop_maps->add_gc_map(call_offset, map);
+
+        // MacroAssembler::StoreStore useless (included in the runtime exit path)
+
+        restore_live_registers_except_R0(sasm);
+      }
+      break;
+
+    case new_multi_array_id:
+      {
+        __ set_info("new_multi_array", dont_gc_arguments);
+
+        // R0: klass
+        // R2: rank
+        // SP: address of 1st dimension
+        const Register result = R0;
+        OopMap* map = save_live_registers(sasm);
+
+        __ mov(R1, R0);
+        __ add(R3, SP, arg1_offset);
+        int call_offset = __ call_RT(result, noreg, CAST_FROM_FN_PTR(address, new_multi_array), R1, R2, R3);
+
+        oop_maps = new OopMapSet();
+        oop_maps->add_gc_map(call_offset, map);
+
+        // MacroAssembler::StoreStore useless (included in the runtime exit path)
+
+        restore_live_registers_except_R0(sasm);
+      }
+      break;
+
+    case register_finalizer_id:
+      {
+        __ set_info("register_finalizer", dont_gc_arguments);
+
+        // Do not call runtime if JVM_ACC_HAS_FINALIZER flag is not set
+        __ load_klass(Rtemp, R0);
+        __ ldr_u32(Rtemp, Address(Rtemp, Klass::access_flags_offset()));
+
+#ifdef AARCH64
+        Label L;
+        __ tbnz(Rtemp, exact_log2(JVM_ACC_HAS_FINALIZER), L);
+        __ ret();
+        __ bind(L);
+#else
+        __ tst(Rtemp, JVM_ACC_HAS_FINALIZER);
+        __ bx(LR, eq);
+#endif // AARCH64
+
+        // Call VM
+        OopMap* map = save_live_registers(sasm);
+        oop_maps = new OopMapSet();
+        int call_offset = __ call_RT(noreg, noreg,
+                                     CAST_FROM_FN_PTR(address, SharedRuntime::register_finalizer), R0);
+        oop_maps->add_gc_map(call_offset, map);
+        restore_live_registers(sasm);
+      }
+      break;
+
+    case throw_range_check_failed_id:
+      {
+        __ set_info("range_check_failed", dont_gc_arguments);
+        oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_range_check_exception), true);
+      }
+      break;
+
+    case throw_index_exception_id:
+      {
+        __ set_info("index_range_check_failed", dont_gc_arguments);
+#ifdef AARCH64
+        __ NOT_TESTED();
+#endif
+        oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_index_exception), true);
+      }
+      break;
+
+    case throw_div0_exception_id:
+      {
+        __ set_info("throw_div0_exception", dont_gc_arguments);
+        oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_div0_exception), false);
+      }
+      break;
+
+    case throw_null_pointer_exception_id:
+      {
+        __ set_info("throw_null_pointer_exception", dont_gc_arguments);
+        oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_null_pointer_exception), false);
+      }
+      break;
+
+    case handle_exception_nofpu_id:
+    case handle_exception_id:
+      {
+        __ set_info("handle_exception", dont_gc_arguments);
+        oop_maps = generate_handle_exception(id, sasm);
+      }
+      break;
+
+    case handle_exception_from_callee_id:
+      {
+        __ set_info("handle_exception_from_callee", dont_gc_arguments);
+        oop_maps = generate_handle_exception(id, sasm);
+      }
+      break;
+
+    case unwind_exception_id:
+      {
+        __ set_info("unwind_exception", dont_gc_arguments);
+        generate_unwind_exception(sasm);
+      }
+      break;
+
+    case throw_array_store_exception_id:
+      {
+        __ set_info("throw_array_store_exception", dont_gc_arguments);
+        oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_array_store_exception), true);
+      }
+      break;
+
+    case throw_class_cast_exception_id:
+      {
+        __ set_info("throw_class_cast_exception", dont_gc_arguments);
+        oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_class_cast_exception), true);
+      }
+      break;
+
+    case throw_incompatible_class_change_error_id:
+      {
+        __ set_info("throw_incompatible_class_cast_exception", dont_gc_arguments);
+#ifdef AARCH64
+        __ NOT_TESTED();
+#endif
+        oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_incompatible_class_change_error), false);
+      }
+      break;
+
+    case slow_subtype_check_id:
+      {
+        // (in)  R0 - sub, destroyed,
+        // (in)  R1 - super, not changed
+        // (out) R0 - result: 1 if check passed, 0 otherwise
+        __ raw_push(R2, R3, LR);
+
+        // Load an array of secondary_supers
+        __ ldr(R2, Address(R0, Klass::secondary_supers_offset()));
+        // Length goes to R3
+        __ ldr_s32(R3, Address(R2, Array<Klass*>::length_offset_in_bytes()));
+        __ add(R2, R2, Array<Klass*>::base_offset_in_bytes());
+
+        Label loop, miss;
+        __ bind(loop);
+        __ cbz(R3, miss);
+        __ ldr(LR, Address(R2, wordSize, post_indexed));
+        __ sub(R3, R3, 1);
+        __ cmp(LR, R1);
+        __ b(loop, ne);
+
+        // We get here if an equal cache entry is found
+        __ str(R1, Address(R0, Klass::secondary_super_cache_offset()));
+        __ mov(R0, 1);
+        __ raw_pop_and_ret(R2, R3);
+
+        // A cache entry not found - return false
+        __ bind(miss);
+        __ mov(R0, 0);
+        __ raw_pop_and_ret(R2, R3);
+      }
+      break;
+
+    case monitorenter_nofpu_id:
+      save_fpu_registers = false;
+      // fall through
+    case monitorenter_id:
+      {
+        __ set_info("monitorenter", dont_gc_arguments);
+        const Register obj  = R1;
+        const Register lock = R2;
+        OopMap* map = save_live_registers(sasm, save_fpu_registers);
+        __ ldr(obj, Address(SP, arg1_offset));
+        __ ldr(lock, Address(SP, arg2_offset));
+        int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, monitorenter), obj, lock);
+        oop_maps = new OopMapSet();
+        oop_maps->add_gc_map(call_offset, map);
+        restore_live_registers(sasm, save_fpu_registers);
+      }
+      break;
+
+    case monitorexit_nofpu_id:
+      save_fpu_registers = false;
+      // fall through
+    case monitorexit_id:
+      {
+        __ set_info("monitorexit", dont_gc_arguments);
+        const Register lock = R1;
+        OopMap* map = save_live_registers(sasm, save_fpu_registers);
+        __ ldr(lock, Address(SP, arg1_offset));
+        int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, monitorexit), lock);
+        oop_maps = new OopMapSet();
+        oop_maps->add_gc_map(call_offset, map);
+        restore_live_registers(sasm, save_fpu_registers);
+      }
+      break;
+
+    case deoptimize_id:
+      {
+        __ set_info("deoptimize", dont_gc_arguments);
+        OopMap* oop_map = save_live_registers(sasm);
+        const Register trap_request = R1;
+        __ ldr(trap_request, Address(SP, arg1_offset));
+        int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, deoptimize), trap_request);
+        oop_maps = new OopMapSet();
+        oop_maps->add_gc_map(call_offset, oop_map);
+        restore_live_registers_without_return(sasm);
+        DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob();
+        assert(deopt_blob != NULL, "deoptimization blob must have been created");
+        __ jump(deopt_blob->unpack_with_reexecution(), relocInfo::runtime_call_type, AARCH64_ONLY(Rtemp) NOT_AARCH64(noreg));
+      }
+      break;
+
+    case access_field_patching_id:
+      {
+        __ set_info("access_field_patching", dont_gc_arguments);
+        oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, access_field_patching));
+      }
+      break;
+
+    case load_klass_patching_id:
+      {
+        __ set_info("load_klass_patching", dont_gc_arguments);
+        oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_klass_patching));
+      }
+      break;
+
+    case load_appendix_patching_id:
+      {
+        __ set_info("load_appendix_patching", dont_gc_arguments);
+        oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_appendix_patching));
+      }
+      break;
+
+    case load_mirror_patching_id:
+      {
+        __ set_info("load_mirror_patching", dont_gc_arguments);
+        oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_mirror_patching));
+      }
+      break;
+
+    case predicate_failed_trap_id:
+      {
+        __ set_info("predicate_failed_trap", dont_gc_arguments);
+
+        OopMap* oop_map = save_live_registers(sasm);
+        int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, predicate_failed_trap));
+
+        oop_maps = new OopMapSet();
+        oop_maps->add_gc_map(call_offset, oop_map);
+
+        restore_live_registers_without_return(sasm);
+
+        DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob();
+        assert(deopt_blob != NULL, "deoptimization blob must have been created");
+        __ jump(deopt_blob->unpack_with_reexecution(), relocInfo::runtime_call_type, Rtemp);
+      }
+      break;
+
+    default:
+      {
+        __ set_info("unimplemented entry", dont_gc_arguments);
+        STOP("unimplemented entry");
+      }
+      break;
+  }
+  return oop_maps;
+}
+
+#undef __
+
+#ifdef __SOFTFP__
+const char *Runtime1::pd_name_for_address(address entry) {
+  // Returns the name of the soft-float helper located at 'entry', or a generic placeholder when none matches.
+#define FUNCTION_CASE(a, f) \
+  if ((intptr_t)(a) == CAST_FROM_FN_PTR(intptr_t, f))  return #f
+
+  FUNCTION_CASE(entry, __aeabi_fadd_glibc);
+  FUNCTION_CASE(entry, __aeabi_fmul);
+  FUNCTION_CASE(entry, __aeabi_fsub_glibc);
+  FUNCTION_CASE(entry, __aeabi_fdiv);
+
+  // __aeabi_XXXX_glibc: Imported code from glibc soft-fp bundle for calculation accuracy improvement. See CR 6757269.
+  FUNCTION_CASE(entry, __aeabi_dadd_glibc);
+  FUNCTION_CASE(entry, __aeabi_dmul);
+  FUNCTION_CASE(entry, __aeabi_dsub_glibc);
+  FUNCTION_CASE(entry, __aeabi_ddiv);
+
+  FUNCTION_CASE(entry, __aeabi_f2d);
+  FUNCTION_CASE(entry, __aeabi_d2f);
+  FUNCTION_CASE(entry, __aeabi_i2f);
+  FUNCTION_CASE(entry, __aeabi_i2d);
+  FUNCTION_CASE(entry, __aeabi_f2iz);
+
+  FUNCTION_CASE(entry, SharedRuntime::fcmpl);
+  FUNCTION_CASE(entry, SharedRuntime::fcmpg);
+  FUNCTION_CASE(entry, SharedRuntime::dcmpl);
+  FUNCTION_CASE(entry, SharedRuntime::dcmpg);
+
+  FUNCTION_CASE(entry, SharedRuntime::unordered_fcmplt);
+  FUNCTION_CASE(entry, SharedRuntime::unordered_dcmplt);
+  FUNCTION_CASE(entry, SharedRuntime::unordered_fcmple);
+  FUNCTION_CASE(entry, SharedRuntime::unordered_dcmple);
+  FUNCTION_CASE(entry, SharedRuntime::unordered_fcmpge);
+  FUNCTION_CASE(entry, SharedRuntime::unordered_dcmpge);
+  FUNCTION_CASE(entry, SharedRuntime::unordered_fcmpgt);
+  FUNCTION_CASE(entry, SharedRuntime::unordered_dcmpgt);
+
+  FUNCTION_CASE(entry, SharedRuntime::fneg);
+  FUNCTION_CASE(entry, SharedRuntime::dneg);
+
+  FUNCTION_CASE(entry, __aeabi_fcmpeq);
+  FUNCTION_CASE(entry, __aeabi_fcmplt);
+  FUNCTION_CASE(entry, __aeabi_fcmple);
+  FUNCTION_CASE(entry, __aeabi_fcmpge);
+  FUNCTION_CASE(entry, __aeabi_fcmpgt);
+
+  FUNCTION_CASE(entry, __aeabi_dcmpeq);
+  FUNCTION_CASE(entry, __aeabi_dcmplt);
+  FUNCTION_CASE(entry, __aeabi_dcmple);
+  FUNCTION_CASE(entry, __aeabi_dcmpge);
+  FUNCTION_CASE(entry, __aeabi_dcmpgt);
+#undef FUNCTION_CASE
+  return "<unknown function>";  // no helper matched; same placeholder as the non-soft-float variant
+}
+#else  // !__SOFTFP__
+const char *Runtime1::pd_name_for_address(address entry) {
+  return "<unknown function>";  // no platform-specific helper names without soft-float
+}
+#endif // __SOFTFP__
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c1_globals_arm.hpp	2016-12-02 11:19:22.009152127 -0500
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_C1_GLOBALS_ARM_HPP
+#define CPU_ARM_VM_C1_GLOBALS_ARM_HPP
+
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/macros.hpp"
+
+//
+// Sets the default values for platform dependent flags used by the client compiler.
+// (see c1_globals.hpp)
+//
+
+#ifndef COMPILER2 // skipped when COMPILER2 is defined: avoid duplicated definitions, favoring C2 version
+define_pd_global(bool, BackgroundCompilation,        true );
+define_pd_global(bool, UseTLAB,                      true );
+define_pd_global(bool, ResizeTLAB,                   true );
+define_pd_global(bool, InlineIntrinsics,             false); // TODO: ARM
+define_pd_global(bool, PreferInterpreterNativeStubs, false);
+define_pd_global(bool, ProfileTraps,                 false);
+define_pd_global(bool, UseOnStackReplacement,        true );
+define_pd_global(bool, TieredCompilation,            false);
+define_pd_global(intx, CompileThreshold,             1500 );
+
+define_pd_global(intx, OnStackReplacePercentage,     933  );
+define_pd_global(intx, FreqInlineSize,               325  );
+define_pd_global(size_t, NewSizeThreadIncrease,      4*K  );
+define_pd_global(size_t, InitialCodeCacheSize,       160*K);
+define_pd_global(size_t, ReservedCodeCacheSize,      32*M );
+define_pd_global(size_t, NonProfiledCodeHeapSize,    13*M );
+define_pd_global(size_t, ProfiledCodeHeapSize,       14*M );
+define_pd_global(size_t, NonNMethodCodeHeapSize,     5*M  );
+define_pd_global(bool, ProfileInterpreter,           false);
+define_pd_global(size_t, CodeCacheExpansionSize,     32*K );
+define_pd_global(uintx, CodeCacheMinBlockLength,     1);
+define_pd_global(size_t, CodeCacheMinimumUseSpace,   400*K);
+define_pd_global(size_t, MetaspaceSize,              12*M );
+define_pd_global(bool, NeverActAsServerClassMachine, true);
+define_pd_global(uint64_t, MaxRAM,                   1ULL*G);
+define_pd_global(bool, CICompileOSR,                 true );
+#endif // !COMPILER2
+define_pd_global(bool, UseTypeProfile,               false); // from here on: defined whether or not COMPILER2 is set
+define_pd_global(bool, RoundFPResults,               false);
+
+
+define_pd_global(bool, LIRFillDelaySlots,            false);
+define_pd_global(bool, OptimizeSinglePrecision,      true);
+define_pd_global(bool, CSEArrayLength,               true);
+define_pd_global(bool, TwoOperandLIRForm,            false);
+
+#endif // CPU_ARM_VM_C1_GLOBALS_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/c2_globals_arm.hpp	2016-12-02 11:19:27.641471534 -0500
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_C2_GLOBALS_ARM_HPP
+#define CPU_ARM_VM_C2_GLOBALS_ARM_HPP
+
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/macros.hpp"
+
+//
+// Sets the default values for platform dependent flags used by the server compiler.
+// (see c2_globals.hpp).  Only loosely alpha-sorted.
+
+define_pd_global(bool, BackgroundCompilation,        true);
+define_pd_global(bool, CICompileOSR,                 true);
+define_pd_global(bool, InlineIntrinsics,             false);
+define_pd_global(bool, PreferInterpreterNativeStubs, false);
+define_pd_global(bool, ProfileTraps,                 true);
+define_pd_global(bool, UseOnStackReplacement,        true);
+define_pd_global(bool, ProfileInterpreter,           true);
+#ifdef AARCH64
+define_pd_global(bool, TieredCompilation,            trueInTiered);
+#else // !AARCH64
+define_pd_global(bool, TieredCompilation,            false);
+#endif // AARCH64
+define_pd_global(intx, CompileThreshold,             10000);
+
+define_pd_global(intx, OnStackReplacePercentage,     140);
+define_pd_global(intx, ConditionalMoveLimit,         4);
+// C2 gets to use all the float/double registers
+#ifdef AARCH64
+define_pd_global(intx, FLOATPRESSURE,                31);
+#else // !AARCH64
+define_pd_global(intx, FLOATPRESSURE,                30);
+#endif // AARCH64
+define_pd_global(intx, FreqInlineSize,               175);
+#ifdef AARCH64
+define_pd_global(intx, INTPRESSURE,                  27);
+#else // !AARCH64
+define_pd_global(intx, INTPRESSURE,                  12);
+#endif // AARCH64
+define_pd_global(intx, InteriorEntryAlignment,       16);  // = CodeEntryAlignment
+define_pd_global(size_t, NewSizeThreadIncrease,      ScaleForWordSize(4*K));
+// The default setting 16/16 seems to work best.
+// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.)
+//define_pd_global(intx, OptoLoopAlignment,            16);  // = 4*wordSize
+define_pd_global(intx, RegisterCostAreaRatio,        16000);
+define_pd_global(bool, UseTLAB,                      true);
+define_pd_global(bool, ResizeTLAB,                   true);
+define_pd_global(intx, LoopUnrollLimit,              60); // Design center runs on 1.3.1
+define_pd_global(intx, LoopPercentProfileLimit,      10);
+define_pd_global(bool, PostLoopMultiversioning,      false); // boolean default, so typed bool rather than intx
+define_pd_global(intx, MinJumpTableSize,             16);
+
+// Peephole and CISC spilling both break the graph, and so makes the
+// scheduler sick.
+define_pd_global(bool, OptoPeephole,                 false);
+define_pd_global(bool, UseCISCSpill,                 false);
+define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoScheduling,               true);
+define_pd_global(bool, OptoRegScheduling,            false);
+define_pd_global(bool, SuperWordLoopUnrollAnalysis,  false);
+define_pd_global(bool, IdealizeClearArrayNode,       true);
+
+#ifdef _LP64
+// We need to make sure that all generated code is within
+// 2 gigs of the libjvm.so runtime routines so we can use
+// the faster "call" instruction rather than the expensive
+// sequence of instructions to load a 64 bit pointer.
+//
+// InitialCodeCacheSize derived from specjbb2000 run.
+define_pd_global(size_t, InitialCodeCacheSize,       2048*K); // Integral multiple of CodeCacheExpansionSize
+define_pd_global(size_t, ReservedCodeCacheSize,      48*M);
+define_pd_global(size_t, NonProfiledCodeHeapSize,    21*M);
+define_pd_global(size_t, ProfiledCodeHeapSize,       22*M);
+define_pd_global(size_t, NonNMethodCodeHeapSize,     5*M );
+define_pd_global(size_t, CodeCacheExpansionSize,     64*K);
+
+// Ergonomics related flags
+define_pd_global(uint64_t, MaxRAM,                   128ULL*G);
+#else // !_LP64
+// InitialCodeCacheSize derived from specjbb2000 run.
+define_pd_global(size_t, InitialCodeCacheSize,       1536*K); // Integral multiple of CodeCacheExpansionSize
+define_pd_global(size_t, ReservedCodeCacheSize,      32*M);
+define_pd_global(size_t, NonProfiledCodeHeapSize,    13*M);
+define_pd_global(size_t, ProfiledCodeHeapSize,       14*M);
+define_pd_global(size_t, NonNMethodCodeHeapSize,     5*M );
+define_pd_global(size_t, CodeCacheExpansionSize,     32*K);
+// Ergonomics related flags
+define_pd_global(uint64_t, MaxRAM,                   4ULL*G);
+#endif // _LP64
+define_pd_global(uintx, CodeCacheMinBlockLength,     4); // from here on: same value for both word sizes
+define_pd_global(size_t, CodeCacheMinimumUseSpace,   400*K);
+
+define_pd_global(bool,  TrapBasedRangeChecks,        false); // Not needed
+
+// Heap related flags
+define_pd_global(size_t, MetaspaceSize,              ScaleForWordSize(16*M));
+
+// Ergonomics related flags
+define_pd_global(bool, NeverActAsServerClassMachine, false);
+
+#endif // CPU_ARM_VM_C2_GLOBALS_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/codeBuffer_arm.hpp	2016-12-02 11:19:34.037834270 -0500
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_CODEBUFFER_ARM_HPP
+#define CPU_ARM_VM_CODEBUFFER_ARM_HPP
+
+private:
+  void pd_initialize() {}  // empty body: no platform-dependent initialization is performed here
+
+public:
+  void flush_bundle(bool start_new_bundle) {}  // empty body: the argument is ignored and nothing is flushed
+
+#endif // CPU_ARM_VM_CODEBUFFER_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/compiledIC_arm.cpp	2016-12-02 11:19:40.826219237 -0500
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "code/compiledIC.hpp"
+#include "code/icBuffer.hpp"
+#include "code/nativeInst.hpp"
+#include "code/nmethod.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/mutexLocker.hpp"
+#include "runtime/safepoint.hpp"
+
+// ----------------------------------------------------------------------------
+#if defined(COMPILER2) || INCLUDE_JVMCI
+#define __ _masm.
+// Emits the call stub (compiled java to interpreter) into cbuf; returns its start address, or NULL if it could not be started.
+address CompiledStaticCall::emit_to_interp_stub(CodeBuffer &cbuf, address mark) {
+  // Stub is fixed up when the corresponding call is converted from calling
+  // compiled code to calling interpreted code.
+  // set (empty), R9
+  // b -1
+
+  if (mark == NULL) {
+    mark = cbuf.insts_mark();  // no mark supplied: get mark within main instrs section
+  }
+
+  MacroAssembler _masm(&cbuf);
+
+  address base = __ start_a_stub(to_interp_stub_size());
+  if (base == NULL) {
+    return NULL;  // CodeBuffer::expand failed
+  }
+
+  // static stub relocation stores the instruction address of the call
+  __ relocate(static_stub_Relocation::spec(mark));
+
+  InlinedMetadata object_literal(NULL);
+  // single instruction, see NativeMovConstReg::next_instruction_address() in
+  // CompiledStaticCall::set_to_interpreted()
+  __ ldr_literal(Rmethod, object_literal);
+
+  __ set_inst_mark(); // Who uses this?
+
+  bool near_range = __ cache_fully_reachable();  // picks direct branch vs. indirect jump below
+  InlinedAddress dest((address)-1);
+  address branch_site = __ pc();
+  if (near_range) {
+    __ b(branch_site); // branch to its own address: special NativeJump -1 destination
+  } else {
+    // Can't trash LR, FP, or argument registers
+    __ indirect_jump(dest, Rtemp);
+  }
+  __ bind_literal(object_literal); // includes spec_for_immediate reloc
+  if (!near_range) {
+    __ bind_literal(dest); // special NativeJump -1 destination (only emitted for the indirect jump)
+  }
+
+  assert(__ pc() - base <= to_interp_stub_size(), "wrong stub size");  // emitted code must fit the reserved size
+
+  // Update current stubs pointer and restore code_end.
+  __ end_a_stub();
+  return base;
+}
+#undef __
+
+// size of C2 call stub, compiled java to interpreter (emit_to_interp_stub asserts it stays within this bound)
+int CompiledStaticCall::to_interp_stub_size() {
+  return 8 * NativeInstruction::instruction_size;
+}
+
+// Relocation entries for call stub, compiled java to interpreter.
+int CompiledStaticCall::reloc_to_interp_stub() {
+  return 10;  // 4 in emit_to_interp_stub + 1 in Java_Static_Call; NOTE(review): that sums to 5, not 10 - confirm which is intended
+}
+#endif // COMPILER2 || JVMCI
+
+void CompiledStaticCall::set_to_interpreted(methodHandle callee, address entry) {  // patch the stub with callee/entry, then point the call at it
+  address stub = find_stub();
+  guarantee(stub != NULL, "stub not found");
+
+  if (TraceICs) {
+    ResourceMark rm;
+    tty->print_cr("CompiledStaticCall@" INTPTR_FORMAT ": set_to_interpreted %s",
+                  p2i(instruction_address()),
+                  callee->name_and_sig_as_C_string());
+  }
+
+  // Creation also verifies the object.
+  NativeMovConstReg* method_holder = nativeMovConstReg_at(stub);
+  NativeJump*        jump          = nativeJump_at(method_holder->next_instruction_address());
+
+#ifdef ASSERT  // the stub must be either clean (0 / -1) or already hold exactly these values
+  // read the value once
+  volatile intptr_t data = method_holder->data();
+  volatile address destination = jump->jump_destination();
+  assert(data == 0 || data == (intptr_t)callee(),
+         "a) MT-unsafe modification of inline cache");
+  assert(destination == (address)-1 || destination == entry,
+         "b) MT-unsafe modification of inline cache");
+#endif
+
+  // Update stub.
+  method_holder->set_data((intptr_t)callee());
+  jump->set_jump_destination(entry);
+
+  // Update jump to call.
+  set_destination_mt_safe(stub);  // done last, after the stub contents have been written
+}
+
+void CompiledStaticCall::set_stub_to_clean(static_stub_Relocation* static_stub) {  // resets the stub's data and jump target
+  assert (CompiledIC_lock->is_locked() || SafepointSynchronize::is_at_safepoint(), "mt unsafe call");
+  // Reset stub.
+  address stub = static_stub->addr();
+  assert(stub != NULL, "stub not found");
+  // Creation also verifies the object.
+  NativeMovConstReg* method_holder = nativeMovConstReg_at(stub);
+  NativeJump*        jump          = nativeJump_at(method_holder->next_instruction_address());
+  method_holder->set_data(0);  // 0 is the "no callee" value that set_to_interpreted's assert accepts
+  jump->set_jump_destination((address)-1);  // -1 is the special "no destination" value used when the stub is emitted
+}
+
+//-----------------------------------------------------------------------------
+// Non-product mode code
+#ifndef PRODUCT
+
+void CompiledStaticCall::verify() {  // non-product check of the call, its stub, and its state
+  // Verify call.
+  NativeCall::verify();
+  if (os::is_MP()) {
+    verify_alignment();  // alignment is only checked when os::is_MP() is true
+  }
+
+  // Verify stub.
+  address stub = find_stub();
+  assert(stub != NULL, "no stub found for static call");
+  // Creation also verifies the object.
+  NativeMovConstReg* method_holder = nativeMovConstReg_at(stub);
+  NativeJump*        jump          = nativeJump_at(method_holder->next_instruction_address());  // not used again in this function
+
+  // Verify state.
+  assert(is_clean() || is_call_to_compiled() || is_call_to_interpreted(), "sanity check");
+}
+
+#endif // !PRODUCT
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/copy_arm.hpp	2016-12-02 11:19:46.310530250 -0500
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_COPY_ARM_HPP
+#define CPU_ARM_VM_COPY_ARM_HPP
+
+#include "utilities/macros.hpp"
+
+// Inline functions for memory copy and fill.
+
+// Contains inline asm implementations
+#include OS_CPU_HEADER_INLINE(copy)
+
+static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
+  juint* const dst = (juint*)tohw;  // the fill is done in juint-sized stores
+  const size_t n = count * (HeapWordSize / BytesPerInt);  // heap-word count converted to juint count
+  for (size_t i = 0; i < n; i++) {
+    dst[i] = value;
+  }
+}
+
+static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
+  pd_fill_to_words(tohw, count, value);  // no separate aligned path: forwards to the general word fill
+}
+
+static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
+  memset(to, value, count);  // sets 'count' bytes starting at 'to' to 'value'
+}
+
+static void pd_zero_to_words(HeapWord* tohw, size_t count) {
+  pd_fill_to_words(tohw, count, 0);  // zeroing is a word fill with the value 0
+}
+
+static void pd_zero_to_bytes(void* to, size_t count) {
+  memset(to, 0, count);  // clears 'count' bytes starting at 'to'
+}
+
+#endif // CPU_ARM_VM_COPY_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/debug_arm.cpp	2016-12-02 11:19:51.722837178 -0500
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "code/codeCache.hpp"
+#include "code/nmethod.hpp"
+#include "runtime/frame.hpp"
+#include "runtime/init.hpp"
+#include "runtime/os.hpp"
+#include "utilities/debug.hpp"
+
+void pd_ps(frame f) {}  // empty body: the frame argument is ignored and nothing is printed
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/depChecker_arm.cpp	2016-12-02 11:19:57.247150459 -0500
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "compiler/disassembler.hpp"
+#include "depChecker_arm.hpp"
+
+// Nothing to do
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/depChecker_arm.hpp	2016-12-02 11:20:02.907471453 -0500
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_DEPCHECKER_ARM_HPP
+#define CPU_ARM_VM_DEPCHECKER_ARM_HPP
+
+// Nothing to do
+
+#endif // CPU_ARM_VM_DEPCHECKER_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/disassembler_arm.hpp	2016-12-02 11:20:08.591793808 -0500
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_DISASSEMBLER_ARM_HPP
+#define CPU_ARM_VM_DISASSEMBLER_ARM_HPP
+
+  static int pd_instruction_alignment() {
+    return sizeof(int); // alignment is reported as the size of an int
+  }
+
+  static const char* pd_cpu_opts() {
+    return ""; // no CPU option text: always the empty string
+  }
+
+#endif // CPU_ARM_VM_DISASSEMBLER_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/frame_arm.cpp	2016-12-02 11:20:14.332119339 -0500
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "interpreter/interpreter.hpp"
+#include "memory/resourceArea.hpp"
+#include "oops/markOop.hpp"
+#include "oops/method.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/handles.inline.hpp"
+#include "runtime/javaCalls.hpp"
+#include "runtime/monitorChunk.hpp"
+#include "runtime/signature.hpp"
+#include "runtime/stubCodeGenerator.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "vmreg_arm.inline.hpp"
+#ifdef COMPILER1
+#include "c1/c1_Runtime1.hpp"
+#include "runtime/vframeArray.hpp"
+#endif
+#include "prims/methodHandles.hpp"
+
+#ifdef ASSERT
+void RegisterMap::check_location_valid() { // empty body: nothing is checked here
+}
+#endif
+
+
+// Profiling/safepoint support
+
+bool frame::safe_for_sender(JavaThread *thread) {  // every failed sanity check below makes this return false
+  address   sp = (address)_sp;
+  address   fp = (address)_fp;
+  address   unextended_sp = (address)_unextended_sp;
+
+  static size_t stack_guard_size = os::uses_stack_guard_pages() ?
+    (JavaThread::stack_red_zone_size() + JavaThread::stack_yellow_zone_size()) : 0;
+  size_t usable_stack_size = thread->stack_size() - stack_guard_size;
+
+  // sp must be within the usable part of the stack (not in guards)
+  bool sp_safe = (sp != NULL &&
+                 (sp <= thread->stack_base()) &&
+                 (sp >= thread->stack_base() - usable_stack_size));
+
+  if (!sp_safe) {
+    return false;
+  }
+
+  bool unextended_sp_safe = (unextended_sp != NULL &&
+                             (unextended_sp <= thread->stack_base()) &&
+                             (unextended_sp >= sp));
+  if (!unextended_sp_safe) {
+    return false;
+  }
+
+  // We know sp/unextended_sp are safe. Only fp is questionable here.
+
+  bool fp_safe = (fp != NULL &&
+                  (fp <= thread->stack_base()) &&
+                  fp >= sp);
+
+  if (_cb != NULL ) {
+
+    // First check if frame is complete and tester is reliable
+    // Unfortunately we can only check frame complete for runtime stubs and nmethod
+    // other generic buffer blobs are more problematic so we just assume they are
+    // ok. adapter blobs never have a frame complete and are never ok.
+
+    if (!_cb->is_frame_complete_at(_pc)) {
+      if (_cb->is_compiled() || _cb->is_adapter_blob() || _cb->is_runtime_stub()) {
+        return false;
+      }
+    }
+
+    // Could just be some random pointer within the codeBlob
+    if (!_cb->code_contains(_pc)) {
+      return false;
+    }
+
+    // Entry frame checks
+    if (is_entry_frame()) {
+      // an entry frame must have a valid fp.
+      return fp_safe && is_entry_frame_valid(thread);
+    }
+
+    intptr_t* sender_sp = NULL;
+    address   sender_pc = NULL;
+
+    if (is_interpreted_frame()) {
+      // fp must be safe
+      if (!fp_safe) {
+        return false;
+      }
+
+      sender_pc = (address) this->fp()[return_addr_offset];
+      sender_sp = (intptr_t*) addr_at(sender_sp_offset);
+
+    } else {
+      // must be some sort of compiled/runtime frame
+      // fp does not have to be safe (although it could be checked for c1?)
+
+      sender_sp = _unextended_sp + _cb->frame_size();
+      // Is sender_sp safe?
+      if ((address)sender_sp >= thread->stack_base()) {
+        return false;
+      }
+      // With our calling conventions, the return_address should
+      // end up being the word on the stack
+      sender_pc = (address) *(sender_sp - sender_sp_offset + return_addr_offset);
+    }
+
+    // We must always be able to find a recognizable pc
+    CodeBlob* sender_blob = CodeCache::find_blob_unsafe(sender_pc);
+    if (sender_pc == NULL || sender_blob == NULL) {
+      return false;
+    }
+
+
+    // If the potential sender is the interpreter then we can do some more checking
+    if (Interpreter::contains(sender_pc)) {
+
+      // FP is always saved in a recognizable place in any code we generate. However
+      // only if the sender is interpreted/call_stub (c1 too?) are we certain that the saved FP
+      // is really a frame pointer.
+
+      intptr_t *saved_fp = (intptr_t*)*(sender_sp - frame::sender_sp_offset + link_offset);
+      bool saved_fp_safe = ((address)saved_fp <= thread->stack_base()) && (saved_fp > sender_sp);
+
+      if (!saved_fp_safe) {
+        return false;
+      }
+
+      // construct the potential sender
+
+      frame sender(sender_sp, saved_fp, sender_pc);
+
+      return sender.is_interpreted_frame_valid(thread);
+    }
+
+    if (sender_blob->is_zombie() || sender_blob->is_unloaded()) {
+      return false;
+    }
+
+    // Could just be some random pointer within the codeBlob
+    if (!sender_blob->code_contains(sender_pc)) {
+      return false;
+    }
+
+    // We should never be able to see an adapter if the current frame is something from code cache
+    if (sender_blob->is_adapter_blob()) {
+      return false;
+    }
+
+    // Could be the call_stub
+    if (StubRoutines::returns_to_call_stub(sender_pc)) {
+      intptr_t *saved_fp = (intptr_t*)*(sender_sp - frame::sender_sp_offset + link_offset);
+      bool saved_fp_safe = ((address)saved_fp <= thread->stack_base()) && (saved_fp >= sender_sp);
+
+      if (!saved_fp_safe) {
+        return false;
+      }
+
+      // construct the potential sender
+
+      frame sender(sender_sp, saved_fp, sender_pc);
+
+      // Validate the JavaCallWrapper an entry frame must have
+      address jcw = (address)sender.entry_frame_call_wrapper();
+
+      bool jcw_safe = (jcw <= thread->stack_base()) && (jcw > (address)sender.fp());
+
+      return jcw_safe;
+    }
+
+    // If the frame size is 0 (or less) something is bad because every nmethod has a non-zero frame size
+    // because the return address counts against the callee's frame.
+
+    if (sender_blob->frame_size() <= 0) {
+      assert(!sender_blob->is_compiled(), "should count return address at least");
+      return false;
+    }
+
+    // We should never be able to see anything here except an nmethod. If something in the
+    // code cache (current frame) is called by an entity within the code cache that entity
+    // should not be anything but the call stub (already covered), the interpreter (already covered)
+    // or an nmethod.
+
+    if (!sender_blob->is_compiled()) {
+      return false;
+    }
+
+    // Could put some more validation for the potential non-interpreted sender
+    // frame we'd create by calling sender if I could think of any. Wait for next crash in forte...
+
+    // One idea is seeing if the sender_pc we have is one that we'd expect to call to current cb
+
+    // We've validated the potential sender that would be created
+    return true;
+  }
+
+  // Must be native-compiled frame. Since sender will try and use fp to find
+  // linkages it must be safe
+
+  if (!fp_safe) {
+    return false;
+  }
+
+  // Will the pc we fetch be non-zero (which we'll find at the oldest frame)
+
+  if ((address) this->fp()[return_addr_offset] == NULL) return false;
+
+
+  // could try and do some more potential verification of native frame if we could think of some...
+
+  return true;
+}
+
+
+void frame::patch_pc(Thread* thread, address pc) {  // store pc in this frame's return-address slot, then resync _cb/_pc/_deopt_state
+  address* pc_addr = &((address *)sp())[-sender_sp_offset+return_addr_offset];  // slot is addressed relative to sp()
+  if (TracePcPatching) {
+    tty->print_cr("patch_pc at address " INTPTR_FORMAT " [" INTPTR_FORMAT " -> " INTPTR_FORMAT "] ",
+                  p2i(pc_addr), p2i(*pc_addr), p2i(pc));
+  }
+  *pc_addr = pc;
+  _cb = CodeCache::find_blob(pc);  // the blob is looked up again for the new pc
+  address original_pc = CompiledMethod::get_deopt_original_pc(this);
+  if (original_pc != NULL) {
+    assert(original_pc == _pc, "expected original PC to be stored before patching");
+    _deopt_state = is_deoptimized;
+    // leave _pc as is
+  } else {
+    _deopt_state = not_deoptimized;
+    _pc = pc;
+  }
+}
+
+bool frame::is_interpreted_frame() const  {
+  return Interpreter::contains(pc()); // decided purely by asking Interpreter::contains about pc()
+}
+
+int frame::frame_size(RegisterMap* map) const {
+  // pointer difference between the sender's sp() and this frame's sp()
+  return sender(map).sp() - sp();
+}
+
+intptr_t* frame::entry_frame_argument_at(int offset) const {
+  assert(is_entry_frame(), "entry frame expected");
+  // turn the expression-stack byte offset into a word index
+  const int slot = Interpreter::expr_offset_in_bytes(offset) / wordSize;
+  // arguments of an entry frame are addressed from unextended_sp()
+  return unextended_sp() + slot;
+}
+
+// sender_sp: value held in the interpreter frame's sender-sp slot
+intptr_t* frame::interpreter_frame_sender_sp() const {
+  assert(is_interpreted_frame(), "interpreted frame expected");
+  return (intptr_t*) at(interpreter_frame_sender_sp_offset);
+}
+
+void frame::set_interpreter_frame_sender_sp(intptr_t* sender_sp) {
+  assert(is_interpreted_frame(), "interpreted frame expected");
+  ptr_at_put(interpreter_frame_sender_sp_offset, (intptr_t) sender_sp); // overwrite the sender-sp slot
+}
+
+
+// monitor elements
+
+BasicObjectLock* frame::interpreter_frame_monitor_begin() const {
+  return (BasicObjectLock*) addr_at(interpreter_frame_monitor_block_bottom_offset); // the slot's address itself, not its contents
+}
+
+BasicObjectLock* frame::interpreter_frame_monitor_end() const {
+  BasicObjectLock* result = (BasicObjectLock*) *addr_at(interpreter_frame_monitor_block_top_offset); // pointer stored in the block-top slot
+  // make sure the pointer points inside the frame
+  assert((intptr_t) fp() >  (intptr_t) result, "result must <  than frame pointer");
+  assert((intptr_t) sp() <= (intptr_t) result, "result must >= than stack pointer");
+  return result;
+}
+
+void frame::interpreter_frame_set_monitor_end(BasicObjectLock* value) {
+  *((BasicObjectLock**)addr_at(interpreter_frame_monitor_block_top_offset)) = value; // writes the slot that interpreter_frame_monitor_end() reads
+}
+
+#ifdef AARCH64
+
+// Template interpreter deoptimization support: overwrite the stack_top slot
+void frame::interpreter_frame_set_stack_top(intptr_t* stack_top) {
+  ptr_at_put(interpreter_frame_stack_top_offset, (intptr_t) stack_top);
+}
+
+// Template interpreter deoptimization support: overwrite the extended_sp slot
+void frame::interpreter_frame_set_extended_sp(intptr_t* sp) {
+  ptr_at_put(interpreter_frame_extended_sp_offset, (intptr_t) sp);
+}
+
+#else
+
+// Template interpreter deoptimization support: overwrite the last_sp slot
+void frame::interpreter_frame_set_last_sp(intptr_t* sp) {
+  ptr_at_put(interpreter_frame_last_sp_offset, (intptr_t) sp);
+}
+
+#endif // AARCH64
+
+frame frame::sender_for_entry_frame(RegisterMap* map) const {
+  assert(map != NULL, "map must be set");
+  // Java frame called from C; skip all C frames and return top C
+  // frame of that chunk as the sender
+  JavaFrameAnchor* jfa = entry_frame_call_wrapper()->anchor();  // anchor comes from this entry frame's call wrapper
+  assert(!entry_frame_is_first(), "next Java fp must be non zero");
+  assert(jfa->last_Java_sp() > sp(), "must be above this frame on stack");
+  map->clear();
+  assert(map->include_argument_oops(), "should be set by clear");
+#ifdef AARCH64
+  assert (jfa->last_Java_pc() != NULL, "pc should be stored");
+  frame fr(jfa->last_Java_sp(), jfa->last_Java_fp(), jfa->last_Java_pc());
+  return fr;
+#else
+  if (jfa->last_Java_pc() != NULL) {  // a recorded pc is passed along explicitly
+    frame fr(jfa->last_Java_sp(), jfa->last_Java_fp(), jfa->last_Java_pc());
+    return fr;
+  }
+  frame fr(jfa->last_Java_sp(), jfa->last_Java_fp());  // no pc recorded: use the (sp, fp) constructor
+  return fr;
+#endif // AARCH64
+}
+
+//------------------------------------------------------------------------------
+// frame::verify_deopt_original_pc
+//
+// Verifies the calculated original PC of a deoptimization PC for the
+// given unextended SP.  The unextended SP might also be the saved SP
+// for MethodHandle call sites.
+#ifdef ASSERT
+void frame::verify_deopt_original_pc(CompiledMethod* nm, intptr_t* unextended_sp, bool is_method_handle_return) {
+  frame fr;  // scratch frame: only _unextended_sp is filled in below
+
+  // This is ugly but it's better than to change {get,set}_original_pc
+  // to take an SP value as argument.  And it's only a debugging
+  // method anyway.
+  fr._unextended_sp = unextended_sp;
+
+  address original_pc = nm->get_original_pc(&fr);
+  assert(nm->insts_contains(original_pc), "original PC must be in nmethod");
+  assert(nm->is_method_handle_return(original_pc) == is_method_handle_return, "must be");
+}
+#endif
+
+//------------------------------------------------------------------------------
+// frame::adjust_unextended_sp
+void frame::adjust_unextended_sp() {  // may replace _unextended_sp with _fp, depending on how _cb's compiled method classifies _pc
+  // same as on x86
+
+  // If we are returning to a compiled MethodHandle call site, the
+  // saved_fp will in fact be a saved value of the unextended SP.  The
+  // simplest way to tell whether we are returning to such a call site
+  // is as follows:
+
+  CompiledMethod* sender_cm = (_cb == NULL) ? NULL : _cb->as_compiled_method_or_null();
+  if (sender_cm != NULL) {
+    // If the sender PC is a deoptimization point, get the original
+    // PC.  For MethodHandle call site the unextended_sp is stored in
+    // saved_fp.
+    if (sender_cm->is_deopt_mh_entry(_pc)) {
+      DEBUG_ONLY(verify_deopt_mh_original_pc(sender_cm, _fp));
+      _unextended_sp = _fp;
+    }
+    else if (sender_cm->is_deopt_entry(_pc)) {
+      DEBUG_ONLY(verify_deopt_original_pc(sender_cm, _unextended_sp));  // verification only; _unextended_sp is left unchanged
+    }
+    else if (sender_cm->is_method_handle_return(_pc)) {
+      _unextended_sp = _fp;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// frame::update_map_with_saved_link
+void frame::update_map_with_saved_link(RegisterMap* map, intptr_t** link_addr) {  // record link_addr as FP's location in the map
+  // see x86 for comments
+  map->set_location(FP->as_VMReg(), (address) link_addr);
+#ifdef AARCH64
+  // also adjust a high part of register
+  map->set_location(FP->as_VMReg()->next(), (address) link_addr);
+#endif // AARCH64
+}
+
+frame frame::sender_for_interpreter_frame(RegisterMap* map) const {
+  // SP is the raw SP from the sender after adapter or interpreter
+  // extension.
+  intptr_t* sender_sp = this->sender_sp();
+
+  // This is the sp before any possible extension (adapter/locals).
+  intptr_t* unextended_sp = interpreter_frame_sender_sp();
+
+#ifdef COMPILER2
+  if (map->update_map()) {
+    update_map_with_saved_link(map, (intptr_t**) addr_at(link_offset));  // only done in COMPILER2 builds
+  }
+#endif // COMPILER2
+
+  return frame(sender_sp, unextended_sp, link(), sender_pc());  // sender built from raw sp, pre-extension sp, link() and sender pc
+}
+
+frame frame::sender_for_compiled_frame(RegisterMap* map) const {
+  assert(map != NULL, "map must be set");
+
+  // frame owned by optimizing compiler
+  assert(_cb->frame_size() >= 0, "must have non-zero frame size");  // NOTE(review): condition admits 0 although the message says non-zero
+  intptr_t* sender_sp = unextended_sp() + _cb->frame_size();
+  intptr_t* unextended_sp = sender_sp;
+
+  address sender_pc = (address) *(sender_sp - sender_sp_offset + return_addr_offset);
+
+  // This is the saved value of FP which may or may not really be an FP.
+  // It is only an FP if the sender is an interpreter frame (or C1?).
+  intptr_t** saved_fp_addr = (intptr_t**) (sender_sp - sender_sp_offset + link_offset);
+
+  if (map->update_map()) {
+    // Tell GC to use argument oopmaps for some runtime stubs that need it.
+    // For C1, the runtime stub might not have oop maps, so set this flag
+    // outside of update_register_map.
+    map->set_include_argument_oops(_cb->caller_must_gc_arguments(map->thread()));
+    if (_cb->oop_maps() != NULL) {
+      OopMapSet::update_register_map(this, map);
+    }
+
+    // Since the prolog does the save and restore of FP there is no oopmap
+    // for it so we must fill in its location as if there was an oopmap entry
+    // since if our caller was compiled code there could be live jvm state in it.
+    update_map_with_saved_link(map, saved_fp_addr);
+  }
+
+  assert(sender_sp != sp(), "must have changed");
+  return frame(sender_sp, unextended_sp, *saved_fp_addr, sender_pc);
+}
+
+frame frame::sender(RegisterMap* map) const {
+  // Default is we don't have to follow them. The sender_for_xxx will
+  // update it accordingly
+  map->set_include_argument_oops(false);
+
+  if (is_entry_frame())       return sender_for_entry_frame(map);
+  if (is_interpreted_frame()) return sender_for_interpreter_frame(map);
+  assert(_cb == CodeCache::find_blob(pc()),"Must be the same");
+
+  if (_cb != NULL) {
+    return sender_for_compiled_frame(map);
+  }
+
+  assert(false, "should not be called for a C frame");
+  return frame();  // reached only when the assert above is compiled out
+}
+
+bool frame::is_interpreted_frame_valid(JavaThread* thread) const {
+  assert(is_interpreted_frame(), "Not an interpreted frame");
+  // These are reasonable sanity checks
+  if (fp() == 0 || (intptr_t(fp()) & (wordSize-1)) != 0) {  // fp must be non-null and word aligned
+    return false;
+  }
+  if (sp() == 0 || (intptr_t(sp()) & (wordSize-1)) != 0) {  // same requirement for sp
+    return false;
+  }
+  if (fp() + interpreter_frame_initial_sp_offset < sp()) {
+    return false;
+  }
+  // These are hacks to keep us out of trouble.
+  // The problem with these is that they mask other problems
+  if (fp() <= sp()) {        // this attempts to deal with unsigned comparison above
+    return false;
+  }
+  // do some validation of frame elements
+
+  // first the method
+
+  Method* m = *interpreter_frame_method_addr();
+
+  // validate the method we'd find in this potential sender
+  if (!m->is_valid_method()) return false;
+
+  // stack frames shouldn't be much larger than max_stack elements
+
+  if (fp() - sp() > 1024 + m->max_stack()*Interpreter::stackElementSize) {
+    return false;
+  }
+
+  // validate bci/bcp
+
+  address bcp = interpreter_frame_bcp();
+  if (m->validate_bci_from_bcp(bcp) < 0) {
+    return false;
+  }
+
+  // validate ConstantPoolCache*
+  ConstantPoolCache* cp = *interpreter_frame_cache_addr();
+  if (cp == NULL || !cp->is_metaspace_object()) return false;
+
+  // validate locals
+
+  address locals =  (address) *interpreter_frame_locals_addr();
+
+  if (locals > thread->stack_base() || locals < (address) fp()) return false;
+
+  // We'd have to be pretty unlucky to be misled at this point
+
+  return true;
+}
+
+BasicType frame::interpreter_frame_result(oop* oop_result, jvalue* value_result) {  // copies the result into *oop_result or *value_result; returns the method's result type
+  assert(is_interpreted_frame(), "interpreted frame expected");
+  Method* method = interpreter_frame_method();
+  BasicType type = method->result_type();
+
+  intptr_t* res_addr;  // where the raw result is read from; chosen below
+  if (method->is_native()) {
+    // Prior to calling into the runtime to report the method_exit both of
+    // the possible return value registers are saved.
+#ifdef AARCH64
+    // Return value registers are saved into the frame
+    if (type == T_FLOAT || type == T_DOUBLE) {
+      res_addr = addr_at(interpreter_frame_fp_saved_result_offset);
+    } else {
+      res_addr = addr_at(interpreter_frame_gp_saved_result_offset);
+    }
+#else
+    // Return value registers are pushed to the native stack
+    res_addr = (intptr_t*)sp();
+#ifdef __ABI_HARD__
+    // FP result is pushed onto a stack along with integer result registers
+    if (type == T_FLOAT || type == T_DOUBLE) {
+      res_addr += 2;
+    }
+#endif // __ABI_HARD__
+#endif // AARCH64
+  } else {
+    res_addr = (intptr_t*)interpreter_frame_tos_address();
+  }
+
+  switch (type) {
+    case T_OBJECT  :
+    case T_ARRAY   : {
+      oop obj;
+      if (method->is_native()) {
+        obj = cast_to_oop(at(interpreter_frame_oop_temp_offset));  // native methods: oop comes from the oop_temp slot, not res_addr
+      } else {
+        obj = *(oop*)res_addr;
+      }
+      assert(obj == NULL || Universe::heap()->is_in(obj), "sanity check");
+      *oop_result = obj;
+      break;
+    }
+    case T_BOOLEAN : value_result->z = *(jboolean*)res_addr; break;
+    case T_BYTE    : value_result->b = *(jbyte*)res_addr; break;
+    case T_CHAR    : value_result->c = *(jchar*)res_addr; break;
+    case T_SHORT   : value_result->s = *(jshort*)res_addr; break;
+    case T_INT     : value_result->i = *(jint*)res_addr; break;
+    case T_LONG    : value_result->j = *(jlong*)res_addr; break;
+    case T_FLOAT   : value_result->f = *(jfloat*)res_addr; break;
+    case T_DOUBLE  : value_result->d = *(jdouble*)res_addr; break;
+    case T_VOID    : /* Nothing to do */ break;
+    default        : ShouldNotReachHere();
+  }
+
+  return type;
+}
+
+
+intptr_t* frame::interpreter_frame_tos_at(jint offset) const {
+  const int slot = Interpreter::expr_offset_in_bytes(offset) / wordSize; // byte offset -> word index
+  return interpreter_frame_tos_address() + slot;
+}
+
+#ifndef PRODUCT
+
+#define DESCRIBE_FP_OFFSET(name) \
+  values.describe(frame_no, fp() + frame::name##_offset, #name)
+
+void frame::describe_pd(FrameValues& values, int frame_no) {  // labels fp()-relative interpreter slots; does nothing for other frames
+  if (is_interpreted_frame()) {
+    DESCRIBE_FP_OFFSET(interpreter_frame_sender_sp);
+#ifdef AARCH64
+    DESCRIBE_FP_OFFSET(interpreter_frame_stack_top);
+    DESCRIBE_FP_OFFSET(interpreter_frame_extended_sp);
+#else
+    DESCRIBE_FP_OFFSET(interpreter_frame_last_sp);
+#endif // AARCH64
+    DESCRIBE_FP_OFFSET(interpreter_frame_method);
+    DESCRIBE_FP_OFFSET(interpreter_frame_mdp);
+    DESCRIBE_FP_OFFSET(interpreter_frame_cache);
+    DESCRIBE_FP_OFFSET(interpreter_frame_locals);
+    DESCRIBE_FP_OFFSET(interpreter_frame_bcp);
+    DESCRIBE_FP_OFFSET(interpreter_frame_initial_sp);
+  }
+}
+
+// This is a generic constructor which is only used by pns() in debug.cpp.
+frame::frame(void* sp, void* fp, void* pc) {
+  init((intptr_t*)sp, (intptr_t*)fp, (address)pc);  // cast the untyped pointers and delegate to init()
+}
+#endif
+
+intptr_t *frame::initial_deoptimization_info() {
+  // used to reset the saved FP
+  return fp();  // the value handed out is simply this frame's fp()
+}
+
+intptr_t* frame::real_fp() const {  // order of preference: entry-frame fix-up (non-AARCH64), unextended_sp() + blob frame size, then fp()
+#ifndef AARCH64
+  if (is_entry_frame()) {
+    // Work-around: FP (currently) does not conform to the ABI for entry
+    // frames (see generate_call_stub). Might be worth fixing as another CR.
+    // Following code assumes (and asserts) this has not yet been fixed.
+    assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code");
+    intptr_t* new_fp = fp();
+    new_fp += 5; // saved R0,R1,R2,R4,R10
+#ifndef __SOFTFP__
+    new_fp += 8*2; // saved D8..D15
+#endif
+    return new_fp;
+  }
+#endif // !AARCH64
+  if (_cb != NULL) {
+    // use the frame size if valid
+    int size = _cb->frame_size();
+    if (size > 0) {
+      return unextended_sp() + size;
+    }
+  }
+  // else rely on fp()
+  assert(! is_compiled_frame(), "unknown compiled frame size");
+  return fp();
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/frame_arm.hpp	2016-12-02 11:20:19.960438520 -0500
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_FRAME_ARM_HPP
+#define CPU_ARM_VM_FRAME_ARM_HPP
+
+#include "runtime/synchronizer.hpp"
+
+ public:
+  enum {
+    pc_return_offset                                 =  0,
+    // All frames
+    link_offset                                      =  0,
+    return_addr_offset                               =  1,
+    // non-interpreter frames
+    sender_sp_offset                                 =  2,
+
+    // Interpreter frames
+#ifdef AARCH64
+    interpreter_frame_gp_saved_result_offset         =  4, // for native calls only
+    interpreter_frame_fp_saved_result_offset         =  3, // for native calls only
+#endif
+    interpreter_frame_oop_temp_offset                =  2, // for native calls only
+
+    // from here on each offset is the previous one minus 1
+    interpreter_frame_sender_sp_offset               = -1,
+#ifdef AARCH64
+    interpreter_frame_stack_top_offset               = interpreter_frame_sender_sp_offset - 1,
+    interpreter_frame_extended_sp_offset             = interpreter_frame_stack_top_offset - 1,
+    interpreter_frame_method_offset                  = interpreter_frame_extended_sp_offset - 1,
+#else
+    // outgoing sp before a call to an invoked method
+    interpreter_frame_last_sp_offset                 = interpreter_frame_sender_sp_offset - 1,
+    interpreter_frame_method_offset                  = interpreter_frame_last_sp_offset - 1,
+#endif // AARCH64
+    interpreter_frame_mirror_offset                  = interpreter_frame_method_offset - 1,
+    interpreter_frame_mdp_offset                     = interpreter_frame_mirror_offset - 1,
+    interpreter_frame_cache_offset                   = interpreter_frame_mdp_offset - 1,
+    interpreter_frame_locals_offset                  = interpreter_frame_cache_offset - 1,
+    interpreter_frame_bcp_offset                     = interpreter_frame_locals_offset - 1,
+    interpreter_frame_initial_sp_offset              = interpreter_frame_bcp_offset - 1,
+
+    interpreter_frame_monitor_block_top_offset       = interpreter_frame_initial_sp_offset, // same offset as initial_sp
+    interpreter_frame_monitor_block_bottom_offset    = interpreter_frame_initial_sp_offset, // same offset as initial_sp
+
+    // Entry frames
+    entry_frame_call_wrapper_offset                  =  AARCH64_ONLY(2) NOT_AARCH64(0)
+  };
+
+  intptr_t ptr_at(int offset) const {
+    return *ptr_at_addr(offset); // load the word at the slot ptr_at_addr() resolves to
+  }
+
+  void ptr_at_put(int offset, intptr_t value) {
+    *ptr_at_addr(offset) = value; // store counterpart of ptr_at()
+  }
+
+ private:
+  // an additional field beyond _sp and _pc:
+  intptr_t* _fp; // frame pointer
+  // The interpreter and adapters will extend the frame of the caller.
+  // Since oopMaps are based on the sp of the caller before extension
+  // we need to know that value. However in order to compute the address
+  // of the return address we need the real "raw" sp. Since sparc already
+  // uses sp() to mean "raw" sp and unextended_sp() to mean the caller's
+  // original sp we use that convention.
+
+  intptr_t* _unextended_sp;
+  void adjust_unextended_sp();
+
+  intptr_t* ptr_at_addr(int offset) const {
+    return (intptr_t*) addr_at(offset); // addr_at() result viewed as an intptr_t slot
+  }
+
+#ifdef ASSERT
+  // Debug-only helpers; frame::adjust_unextended_sp calls them under DEBUG_ONLY
+  static void verify_deopt_original_pc(   CompiledMethod* nm, intptr_t* unextended_sp, bool is_method_handle_return = false);
+  static void verify_deopt_mh_original_pc(CompiledMethod* nm, intptr_t* unextended_sp) {
+    verify_deopt_original_pc(nm, unextended_sp, true); // same check with is_method_handle_return forced to true
+  }
+#endif
+
+ public:
+  // Constructors
+
+  frame(intptr_t* sp, intptr_t* fp, address pc);
+
+  frame(intptr_t* sp, intptr_t* unextended_sp, intptr_t* fp, address pc);
+
+#ifndef AARCH64
+  frame(intptr_t* sp, intptr_t* fp);
+#endif // !AARCH64
+
+  void init(intptr_t* sp, intptr_t* fp, address pc);
+
+  // accessors for the instance variables
+  // Note: not necessarily the real 'frame pointer' (see real_fp)
+  intptr_t* fp() const { return _fp; } // plain read of the _fp field
+
+  inline address* sender_pc_addr() const;
+
+#ifdef AARCH64
+  // Used by template based interpreter deoptimization
+  void interpreter_frame_set_stack_top(intptr_t* stack_top);
+  void interpreter_frame_set_extended_sp(intptr_t* sp);
+
+#else
+  // expression stack tos if we are nested in a java call
+  intptr_t* interpreter_frame_last_sp() const;
+
+  // deoptimization support
+  void interpreter_frame_set_last_sp(intptr_t* sp);
+#endif // AARCH64
+
+  // helper to update a map with callee-saved FP
+  static void update_map_with_saved_link(RegisterMap* map, intptr_t** link_addr);
+
+#endif // CPU_ARM_VM_FRAME_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/frame_arm.inline.hpp	2016-12-02 11:20:25.484751799 -0500
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_FRAME_ARM_INLINE_HPP
+#define CPU_ARM_VM_FRAME_ARM_INLINE_HPP
+
+#include "code/codeCache.hpp"
+#include "code/vmreg.inline.hpp"
+
+// Inline functions for ARM frames:
+
+// Constructors:
+
+inline frame::frame() {
+  // Empty frame: every pointer field is NULL and the deopt state is unknown.
+  _sp = _unextended_sp = _fp = NULL;
+  _pc = NULL;
+  _cb = NULL;
+
+  _deopt_state = unknown;
+}
+
+inline void frame::init(intptr_t* sp, intptr_t* fp, address pc) {
+  _sp = sp;
+  _unextended_sp = sp;  // initially identical to sp
+  _fp = fp;
+  _pc = pc;
+  assert(pc != NULL, "no pc?");
+  _cb = CodeCache::find_blob(pc);  // code blob found for pc
+  adjust_unextended_sp();
+  // A non-NULL deopt original pc replaces _pc and marks the frame deoptimized.
+  address original_pc = CompiledMethod::get_deopt_original_pc(this);
+  if (original_pc != NULL) {
+    _pc = original_pc;
+    _deopt_state = is_deoptimized;
+  } else {
+    _deopt_state = not_deoptimized;
+  }
+}
+
+inline frame::frame(intptr_t* sp, intptr_t* fp, address pc) {
+  init(sp, fp, pc);  // all field setup is done by init()
+}
+
+inline frame::frame(intptr_t* sp, intptr_t* unextended_sp, intptr_t* fp, address pc) {
+  _sp = sp;
+  _unextended_sp = unextended_sp;  // supplied by the caller rather than copied from sp
+  _fp = fp;
+  _pc = pc;
+  assert(pc != NULL, "no pc?");
+  _cb = CodeCache::find_blob(pc);
+  adjust_unextended_sp();
+  // A non-NULL deopt original pc replaces _pc and marks the frame deoptimized.
+  address original_pc = CompiledMethod::get_deopt_original_pc(this);
+  if (original_pc != NULL) {
+    _pc = original_pc;
+    assert(_cb->as_compiled_method()->insts_contains(_pc), "original PC must be in CompiledMethod");
+    _deopt_state = is_deoptimized;
+  } else {
+    _deopt_state = not_deoptimized;
+  }
+}
+
+#ifndef AARCH64
+
+inline frame::frame(intptr_t* sp, intptr_t* fp) {
+  _sp = sp;
+  _unextended_sp = sp;
+  _fp = fp;
+  assert(sp != NULL,"null SP ?");
+  _pc = (address)(sp[-1]);  // pc is read from the word immediately below sp
+  // assert(_pc != NULL, "no pc?"); // see comments in x86
+  _cb = CodeCache::find_blob(_pc);
+  adjust_unextended_sp();
+  // A non-NULL deopt original pc replaces _pc and marks the frame deoptimized.
+  address original_pc = CompiledMethod::get_deopt_original_pc(this);
+  if (original_pc != NULL) {
+    _pc = original_pc;
+    _deopt_state = is_deoptimized;
+  } else {
+    _deopt_state = not_deoptimized;
+  }
+}
+
+#endif // !AARCH64
+
+// Accessors
+
+inline bool frame::equal(frame other) const {
+  // Frames are equal when sp, unextended sp, fp and pc all match.
+  bool ret = sp() == other.sp() && unextended_sp() == other.unextended_sp()
+             && fp() == other.fp() && pc() == other.pc();
+  // Frames built from identical values must agree on code blob and deopt state.
+  assert(!ret || (cb() == other.cb() && _deopt_state == other._deopt_state), "inconsistent construction");
+  return ret;
+}
+
+// The frame id is the unextended sp. Ids must allow testing for identity as well as
+// for the younger/older relationship; a NULL id denotes an invalid (incomparable)
+// frame.
+inline intptr_t* frame::id() const { return _unextended_sp; }
+
+// Ordering of frames, decided by comparing ids.
+// True when this frame is a more recent activation (younger) than the frame identified by id.
+inline bool frame::is_younger(intptr_t* id) const { assert(this->id() != NULL && id != NULL, "NULL frame id");
+                                                    return id > this->id(); }
+
+// True when this frame is a less recent activation (older) than the frame identified by id.
+inline bool frame::is_older(intptr_t* id) const   { assert(this->id() != NULL && id != NULL, "NULL frame id");
+                                                    return id < this->id(); }
+
+
+
+inline intptr_t* frame::link() const              { return *(intptr_t**) addr_at(link_offset); }
+
+inline intptr_t* frame::unextended_sp() const     { return _unextended_sp; }  // the caller's original sp, as opposed to the raw sp()
+
+// Return address:
+
+inline address* frame::sender_pc_addr()      const { return (address*) addr_at(return_addr_offset); }  // slot at return_addr_offset
+inline address  frame::sender_pc()           const { return *(address*) addr_at(return_addr_offset); }
+
+inline intptr_t* frame::sender_sp() const { return addr_at(sender_sp_offset); }  // an address within this frame, not a loaded value
+
+inline intptr_t** frame::interpreter_frame_locals_addr() const {
+  return (intptr_t**)addr_at(interpreter_frame_locals_offset);  // address of the locals slot, not its contents
+}
+
+#ifndef AARCH64
+inline intptr_t* frame::interpreter_frame_last_sp() const {
+  return *(intptr_t**)addr_at(interpreter_frame_last_sp_offset);  // loads the value stored in the last_sp slot
+}
+#endif // !AARCH64
+
+inline intptr_t* frame::interpreter_frame_bcp_addr() const {
+  return (intptr_t*)addr_at(interpreter_frame_bcp_offset);  // address of the bcp slot, not its contents
+}
+
+inline intptr_t* frame::interpreter_frame_mdp_addr() const {
+  return (intptr_t*)addr_at(interpreter_frame_mdp_offset);  // address of the mdp slot, not its contents
+}
+
+
+// Constant pool cache
+
+inline ConstantPoolCache** frame::interpreter_frame_cache_addr() const {
+  return (ConstantPoolCache**)addr_at(interpreter_frame_cache_offset);  // address of the slot holding the cache pointer
+}
+
+// Method
+
+inline Method** frame::interpreter_frame_method_addr() const {
+  return (Method**)addr_at(interpreter_frame_method_offset);  // address of the slot holding the Method pointer
+}
+
+inline oop* frame::interpreter_frame_mirror_addr() const {
+  return (oop*)addr_at(interpreter_frame_mirror_offset);  // address of the mirror slot, viewed as an oop*
+}
+
+// Address of the top of the interpreter expression stack.
+inline intptr_t* frame::interpreter_frame_tos_address() const {
+#ifdef AARCH64
+  intptr_t* stack_top = (intptr_t*)*addr_at(interpreter_frame_stack_top_offset);
+  assert(stack_top != NULL, "should be stored before call");
+  assert(stack_top <= (intptr_t*) interpreter_frame_monitor_end(), "bad tos");
+  return stack_top;
+#else
+  intptr_t* last_sp = interpreter_frame_last_sp();
+  if (last_sp == NULL) {  // NULL last_sp slot: the current sp() is the top of stack
+    return sp();
+  }
+  // sp() may have been extended or shrunk by an adapter.  At least
+  // check that the saved last_sp does not fall behind the legal region.
+  // For top deoptimized frame last_sp == interpreter_frame_monitor_end.
+  assert(last_sp <= (intptr_t*) interpreter_frame_monitor_end(), "bad tos");
+  return last_sp;
+
+#endif // AARCH64
+}
+
+inline oop* frame::interpreter_frame_temp_oop_addr() const {
+  return (oop *)(fp() + interpreter_frame_oop_temp_offset);  // slot located relative to fp(), in words
+}
+
+inline int frame::interpreter_frame_monitor_size() {
+  return BasicObjectLock::size();  // one monitor occupies exactly one BasicObjectLock
+}
+
+
+// expression stack
+// (the max_stack arguments are used by the GC; see class FrameClosure)
+
+inline intptr_t* frame::interpreter_frame_expression_stack() const {
+  // The first expression stack slot is the word just below the monitor area end.
+  return ((intptr_t*) interpreter_frame_monitor_end()) - 1;
+}
+
+
+inline jint frame::interpreter_frame_expression_stack_direction() { return -1; }  // NOTE(review): presumably -1 = growth toward lower addresses; confirm against shared frame code
+
+
+// Entry frames
+
+inline JavaCallWrapper** frame::entry_frame_call_wrapper_addr() const {
+ return (JavaCallWrapper**)addr_at(entry_frame_call_wrapper_offset);  // address of the slot holding the JavaCallWrapper pointer
+}
+
+
+// Compiled frames
+
+inline bool frame::volatile_across_calls(Register reg) {
+  return true;  // reported for every register; reg is not consulted
+}
+
+inline oop frame::saved_oop_result(RegisterMap* map) const {
+  oop* result_adr = (oop*) map->location(R0->as_VMReg());  // where the map says R0 was saved
+  guarantee(result_adr != NULL, "bad register save location");
+  return (*result_adr);  // the oop currently stored in that save slot
+}
+
+inline void frame::set_saved_oop_result(RegisterMap* map, oop obj) {
+  oop* result_adr = (oop*) map->location(R0->as_VMReg());  // where the map says R0 was saved
+  guarantee(result_adr != NULL, "bad register save location");
+  *result_adr = obj;  // overwrite the saved R0 value with obj
+}
+
+#endif // CPU_ARM_VM_FRAME_ARM_INLINE_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/globalDefinitions_arm.hpp	2016-12-02 11:20:30.921060089 -0500
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_GLOBALDEFINITIONS_ARM_HPP
+#define CPU_ARM_VM_GLOBALDEFINITIONS_ARM_HPP
+
+#ifdef AARCH64  // AArch64 build: AARCH64_ONLY keeps its argument, NOT_AARCH64 discards it
+#define AARCH64_ONLY(code) code
+#define AARCH64_ONLY_ARG(arg) , arg
+#define NOT_AARCH64(code)
+#define NOT_AARCH64_ARG(arg)
+#else  // any other build: the roles are reversed (the _ARG forms prepend a comma)
+#define AARCH64_ONLY(code)
+#define AARCH64_ONLY_ARG(arg)
+#define NOT_AARCH64(code) code
+#define NOT_AARCH64_ARG(arg) , arg
+#endif // AARCH64
+
+const int StackAlignmentInBytes = AARCH64_ONLY(16) NOT_AARCH64(8);  // 16 on AArch64, 8 otherwise
+
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values are extended to 64 bits.
+const bool CCallingConventionRequiresIntsAsLongs = false;
+// HaveVFP is false only when the compiler defines __SOFTFP__.
+#ifdef __SOFTFP__
+const bool HaveVFP = false;
+#else
+const bool HaveVFP = true;
+#endif
+// __ABI_HARD__ is defined when __ARM_PCS_VFP is set, and always on AArch64.
+#if defined(__ARM_PCS_VFP) || defined(AARCH64)
+#define __ABI_HARD__
+#endif
+// SUPPORTS_NATIVE_CX8 is defined for __ARM_ARCH_7A__ targets, and always on AArch64.
+#if defined(__ARM_ARCH_7A__) || defined(AARCH64)
+#define SUPPORTS_NATIVE_CX8
+#endif
+
+#define STUBROUTINES_MD_HPP    "stubRoutines_arm.hpp"
+#define INTERP_MASM_MD_HPP     "interp_masm_arm.hpp"
+#define TEMPLATETABLE_MD_HPP   "templateTable_arm.hpp"
+#ifdef AARCH64
+#define ADGLOBALS_MD_HPP       "adfiles/adGlobals_arm_64.hpp"
+#define AD_MD_HPP              "adfiles/ad_arm_64.hpp"
+#else
+#define ADGLOBALS_MD_HPP       "adfiles/adGlobals_arm_32.hpp"
+#define AD_MD_HPP              "adfiles/ad_arm_32.hpp"
+#endif
+#define C1_LIRGENERATOR_MD_HPP "c1_LIRGenerator_arm.hpp"
+
+#ifdef TARGET_COMPILER_gcc
+#ifdef ARM32  // gcc on ARM32 only: BREAKPOINT becomes an inline "bkpt" instruction
+#undef BREAKPOINT
+#define BREAKPOINT __asm__ volatile ("bkpt")
+#endif // ARM32
+#endif // TARGET_COMPILER_gcc
+
+#endif // CPU_ARM_VM_GLOBALDEFINITIONS_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/globals_arm.hpp	2016-12-02 11:20:36.929400818 -0500
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_GLOBALS_ARM_HPP
+#define CPU_ARM_VM_GLOBALS_ARM_HPP
+
+//
+// Sets the default values for platform dependent flags used by the runtime system.
+// (see globals.hpp)
+//
+
+define_pd_global(bool,  ShareVtableStubs,         true);
+
+define_pd_global(bool,  ImplicitNullChecks,       true);  // Generate code for implicit null checks
+define_pd_global(bool,  UncommonNullCast,         true);  // Uncommon-trap NULLs passed to check cast
+define_pd_global(bool,  TrapBasedNullChecks,      false); // Not needed
+
+define_pd_global(uintx, CodeCacheSegmentSize, 64 TIERED_ONLY(+64)); // Tiered compilation has large code-entry alignment.
+define_pd_global(intx,  CodeEntryAlignment,       16);
+define_pd_global(intx,  OptoLoopAlignment,        16);
+
+define_pd_global(bool,  NeedsDeoptSuspend,        false); // only register window machines need this
+
+#define DEFAULT_STACK_YELLOW_PAGES (2)
+#define DEFAULT_STACK_RED_PAGES (1)
+#define DEFAULT_STACK_SHADOW_PAGES (5 DEBUG_ONLY(+1))
+#define DEFAULT_STACK_RESERVED_PAGES (0)
+
+#define MIN_STACK_YELLOW_PAGES DEFAULT_STACK_YELLOW_PAGES
+#define MIN_STACK_RED_PAGES    DEFAULT_STACK_RED_PAGES
+#define MIN_STACK_SHADOW_PAGES DEFAULT_STACK_SHADOW_PAGES
+#define MIN_STACK_RESERVED_PAGES (0)
+
+define_pd_global(intx,  StackYellowPages,         DEFAULT_STACK_YELLOW_PAGES);
+define_pd_global(intx,  StackRedPages,            DEFAULT_STACK_RED_PAGES);
+define_pd_global(intx,  StackShadowPages,         DEFAULT_STACK_SHADOW_PAGES);
+define_pd_global(intx,  StackReservedPages,       DEFAULT_STACK_RESERVED_PAGES);
+
+define_pd_global(intx,  InlineFrequencyCount,     50);
+#if  defined(COMPILER1) || defined(COMPILER2)
+define_pd_global(intx,  InlineSmallCode,          1500);
+#endif
+
+define_pd_global(bool,  RewriteBytecodes,         true);
+define_pd_global(bool,  RewriteFrequentPairs,     true);
+
+define_pd_global(bool,  UseMembar,                true);
+
+define_pd_global(bool,  PreserveFramePointer,     false);
+
+// GC Ergo Flags
+define_pd_global(size_t, CMSYoungGenPerWorker,    16*M);  // default max size of CMS young gen, per GC worker thread
+
+define_pd_global(uintx, TypeProfileLevel, 0);
+
+// No performance work done here yet.
+define_pd_global(bool, CompactStrings, false);
+
+define_pd_global(intx, InitArrayShortSize, 8*BytesPerLong);
+
+#define ARCH_FLAGS(develop, \
+                   product, \
+                   diagnostic, \
+                   experimental, \
+                   notproduct, \
+                   range, \
+                   constraint, \
+                   writeable) \
+                                                                                        \
+  develop(bool, VerifyInterpreterStackTop, false,                                       \
+          "Verify interpreter stack top at every stack expansion (AArch64 only)")       \
+                                                                                        \
+  develop(bool, ZapHighNonSignificantBits, false,                                       \
+          "Zap high non-significant bits of values (AArch64 only)")                     \
+                                                                                        \
+
+#endif // CPU_ARM_VM_GLOBALS_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/icBuffer_arm.cpp	2016-12-02 11:20:42.669726348 -0500
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "assembler_arm.inline.hpp"
+#include "code/icBuffer.hpp"
+#include "gc/shared/collectedHeap.inline.hpp"
+#include "interpreter/bytecodes.hpp"
+#include "memory/resourceArea.hpp"
+#include "nativeInst_arm.hpp"
+#include "oops/oop.inline.hpp"
+
+#define __ masm->
+
+int InlineCacheBuffer::ic_stub_code_size() {
+  return Assembler::InstructionSize * (NOT_AARCH64(4) AARCH64_ONLY(8));  // 8 instructions on AArch64, 4 otherwise
+}
+
+void InlineCacheBuffer::assemble_ic_buffer_code(address code_begin, void* cached_value, address entry_point) {
+  ResourceMark rm;
+  CodeBuffer code(code_begin, ic_stub_code_size());  // emit in place at code_begin, limited to the stub size
+  MacroAssembler* masm = new MacroAssembler(&code);
+  // Emitted in this order: load of cached_value into Ricklass, jump to entry_point, the inlined literal.
+  InlinedAddress oop_literal((address) cached_value);
+  __ ldr_literal(Ricklass, oop_literal);
+  // FIXME: OK to remove reloc here?
+  __ patchable_jump(entry_point, relocInfo::runtime_call_type, Rtemp);
+  __ bind_literal(oop_literal);
+  __ flush();
+}
+
+address InlineCacheBuffer::ic_buffer_entry_point(address code_begin) {
+  // The jump is decoded one instruction past code_begin (assemble_ic_buffer_code
+  // emits the literal load first); its destination is the entry point.
+  address jump_address = code_begin + NativeInstruction::instruction_size;
+  return nativeJump_at(jump_address)->jump_destination();
+}
+
+void* InlineCacheBuffer::ic_buffer_cached_value(address code_begin) {
+  // The cached value is the data of the constant load decoded at code_begin.
+  return (void*) nativeMovConstReg_at(code_begin)->data();
+}
+
+#undef __
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/icache_arm.cpp	2016-12-02 11:20:47.742013994 -0500
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "assembler_arm.inline.hpp"
+#include "runtime/icache.hpp"
+
+#define __ _masm->
+
+#ifdef AARCH64
+
+static int icache_flush(address addr, int lines, int magic) {
+  // TODO-AARCH64 Figure out actual cache line size (mrs Xt, CTR_EL0)
+  // Pass 1: "dc cvau" (data cache clean by address) on each of the `lines` lines starting at addr.
+  address p = addr;
+  for (int i = 0; i < lines; i++, p += ICache::line_size) {
+    __asm__ volatile(
+      " dc cvau, %[p]"
+      :
+      : [p] "r" (p)
+      : "memory");
+  }
+  // Barrier between the data-cache pass and the instruction-cache pass.
+  __asm__ volatile(
+    " dsb ish"
+    : : : "memory");
+  // Pass 2: "ic ivau" (instruction cache invalidate by address) on the same lines.
+  p = addr;
+  for (int i = 0; i < lines; i++, p += ICache::line_size) {
+    __asm__ volatile(
+      " ic ivau, %[p]"
+      :
+      : [p] "r" (p)
+      : "memory");
+  }
+  // Finish with a dsb followed by an isb.
+  __asm__ volatile(
+    " dsb ish\n\t"
+    " isb\n\t"
+    : : : "memory");
+  // magic is handed back unchanged.
+  return magic;
+}
+
+#else
+
+static int icache_flush(address addr, int lines, int magic) {
+  __builtin___clear_cache(addr, addr + (lines << ICache::log2_line_size));  // range covers lines * line_size bytes
+  return magic;  // handed back unchanged
+}
+
+#endif // AARCH64
+
+void ICacheStubGenerator::generate_icache_flush(ICache::flush_icache_stub_t* flush_icache_stub) {
+  address start = (address)icache_flush;
+  // No flush code is generated: the static icache_flush function itself is installed as the stub.
+  *flush_icache_stub = (ICache::flush_icache_stub_t)start;
+
+  // ICache::invalidate_range() contains explicit condition that the first
+  // call is invoked on the generated icache flush stub code range.
+  ICache::invalidate_range(start, 0);
+
+  {
+    // dummy code mark to make the shared code happy
+    // (fields that would need to be modified to emulate the correct
+    // mark are not accessible)
+    StubCodeMark mark(this, "ICache", "fake_stub_for_inlined_icache_flush");
+    __ ret();  // the only instruction emitted for the fake stub
+  }
+}
+
+#undef __
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/icache_arm.hpp	2016-12-02 11:20:53.362332717 -0500
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_ICACHE_ARM_HPP
+#define CPU_ARM_VM_ICACHE_ARM_HPP
+
+// Interface for updating the instruction cache.  Whenever the VM modifies
+// code, part of the processor instruction cache potentially has to be flushed.
+// The ARM version only supplies the constants below on top of AbstractICache.
+class ICache : public AbstractICache {
+ public:
+  enum {
+    stub_size      = 32,                // Size of the icache flush stub in bytes
+    line_size      = BytesPerWord,      // conservative: one word per "line"
+    log2_line_size = LogBytesPerWord    // log2(line_size)
+  };
+};
+
+#endif // CPU_ARM_VM_ICACHE_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/interp_masm_arm.cpp	2016-12-02 11:20:58.614630571 -0500
@@ -0,0 +1,2272 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc/shared/barrierSet.inline.hpp"
+#include "gc/shared/cardTableModRefBS.inline.hpp"
+#include "gc/shared/collectedHeap.hpp"
+#include "interp_masm_arm.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "logging/log.hpp"
+#include "oops/arrayOop.hpp"
+#include "oops/markOop.hpp"
+#include "oops/method.hpp"
+#include "oops/methodData.hpp"
+#include "prims/jvmtiExport.hpp"
+#include "prims/jvmtiThreadState.hpp"
+#include "runtime/basicLock.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/sharedRuntime.hpp"
+
+#if INCLUDE_ALL_GCS
+#include "gc/g1/g1CollectedHeap.inline.hpp"
+#include "gc/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc/g1/heapRegion.hpp"
+#endif // INCLUDE_ALL_GCS
+
+//--------------------------------------------------------------------
+// Implementation of InterpreterMacroAssembler
+
+
+
+
+InterpreterMacroAssembler::InterpreterMacroAssembler(CodeBuffer* code) : MacroAssembler(code) {  // only forwards code to the base class
+}
+
+void InterpreterMacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
+#if defined(ASSERT) && !defined(AARCH64)
+  // Ensure that last_sp is not filled.
+  { Label L;
+    ldr(Rtemp, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+    cbz(Rtemp, L);  // slot is zero: skip the stop() below
+    stop("InterpreterMacroAssembler::call_VM_helper: last_sp != NULL");
+    bind(L);
+  }
+#endif // ASSERT && !AARCH64
+  // Wraps MacroAssembler::call_VM_helper with a save/restore of interpreter state.
+  // Rbcp must be saved/restored since it may change due to GC.
+  save_bcp();
+
+#ifdef AARCH64  // AArch64 only: stack-top and SP handling before the call
+  check_no_cached_stack_top(Rtemp);
+  save_stack_top();
+  check_extended_sp(Rtemp);
+  cut_sp_before_call();
+#endif // AARCH64
+
+  // super call
+  MacroAssembler::call_VM_helper(oop_result, entry_point, number_of_arguments, check_exceptions);
+
+#ifdef AARCH64
+  // Restore SP to extended SP
+  restore_sp_after_call(Rtemp);
+  check_stack_top();
+  clear_cached_stack_top();
+#endif // AARCH64
+
+  // Restore interpreter specific registers.
+  restore_bcp();
+  restore_method();
+}
+
+void InterpreterMacroAssembler::jump_to_entry(address entry) {
+  assert(entry, "Entry must have been generated by now");
+  b(entry);  // emit a branch to the given entry address
+}
+
+void InterpreterMacroAssembler::check_and_handle_popframe() {
+  if (can_pop_frame()) {  // nothing at all is emitted when can_pop_frame() is false
+    Label L;
+    const Register popframe_cond = R2_tmp;
+
+    // Initiate popframe handling only if it is not already being processed.  If the flag
+    // has the popframe_processing bit set, it means that this code is called *during* popframe
+    // handling - we don't want to reenter.
+
+    ldr_s32(popframe_cond, Address(Rthread, JavaThread::popframe_condition_offset()));
+    tbz(popframe_cond, exact_log2(JavaThread::popframe_pending_bit), L);      // pending bit clear: skip
+    tbnz(popframe_cond, exact_log2(JavaThread::popframe_processing_bit), L);  // processing bit set: skip
+
+    // Call Interpreter::remove_activation_preserving_args_entry() to get the
+    // address of the same-named entrypoint in the generated interpreter code.
+    call_VM_leaf(CAST_FROM_FN_PTR(address, Interpreter::remove_activation_preserving_args_entry));
+
+    // Call indirectly to avoid generation ordering problem.
+    jump(R0);  // R0 is expected to hold the address obtained by the call above
+
+    bind(L);
+  }
+}
+
+
+// Blows R2, Rtemp. Sets TOS cached value.
+void InterpreterMacroAssembler::load_earlyret_value(TosState state) {
+  const Register thread_state = R2_tmp;
+  // thread_state receives the JvmtiThreadState pointer loaded from the current thread.
+  ldr(thread_state, Address(Rthread, JavaThread::jvmti_thread_state_offset()));
+  // Addresses of the earlyret tos, oop and value fields inside that JvmtiThreadState.
+  const Address tos_addr(thread_state, JvmtiThreadState::earlyret_tos_offset());
+  const Address oop_addr(thread_state, JvmtiThreadState::earlyret_oop_offset());
+  const Address val_addr(thread_state, JvmtiThreadState::earlyret_value_offset());
+#ifndef AARCH64  // non-AArch64 builds also address the word that follows val_addr
+  const Address val_addr_hi(thread_state, JvmtiThreadState::earlyret_value_offset()
+                             + in_ByteSize(wordSize));
+#endif // !AARCH64
+
+  Register zero = zero_register(Rtemp);
+  // Load the value into the TOS register(s) that correspond to the given state.
+  switch (state) {
+    case atos: ldr(R0_tos, oop_addr);
+               str(zero, oop_addr);  // clear the oop field once it has been loaded
+               interp_verify_oop(R0_tos, state, __FILE__, __LINE__);
+               break;
+
+#ifdef AARCH64
+    case ltos: ldr(R0_tos, val_addr);              break;
+#else
+    case ltos: ldr(R1_tos_hi, val_addr_hi);        // fall through
+#endif // AARCH64
+    case btos:                                     // fall through
+    case ztos:                                     // fall through
+    case ctos:                                     // fall through
+    case stos:                                     // fall through
+    case itos: ldr_s32(R0_tos, val_addr);          break;
+#ifdef __SOFTFP__
+    case dtos: ldr(R1_tos_hi, val_addr_hi);        // fall through
+    case ftos: ldr(R0_tos, val_addr);              break;
+#else
+    case ftos: ldr_float (S0_tos, val_addr);       break;
+    case dtos: ldr_double(D0_tos, val_addr);       break;
+#endif // __SOFTFP__
+    case vtos: /* nothing to do */                 break;
+    default  : ShouldNotReachHere();
+  }
+  // Clean up tos value in the thread object
+  str(zero, val_addr);
+#ifndef AARCH64
+  str(zero, val_addr_hi);
+#endif // !AARCH64
+  // Finally store ilgl into the earlyret tos field.
+  mov(Rtemp, (int) ilgl);
+  str_32(Rtemp, tos_addr);
+}
+
+
+// Blows R2, Rtemp.
+void InterpreterMacroAssembler::check_and_handle_earlyret() {
+  if (can_force_early_return()) {  // nothing at all is emitted when can_force_early_return() is false
+    Label L;
+    const Register thread_state = R2_tmp;
+
+    ldr(thread_state, Address(Rthread, JavaThread::jvmti_thread_state_offset()));
+    cbz(thread_state, L); // if (thread->jvmti_thread_state() == NULL) exit;
+
+    // Initiate earlyret handling only if it is not already being processed.
+    // If the flag has the earlyret_processing bit set, it means that this code
+    // is called *during* earlyret handling - we don't want to reenter.
+
+    ldr_s32(Rtemp, Address(thread_state, JvmtiThreadState::earlyret_state_offset()));
+    cmp(Rtemp, JvmtiThreadState::earlyret_pending);
+    b(L, ne);  // state is not earlyret_pending: skip
+
+    // Call Interpreter::remove_activation_early_entry() to get the address of the
+    // same-named entrypoint in the generated interpreter code.
+
+    ldr_s32(R0, Address(thread_state, JvmtiThreadState::earlyret_tos_offset()));  // argument: the recorded earlyret tos value
+    call_VM_leaf(CAST_FROM_FN_PTR(address, Interpreter::remove_activation_early_entry), R0);
+
+    jump(R0);  // R0 is expected to hold the address obtained by the call above
+
+    bind(L);
+  }
+}
+
+
+// Sets reg. Blows Rtemp.
+void InterpreterMacroAssembler::get_unsigned_2_byte_index_at_bcp(Register reg, int bcp_offset) {
+  assert(bcp_offset >= 0, "bcp is still pointing to start of bytecode");
+  assert(reg != Rtemp, "should be different registers");
+  // The byte at bcp_offset becomes bits 8..15 of reg, the byte at bcp_offset+1 bits 0..7.
+  ldrb(Rtemp, Address(Rbcp, bcp_offset));
+  ldrb(reg, Address(Rbcp, bcp_offset+1));
+  orr(reg, reg, AsmOperand(Rtemp, lsl, BitsPerByte));
+}
+
+void InterpreterMacroAssembler::get_index_at_bcp(Register index, int bcp_offset, Register tmp_reg, size_t index_size) {
+  assert_different_registers(index, tmp_reg);
+  if (index_size == sizeof(u2)) {
+    // load bytes of index separately to avoid unaligned access
+    ldrb(index, Address(Rbcp, bcp_offset+1));
+    ldrb(tmp_reg, Address(Rbcp, bcp_offset));
+    orr(index, tmp_reg, AsmOperand(index, lsl, BitsPerByte));  // index = byte[0] | (byte[1] << 8)
+  } else if (index_size == sizeof(u4)) {
+    // TODO-AARCH64: consider using unaligned access here
+    ldrb(index, Address(Rbcp, bcp_offset+3));
+    ldrb(tmp_reg, Address(Rbcp, bcp_offset+2));
+    orr(index, tmp_reg, AsmOperand(index, lsl, BitsPerByte));
+    ldrb(tmp_reg, Address(Rbcp, bcp_offset+1));
+    orr(index, tmp_reg, AsmOperand(index, lsl, BitsPerByte));
+    ldrb(tmp_reg, Address(Rbcp, bcp_offset));
+    orr(index, tmp_reg, AsmOperand(index, lsl, BitsPerByte));  // byte[0] ends up lowest, byte[3] highest
+    // Check if the secondary index definition is still ~x, otherwise
+    // we have to change the following assembler code to calculate the
+    // plain index.
+    assert(ConstantPool::decode_invokedynamic_index(~123) == 123, "else change next line");
+    mvn_32(index, index);  // convert to plain index
+  } else if (index_size == sizeof(u1)) {
+    ldrb(index, Address(Rbcp, bcp_offset));  // single byte, used as is
+  } else {
+    ShouldNotReachHere();  // only index sizes u1, u2 and u4 are handled
+  }
+}
+
+// Sets cache, index.
+void InterpreterMacroAssembler::get_cache_and_index_at_bcp(Register cache, Register index, int bcp_offset, size_t index_size) {
+  assert(bcp_offset > 0, "bcp is still pointing to start of bytecode");
+  assert_different_registers(cache, index);
+  // cache doubles as the scratch register for the index load; it is overwritten right after.
+  get_index_at_bcp(index, bcp_offset, cache, index_size);
+
+  // load constant pool cache pointer
+  ldr(cache, Address(FP, frame::interpreter_frame_cache_offset * wordSize));
+
+  // convert from field index to ConstantPoolCacheEntry index
+  assert(sizeof(ConstantPoolCacheEntry) == 4*wordSize, "adjust code below");
+  // TODO-AARCH64 merge this shift with shift "add(..., Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord))" after this method is called
+  logical_shift_left(index, index, 2);  // times 4: the entry size in words asserted above
+}
+
+// Sets cache, index, bytecode. bytecode is the byte at offset (1 + byte_no) of the entry's indices field.
+void InterpreterMacroAssembler::get_cache_and_index_and_bytecode_at_bcp(Register cache, Register index, Register bytecode, int byte_no, int bcp_offset, size_t index_size) {
+  get_cache_and_index_at_bcp(cache, index, bcp_offset, index_size);
+  // caution index and bytecode can be the same: index is read for the last time by the add below
+  add(bytecode, cache, AsmOperand(index, lsl, LogBytesPerWord));
+#ifdef AARCH64
+  add(bytecode, bytecode, (1 + byte_no) + in_bytes(ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::indices_offset()));
+  ldarb(bytecode, bytecode);  // NOTE(review): presumably a load-acquire standing in for the LoadLoad barrier of the 32-bit path -- confirm
+#else
+  ldrb(bytecode, Address(bytecode, (1 + byte_no) + in_bytes(ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::indices_offset())));
+  TemplateTable::volatile_barrier(MacroAssembler::LoadLoad, noreg, true);
+#endif // AARCH64
+}
+
+// Sets cache to the address of the ConstantPoolCacheEntry selected by the index at bcp_offset. Blows reg_tmp.
+void InterpreterMacroAssembler::get_cache_entry_pointer_at_bcp(Register cache, Register reg_tmp, int bcp_offset, size_t index_size) {
+  assert(bcp_offset > 0, "bcp is still pointing to start of bytecode");
+  assert_different_registers(cache, reg_tmp);
+  // reg_tmp := entry index; cache only serves as scratch for this call
+  get_index_at_bcp(reg_tmp, bcp_offset, cache, index_size);
+
+  // load constant pool cache pointer
+  ldr(cache, Address(FP, frame::interpreter_frame_cache_offset * wordSize));
+
+  // skip past the header
+  add(cache, cache, in_bytes(ConstantPoolCache::base_offset()));
+  // convert from field index to ConstantPoolCacheEntry index (4 words per entry)
+  // and from word offset to byte offset, both folded into a single shift
+  assert(sizeof(ConstantPoolCacheEntry) == 4*wordSize, "adjust code below");
+  add(cache, cache, AsmOperand(reg_tmp, lsl, 2 + LogBytesPerWord));
+}
+
+// Loads into result the object stored at cpool->resolved_references(index).
+// result doubles as the scratch register for every intermediate pointer; index is only read.
+void InterpreterMacroAssembler::load_resolved_reference_at_index(Register result, Register index) {
+  assert_different_registers(result, index);
+
+  // result := constant pool
+  get_constant_pool(result);
+  // result := pointer for the resolved_references[] objArray
+  ldr(result, Address(result, ConstantPool::resolved_references_offset_in_bytes()));
+  // result := JNIHandles::resolve(result)
+  ldr(result, Address(result, 0));
+  // result += index scaled to a heap-oop offset (elements of a Java
+  // object array may be compressed, hence LogBytesPerHeapOop)
+  add(result, result, AsmOperand(index, lsl, LogBytesPerHeapOop));
+  // finally load the element, skipping the array header
+  load_heap_oop(result, Address(result, arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
+}
+
+// Generate a subtype check: branch to not_subtype if sub_klass is
+// not a subtype of super_klass; falls through to the end otherwise.
+// Profiling code for the subtype check failure (profile_typecheck_failed)
+// should be explicitly generated by the caller in the not_subtype case.
+// Blows Rtemp, tmp1, tmp2.
+void InterpreterMacroAssembler::gen_subtype_check(Register Rsub_klass,
+                                                  Register Rsuper_klass,
+                                                  Label &not_subtype,
+                                                  Register tmp1,
+                                                  Register tmp2) {
+
+  assert_different_registers(Rsub_klass, Rsuper_klass, tmp1, tmp2, Rtemp);
+  Label ok_is_subtype, loop, update_cache;
+
+  const Register super_check_offset = tmp1;
+  const Register cached_super = tmp2;
+
+  // Profile the not-null value's klass.
+  profile_typecheck(tmp1, Rsub_klass);
+
+  // Load the super-klass's check offset into super_check_offset
+  ldr_u32(super_check_offset, Address(Rsuper_klass, Klass::super_check_offset_offset()));
+
+  // Check for self
+  cmp(Rsub_klass, Rsuper_klass);
+
+  // Load from the sub-klass's super-class display list, or a 1-word cache of
+  // the secondary superclass list, or a failing value with a sentinel offset
+  // if the super-klass is an interface or exceptionally deep in the Java
+  // hierarchy and we have to scan the secondary superclass list the hard way.
+  // See if we get an immediate positive hit
+  ldr(cached_super, Address(Rsub_klass, super_check_offset));
+  // NOTE(review): cond_cmp presumably compares only if the self check above gave 'ne', so 'eq' means either test hit -- confirm
+  cond_cmp(Rsuper_klass, cached_super, ne);
+  b(ok_is_subtype, eq);
+
+  // Check for immediate negative hit
+  cmp(super_check_offset, in_bytes(Klass::secondary_super_cache_offset()));
+  b(not_subtype, ne);
+
+  // Now do a linear scan of the secondary super-klass chain.
+  const Register supers_arr = tmp1;
+  const Register supers_cnt = tmp2;
+  const Register cur_super  = Rtemp;
+
+  // Load the Array<Klass*> of secondary supers.
+  ldr(supers_arr, Address(Rsub_klass, Klass::secondary_supers_offset()));
+
+  ldr_u32(supers_cnt, Address(supers_arr, Array<Klass*>::length_offset_in_bytes())); // Load the array length
+#ifdef AARCH64
+  cbz(supers_cnt, not_subtype);
+  add(supers_arr, supers_arr, Array<Klass*>::base_offset_in_bytes());
+#else
+  cmp(supers_cnt, 0);
+
+  // Skip to the start of array elements and prefetch the first super-klass.
+  ldr(cur_super, Address(supers_arr, Array<Klass*>::base_offset_in_bytes(), pre_indexed), ne);
+  b(not_subtype, eq);
+#endif // AARCH64
+
+  bind(loop);
+
+#ifdef AARCH64
+  ldr(cur_super, Address(supers_arr, wordSize, post_indexed));
+#endif // AARCH64
+
+  cmp(cur_super, Rsuper_klass);
+  b(update_cache, eq);
+  // no match: count down; on 32-bit ARM the next element is loaded only while the count is non-zero
+  subs(supers_cnt, supers_cnt, 1);
+
+#ifndef AARCH64
+  ldr(cur_super, Address(supers_arr, wordSize, pre_indexed), ne);
+#endif // !AARCH64
+
+  b(loop, ne);
+  // the count reached zero: every secondary super was examined without a match
+  b(not_subtype);
+
+  bind(update_cache);
+  // Must be equal but missed in cache.  Update cache.
+  str(Rsuper_klass, Address(Rsub_klass, Klass::secondary_super_cache_offset()));
+
+  bind(ok_is_subtype);
+}
+
+
+// The 1st part of the store check.
+// Sets card_table_base register to ct->byte_map_base; the asserts run at code-generation time only.
+void InterpreterMacroAssembler::store_check_part1(Register card_table_base) {
+  // Check barrier set type (should be card table) and element size
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  assert(bs->kind() == BarrierSet::CardTableForRS ||
+         bs->kind() == BarrierSet::CardTableExtension,
+         "Wrong barrier set kind");
+
+  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "Adjust store check code");
+
+  // Load card table base address.
+
+  /* Performance note.
+
+     There is an alternative way of loading card table base address
+     from thread descriptor, which may look more efficient:
+
+     ldr(card_table_base, Address(Rthread, JavaThread::card_table_base_offset()));
+
+     However, performance measurements of micro benchmarks and specJVM98
+     showed that loading of card table base from thread descriptor is
+     7-18% slower compared to loading of literal embedded into the code.
+     Possible cause is a cache miss (card table base address resides in a
+     rarely accessed area of thread descriptor).
+  */
+  // TODO-AARCH64 Investigate if mov_slow is faster than ldr from Rthread on AArch64
+  mov_address(card_table_base, (address)ct->byte_map_base, symbolic_Relocation::card_table_reference);
+}
+
+// The 2nd part of the store check: zeroes the card for obj. May blow tmp; on AArch64 also overwrites card_table_base.
+void InterpreterMacroAssembler::store_check_part2(Register obj, Register card_table_base, Register tmp) {
+  assert_different_registers(obj, card_table_base, tmp);
+  // card address = card_table_base + (obj >> card_shift); a zero byte is the dirty value (asserted below)
+  assert(CardTableModRefBS::dirty_card_val() == 0, "Dirty card value must be 0 due to optimizations.");
+#ifdef AARCH64
+  add(card_table_base, card_table_base, AsmOperand(obj, lsr, CardTableModRefBS::card_shift));
+  Address card_table_addr(card_table_base);
+#else
+  Address card_table_addr(card_table_base, obj, lsr, CardTableModRefBS::card_shift);
+#endif
+
+  if (UseCondCardMark) {
+    if (UseConcMarkSweepGC) {
+      membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), noreg);
+    }
+    Label already_dirty;
+    // skip the store when the card byte already reads as zero (dirty)
+    ldrb(tmp, card_table_addr);
+    cbz(tmp, already_dirty);
+
+    set_card(card_table_base, card_table_addr, tmp);
+    bind(already_dirty);
+
+  } else {
+    if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
+      membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore), noreg);
+    }
+    set_card(card_table_base, card_table_addr, tmp);
+  }
+}
+
+// Stores a zero byte to card_table_addr. May blow tmp (32-bit ARM only).
+void InterpreterMacroAssembler::set_card(Register card_table_base, Address card_table_addr, Register tmp) {
+#ifdef AARCH64
+  strb(ZR, card_table_addr);
+#else
+  CardTableModRefBS* card_table = barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
+  const bool base_low_byte_is_zero = ((uintptr_t)card_table->byte_map_base & 0xff) == 0;
+  // An aligned table base has a zero low byte and can be stored as is (invalid if the code is reused with another base).
+  Register zero_byte = card_table_base;
+  if (!base_low_byte_is_zero) {
+    mov(tmp, 0);
+    zero_byte = tmp;
+  }
+  strb(zero_byte, card_table_addr);
+#endif // AARCH64
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+#if INCLUDE_ALL_GCS
+
+// G1 pre-barrier.
+// Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
+// If store_addr != noreg, then previous value is loaded from [store_addr];
+// in such case store_addr and new_val registers are preserved;
+// otherwise pre_val register is preserved.
+void InterpreterMacroAssembler::g1_write_barrier_pre(Register store_addr,
+                                                     Register new_val,
+                                                     Register pre_val,
+                                                     Register tmp1,
+                                                     Register tmp2) {
+  Label done;
+  Label runtime;
+
+  if (store_addr != noreg) {
+    assert_different_registers(store_addr, new_val, pre_val, tmp1, tmp2, noreg);
+  } else {
+    assert (new_val == noreg, "should be");
+    assert_different_registers(pre_val, tmp1, tmp2, noreg);
+  }
+
+  Address in_progress(Rthread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                        SATBMarkQueue::byte_offset_of_active()));
+  Address index(Rthread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                  SATBMarkQueue::byte_offset_of_index()));
+  Address buffer(Rthread, in_bytes(JavaThread::satb_mark_queue_offset() +
+                                   SATBMarkQueue::byte_offset_of_buf()));
+
+  // Is marking active?
+  assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "adjust this code");
+  ldrb(tmp1, in_progress);
+  cbz(tmp1, done);
+
+  // Do we need to load the previous value?
+  if (store_addr != noreg) {
+    load_heap_oop(pre_val, Address(store_addr, 0));
+  }
+
+  // Is the previous value null?
+  cbz(pre_val, done);
+
+  // Can we store original value in the thread's buffer?
+  // Is index == 0?
+  // (The index field is typed as size_t.)
+
+  ldr(tmp1, index);           // tmp1 := *index_adr
+  ldr(tmp2, buffer);          // tmp2 := buffer base
+
+  subs(tmp1, tmp1, wordSize); // tmp1 := tmp1 - wordSize
+  b(runtime, lt);             // If negative, goto runtime
+  // there is room: publish the decremented index, then fill the slot it designates
+  str(tmp1, index);           // *index_adr := tmp1
+
+  // Record the previous value
+  str(pre_val, Address(tmp2, tmp1));
+  b(done);
+
+  bind(runtime);
+  // slow path: the value could not be stored in the buffer, hand it to the runtime
+  // save the live input values
+#ifdef AARCH64
+  if (store_addr != noreg) {
+    raw_push(store_addr, new_val);
+  } else {
+    raw_push(pre_val, ZR);
+  }
+#else
+  if (store_addr != noreg) {
+    // avoid raw_push to support any ordering of store_addr and new_val
+    push(RegisterSet(store_addr) | RegisterSet(new_val));
+  } else {
+    push(pre_val);
+  }
+#endif // AARCH64
+  // arguments of g1_wb_pre: R0 = previous value, R1 = current thread
+  if (pre_val != R0) {
+    mov(R0, pre_val);
+  }
+  mov(R1, Rthread);
+
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), R0, R1);
+  // restore exactly what was saved before the call
+#ifdef AARCH64
+  if (store_addr != noreg) {
+    raw_pop(store_addr, new_val);
+  } else {
+    raw_pop(pre_val, ZR);
+  }
+#else
+  if (store_addr != noreg) {
+    pop(RegisterSet(store_addr) | RegisterSet(new_val));
+  } else {
+    pop(pre_val);
+  }
+#endif // AARCH64
+
+  bind(done);
+}
+
+// G1 post-barrier.
+// Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
+void InterpreterMacroAssembler::g1_write_barrier_post(Register store_addr,
+                                                      Register new_val,
+                                                      Register tmp1,
+                                                      Register tmp2,
+                                                      Register tmp3) {
+
+  Address queue_index(Rthread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                        DirtyCardQueue::byte_offset_of_index()));
+  Address buffer(Rthread, in_bytes(JavaThread::dirty_card_queue_offset() +
+                                   DirtyCardQueue::byte_offset_of_buf()));
+
+  BarrierSet* bs = Universe::heap()->barrier_set();
+  CardTableModRefBS* ct = (CardTableModRefBS*)bs;  // NOTE(review): unchecked C-style cast; store_check_part1 uses barrier_set_cast<> -- consider aligning
+  Label done;
+  Label runtime;
+
+  // Does store cross heap regions?
+  // (store_addr ^ new_val) >> LogOfHRGrainBytes is zero iff both addresses agree in all bits above the region grain
+  eor(tmp1, store_addr, new_val);
+#ifdef AARCH64
+  logical_shift_right(tmp1, tmp1, HeapRegion::LogOfHRGrainBytes);
+  cbz(tmp1, done);
+#else
+  movs(tmp1, AsmOperand(tmp1, lsr, HeapRegion::LogOfHRGrainBytes));
+  b(done, eq);
+#endif
+
+  // crosses regions, storing NULL?
+
+  cbz(new_val, done);
+
+  // storing region crossing non-NULL, is card already dirty?
+  const Register card_addr = tmp1;
+  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+  mov_address(tmp2, (address)ct->byte_map_base, symbolic_Relocation::card_table_reference);
+  add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTableModRefBS::card_shift));
+  // cards holding the young card value are left alone
+  ldrb(tmp2, Address(card_addr));
+  cmp(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
+  b(done, eq);
+  // NOTE(review): the StoreLoad barrier presumably orders the earlier oop store before the card re-read below -- confirm
+  membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2);
+
+  assert(CardTableModRefBS::dirty_card_val() == 0, "adjust this code");
+  ldrb(tmp2, Address(card_addr));
+  cbz(tmp2, done);
+
+  // storing a region crossing, non-NULL oop, card is clean.
+  // dirty card and log.
+
+  strb(zero_register(tmp2), Address(card_addr));
+
+  ldr(tmp2, queue_index);
+  ldr(tmp3, buffer);
+
+  subs(tmp2, tmp2, wordSize);
+  b(runtime, lt); // go to runtime if now negative
+
+  str(tmp2, queue_index);
+
+  str(card_addr, Address(tmp3, tmp2));
+  b(done);
+
+  bind(runtime);
+  // slow path: call g1_wb_post with the card address in R0 and the current thread in R1
+  if (card_addr != R0) {
+    mov(R0, card_addr);
+  }
+  mov(R1, Rthread);
+  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), R0, R1);
+
+  bind(done);
+}
+
+#endif // INCLUDE_ALL_GCS
+//////////////////////////////////////////////////////////////////////////////////
+
+
+// Java Expression Stack
+
+void InterpreterMacroAssembler::pop_ptr(Register r) {
+  assert(r != Rstack_top, "unpredictable instruction");  // r must differ from the base register that gets written back
+  ldr(r, Address(Rstack_top, wordSize, post_indexed));   // r := [Rstack_top], then Rstack_top += wordSize
+}
+
+void InterpreterMacroAssembler::pop_i(Register r) {
+  assert(r != Rstack_top, "unpredictable instruction");  // r must differ from the base register that gets written back
+  ldr_s32(r, Address(Rstack_top, wordSize, post_indexed));  // load the 32-bit value on top, then Rstack_top += wordSize
+  zap_high_non_significant_bits(r);  // NOTE(review): presumably scrubs the bits above the 32-bit value on AArch64 -- confirm
+}
+
+#ifdef AARCH64
+void InterpreterMacroAssembler::pop_l(Register r) {
+  assert(r != Rstack_top, "unpredictable instruction");
+  ldr(r, Address(Rstack_top, 2*wordSize, post_indexed));  // r := [Rstack_top]; a long releases two stack slots
+}
+#else
+void InterpreterMacroAssembler::pop_l(Register lo, Register hi) {
+  assert_different_registers(lo, hi);
+  assert(lo < hi, "lo must be < hi");  // NOTE(review): presumably because a RegisterSet pop transfers registers in ascending register order -- confirm
+  pop(RegisterSet(lo) | RegisterSet(hi));  // both halves are popped by one register-set pop
+}
+#endif // AARCH64
+
+void InterpreterMacroAssembler::pop_f(FloatRegister fd) {
+#ifdef AARCH64
+  ldr_s(fd, Address(Rstack_top, wordSize, post_indexed));  // one slot: load into fd, then Rstack_top += wordSize
+#else
+  fpops(fd);  // NOTE(review): presumably pops a single-precision value off the expression stack -- confirm
+#endif // AARCH64
+}
+
+void InterpreterMacroAssembler::pop_d(FloatRegister fd) {
+#ifdef AARCH64
+  ldr_d(fd, Address(Rstack_top, 2*wordSize, post_indexed));  // two slots: load into fd, then Rstack_top += 2*wordSize
+#else
+  fpopd(fd);  // NOTE(review): presumably pops a double-precision value off the expression stack -- confirm
+#endif // AARCH64
+}
+
+
+// Transition vtos -> state: pops the top value into the TOS cache register(s) for state. Blows R0, R1.
+void InterpreterMacroAssembler::pop(TosState state) {
+  switch (state) {
+    case atos: pop_ptr(R0_tos);                              break;
+    case btos:                                               // fall through
+    case ztos:                                               // fall through
+    case ctos:                                               // fall through
+    case stos:                                               // fall through
+    case itos: pop_i(R0_tos);                                break;
+#ifdef AARCH64
+    case ltos: pop_l(R0_tos);                                break;
+#else
+    case ltos: pop_l(R0_tos_lo, R1_tos_hi);                  break;
+#endif // AARCH64
+#ifdef __SOFTFP__
+    case ftos: pop_i(R0_tos);                                break;  // soft-float: popped like an int
+    case dtos: pop_l(R0_tos_lo, R1_tos_hi);                  break;  // soft-float: popped like a long
+#else
+    case ftos: pop_f(S0_tos);                                break;
+    case dtos: pop_d(D0_tos);                                break;
+#endif // __SOFTFP__
+    case vtos: /* nothing to do */                           break;
+    default  : ShouldNotReachHere();
+  }
+  interp_verify_oop(R0_tos, state, __FILE__, __LINE__);  // verification runs after the pop, on the freshly loaded value
+}
+
+void InterpreterMacroAssembler::push_ptr(Register r) {
+  assert(r != Rstack_top, "unpredictable instruction");  // r must differ from the base register that gets written back
+  str(r, Address(Rstack_top, -wordSize, pre_indexed));   // Rstack_top -= wordSize, then [Rstack_top] := r
+  check_stack_top_on_expansion();
+}
+
+void InterpreterMacroAssembler::push_i(Register r) {
+  assert(r != Rstack_top, "unpredictable instruction");    // r must differ from the base register that gets written back
+  str_32(r, Address(Rstack_top, -wordSize, pre_indexed));  // Rstack_top -= wordSize, then store r with str_32 into the new top slot
+  check_stack_top_on_expansion();
+}
+
+#ifdef AARCH64
+void InterpreterMacroAssembler::push_l(Register r) {
+  assert(r != Rstack_top, "unpredictable instruction");
+  stp(r, ZR, Address(Rstack_top, -2*wordSize, pre_indexed));  // reserve two slots and store the pair (r, zero register) into them
+  check_stack_top_on_expansion();
+}
+#else
+void InterpreterMacroAssembler::push_l(Register lo, Register hi) {
+  assert_different_registers(lo, hi);
+  assert(lo < hi, "lo must be < hi");  // NOTE(review): presumably because a RegisterSet push transfers registers in ascending register order -- confirm
+  push(RegisterSet(lo) | RegisterSet(hi));  // both halves are pushed by one register-set push
+}
+#endif // AARCH64
+
+void InterpreterMacroAssembler::push_f() {  // pushes the cached float TOS value (S0_tos)
+#ifdef AARCH64
+  str_s(S0_tos, Address(Rstack_top, -wordSize, pre_indexed));  // Rstack_top -= wordSize, then store S0_tos there
+  check_stack_top_on_expansion();
+#else
+  fpushs(S0_tos);
+#endif // AARCH64
+}
+
+void InterpreterMacroAssembler::push_d() {  // pushes the cached double TOS value (D0_tos)
+#ifdef AARCH64
+  str_d(D0_tos, Address(Rstack_top, -2*wordSize, pre_indexed));  // Rstack_top -= 2*wordSize, then store D0_tos there
+  check_stack_top_on_expansion();
+#else
+  fpushd(D0_tos);
+#endif // AARCH64
+}
+
+// Transition state -> vtos: pushes the TOS cache register(s) for state onto the expression stack. Blows Rtemp.
+void InterpreterMacroAssembler::push(TosState state) {
+  interp_verify_oop(R0_tos, state, __FILE__, __LINE__);  // verification runs before the push, on the cached value
+  switch (state) {
+    case atos: push_ptr(R0_tos);                              break;
+    case btos:                                                // fall through
+    case ztos:                                                // fall through
+    case ctos:                                                // fall through
+    case stos:                                                // fall through
+    case itos: push_i(R0_tos);                                break;
+#ifdef AARCH64
+    case ltos: push_l(R0_tos);                                break;
+#else
+    case ltos: push_l(R0_tos_lo, R1_tos_hi);                  break;
+#endif // AARCH64
+#ifdef __SOFTFP__
+    case ftos: push_i(R0_tos);                                break;  // soft-float: pushed like an int
+    case dtos: push_l(R0_tos_lo, R1_tos_hi);                  break;  // soft-float: pushed like a long
+#else
+    case ftos: push_f();                                      break;
+    case dtos: push_d();                                      break;
+#endif // __SOFTFP__
+    case vtos: /* nothing to do */                            break;
+    default  : ShouldNotReachHere();
+  }
+}
+
+
+#ifndef AARCH64
+
+// Moves a float/double return value from R0/R1 (interpreter calling conventions) into the TOS cache register.
+void InterpreterMacroAssembler::convert_retval_to_tos(TosState state) {
+#if (!defined __SOFTFP__ && !defined __ABI_HARD__)
+  // Only this configuration needs a move: the result arrives in R0/R1,
+  // while the templates expect ftos in S0 and dtos in D0.
+  switch (state) {
+    case ftos: fmsr(S0_tos, R0);        break;
+    case dtos: fmdrr(D0_tos, R0, R1);   break;
+    default  : /* nothing to convert */ break;
+  }
+#endif // !__SOFTFP__ && !__ABI_HARD__
+}
+
+// Moves a float/double TOS cached value into R0/R1, where interpreter calling conventions expect the result.
+void InterpreterMacroAssembler::convert_tos_to_retval(TosState state) {
+#if (!defined __SOFTFP__ && !defined __ABI_HARD__)
+  // Only this configuration needs a move: the result has to be returned
+  // in R0/R1, while ftos is cached in S0 and dtos in D0.
+  switch (state) {
+    case ftos: fmrs(R0, S0_tos);        break;
+    case dtos: fmrrd(R0, R1, D0_tos);   break;
+    default  : /* nothing to convert */ break;
+  }
+#endif // !__SOFTFP__ && !__ABI_HARD__
+}
+
+#endif // !AARCH64
+
+
+// Helpers for swap and dup
+void InterpreterMacroAssembler::load_ptr(int n, Register val) {
+  ldr(val, Address(Rstack_top, Interpreter::expr_offset_in_bytes(n)));  // val := expression stack entry n; Rstack_top is not modified
+}
+
+void InterpreterMacroAssembler::store_ptr(int n, Register val) {
+  str(val, Address(Rstack_top, Interpreter::expr_offset_in_bytes(n)));  // expression stack entry n := val; Rstack_top is not modified
+}
+
+
+void InterpreterMacroAssembler::prepare_to_jump_from_interpreted() {  // sets Rsender_sp (and Rparams on AArch64) before a call
+#ifdef AARCH64
+  check_no_cached_stack_top(Rtemp);
+  save_stack_top();
+  cut_sp_before_call();
+  mov(Rparams, Rstack_top);  // Rparams := current expression stack top
+#endif // AARCH64
+
+  // set sender sp
+  mov(Rsender_sp, SP);
+
+#ifndef AARCH64
+  // record last_sp (the same value as Rsender_sp) in the interpreter frame
+  str(Rsender_sp, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+#endif // !AARCH64
+}
+
+// Jump to from_interpreted entry of a call unless single stepping is possible
+// in this thread in which case we must call the i2i entry. Uses Rtemp as scratch.
+void InterpreterMacroAssembler::jump_from_interpreted(Register method) {
+  assert_different_registers(method, Rtemp);
+
+  prepare_to_jump_from_interpreted();
+
+  if (can_post_interpreter_events()) {
+    // JVMTI events, such as single-stepping, are implemented partly by avoiding running
+    // compiled code in threads for which the event is enabled.  Check here for
+    // interp_only_mode if these events CAN be enabled.
+
+    ldr_s32(Rtemp, Address(Rthread, JavaThread::interp_only_mode_offset()));
+#ifdef AARCH64
+    {
+      Label not_interp_only_mode;
+
+      cbz(Rtemp, not_interp_only_mode);
+      indirect_jump(Address(method, Method::interpreter_entry_offset()), Rtemp);
+
+      bind(not_interp_only_mode);
+    }
+#else
+    cmp(Rtemp, 0);
+    ldr(PC, Address(method, Method::interpreter_entry_offset()), ne);  // conditional load into PC: taken when interp_only_mode is non-zero
+#endif // AARCH64
+  }
+  // otherwise jump through the method's from_interpreted entry
+  indirect_jump(Address(method, Method::from_interpreted_offset()), Rtemp);
+}
+
+
+void InterpreterMacroAssembler::restore_dispatch() {
+  mov_slow(RdispatchTable, (address)Interpreter::dispatch_table(vtos));  // RdispatchTable := address of the vtos dispatch table
+}
+
+
+// The following two routines provide a hook so that an implementation
+// can schedule the dispatch in two parts. On ARM the prolog emits nothing.
+void InterpreterMacroAssembler::dispatch_prolog(TosState state, int step) {
+  // Nothing ARM-specific to be done here; state and step are unused.
+}
+
+void InterpreterMacroAssembler::dispatch_epilog(TosState state, int step) {
+  dispatch_next(state, step);  // the whole dispatch is emitted here
+}
+
+void InterpreterMacroAssembler::dispatch_base(TosState state,
+                                              DispatchTableMode table_mode,
+                                              bool verifyoop) {  // jumps to the handler for R3_bytecode; uses Rtemp as scratch
+  if (VerifyActivationFrameSize) {
+    Label L;
+#ifdef AARCH64
+    mov(Rtemp, SP);
+    sub(Rtemp, FP, Rtemp);
+#else
+    sub(Rtemp, FP, SP);
+#endif // AARCH64
+    int min_frame_size = (frame::link_offset - frame::interpreter_frame_initial_sp_offset) * wordSize;
+    cmp(Rtemp, min_frame_size);  // Rtemp = FP - SP must be at least min_frame_size
+    b(L, ge);
+    stop("broken stack frame");
+    bind(L);
+  }
+
+  if (verifyoop) {
+    interp_verify_oop(R0_tos, state, __FILE__, __LINE__);
+  }
+  // integer-like states only
+  if((state == itos) || (state == btos) || (state == ztos) || (state == ctos) || (state == stos)) {
+    zap_high_non_significant_bits(R0_tos);
+  }
+  // debug builds: check that RdispatchTable still holds the vtos dispatch table
+#ifdef ASSERT
+  Label L;
+  mov_slow(Rtemp, (address)Interpreter::dispatch_table(vtos));
+  cmp(Rtemp, RdispatchTable);
+  b(L, eq);
+  stop("invalid RdispatchTable");
+  bind(L);
+#endif
+  // DispatchDefault goes through RdispatchTable, DispatchNormal through the normal table for state
+  if (table_mode == DispatchDefault) {
+    if (state == vtos) {
+      indirect_jump(Address::indexed_ptr(RdispatchTable, R3_bytecode), Rtemp);
+    } else {
+#ifdef AARCH64
+      sub(Rtemp, R3_bytecode, (Interpreter::distance_from_dispatch_table(vtos) -
+                           Interpreter::distance_from_dispatch_table(state)));
+      indirect_jump(Address::indexed_ptr(RdispatchTable, Rtemp), Rtemp);
+#else
+      // on 32-bit ARM this method is faster than the one above.
+      sub(Rtemp, RdispatchTable, (Interpreter::distance_from_dispatch_table(vtos) -
+                           Interpreter::distance_from_dispatch_table(state)) * wordSize);
+      indirect_jump(Address::indexed_ptr(Rtemp, R3_bytecode), Rtemp);
+#endif
+    }
+  } else {
+    assert(table_mode == DispatchNormal, "invalid dispatch table mode");
+    address table = (address) Interpreter::normal_table(state);
+    mov_slow(Rtemp, table);
+    indirect_jump(Address::indexed_ptr(Rtemp, R3_bytecode), Rtemp);
+  }
+
+  nop(); // to avoid filling CPU pipeline with invalid instructions
+  nop();
+}
+
+void InterpreterMacroAssembler::dispatch_only(TosState state) {
+  dispatch_base(state, DispatchDefault);  // dispatch on R3_bytecode as is (Rbcp is not advanced), default table mode
+}
+
+
+void InterpreterMacroAssembler::dispatch_only_normal(TosState state) {
+  dispatch_base(state, DispatchNormal);  // same as dispatch_only, but through Interpreter::normal_table(state)
+}
+
+void InterpreterMacroAssembler::dispatch_only_noverify(TosState state) {
+  dispatch_base(state, DispatchNormal, false);  // normal table, with the oop verification of R0_tos switched off
+}
+
+void InterpreterMacroAssembler::dispatch_next(TosState state, int step) {
+  // advance Rbcp by step, then load the bytecode at the new Rbcp (pre-indexed addressing)
+  ldrb(R3_bytecode, Address(Rbcp, step, pre_indexed));
+  dispatch_base(state, DispatchDefault);
+}
+
+// Masks the integer in result down to the method's declared (narrower) return type. Blows R2.
+void InterpreterMacroAssembler::narrow(Register result) {
+  const Register Rresult_type = R2;
+  Label check_byte, check_char, is_short, finished;
+  // Rresult_type := result type byte read through Rmethod's ConstMethod
+  ldr(Rresult_type, Address(Rmethod, Method::const_offset()));
+  ldrb(Rresult_type, Address(Rresult_type, ConstMethod::result_type_offset()));
+
+  // T_INT: the value is already in its final form
+  cmp(Rresult_type, T_INT);
+  b(finished, eq);
+
+  // T_BOOLEAN: keep only bit 0
+  cmp(Rresult_type, T_BOOLEAN);
+  b(check_byte, ne);
+  and_32(result, result, 1);
+  b(finished);
+
+  // T_BYTE: sign-extend from 8 bits
+  bind(check_byte);
+  cmp(Rresult_type, T_BYTE);
+  b(check_char, ne);
+  sign_extend(result, result, 8);
+  b(finished);
+
+  // T_CHAR: zero-extend from 16 bits
+  bind(check_char);
+  cmp(Rresult_type, T_CHAR);
+  b(is_short, ne);
+  zero_extend(result, result, 16);
+  b(finished);
+
+  // Anything else is treated as T_SHORT without a further compare: sign-extend from 16 bits
+  bind(is_short);
+  sign_extend(result, result, 16);
+  bind(finished);
+}
+
+// remove activation
+//
+// Unlock the receiver if this is a synchronized method.
+// Unlock any Java monitors from synchronized blocks.
+// Remove the activation from the stack.
+//
+// If there are locked Java monitors
+//    If throw_monitor_exception
+//       throws IllegalMonitorStateException
+//    Else if install_monitor_exception
+//       installs IllegalMonitorStateException
+//    Else
+//       no error processing
+void InterpreterMacroAssembler::remove_activation(TosState state, Register ret_addr,
+                                                  bool throw_monitor_exception,
+                                                  bool install_monitor_exception,
+                                                  bool notify_jvmdi) {
+  Label unlock, unlocked, no_unlock;
+
+  // Note: Registers R0, R1, S0 and D0 (TOS cached value) may be in use for the result.
+
+  const Address do_not_unlock_if_synchronized(Rthread,
+                         JavaThread::do_not_unlock_if_synchronized_offset());
+
+  const Register Rflag = R2;
+  const Register Raccess_flags = R3;
+
+  restore_method();
+
+  ldrb(Rflag, do_not_unlock_if_synchronized);
+
+  // get method access flags
+  ldr_u32(Raccess_flags, Address(Rmethod, Method::access_flags_offset()));
+
+  strb(zero_register(Rtemp), do_not_unlock_if_synchronized); // reset the flag
+
+  // check if method is synchronized
+
+  tbz(Raccess_flags, JVM_ACC_SYNCHRONIZED_BIT, unlocked);
+
+  // Don't unlock anything if the _do_not_unlock_if_synchronized flag is set.
+  cbnz(Rflag, no_unlock);
+
+  // unlock monitor
+  push(state);                                   // save result
+
+  // BasicObjectLock will be first in list, since this is a synchronized method. However, need
+  // to check that the object has not been unlocked by an explicit monitorexit bytecode.
+
+  const Register Rmonitor = R1;                  // fixed in unlock_object()
+  const Register Robj = R2;
+
+  // address of first monitor
+  sub(Rmonitor, FP, - frame::interpreter_frame_monitor_block_bottom_offset * wordSize + (int)sizeof(BasicObjectLock));
+
+  ldr(Robj, Address(Rmonitor, BasicObjectLock::obj_offset_in_bytes()));
+  cbnz(Robj, unlock);                            // non-null obj: entry still in use, go unlock it
+
+  pop(state);
+
+  if (throw_monitor_exception) {
+    // Entry already unlocked, need to throw exception
+    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_illegal_monitor_state_exception));
+    should_not_reach_here();
+  } else {
+    // Monitor already unlocked during a stack unroll.
+    // If requested, install an illegal_monitor_state_exception.
+    // Continue with stack unrolling.
+    if (install_monitor_exception) {
+      call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::new_illegal_monitor_state_exception));
+    }
+    b(unlocked);
+  }
+
+
+  // Exception case for the check that all monitors are unlocked.
+  const Register Rcur = R2;
+  Label restart_check_monitors_unlocked, exception_monitor_is_still_locked;
+
+  bind(exception_monitor_is_still_locked);
+  // Monitor entry is still locked, need to throw exception.
+  // Rcur: monitor entry.
+
+  if (throw_monitor_exception) {
+    // Throw exception
+    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_illegal_monitor_state_exception));
+    should_not_reach_here();
+  } else {
+    // Stack unrolling. Unlock object and install illegal_monitor_exception
+    // Unlock does not block, so don't have to worry about the frame
+
+    push(state);
+    mov(R1, Rcur);
+    unlock_object(R1);
+
+    if (install_monitor_exception) {
+      call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::new_illegal_monitor_state_exception));
+    }
+
+    pop(state);
+    b(restart_check_monitors_unlocked);
+  }
+
+  bind(unlock);
+  unlock_object(Rmonitor);
+  pop(state);
+
+  // Check for block-structured locking (i.e., that all locked objects have been unlocked)
+  bind(unlocked);
+
+  // Check that all monitors are unlocked
+  {
+    Label loop;
+
+    const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;
+    const Register Rbottom = R3;
+    const Register Rcur_obj = Rtemp;
+
+    bind(restart_check_monitors_unlocked);
+
+    ldr(Rcur, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+                                 // points to current entry, starting with top-most entry
+    sub(Rbottom, FP, -frame::interpreter_frame_monitor_block_bottom_offset * wordSize);
+                                 // points to word before bottom of monitor block
+
+    cmp(Rcur, Rbottom);          // check if there are no monitors
+#ifndef AARCH64
+    ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()), ne);
+                                 // prefetch monitor's object
+#endif // !AARCH64
+    b(no_unlock, eq);
+
+    bind(loop);
+#ifdef AARCH64
+    ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()));
+#endif // AARCH64
+    // check if current entry is used
+    cbnz(Rcur_obj, exception_monitor_is_still_locked);
+
+    add(Rcur, Rcur, entry_size);      // otherwise advance to next entry
+    cmp(Rcur, Rbottom);               // check if bottom reached
+#ifndef AARCH64
+    ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()), ne);
+                                      // prefetch monitor's object
+#endif // !AARCH64
+    b(loop, ne);                      // if not at bottom then check this entry
+  }
+
+  bind(no_unlock);
+
+  // jvmti support
+  if (notify_jvmdi) {
+    notify_method_exit(state, NotifyJVMTI);     // preserve TOSCA
+  } else {
+    notify_method_exit(state, SkipNotifyJVMTI); // preserve TOSCA
+  }
+
+  // remove activation: reload FP and LR from the frame and set SP to the saved sender sp
+#ifdef AARCH64
+  ldr(Rtemp, Address(FP, frame::interpreter_frame_sender_sp_offset * wordSize));
+  ldp(FP, LR, Address(FP));
+  mov(SP, Rtemp);
+#else
+  mov(Rtemp, FP);
+  ldmia(FP, RegisterSet(FP) | RegisterSet(LR));
+  ldr(SP, Address(Rtemp, frame::interpreter_frame_sender_sp_offset * wordSize));
+#endif
+  // hand the reloaded return address back in ret_addr
+  if (ret_addr != LR) {
+    mov(ret_addr, LR);
+  }
+}
+
+
+// Emits a byte store that sets or clears the thread-local variable
+// _do_not_unlock_if_synchronized. At certain points in the method invocation
+// the monitor of a synchronized method hasn't been entered yet; the flag is
+// set there so that exceptions are handled correctly, and remove_activation
+// checks it. 'tmp' is used as scratch for the value being stored.
+void InterpreterMacroAssembler::set_do_not_unlock_if_synchronized(bool flag, Register tmp) {
+  const Address flag_addr(Rthread, JavaThread::do_not_unlock_if_synchronized_offset());
+  if (!flag) {
+    strb(zero_register(tmp), flag_addr);
+    return;
+  }
+  // Setting: materialize 1 in tmp and store its low byte.
+  mov(tmp, 1);
+  strb(tmp, flag_addr);
+}
+
+// Lock object: emits the monitor-enter sequence (CAS fast path, recursive
+// stack-lock check, and a call to InterpreterRuntime::monitorenter otherwise).
+// Argument: R1 : Points to BasicObjectLock to be used for locking.
+// Must be initialized with object to lock.
+// Blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR. Calls VM.
+void InterpreterMacroAssembler::lock_object(Register Rlock) {
+  assert(Rlock == R1, "the second argument");
+
+  if (UseHeavyMonitors) {
+    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter), Rlock);
+  } else {
+    Label done;
+
+    const Register Robj = R2;
+    const Register Rmark = R3;
+    assert_different_registers(Robj, Rmark, Rlock, R0, Rtemp);
+
+    const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
+    const int lock_offset = BasicObjectLock::lock_offset_in_bytes ();
+    const int mark_offset = lock_offset + BasicLock::displaced_header_offset_in_bytes();
+
+    Label already_locked, slow_case;
+
+    // Load object pointer
+    ldr(Robj, Address(Rlock, obj_offset));
+
+    if (UseBiasedLocking) {
+      biased_locking_enter(Robj, Rmark/*scratched*/, R0, false, Rtemp, done, slow_case);
+    }
+
+#ifdef AARCH64
+    assert(oopDesc::mark_offset_in_bytes() == 0, "must be");
+    ldr(Rmark, Robj);
+
+    // Test if object is already locked (unlocked bit clear => branch)
+    assert(markOopDesc::unlocked_value == 1, "adjust this code");
+    tbz(Rmark, exact_log2(markOopDesc::unlocked_value), already_locked);
+
+#else // AARCH64
+
+    // On MP platforms the next load could return a 'stale' value if the memory location has been modified by another thread.
+    // That would be acceptable as either CAS or slow case path is taken in that case.
+    // Exception to that is if the object is locked by the calling thread, then the recursive test will pass (guaranteed as
+    // loads are satisfied from a store queue if performed on the same processor).
+
+    assert(oopDesc::mark_offset_in_bytes() == 0, "must be");
+    ldr(Rmark, Address(Robj, oopDesc::mark_offset_in_bytes()));
+
+    // Test if object is already locked (unlocked bit clear => 'eq' => branch)
+    tst(Rmark, markOopDesc::unlocked_value);
+    b(already_locked, eq);
+
+#endif // !AARCH64
+    // Save old object->mark() into BasicLock's displaced header
+    str(Rmark, Address(Rlock, mark_offset));
+
+    cas_for_lock_acquire(Rmark, Rlock, Robj, Rtemp, slow_case);
+
+#ifndef PRODUCT
+    if (PrintBiasedLockingStatistics) {
+      cond_atomic_inc32(al, BiasedLocking::fast_path_entry_count_addr());
+    }
+#endif //!PRODUCT
+
+    b(done);
+
+    // If we got here that means the object is locked by either calling thread or another thread.
+    bind(already_locked);
+    // Handling of locked objects: recursive locks and slow case.
+
+    // Fast check for recursive lock.
+    //
+    // Can apply the optimization only if this is a stack lock
+    // allocated in this thread. For efficiency, we can focus on
+    // recently allocated stack locks (instead of reading the stack
+    // base and checking whether 'mark' points inside the current
+    // thread stack):
+    //  1) (mark & 3) == 0
+    //  2) SP <= mark < SP + os::pagesize()
+    //
+    // Warning: SP + os::pagesize can overflow the stack base. We must
+    // neither apply the optimization for an inflated lock allocated
+    // just above the thread stack (this is why condition 1 matters)
+    // nor apply the optimization if the stack lock is inside the stack
+    // of another thread. The latter is avoided even in case of overflow
+    // because we have guard pages at the end of all stacks. Hence, if
+    // we go over the stack base and hit the stack of another thread,
+    // this should not be in a writeable area that could contain a
+    // stack lock allocated by that thread. As a consequence, a stack
+    // lock less than page size away from SP is guaranteed to be
+    // owned by the current thread.
+    //
+    // Note: assuming SP is aligned, we can check the low bits of
+    // (mark-SP) instead of the low bits of mark. In that case,
+    // assuming page size is a power of 2, we can merge the two
+    // conditions into a single test:
+    // => ((mark - SP) & (3 - os::pagesize())) == 0
+
+#ifdef AARCH64
+    // Use the single check since the immediate is OK for AARCH64
+    sub(R0, Rmark, Rstack_top);
+    intptr_t mask = ((intptr_t)3) - ((intptr_t)os::vm_page_size());
+    Assembler::LogicalImmediate imm(mask, false);
+    ands(R0, R0, imm);
+
+    // For recursive case store 0 into lock record.
+    // It is harmless to store it unconditionally as lock record contains some garbage
+    // value in its _displaced_header field by this moment.
+    str(ZR, Address(Rlock, mark_offset));
+
+#else // AARCH64
+    // (3 - os::pagesize()) cannot be encoded as an ARM immediate operand.
+    // Check independently the low bits and the distance to SP.
+    // -1- test low 2 bits ('eq' iff both are zero)
+    movs(R0, AsmOperand(Rmark, lsl, 30));
+    // -2- test (mark - SP) if the low two bits are 0; 'eq' survives only if the distance is below one page
+    sub(R0, Rmark, SP, eq);
+    movs(R0, AsmOperand(R0, lsr, exact_log2(os::vm_page_size())), eq);
+    // If still 'eq' then recursive locking OK: store 0 into lock record (R0 is zero here)
+    str(R0, Address(Rlock, mark_offset), eq);
+
+#endif // AARCH64
+
+#ifndef PRODUCT
+    if (PrintBiasedLockingStatistics) {
+      cond_atomic_inc32(eq, BiasedLocking::fast_path_entry_count_addr());
+    }
+#endif // !PRODUCT
+
+    b(done, eq);
+
+    bind(slow_case);
+
+    // Call the runtime routine for slow case
+    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter), Rlock);
+
+    bind(done);
+  }
+}
+
+
+// Unlocks an object. Used in monitorexit bytecode and remove_activation.
+// Emits: free the entry, recursion test, CAS release, else call InterpreterRuntime::monitorexit.
+// Argument: R1: Points to BasicObjectLock structure for lock
+// Throws IllegalMonitorStateException if object is not locked by current thread (NOTE(review): presumably raised inside monitorexit; not visible here)
+// Blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR. Calls VM.
+void InterpreterMacroAssembler::unlock_object(Register Rlock) {
+  assert(Rlock == R1, "the second argument");
+
+  if (UseHeavyMonitors) {
+    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), Rlock);
+  } else {
+    Label done, slow_case;
+
+    const Register Robj = R2;
+    const Register Rmark = R3;
+    const Register Rresult = R0; // NOTE(review): not referenced anywhere in this function
+    assert_different_registers(Robj, Rmark, Rlock, R0, Rtemp);
+
+    const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
+    const int lock_offset = BasicObjectLock::lock_offset_in_bytes ();
+    const int mark_offset = lock_offset + BasicLock::displaced_header_offset_in_bytes();
+
+    const Register Rzero = zero_register(Rtemp);
+
+    // Load oop into Robj
+    ldr(Robj, Address(Rlock, obj_offset));
+
+    // Free entry (the obj field is restored below if the slow case is taken)
+    str(Rzero, Address(Rlock, obj_offset));
+
+    if (UseBiasedLocking) {
+      biased_locking_exit(Robj, Rmark, done);
+    }
+
+    // Load the old header from BasicLock structure
+    ldr(Rmark, Address(Rlock, mark_offset));
+
+    // Test for recursion (zero mark in BasicLock)
+    cbz(Rmark, done);
+
+    bool allow_fallthrough_on_failure = true;
+
+    cas_for_lock_release(Rlock, Rmark, Robj, Rtemp, slow_case, allow_fallthrough_on_failure);
+
+    b(done, eq); // presumably 'eq' means the CAS succeeded; otherwise fall into slow_case -- TODO confirm
+
+    bind(slow_case);
+
+    // Call the runtime routine for slow case.
+    str(Robj, Address(Rlock, obj_offset)); // restore obj
+    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), Rlock);
+
+    bind(done);
+  }
+}
+
+// Loads the mdp saved in the interpreter frame into 'mdp'; if it is null, continues at 'zero_continue'.
+void InterpreterMacroAssembler::test_method_data_pointer(Register mdp, Label& zero_continue) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  const Address saved_mdp(FP, frame::interpreter_frame_mdp_offset * wordSize);
+  ldr(mdp, saved_mdp);
+  cbz(mdp, zero_continue);
+}
+
+
+// Set the method data pointer for the current bcp (stores null when the method has no MDO).
+// Blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR.
+void InterpreterMacroAssembler::set_method_data_pointer_for_bcp() {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  Label set_mdp;
+
+  // Test MDO to avoid the call if it is NULL; Rtemp (== 0) is then stored as the mdp.
+  ldr(Rtemp, Address(Rmethod, Method::method_data_offset()));
+  cbz(Rtemp, set_mdp);
+
+  mov(R0, Rmethod);
+  mov(R1, Rbcp);
+  call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::bcp_to_di), R0, R1);
+  // R0/W0: mdi
+
+  // mdo is guaranteed to be non-zero here, we checked for it before the call.
+  ldr(Rtemp, Address(Rmethod, Method::method_data_offset())); // reloaded after the call
+  add(Rtemp, Rtemp, in_bytes(MethodData::data_offset()));
+  add_ptr_scaled_int32(Rtemp, Rtemp, R0, 0); // presumably Rtemp += (int32)R0 << 0, i.e. adds mdi -- TODO confirm
+
+  bind(set_mdp);
+  str(Rtemp, Address(FP, frame::interpreter_frame_mdp_offset * wordSize));
+}
+
+// ASSERT builds only: checks the frame's mdp against Rbcp and calls InterpreterRuntime::verify_mdp on mismatch.
+void InterpreterMacroAssembler::verify_method_data_pointer() {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+#ifdef ASSERT
+  Label verify_continue;
+  save_caller_save_registers();
+
+  const Register Rmdp = R2;
+  test_method_data_pointer(Rmdp, verify_continue); // If mdp is zero, continue
+
+  // If the mdp is valid, it will point to a DataLayout header which is
+  // consistent with the bcp.  The converse is highly probable also.
+
+  ldrh(R3, Address(Rmdp, DataLayout::bci_offset()));      // R3 = bci recorded in the DataLayout
+  ldr(Rtemp, Address(Rmethod, Method::const_offset()));   // Rtemp = value at Method::const_offset()
+  add(R3, R3, Rtemp);
+  add(R3, R3, in_bytes(ConstMethod::codes_offset()));     // R3 = Rtemp + codes_offset + bci
+  cmp(R3, Rbcp);
+  b(verify_continue, eq);                                 // consistent: skip the runtime check
+
+  mov(R0, Rmethod);
+  mov(R1, Rbcp);
+  call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::verify_mdp), R0, R1, Rmdp);
+
+  bind(verify_continue);
+  restore_caller_save_registers();
+#endif // ASSERT
+}
+
+// Emits a store of 'value' to the profile data cell at mdp_in + offset.
+void InterpreterMacroAssembler::set_mdp_data_at(Register mdp_in, int offset, Register value) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  assert_different_registers(mdp_in, value);
+  str(value, Address(mdp_in, offset));
+}
+
+
+// Register-based overload: bumps the profile counter at mdp_in + offset by
+// delegating to the Address-based increment_mdp_data_at.
+// Sets bumped_count register to adjusted counter.
+void InterpreterMacroAssembler::increment_mdp_data_at(Register mdp_in, int offset,
+                                                      Register bumped_count, bool decrement) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  assert_different_registers(mdp_in, bumped_count);
+
+  // The Address overload emits the load / adjust / store sequence.
+  const Address counter_addr(mdp_in, offset);
+
+  increment_mdp_data_at(counter_addr, bumped_count, decrement);
+}
+
+void InterpreterMacroAssembler::set_mdp_flag_at(Register mdp_in, int flag_byte_constant) {
+  assert_different_registers(mdp_in, Rtemp);
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  assert((0 < flag_byte_constant) && (flag_byte_constant < (1 << BitsPerByte)), "flag mask is out of range");
+  // Set the flag: read-modify-write of the DataLayout flags byte at mdp_in. Blows Rtemp.
+  const Address flags_byte(mdp_in, in_bytes(DataLayout::flags_offset()));
+  ldrb(Rtemp, flags_byte);
+  orr(Rtemp, Rtemp, (unsigned)flag_byte_constant);
+  strb(Rtemp, flags_byte);
+}
+
+
+// Increments (or, if 'decrement', decrements) the counter at 'data'. Sets bumped_count register to adjusted counter.
+void InterpreterMacroAssembler::increment_mdp_data_at(Address data,
+                                                      Register bumped_count,
+                                                      bool decrement) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+
+  ldr(bumped_count, data);
+  if (decrement) {
+    // Decrement the register. Set condition codes.
+    subs(bumped_count, bumped_count, DataLayout::counter_increment);
+    // Avoid overflow: if the result is not negative ('pl') the increment is added back,
+    // so the decrement only sticks when the result is negative.
+#ifdef AARCH64
+    assert(DataLayout::counter_increment == 1, "required for cinc");
+    cinc(bumped_count, bumped_count, pl);
+#else
+    add(bumped_count, bumped_count, DataLayout::counter_increment, pl);
+#endif // AARCH64
+  } else {
+    // Increment the register. Set condition codes.
+    adds(bumped_count, bumped_count, DataLayout::counter_increment);
+    // Avoid overflow: if the result became negative ('mi') it is undone.
+#ifdef AARCH64
+    assert(DataLayout::counter_increment == 1, "required for cinv");
+    cinv(bumped_count, bumped_count, mi); // inverts 0x80..00 back to 0x7f..ff
+#else
+    sub(bumped_count, bumped_count, DataLayout::counter_increment, mi);
+#endif // AARCH64
+  }
+  str(bumped_count, data);
+}
+
+
+// Loads the profile cell at mdp_in + offset into test_value_out and branches
+// to not_equal_continue when it differs from 'value'.
+void InterpreterMacroAssembler::test_mdp_data_at(Register mdp_in, int offset, Register value,
+                                                 Register test_value_out,
+                                                 Label& not_equal_continue) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  assert_different_registers(mdp_in, test_value_out, value);
+
+  const Address cell(mdp_in, offset);
+  ldr(test_value_out, cell);
+  cmp(test_value_out, value);
+  b(not_equal_continue, ne);   // falls through on a match
+}
+
+// Adds the displacement stored at mdp_in + offset_of_disp to mdp_in and saves the new mdp in the frame. Blows reg_temp.
+void InterpreterMacroAssembler::update_mdp_by_offset(Register mdp_in, int offset_of_disp, Register reg_temp) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  assert_different_registers(mdp_in, reg_temp);
+  const Address frame_mdp_slot(FP, frame::interpreter_frame_mdp_offset * wordSize);
+  ldr(reg_temp, Address(mdp_in, offset_of_disp));
+  add(mdp_in, mdp_in, reg_temp);
+  str(mdp_in, frame_mdp_slot);
+}
+
+// Same as the int-offset overload, but the displacement is read from mdp_in + reg_offset. Blows reg_tmp.
+void InterpreterMacroAssembler::update_mdp_by_offset(Register mdp_in, Register reg_offset, Register reg_tmp) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  assert_different_registers(mdp_in, reg_offset, reg_tmp);
+  const Address frame_mdp_slot(FP, frame::interpreter_frame_mdp_offset * wordSize);
+  ldr(reg_tmp, Address(mdp_in, reg_offset));
+  add(mdp_in, mdp_in, reg_tmp);
+  str(mdp_in, frame_mdp_slot);
+}
+
+// Adds 'constant' to mdp_in and saves the new mdp in its interpreter frame slot.
+void InterpreterMacroAssembler::update_mdp_by_constant(Register mdp_in, int constant) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  add(mdp_in, mdp_in, constant);
+  str(mdp_in, Address(FP, frame::interpreter_frame_mdp_offset * wordSize));
+}
+
+
+// Calls InterpreterRuntime::update_mdp_for_ret with return_bci. Blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR.
+void InterpreterMacroAssembler::update_mdp_for_ret(Register return_bci) {
+  assert(ProfileInterpreter, "must be profiling interpreter");
+  assert_different_registers(return_bci, R0, R1, R2, R3, Rtemp);
+
+  mov(R1, return_bci); // the argument is passed to the VM call in R1
+  call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::update_mdp_for_ret), R1);
+}
+
+
+// Profiles a taken branch: bumps the JumpData taken counter and advances mdp by
+// the recorded displacement. Sets mdp, bumped_count registers, blows Rtemp.
+void InterpreterMacroAssembler::profile_taken_branch(Register mdp, Register bumped_count) {
+  assert_different_registers(mdp, bumped_count);
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  const int taken_off = in_bytes(JumpData::taken_offset());
+  const int disp_off  = in_bytes(JumpData::displacement_offset());
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  // We are taking a branch: count it (bumped_count receives the new value).
+  increment_mdp_data_at(mdp, taken_off, bumped_count);
+
+  // The method data pointer needs to be updated to reflect the new target.
+  update_mdp_by_offset(mdp, disp_off, Rtemp);
+  bind(no_mdo);
+}
+
+
+// Profiles a branch that was not taken: bumps the BranchData not-taken counter
+// and steps mdp past this BranchData record. Sets mdp, blows Rtemp.
+void InterpreterMacroAssembler::profile_not_taken_branch(Register mdp) {
+  assert_different_registers(mdp, Rtemp);
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  const int not_taken_off = in_bytes(BranchData::not_taken_offset());
+  const int record_size   = in_bytes(BranchData::branch_data_size());
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  // The branch is not taken: increment the not taken count.
+  increment_mdp_data_at(mdp, not_taken_off, Rtemp);
+  // The method data pointer needs to be updated to correspond to the next bytecode
+  update_mdp_by_constant(mdp, record_size);
+  bind(no_mdo);
+}
+
+
+// Profiles a call: bumps the CounterData count and steps mdp past the
+// CounterData record. Sets mdp, blows Rtemp.
+void InterpreterMacroAssembler::profile_call(Register mdp) {
+  assert_different_registers(mdp, Rtemp);
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  const int count_off   = in_bytes(CounterData::count_offset());
+  const int record_size = in_bytes(CounterData::counter_data_size());
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  // We are making a call: increment the count.
+  increment_mdp_data_at(mdp, count_off, Rtemp);
+  // Step mdp over this CounterData record.
+  update_mdp_by_constant(mdp, record_size);
+  bind(no_mdo);
+}
+
+
+// Profiles a final call: bumps the CounterData count and steps mdp past a
+// VirtualCallData-sized record. Sets mdp, blows Rtemp.
+void InterpreterMacroAssembler::profile_final_call(Register mdp) {
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  const int count_off   = in_bytes(CounterData::count_offset());
+  const int record_size = in_bytes(VirtualCallData::virtual_call_data_size());
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+  // We are making a call: increment the count.
+  increment_mdp_data_at(mdp, count_off, Rtemp);
+  // Step mdp over the record, which has the size of a VirtualCallData.
+  update_mdp_by_constant(mdp, record_size);
+  bind(no_mdo);
+}
+
+
+// Profiles a virtual call: records 'receiver' via record_klass_in_profile (or,
+// when receiver_can_be_null and the receiver is null, only bumps the total
+// count) and steps mdp past the VirtualCallData. Sets mdp, blows Rtemp.
+void InterpreterMacroAssembler::profile_virtual_call(Register mdp, Register receiver, bool receiver_can_be_null) {
+  assert_different_registers(mdp, receiver, Rtemp);
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  Label no_mdo, receiver_done;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  if (receiver_can_be_null) {
+    Label have_receiver;
+    cbnz(receiver, have_receiver);
+    // We are making a call.  Increment the count for null receiver.
+    increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()), Rtemp);
+    b(receiver_done);
+    bind(have_receiver);
+  }
+
+  // Record the receiver type.
+  record_klass_in_profile(receiver, mdp, Rtemp, /* is_virtual_call */ true);
+  bind(receiver_done);
+
+  // Step mdp over this VirtualCallData record.
+  update_mdp_by_constant(mdp, in_bytes(VirtualCallData::virtual_call_data_size()));
+  bind(no_mdo);
+}
+
+// Emits the receiver-row search/update code for rows start_row..last_row; recurses (at code-generation time) for later rows.
+void InterpreterMacroAssembler::record_klass_in_profile_helper(
+                                        Register receiver, Register mdp,
+                                        Register reg_tmp,
+                                        int start_row, Label& done, bool is_virtual_call) {
+  if (TypeProfileWidth == 0)
+    return;
+
+  assert_different_registers(receiver, mdp, reg_tmp);
+
+  int last_row = VirtualCallData::row_limit() - 1;
+  assert(start_row <= last_row, "must be work left to do");
+  // Test this row for both the receiver and for null.
+  // Take any of three different outcomes:
+  //   1. found receiver => increment count and goto done
+  //   2. found null => keep looking for case 1, maybe allocate this cell
+  //   3. found something else => keep looking for cases 1 and 2
+  // Case 3 is handled by a recursive call.
+  for (int row = start_row; row <= last_row; row++) {
+    Label next_test;
+
+    // See if the receiver is receiver[n].
+    int recvr_offset = in_bytes(VirtualCallData::receiver_offset(row));
+
+    test_mdp_data_at(mdp, recvr_offset, receiver, reg_tmp, next_test);
+
+    // The receiver is receiver[n].  Increment count[n].
+    int count_offset = in_bytes(VirtualCallData::receiver_count_offset(row));
+    increment_mdp_data_at(mdp, count_offset, reg_tmp);
+    b(done);
+
+    bind(next_test);
+    // reg_tmp now contains the receiver from the CallData.
+
+    if (row == start_row) {
+      Label found_null;
+      // Failed the equality check on receiver[n]...  Test for null.
+      if (start_row == last_row) {
+        // The only thing left to do is handle the null case.
+        if (is_virtual_call) {
+          cbz(reg_tmp, found_null);
+          // Receiver did not match any saved receiver and there is no empty row for it.
+          // Increment total counter to indicate polymorphic case.
+          increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()), reg_tmp);
+          b(done);
+          bind(found_null);
+        } else {
+          cbnz(reg_tmp, done);
+        }
+        break;
+      }
+      // Since null is rare, make it be the branch-taken case.
+      cbz(reg_tmp, found_null);
+
+      // Put all the "Case 3" tests here.
+      record_klass_in_profile_helper(receiver, mdp, reg_tmp, start_row + 1, done, is_virtual_call);
+
+      // Found a null.  Keep searching for a matching receiver,
+      // but remember that this is an empty (unused) slot.
+      bind(found_null);
+    }
+  }
+
+  // In the fall-through case, we found no matching receiver, but we
+  // observed the receiver[start_row] is NULL.
+
+  // Fill in the receiver field and increment the count.
+  int recvr_offset = in_bytes(VirtualCallData::receiver_offset(start_row));
+  set_mdp_data_at(mdp, recvr_offset, receiver);
+  int count_offset = in_bytes(VirtualCallData::receiver_count_offset(start_row));
+  mov(reg_tmp, DataLayout::counter_increment); // a freshly claimed row starts with a count of one increment
+  set_mdp_data_at(mdp, count_offset, reg_tmp);
+  if (start_row > 0) { // the outermost expansion (start_row == 0) emits no branch and falls through
+    b(done);
+  }
+}
+
+// Records 'receiver' in the receiver rows of the profile data at mdp by
+// expanding record_klass_in_profile_helper starting at row 0. Blows reg_tmp.
+void InterpreterMacroAssembler::record_klass_in_profile(Register receiver, Register mdp,
+                                                        Register reg_tmp, bool is_virtual_call) {
+  assert(ProfileInterpreter, "must be profiling");
+  assert_different_registers(receiver, mdp, reg_tmp);
+
+  const int first_row = 0;
+  Label recorded;   // common exit for the paths generated by the helper
+
+  record_klass_in_profile_helper(receiver, mdp, reg_tmp, first_row, recorded, is_virtual_call);
+  bind(recorded);
+}
+
+// Profiles a 'ret': bumps the total count, then looks return_bci up in the
+// RetData rows. On a hit the row's count is bumped and mdp is advanced by the
+// row's displacement; on a miss update_mdp_for_ret() calls into the VM.
+// Sets mdp, blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR.
+void InterpreterMacroAssembler::profile_ret(Register mdp, Register return_bci) {
+  assert_different_registers(mdp, return_bci, Rtemp, R0, R1, R2, R3);
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  Label done;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, done);
+
+  // Update the total ret count.
+  increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()), Rtemp);
+
+  // One compare-and-update sequence is emitted per RetData row.
+  for (uint n = 0; n < RetData::row_limit(); n++) {
+    const int bci_off   = in_bytes(RetData::bci_offset(n));
+    const int count_off = in_bytes(RetData::bci_count_offset(n));
+    const int disp_off  = in_bytes(RetData::bci_displacement_offset(n));
+    Label row_mismatch;
+    // See if return_bci is equal to bci[n]; if not, try the next row.
+    test_mdp_data_at(mdp, bci_off, return_bci, Rtemp, row_mismatch);
+    // Hit: bump count[n] and advance mdp by displacement[n].
+    increment_mdp_data_at(mdp, count_off, Rtemp);
+    update_mdp_by_offset(mdp, disp_off, Rtemp);
+    b(done);
+    bind(row_mismatch);
+  }
+
+  // No row matched: let the runtime update the mdp.
+  update_mdp_for_ret(return_bci);
+  bind(done);
+}
+
+
+// Records that a null was seen: sets the null_seen flag in the profile record
+// at mdp and steps mdp past it. Sets mdp; Rtemp is blown by set_mdp_flag_at.
+void InterpreterMacroAssembler::profile_null_seen(Register mdp) {
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  // The record is VirtualCallData-sized when TypeProfileCasts is on and
+  // BitData-sized otherwise.
+  const int record_size = TypeProfileCasts
+      ? in_bytes(VirtualCallData::virtual_call_data_size())
+      : in_bytes(BitData::bit_data_size());
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  set_mdp_flag_at(mdp, BitData::null_seen_byte_constant());
+  // The method data pointer needs to be updated.
+  update_mdp_by_constant(mdp, record_size);
+  bind(no_mdo);
+}
+
+
+// Profiles a failed type check by decrementing the count of the record that
+// mdp has already been stepped over. Loads mdp, blows Rtemp.
+void InterpreterMacroAssembler::profile_typecheck_failed(Register mdp) {
+  assert_different_registers(mdp, Rtemp);
+  if (!ProfileInterpreter || !TypeProfileCasts) return;   // nothing to emit
+
+  // Back up the address, since we have already bumped the mdp: the counter
+  // lives one VirtualCallData record behind the current mdp.
+  const int prev_count_off = in_bytes(CounterData::count_offset())
+                           - in_bytes(VirtualCallData::virtual_call_data_size());
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  // *Decrement* the counter.  We expect to see zero or small negatives.
+  increment_mdp_data_at(mdp, prev_count_off, Rtemp, /* decrement */ true);
+
+  bind(no_mdo);
+}
+
+
+// Profiles a type check. When TypeProfileCasts is on, 'klass' is recorded in
+// the receiver rows of the record at mdp; in either case mdp is stepped past
+// the record. Sets mdp, blows Rtemp.
+void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass)
+{
+  assert_different_registers(mdp, klass, Rtemp);
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  if (TypeProfileCasts) {
+    // Record the object type, then step over the VirtualCallData-sized record.
+    record_klass_in_profile(klass, mdp, Rtemp, /* is_virtual_call */ false);
+    update_mdp_by_constant(mdp, in_bytes(VirtualCallData::virtual_call_data_size()));
+  } else {
+    // Nothing is recorded; only step over the BitData-sized record.
+    update_mdp_by_constant(mdp, in_bytes(BitData::bit_data_size()));
+  }
+
+  bind(no_mdo);
+}
+
+
+// Profiles the default case of a switch: bumps MultiBranchData's default count
+// and advances mdp by the default displacement. Sets mdp, blows Rtemp.
+void InterpreterMacroAssembler::profile_switch_default(Register mdp) {
+  assert_different_registers(mdp, Rtemp);
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  const int count_off = in_bytes(MultiBranchData::default_count_offset());
+  const int disp_off  = in_bytes(MultiBranchData::default_displacement_offset());
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  // Update the default case count
+  increment_mdp_data_at(mdp, count_off, Rtemp);
+  // Advance mdp by the displacement stored for the default case.
+  update_mdp_by_offset(mdp, disp_off, Rtemp);
+  bind(no_mdo);
+}
+
+
+// Profiles the switch case selected by 'index': bumps that case's count in the
+// MultiBranchData case array and advances mdp by that case's displacement.
+// Sets mdp. Blows reg_tmp1, reg_tmp2. Index could be the same as reg_tmp2.
+void InterpreterMacroAssembler::profile_switch_case(Register mdp, Register index, Register reg_tmp1, Register reg_tmp2) {
+  assert_different_registers(mdp, reg_tmp1, reg_tmp2);
+  assert_different_registers(mdp, reg_tmp1, index);
+  if (!ProfileInterpreter) return;   // nothing to emit when not profiling
+
+  // Byte offsets, relative to mdp, of case 0's count and displacement cells.
+  const int case_array    = in_bytes(MultiBranchData::case_array_offset());
+  const int first_count   = case_array + in_bytes(MultiBranchData::relative_count_offset());
+  const int first_disp    = case_array + in_bytes(MultiBranchData::relative_displacement_offset());
+  const int count_to_disp = first_disp - first_count;
+  const int case_shift    = exact_log2(in_bytes(MultiBranchData::per_case_size()));
+
+  Label no_mdo;
+
+  // Load mdp; skip the whole sequence if no method data exists.
+  test_method_data_pointer(mdp, no_mdo);
+
+  // reg_tmp1 = index * per_case_size + first_count: offset of this case's count.
+  logical_shift_left(reg_tmp1, index, case_shift);
+  add(reg_tmp1, reg_tmp1, first_count);
+  increment_mdp_data_at(Address(mdp, reg_tmp1), reg_tmp2);
+
+  // Re-point reg_tmp1 at the same case's displacement cell and advance mdp by it.
+  add(reg_tmp1, reg_tmp1, count_to_disp);
+  update_mdp_by_offset(mdp, reg_tmp1, reg_tmp2);
+
+  bind(no_mdo);
+}
+
+// Reverses the byte order of the 32-bit value in r. rtmp1/rtmp2 are only written on 32-bit ARM without 'rev'.
+void InterpreterMacroAssembler::byteswap_u32(Register r, Register rtmp1, Register rtmp2) {
+#ifdef AARCH64
+  rev_w(r, r);
+#else
+  if (VM_Version::supports_rev()) {
+    rev(r, r);
+  } else {
+    eor(rtmp1, r, AsmOperand(r, ror, 16));          // r = [A B C D] => rtmp1 = [A^C B^D C^A D^B]
+    mvn(rtmp2, 0x0000ff00);                         // rtmp2 = 0xffff00ff
+    andr(rtmp1, rtmp2, AsmOperand(rtmp1, lsr, 8));  // rtmp1 = [0 A^C 0 C^A]
+    eor(r, rtmp1, AsmOperand(r, ror, 8));           // [D A B C] ^ rtmp1 = [D C B A]
+  }
+#endif // AARCH64
+}
+
+// Emits a non-atomic 32-bit increment of the counter at address_of_counter + offset. Blows tmp1 and tmp2.
+void InterpreterMacroAssembler::inc_global_counter(address address_of_counter, int offset, Register tmp1, Register tmp2, bool avoid_overflow) {
+  const intx addr = (intx) (address_of_counter + offset);
+
+  assert ((addr & 0x3) == 0, "address of counter should be aligned");
+  const intx offset_mask = right_n_bits(AARCH64_ONLY(12 + 2) NOT_AARCH64(12)); // low bits kept as the load/store offset (presumably the immediate range -- TODO confirm)
+
+  const address base = (address) (addr & ~offset_mask); // high part, materialized in a register
+  const int offs = (int) (addr & offset_mask);          // low part, used as the addressing offset
+
+  const Register addr_base = tmp1;
+  const Register val = tmp2;
+
+  mov_slow(addr_base, base);
+  ldr_s32(val, Address(addr_base, offs));
+
+  if (avoid_overflow) {
+    adds_32(val, val, 1);
+#ifdef AARCH64
+    Label L;
+    b(L, mi); // result went negative: skip the store, the counter keeps its old value
+    str_32(val, Address(addr_base, offs));
+    bind(L);
+#else
+    str(val, Address(addr_base, offs), pl); // store only if the result is not negative
+#endif // AARCH64
+  } else {
+    add_32(val, val, 1);
+    str_32(val, Address(addr_base, offs));
+  }
+}
+
+void InterpreterMacroAssembler::interp_verify_oop(Register reg, TosState state, const char *file, int line) {
+  if (state == atos) { MacroAssembler::_verify_oop(reg, "broken oop", file, line); } // oop verification is emitted for the atos state only
+}
+
+// Inline assembly for:
+//
+// if (thread is in interp_only_mode) {
+//   InterpreterRuntime::post_method_entry();
+// }
+// if (DTraceMethodProbes) {
+//   SharedRuntime::dtrace_method_entry(thread, method);
+// }
+// if (log_is_enabled(Trace, redefine, class, obsolete)) {
+//   SharedRuntime::rc_trace_method_entry(thread, method);
+// }
+
+void InterpreterMacroAssembler::notify_method_entry() {
+  // Whenever JVMTI is interp_only_mode, method entry/exit events are sent to
+  // track stack depth.  If it is possible to enter interp_only_mode we add
+  // the code to check if the event should be sent.
+  if (can_post_interpreter_events()) {
+    Label L;
+
+    ldr_s32(Rtemp, Address(Rthread, JavaThread::interp_only_mode_offset()));
+    cbz(Rtemp, L);
+
+    call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_method_entry));
+
+    bind(L);
+  }
+
+  // NOTE(review): this code is emitted only if DTraceMethodProbes is set at generation time, yet it still re-tests the flag at run time.
+  if (DTraceMethodProbes) {
+    Label Lcontinue;
+
+    ldrb_global(Rtemp, (address)&DTraceMethodProbes);
+    cbz(Rtemp, Lcontinue);
+
+    mov(R0, Rthread);
+    mov(R1, Rmethod);
+    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), R0, R1);
+
+    bind(Lcontinue);
+  }
+  // RedefineClasses() tracing support for obsolete method entry
+  if (log_is_enabled(Trace, redefine, class, obsolete)) {
+    mov(R0, Rthread);
+    mov(R1, Rmethod);
+    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
+                 R0, R1);
+  }
+}
+
+
+void InterpreterMacroAssembler::notify_method_exit(
+                 TosState state, NotifyMethodExitMode mode,
+                 bool native, Register result_lo, Register result_hi, FloatRegister result_fp) {  // the result_* registers are only referenced when native is true
+  // Whenever JVMTI is in interp_only_mode, method entry/exit events are sent to
+  // track stack depth.  If it is possible to enter interp_only_mode we add
+  // the code to check if the event should be sent.
+  if (mode == NotifyJVMTI && can_post_interpreter_events()) {
+    Label L;
+    // Note: frame::interpreter_frame_result has a dependency on how the
+    // method result is saved across the call to post_method_exit. If this
+    // is changed then the interpreter_frame_result implementation will
+    // need to be updated too.
+
+    ldr_s32(Rtemp, Address(Rthread, JavaThread::interp_only_mode_offset()));
+    cbz(Rtemp, L);  // interp_only_mode is zero: skip the post_method_exit call
+
+    if (native) {
+      // In the native case the TOS state is not known, so the result
+      // registers themselves are saved: pushed on the stack on 32-bit ARM.
+      // On AArch64 result registers are stored into the frame at known locations.
+      // See frame::interpreter_frame_result for code that gets the result values from here.
+      assert(result_lo != noreg, "result registers should be defined");
+
+#ifdef AARCH64
+      assert(result_hi == noreg, "result_hi is not used on AArch64");
+      assert(result_fp != fnoreg, "FP result register must be defined");
+
+      str_d(result_fp, Address(FP, frame::interpreter_frame_fp_saved_result_offset * wordSize));  // FP result -> frame slot
+      str(result_lo, Address(FP, frame::interpreter_frame_gp_saved_result_offset * wordSize));  // GP result -> frame slot
+#else
+      assert(result_hi != noreg, "result registers should be defined");
+
+#ifdef __ABI_HARD__
+      assert(result_fp != fnoreg, "FP result register must be defined");
+      sub(SP, SP, 2 * wordSize);  // reserve two words on the stack for the FP result
+      fstd(result_fp, Address(SP));  // FP result is stored first, so it ends up above the GP pair pushed next
+#endif // __ABI_HARD__
+
+      push(RegisterSet(result_lo) | RegisterSet(result_hi));
+#endif // AARCH64
+
+      call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_method_exit));
+
+#ifdef AARCH64
+      ldr_d(result_fp, Address(FP, frame::interpreter_frame_fp_saved_result_offset * wordSize));  // reload FP result from the frame
+      ldr(result_lo, Address(FP, frame::interpreter_frame_gp_saved_result_offset * wordSize));  // reload GP result from the frame
+#else
+      pop(RegisterSet(result_lo) | RegisterSet(result_hi));  // restore in the reverse order of the saves above
+#ifdef __ABI_HARD__
+      fldd(result_fp, Address(SP));
+      add(SP, SP, 2 * wordSize);  // release the two words reserved for the FP result
+#endif // __ABI_HARD__
+#endif // AARCH64
+
+    } else {
+      // For the template interpreter, the value on tos is the size of the
+      // state. (c++ interpreter calls jvmti somewhere else).
+      push(state);  // save the TOS-cached value across the VM call
+      call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_method_exit));
+      pop(state);
+    }
+
+    bind(L);
+  }
+
+  // Note: the DTrace sequence (including its runtime flag re-check) is emitted only when DTraceMethodProbes is set at code-generation time, so there is no per-exit overhead otherwise
+  if (DTraceMethodProbes) {
+    Label Lcontinue;
+
+    ldrb_global(Rtemp, (address)&DTraceMethodProbes);  // re-read the flag at run time
+    cbz(Rtemp, Lcontinue);  // flag is zero now: skip the probe call
+
+    push(state);  // save the TOS-cached value across the leaf call
+
+    mov(R0, Rthread);  // first argument: current thread
+    mov(R1, Rmethod);  // second argument: current method
+
+    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), R0, R1);
+
+    pop(state);
+
+    bind(Lcontinue);
+  }
+}
+
+
+#ifndef PRODUCT
+
+void InterpreterMacroAssembler::trace_state(const char* msg) {  // emits code that printf()s msg together with the current FP and SP
+  int push_size = save_caller_save_registers();  // size returned here is used below to recover the pre-save SP
+
+  Label Lcontinue;
+  InlinedString Lmsg0("%s: FP=" INTPTR_FORMAT ", SP=" INTPTR_FORMAT "\n");  // printf format string
+  InlinedString Lmsg(msg);
+  InlinedAddress Lprintf((address)printf);
+
+  ldr_literal(R0, Lmsg0);  // R0: format string
+  ldr_literal(R1, Lmsg);  // R1: msg, consumed by %s
+  mov(R2, FP);  // R2: FP value
+  add(R3, SP, push_size);  // original SP (without saved registers)
+  ldr_literal(Rtemp, Lprintf);
+  call(Rtemp);
+
+  b(Lcontinue);  // jump over the literals bound next
+
+  bind_literal(Lmsg0);
+  bind_literal(Lmsg);
+  bind_literal(Lprintf);
+
+
+  bind(Lcontinue);
+
+  restore_caller_save_registers();
+}
+
+#endif
+
+// Jump if ((*counter_addr += increment) & mask) satisfies the condition.
+void InterpreterMacroAssembler::increment_mask_and_jump(Address counter_addr,
+                                                        int increment, Address mask_addr,
+                                                        Register scratch, Register scratch2,
+                                                        AsmCondition cond, Label* where) {  // where must be non-null: it is dereferenced unconditionally below
+  // caution: scratch2 and base address of counter_addr can be the same, so the counter is stored back before scratch2 is written
+  assert_different_registers(scratch, scratch2);
+  ldr_u32(scratch, counter_addr);  // load the 32-bit counter
+  add(scratch, scratch, increment);
+  str_32(scratch, counter_addr);  // write the bumped counter back
+
+#ifdef AARCH64
+  ldr_u32(scratch2, mask_addr);
+  ands_w(ZR, scratch, scratch2);  // destination is ZR, so scratch still holds the bumped counter
+#else
+  ldr(scratch2, mask_addr);
+  andrs(scratch, scratch, scratch2);  // here scratch is overwritten with the masked value
+#endif // AARCH64
+  b(*where, cond);  // conditional branch on the result of the masking operation above
+}
+
+void InterpreterMacroAssembler::get_method_counters(Register method,
+                                                    Register Rcounters,
+                                                    Label& skip) {  // loads method's MethodCounters into Rcounters, asking the VM to build them if absent; branches to skip if they are still absent afterwards
+  const Address method_counters(method, Method::method_counters_offset());
+  Label has_counters;
+
+  ldr(Rcounters, method_counters);
+  cbnz(Rcounters, has_counters);  // fast path: counters already exist
+
+#ifdef AARCH64
+  const Register tmp = Rcounters;  // Rcounters is reloaded after the call, so it is reused as a temporary here
+  const int saved_regs_size = 20*wordSize;  // 10 register pairs, matching the stp/ldp sequences in this function
+
+  // Note: call_VM will cut SP according to Rstack_top value before call, and restore SP to
+  // extended_sp value from frame after the call.
+  // So make sure there is enough stack space to save registers and adjust Rstack_top accordingly.
+  {
+    Label enough_stack_space;
+    check_extended_sp(tmp);
+    sub(Rstack_top, Rstack_top, saved_regs_size);  // reserve the save area by lowering Rstack_top
+    cmp(SP, Rstack_top);
+    b(enough_stack_space, ls);  // taken: SP needs no adjustment
+
+    align_reg(tmp, Rstack_top, StackAlignmentInBytes);  // otherwise compute an aligned SP from the lowered Rstack_top
+    mov(SP, tmp);
+    str(tmp, Address(FP, frame::interpreter_frame_extended_sp_offset * wordSize));  // record the new SP as the frame's extended SP
+
+    bind(enough_stack_space);
+    check_stack_top();
+
+    int offset = 0;
+    stp(R0,  R1,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R2,  R3,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R4,  R5,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R6,  R7,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R8,  R9,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R10, R11, Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R12, R13, Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R14, R15, Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R16, R17, Address(Rstack_top, offset)); offset += 2*wordSize;
+    stp(R18, LR,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    assert (offset == saved_regs_size, "should be");
+  }
+#else
+  push(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(R14));  // 32-bit ARM: save registers on the stack across the VM call
+#endif // AARCH64
+
+  mov(R1, method);  // method is the argument passed to build_method_counters
+  call_VM(noreg, CAST_FROM_FN_PTR(address,
+          InterpreterRuntime::build_method_counters), R1);
+
+#ifdef AARCH64
+  {
+    int offset = 0;
+    ldp(R0,  R1,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R2,  R3,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R4,  R5,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R6,  R7,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R8,  R9,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R10, R11, Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R12, R13, Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R14, R15, Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R16, R17, Address(Rstack_top, offset)); offset += 2*wordSize;
+    ldp(R18, LR,  Address(Rstack_top, offset)); offset += 2*wordSize;
+    assert (offset == saved_regs_size, "should be");
+
+    add(Rstack_top, Rstack_top, saved_regs_size);  // release the save area reserved before the call
+  }
+#else
+  pop(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(R14));  // restore the same register set that was pushed before the call
+#endif // AARCH64
+
+  ldr(Rcounters, method_counters);  // reload: the VM call may have installed the counters
+  cbz(Rcounters, skip); // No MethodCounters created, OutOfMemory
+
+  bind(has_counters);
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/interp_masm_arm.hpp	2016-12-02 11:21:04.674974248 -0500
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_INTERP_MASM_ARM_HPP
+#define CPU_ARM_VM_INTERP_MASM_ARM_HPP
+
+#include "asm/macroAssembler.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "interpreter/invocationCounter.hpp"
+#include "runtime/frame.hpp"
+#include "prims/jvmtiExport.hpp"
+
+// This file specializes the assembler with interpreter-specific macros
+
+
+class InterpreterMacroAssembler: public MacroAssembler {  // MacroAssembler extended with interpreter-specific code-generation helpers
+
+ public:
+
+  // allow JvmtiExport checks to be extended
+  bool can_force_early_return()       { return JvmtiExport::can_force_early_return(); }
+  bool can_post_interpreter_events()  { return JvmtiExport::can_post_interpreter_events(); }
+  bool can_pop_frame()                { return JvmtiExport::can_pop_frame(); }
+  bool can_post_breakpoint()          { return JvmtiExport::can_post_breakpoint(); }
+  bool can_post_field_access()        { return JvmtiExport::can_post_field_access(); }
+  bool can_post_field_modification()  { return JvmtiExport::can_post_field_modification(); }
+  // flags controlled by JVMTI settings
+  bool rewrite_frequent_pairs()       { return RewriteFrequentPairs; }
+
+ protected:
+
+  // Template interpreter specific version of call_VM_helper
+  virtual void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions);
+
+  virtual void check_and_handle_popframe();
+  virtual void check_and_handle_earlyret();
+
+  // base routine for all dispatches
+  typedef enum { DispatchDefault, DispatchNormal } DispatchTableMode;
+  void dispatch_base(TosState state, DispatchTableMode table_mode, bool verifyoop = true);
+
+ public:
+  InterpreterMacroAssembler(CodeBuffer* code);
+
+  // Interpreter-specific registers: debug checks (real code only for AARCH64 && ASSERT, empty inline stubs otherwise) and save/restore helpers
+#if defined(AARCH64) && defined(ASSERT)
+
+#define check_stack_top()               _check_stack_top("invalid Rstack_top at " __FILE__ ":" XSTR(__LINE__))
+#define check_stack_top_on_expansion()  _check_stack_top("invalid Rstack_top at " __FILE__ ":" XSTR(__LINE__), VerifyInterpreterStackTop)
+#define check_extended_sp(tmp)          _check_extended_sp(tmp, "SP does not match extended SP in frame at " __FILE__ ":" XSTR(__LINE__))
+#define check_no_cached_stack_top(tmp)  _check_no_cached_stack_top(tmp, "stack_top is already cached in frame at " __FILE__ ":" XSTR(__LINE__))
+
+  void _check_stack_top(const char* msg, bool enabled = true) {  // emits a stop(msg) that is skipped when the SP/Rstack_top comparison satisfies 'ls'
+      if (enabled) {
+          Label L;
+          cmp(SP, Rstack_top);
+          b(L, ls);
+          stop(msg);
+          bind(L);
+      }
+  }
+
+  void _check_extended_sp(Register tmp, const char* msg) {  // emits a stop(msg) that is skipped when SP equals the frame's extended SP slot; blows tmp
+      Label L;
+      ldr(tmp, Address(FP, frame::interpreter_frame_extended_sp_offset * wordSize));
+      cmp(SP, tmp);
+      b(L, eq);
+      stop(msg);
+      bind(L);
+  }
+
+  void _check_no_cached_stack_top(Register tmp, const char* msg) {  // emits a stop(msg) that is skipped when the frame's stack_top slot is zero; blows tmp
+      Label L;
+      ldr(tmp, Address(FP, frame::interpreter_frame_stack_top_offset * wordSize));
+      cbz(tmp, L);
+      stop(msg);
+      bind(L);
+  }
+
+#else
+
+  inline void check_stack_top() {}
+  inline void check_stack_top_on_expansion() {}
+  inline void check_extended_sp(Register tmp) {}
+  inline void check_no_cached_stack_top(Register tmp) {}
+
+#endif // AARCH64 && ASSERT
+
+  void save_bcp()                                          { str(Rbcp, Address(FP, frame::interpreter_frame_bcp_offset * wordSize)); }
+  void restore_bcp()                                       { ldr(Rbcp, Address(FP, frame::interpreter_frame_bcp_offset * wordSize)); }
+  void restore_locals()                                    { ldr(Rlocals, Address(FP, frame::interpreter_frame_locals_offset * wordSize)); }
+  void restore_method()                                    { ldr(Rmethod, Address(FP, frame::interpreter_frame_method_offset * wordSize)); }
+  void restore_dispatch();
+
+#ifdef AARCH64
+  void save_stack_top()                                    { check_stack_top(); str(Rstack_top, Address(FP, frame::interpreter_frame_stack_top_offset * wordSize)); }
+  void clear_cached_stack_top()                            { str(ZR, Address(FP, frame::interpreter_frame_stack_top_offset * wordSize)); }
+  void restore_stack_top()                                 { ldr(Rstack_top, Address(FP, frame::interpreter_frame_stack_top_offset * wordSize)); clear_cached_stack_top(); check_stack_top(); }
+  void cut_sp_before_call()                                { align_reg(SP, Rstack_top, StackAlignmentInBytes); }
+  void restore_sp_after_call(Register tmp)                 { ldr(tmp, Address(FP, frame::interpreter_frame_extended_sp_offset * wordSize)); mov(SP, tmp); }
+#endif
+
+  // Helpers for runtime call arguments/results
+  void get_const(Register reg)                             { ldr(reg, Address(Rmethod, Method::const_offset())); }
+  void get_constant_pool(Register reg)                     { get_const(reg); ldr(reg, Address(reg, ConstMethod::constants_offset())); }
+  void get_constant_pool_cache(Register reg)               { get_constant_pool(reg); ldr(reg, Address(reg, ConstantPool::cache_offset_in_bytes())); }
+  void get_cpool_and_tags(Register cpool, Register tags)   { get_constant_pool(cpool); ldr(tags, Address(cpool, ConstantPool::tags_offset_in_bytes())); }
+
+  // Sets reg. Blows Rtemp.
+  void get_unsigned_2_byte_index_at_bcp(Register reg, int bcp_offset);
+
+  // Sets index. Blows reg_tmp.
+  void get_index_at_bcp(Register index, int bcp_offset, Register reg_tmp, size_t index_size = sizeof(u2));
+  // Sets cache, index.
+  void get_cache_and_index_at_bcp(Register cache, Register index, int bcp_offset, size_t index_size = sizeof(u2));
+  void get_cache_and_index_and_bytecode_at_bcp(Register cache, Register index, Register bytecode, int byte_no, int bcp_offset, size_t index_size = sizeof(u2));
+  // Sets cache. Blows reg_tmp.
+  void get_cache_entry_pointer_at_bcp(Register cache, Register reg_tmp, int bcp_offset, size_t index_size = sizeof(u2));
+
+  // Load object from cpool->resolved_references(*bcp+1)
+  void load_resolved_reference_at_index(Register result, Register tmp);
+
+  void store_check_part1(Register card_table_base);                // Sets card_table_base register.
+  void store_check_part2(Register obj, Register card_table_base, Register tmp);
+
+  void set_card(Register card_table_base, Address card_table_addr, Register tmp);
+
+#if INCLUDE_ALL_GCS
+  // G1 pre-barrier.
+  // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
+  // If store_addr != noreg, then previous value is loaded from [store_addr];
+  // in such case store_addr and new_val registers are preserved;
+  // otherwise pre_val register is preserved.
+  void g1_write_barrier_pre(Register store_addr,
+                            Register new_val,
+                            Register pre_val,
+                            Register tmp1,
+                            Register tmp2);
+
+  // G1 post-barrier.
+  // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
+  void g1_write_barrier_post(Register store_addr,
+                             Register new_val,
+                             Register tmp1,
+                             Register tmp2,
+                             Register tmp3);
+#endif // INCLUDE_ALL_GCS
+
+  void pop_ptr(Register r);
+  void pop_i(Register r = R0_tos);
+#ifdef AARCH64
+  void pop_l(Register r = R0_tos);
+#else
+  void pop_l(Register lo = R0_tos_lo, Register hi = R1_tos_hi);
+#endif
+  void pop_f(FloatRegister fd);
+  void pop_d(FloatRegister fd);
+
+  void push_ptr(Register r);
+  void push_i(Register r = R0_tos);
+#ifdef AARCH64
+  void push_l(Register r = R0_tos);
+#else
+  void push_l(Register lo = R0_tos_lo, Register hi = R1_tos_hi);
+#endif
+  void push_f();
+  void push_d();
+
+  // Transition vtos -> state. Blows R0, R1. Sets TOS cached value.
+  void pop(TosState state);
+  // Transition state -> vtos. Blows Rtemp.
+  void push(TosState state);
+
+#ifndef AARCH64
+  // The following methods are overridden to allow overloaded calls to
+  //   MacroAssembler::push/pop(Register)
+  //   MacroAssembler::push/pop(RegisterSet)
+  //   InterpreterMacroAssembler::push/pop(TosState)
+  void push(Register rd, AsmCondition cond = al)         { MacroAssembler::push(rd, cond);      }
+  void pop(Register rd, AsmCondition cond = al)          { MacroAssembler::pop(rd, cond);       }
+
+  void push(RegisterSet reg_set, AsmCondition cond = al) { MacroAssembler::push(reg_set, cond); }
+  void pop(RegisterSet reg_set, AsmCondition cond = al)  { MacroAssembler::pop(reg_set, cond);  }
+
+  // Converts return value in R0/R1 (interpreter calling conventions) to TOS cached value.
+  void convert_retval_to_tos(TosState state);
+  // Converts TOS cached value to return value in R0/R1 (according to interpreter calling conventions).
+  void convert_tos_to_retval(TosState state);
+#endif
+
+  // JVMTI ForceEarlyReturn support
+  void load_earlyret_value(TosState state);
+
+  void jump_to_entry(address entry);
+
+  // Resets Rstack_top to the frame's monitor block top. Blows Rtemp.
+  void empty_expression_stack() {
+      ldr(Rstack_top, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+      check_stack_top();
+#ifdef AARCH64
+      clear_cached_stack_top();
+#else
+      // NULL last_sp until next java call
+      str(zero_register(Rtemp), Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+#endif // AARCH64
+  }
+
+  // Helpers for swap and dup
+  void load_ptr(int n, Register val);
+  void store_ptr(int n, Register val);
+
+  // Generate a subtype check: branch to not_subtype if sub_klass is
+  // not a subtype of super_klass.
+  // Profiling code for the subtype check failure (profile_typecheck_failed)
+  // should be explicitly generated by the caller in the not_subtype case.
+  // Blows Rtemp, tmp1, tmp2.
+  void gen_subtype_check(Register Rsub_klass, Register Rsuper_klass,
+                         Label &not_subtype, Register tmp1, Register tmp2);
+
+  // Dispatching
+  void dispatch_prolog(TosState state, int step = 0);
+  void dispatch_epilog(TosState state, int step = 0);
+  void dispatch_only(TosState state);                      // dispatch by R3_bytecode
+  void dispatch_only_normal(TosState state);               // dispatch normal table by R3_bytecode
+  void dispatch_only_noverify(TosState state);
+  void dispatch_next(TosState state, int step = 0);        // load R3_bytecode from [Rbcp + step] and dispatch by R3_bytecode
+
+  // jump to an invoked target
+  void prepare_to_jump_from_interpreted();
+  void jump_from_interpreted(Register method);
+
+  void narrow(Register result);
+
+  // Returning from interpreted functions
+  //
+  // Removes the current activation (incl. unlocking of monitors)
+  // and sets up the return address.  This code is also used for
+  // exception unwinding. In that case, we do not want to throw
+  // IllegalMonitorStateExceptions, since that might get us into an
+  // infinite rethrow exception loop.
+  // Additionally this code is used for popFrame and earlyReturn.
+  // In popFrame case we want to skip throwing an exception,
+  // installing an exception, and notifying jvmdi.
+  // In earlyReturn case we only want to skip throwing an exception
+  // and installing an exception.
+  void remove_activation(TosState state, Register ret_addr,
+                         bool throw_monitor_exception = true,
+                         bool install_monitor_exception = true,
+                         bool notify_jvmdi = true);
+
+  // At certain points in the method invocation the monitor of
+  // synchronized methods hasn't been entered yet.
+  // To correctly handle exceptions at these points, we set the thread local
+  // variable _do_not_unlock_if_synchronized to true. The remove_activation will
+  // check this flag.
+  void set_do_not_unlock_if_synchronized(bool flag, Register tmp);
+
+  // Debugging
+  void interp_verify_oop(Register reg, TosState state, const char* file, int line);    // only if +VerifyOops && state == atos
+
+  void verify_FPU(int stack_depth, TosState state = ftos) {
+    // No VFP state verification is required for ARM
+  }
+
+  // Object locking
+  void lock_object  (Register lock_reg);
+  void unlock_object(Register lock_reg);
+
+  // Interpreter profiling operations
+  void set_method_data_pointer_for_bcp(); // Blows R0-R3/R0-R18, Rtemp, LR
+  void test_method_data_pointer(Register mdp, Label& zero_continue);
+  void verify_method_data_pointer();
+
+  void set_mdp_data_at(Register mdp_in, int offset, Register value);
+
+  // Increments mdp data. Sets bumped_count register to adjusted counter.
+  void increment_mdp_data_at(Address data, Register bumped_count, bool decrement = false);
+  // Increments mdp data. Sets bumped_count register to adjusted counter.
+  void increment_mdp_data_at(Register mdp_in, int offset, Register bumped_count, bool decrement = false);
+  void increment_mask_and_jump(Address counter_addr,
+                               int increment, Address mask_addr,
+                               Register scratch, Register scratch2,
+                               AsmCondition cond, Label* where);  // Jump if ((*counter_addr += increment) & mask) satisfies cond. Blows scratch, scratch2.
+  void set_mdp_flag_at(Register mdp_in, int flag_constant);
+
+  void test_mdp_data_at(Register mdp_in, int offset, Register value,
+                        Register test_value_out,
+                        Label& not_equal_continue);
+
+  void record_klass_in_profile(Register receiver, Register mdp,
+                               Register reg_tmp, bool is_virtual_call);
+  void record_klass_in_profile_helper(Register receiver, Register mdp,
+                                      Register reg_tmp,
+                                      int start_row, Label& done, bool is_virtual_call);
+
+  void update_mdp_by_offset(Register mdp_in, int offset_of_offset, Register reg_tmp);
+  void update_mdp_by_offset(Register mdp_in, Register reg_offset, Register reg_tmp);
+  void update_mdp_by_constant(Register mdp_in, int constant);
+  void update_mdp_for_ret(Register return_bci);                   // Blows R0-R3/R0-R18, Rtemp, LR
+
+  void profile_taken_branch(Register mdp, Register bumped_count); // Sets mdp, bumped_count registers, blows Rtemp.
+  void profile_not_taken_branch(Register mdp);                    // Sets mdp, blows Rtemp.
+
+  void profile_call(Register mdp);                                // Sets mdp, blows Rtemp.
+  void profile_final_call(Register mdp);                          // Sets mdp, blows Rtemp.
+  void profile_virtual_call(Register mdp, Register receiver,      // Sets mdp, blows Rtemp.
+                            bool receiver_can_be_null = false);
+  void profile_ret(Register mdp, Register return_bci);            // Sets mdp, blows R0-R3/R0-R18, Rtemp, LR
+  void profile_null_seen(Register mdp);                           // Sets mdp.
+  void profile_typecheck(Register mdp, Register klass);           // Sets mdp, blows Rtemp.
+
+  void profile_typecheck_failed(Register mdp);                    // Sets mdp, blows Rtemp.
+  void profile_switch_default(Register mdp);                      // Sets mdp, blows Rtemp.
+
+  // Sets mdp. Blows reg_tmp1, reg_tmp2. Index could be the same as reg_tmp2.
+  void profile_switch_case(Register mdp, Register index, Register reg_tmp1, Register reg_tmp2);
+
+  void byteswap_u32(Register r, Register rtmp1, Register rtmp2);
+
+  void inc_global_counter(address address_of_counter, int offset_in_bytes, Register tmp1, Register tmp2, bool avoid_overflow);
+
+  typedef enum { NotifyJVMTI, SkipNotifyJVMTI } NotifyMethodExitMode;
+
+  // support for jvmti (method entry/exit notification; the exit variant only posts the JVMTI event in NotifyJVMTI mode)
+  void notify_method_entry();
+  void notify_method_exit(TosState state, NotifyMethodExitMode mode,
+                          bool native = false, Register result_lo = noreg, Register result_hi = noreg, FloatRegister result_fp = fnoreg);
+
+  void trace_state(const char* msg) PRODUCT_RETURN;  // debug aid: emits a printf of msg together with FP and SP
+
+  void get_method_counters(Register method, Register Rcounters, Label& skip);  // Sets Rcounters; branches to skip if no MethodCounters could be created
+};
+
+#endif // CPU_ARM_VM_INTERP_MASM_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/interpreterRT_arm.cpp	2016-12-02 11:21:09.991275732 -0500
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "memory/allocation.inline.hpp"
+#include "memory/universe.inline.hpp"
+#include "oops/method.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/handles.inline.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/signature.hpp"
+
+#define __ _masm->
+
+#ifdef SHARING_FAST_NATIVE_FINGERPRINTS
+// Maps each SignatureIterator parameter kind (index = kind - bool_parm) to the shared kind used for parsing
+static const u1 shared_type[] = {
+  (u1) SignatureIterator::int_parm, // bool
+  (u1) SignatureIterator::int_parm, // byte
+  (u1) SignatureIterator::int_parm, // char
+  (u1) SignatureIterator::int_parm, // short
+  (u1) SignatureIterator::int_parm, // int
+  (u1) SignatureIterator::long_parm, // long
+#ifndef __ABI_HARD__
+  (u1) SignatureIterator::int_parm, // float, passed as int
+  (u1) SignatureIterator::long_parm, // double, passed as long
+#else
+  (u1) SignatureIterator::float_parm, // float, kept distinct when __ABI_HARD__ is defined
+  (u1) SignatureIterator::double_parm, // double, kept distinct when __ABI_HARD__ is defined
+#endif
+  (u1) SignatureIterator::obj_parm, // obj
+  (u1) SignatureIterator::done_parm // done
+};
+
+uint64_t InterpreterRuntime::normalize_fast_native_fingerprint(uint64_t fingerprint) {  // rebuilds the fingerprint field by field with return and parameter kinds collapsed
+  if (fingerprint == UCONST64(-1)) {
+    // special fingerprint used when the argument list cannot be encoded in a 64-bit value; returned unchanged
+    return fingerprint;
+  }
+  int shift = SignatureIterator::static_feature_size;
+  uint64_t result = fingerprint & ((1 << shift) - 1);  // copy the low static_feature_size bits unchanged
+  fingerprint >>= shift;
+
+  BasicType ret_type = (BasicType) (fingerprint & SignatureIterator::result_feature_mask);
+  // For ARM, the fast signature handler only needs to know whether
+  // the return value must be unboxed. T_OBJECT and T_ARRAY need not
+  // be distinguished from each other and all other return values
+  // behave like integers with respect to the handler.
+  bool unbox = (ret_type == T_OBJECT) || (ret_type == T_ARRAY);
+  if (unbox) {
+    ret_type = T_OBJECT;
+  } else {
+    ret_type = T_INT;
+  }
+  result |= ((uint64_t) ret_type) << shift;  // 'shift' is the bit position of the next field written to 'result'
+  shift += SignatureIterator::result_feature_size;
+  fingerprint >>= SignatureIterator::result_feature_size;
+
+  while (true) {  // consumes one parameter field per iteration; returns when done_parm is read
+    uint32_t type = (uint32_t) (fingerprint & SignatureIterator::parameter_feature_mask);
+    if (type == SignatureIterator::done_parm) {
+      result |= ((uint64_t) SignatureIterator::done_parm) << shift;  // terminate the rebuilt parameter list
+      return result;
+    }
+    assert((type >= SignatureIterator::bool_parm) && (type <= SignatureIterator::obj_parm), "check fingerprint encoding");
+    int shared = shared_type[type - SignatureIterator::bool_parm];  // substitute the shared kind from the table
+    result |= ((uint64_t) shared) << shift;
+    shift += SignatureIterator::parameter_feature_size;
+    fingerprint >>= SignatureIterator::parameter_feature_size;
+  }
+}
+#endif // SHARING_FAST_NATIVE_FINGERPRINTS
+
+// Implementation of SignatureHandlerGenerator
+void InterpreterRuntime::SignatureHandlerGenerator::pass_int() {  // emits code copying the current local to register _ireg, or to the SP-relative slot _abi_offset
+  if (_ireg < GPR_PARAMS) {  // a general-purpose register is still free
+    Register dst = as_Register(_ireg);
+    __ ldr_s32(dst, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+    _ireg++;
+  } else {  // registers used up: go through Rtemp to the stack
+    __ ldr_s32(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+    __ str_32(Rtemp, Address(SP, _abi_offset * wordSize));
+    _abi_offset++;
+  }
+}
+
+void InterpreterRuntime::SignatureHandlerGenerator::pass_long() {  // emits code passing a long held in locals offset() and offset()+1
+#ifdef AARCH64
+  if (_ireg < GPR_PARAMS) {  // one register holds the whole value; it is read at local offset()+1
+    Register dst = as_Register(_ireg);
+    __ ldr(dst, Address(Rlocals, Interpreter::local_offset_in_bytes(offset() + 1)));
+    _ireg++;
+  } else {
+    __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset() + 1)));
+    __ str(Rtemp, Address(SP, _abi_offset * wordSize));
+    _abi_offset++;
+  }
+#else
+  if (_ireg <= 2) {  // two consecutive registers are available
+#if (ALIGN_WIDE_ARGUMENTS == 1)
+    if ((_ireg & 1) != 0) {
+      // 64-bit values should be 8-byte aligned: skip the odd register so the pair starts on an even one
+      _ireg++;
+    }
+#endif
+    Register dst1 = as_Register(_ireg);
+    Register dst2 = as_Register(_ireg+1);
+    __ ldr(dst1, Address(Rlocals, Interpreter::local_offset_in_bytes(offset()+1)));  // first register <- local offset()+1
+    __ ldr(dst2, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));  // second register <- local offset()
+    _ireg += 2;
+#if (ALIGN_WIDE_ARGUMENTS == 0)
+  } else if (_ireg == 3) {
+    // uses R3 + one stack slot (split only allowed when wide arguments are not aligned)
+    Register dst1 = as_Register(_ireg);
+    __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+    __ ldr(dst1, Address(Rlocals, Interpreter::local_offset_in_bytes(offset()+1)));
+    __ str(Rtemp, Address(SP, _abi_offset * wordSize));
+    _ireg += 1;
+    _abi_offset += 1;
+#endif
+  } else {  // both words go to the stack
+#if (ALIGN_WIDE_ARGUMENTS == 1)
+    if(_abi_offset & 1) _abi_offset++;  // round the slot index up to an even value
+#endif
+    __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset()+1)));
+    __ str(Rtemp, Address(SP, (_abi_offset) * wordSize));
+    __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+    __ str(Rtemp, Address(SP, (_abi_offset+1) * wordSize));
+    _abi_offset += 2;
+    _ireg = 4;  // makes the later "_ireg < 4" / "_ireg <= 2" register tests fail from now on
+  }
+#endif // AARCH64
+}
+
+void InterpreterRuntime::SignatureHandlerGenerator::pass_object() {  // emits code passing 0 if the local holds 0, otherwise the address of the local itself
+#ifdef AARCH64
+  __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+  __ cmp(Rtemp, 0);  // sets the condition consumed by csel below
+  __ sub(Rtemp, Rlocals, -Interpreter::local_offset_in_bytes(offset()));  // Rtemp = Rlocals + local offset, i.e. the slot address
+  if (_ireg < GPR_PARAMS) {
+    Register dst = as_Register(_ireg);
+    __ csel(dst, ZR, Rtemp, eq);  // eq (value was 0) selects ZR, otherwise the slot address
+    _ireg++;
+  } else {
+    __ csel(Rtemp, ZR, Rtemp, eq);
+    __ str(Rtemp, Address(SP, _abi_offset * wordSize));
+    _abi_offset++;
+  }
+#else
+  if (_ireg < 4) {
+    Register dst = as_Register(_ireg);
+    __ ldr(dst, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+    __ cmp(dst, 0);
+    __ sub(dst, Rlocals, -Interpreter::local_offset_in_bytes(offset()), ne);  // conditional on ne: a zero value is left in dst as is
+    _ireg++;
+  } else {
+    __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+    __ cmp(Rtemp, 0);
+    __ sub(Rtemp, Rlocals, -Interpreter::local_offset_in_bytes(offset()), ne);  // same selection as above, via Rtemp
+    __ str(Rtemp, Address(SP, _abi_offset * wordSize));
+    _abi_offset++;
+  }
+#endif // AARCH64
+}
+
+#ifndef __ABI_HARD__
+void InterpreterRuntime::SignatureHandlerGenerator::pass_float() {  // variant built when __ABI_HARD__ is not defined: the float travels in a GPR or on the stack
+  if (_ireg < 4) {
+    Register dst = as_Register(_ireg);
+    __ ldr(dst, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+    _ireg++;
+  } else {  // no register left: copy through Rtemp to the SP-relative slot
+    __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+    __ str(Rtemp, Address(SP, _abi_offset * wordSize));
+    _abi_offset++;
+  }
+}
+
+#else
+#ifndef __SOFTFP__
+void InterpreterRuntime::SignatureHandlerGenerator::pass_float() {  // variant built when __ABI_HARD__ is defined: the float goes to an FP register while one is free
+#ifdef AARCH64
+    if (_freg < FPR_PARAMS) {
+      FloatRegister dst = as_FloatRegister(_freg);
+      __ ldr_s(dst, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+      _freg++;
+    } else {  // FP registers used up: copy the 32-bit pattern to the stack
+      __ ldr_u32(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+      __ str_32(Rtemp, Address(SP, _abi_offset * wordSize));
+      _abi_offset++;
+    }
+#else
+    if((_fp_slot < 16) || (_single_fpr_slot & 1)) {  // a fresh pair is free, or the odd half of an earlier pair is still unused
+      if ((_single_fpr_slot & 1) == 0) {  // no leftover half: claim a new pair and use its first single register
+        _single_fpr_slot = _fp_slot;
+        _fp_slot += 2;
+      }
+      __ flds(as_FloatRegister(_single_fpr_slot), Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+      _single_fpr_slot++;  // an odd value now marks the second half of the pair as available for the next float
+    } else {
+      __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+      __ str(Rtemp, Address(SP, _abi_offset * wordSize));
+      _abi_offset++;
+    }
+#endif // AARCH64
+}
+
+void InterpreterRuntime::SignatureHandlerGenerator::pass_double() {  // emits code passing a double held in locals offset() and offset()+1
+#ifdef AARCH64
+    if (_freg < FPR_PARAMS) {
+      FloatRegister dst = as_FloatRegister(_freg);
+      __ ldr_d(dst, Address(Rlocals, Interpreter::local_offset_in_bytes(offset() + 1)));
+      _freg++;
+    } else {  // FP registers used up: one stack slot holds the whole value
+      __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset() + 1)));
+      __ str(Rtemp, Address(SP, _abi_offset * wordSize));
+      _abi_offset++;
+    }
+#else
+    if(_fp_slot <= 14) {  // a pair of single-precision slots is still free
+      __ fldd(as_FloatRegister(_fp_slot), Address(Rlocals, Interpreter::local_offset_in_bytes(offset()+1)));
+      _fp_slot += 2;
+    } else {  // spill both words to the stack (unlike pass_long, _abi_offset is not rounded up here)
+      __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset()+1)));
+      __ str(Rtemp, Address(SP, (_abi_offset) * wordSize));
+      __ ldr(Rtemp, Address(Rlocals, Interpreter::local_offset_in_bytes(offset())));
+      __ str(Rtemp, Address(SP, (_abi_offset+1) * wordSize));
+      _abi_offset += 2;
+      _single_fpr_slot = 16;  // even value: drops any leftover half-pair so pass_float stops back-filling
+    }
+#endif // AARCH64
+}
+#endif // __SOFTFP__
+#endif // __ABI_HARD__
+
+void InterpreterRuntime::SignatureHandlerGenerator::generate(uint64_t fingerprint) {  // emits the argument-passing code, then loads the result handler into R0 and returns
+  iterate(fingerprint);  // drives the pass_* emitters above for the encoded signature
+
+  BasicType result_type = SignatureIterator::return_type(fingerprint);
+
+  address result_handler = Interpreter::result_handler(result_type);
+
+#ifdef AARCH64
+  __ mov_slow(R0, (address)result_handler);
+#else
+  // On 32-bit ARM the result handler must not be a real code address, only 0 or -1 (0xffffffff).
+  // This ensures the signature handlers do not need symbolic information.
+  assert((result_handler == NULL)||(result_handler==(address)0xffffffff),"");
+  __ mov_slow(R0, (intptr_t)result_handler);
+#endif
+
+  __ ret();
+}
+
+
+// Implementation of SignatureHandlerLibrary
+
+void SignatureHandlerLibrary::pd_set_handler(address handler) {}  // intentionally empty: 'handler' is ignored in this port
+
+class SlowSignatureHandler: public NativeSignatureIterator {  // copies native-call arguments from interpreter locals (_from) into the buffer passed as 'to'
+ private:
+  address   _from;  // local currently being read; moved down by stackElementSize per slot consumed
+  intptr_t* _to;    // next stack-argument word to write
+
+#ifndef __ABI_HARD__
+  virtual void pass_int() {
+    *_to++ = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    _from -= Interpreter::stackElementSize;
+  }
+
+  virtual void pass_float() {  // same copy as pass_int: the 32-bit pattern is read as a jint
+    *_to++ = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    _from -= Interpreter::stackElementSize;
+  }
+
+  virtual void pass_long() {
+#if (ALIGN_WIDE_ARGUMENTS == 1)
+    if (((intptr_t)_to & 7) != 0) {
+      // 64-bit values should be 8-byte aligned
+      _to++;
+    }
+#endif
+    _to[0] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1));  // first output word <- local 1
+    _to[1] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(0));  // second output word <- local 0
+    _to += 2;
+    _from -= 2*Interpreter::stackElementSize;
+  }
+
+  virtual void pass_object() {  // writes 0 when the local holds 0, otherwise the address of the local
+    intptr_t from_addr = (intptr_t)(_from + Interpreter::local_offset_in_bytes(0));
+    *_to++ = (*(intptr_t*)from_addr == 0) ? (intptr_t)NULL : from_addr;
+    _from -= Interpreter::stackElementSize;
+   }
+
+#else
+
+  intptr_t* _toFP;  // FP register area: starts GPR_PARAMS words into 'to' (see constructor)
+  intptr_t* _toGP;  // GP register area: starts at 'to' (see constructor)
+  int       _last_gp;  // index of the next free entry in _toGP
+  int       _last_fp;  // index of the next free entry in _toFP
+#ifndef AARCH64
+  int       _last_single_fp;  // when odd, index of a single-precision entry left free by an earlier float
+#endif // !AARCH64
+
+  virtual void pass_int() {
+    if(_last_gp < GPR_PARAMS) {
+      _toGP[_last_gp++] = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    } else {
+      *_to++ = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    }
+    _from -= Interpreter::stackElementSize;
+  }
+
+  virtual void pass_long() {
+#ifdef AARCH64
+    if(_last_gp < GPR_PARAMS) {
+      _toGP[_last_gp++] = *(jlong *)(_from+Interpreter::local_offset_in_bytes(1));
+    } else {
+      *_to++ = *(jlong *)(_from+Interpreter::local_offset_in_bytes(1));
+    }
+#else
+    assert(ALIGN_WIDE_ARGUMENTS == 1, "ABI_HARD not supported with unaligned wide arguments");
+    if (_last_gp <= 2) {
+      if(_last_gp & 1) _last_gp++;  // start the pair on an even GP index
+      _toGP[_last_gp++] = *(jint *)(_from+Interpreter::local_offset_in_bytes(1));
+      _toGP[_last_gp++] = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    } else {
+      if (((intptr_t)_to & 7) != 0) {
+        // 64-bit values should be 8-byte aligned
+        _to++;
+      }
+      _to[0] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1));
+      _to[1] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(0));
+      _to += 2;
+      _last_gp = 4;  // later "_last_gp <= 2" tests fail, so following longs also go to the stack
+    }
+#endif // AARCH64
+    _from -= 2*Interpreter::stackElementSize;
+  }
+
+  virtual void pass_object() {  // NOTE(review): uses bare NULL where the soft-float variant writes (intptr_t)NULL
+    intptr_t from_addr = (intptr_t)(_from + Interpreter::local_offset_in_bytes(0));
+    if(_last_gp < GPR_PARAMS) {
+      _toGP[_last_gp++] = (*(intptr_t*)from_addr == 0) ? NULL : from_addr;
+    } else {
+      *_to++ = (*(intptr_t*)from_addr == 0) ? NULL : from_addr;
+    }
+    _from -= Interpreter::stackElementSize;
+  }
+
+  virtual void pass_float() {
+#ifdef AARCH64
+    if(_last_fp < FPR_PARAMS) {
+      _toFP[_last_fp++] = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    } else {
+      *_to++ = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    }
+#else
+    if((_last_fp < 16) || (_last_single_fp & 1)) {  // a fresh pair is free, or an odd entry is left over
+      if ((_last_single_fp & 1) == 0) {  // nothing left over: claim a new pair and use its first entry
+        _last_single_fp = _last_fp;
+        _last_fp += 2;
+      }
+
+      _toFP[_last_single_fp++] = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    } else {
+      *_to++ = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+    }
+#endif // AARCH64
+    _from -= Interpreter::stackElementSize;
+  }
+
+  virtual void pass_double() {
+#ifdef AARCH64
+    if(_last_fp < FPR_PARAMS) {
+      _toFP[_last_fp++] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1));
+    } else {
+      *_to++ = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1));
+    }
+#else
+    assert(ALIGN_WIDE_ARGUMENTS == 1, "ABI_HARD not supported with unaligned wide arguments");
+    if(_last_fp <= 14) {  // two consecutive FP entries are free
+      _toFP[_last_fp++] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1));
+      _toFP[_last_fp++] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(0));
+    } else {
+      if (((intptr_t)_to & 7) != 0) {      // 64-bit values should be 8-byte aligned
+        _to++;
+      }
+      _to[0] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1));
+      _to[1] = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(0));
+      _to += 2;
+      _last_single_fp = 16;  // even value: pass_float no longer back-fills a leftover entry
+    }
+#endif // AARCH64
+    _from -= 2*Interpreter::stackElementSize;
+  }
+
+#endif // !__ABI_HARD__
+
+ public:
+  SlowSignatureHandler(methodHandle method, address from, intptr_t* to) :  // 'from': first local to read; 'to': output buffer
+    NativeSignatureIterator(method) {
+    _from = from;
+
+#ifdef __ABI_HARD__
+    _toGP  = to;
+    _toFP = _toGP + GPR_PARAMS;
+    _to   = _toFP + AARCH64_ONLY(FPR_PARAMS) NOT_AARCH64(8*2);  // stack words follow the FP area (FPR_PARAMS or 16 entries)
+    _last_gp = (is_static() ? 2 : 1);  // leading GP entries are left for the caller -- presumably JNIEnv and class mirror; TODO confirm
+    _last_fp = 0;
+#ifndef AARCH64
+    _last_single_fp = 0;
+#endif // !AARCH64
+#else
+    _to   = to + (is_static() ? 2 : 1);  // leading words are left for the caller -- presumably JNIEnv and class mirror; TODO confirm
+#endif // __ABI_HARD__
+  }
+};
+
+IRT_ENTRY(address, InterpreterRuntime::slow_signature_handler(JavaThread* thread, Method* method, intptr_t* from, intptr_t* to))  // copies the arguments of a native method from 'from' into 'to'
+  methodHandle m(thread, (Method*)method);
+  assert(m->is_native(), "sanity check");
+  SlowSignatureHandler(m, (address)from, to).iterate(UCONST64(-1));  // temporary handler; iterate() is given the all-ones fingerprint
+  return Interpreter::result_handler(m->result_type());  // result handler chosen from the method's result type
+IRT_END
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/interpreterRT_arm.hpp	2016-12-02 11:21:15.307577211 -0500
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_INTERPRETERRT_ARM_HPP
+#define CPU_ARM_VM_INTERPRETERRT_ARM_HPP
+
+#include "memory/allocation.hpp"
+
+// native method calls
+
+class SignatureHandlerGenerator: public NativeSignatureIterator {  // emits, through _masm, the code that passes a native method's arguments
+ private:
+  MacroAssembler* _masm;  // assembler created in the constructor over the caller's CodeBuffer
+  int _abi_offset;        // initialized to 0 by the constructor
+  int  _ireg;             // starts at 2 for static methods, 1 otherwise
+
+#ifdef __ABI_HARD__
+#ifdef AARCH64
+  int _freg;              // initialized to 0 by the constructor
+#else
+  int _fp_slot; // number of FPR's with arguments loaded
+  int _single_fpr_slot;   // initialized to 0 by the constructor
+#endif
+#endif
+
+  void move(int from_offset, int to_offset);  // NOTE(review): declared only; no definition visible in this part of the patch
+  void box(int from_offset, int to_offset);   // NOTE(review): declared only; no definition visible in this part of the patch
+
+  void pass_int();
+  void pass_long();
+  void pass_float();
+  void pass_object();
+#ifdef __ABI_HARD__
+  void pass_double();     // declared only when __ABI_HARD__ is defined
+#endif
+ public:
+  // Creation: allocates the assembler and resets all argument counters
+  SignatureHandlerGenerator(methodHandle method, CodeBuffer* buffer) : NativeSignatureIterator(method) {
+    _masm = new MacroAssembler(buffer);
+    _abi_offset = 0;
+    _ireg = is_static() ? 2 : 1;  // leading registers are skipped -- presumably JNIEnv and class mirror; TODO confirm
+#ifdef __ABI_HARD__
+#ifdef AARCH64
+    _freg = 0;
+#else
+    _fp_slot = 0;
+    _single_fpr_slot = 0;
+#endif
+#endif
+  }
+
+  // Code generation
+  void generate(uint64_t fingerprint);
+
+};
+
+#ifndef AARCH64
+// ARM provides a normalized fingerprint for native calls (to increase
+// sharing). See normalize_fast_native_fingerprint
+#define SHARING_FAST_NATIVE_FINGERPRINTS
+#endif
+
+#endif // CPU_ARM_VM_INTERPRETERRT_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/javaFrameAnchor_arm.hpp	2016-12-02 11:21:20.931896161 -0500
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_JAVAFRAMEANCHOR_ARM_HPP
+#define CPU_ARM_VM_JAVAFRAMEANCHOR_ARM_HPP
+
+private:
+
+  // FP value associated with _last_Java_sp:
+  intptr_t* volatile        _last_Java_fp;           // pointer is volatile not what it points to
+
+public:
+  // Each arch must define reset, save, restore
+  // These are used by objects that only care about:
+  //  1 - initializing a new state (thread creation, javaCalls)
+  //  2 - saving a current state (javaCalls)
+  //  3 - restoring an old state (javaCalls)
+
+  void clear(void) {  // resets the anchor: sp, fp and pc all become NULL
+    // clearing _last_Java_sp must be first
+    _last_Java_sp = NULL;
+    // fence? (open question from the original author; no barrier is issued between these stores)
+    _last_Java_fp = NULL;
+    _last_Java_pc = NULL;
+  }
+
+  void copy(JavaFrameAnchor* src) {  // copies fp, pc and sp from 'src', writing sp last
+    // In order to make sure the transition state is valid for "this"
+    // We must clear _last_Java_sp before copying the rest of the new data
+    //
+    // Hack Alert: Temporary bugfix for 4717480/4721647
+    // To act like previous version (pd_cache_state) don't NULL _last_Java_sp
+    // unless the value is changing
+    //
+    if (_last_Java_sp != src->_last_Java_sp)
+      _last_Java_sp = NULL;  // sp stays NULL while fp and pc are being replaced
+
+    _last_Java_fp = src->_last_Java_fp;
+    _last_Java_pc = src->_last_Java_pc;
+    // Must be last so profiler will always see valid frame if has_last_frame() is true
+    _last_Java_sp = src->_last_Java_sp;
+  }
+
+  // Always walkable
+  bool walkable(void) { return true; }  // unconditional: no anchor state is consulted
+  // Never anything to do since we are always walkable and can find the addresses of return addresses
+  void make_walkable(JavaThread* thread) { }  // intentionally empty; 'thread' is unused
+
+  intptr_t* last_Java_sp(void) const             { return _last_Java_sp; }  // plain read of the recorded sp
+
+  address last_Java_pc(void)                     { return _last_Java_pc; }  // plain read of the recorded pc
+
+private:
+
+  static ByteSize last_Java_fp_offset()          { return byte_offset_of(JavaFrameAnchor, _last_Java_fp); }  // byte offset of _last_Java_fp inside JavaFrameAnchor
+
+public:
+
+  void set_last_Java_sp(intptr_t* sp)            { _last_Java_sp = sp; }  // plain store; fp and pc are left untouched
+
+  intptr_t*   last_Java_fp(void)                     { return _last_Java_fp; }  // plain read of the recorded fp
+  // Assert (last_Java_sp == NULL || fp == NULL)
+  void set_last_Java_fp(intptr_t* fp)                { _last_Java_fp = fp; }  // plain store; the condition in the comment above is not checked here
+
+#endif // CPU_ARM_VM_JAVAFRAMEANCHOR_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/jniFastGetField_arm.cpp	2016-12-02 11:21:27.296257080 -0500
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "assembler_arm.inline.hpp"
+#include "memory/resourceArea.hpp"
+#include "prims/jniFastGetField.hpp"
+#include "prims/jvm_misc.hpp"
+#include "runtime/safepoint.hpp"
+
+#define __ masm->
+
+#define BUFFER_SIZE  96
+
+address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) {  // builds the fast Get<Type>Field stub for 'type' and returns its entry address
+  const char* name = NULL;
+  address slow_case_addr = NULL;  // regular JNI entry the stub jumps to when the fast path is abandoned
+  switch (type) {
+    case T_BOOLEAN:
+      name = "jni_fast_GetBooleanField";
+      slow_case_addr = jni_GetBooleanField_addr();
+      break;
+    case T_BYTE:
+      name = "jni_fast_GetByteField";
+      slow_case_addr = jni_GetByteField_addr();
+      break;
+    case T_CHAR:
+      name = "jni_fast_GetCharField";
+      slow_case_addr = jni_GetCharField_addr();
+      break;
+    case T_SHORT:
+      name = "jni_fast_GetShortField";
+      slow_case_addr = jni_GetShortField_addr();
+      break;
+    case T_INT:
+      name = "jni_fast_GetIntField";
+      slow_case_addr = jni_GetIntField_addr();
+      break;
+    case T_LONG:
+      name = "jni_fast_GetLongField";
+      slow_case_addr = jni_GetLongField_addr();
+      break;
+    case T_FLOAT:
+      name = "jni_fast_GetFloatField";
+      slow_case_addr = jni_GetFloatField_addr();
+      break;
+    case T_DOUBLE:
+      name = "jni_fast_GetDoubleField";
+      slow_case_addr = jni_GetDoubleField_addr();
+      break;
+    default:
+      ShouldNotReachHere();
+  }
+
+  // R0 - jni env
+  // R1 - object handle
+  // R2 - jfieldID
+
+  const Register Rsafepoint_counter_addr = AARCH64_ONLY(R4) NOT_AARCH64(R3);
+  const Register Robj = AARCH64_ONLY(R5) NOT_AARCH64(R1);
+  const Register Rres = AARCH64_ONLY(R6) NOT_AARCH64(R0);
+#ifndef AARCH64
+  const Register Rres_hi = R1;
+#endif // !AARCH64
+  const Register Rsafept_cnt = Rtemp;  // counter value read before the field load
+  const Register Rsafept_cnt2 = Rsafepoint_counter_addr;  // counter value re-read after the field load (reuses the address register)
+  const Register Rtmp1 = AARCH64_ONLY(R7) NOT_AARCH64(R3); // same as Rsafepoint_counter_addr on 32-bit ARM
+  const Register Rtmp2 = AARCH64_ONLY(R8) NOT_AARCH64(R2); // same as jfieldID on 32-bit ARM
+
+#ifdef AARCH64
+  assert_different_registers(Rsafepoint_counter_addr, Rsafept_cnt, Robj, Rres, Rtmp1, Rtmp2, R0, R1, R2, LR);
+  assert_different_registers(Rsafept_cnt2, Rsafept_cnt, Rres, R0, R1, R2, LR);
+#else
+  assert_different_registers(Rsafepoint_counter_addr, Rsafept_cnt, Robj, Rres, LR);
+  assert_different_registers(Rsafept_cnt, R1, R2, Rtmp1, LR);
+  assert_different_registers(Rsafepoint_counter_addr, Rsafept_cnt, Rres, Rres_hi, Rtmp2, LR);
+  assert_different_registers(Rsafept_cnt2, Rsafept_cnt, Rres, Rres_hi, LR);
+#endif // AARCH64
+
+  address fast_entry;
+
+  ResourceMark rm;
+  BufferBlob* blob = BufferBlob::create(name, BUFFER_SIZE);
+  CodeBuffer cbuf(blob);
+  MacroAssembler* masm = new MacroAssembler(&cbuf);
+  fast_entry = __ pc();  // stub entry = first emitted instruction
+
+  // Safepoint check
+  InlinedAddress safepoint_counter_addr(SafepointSynchronize::safepoint_counter_addr());
+  Label slow_case;
+  __ ldr_literal(Rsafepoint_counter_addr, safepoint_counter_addr);
+
+#ifndef AARCH64
+  __ push(RegisterSet(R0, R3));  // save incoming arguments for slow case
+#endif // !AARCH64
+
+  __ ldr_s32(Rsafept_cnt, Address(Rsafepoint_counter_addr));
+  __ tbnz(Rsafept_cnt, 0, slow_case);  // bit 0 of the counter set -> slow path
+
+  if (os::is_MP()) {
+    // Address dependency restricts memory access ordering. It's cheaper than explicit LoadLoad barrier
+    __ andr(Rtmp1, Rsafept_cnt, (unsigned)1);  // always 0 on this path (bit 0 was just tested clear)
+    __ ldr(Robj, Address(R1, Rtmp1));
+  } else {
+    __ ldr(Robj, Address(R1));
+  }
+
+#ifdef AARCH64
+  __ add(Robj, Robj, AsmOperand(R2, lsr, 2));  // field address = object + (jfieldID >> 2)
+  Address field_addr = Address(Robj);
+#else
+  Address field_addr;
+  if (type != T_BOOLEAN
+      && type != T_INT
+#ifndef __ABI_HARD__
+      && type != T_FLOAT
+#endif // !__ABI_HARD__
+      ) {
+    // Only ldr and ldrb support embedded shift, other loads do not
+    __ add(Robj, Robj, AsmOperand(R2, lsr, 2));
+    field_addr = Address(Robj);
+  } else {
+    field_addr = Address(Robj, R2, lsr, 2);
+  }
+#endif // AARCH64
+  assert(count < LIST_CAPACITY, "LIST_CAPACITY too small");
+  speculative_load_pclist[count] = __ pc();  // record the pc of the field load emitted by the switch below
+
+  switch (type) {
+    case T_BOOLEAN:
+      __ ldrb(Rres, field_addr);
+      break;
+    case T_BYTE:
+      __ ldrsb(Rres, field_addr);
+      break;
+    case T_CHAR:
+      __ ldrh(Rres, field_addr);
+      break;
+    case T_SHORT:
+      __ ldrsh(Rres, field_addr);
+      break;
+    case T_INT:
+#ifndef __ABI_HARD__
+    case T_FLOAT:
+#endif
+      __ ldr_s32(Rres, field_addr);
+      break;
+    case T_LONG:
+#ifndef __ABI_HARD__
+    case T_DOUBLE:
+#endif
+#ifdef AARCH64
+      __ ldr(Rres, field_addr);
+#else
+      // Safe to use ldrd since long and double fields are 8-byte aligned
+      __ ldrd(Rres, field_addr);
+#endif // AARCH64
+      break;
+#ifdef __ABI_HARD__
+    case T_FLOAT:
+      __ ldr_float(S0, field_addr);
+      break;
+    case T_DOUBLE:
+      __ ldr_double(D0, field_addr);
+      break;
+#endif // __ABI_HARD__
+    default:
+      ShouldNotReachHere();
+  }
+
+  if(os::is_MP()) {
+      // Address dependency restricts memory access ordering. It's cheaper than explicit LoadLoad barrier
+#if defined(__ABI_HARD__) && !defined(AARCH64)
+    if (type == T_FLOAT || type == T_DOUBLE) {
+      __ ldr_literal(Rsafepoint_counter_addr, safepoint_counter_addr);
+      __ fmrrd(Rres, Rres_hi, D0);  // move the FP result into core registers so the dependency below can be formed
+      __ eor(Rtmp2, Rres, Rres);  // always 0, but computed from the loaded value
+      __ ldr_s32(Rsafept_cnt2, Address(Rsafepoint_counter_addr, Rtmp2));
+    } else
+#endif // __ABI_HARD__ && !AARCH64
+    {
+#ifndef AARCH64
+      __ ldr_literal(Rsafepoint_counter_addr, safepoint_counter_addr);  // reload: R3 was overwritten through Rtmp1 above
+#endif // !AARCH64
+      __ eor(Rtmp2, Rres, Rres);  // always 0, but computed from the loaded value
+      __ ldr_s32(Rsafept_cnt2, Address(Rsafepoint_counter_addr, Rtmp2));
+    }
+  } else {
+    __ ldr_s32(Rsafept_cnt2, Address(Rsafepoint_counter_addr));
+  }
+  __ cmp(Rsafept_cnt2, Rsafept_cnt);  // counter unchanged -> the loaded value is returned
+#ifdef AARCH64
+  __ b(slow_case, ne);
+  __ mov(R0, Rres);
+  __ ret();
+#else
+  // discards saved R0 R1 R2 R3
+  __ add(SP, SP, 4 * wordSize, eq);
+  __ bx(LR, eq);  // conditional return; on ne execution falls through to slow_case
+#endif // AARCH64
+
+  slowcase_entry_pclist[count++] = __ pc();  // record the slow-path pc paired with the load pc stored above
+
+  __ bind(slow_case);
+#ifndef AARCH64
+  __ pop(RegisterSet(R0, R3));  // restore the arguments pushed at entry
+#endif // !AARCH64
+  // thumb mode switch handled by MacroAssembler::jump if needed
+  __ jump(slow_case_addr, relocInfo::none, Rtemp);
+
+  __ bind_literal(safepoint_counter_addr);
+
+  __ flush();
+
+  guarantee((__ pc() - fast_entry) <= BUFFER_SIZE, "BUFFER_SIZE too small");
+
+  return fast_entry;
+}
+
+address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) {  // not expected to be called: the float/double generators in this file use generate_fast_get_int_field0
+  ShouldNotReachHere();
+  return NULL;
+}
+
+address JNI_FastGetField::generate_fast_get_boolean_field() {  // returns the entry of a newly generated T_BOOLEAN stub
+  return generate_fast_get_int_field0(T_BOOLEAN);
+}
+
+address JNI_FastGetField::generate_fast_get_byte_field() {  // returns the entry of a newly generated T_BYTE stub
+  return generate_fast_get_int_field0(T_BYTE);
+}
+
+address JNI_FastGetField::generate_fast_get_char_field() {  // returns the entry of a newly generated T_CHAR stub
+  return generate_fast_get_int_field0(T_CHAR);
+}
+
+address JNI_FastGetField::generate_fast_get_short_field() {  // returns the entry of a newly generated T_SHORT stub
+  return generate_fast_get_int_field0(T_SHORT);
+}
+
+address JNI_FastGetField::generate_fast_get_int_field() {  // returns the entry of a newly generated T_INT stub
+  return generate_fast_get_int_field0(T_INT);
+}
+
+address JNI_FastGetField::generate_fast_get_long_field() {  // returns the entry of a newly generated T_LONG stub
+  return generate_fast_get_int_field0(T_LONG);
+}
+
+address JNI_FastGetField::generate_fast_get_float_field() {  // floats also go through the int generator, not generate_fast_get_float_field0
+  return generate_fast_get_int_field0(T_FLOAT);
+}
+
+address JNI_FastGetField::generate_fast_get_double_field() {  // doubles also go through the int generator, not generate_fast_get_float_field0
+  return generate_fast_get_int_field0(T_DOUBLE);
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/jniTypes_arm.hpp	2016-12-02 11:21:33.312598258 -0500
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_JNITYPES_ARM_HPP
+#define CPU_ARM_VM_JNITYPES_ARM_HPP
+
+#include "memory/allocation.hpp"
+#include "oops/oop.hpp"
+#include "prims/jni.h"
+
+// This file holds platform-dependent routines used to write primitive jni
+// types to the array of arguments passed into JavaCalls::call
+
+class JNITypes : AllStatic {
+  // These functions write a java primitive type (in native format)
+  // to a java stack slot array to be passed as an argument to JavaCalls::call.
+  // I.e., they are functionally 'push' operations if they have a 'pos'
+  // formal parameter.  Note that jlong's and jdouble's are written
+  // _in reverse_ of the order in which they appear in the interpreter
+  // stack.  This is because call stubs (see stubGenerator_arm.cpp)
+  // reverse the argument list constructed by JavaCallArguments (see
+  // javaCalls.hpp).
+
+private:
+
+#ifndef AARCH64
+  // 32bit Helper routines: copy two jints from 'from' into two slots at 'to', swapping their order.
+  static inline void put_int2r(jint *from, intptr_t *to)           { *(jint *)(to++) = from[1];
+                                                                        *(jint *)(to  ) = from[0]; }
+  static inline void put_int2r(jint *from, intptr_t *to, int& pos) { put_int2r(from, to + pos); pos += 2; }
+#endif // !AARCH64
+
+public:
+  // Ints are stored in native format in one JavaCallArgument slot at *to.
+  static inline void put_int(jint  from, intptr_t *to)           { *(jint *)(to +   0  ) =  from; }
+  static inline void put_int(jint  from, intptr_t *to, int& pos) { *(jint *)(to + pos++) =  from; }
+  static inline void put_int(jint *from, intptr_t *to, int& pos) { *(jint *)(to + pos++) = *from; }
+
+#ifdef AARCH64
+  // Longs are stored in native format in one JavaCallArgument slot at *(to+1); pos still advances by two.
+  static inline void put_long(jlong  from, intptr_t *to)           { *(jlong *)(to + 1 +   0) =  from; }
+  static inline void put_long(jlong  from, intptr_t *to, int& pos) { *(jlong *)(to + 1 + pos) =  from; pos += 2; }
+  static inline void put_long(jlong *from, intptr_t *to, int& pos) { *(jlong *)(to + 1 + pos) = *from; pos += 2; }
+#else
+  // Longs are stored in big-endian word format in two JavaCallArgument slots at *to.
+  // The high half is in *to and the low half in *(to+1).
+  static inline void put_long(jlong  from, intptr_t *to)           { put_int2r((jint *)&from, to); }
+  static inline void put_long(jlong  from, intptr_t *to, int& pos) { put_int2r((jint *)&from, to, pos); }
+  static inline void put_long(jlong *from, intptr_t *to, int& pos) { put_int2r((jint *) from, to, pos); }
+#endif // AARCH64
+
+  // Oops are stored in native format in one JavaCallArgument slot at *to.
+  static inline void put_obj(oop  from, intptr_t *to)           { *(oop *)(to +   0  ) =  from; }
+  static inline void put_obj(oop  from, intptr_t *to, int& pos) { *(oop *)(to + pos++) =  from; }
+  static inline void put_obj(oop *from, intptr_t *to, int& pos) { *(oop *)(to + pos++) = *from; }
+
+  // Floats are stored in native format in one JavaCallArgument slot at *to.
+  static inline void put_float(jfloat  from, intptr_t *to)           { *(jfloat *)(to +   0  ) =  from;  }
+  static inline void put_float(jfloat  from, intptr_t *to, int& pos) { *(jfloat *)(to + pos++) =  from; }
+  static inline void put_float(jfloat *from, intptr_t *to, int& pos) { *(jfloat *)(to + pos++) = *from; }
+
+#ifdef AARCH64
+  // Doubles are stored in native word format in one JavaCallArgument slot at *(to+1); pos still advances by two.
+  static inline void put_double(jdouble  from, intptr_t *to)           { *(jdouble *)(to + 1 +   0) =  from; }
+  static inline void put_double(jdouble  from, intptr_t *to, int& pos) { *(jdouble *)(to + 1 + pos) =  from; pos += 2; }
+  static inline void put_double(jdouble *from, intptr_t *to, int& pos) { *(jdouble *)(to + 1 + pos) = *from; pos += 2; }
+#else
+  // Doubles are stored in big-endian word format in two JavaCallArgument slots at *to.
+  // The high half is in *to and the low half in *(to+1).
+  static inline void put_double(jdouble  from, intptr_t *to)           { put_int2r((jint *)&from, to); }
+  static inline void put_double(jdouble  from, intptr_t *to, int& pos) { put_int2r((jint *)&from, to, pos); }
+  static inline void put_double(jdouble *from, intptr_t *to, int& pos) { put_int2r((jint *) from, to, pos); }
+#endif // AARCH64
+
+};
+
+#endif // CPU_ARM_VM_JNITYPES_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/jni_arm.h	2016-12-02 11:21:40.565009537 -0500
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#ifndef _JAVASOFT_JNI_MD_H_
+#define _JAVASOFT_JNI_MD_H_
+
+// Note: please do not change these without also changing jni_md.h in the JDK
+// repository
+#ifndef __has_attribute
+  #define __has_attribute(x) 0 // fallback when the compiler lacks __has_attribute: every query is false
+#endif
+#if (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ > 2))) || __has_attribute(visibility)
+  #define JNIEXPORT     __attribute__((externally_visible,visibility("default")))
+  #define JNIIMPORT     __attribute__((externally_visible,visibility("default")))
+#else // neither GCC newer than 4.2 nor a compiler reporting the visibility attribute
+  #define JNIEXPORT
+  #define JNIIMPORT
+#endif
+
+#define JNICALL // expands to nothing
+
+typedef int jint;
+#if defined(_LP64)
+  typedef long jlong;
+#else // !_LP64
+  typedef long long jlong;
+#endif // _LP64
+typedef signed char jbyte;
+
+#endif /* !_JAVASOFT_JNI_MD_H_ */
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/jvmciCodeInstaller_arm.cpp	2016-12-02 11:21:45.689300130 -0500
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "jvmci/jvmciCodeInstaller.hpp"
+#include "jvmci/jvmciRuntime.hpp"
+#include "jvmci/jvmciCompilerToVM.hpp"
+#include "jvmci/jvmciJavaClasses.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "vmreg_arm.inline.hpp"
+
+jint CodeInstaller::pd_next_offset(NativeInstruction* inst, jint pc_offset, Handle method, TRAPS) { // not implemented here: calls Unimplemented()
+  Unimplemented();
+  return 0;
+}
+
+void CodeInstaller::pd_patch_OopConstant(int pc_offset, Handle constant, TRAPS) { // not implemented here: calls Unimplemented()
+  Unimplemented();
+}
+
+void CodeInstaller::pd_patch_MetaspaceConstant(int pc_offset, Handle constant, TRAPS) { // not implemented here: calls Unimplemented()
+  Unimplemented();
+}
+
+void CodeInstaller::pd_patch_DataSectionReference(int pc_offset, int data_offset, TRAPS) { // not implemented here: calls Unimplemented()
+  Unimplemented();
+}
+
+void CodeInstaller::pd_relocate_ForeignCall(NativeInstruction* inst, jlong foreign_call_destination, TRAPS) { // not implemented here: calls Unimplemented()
+  Unimplemented();
+}
+
+void CodeInstaller::pd_relocate_JavaMethod(Handle hotspot_method, jint pc_offset, TRAPS) { // not implemented here: calls Unimplemented()
+  Unimplemented();
+}
+
+void CodeInstaller::pd_relocate_poll(address pc, jint mark, TRAPS) { // not implemented here: calls Unimplemented()
+  Unimplemented();
+}
+
+// convert JVMCI register indices (as used in oop maps) to HotSpot registers; placeholder here: ignores jvmci_reg and always returns NULL
+VMReg CodeInstaller::get_hotspot_reg(jint jvmci_reg, TRAPS) {
+  return NULL;
+}
+
+bool CodeInstaller::is_general_purpose_reg(VMReg hotspotRegister) { // placeholder: ignores its argument and always returns false
+  return false;
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/macroAssembler_arm.cpp	2016-12-02 11:21:50.529574616 -0500
@@ -0,0 +1,3120 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "asm/macroAssembler.hpp"
+#include "ci/ciEnv.hpp"
+#include "code/nativeInst.hpp"
+#include "compiler/disassembler.hpp"
+#include "gc/shared/cardTableModRefBS.hpp"
+#include "gc/shared/collectedHeap.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "memory/resourceArea.hpp"
+#include "oops/klass.inline.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/objectMonitor.hpp"
+#include "runtime/os.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "utilities/macros.hpp"
+#if INCLUDE_ALL_GCS
+#include "gc/g1/g1CollectedHeap.inline.hpp"
+#include "gc/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc/g1/heapRegion.hpp"
+#endif
+
+// Implementation of AddressLiteral
+
+void AddressLiteral::set_rspec(relocInfo::relocType rtype) { // sets _rspec to the relocation spec matching rtype
+  switch (rtype) {
+  case relocInfo::oop_type:
+    // Oops are a special case. Normally they would be their own section
+    // but in cases like icBuffer they are literals in the code stream that
+    // we don't have a section for. We use none so that we get a literal address
+    // which is always patchable.
+    break;
+  case relocInfo::external_word_type:
+    _rspec = external_word_Relocation::spec(_target);
+    break;
+  case relocInfo::internal_word_type:
+    _rspec = internal_word_Relocation::spec(_target);
+    break;
+  case relocInfo::opt_virtual_call_type:
+    _rspec = opt_virtual_call_Relocation::spec();
+    break;
+  case relocInfo::static_call_type:
+    _rspec = static_call_Relocation::spec();
+    break;
+  case relocInfo::runtime_call_type:
+    _rspec = runtime_call_Relocation::spec();
+    break;
+  case relocInfo::poll_type:
+  case relocInfo::poll_return_type:
+    _rspec = Relocation::spec_simple(rtype);
+    break;
+  case relocInfo::none:
+    break; // nothing to record: _rspec is left as it was
+  default:
+    ShouldNotReachHere(); // any other relocation type is not handled by this function
+    break;
+  }
+}
+
+// Initially added to the Assembler interface as a pure virtual:
+//   RegisterConstant delayed_value(..)
+// for:
+//   6812678 macro assembler needs delayed binding of a few constants (for 6655638)
+// this was subsequently modified to its present name and return type.
+// This implementation is not meant to be called: it trips ShouldNotReachHere().
+RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
+                                                      Register tmp,
+                                                      int offset) {
+  ShouldNotReachHere();
+  return RegisterOrConstant(-1);
+}
+
+
+#ifdef AARCH64
+// Note: ARM32 version is OS dependent
+void MacroAssembler::breakpoint(AsmCondition cond) { // emits a brk that is reached only when cond holds
+  if (cond == al) {
+    brk();
+  } else {
+    Label L;
+    b(L, inverse(cond)); // branch around the brk when cond does not hold
+    brk();
+    bind(L);
+  }
+}
+#endif // AARCH64
+
+
+// virtual method calling
+void MacroAssembler::lookup_virtual_method(Register recv_klass, // note: overwritten by the add below
+                                           Register vtable_index,
+                                           Register method_result) { // destination of the final load
+  const int base_offset = in_bytes(Klass::vtable_start_offset()) + vtableEntry::method_offset_in_bytes();
+  assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
+  add(recv_klass, recv_klass, AsmOperand(vtable_index, lsl, LogBytesPerWord)); // recv_klass += vtable_index << LogBytesPerWord
+  ldr(method_result, Address(recv_klass, base_offset));
+}
+
+
+// Simplified, combined version, good for typical uses: emits the fast path followed by the slow path.
+// Branches to L_success on success; falls through (at the local L_failure label) on failure.
+void MacroAssembler::check_klass_subtype(Register sub_klass,
+                                         Register super_klass,
+                                         Register temp_reg,
+                                         Register temp_reg2,
+                                         Register temp_reg3,
+                                         Label& L_success) {
+  Label L_failure;
+  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, temp_reg2, &L_success, &L_failure, NULL); // NULL slow-path label: fall through into the slow path
+  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, temp_reg2, temp_reg3, &L_success, NULL); // NULL failure label: fall through on failure
+  bind(L_failure);
+}
+
+void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register temp_reg,
+                                                   Register temp_reg2,
+                                                   Label* L_success,
+                                                   Label* L_failure,
+                                                   Label* L_slow_path) { // a NULL label means "fall through" for that outcome
+
+  assert_different_registers(sub_klass, super_klass, temp_reg, temp_reg2, noreg);
+  const Register super_check_offset = temp_reg2;
+
+  Label L_fallthrough;
+  int label_nulls = 0; // counts the labels that were defaulted to L_fallthrough
+  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
+  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
+  Address super_check_offset_addr(super_klass, sco_offset);
+
+  // If the pointers are equal, we are done (e.g., String[] elements).
+  // This self-check enables sharing of secondary supertype arrays among
+  // non-primary types such as array-of-interface.  Otherwise, each such
+  // type would need its own customized SSA.
+  // We move this check to the front of the fast path because many
+  // type checks are in fact trivially successful in this manner,
+  // so we get a nicely predicted branch right at the start of the check.
+  cmp(sub_klass, super_klass);
+  b(*L_success, eq);
+
+  // Check the supertype display:
+  ldr_u32(super_check_offset, super_check_offset_addr);
+
+  Address super_check_addr(sub_klass, super_check_offset);
+  ldr(temp_reg, super_check_addr); // load displayed supertype
+  cmp(super_klass, temp_reg);
+
+  // This check has worked decisively for primary supers.
+  // Secondary supers are sought in the super_cache ('super_cache_addr').
+  // (Secondary supers are interfaces and very deeply nested subtypes.)
+  // This works in the same check above because of a tricky aliasing
+  // between the super_cache and the primary super display elements.
+  // (The 'super_check_addr' can address either, as the case requires.)
+  // Note that the cache is updated below if it does not help us find
+  // what we need immediately.
+  // So if it was a primary super, we can just fail immediately.
+  // Otherwise, it's the slow path for us (no success at this point).
+
+  b(*L_success, eq);
+  cmp_32(super_check_offset, sc_offset); // did the probe address the secondary super cache slot?
+  if (L_failure == &L_fallthrough) {
+    b(*L_slow_path, eq); // cache probe missed: slow path; otherwise fall through as failure
+  } else {
+    b(*L_failure, ne); // probed a primary display slot: definite failure
+    if (L_slow_path != &L_fallthrough) {
+      b(*L_slow_path);
+    }
+  }
+
+  bind(L_fallthrough);
+}
+
+
+void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register temp_reg,
+                                                   Register temp2_reg,
+                                                   Register temp3_reg,
+                                                   Label* L_success,
+                                                   Label* L_failure,
+                                                   bool set_cond_codes) { // a NULL label means "fall through" for that outcome
+#ifdef AARCH64
+  NOT_IMPLEMENTED();
+#else
+  // Note: if used by code that expects a register to be 0 on success,
+  // this register must be temp_reg and set_cond_codes must be true
+
+  Register saved_reg = noreg;
+
+  // get additional tmp registers
+  if (temp3_reg == noreg) {
+    saved_reg = temp3_reg = LR; // borrow LR as the third temp; it is popped again on both exits below
+    push(saved_reg);
+  }
+
+  assert(temp2_reg != noreg, "need all the temporary registers");
+  assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg);
+
+  Register cmp_temp = temp_reg;
+  Register scan_temp = temp3_reg;
+  Register count_temp = temp2_reg;
+
+  Label L_fallthrough;
+  int label_nulls = 0;
+  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  // a couple of useful fields in sub_klass:
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  Address secondary_supers_addr(sub_klass, ss_offset);
+  Address super_cache_addr(     sub_klass, sc_offset);
+
+#ifndef PRODUCT
+  inc_counter((address)&SharedRuntime::_partial_subtype_ctr, scan_temp, count_temp);
+#endif
+
+  // We will consult the secondary-super array.
+  ldr(scan_temp, Address(sub_klass, ss_offset));
+
+  assert(! UseCompressedOops, "search_key must be the compressed super_klass");
+  // else search_key would presumably have to be the compressed super_klass (see the assert message above)
+  Register search_key = super_klass;
+
+  // Load the array length.
+  ldr(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
+  add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());
+
+  add(count_temp, count_temp, 1);
+
+  Label L_loop, L_setnz_and_fail, L_fail;
+
+  // Top of search loop
+  bind(L_loop);
+  // Notes:
+  //  scan_temp starts at the array elements
+  //  count_temp is 1+size
+  subs(count_temp, count_temp, 1);
+  if ((L_failure != &L_fallthrough) && (! set_cond_codes) && (saved_reg == noreg)) {
+    // direct jump to L_failure if failed and no cleanup needed
+    b(*L_failure, eq); // not found and no cleanup needed
+  } else {
+    b(L_fail, eq); // not found in the array
+  }
+
+  // Load next super to check
+  // In the array of super classes elements are pointer sized.
+  int element_size = wordSize;
+  ldr(cmp_temp, Address(scan_temp, element_size, post_indexed));
+
+  // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
+  subs(cmp_temp, cmp_temp, search_key);
+
+  // A miss means we are NOT a subtype and need to keep looping
+  b(L_loop, ne);
+
+  // Falling out the bottom means we found a hit; we ARE a subtype
+
+  // Note: temp_reg/cmp_temp is already 0 and flag Z is set
+
+  // Success.  Cache the super we found and proceed in triumph.
+  str(super_klass, Address(sub_klass, sc_offset));
+
+  if (saved_reg != noreg) {
+    // Return success
+    pop(saved_reg);
+  }
+
+  b(*L_success);
+
+  bind(L_fail);
+  // Note1: check "b(*L_failure, eq)" above if adding extra instructions here
+  if (set_cond_codes) {
+    movs(temp_reg, sub_klass); // clears Z and sets temp_reg to non-0 if needed
+  }
+  if (saved_reg != noreg) {
+    pop(saved_reg);
+  }
+  if (L_failure != &L_fallthrough) {
+    b(*L_failure);
+  }
+
+  bind(L_fallthrough);
+#endif // AARCH64
+}
+
+// Returns address of receiver parameter, using tmp as base register. tmp and params_count can be the same.
+Address MacroAssembler::receiver_argument_address(Register params_base, Register params_count, Register tmp) {
+  assert_different_registers(params_base, params_count);
+  add(tmp, params_base, AsmOperand(params_count, lsl, Interpreter::logStackElementSize)); // tmp = params_base + (params_count << logStackElementSize)
+  return Address(tmp, -Interpreter::stackElementSize); // one stack element below tmp
+}
+
+
+void MacroAssembler::align(int modulus) { // pads with nops until offset() is a multiple of modulus
+  while (offset() % modulus != 0) {
+    nop();
+  }
+}
+
+int MacroAssembler::set_last_Java_frame(Register last_java_sp,
+                                        Register last_java_fp,
+                                        bool save_last_java_pc,
+                                        Register tmp) { // returns the code offset associated with the saved PC, or -1 if no PC was saved
+  int pc_offset;
+  if (last_java_fp != noreg) {
+    // optional
+    str(last_java_fp, Address(Rthread, JavaThread::last_Java_fp_offset()));
+    _fp_saved = true;
+  } else {
+    _fp_saved = false;
+  }
+  if (AARCH64_ONLY(true) NOT_AARCH64(save_last_java_pc)) { // optional on 32-bit ARM
+#ifdef AARCH64
+    pc_offset = mov_pc_to(tmp);
+    str(tmp, Address(Rthread, JavaThread::last_Java_pc_offset()));
+#else
+    str(PC, Address(Rthread, JavaThread::last_Java_pc_offset()));
+    pc_offset = offset() + VM_Version::stored_pc_adjustment();
+#endif // AARCH64
+    _pc_saved = true;
+  } else {
+    _pc_saved = false;
+    pc_offset = -1;
+  }
+  // According to comment in javaFrameAnchor, SP must be saved last, so that other
+  // entries are valid when SP is set.
+
+  // However, this is probably not a strong constraint since for instance PC is
+  // sometimes read from the stack at SP... but is pushed later (by the call). Hence,
+  // we now write the fields in the expected order but we have not added a StoreStore
+  // barrier.
+
+  // XXX: if the ordering is really important, PC should always be saved (without forgetting
+  // to update oop_map offsets) and a StoreStore barrier might be needed.
+
+  if (last_java_sp == noreg) {
+    last_java_sp = SP; // always saved
+  }
+#ifdef AARCH64
+  if (last_java_sp == SP) {
+    mov(tmp, SP); // SP is copied into tmp first and tmp is what gets stored
+    str(tmp, Address(Rthread, JavaThread::last_Java_sp_offset()));
+  } else {
+    str(last_java_sp, Address(Rthread, JavaThread::last_Java_sp_offset()));
+  }
+#else
+  str(last_java_sp, Address(Rthread, JavaThread::last_Java_sp_offset()));
+#endif // AARCH64
+
+  return pc_offset; // for oopmaps
+}
+
+void MacroAssembler::reset_last_Java_frame(Register tmp) {
+  const Register Rzero = zero_register(tmp); // presumably a register holding 0 (may use tmp) - confirm against zero_register
+  str(Rzero, Address(Rthread, JavaThread::last_Java_sp_offset()));
+  if (_fp_saved) { // flag was recorded by set_last_Java_frame
+    str(Rzero, Address(Rthread, JavaThread::last_Java_fp_offset()));
+  }
+  if (_pc_saved) { // flag was recorded by set_last_Java_frame
+    str(Rzero, Address(Rthread, JavaThread::last_Java_pc_offset()));
+  }
+}
+
+
+// Implementation of call_VM versions
+
+void MacroAssembler::call_VM_leaf_helper(address entry_point, int number_of_arguments) { // number_of_arguments is only range-checked here
+  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
+  assert(number_of_arguments <= 4, "cannot have more than 4 arguments");
+
+#ifndef AARCH64
+  // Safer to save R9 here since callers may have been written
+  // assuming R9 survives. This is suboptimal but is not worth
+  // optimizing for the few platforms where R9 is scratched.
+  push(RegisterSet(R4) | R9ifScratched);
+  mov(R4, SP); // R4 keeps the pre-alignment SP across the call
+  bic(SP, SP, StackAlignmentInBytes - 1);
+#endif // !AARCH64
+  call(entry_point, relocInfo::runtime_call_type);
+#ifndef AARCH64
+  mov(SP, R4); // restore the SP saved before alignment
+  pop(RegisterSet(R4) | R9ifScratched);
+#endif // !AARCH64
+}
+
+
+void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { // number_of_arguments is only range-checked here
+  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
+  assert(number_of_arguments <= 3, "cannot have more than 3 arguments");
+
+  const Register tmp = Rtemp;
+  assert_different_registers(oop_result, tmp);
+
+  set_last_Java_frame(SP, FP, true, tmp);
+
+#ifdef ASSERT
+  AARCH64_ONLY(if (UseCompressedOops || UseCompressedClassPointers) { verify_heapbase("call_VM_helper: heap base corrupted?"); });
+#endif // ASSERT
+
+#ifndef AARCH64
+#if R9_IS_SCRATCHED
+  // Safer to save R9 here since callers may have been written
+  // assuming R9 survives. This is suboptimal but is not worth
+  // optimizing for the few platforms where R9 is scratched.
+
+  // Note: cannot save R9 above the saved SP (some calls expect for
+  // instance the Java stack top at the saved SP)
+  // => once saved (with set_last_Java_frame), decrease SP before rounding to
+  // ensure the slot at SP will be free for R9).
+  sub(SP, SP, 4);
+  bic(SP, SP, StackAlignmentInBytes - 1);
+  str(R9, Address(SP, 0));
+#else
+  bic(SP, SP, StackAlignmentInBytes - 1);
+#endif // R9_IS_SCRATCHED
+#endif // !AARCH64
+
+  mov(R0, Rthread); // the thread is passed in R0
+  call(entry_point, relocInfo::runtime_call_type);
+
+#ifndef AARCH64
+#if R9_IS_SCRATCHED
+  ldr(R9, Address(SP, 0));
+#endif // R9_IS_SCRATCHED
+  ldr(SP, Address(Rthread, JavaThread::last_Java_sp_offset())); // reload the SP stored by set_last_Java_frame
+#endif // !AARCH64
+
+  reset_last_Java_frame(tmp);
+
+  // C++ interp handles this in the interpreter
+  check_and_handle_popframe();
+  check_and_handle_earlyret();
+
+  if (check_exceptions) {
+    // check for pending exceptions
+    ldr(tmp, Address(Rthread, Thread::pending_exception_offset()));
+#ifdef AARCH64
+    Label L;
+    cbz(tmp, L); // no pending exception: skip the forward
+    mov_pc_to(Rexception_pc);
+    b(StubRoutines::forward_exception_entry());
+    bind(L);
+#else
+    cmp(tmp, 0);
+    mov(Rexception_pc, PC, ne); // conditional (ne): only when an exception is pending
+    b(StubRoutines::forward_exception_entry(), ne);
+#endif // AARCH64
+  }
+
+  // get oop result if there is one and reset the value in the thread
+  if (oop_result->is_valid()) {
+    get_vm_result(oop_result, tmp);
+  }
+}
+
+void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { // zero-argument form: forwards to call_VM_helper
+  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
+}
+
+
+void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, bool check_exceptions) { // emits no argument move: arg_1 must already be R1
+  assert (arg_1 == R1, "fixed register for arg_1");
+  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
+}
+
+
+void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, bool check_exceptions) { // emits no argument moves: arguments must already be in R1, R2
+  assert (arg_1 == R1, "fixed register for arg_1");
+  assert (arg_2 == R2, "fixed register for arg_2");
+  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
+}
+
+
+void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions) { // emits no argument moves: arguments must already be in R1, R2, R3
+  assert (arg_1 == R1, "fixed register for arg_1");
+  assert (arg_2 == R2, "fixed register for arg_2");
+  assert (arg_3 == R3, "fixed register for arg_3");
+  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
+}
+
+
+void MacroAssembler::call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments, bool check_exceptions) {
+  // Not used on ARM: this explicit-last_java_sp overload only calls Unimplemented()
+  Unimplemented();
+}
+
+
+void MacroAssembler::call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions) {
+  // Not used on ARM: this explicit-last_java_sp overload only calls Unimplemented()
+  Unimplemented();
+}
+
+
+void MacroAssembler::call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions) {
+  // Not used on ARM: this explicit-last_java_sp overload only calls Unimplemented()
+  Unimplemented();
+}
+
+
+void MacroAssembler::call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions) {
+  // Not used on ARM: this explicit-last_java_sp overload only calls Unimplemented()
+  Unimplemented();
+}
+
+// Raw call, without saving/restoring registers, exception handling, etc.
+// Mainly used from various stubs. The thread is passed in R0.
+void MacroAssembler::call_VM(address entry_point, bool save_R9_if_scratched) {
+  const Register tmp = Rtemp; // Rtemp free since scratched by call
+  set_last_Java_frame(SP, FP, true, tmp);
+#if R9_IS_SCRATCHED
+  if (save_R9_if_scratched) {
+    // Note: Saving also R10 for alignment.
+    push(RegisterSet(R9, R10));
+  }
+#endif // R9_IS_SCRATCHED
+  mov(R0, Rthread);
+  call(entry_point, relocInfo::runtime_call_type);
+#if R9_IS_SCRATCHED
+  if (save_R9_if_scratched) {
+    pop(RegisterSet(R9, R10));
+  }
+#endif // R9_IS_SCRATCHED
+  reset_last_Java_frame(tmp);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point) { // zero-argument form: forwards to call_VM_leaf_helper
+  call_VM_leaf_helper(entry_point, 0);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { // emits no argument move: arg_1 must already be R0
+  assert (arg_1 == R0, "fixed register for arg_1");
+  call_VM_leaf_helper(entry_point, 1);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { // emits no argument moves: arguments must already be in R0, R1
+  assert (arg_1 == R0, "fixed register for arg_1");
+  assert (arg_2 == R1, "fixed register for arg_2");
+  call_VM_leaf_helper(entry_point, 2);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { // emits no argument moves: arguments must already be in R0, R1, R2
+  assert (arg_1 == R0, "fixed register for arg_1");
+  assert (arg_2 == R1, "fixed register for arg_2");
+  assert (arg_3 == R2, "fixed register for arg_3");
+  call_VM_leaf_helper(entry_point, 3);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4) { // emits no argument moves: arguments must already be in R0, R1, R2, R3
+  assert (arg_1 == R0, "fixed register for arg_1");
+  assert (arg_2 == R1, "fixed register for arg_2");
+  assert (arg_3 == R2, "fixed register for arg_3");
+  assert (arg_4 == R3, "fixed register for arg_4");
+  call_VM_leaf_helper(entry_point, 4);
+}
+
+void MacroAssembler::get_vm_result(Register oop_result, Register tmp) {
+  assert_different_registers(oop_result, tmp);
+  ldr(oop_result, Address(Rthread, JavaThread::vm_result_offset())); // read the result from the thread
+  str(zero_register(tmp), Address(Rthread, JavaThread::vm_result_offset())); // then reset the thread's slot
+  verify_oop(oop_result);
+}
+
+void MacroAssembler::get_vm_result_2(Register metadata_result, Register tmp) {  // fetch the thread's vm_result_2 slot, then overwrite the slot
+  assert_different_registers(metadata_result, tmp);
+  ldr(metadata_result, Address(Rthread, JavaThread::vm_result_2_offset()));  // read the slot at vm_result_2_offset
+  str(zero_register(tmp), Address(Rthread, JavaThread::vm_result_2_offset()));  // presumably stores zero (via zero_register) to clear the slot
+}
+
+void MacroAssembler::add_rc(Register dst, Register arg1, RegisterOrConstant arg2) {
+  if (!arg2.is_register()) {  // constant operand: use the add overload taking a constant
+    add(dst, arg1, arg2.as_constant());
+    return;
+  }
+  add(dst, arg1, arg2.as_register());  // register operand
+}
+
+void MacroAssembler::add_slow(Register rd, Register rn, int c) {  // emits code computing rd = rn + c, using several instructions if needed
+#ifdef AARCH64
+  if (c == 0) {  // nothing to add: at most a register copy
+    if (rd != rn) {
+      mov(rd, rn);
+    }
+    return;
+  }
+  if (c < 0) {  // negative constants are delegated to sub_slow with the negated value
+    sub_slow(rd, rn, -c);
+    return;
+  }
+  if (c > right_n_bits(24)) {  // wider than 24 bits: materialize c in rd, then add rn
+    guarantee(rd != rn, "no large add_slow with only one register");
+    mov_slow(rd, c);
+    add(rd, rn, rd);
+  } else {  // fits in 24 bits: add the low and high 12-bit halves separately
+    int lo = c & right_n_bits(12);
+    int hi = (c >> 12) & right_n_bits(12);
+    if (lo != 0) {
+      add(rd, rn, lo, lsl0);
+    }
+    if (hi != 0) {
+      add(rd, (lo == 0) ? rn : rd, hi, lsl12);  // continue from rd when the low half was already added
+    }
+  }
+#else
+  // This function is used by the compiler to handle large frame offsets.
+  if ((c < 0) && (((-c) & ~0x3fc) == 0)) {  // small negative constant: a single sub suffices
+    return sub(rd, rn, (-c));
+  }
+  int low = c & 0x3fc;  // bits 2..9 of c
+  if (low != 0) {
+    add(rd, rn, low);
+    rn = rd;  // the remaining bits are added on top of the partial sum
+  }
+  if (c & ~0x3fc) {  // bits outside 2..9 must form a single rotated immediate
+    assert(AsmOperand::is_rotated_imm(c & ~0x3fc), "unsupported add_slow offset %d", c);
+    add(rd, rn, c & ~0x3fc);
+  } else if (rd != rn) {
+    assert(c == 0, "");
+    mov(rd, rn); // need to generate at least one move!
+  }
+#endif // AARCH64
+}
+
+void MacroAssembler::sub_slow(Register rd, Register rn, int c) {  // emits code computing rd = rn - c, using several instructions if needed
+#ifdef AARCH64
+  if (c <= 0) {  // zero and negative constants are delegated to add_slow with the negated value
+    add_slow(rd, rn, -c);
+    return;
+  }
+  if (c > right_n_bits(24)) {  // wider than 24 bits: materialize c in rd, then subtract it from rn
+    guarantee(rd != rn, "no large sub_slow with only one register");
+    mov_slow(rd, c);
+    sub(rd, rn, rd);
+  } else {  // fits in 24 bits: subtract the low and high 12-bit halves separately
+    int lo = c & right_n_bits(12);
+    int hi = (c >> 12) & right_n_bits(12);
+    if (lo != 0) {
+      sub(rd, rn, lo, lsl0);
+    }
+    if (hi != 0) {
+      sub(rd, (lo == 0) ? rn : rd, hi, lsl12);  // continue from rd when the low half was already subtracted
+    }
+  }
+#else
+  // This function is used by the compiler to handle large frame offsets.
+  if ((c < 0) && (((-c) & ~0x3fc) == 0)) {  // small negative constant: a single add suffices
+    return add(rd, rn, (-c));
+  }
+  int low = c & 0x3fc;  // bits 2..9 of c
+  if (low != 0) {
+    sub(rd, rn, low);
+    rn = rd;  // the remaining bits are subtracted from the partial result
+  }
+  if (c & ~0x3fc) {  // bits outside 2..9 must form a single rotated immediate
+    assert(AsmOperand::is_rotated_imm(c & ~0x3fc), "unsupported sub_slow offset %d", c);
+    sub(rd, rn, c & ~0x3fc);
+  } else if (rd != rn) {
+    assert(c == 0, "");
+    mov(rd, rn); // need to generate at least one move!
+  }
+#endif // AARCH64
+}
+
+void MacroAssembler::mov_slow(Register rd, address addr) {
+  // Forward the raw address bits to the intptr_t overload; mov_related_address must *not* be called here.
+  mov_slow(rd, reinterpret_cast<intptr_t>(addr));
+}
+
+void MacroAssembler::mov_slow(Register rd, const char *str) {
+  mov_slow(rd, reinterpret_cast<intptr_t>(str));  // the pointer value itself is loaded as an integer constant
+}
+
+#ifdef AARCH64
+
+// Common code for mov_slow and instr_count_for_mov_slow.
+// Returns the number of instructions in the mov_slow pattern for constant c,
+// emitting them through masm when it is non-NULL (rd is not used otherwise).
+int MacroAssembler::mov_slow_helper(Register rd, intptr_t c, MacroAssembler* masm) {
+  // This code pattern is matched in NativeInstruction::is_mov_slow.
+  // Keep the two in sync when modifying it.
+
+  const intx mask = right_n_bits(16);
+  // 1 movz instruction
+  for (int base_shift = 0; base_shift < 64; base_shift += 16) {
+    if ((c & ~(mask << base_shift)) == 0) {  // every bit outside this 16-bit field is zero
+      if (masm != NULL) {
+        masm->movz(rd, ((uintx)c) >> base_shift, base_shift);
+      }
+      return 1;
+    }
+  }
+  // 1 movn instruction
+  for (int base_shift = 0; base_shift < 64; base_shift += 16) {
+    if (((~c) & ~(mask << base_shift)) == 0) {  // every bit outside this 16-bit field is one
+      if (masm != NULL) {
+        masm->movn(rd, ((uintx)(~c)) >> base_shift, base_shift);
+      }
+      return 1;
+    }
+  }
+  // 1 orr instruction
+  {
+    LogicalImmediate imm(c, false);
+    if (imm.is_encoded()) {  // c is encodable as a logical immediate
+      if (masm != NULL) {
+        masm->orr(rd, ZR, imm);
+      }
+      return 1;
+    }
+  }
+  // 1 movz/movn + up to 3 movk instructions
+  int zeroes = 0;  // number of 16-bit fields equal to 0x0000
+  int ones = 0;    // number of 16-bit fields equal to 0xffff
+  for (int base_shift = 0; base_shift < 64; base_shift += 16) {
+    int part = (c >> base_shift) & mask;
+    if (part == 0) {
+      ++zeroes;
+    } else if (part == mask) {
+      ++ones;
+    }
+  }
+  int def_bits = 0;  // field value that needs no instruction: 0 (movz first) or 0xffff (movn first)
+  if (ones > zeroes) {
+    def_bits = mask;
+  }
+  int inst_count = 0;
+  for (int base_shift = 0; base_shift < 64; base_shift += 16) {
+    int part = (c >> base_shift) & mask;
+    if (part != def_bits) {  // fields equal to def_bits are skipped
+      if (masm != NULL) {
+        if (inst_count > 0) {
+          masm->movk(rd, part, base_shift);  // later fields are patched in with movk
+        } else {
+          if (def_bits == 0) {
+            masm->movz(rd, part, base_shift);
+          } else {
+            masm->movn(rd, ~part & mask, base_shift);  // movn takes the complemented field
+          }
+        }
+      }
+      inst_count++;
+    }
+  }
+  assert((1 <= inst_count) && (inst_count <= 4), "incorrect number of instructions");
+  return inst_count;
+}
+
+void MacroAssembler::mov_slow(Register rd, intptr_t c) {  // emits the instruction sequence chosen by mov_slow_helper for constant c
+#ifdef ASSERT
+  int off = offset();  // start offset, used only by the size check below
+#endif
+  (void) mov_slow_helper(rd, c, this);  // the returned instruction count is not needed here
+  assert(offset() - off == instr_count_for_mov_slow(c) * InstructionSize, "size mismatch");  // emitted size must agree with the counting-only pass
+}
+
+// Returns the number of instructions mov_slow(rd, c) would generate, without emitting any code.
+int MacroAssembler::instr_count_for_mov_slow(intptr_t c) {
+  return mov_slow_helper(noreg, c, NULL);  // a NULL masm selects the counting-only mode of the helper
+}
+
+int MacroAssembler::instr_count_for_mov_slow(address c) {  // address overload: counts for the pointer value taken as an integer
+  return mov_slow_helper(noreg, (intptr_t)c, NULL);  // a NULL masm selects the counting-only mode of the helper
+}
+
+#else
+
+void MacroAssembler::mov_slow(Register rd, intptr_t c, AsmCondition cond) {
+  // Try the single-instruction forms first: c, or its complement, as a rotated immediate.
+  if (AsmOperand::is_rotated_imm(c)) {
+    mov(rd, c, cond);
+    return;
+  }
+  if (AsmOperand::is_rotated_imm(~c)) {
+    mvn(rd, ~c, cond);
+    return;
+  }
+  if (VM_Version::supports_movw()) {
+    const unsigned int upper_half = (unsigned int)c >> 16;
+    movw(rd, c & 0xffff, cond);          // low 16 bits
+    if (upper_half) {
+      movt(rd, upper_half, cond);        // high 16 bits, skipped when zero
+    }
+    return;
+  }
+  // Otherwise assemble c from byte-wide pieces, starting at its lowest non-zero bit pair;
+  // each piece can be represented as a rotated immediate.
+  int first_set = 0;
+  while ((c & (3 << first_set)) == 0) {
+    first_set += 2;
+  }
+  const int piece = 0xff << first_set;
+  mov(rd, c & piece, cond);
+  for (int up = 8; up <= 24; up += 8) {
+    if (c & (piece << up)) {
+      orr(rd, rd, c & (piece << up), cond);
+    }
+  }
+}
+
+#endif // AARCH64
+
+void MacroAssembler::mov_oop(Register rd, jobject o, int oop_index,  // emits code that places oop constant o in rd
+#ifdef AARCH64
+                             bool patchable
+#else
+                             AsmCondition cond
+#endif
+                             ) {
+
+  if (o == NULL) {  // NULL gets no relocation: rd is simply set to zero
+#ifdef AARCH64
+    if (patchable) {
+      nop();  // extra instruction emitted only for patchable sites
+    }
+    mov(rd, ZR);
+#else
+    mov(rd, 0, cond);
+#endif
+    return;
+  }
+
+  if (oop_index == 0) {  // 0 means no index was supplied: allocate one from the oop recorder
+    oop_index = oop_recorder()->allocate_oop_index(o);
+  }
+  relocate(oop_Relocation::spec(oop_index));  // record an oop relocation for the code emitted next
+
+#ifdef AARCH64
+  if (patchable) {
+    nop();
+  }
+  ldr(rd, pc());  // NOTE(review): looks like a placeholder load fixed up through the relocation - confirm
+#else
+  if (VM_Version::supports_movw()) {
+    movw(rd, 0, cond);  // zero placeholders; NOTE(review): presumably patched with the real oop later - confirm
+    movt(rd, 0, cond);
+  } else {
+    ldr(rd, Address(PC), cond);
+    // Extra nop to handle case of large offset of oop placeholder (see NativeMovConstReg::set_data).
+    nop();
+  }
+#endif
+}
+
+void MacroAssembler::mov_metadata(Register rd, Metadata* o, int metadata_index AARCH64_ONLY_ARG(bool patchable)) {  // emits code that places metadata pointer o in rd
+  if (o == NULL) {  // NULL gets no relocation: rd is simply set to zero
+#ifdef AARCH64
+    if (patchable) {
+      nop();  // extra instruction emitted only for patchable sites
+    }
+#endif
+    mov(rd, 0);
+    return;
+  }
+
+  if (metadata_index == 0) {  // 0 means no index was supplied: allocate one from the oop recorder
+    metadata_index = oop_recorder()->allocate_metadata_index(o);
+  }
+  relocate(metadata_Relocation::spec(metadata_index));  // record a metadata relocation for the code emitted next
+
+#ifdef AARCH64
+  if (patchable) {
+    nop();
+  }
+#ifdef COMPILER2
+  if (!patchable && VM_Version::prefer_moves_over_load_literal()) {  // non-patchable sites may load the pointer as an immediate constant
+    mov_slow(rd, (address)o);
+    return;
+  }
+#endif
+  ldr(rd, pc());  // NOTE(review): looks like a placeholder load fixed up through the relocation - confirm
+#else
+  if (VM_Version::supports_movw()) {
+    movw(rd, ((int)o) & 0xffff);      // low 16 bits of the pointer
+    movt(rd, (unsigned int)o >> 16);  // high 16 bits of the pointer
+  } else {
+    ldr(rd, Address(PC));
+    // Extra nop to handle case of large offset of metadata placeholder (see NativeMovConstReg::set_data).
+    nop();
+  }
+#endif // AARCH64
+}
+
+void MacroAssembler::mov_float(FloatRegister fd, jfloat c NOT_AARCH64_ARG(AsmCondition cond)) {  // loads float constant c into fd from a literal embedded in the code stream
+  Label skip_constant;
+  union {  // reinterpret the float's bits as a 32-bit integer for emit_int32
+    jfloat f;
+    jint i;
+  } accessor;
+  accessor.f = c;
+
+#ifdef AARCH64
+  // TODO-AARCH64 - try to optimize loading of float constants with fmov and/or mov_slow
+  Label L;
+  ldr_s(fd, target(L));  // load from the literal bound at L below
+  b(skip_constant);      // jump over the inline literal
+  bind(L);
+  emit_int32(accessor.i);
+  bind(skip_constant);
+#else
+  flds(fd, Address(PC), cond);  // PC-based load; presumably resolves to the literal emitted after the branch - confirm
+  b(skip_constant);             // jump over the inline literal (the branch itself is unconditional)
+  emit_int32(accessor.i);
+  bind(skip_constant);
+#endif // AARCH64
+}
+
+void MacroAssembler::mov_double(FloatRegister fd, jdouble c NOT_AARCH64_ARG(AsmCondition cond)) {  // loads double constant c into fd from a literal embedded in the code stream
+  Label skip_constant;
+  union {  // reinterpret the double's bits as two 32-bit integers for emit_int32
+    jdouble d;
+    jint i[2];
+  } accessor;
+  accessor.d = c;
+
+#ifdef AARCH64
+  // TODO-AARCH64 - try to optimize loading of double constants with fmov
+  Label L;
+  ldr_d(fd, target(L));  // load from the literal bound at L below
+  b(skip_constant);      // jump over the inline literal
+  align(wordSize);       // align the literal to wordSize before binding L
+  bind(L);
+  emit_int32(accessor.i[0]);
+  emit_int32(accessor.i[1]);
+  bind(skip_constant);
+#else
+  fldd(fd, Address(PC), cond);  // PC-based load; presumably resolves to the literal emitted after the branch - confirm
+  b(skip_constant);             // jump over the inline literal (the branch itself is unconditional)
+  emit_int32(accessor.i[0]);
+  emit_int32(accessor.i[1]);
+  bind(skip_constant);
+#endif // AARCH64
+}
+
+void MacroAssembler::ldr_global_s32(Register reg, address address_of_global) {  // loads the 32-bit value stored at a fixed address into reg
+  intptr_t addr = (intptr_t) address_of_global;
+#ifdef AARCH64
+  assert((addr & 0x3) == 0, "address should be aligned");
+
+  // FIXME: TODO
+  if (false && page_reachable_from_cache(address_of_global)) {  // adrp-based variant, currently disabled by the constant false
+    assert(false,"TODO: relocate");
+    //relocate();
+    adrp(reg, address_of_global);
+    ldrsw(reg, Address(reg, addr & 0xfff));
+  } else {
+    mov_slow(reg, addr & ~0x3fff);            // address with its low 14 bits cleared
+    ldrsw(reg, Address(reg, addr & 0x3fff));  // low 14 bits as displacement; sign-extending word load
+  }
+#else
+  mov_slow(reg, addr & ~0xfff);           // address with its low 12 bits cleared
+  ldr(reg, Address(reg, addr & 0xfff));   // low 12 bits as displacement
+#endif
+}
+
+void MacroAssembler::ldr_global_ptr(Register reg, address address_of_global) {  // loads the pointer-sized value stored at a fixed address into reg
+#ifdef AARCH64
+  intptr_t addr = (intptr_t) address_of_global;
+  assert ((addr & 0x7) == 0, "address should be aligned");
+  mov_slow(reg, addr & ~0x7fff);           // address with its low 15 bits cleared
+  ldr(reg, Address(reg, addr & 0x7fff));   // low 15 bits as displacement
+#else
+  ldr_global_s32(reg, address_of_global);  // the 32-bit build reuses the 32-bit load
+#endif
+}
+
+void MacroAssembler::ldrb_global(Register reg, address address_of_global) {
+  const intptr_t target = (intptr_t) address_of_global;
+  mov_slow(reg, target & ~0xfff);            // address with its low 12 bits cleared
+  ldrb(reg, Address(reg, target & 0xfff));   // byte load, low 12 bits as displacement
+}
+
+void MacroAssembler::zero_extend(Register rd, Register rn, int bits) {  // rd = low `bits` bits of rn, upper bits cleared
+#ifdef AARCH64
+  switch (bits) {  // only the widths 8, 16 and 32 are supported here
+    case  8: uxtb(rd, rn); break;
+    case 16: uxth(rd, rn); break;
+    case 32: mov_w(rd, rn); break;
+    default: ShouldNotReachHere();
+  }
+#else
+  if (bits <= 8) {
+    andr(rd, rn, (1 << bits) - 1);            // keep the low bits with one AND
+  } else if (bits >= 24) {
+    bic(rd, rn, (int)(~0u << bits));          // clear the high bits; unsigned shift avoids left-shifting a negative value (bits < 32)
+  } else {
+    mov(rd, AsmOperand(rn, lsl, 32 - bits));  // shift the field to the top ...
+    mov(rd, AsmOperand(rd, lsr, 32 - bits));  // ... and back down, filling with zeroes
+  }
+#endif
+}
+
+void MacroAssembler::sign_extend(Register rd, Register rn, int bits) {  // rd = low `bits` bits of rn, sign-extended
+#ifdef AARCH64
+  switch (bits) {  // only the widths 8, 16 and 32 are supported here
+    case  8: sxtb(rd, rn); break;
+    case 16: sxth(rd, rn); break;
+    case 32: sxtw(rd, rn); break;
+    default: ShouldNotReachHere();
+  }
+#else
+  mov(rd, AsmOperand(rn, lsl, 32 - bits));  // shift the field to the top ...
+  mov(rd, AsmOperand(rd, asr, 32 - bits));  // ... and arithmetic-shift it back down
+#endif
+}
+
+#ifndef AARCH64
+
+void MacroAssembler::long_move(Register rd_lo, Register rd_hi,  // moves the pair (rn_lo, rn_hi) into (rd_lo, rd_hi), tolerating overlap
+                               Register rn_lo, Register rn_hi,
+                               AsmCondition cond) {
+  if (rd_lo != rn_hi) {  // writing rd_lo first cannot clobber rn_hi
+    if (rd_lo != rn_lo) { mov(rd_lo, rn_lo, cond); }
+    if (rd_hi != rn_hi) { mov(rd_hi, rn_hi, cond); }
+  } else if (rd_hi != rn_lo) {  // rd_lo aliases rn_hi: write the high half first
+    if (rd_hi != rn_hi) { mov(rd_hi, rn_hi, cond); }
+    if (rd_lo != rn_lo) { mov(rd_lo, rn_lo, cond); }
+  } else {  // the halves are exactly swapped: exchange them with three XORs, no temporary
+    eor(rd_lo, rd_hi, rd_lo, cond);
+    eor(rd_hi, rd_lo, rd_hi, cond);
+    eor(rd_lo, rd_hi, rd_lo, cond);
+  }
+}
+
+void MacroAssembler::long_shift(Register rd_lo, Register rd_hi,  // 64-bit shift of (rn_lo, rn_hi) by a register count into (rd_lo, rd_hi)
+                                Register rn_lo, Register rn_hi,
+                                AsmShift shift, Register count) {
+  Register tmp;  // scratch: a destination register that aliases none of the inputs
+  if (rd_lo != rn_lo && rd_lo != rn_hi && rd_lo != count) {
+    tmp = rd_lo;
+  } else {
+    tmp = rd_hi;
+  }
+  assert_different_registers(tmp, count, rn_lo, rn_hi);
+
+  subs(tmp, count, 32);  // tmp = count - 32; the flags select pl (count >= 32) or mi (count < 32) below
+  if (shift == lsl) {
+    assert_different_registers(rd_hi, rn_lo);
+    assert_different_registers(count, rd_hi);
+    mov(rd_hi, AsmOperand(rn_lo, shift, tmp), pl);  // count >= 32: the high word comes from the low word alone
+    rsb(tmp, count, 32, mi);                        // count < 32: tmp = 32 - count
+    if (rd_hi == rn_hi) {  // rn_hi must be consumed before rd_hi is overwritten
+      mov(rd_hi, AsmOperand(rn_hi, lsl, count), mi);
+      orr(rd_hi, rd_hi, AsmOperand(rn_lo, lsr, tmp), mi);
+    } else {
+      mov(rd_hi, AsmOperand(rn_lo, lsr, tmp), mi);
+      orr(rd_hi, rd_hi, AsmOperand(rn_hi, lsl, count), mi);
+    }
+    mov(rd_lo, AsmOperand(rn_lo, shift, count));  // low word, emitted unconditionally
+  } else {
+    assert_different_registers(rd_lo, rn_hi);
+    assert_different_registers(rd_lo, count);
+    mov(rd_lo, AsmOperand(rn_hi, shift, tmp), pl);  // count >= 32: the low word comes from the high word alone
+    rsb(tmp, count, 32, mi);                        // count < 32: tmp = 32 - count
+    if (rd_lo == rn_lo) {  // rn_lo must be consumed before rd_lo is overwritten
+      mov(rd_lo, AsmOperand(rn_lo, lsr, count), mi);
+      orr(rd_lo, rd_lo, AsmOperand(rn_hi, lsl, tmp), mi);
+    } else {
+      mov(rd_lo, AsmOperand(rn_hi, lsl, tmp), mi);
+      orr(rd_lo, rd_lo, AsmOperand(rn_lo, lsr, count), mi);
+    }
+    mov(rd_hi, AsmOperand(rn_hi, shift, count));  // high word, emitted unconditionally
+  }
+}
+
+void MacroAssembler::long_shift(Register rd_lo, Register rd_hi,  // 64-bit shift of (rn_lo, rn_hi) by a constant count into (rd_lo, rd_hi)
+                                Register rn_lo, Register rn_hi,
+                                AsmShift shift, int count) {
+  assert(count != 0 && (count & ~63) == 0, "must be");  // count must be in 1..63
+
+  if (shift == lsl) {
+    assert_different_registers(rd_hi, rn_lo);
+    if (count >= 32) {  // the whole low word moves into the high word
+      mov(rd_hi, AsmOperand(rn_lo, lsl, count - 32));
+      mov(rd_lo, 0);
+    } else {
+      mov(rd_hi, AsmOperand(rn_hi, lsl, count));
+      orr(rd_hi, rd_hi, AsmOperand(rn_lo, lsr, 32 - count));  // bits carried over from the low word
+      mov(rd_lo, AsmOperand(rn_lo, lsl, count));
+    }
+  } else {
+    assert_different_registers(rd_lo, rn_hi);
+    if (count >= 32) {  // the low word comes from the high word alone
+      if (count == 32) {
+        mov(rd_lo, rn_hi);
+      } else {
+        mov(rd_lo, AsmOperand(rn_hi, shift, count - 32));
+      }
+      if (shift == asr) {
+        mov(rd_hi, AsmOperand(rn_hi, asr, 0));  // NOTE(review): asr #0 presumably encodes asr #32 (sign fill) - confirm against AsmOperand
+      } else {
+        mov(rd_hi, 0);
+      }
+    } else {
+      mov(rd_lo, AsmOperand(rn_lo, lsr, count));
+      orr(rd_lo, rd_lo, AsmOperand(rn_hi, lsl, 32 - count));  // bits carried over from the high word
+      mov(rd_hi, AsmOperand(rn_hi, shift, count));
+    }
+  }
+}
+#endif // !AARCH64
+
+void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
+  // This code pattern is matched in NativeInstruction::skip_verify_oop.
+  // Keep the two in sync when modifying it. Nothing is emitted unless VerifyOops is set.
+  if (!VerifyOops) return;
+
+#ifdef COMPILER1
+  if (CommentedAssembly) {
+    char buffer[64];  // scoped to its only use, so other builds have no unused local
+    snprintf(buffer, sizeof(buffer), "verify_oop at %d", offset());
+    block_comment(buffer);
+  }
+#endif
+  const char* msg_buffer = NULL;
+  {
+    ResourceMark rm;
+    stringStream ss;
+    ss.print("%s at offset %d (%s:%d)", s, offset(), file, line);
+    msg_buffer = code_string(ss.as_string());  // copy obtained from code_string() is used after this scope ends
+  }
+
+  save_all_registers();
+
+  if (reg != R2) {
+    mov(R2, reg);                                // oop to verify
+  }
+  mov(R1, SP);                                   // register save area
+
+  Label done;
+  InlinedString Lmsg(msg_buffer);
+  ldr_literal(R0, Lmsg);                         // message
+
+  // call indirectly to solve generation ordering problem
+  ldr_global_ptr(Rtemp, StubRoutines::verify_oop_subroutine_entry_address());
+  call(Rtemp);
+
+  restore_all_registers();
+
+  b(done);                                       // skip the inline literal
+#ifdef COMPILER2
+  int off = offset();
+#endif
+  bind_literal(Lmsg);
+#ifdef COMPILER2
+  if (offset() - off == 1 * wordSize) {
+    // no padding, so insert nop for worst-case sizing
+    nop();
+  }
+#endif
+  bind(done);
+}
+
+void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {  // like _verify_oop, but the oop is loaded from addr
+  if (!VerifyOops) return;  // nothing is emitted unless VerifyOops is set
+
+  const char* msg_buffer = NULL;
+  {
+    ResourceMark rm;
+    stringStream ss;
+    if ((addr.base() == SP) && (addr.index()==noreg)) {  // plain SP + displacement: include the displacement in the message
+      ss.print("verify_oop_addr SP[%d]: %s", (int)addr.disp(), s);
+    } else {
+      ss.print("verify_oop_addr: %s", s);
+    }
+    ss.print(" (%s:%d)", file, line);
+    msg_buffer = code_string(ss.as_string());  // copy obtained from code_string() is used after this scope ends
+  }
+
+  int push_size = save_all_registers();  // bytes pushed; SP-relative addresses must be adjusted by this amount
+
+  if (addr.base() == SP) {
+    // computes an addr that takes into account the push
+    if (addr.index() != noreg) {
+      Register new_base = addr.index() == R2 ? R1 : R2; // avoid corrupting the index
+      add(new_base, SP, push_size);
+      addr = addr.rebase(new_base);
+    } else {
+      addr = addr.plus_disp(push_size);
+    }
+  }
+
+  ldr(R2, addr);                                 // oop to verify
+  mov(R1, SP);                                   // register save area
+
+  Label done;
+  InlinedString Lmsg(msg_buffer);
+  ldr_literal(R0, Lmsg);                         // message
+
+  // call indirectly to solve generation ordering problem
+  ldr_global_ptr(Rtemp, StubRoutines::verify_oop_subroutine_entry_address());
+  call(Rtemp);
+
+  restore_all_registers();
+
+  b(done);  // skip the inline literal
+  bind_literal(Lmsg);
+  bind(done);
+}
+
+void MacroAssembler::null_check(Register reg, Register tmp, int offset) {  // emits a load from [reg] when an explicit null check is needed for this offset
+  if (needs_explicit_null_check(offset)) {
+#ifdef AARCH64
+    ldr(ZR, Address(reg));  // load into ZR: the value itself is not kept
+#else
+    assert_different_registers(reg, tmp);
+    if (tmp == noreg) {  // no scratch register supplied: fall back to Rtemp
+      tmp = Rtemp;
+      assert((! Thread::current()->is_Compiler_thread()) ||
+             (! (ciEnv::current()->task() == NULL)) ||  // NOTE(review): holds for any non-NULL task, making the comp_level test below moot - confirm intent
+             (! (ciEnv::current()->comp_level() == CompLevel_full_optimization)),
+             "Rtemp not available in C2"); // explicit tmp register required
+      // XXX: could we mark the code buffer as not compatible with C2 ?
+    }
+    ldr(tmp, Address(reg));  // the loaded value is not used afterwards in this function
+#endif
+  }
+}
+
+// Puts the address of the allocated object into register `obj` and the end of the allocated object into register `obj_end`.
+void MacroAssembler::eden_allocate(Register obj, Register obj_end, Register tmp1, Register tmp2,
+                                 RegisterOrConstant size_expression, Label& slow_case) {
+  if (!Universe::heap()->supports_inline_contig_alloc()) {  // no inline allocation possible: always take the slow case
+    b(slow_case);
+    return;
+  }
+
+  CollectedHeap* ch = Universe::heap();
+
+  const Register top_addr = tmp1;  // holds the address of the heap top pointer
+  const Register heap_end = tmp2;  // holds the heap end; scratched again by the final update
+
+  if (size_expression.is_register()) {
+    assert_different_registers(obj, obj_end, top_addr, heap_end, size_expression.as_register());
+  } else {
+    assert_different_registers(obj, obj_end, top_addr, heap_end);
+  }
+
+  bool load_const = AARCH64_ONLY(false) NOT_AARCH64(VM_Version::supports_movw() ); // TODO-AARCH64 check performance
+  if (load_const) {
+    mov_address(top_addr, (address)Universe::heap()->top_addr(), symbolic_Relocation::eden_top_reference);
+  } else {
+    ldr(top_addr, Address(Rthread, JavaThread::heap_top_addr_offset()));
+  }
+  // Calculate new heap_top by adding the size of the object
+  Label retry;
+  bind(retry);
+
+#ifdef AARCH64
+  ldxr(obj, top_addr);  // exclusive load, paired with the stxr below
+#else
+  ldr(obj, Address(top_addr));
+#endif // AARCH64
+
+  ldr(heap_end, Address(top_addr, (intptr_t)ch->end_addr() - (intptr_t)ch->top_addr()));  // the end pointer is addressed relative to the top pointer
+  add_rc(obj_end, obj, size_expression);
+  // Check if obj_end wrapped around, i.e., obj_end < obj. If yes, jump to the slow case.
+  cmp(obj_end, obj);
+  b(slow_case, lo);
+  // Update heap_top if allocation succeeded
+  cmp(obj_end, heap_end);
+  b(slow_case, hi);  // the new top would exceed the heap end
+
+#ifdef AARCH64
+  stxr(heap_end/*scratched*/, obj_end, top_addr);  // status of the exclusive store lands in heap_end
+  cbnz_w(heap_end, retry);                         // non-zero status: start over
+#else
+  atomic_cas_bool(obj, obj_end, top_addr, 0, heap_end/*scratched*/);
+  b(retry, ne);  // start over when the flags say ne (presumably a failed compare-and-swap - confirm)
+#endif // AARCH64
+}
+
+// Puts the address of the allocated object into register `obj` and the end of the allocated object into register `obj_end`.
+void MacroAssembler::tlab_allocate(Register obj, Register obj_end, Register tmp1,
+                                 RegisterOrConstant size_expression, Label& slow_case) {
+  const Register tlab_end = tmp1;
+  assert_different_registers(obj, obj_end, tlab_end);
+
+  ldr(obj, Address(Rthread, JavaThread::tlab_top_offset()));       // current TLAB top becomes the object address
+  ldr(tlab_end, Address(Rthread, JavaThread::tlab_end_offset()));
+  add_rc(obj_end, obj, size_expression);
+  cmp(obj_end, tlab_end);
+  b(slow_case, hi);                                                // object does not fit: take the slow case
+  str(obj_end, Address(Rthread, JavaThread::tlab_top_offset()));   // publish the new top
+}
+
+void MacroAssembler::tlab_refill(Register top, Register tmp1, Register tmp2,  // retire the current TLAB and allocate a new one from eden, or branch to try_eden / slow_case
+                                 Register tmp3, Register tmp4,
+                               Label& try_eden, Label& slow_case) {
+  if (!Universe::heap()->supports_inline_contig_alloc()) {  // no inline allocation possible: always take the slow case
+    b(slow_case);
+    return;
+  }
+
+  InlinedAddress intArrayKlass_addr((address)Universe::intArrayKlassObj_addr());
+  Label discard_tlab, do_refill;
+  ldr(top,  Address(Rthread, JavaThread::tlab_top_offset()));
+  ldr(tmp1, Address(Rthread, JavaThread::tlab_end_offset()));
+  ldr(tmp2, Address(Rthread, JavaThread::tlab_refill_waste_limit_offset()));
+
+  // Calculate amount of free space
+  sub(tmp1, tmp1, top);
+  // Retain tlab and allocate in shared space
+  // if the amount of free space in tlab is too large to discard
+  cmp(tmp2, AsmOperand(tmp1, lsr, LogHeapWordSize));  // waste limit vs. free space in heap words
+  b(discard_tlab, ge);
+
+  // Increment waste limit to prevent getting stuck on this slow path
+  mov_slow(tmp3, ThreadLocalAllocBuffer::refill_waste_limit_increment());
+  add(tmp2, tmp2, tmp3);
+  str(tmp2, Address(Rthread, JavaThread::tlab_refill_waste_limit_offset()));
+  if (TLABStats) {
+    ldr_u32(tmp2, Address(Rthread, JavaThread::tlab_slow_allocations_offset()));
+    add_32(tmp2, tmp2, 1);
+    str_32(tmp2, Address(Rthread, JavaThread::tlab_slow_allocations_offset()));
+  }
+  b(try_eden);
+  bind_literal(intArrayKlass_addr);  // the literal sits right after the unconditional branch above
+
+  bind(discard_tlab);
+  if (TLABStats) {
+    ldr_u32(tmp2, Address(Rthread, JavaThread::tlab_number_of_refills_offset()));
+    ldr_u32(tmp3, Address(Rthread, JavaThread::tlab_fast_refill_waste_offset()));
+    add_32(tmp2, tmp2, 1);
+    add_32(tmp3, tmp3, AsmOperand(tmp1, lsr, LogHeapWordSize));
+    str_32(tmp2, Address(Rthread, JavaThread::tlab_number_of_refills_offset()));
+    str_32(tmp3, Address(Rthread, JavaThread::tlab_fast_refill_waste_offset()));
+  }
+  // If tlab is currently allocated (top or end != null)
+  // then fill [top, end + alignment_reserve) with array object
+  cbz(top, do_refill);
+
+  // Set up the mark word
+  mov_slow(tmp2, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
+  str(tmp2, Address(top, oopDesc::mark_offset_in_bytes()));
+  // Set klass to intArrayKlass and the length to the remaining space
+  ldr_literal(tmp2, intArrayKlass_addr);
+  add(tmp1, tmp1, ThreadLocalAllocBuffer::alignment_reserve_in_bytes() -
+      typeArrayOopDesc::header_size(T_INT) * HeapWordSize);
+  Register klass = tmp2;
+  ldr(klass, Address(tmp2));  // dereference the literal address to obtain the klass
+  logical_shift_right(tmp1, tmp1, LogBytesPerInt); // divide by sizeof(jint)
+  str_32(tmp1, Address(top, arrayOopDesc::length_offset_in_bytes()));
+  store_klass(klass, top); // blows klass:
+  klass = noreg;
+
+  ldr(tmp1, Address(Rthread, JavaThread::tlab_start_offset()));
+  sub(tmp1, top, tmp1); // size of tlab's allocated portion
+  incr_allocated_bytes(tmp1, tmp2);
+
+  bind(do_refill);
+  // Refill the tlab with an eden allocation
+  ldr(tmp1, Address(Rthread, JavaThread::tlab_size_offset()));
+  logical_shift_left(tmp4, tmp1, LogHeapWordSize);  // tlab size converted from heap words to bytes
+  eden_allocate(top, tmp1, tmp2, tmp3, tmp4, slow_case);  // top = new TLAB start, tmp1 = its end
+  str(top, Address(Rthread, JavaThread::tlab_start_offset()));
+  str(top, Address(Rthread, JavaThread::tlab_top_offset()));
+
+#ifdef ASSERT
+  // Verify that tmp1 contains tlab_end
+  ldr(tmp2, Address(Rthread, JavaThread::tlab_size_offset()));
+  add(tmp2, top, AsmOperand(tmp2, lsl, LogHeapWordSize));
+  cmp(tmp1, tmp2);
+  breakpoint(ne);
+#endif
+
+  sub(tmp1, tmp1, ThreadLocalAllocBuffer::alignment_reserve_in_bytes());  // the recorded end excludes the alignment reserve
+  str(tmp1, Address(Rthread, JavaThread::tlab_end_offset()));
+
+  if (ZeroTLAB) {
+    // clobbers start and tmp
+    // top must be preserved!
+    add(tmp1, tmp1, ThreadLocalAllocBuffer::alignment_reserve_in_bytes());  // back to the real end for zeroing
+    ldr(tmp2, Address(Rthread, JavaThread::tlab_start_offset()));
+    zero_memory(tmp2, tmp1, tmp3);
+  }
+}
+
+// Fills the memory region [start, end) with zeroes, one word at a time or faster. Clobbers the `start` and `tmp` registers.
+void MacroAssembler::zero_memory(Register start, Register end, Register tmp) {
+  Label loop;
+  const Register ptr = start;  // start doubles as the running store pointer
+
+#ifdef AARCH64
+  // TODO-AARCH64 - compare performance of 2x word zeroing with simple 1x
+  const Register size = tmp;
+  Label remaining, done;
+
+  sub(size, end, start);  // number of bytes to zero
+
+#ifdef ASSERT
+  { Label L;
+    tst(size, wordSize - 1);
+    b(L, eq);
+    stop("size is not a multiple of wordSize");
+    bind(L);
+  }
+#endif // ASSERT
+
+  subs(size, size, wordSize);
+  b(remaining, le);  // at most one word to zero: skip the pair loop
+
+  // Zero by 2 words per iteration.
+  bind(loop);
+  subs(size, size, 2*wordSize);
+  stp(ZR, ZR, Address(ptr, 2*wordSize, post_indexed));
+  b(loop, gt);
+
+  bind(remaining);
+  b(done, ne);  // flags come from the latest subs: eq means exactly one word is left
+  str(ZR, Address(ptr));
+  bind(done);
+#else
+  mov(tmp, 0);
+  bind(loop);
+  cmp(ptr, end);
+  str(tmp, Address(ptr, wordSize, post_indexed), lo);  // store and advance only while ptr < end
+  b(loop, lo);
+#endif // AARCH64
+}
+
+void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register tmp) {  // adds size_in_bytes to the thread's allocated_bytes counter
+#ifdef AARCH64
+  ldr(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
+  add_rc(tmp, tmp, size_in_bytes);
+  str(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
+#else
+  // Bump total bytes allocated by this thread
+  Label done;
+
+  ldr(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
+  adds(tmp, tmp, size_in_bytes);
+  str(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())), cc);  // no carry: updating the low word is enough
+  b(done, cc);
+
+  // Increment the high word and store single-copy atomically (an unlikely scenario on typical embedded systems, as it means >4GB has been allocated).
+  // To do so, ldrd/strd instructions are used, which require an even-odd pair of registers. Such a request could be difficult to satisfy by
+  // allocating those registers on a higher level, therefore the routine is ready to allocate a pair itself.
+  Register low, high;
+  // Select either R0/R1 or R2/R3
+
+  if (size_in_bytes.is_register() && (size_in_bytes.as_register() == R0 || size_in_bytes.as_register() == R1)) {  // avoid the pair that holds the size
+    low = R2;
+    high  = R3;
+  } else {
+    low = R0;
+    high  = R1;
+  }
+  push(RegisterSet(low, high));  // the chosen pair is saved here and restored after the update
+
+  ldrd(low, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
+  adds(low, low, size_in_bytes);
+  adc(high, high, 0);  // propagate the carry into the high word
+  strd(low, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
+
+  pop(RegisterSet(low, high));
+
+  bind(done);
+#endif // AARCH64
+}
+
+void MacroAssembler::arm_stack_overflow_check(int frame_size_in_bytes, Register tmp) {  // stack banging for a frame size known at code generation time
+  // Version of AbstractAssembler::generate_stack_overflow_check optimized for ARM
+  if (UseStackBanging) {
+    const int page_size = os::vm_page_size();
+
+    sub_slow(tmp, SP, JavaThread::stack_shadow_zone_size());  // first probe: SP minus the shadow zone size
+    strb(R0, Address(tmp));                                   // byte store as the probe
+#ifdef AARCH64
+    for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= page_size) {  // one extra probe per full page of frame
+      sub(tmp, tmp, page_size);
+      strb(R0, Address(tmp));
+    }
+#else
+    for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= 0xff0) {  // further probes in steps of 0xff0 bytes
+      strb(R0, Address(tmp, -0xff0, pre_indexed));
+    }
+#endif // AARCH64
+  }
+}
+
+void MacroAssembler::arm_stack_overflow_check(Register Rsize, Register tmp) {  // stack banging for a frame size held in Rsize (Rsize and tmp are overwritten)
+  if (UseStackBanging) {
+    Label loop;
+
+    mov(tmp, SP);
+    add_slow(Rsize, Rsize, JavaThread::stack_shadow_zone_size() - os::vm_page_size());  // total distance to probe
+#ifdef AARCH64
+    sub(tmp, tmp, Rsize);  // lowest address to probe; Rsize then counts down as an offset from it
+    bind(loop);
+    subs(Rsize, Rsize, os::vm_page_size());
+    strb(ZR, Address(tmp, Rsize));
+#else
+    bind(loop);
+    subs(Rsize, Rsize, 0xff0);                     // remaining distance, in steps of 0xff0 bytes
+    strb(R0, Address(tmp, -0xff0, pre_indexed));   // move tmp down by one step and probe there
+#endif // AARCH64
+    b(loop, hi);  // repeat while the subtraction left a positive remainder (unsigned)
+  }
+}
+
+void MacroAssembler::stop(const char* msg) {  // emits code that saves all registers and transfers to MacroAssembler::debug with msg
+  // This code pattern is matched in NativeInstruction::is_stop.
+  // Keep the two in sync when modifying it.
+#ifdef COMPILER1
+  if (CommentedAssembly) {
+    block_comment("stop");
+  }
+#endif
+
+  InlinedAddress Ldebug(CAST_FROM_FN_PTR(address, MacroAssembler::debug));
+  InlinedString Lmsg(msg);
+
+  // save all registers for further inspection
+  save_all_registers();
+
+  ldr_literal(R0, Lmsg);                     // message
+  mov(R1, SP);                               // register save area
+
+#ifdef AARCH64
+  ldr_literal(Rtemp, Ldebug);
+  br(Rtemp);                                 // jump to MacroAssembler::debug
+#else
+  ldr_literal(PC, Ldebug);                   // call MacroAssembler::debug
+#endif // AARCH64
+
+#if defined(COMPILER2) && defined(AARCH64)
+  int off = offset();
+#endif
+  bind_literal(Lmsg);                        // the literals follow the jump; no code is emitted after them here
+  bind_literal(Ldebug);
+#if defined(COMPILER2) && defined(AARCH64)
+  if (offset() - off == 2 * wordSize) {
+    // no padding, so insert nop for worst-case sizing
+    nop();
+  }
+#endif
+}
+
+void MacroAssembler::warn(const char* msg) {  // emits code that calls warning() with msg and then continues execution
+#ifdef COMPILER1
+  if (CommentedAssembly) {
+    block_comment("warn");
+  }
+#endif
+
+  InlinedAddress Lwarn(CAST_FROM_FN_PTR(address, warning));
+  InlinedString Lmsg(msg);
+  Label done;
+
+  int push_size = save_caller_save_registers();  // byte size of the save area, used for R3 on AARCH64
+
+#ifdef AARCH64
+  // TODO-AARCH64 - get rid of extra debug parameters
+  mov(R1, LR);
+  mov(R2, FP);
+  add(R3, SP, push_size);                   // SP value from before the registers were saved
+#endif
+
+  ldr_literal(R0, Lmsg);                    // message
+  ldr_literal(LR, Lwarn);                   // call warning
+
+  call(LR);
+
+  restore_caller_save_registers();
+
+  b(done);                                  // skip the inline literals
+  bind_literal(Lmsg);
+  bind_literal(Lwarn);
+  bind(done);
+}
+
+
+int MacroAssembler::save_all_registers() {  // pushes the full register set and returns the number of bytes pushed
+  // This code pattern is matched in NativeInstruction::is_save_all_registers.
+  // Keep the two in sync when modifying it.
+#ifdef AARCH64
+  const Register tmp = Rtemp;
+  raw_push(R30, ZR);  // slot 31 initially holds ZR; it is overwritten with the pc below
+  for (int i = 28; i >= 0; i -= 2) {  // remaining registers in pairs, from R28/R29 down to R0/R1
+      raw_push(as_Register(i), as_Register(i+1));
+  }
+  mov_pc_to(tmp);
+  str(tmp, Address(SP, 31*wordSize));                // store the pc value in slot 31
+  ldr(tmp, Address(SP, tmp->encoding()*wordSize));   // reload tmp's own saved value, since it was just used as scratch
+  return 32*wordSize;
+#else
+  push(RegisterSet(R0, R12) | RegisterSet(LR) | RegisterSet(PC));  // 13 + 1 + 1 = 15 registers
+  return 15*wordSize;
+#endif // AARCH64
+}
+
+void MacroAssembler::restore_all_registers() {  // undoes save_all_registers
+#ifdef AARCH64
+  for (int i = 0; i <= 28; i += 2) {  // pop the pairs in the reverse of the push order
+    raw_pop(as_Register(i), as_Register(i+1));
+  }
+  raw_pop(R30, ZR);  // the second popped slot (the saved pc) goes to ZR
+#else
+  pop(RegisterSet(R0, R12) | RegisterSet(LR));   // restore registers
+  add(SP, SP, wordSize);                         // discard saved PC
+#endif // AARCH64
+}
+
+int MacroAssembler::save_caller_save_registers() {  // pushes the caller-saved register set and returns the number of bytes pushed
+#ifdef AARCH64
+  for (int i = 0; i <= 16; i += 2) {  // R0..R17 in pairs (9 pairs)
+    raw_push(as_Register(i), as_Register(i+1));
+  }
+  raw_push(R18, LR);  // tenth pair, giving the 20 words returned below
+  return 20*wordSize;
+#else
+#if R9_IS_SCRATCHED
+  // Save also R10 to preserve alignment
+  push(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR) | RegisterSet(R9,R10));  // 4 + 1 + 1 + 2 = 8 registers
+  return 8*wordSize;
+#else
+  push(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR));  // 4 + 1 + 1 = 6 registers
+  return 6*wordSize;
+#endif
+#endif // AARCH64
+}
+
+void MacroAssembler::restore_caller_save_registers() {  // undoes save_caller_save_registers
+#ifdef AARCH64
+  raw_pop(R18, LR);  // pushed last, so popped first
+  for (int i = 16; i >= 0; i -= 2) {  // remaining pairs in the reverse of the push order
+    raw_pop(as_Register(i), as_Register(i+1));
+  }
+#else
+#if R9_IS_SCRATCHED
+  pop(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR) | RegisterSet(R9,R10));  // same set as pushed, including R9/R10
+#else
+  pop(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR));  // same set as pushed
+#endif
+#endif // AARCH64
+}
+
+void MacroAssembler::debug(const char* msg, const intx* registers) {
+  // Fake an in_VM thread state first; locks would not work otherwise.
+  JavaThread* const self = JavaThread::current();
+  self->set_thread_state(_thread_in_vm);
+
+  if (!ShowMessageBoxOnError) {
+    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
+  } else {
+    ttyLocker ttyl;
+    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
+      BytecodeCounter::print();
+    }
+    if (os::message_box(msg, "Execution stopped, print registers?")) {
+#ifdef AARCH64
+      // The save area holds R0-R30 followed by PC.
+      const int nregs = 32;
+#else
+      // The save area holds R0-R12, LR and PC, in that order.
+      const int nregs = 15;
+      const Register regs[nregs] = {R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, PC};
+#endif // AARCH64
+
+      for (int slot = 0; slot < nregs AARCH64_ONLY(-1); slot++) {
+        tty->print_cr("%s = " INTPTR_FORMAT, AARCH64_ONLY(as_Register(slot)) NOT_AARCH64(regs[slot])->name(), registers[slot]);
+      }
+
+#ifdef AARCH64
+      tty->print_cr("pc = " INTPTR_FORMAT, registers[nregs-1]);
+#endif // AARCH64
+
+      // The original SP is the address just past the register save area.
+      tty->print_cr("%s = " INTPTR_FORMAT, SP->name(), p2i(&registers[nregs]));
+    }
+    BREAKPOINT;
+  }
+  assert(false, "DEBUG MESSAGE: %s", msg);
+  fatal("%s", msg); // returning from MacroAssembler::debug is not supported
+}
+
+void MacroAssembler::unimplemented(const char* what) {
+  // Build "unimplemented: <what>" in a temporary stream. The pointer that is
+  // finally handed to stop() is the one returned by code_string(), obtained
+  // while the resource mark is still active.
+  const char* stop_msg = NULL;
+  {
+    ResourceMark rm;
+    stringStream text;
+    text.print("unimplemented: %s", what);
+    stop_msg = code_string(text.as_string());
+  }
+  stop(stop_msg);
+}
+
+
+// Implementation of FixedSizeCodeBlock
+
+FixedSizeCodeBlock::FixedSizeCodeBlock(MacroAssembler* masm, int size_in_instrs, bool enabled)
+  : _masm(masm),
+    _start(masm->pc()),                 // pc at which the fixed-size block begins
+    _size_in_instrs(size_in_instrs),
+    _enabled(enabled) {
+  // Nothing to emit here; padding (if enabled) happens in the destructor.
+}
+
+FixedSizeCodeBlock::~FixedSizeCodeBlock() {
+  // A disabled block emits no padding and performs no checks.
+  if (!_enabled) {
+    return;
+  }
+
+  const address block_end = _masm->pc();
+
+  assert(_start < block_end, "invalid current pc");
+  guarantee(block_end <= _start + _size_in_instrs * Assembler::InstructionSize, "code block is too long");
+
+  // Pad with nops until the block spans exactly _size_in_instrs instructions.
+  int missing = (_start - block_end) / Assembler::InstructionSize + _size_in_instrs;
+  while (missing-- > 0) {
+    _masm->nop();
+  }
+}
+
+#ifdef AARCH64
+
+// Serializes memory.
+// The tmp register is unused on AArch64; the parameter exists solely for
+// better compatibility with the 32-bit ARM variant.
+void MacroAssembler::membar(Membar_mask_bits order_constraint, Register tmp) {
+  // No barrier is emitted when os::is_MP() reports false.
+  if (!os::is_MP()) {
+    return;
+  }
+
+  // TODO-AARCH64 investigate dsb vs dmb effects
+  if (order_constraint == StoreStore) {
+    dmb(DMB_st);
+    return;
+  }
+
+  // Only LoadLoad and/or LoadStore requested: DMB_ld is used; anything
+  // else gets DMB_all.
+  const bool load_ordering_only = (order_constraint & ~(LoadLoad | LoadStore)) == 0;
+  if (load_ordering_only) {
+    dmb(DMB_ld);
+  } else {
+    dmb(DMB_all);
+  }
+}
+
+#else
+
+// Serializes memory. Potentially blows flags and reg.
+//   tmp            - scratch for the v6 co-processor write op (could be noreg
+//                    for other architecture versions)
+//   preserve_flags - optional; takes a longer path in the LoadStore case (dmb
+//                    rather than control dependency) to preserve status flags
+//   load_tgt       - optional; the ordered load target in a LoadStore case
+//                    only, used to create a dependency between the load
+//                    operation and a conditional branch
+void MacroAssembler::membar(Membar_mask_bits order_constraint,
+                            Register tmp,
+                            bool preserve_flags,
+                            Register load_tgt) {
+  if (!os::is_MP()) {
+    return;
+  }
+
+  if (order_constraint == StoreStore) {
+    dmb(DMB_st, tmp);
+    return;
+  }
+
+  // Anything other than a pure LoadStore request, a missing load target, or a
+  // request to keep the flags intact requires the full barrier.
+  const bool more_than_load_store =
+      (order_constraint & (StoreLoad | LoadLoad | StoreStore)) != 0;
+  if (more_than_load_store || (load_tgt == noreg) || preserve_flags) {
+    dmb(DMB_all, tmp);
+    return;
+  }
+
+  // LoadStore: speculative stores reordering is prohibited.
+  // By providing an ordered load target register, we avoid an extra memory
+  // load reference: a compare of the register with itself followed by a
+  // conditional branch on 'ne' is emitted instead of a dmb.
+  Label self_branch;
+  bind(self_branch);
+  cmp(load_tgt, load_tgt);
+  b(self_branch, ne);
+}
+
+#endif // AARCH64
+
+// Emits a compare-and-swap of the mark word for lock acquisition.
+//   oldval     - expected current value of the mark word
+//   newval     - value to store if the comparison succeeds
+//   base       - object whose mark word (at oopDesc::mark_offset_in_bytes())
+//                is updated
+//   tmp        - scratch; receives the loaded value and then the status of
+//                the exclusive store
+//   slow_case  - branch target on failure
+//
+// If "allow_fallthrough_on_failure" is false, we always branch to "slow_case"
+// on failure, so fall-through can only mean success.
+// "one_shot" controls whether we loop and retry to mitigate spurious failures.
+// This is only needed for C2, which for some reason does not retry,
+// while C1/interpreter does.
+// TODO: measure if it makes a difference
+
+void MacroAssembler::cas_for_lock_acquire(Register oldval, Register newval,
+  Register base, Register tmp, Label &slow_case,
+  bool allow_fallthrough_on_failure, bool one_shot)
+{
+
+  // Set to true only when the emitted code can reach the end solely on
+  // success, so that no trailing conditional branch is required.
+  bool fallthrough_is_success = false;
+
+  // ARM Litmus Test example does prefetching here.
+  // TODO: investigate if it helps performance
+
+  // The last store was to the displaced header, so to prevent
+  // reordering we must issue a StoreStore or Release barrier before
+  // the CAS store.
+
+#ifdef AARCH64
+
+  // Register roles: tmp is the scratch, base the oop, oldval the expected
+  // mark and newval the value being installed.
+  Label loop;
+
+  assert(oopDesc::mark_offset_in_bytes() == 0, "must be");
+
+  // Instead of StoreStore here, we use store-release-exclusive below
+
+  bind(loop);
+
+  ldaxr(tmp, base);  // acquire
+  cmp(tmp, oldval);
+  b(slow_case, ne);
+  stlxr(tmp, newval, base); // release
+  if (one_shot) {
+    // Leave the store status in the flags; the caller or the trailing
+    // branch below decides what to do with a failed store.
+    cmp_w(tmp, 0);
+  } else {
+    // Retry until the exclusive store succeeds.
+    cbnz_w(tmp, loop);
+    fallthrough_is_success = true;
+  }
+
+  // MemBarAcquireLock would normally go here, but
+  // we already do ldaxr+stlxr above, which has
+  // Sequential Consistency
+
+#else
+  membar(MacroAssembler::StoreStore, noreg);
+
+  if (one_shot) {
+    ldrex(tmp, Address(base, oopDesc::mark_offset_in_bytes()));
+    cmp(tmp, oldval);
+    strex(tmp, newval, Address(base, oopDesc::mark_offset_in_bytes()), eq);
+    cmp(tmp, 0, eq);
+  } else {
+    atomic_cas_bool(oldval, newval, base, oopDesc::mark_offset_in_bytes(), tmp);
+  }
+
+  // MemBarAcquireLock barrier
+  // According to JSR-133 Cookbook, this should be LoadLoad | LoadStore,
+  // but that doesn't prevent a load or store from floating up between
+  // the load and store in the CAS sequence, so play it safe and
+  // do a full fence.
+  membar(Membar_mask_bits(LoadLoad | LoadStore | StoreStore | StoreLoad), noreg);
+#endif
+  if (!fallthrough_is_success && !allow_fallthrough_on_failure) {
+    b(slow_case, ne);
+  }
+}
+
+// Emits a compare-and-swap of the mark word for lock release.
+//   oldval     - expected current value of the mark word
+//   newval     - value to store if the comparison succeeds
+//   base       - object whose mark word is updated
+//   tmp        - scratch; must differ from the other three registers
+//   slow_case  - branch target on failure
+// allow_fallthrough_on_failure and one_shot select, respectively, whether a
+// trailing branch to slow_case is emitted and whether the exclusive store is
+// attempted only once. A StoreLoad membar is always emitted at the end.
+void MacroAssembler::cas_for_lock_release(Register oldval, Register newval,
+  Register base, Register tmp, Label &slow_case,
+  bool allow_fallthrough_on_failure, bool one_shot)
+{
+
+  // Becomes true when falling through can only mean success (retry loop).
+  bool fallthrough_is_success = false;
+
+  assert_different_registers(oldval,newval,base,tmp);
+
+#ifdef AARCH64
+  Label loop;
+
+  assert(oopDesc::mark_offset_in_bytes() == 0, "must be");
+
+  bind(loop);
+  ldxr(tmp, base);
+  cmp(tmp, oldval);
+  b(slow_case, ne);
+  // MemBarReleaseLock barrier
+  stlxr(tmp, newval, base);
+  if (one_shot) {
+    // Expose the store status through the flags.
+    cmp_w(tmp, 0);
+  } else {
+    // Retry until the exclusive store succeeds.
+    cbnz_w(tmp, loop);
+    fallthrough_is_success = true;
+  }
+#else
+  // MemBarReleaseLock barrier
+  // According to JSR-133 Cookbook, this should be StoreStore | LoadStore,
+  // but that doesn't prevent a load or store from floating down between
+  // the load and store in the CAS sequence, so play it safe and
+  // do a full fence.
+  membar(Membar_mask_bits(LoadLoad | LoadStore | StoreStore | StoreLoad), tmp);
+
+  if (one_shot) {
+    ldrex(tmp, Address(base, oopDesc::mark_offset_in_bytes()));
+    cmp(tmp, oldval);
+    strex(tmp, newval, Address(base, oopDesc::mark_offset_in_bytes()), eq);
+    cmp(tmp, 0, eq);
+  } else {
+    atomic_cas_bool(oldval, newval, base, oopDesc::mark_offset_in_bytes(), tmp);
+  }
+#endif
+  if (!fallthrough_is_success && !allow_fallthrough_on_failure) {
+    b(slow_case, ne);
+  }
+
+  // ExitEnter
+  // According to JSR-133 Cookbook, this should be StoreLoad, the same
+  // barrier that follows volatile store.
+  // TODO: Should be able to remove on armv8 if volatile loads
+  // use the load-acquire instruction.
+  membar(StoreLoad, noreg);
+}
+
+#ifndef PRODUCT
+
+// Emits code that atomically increments the 32-bit counter at counter_addr
+// when condition 'cond' holds. Nothing is emitted if counter_addr is NULL.
+// Preserves flags and all registers.
+// On SMP the updated value might not be visible to external observers without a synchronization barrier
+void MacroAssembler::cond_atomic_inc32(AsmCondition cond, int* counter_addr) {
+  if (counter_addr != NULL) {
+    InlinedAddress counter_addr_literal((address)counter_addr);
+    Label done, retry;
+    // Skip the whole sequence when the condition does not hold.
+    if (cond != al) {
+      b(done, inverse(cond));
+    }
+
+#ifdef AARCH64
+    // Save the registers used below (ZR only fills the second slot of the pair).
+    raw_push(R0, R1);
+    raw_push(R2, ZR);
+
+    ldr_literal(R0, counter_addr_literal);
+
+    // Exclusive load / add / exclusive store, retried until the store succeeds.
+    bind(retry);
+    ldxr_w(R1, R0);
+    add_w(R1, R1, 1);
+    stxr_w(R2, R1, R0);
+    cbnz_w(R2, retry);
+
+    raw_pop(R2, ZR);
+    raw_pop(R0, R1);
+#else
+    push(RegisterSet(R0, R3) | RegisterSet(Rtemp));
+    ldr_literal(R0, counter_addr_literal);
+
+    // Keep CPSR in Rtemp across the CAS loop; it is written back below.
+    mrs(CPSR, Rtemp);
+
+    bind(retry);
+    ldr_s32(R1, Address(R0));
+    add(R2, R1, 1);
+    atomic_cas_bool(R1, R2, R0, 0, R3);
+    b(retry, ne);
+
+    msr(CPSR_fsxc, Rtemp);
+
+    pop(RegisterSet(R0, R3) | RegisterSet(Rtemp));
+#endif // AARCH64
+
+    // The address literal is bound right after the code, so jump over it.
+    b(done);
+    bind_literal(counter_addr_literal);
+
+    bind(done);
+  }
+}
+
+#endif // !PRODUCT
+
+
+// Building block for the CAS cases of biased locking: performs the CAS and
+// records statistics.
+// Control is transferred to slow_case if the CAS fails; otherwise execution
+// falls through with the condition codes left set.
+// counter_addr may be NULL, in which case no statistics are recorded.
+void MacroAssembler::biased_locking_enter_with_cas(Register obj_reg, Register old_mark_reg, Register new_mark_reg,
+                                                 Register tmp, Label& slow_case, int* counter_addr) {
+
+  cas_for_lock_acquire(old_mark_reg, new_mark_reg, obj_reg, tmp, slow_case);
+
+#ifdef ASSERT
+  // Fallthrough only on success: trap if 'ne' is still set here.
+  breakpoint(ne);
+#endif
+
+#ifndef PRODUCT
+  if (counter_addr == NULL) {
+    return;
+  }
+  cond_atomic_inc32(al, counter_addr);
+#endif // !PRODUCT
+}
+
+// Emits the biased-locking fast path for monitor enter.
+//   obj_reg                - object being locked
+//   swap_reg               - holds (or receives) the object's mark word
+//   tmp_reg, tmp2          - temporaries; all four registers must differ
+//   swap_reg_contains_mark - true if the mark word is already in swap_reg
+//   done                   - branch target when the lock was acquired biased
+//   slow_case              - branch target when a bias/rebias CAS fails
+//   counters               - optional statistics (non-product builds only)
+// Execution falls through (at the internal cas_label) when the object must be
+// locked with the normal CAS-based scheme.
+// Returns the code offset recorded just before the optional mark-word load,
+// i.e. the offset the null check applies to.
+int MacroAssembler::biased_locking_enter(Register obj_reg, Register swap_reg, Register tmp_reg,
+                                         bool swap_reg_contains_mark,
+                                         Register tmp2,
+                                         Label& done, Label& slow_case,
+                                         BiasedLockingCounters* counters) {
+  // obj_reg must be preserved (at least) if the bias locking fails
+  // tmp_reg is a temporary register
+  // swap_reg was used as a temporary but contained a value
+  //   that was used afterwards in some call paths. Callers
+  //   have been fixed so that swap_reg no longer needs to be
+  //   saved.
+  // Rtemp is no longer scratched
+
+  assert(UseBiasedLocking, "why call this otherwise?");
+  assert_different_registers(obj_reg, swap_reg, tmp_reg, tmp2);
+  guarantee(swap_reg!=tmp_reg, "invariant");
+  assert(tmp_reg != noreg, "must supply tmp_reg");
+
+#ifndef PRODUCT
+  if (PrintBiasedLockingStatistics && (counters == NULL)) {
+    counters = BiasedLocking::counters();
+  }
+#endif
+
+  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
+  Address mark_addr(obj_reg, oopDesc::mark_offset_in_bytes());
+
+  // Biased locking
+  // See whether the lock is currently biased toward our thread and
+  // whether the epoch is still valid
+  // Note that the runtime guarantees sufficient alignment of JavaThread
+  // pointers to allow age to be placed into low bits
+  // First check to see whether biasing is even enabled for this object
+  Label cas_label;
+
+  // The null check applies to the mark loading, if we need to load it.
+  // If the mark has already been loaded in swap_reg then it has already
+  // been performed and the offset is irrelevant.
+  int null_check_offset = offset();
+  if (!swap_reg_contains_mark) {
+    ldr(swap_reg, mark_addr);
+  }
+
+  // On MP platform loads could return 'stale' values in some cases.
+  // That is acceptable since either CAS or slow case path is taken in the worst case.
+
+  andr(tmp_reg, swap_reg, (uintx)markOopDesc::biased_lock_mask_in_place);
+  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
+
+  b(cas_label, ne);
+
+  // The bias pattern is present in the object's header. Need to check
+  // whether the bias owner and the epoch are both still current.
+  load_klass(tmp_reg, obj_reg);
+  ldr(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
+  orr(tmp_reg, tmp_reg, Rthread);
+  eor(tmp_reg, tmp_reg, swap_reg);
+
+  // Clear the age bits of the xor result and set the flags: 'eq' means the
+  // mark equals (prototype header | Rthread) apart from the age bits.
+#ifdef AARCH64
+  ands(tmp_reg, tmp_reg, ~((uintx) markOopDesc::age_mask_in_place));
+#else
+  bics(tmp_reg, tmp_reg, ((int) markOopDesc::age_mask_in_place));
+#endif // AARCH64
+
+#ifndef PRODUCT
+  if (counters != NULL) {
+    cond_atomic_inc32(eq, counters->biased_lock_entry_count_addr());
+  }
+#endif // !PRODUCT
+
+  b(done, eq);
+
+  Label try_revoke_bias;
+  Label try_rebias;
+
+  // At this point we know that the header has the bias pattern and
+  // that we are not the bias owner in the current epoch. We need to
+  // figure out more details about the state of the header in order to
+  // know what operations can be legally performed on the object's
+  // header.
+
+  // If the low three bits in the xor result aren't clear, that means
+  // the prototype header is no longer biased and we have to revoke
+  // the bias on this object.
+  tst(tmp_reg, (uintx)markOopDesc::biased_lock_mask_in_place);
+  b(try_revoke_bias, ne);
+
+  // Biasing is still enabled for this data type. See whether the
+  // epoch of the current bias is still valid, meaning that the epoch
+  // bits of the mark word are equal to the epoch bits of the
+  // prototype header. (Note that the prototype header's epoch bits
+  // only change at a safepoint.) If not, attempt to rebias the object
+  // toward the current thread. Note that we must be absolutely sure
+  // that the current epoch is invalid in order to do this because
+  // otherwise the manipulations it performs on the mark word are
+  // illegal.
+  tst(tmp_reg, (uintx)markOopDesc::epoch_mask_in_place);
+  b(try_rebias, ne);
+
+  // tmp_reg has the age, epoch and pattern bits cleared
+  // The remaining (owner) bits are (Thread ^ current_owner)
+
+  // The epoch of the current bias is still valid but we know nothing
+  // about the owner; it might be set or it might be clear. Try to
+  // acquire the bias of the object using an atomic operation. If this
+  // fails we will go in to the runtime to revoke the object's bias.
+  // Note that we first construct the presumed unbiased header so we
+  // don't accidentally blow away another thread's valid bias.
+
+  // Note that we know the owner is not ourself. Hence, success can
+  // only happen when the owner bits is 0
+
+#ifdef AARCH64
+  // Bit mask biased_lock + age + epoch is not a valid AArch64 logical immediate, as it has
+  // cleared bit in the middle (cms bit). So it is loaded with separate instruction.
+  mov(tmp2, (markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place));
+  andr(swap_reg, swap_reg, tmp2);
+#else
+  // until the assembler can be made smarter, we need to make some assumptions about the values
+  // so we can optimize this:
+  assert((markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place) == 0x1ff, "biased bitmasks changed");
+
+  // Shifting left then right by 23 keeps only the low 9 bits (mask 0x1ff).
+  mov(swap_reg, AsmOperand(swap_reg, lsl, 23));
+  mov(swap_reg, AsmOperand(swap_reg, lsr, 23)); // markOop with thread bits cleared (for CAS)
+#endif // AARCH64
+
+  orr(tmp_reg, swap_reg, Rthread); // new mark
+
+  biased_locking_enter_with_cas(obj_reg, swap_reg, tmp_reg, tmp2, slow_case,
+        (counters != NULL) ? counters->anonymously_biased_lock_entry_count_addr() : NULL);
+
+  // If the biasing toward our thread failed, this means that
+  // another thread succeeded in biasing it toward itself and we
+  // need to revoke that bias. The revocation will occur in the
+  // interpreter runtime in the slow case.
+
+  b(done);
+
+  bind(try_rebias);
+
+  // At this point we know the epoch has expired, meaning that the
+  // current "bias owner", if any, is actually invalid. Under these
+  // circumstances _only_, we are allowed to use the current header's
+  // value as the comparison value when doing the cas to acquire the
+  // bias in the current epoch. In other words, we allow transfer of
+  // the bias from one thread to another directly in this situation.
+
+  // tmp_reg low (not owner) bits are (age: 0 | pattern&epoch: prototype^swap_reg)
+
+  eor(tmp_reg, tmp_reg, swap_reg); // OK except for owner bits (age preserved !)
+
+  // owner bits 'random'. Set them to Rthread.
+#ifdef AARCH64
+  mov(tmp2, (markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place));
+  andr(tmp_reg, tmp_reg, tmp2);
+#else
+  mov(tmp_reg, AsmOperand(tmp_reg, lsl, 23));
+  mov(tmp_reg, AsmOperand(tmp_reg, lsr, 23));
+#endif // AARCH64
+
+  orr(tmp_reg, tmp_reg, Rthread); // new mark
+
+  biased_locking_enter_with_cas(obj_reg, swap_reg, tmp_reg, tmp2, slow_case,
+        (counters != NULL) ? counters->rebiased_lock_entry_count_addr() : NULL);
+
+  // If the biasing toward our thread failed, then another thread
+  // succeeded in biasing it toward itself and we need to revoke that
+  // bias. The revocation will occur in the runtime in the slow case.
+
+  b(done);
+
+  bind(try_revoke_bias);
+
+  // The prototype mark in the klass doesn't have the bias bit set any
+  // more, indicating that objects of this data type are not supposed
+  // to be biased any more. We are going to try to reset the mark of
+  // this object to the prototype value and fall through to the
+  // CAS-based locking scheme. Note that if our CAS fails, it means
+  // that another thread raced us for the privilege of revoking the
+  // bias of this particular object, so it's okay to continue in the
+  // normal locking code.
+
+  // tmp_reg low (not owner) bits are (age: 0 | pattern&epoch: prototype^swap_reg)
+
+  eor(tmp_reg, tmp_reg, swap_reg); // OK except for owner bits (age preserved !)
+
+  // owner bits 'random'. Clear them
+#ifdef AARCH64
+  mov(tmp2, (markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place));
+  andr(tmp_reg, tmp_reg, tmp2);
+#else
+  mov(tmp_reg, AsmOperand(tmp_reg, lsl, 23));
+  mov(tmp_reg, AsmOperand(tmp_reg, lsr, 23));
+#endif // AARCH64
+
+  // Note: the failure target here is cas_label, not slow_case.
+  biased_locking_enter_with_cas(obj_reg, swap_reg, tmp_reg, tmp2, cas_label,
+        (counters != NULL) ? counters->revoked_lock_entry_count_addr() : NULL);
+
+  // Fall through to the normal CAS-based lock, because no matter what
+  // the result of the above CAS, some thread must have succeeded in
+  // removing the bias bit from the object's header.
+
+  bind(cas_label);
+
+  return null_check_offset;
+}
+
+
+void MacroAssembler::biased_locking_exit(Register obj_reg, Register tmp_reg, Label& done) {
+  assert(UseBiasedLocking, "why call this otherwise?");
+
+  // Unlocking a biased object is a no-op, so all that is needed is to detect
+  // the bias pattern in the mark word and branch to 'done' if it is present.
+  // The thread ID does not have to be checked, for two reasons. First, the
+  // interpreter checks for IllegalMonitorStateException at a higher level.
+  // Second, if the bias was revoked while we held the lock, the object could
+  // not be rebiased toward another thread, so the bias bit would be clear.
+  Address mark_addr(obj_reg, oopDesc::mark_offset_in_bytes());
+  ldr(tmp_reg, mark_addr);
+
+  andr(tmp_reg, tmp_reg, (uintx)markOopDesc::biased_lock_mask_in_place);
+  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
+  b(done, eq);
+}
+
+#ifdef AARCH64
+
+// Emits a load of 1, 2, 4 or 8 bytes from src into dst. For sizes below 8,
+// is_signed selects between the signed and unsigned load instruction.
+// Any other size is a programming error.
+void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
+  switch (size_in_bytes) {
+    case 8:
+      ldr(dst, src);
+      break;
+    case 4:
+      if (is_signed) {
+        ldr_s32(dst, src);
+      } else {
+        ldr_u32(dst, src);
+      }
+      break;
+    case 2:
+      if (is_signed) {
+        ldrsh(dst, src);
+      } else {
+        ldrh(dst, src);
+      }
+      break;
+    case 1:
+      if (is_signed) {
+        ldrsb(dst, src);
+      } else {
+        ldrb(dst, src);
+      }
+      break;
+    default:
+      ShouldNotReachHere();
+  }
+}
+
+// Emits a store of the low 1, 2, 4 or 8 bytes of src to dst.
+// Any other size is a programming error.
+void MacroAssembler::store_sized_value(Register src, Address dst, size_t size_in_bytes) {
+  switch (size_in_bytes) {
+    case 8:
+      str(src, dst);
+      break;
+    case 4:
+      str_32(src, dst);
+      break;
+    case 2:
+      strh(src, dst);
+      break;
+    case 1:
+      strb(src, dst);
+      break;
+    default:
+      ShouldNotReachHere();
+  }
+}
+
+#else
+
+// Emits a conditional load of 1, 2 or 4 bytes from src into dst. For the
+// sub-word sizes, is_signed selects the signed or unsigned load instruction.
+// Any other size is a programming error.
+void MacroAssembler::load_sized_value(Register dst, Address src,
+                                    size_t size_in_bytes, bool is_signed, AsmCondition cond) {
+  switch (size_in_bytes) {
+    case 4:
+      ldr(dst, src, cond);
+      break;
+    case 2:
+      if (is_signed) {
+        ldrsh(dst, src, cond);
+      } else {
+        ldrh(dst, src, cond);
+      }
+      break;
+    case 1:
+      if (is_signed) {
+        ldrsb(dst, src, cond);
+      } else {
+        ldrb(dst, src, cond);
+      }
+      break;
+    default:
+      ShouldNotReachHere();
+  }
+}
+
+
+// Emits a conditional store of the low 1, 2 or 4 bytes of src to dst.
+// Any other size is a programming error.
+void MacroAssembler::store_sized_value(Register src, Address dst, size_t size_in_bytes, AsmCondition cond) {
+  switch (size_in_bytes) {
+    case 4:
+      str(src, dst, cond);
+      break;
+    case 2:
+      strh(src, dst, cond);
+      break;
+    case 1:
+      strb(src, dst, cond);
+      break;
+    default:
+      ShouldNotReachHere();
+  }
+}
+#endif // AARCH64
+
+// Look up the method for a megamorphic invokeinterface call.
+//   Rklass               - receiver klass
+//   Rinterf, Rindex      - together determine the target method
+//   method_result        - receives the method on success
+//   temp_reg1, temp_reg2 - scratch registers
+//   L_no_such_interface  - branch target on failure
+// On success execution falls through with the result in method_result.
+void MacroAssembler::lookup_interface_method(Register Rklass,
+                                             Register Rinterf,
+                                             Register Rindex,
+                                             Register method_result,
+                                             Register temp_reg1,
+                                             Register temp_reg2,
+                                             Label& L_no_such_interface) {
+
+  assert_different_registers(Rklass, Rinterf, temp_reg1, temp_reg2, Rindex);
+
+  // temp_reg1 walks the itableOffsetEntry array; temp_reg2 is plain scratch.
+  const Register itable_pos = temp_reg1;
+  const Register scratch    = temp_reg2;
+
+  // The first itableOffsetEntry sits at the end of the vtable:
+  //   Rklass + vtable_start + (vtable_length << log2(vtable entry size))
+  const int vtable_start = in_bytes(Klass::vtable_start_offset());
+  const int entry_shift  = exact_log2(vtableEntry::size_in_bytes());
+  ldr_s32(scratch, Address(Rklass, Klass::vtable_length_offset()));
+  add(itable_pos, Rklass, vtable_start);
+  add(itable_pos, itable_pos, AsmOperand(scratch, lsl, entry_shift));
+
+  Label L_check, L_advance;
+
+  b(L_check);
+
+  // Step to the next itableOffsetEntry.
+  bind(L_advance);
+  add(itable_pos, itable_pos, itableOffsetEntry::size() * HeapWordSize);
+
+  bind(L_check);
+
+  // A null interface entry means that the receiver class doesn't implement
+  // the interface, and wasn't the same as the receiver class checked when
+  // the interface was resolved.
+  ldr(scratch, Address(itable_pos, itableOffsetEntry::interface_offset_in_bytes()));
+  cbz(scratch, L_no_such_interface);
+
+  // Keep scanning until the entry for Rinterf is found.
+  cmp(Rinterf, scratch);
+  b(L_advance, ne);
+
+  // Found: the entry's offset, added to the Klass*, locates the method table
+  // for this interface, which is then indexed by Rindex.
+  ldr_s32(scratch, Address(itable_pos, itableOffsetEntry::offset_offset_in_bytes()));
+  add(scratch, scratch, Rklass);
+  assert(itableMethodEntry::size() * HeapWordSize == wordSize, "adjust the scaling in the code below");
+  assert(itableMethodEntry::method_offset_in_bytes() == 0, "adjust the offset in the code below");
+
+  ldr(method_result, Address::indexed_ptr(scratch, Rindex));
+}
+
+#ifdef COMPILER2
+// TODO: 8 bytes at a time? pre-fetch?
+// Compare char[] arrays aligned to 4 bytes.
+//   ary1, ary2 - addresses of the character data (both are modified)
+//   limit      - number of bytes to compare, non-zero (modified)
+//   result     - set to 0 when a mismatch is found, and to 1 when the arrays
+//                consist of a single, equal char; otherwise left untouched
+//   chr1, chr2 - scratch registers
+//   Ldone      - branch target once the result is known
+// When all words compare equal the code falls through WITHOUT setting
+// result; the caller is expected to set it to 1 in that case.
+void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
+                                        Register limit, Register result,
+                                      Register chr1, Register chr2, Label& Ldone) {
+  Label Lvector, Lloop;
+
+  // Note: limit contains number of bytes (2*char_elements) != 0.
+  tst(limit, 0x2); // trailing character ?
+  b(Lvector, eq);
+
+  // compare the trailing char
+  sub(limit, limit, sizeof(jchar));
+  ldrh(chr1, Address(ary1, limit));
+  ldrh(chr2, Address(ary2, limit));
+  cmp(chr1, chr2);
+  mov(result, 0, ne);     // not equal
+  b(Ldone, ne);
+
+  // only one char ?
+  tst(limit, limit);
+  mov(result, 1, eq);
+  b(Ldone, eq);
+
+  // word by word compare, don't need alignment check
+  bind(Lvector);
+
+  // Shift ary1 and ary2 to the end of the arrays, negate limit
+  // (limit then counts up from -length towards zero)
+  add(ary1, limit, ary1);
+  add(ary2, limit, ary2);
+  neg(limit, limit);
+
+  bind(Lloop);
+  ldr_u32(chr1, Address(ary1, limit));
+  ldr_u32(chr2, Address(ary2, limit));
+  cmp_32(chr1, chr2);
+  mov(result, 0, ne);     // not equal
+  b(Ldone, ne);
+  // advance by two chars; the loop ends when limit reaches zero
+  adds(limit, limit, 2*sizeof(jchar));
+  b(Lloop, ne);
+
+  // Caller should set it:
+  // mov(result_reg, 1);  //equal
+}
+#endif
+
+// Emits code that increments the 32-bit counter located at counter_addr.
+//   tmpreg1 - receives the counter address
+//   tmpreg2 - receives the counter value
+// The update is a plain load / add / store sequence; no exclusive access or
+// barrier is emitted here.
+void MacroAssembler::inc_counter(address counter_addr, Register tmpreg1, Register tmpreg2) {
+  mov_slow(tmpreg1, counter_addr);
+  ldr_s32(tmpreg2, tmpreg1);
+  add_32(tmpreg2, tmpreg2, 1);
+  str_32(tmpreg2, tmpreg1);
+}
+
+// Materializes the outcome of a floating-point comparison into dst.
+// On AArch64 the value is derived from the current condition flags:
+// 1 for 'gt', 0 when only 'ge' holds, -1 otherwise (path marked NOT_TESTED).
+// On 32-bit ARM the value is computed from the FPSCR contents.
+// NOTE(review): presumably the 32-bit path yields the same -1/0/1 encoding;
+// confirm against the FPSCR flag layout.
+void MacroAssembler::floating_cmp(Register dst) {
+#ifdef AARCH64
+  NOT_TESTED();
+  cset(dst, gt);            // 1 if '>', else 0
+  csinv(dst, dst, ZR, ge);  // previous value if '>=', else -1
+#else
+  // Read FPSCR, force bit 27 on, fold the bits three positions lower into the
+  // top bits, and keep the two topmost bits as a sign-extended value.
+  vmrs(dst, FPSCR);
+  orr(dst, dst, 0x08000000);
+  eor(dst, dst, AsmOperand(dst, lsl, 3));
+  mov(dst, AsmOperand(dst, asr, 30));
+#endif
+}
+
+// Resets the floating-point control register to zero. Nothing is emitted on
+// 32-bit ARM builds with __SOFTFP__ defined.
+void MacroAssembler::restore_default_fp_mode() {
+#if defined(AARCH64)
+  msr(SysReg_FPCR, ZR);
+#elif !defined(__SOFTFP__)
+  // Round to Near mode, IEEE compatible, masked exceptions
+  mov(Rtemp, 0);
+  vmsr(FPSCR, Rtemp);
+#endif
+}
+
+#ifndef AARCH64
+// 24-bit word range == 26-bit byte range
+// Returns true iff 'offset' can be encoded in the signed 24-bit word-offset
+// field of a branch instruction, i.e. it is a multiple of 4 and lies in
+// [-2^25, 2^25 - 4].
+// This is the closed form of encoding the offset into 24 bits and decoding it
+// again; it is written as explicit checks because the shift-based round trip
+// relies on left-shifting signed values past the sign bit (undefined) and on
+// right-shifting negative values (implementation-defined).
+bool check26(int offset) {
+  const int limit = 1 << 25;   // 2^25 bytes == 2^23 words
+  return (offset % 4 == 0) && (offset >= -limit) && (offset < limit);
+}
+#endif // !AARCH64
+
+// Perform some slight adjustments so the default 32MB code cache
+// is fully reachable.
+// Lowest address considered for reachability checks: the code cache's low
+// bound plus the size of a HeapBlock header.
+static inline address first_cache_address() {
+  const address cache_low = CodeCache::low_bound();
+  return cache_low + sizeof(HeapBlock::Header);
+}
+// Highest address considered for reachability checks: one instruction below
+// the code cache's high bound.
+static inline address last_cache_address() {
+  const address cache_high = CodeCache::high_bound();
+  return cache_high - Assembler::InstructionSize;
+}
+
+#ifdef AARCH64
+// Can we reach target using ADRP?
+// The check is made on 4K page numbers, relative to both the first and the
+// last code cache address; both page distances must pass the 21-bit
+// immediate range check.
+bool MacroAssembler::page_reachable_from_cache(address target) {
+  const intptr_t page_mask   = ~(intptr_t)0xfff;
+  const intptr_t first_page  = (intptr_t)first_cache_address() & page_mask;
+  const intptr_t last_page   = (intptr_t)last_cache_address() & page_mask;
+  const intptr_t target_page = (intptr_t)target & page_mask;
+
+  const intptr_t pages_from_first = (target_page - first_page) >> 12;
+  const intptr_t pages_from_last  = (target_page - last_page) >> 12;
+
+  return is_imm_in_range(pages_from_first, 21, 0) && is_imm_in_range(pages_from_last, 21, 0);
+}
+#endif
+
+// Can we reach target using unconditional branch or call from anywhere
+// in the code cache (because code can be relocated)?
+// The target must be in branch range of both the first and the last code
+// cache address.
+bool MacroAssembler::_reachable_from_cache(address target) {
+#ifdef __thumb__
+  // A target with the low bit set would need switching to THUMB mode, which
+  // a plain 'b' cannot do: report it as unreachable.
+  if (((intptr_t)target & 1) != 0) {
+    return false;
+  }
+#endif
+
+  const address cache_lo = first_cache_address();
+  const address cache_hi = last_cache_address();
+
+  // With ForceUnreachable, only addresses from CodeCache can be treated as
+  // reachable.
+  if (ForceUnreachable &&
+      (target < CodeCache::low_bound() || CodeCache::high_bound() < target)) {
+    return false;
+  }
+
+  const intptr_t from_lo = (intptr_t)target - (intptr_t)cache_lo;
+  const intptr_t from_hi = (intptr_t)target - (intptr_t)cache_hi;
+
+#ifdef AARCH64
+  return is_offset_in_range(from_lo, 26) && is_offset_in_range(from_hi, 26);
+#else
+  return check26(from_lo - 8) && check26(from_hi - 8);
+#endif
+}
+
+// Checked wrapper around _reachable_from_cache(): asserts that the current
+// emission pc lies inside the CodeCache, then delegates.
+bool MacroAssembler::reachable_from_cache(address target) {
+  assert(CodeCache::contains(pc()), "not supported");
+  return _reachable_from_cache(target);
+}
+
+// Can we reach the entire code cache from anywhere else in the code cache?
+// True when both the first and the last code cache address pass
+// _reachable_from_cache().
+bool MacroAssembler::_cache_fully_reachable() {
+  const address lowest  = first_cache_address();
+  const address highest = last_cache_address();
+  if (!_reachable_from_cache(lowest)) {
+    return false;
+  }
+  return _reachable_from_cache(highest);
+}
+
+// Checked wrapper around _cache_fully_reachable(): asserts that the current
+// emission pc lies inside the CodeCache, then delegates.
+bool MacroAssembler::cache_fully_reachable() {
+  assert(CodeCache::contains(pc()), "not supported");
+  return _cache_fully_reachable();
+}
+
+// Emits a jump to 'target'.
+//   rtype   - relocation type; only runtime_call_type and none are supported
+//   scratch - register used to hold the target for far jumps (mandatory on
+//             AArch64 when the target is not directly reachable)
+//   cond    - (32-bit ARM only) condition under which the jump is taken
+// A direct branch is used when the target is reachable from the code cache;
+// otherwise the absolute address is loaded and jumped to.
+void MacroAssembler::jump(address target, relocInfo::relocType rtype, Register scratch NOT_AARCH64_ARG(AsmCondition cond)) {
+  assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported");
+  if (reachable_from_cache(target)) {
+    relocate(rtype);
+    b(target NOT_AARCH64_ARG(cond));
+    return;
+  }
+
+  // Note: relocate is not needed for the code below,
+  // encoding targets in absolute format.
+  if (ignore_non_patchable_relocations()) {
+    rtype = relocInfo::none;
+  }
+
+#ifdef AARCH64
+  assert (scratch != noreg, "should be specified");
+  InlinedAddress address_literal(target, rtype);
+  ldr_literal(scratch, address_literal);
+  br(scratch);
+  // Remember where the literal starts to detect whether padding was added.
+  int off = offset();
+  bind_literal(address_literal);
+#ifdef COMPILER2
+  if (offset() - off == wordSize) {
+    // no padding, so insert nop for worst-case sizing
+    nop();
+  }
+#endif
+#else
+  if (VM_Version::supports_movw() && (scratch != noreg) && (rtype == relocInfo::none)) {
+    // Note: this version cannot be (atomically) patched
+    mov_slow(scratch, (intptr_t)target, cond);
+    bx(scratch, cond);
+  } else {
+    // Load the target straight into PC from an inlined literal; a
+    // conditional jump first branches over the sequence on the inverse
+    // condition.
+    Label skip;
+    InlinedAddress address_literal(target);
+    if (cond != al) {
+      b(skip, inverse(cond));
+    }
+    relocate(rtype);
+    ldr_literal(PC, address_literal);
+    bind_literal(address_literal);
+    bind(skip);
+  }
+#endif // AARCH64
+}
+
+// Similar to jump except that:
+// - near calls are valid only if any destination in the cache is near
+// - no movt/movw (not atomically patchable)
+//   rtype   - relocation type; only runtime_call_type and none are supported
+//   scratch - register holding the target for the far form (mandatory on
+//             AArch64 when the cache is not fully reachable)
+//   cond    - (32-bit ARM only) condition under which the jump is taken
+void MacroAssembler::patchable_jump(address target, relocInfo::relocType rtype, Register scratch NOT_AARCH64_ARG(AsmCondition cond)) {
+  assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported");
+  if (cache_fully_reachable()) {
+    // Note: this assumes that all possible targets (the initial one
+    // and the addressed patched to) are all in the code cache.
+    assert(CodeCache::contains(target), "target might be too far");
+    relocate(rtype);
+    b(target NOT_AARCH64_ARG(cond));
+    return;
+  }
+
+  // Discard the relocation information if not needed for CacheCompiledCode
+  // since the next encodings are all in absolute format.
+  if (ignore_non_patchable_relocations()) {
+    rtype = relocInfo::none;
+  }
+
+#ifdef AARCH64
+  assert (scratch != noreg, "should be specified");
+  InlinedAddress address_literal(target);
+  relocate(rtype);
+  ldr_literal(scratch, address_literal);
+  br(scratch);
+  // Remember where the literal starts to detect whether padding was added.
+  int off = offset();
+  bind_literal(address_literal);
+#ifdef COMPILER2
+  if (offset() - off == wordSize) {
+    // no padding, so insert nop for worst-case sizing
+    nop();
+  }
+#endif
+#else
+  {
+    // Load the target straight into PC from an inlined literal; a
+    // conditional jump first branches over the sequence on the inverse
+    // condition.
+    Label skip;
+    InlinedAddress address_literal(target);
+    if (cond != al) {
+      b(skip, inverse(cond));
+    }
+    relocate(rtype);
+    ldr_literal(PC, address_literal);
+    bind_literal(address_literal);
+    bind(skip);
+  }
+#endif // AARCH64
+}
+
+// Emits a call to 'target'.
+//   rspec - relocation; only runtime_call_type and none are supported
+//   cond  - (32-bit ARM only) condition under which the call is made
+// A 'bl' is used when the target is reachable from the code cache; otherwise
+// the absolute address is materialized, using LR as the scratch register.
+void MacroAssembler::call(address target, RelocationHolder rspec NOT_AARCH64_ARG(AsmCondition cond)) {
+  Register scratch = LR;
+  assert(rspec.type() == relocInfo::runtime_call_type || rspec.type() == relocInfo::none, "not supported");
+  if (reachable_from_cache(target)) {
+    relocate(rspec);
+    bl(target NOT_AARCH64_ARG(cond));
+    return;
+  }
+
+  // Note: relocate is not needed for the code below,
+  // encoding targets in absolute format.
+  if (ignore_non_patchable_relocations()) {
+    // This assumes the information was needed only for relocating the code.
+    rspec = RelocationHolder::none;
+  }
+
+#ifndef AARCH64
+  if (VM_Version::supports_movw() && (rspec.type() == relocInfo::none)) {
+    // Note: this version cannot be (atomically) patched
+    mov_slow(scratch, (intptr_t)target, cond);
+    blx(scratch, cond);
+    return;
+  }
+#endif
+
+  {
+    Label ret_addr;
+#ifndef AARCH64
+    // A conditional call skips the whole sequence on the inverse condition.
+    if (cond != al) {
+      b(ret_addr, inverse(cond));
+    }
+#endif
+
+
+#ifdef AARCH64
+    // TODO-AARCH64: make more optimal implementation
+    // [ Keep in sync with MacroAssembler::call_size ]
+    assert(rspec.type() == relocInfo::none, "call reloc not implemented");
+    mov_slow(scratch, target);
+    blr(scratch);
+#else
+    // Set LR to the return address by hand, then load the target into PC
+    // from an inlined literal placed just before the return address.
+    InlinedAddress address_literal(target);
+    relocate(rspec);
+    adr(LR, ret_addr);
+    ldr_literal(PC, address_literal);
+
+    bind_literal(address_literal);
+    bind(ret_addr);
+#endif
+  }
+}
+
+#if defined(AARCH64) && defined(COMPILER2)
+int MacroAssembler::call_size(address target, bool far, bool patchable) {
+  // Length of a call sequence, counted in instructions; must match what is emitted.
+  // FIXME: mov_slow is variable-length
+  if (far && !patchable) return instr_count_for_mov_slow((intptr_t)target) + 1; // mov_slow; blr
+  return far ? 2 /* ldr; blr */ : 1 /* bl */;
+}
+#endif
+
+int MacroAssembler::patchable_call(address target, RelocationHolder const& rspec, bool c2) { // Emits a patchable call; returns the code offset of its return address
+  assert(rspec.type() == relocInfo::static_call_type ||
+         rspec.type() == relocInfo::none ||
+         rspec.type() == relocInfo::opt_virtual_call_type, "not supported");
+
+  // Always generate the relocation information, needed for patching
+  relocate(rspec); // used by NativeCall::is_call_before()
+  if (cache_fully_reachable()) {
+    // Note: this assumes that all possible targets (the initial one
+    // and the addresses patched to) are all in the code cache.
+    assert(CodeCache::contains(target), "target might be too far");
+    bl(target);
+  } else {
+#if defined(AARCH64) && defined(COMPILER2)
+    if (c2) {
+      // return address needs to match call_size().
+      // no need to trash Rtemp
+      int off = offset(); // start of the sequence, for the size assertion below
+      Label skip_literal;
+      InlinedAddress address_literal(target);
+      ldr_literal(LR, address_literal);
+      blr(LR);
+      int ret_addr_offset = offset(); // the callee returns to the instruction right after blr
+      assert(offset() - off == call_size(target, true, true) * InstructionSize, "need to fix call_size()");
+      b(skip_literal); // on return, jump over the inlined literal data
+      int off2 = offset();
+      bind_literal(address_literal);
+      if (offset() - off2 == wordSize) {
+        // no padding, so insert nop for worst-case sizing
+        nop();
+      }
+      bind(skip_literal);
+      return ret_addr_offset;
+    }
+#endif
+    Label ret_addr;
+    InlinedAddress address_literal(target);
+#ifdef AARCH64
+    ldr_literal(Rtemp, address_literal);
+    adr(LR, ret_addr); // return address = just past the inlined literal
+    br(Rtemp);
+#else
+    adr(LR, ret_addr); // return address = just past the inlined literal
+    ldr_literal(PC, address_literal); // branch by loading the absolute target into PC
+#endif
+    bind_literal(address_literal);
+    bind(ret_addr);
+  }
+  return offset(); // here the current offset is the return address (after bl, or at ret_addr)
+}
+
+
+void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
+  // Chase method -> ConstMethod -> ConstantPool -> pool holder in tmp, then load the holder's java mirror.
+  ldr(tmp, Address(method, Method::const_offset()));
+  ldr(tmp, Address(tmp,  ConstMethod::constants_offset()));
+  ldr(tmp, Address(tmp, ConstantPool::pool_holder_offset_in_bytes()));
+  ldr(mirror, Address(tmp, in_bytes(Klass::java_mirror_offset())));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Compressed pointers
+
+#ifdef AARCH64
+
+void MacroAssembler::load_klass(Register dst_klass, Register src_oop) {
+  if (!UseCompressedClassPointers) {
+    ldr(dst_klass, Address(src_oop, oopDesc::klass_offset_in_bytes())); // full-width klass pointer
+    return;
+  }
+  ldr_w(dst_klass, Address(src_oop, oopDesc::klass_offset_in_bytes())); // compressed klass word
+  decode_klass_not_null(dst_klass);
+}
+
+#else
+
+void MacroAssembler::load_klass(Register dst_klass, Register src_oop, AsmCondition cond) { // 32-bit ARM variant
+  ldr(dst_klass, Address(src_oop, oopDesc::klass_offset_in_bytes()), cond); // conditional load of the klass field; no decoding on this path
+}
+
+#endif // AARCH64
+
+// Stores src_klass into the klass field of dst_oop. Blows src_klass (encoded in place when compressed).
+void MacroAssembler::store_klass(Register src_klass, Register dst_oop) {
+#ifdef AARCH64
+  if (UseCompressedClassPointers) {
+    assert(src_klass != dst_oop, "not enough registers"); // encoding overwrites src_klass, so it must not alias the base
+    encode_klass_not_null(src_klass);
+    str_w(src_klass, Address(dst_oop, oopDesc::klass_offset_in_bytes())); // word-sized store of the encoded klass
+    return;
+  }
+#endif // AARCH64
+  str(src_klass, Address(dst_oop, oopDesc::klass_offset_in_bytes()));
+}
+
+#ifdef AARCH64
+
+void MacroAssembler::store_klass_gap(Register dst) { // Zeroes the klass gap of the object at dst
+  if (UseCompressedClassPointers) { // nothing is emitted when class pointers are not compressed
+    str_w(ZR, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
+  }
+}
+
+#endif // AARCH64
+
+
+void MacroAssembler::load_heap_oop(Register dst, Address src) { // Loads an oop field from src into dst
+#ifdef AARCH64
+  if (UseCompressedOops) {
+    ldr_w(dst, src); // word-sized load of the compressed oop
+    decode_heap_oop(dst); // expand in place (the NULL-tolerant variant)
+    return;
+  }
+#endif // AARCH64
+  ldr(dst, src);
+}
+
+// Stores the oop in src to dst, encoding it first when UseCompressedOops. Blows src and flags.
+void MacroAssembler::store_heap_oop(Register src, Address dst) {
+#ifdef AARCH64
+  if (UseCompressedOops) {
+    assert(!dst.uses(src), "not enough registers"); // src is encoded in place, so dst must not be addressed through it
+    encode_heap_oop(src);
+    str_w(src, dst); // word-sized store of the compressed oop
+    return;
+  }
+#endif // AARCH64
+  str(src, dst);
+}
+
+void MacroAssembler::store_heap_oop_null(Register src, Address dst) { // Stores src to an oop field without encoding it
+#ifdef AARCH64
+  if (UseCompressedOops) {
+    str_w(src, dst); // NOTE(review): no encode step; presumably src holds NULL (per the name) - confirm with callers
+    return;
+  }
+#endif // AARCH64
+  str(src, dst);
+}
+
+
+#ifdef AARCH64
+
+// Algorithm must match oop.inline.hpp encode_heap_oop.
+void MacroAssembler::encode_heap_oop(Register dst, Register src) {
+  // This code pattern is matched in NativeInstruction::skip_encode_heap_oop.
+  // Update it at modifications.
+  assert (UseCompressedOops, "must be compressed");
+  assert (Universe::heap() != NULL, "java heap should be initialized");
+#ifdef ASSERT
+  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
+#endif
+  verify_oop(src);
+  if (Universe::narrow_oop_base() == NULL) { // zero-based: encoding is just the shift (or a plain move)
+    if (Universe::narrow_oop_shift() != 0) {
+      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+      _lsr(dst, src, Universe::narrow_oop_shift());
+    } else if (dst != src) {
+      mov(dst, src);
+    }
+  } else {
+    tst(src, src); // NULL check; this is why the flags are not preserved
+    csel(dst, Rheap_base, src, eq); // replace NULL by the heap base ...
+    sub(dst, dst, Rheap_base); // ... so that the subtraction maps NULL to 0
+    if (Universe::narrow_oop_shift() != 0) {
+      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+      _lsr(dst, dst, Universe::narrow_oop_shift());
+    }
+  }
+}
+
+// Same algorithm as oop.inline.hpp decode_heap_oop. Accepts a NULL (zero) narrow oop.
+void MacroAssembler::decode_heap_oop(Register dst, Register src) {
+#ifdef ASSERT
+  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
+#endif
+  assert(Universe::narrow_oop_shift() == 0 || LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+  if (Universe::narrow_oop_base() != NULL) {
+    tst(src, src); // remember whether the narrow oop was NULL
+    add(dst, Rheap_base, AsmOperand(src, lsl, Universe::narrow_oop_shift())); // base + (narrow << shift)
+    csel(dst, dst, ZR, ne); // a NULL narrow oop must decode to NULL, not to the heap base
+  } else {
+    _lsl(dst, src, Universe::narrow_oop_shift()); // zero-based: only the shift is needed
+  }
+  verify_oop(dst);
+}
+
+#ifdef COMPILER2
+// Algorithm must match oop.inline.hpp encode_heap_oop.
+// Must preserve condition codes, or C2 encodeHeapOop_not_null rule
+// must be changed.
+void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
+  assert (UseCompressedOops, "must be compressed");
+  assert (Universe::heap() != NULL, "java heap should be initialized");
+#ifdef ASSERT
+  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
+#endif
+  verify_oop(src);
+  if (Universe::narrow_oop_base() == NULL) { // zero-based: encoding is just the shift (or a plain move)
+    if (Universe::narrow_oop_shift() != 0) {
+      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+      _lsr(dst, src, Universe::narrow_oop_shift());
+    } else if (dst != src) {
+          mov(dst, src);
+    }
+  } else {
+    sub(dst, src, Rheap_base); // no NULL test here, unlike encode_heap_oop: src is taken to be non-NULL
+    if (Universe::narrow_oop_shift() != 0) {
+      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+      _lsr(dst, dst, Universe::narrow_oop_shift());
+    }
+  }
+}
+
+// Same algorithm as oop.inline.hpp decode_heap_oop.
+// Must preserve condition codes, or C2 decodeHeapOop_not_null rule
+// must be changed.
+void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
+#ifdef ASSERT
+  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
+#endif
+  assert(Universe::narrow_oop_shift() == 0 || LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+  if (Universe::narrow_oop_base() != NULL) {
+    add(dst, Rheap_base, AsmOperand(src, lsl, Universe::narrow_oop_shift())); // base + (narrow << shift), no NULL test
+  } else {
+    _lsl(dst, src, Universe::narrow_oop_shift()); // zero-based: only the shift is needed
+  }
+  verify_oop(dst);
+}
+
+void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { // Materializes the encoded (narrow) form of k in dst, with a metadata relocation
+  assert(UseCompressedClassPointers, "should only be used for compressed header");
+  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
+  int klass_index = oop_recorder()->find_index(k);
+  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
+
+  // Relocation with special format (see relocInfo_arm.hpp).
+  relocate(rspec);
+  narrowKlass encoded_k = Klass::encode_klass(k);
+  movz(dst, encoded_k & 0xffff, 0); // low 16 bits
+  movk(dst, (encoded_k >> 16) & 0xffff, 16); // bits 16..31
+}
+
+void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { // Emits a movz/movk pair for a narrow oop, tagged with an oop relocation
+  assert(UseCompressedOops, "should only be used for compressed header");
+  assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
+  int oop_index = oop_recorder()->find_index(obj);
+  RelocationHolder rspec = oop_Relocation::spec(oop_index);
+
+  relocate(rspec);
+  movz(dst, 0xffff, 0); // NOTE(review): 0xffff looks like a placeholder, presumably patched via the relocation - confirm
+  movk(dst, 0xffff, 16);
+}
+
+#endif // COMPILER2
+
+// Must preserve condition codes, or C2 encodeKlass_not_null rule
+// must be changed.
+void MacroAssembler::encode_klass_not_null(Register r) { // in place: r = (r - narrow_klass_base) >> narrow_klass_shift
+  if (Universe::narrow_klass_base() != NULL) {
+    // Use Rheap_base as a scratch register in which to temporarily load the narrow_klass_base.
+    assert(r != Rheap_base, "Encoding a klass in Rheap_base");
+    mov_slow(Rheap_base, Universe::narrow_klass_base());
+    sub(r, r, Rheap_base);
+  }
+  if (Universe::narrow_klass_shift() != 0) {
+    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
+    _lsr(r, r, Universe::narrow_klass_shift());
+  }
+  if (Universe::narrow_klass_base() != NULL) {
+    reinit_heapbase(); // Rheap_base was borrowed above; reload its regular value
+  }
+}
+
+// Two-register form: dst receives the encoded klass. Must preserve
+// condition codes, or C2 encodeKlass_not_null rule must be changed.
+void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
+  if (dst == src) {
+    encode_klass_not_null(src); // in-place case is handled by the one-register form
+    return;
+  }
+  if (Universe::narrow_klass_base() != NULL) {
+    mov_slow(dst, (int64_t)Universe::narrow_klass_base()); // dst itself holds the base, so Rheap_base stays untouched
+    sub(dst, src, dst);
+    if (Universe::narrow_klass_shift() != 0) {
+      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
+      _lsr(dst, dst, Universe::narrow_klass_shift());
+    }
+  } else {
+    if (Universe::narrow_klass_shift() != 0) {
+      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
+      _lsr(dst, src, Universe::narrow_klass_shift());
+    } else {
+      mov(dst, src); // no base and no shift: the encoding is the identity
+    }
+  }
+}
+
+// Function instr_count_for_decode_klass_not_null() returns how many
+// instructions decode_klass_not_null(register r) emits, including its
+// trailing reinit_heapbase(), when (Universe::heap() != NULL).  Hence, if
+// the instructions they generate change, then this method needs to be updated.
+int MacroAssembler::instr_count_for_decode_klass_not_null() {
+  assert(UseCompressedClassPointers, "only for compressed klass ptrs");
+  assert(Universe::heap() != NULL, "java heap should be initialized");
+  if (Universe::narrow_klass_base() == NULL) {
+    // Without a base, decoding is a single shift, or nothing at all.
+    return (Universe::narrow_klass_shift() != 0) ? 1 : 0;
+  }
+  // With a base the sequence is: load the klass base, add the shifted
+  // narrow klass to it, then reload the regular Rheap_base value.
+  const int load_base_count    = instr_count_for_mov_slow(Universe::narrow_klass_base()); // mov_slow
+  const int add_count          = 1;                                                       // add
+  const int restore_base_count = instr_count_for_mov_slow(Universe::narrow_ptrs_base());  // reinit_heapbase() = mov_slow
+  return load_base_count + add_count + restore_base_count;
+}
+
+// In place: r = narrow_klass_base + (r << narrow_klass_shift). Must preserve
+// condition codes, or C2 decodeKlass_not_null rule must be changed.
+void MacroAssembler::decode_klass_not_null(Register r) {
+  int off = offset(); // start offset, for the size check at the end
+  assert(UseCompressedClassPointers, "should only be used for compressed headers");
+  assert(Universe::heap() != NULL, "java heap should be initialized");
+  assert(r != Rheap_base, "Decoding a klass in Rheap_base");
+  // Cannot assert, instr_count_for_decode_klass_not_null() counts instructions.
+  // Also do not verify_oop as this is called by verify_oop.
+  if (Universe::narrow_klass_base() != NULL) {
+    // Use Rheap_base as a scratch register in which to temporarily load the narrow_klass_base.
+    mov_slow(Rheap_base, Universe::narrow_klass_base());
+    add(r, Rheap_base, AsmOperand(r, lsl, Universe::narrow_klass_shift()));
+    reinit_heapbase(); // reload the regular Rheap_base value
+  } else {
+    if (Universe::narrow_klass_shift() != 0) {
+      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
+      _lsl(r, r, Universe::narrow_klass_shift());
+    }
+  }
+  assert((offset() - off) == (instr_count_for_decode_klass_not_null() * InstructionSize), "need to fix instr_count_for_decode_klass_not_null");
+}
+
+// Two-register form: dst receives the decoded klass. Must preserve
+// condition codes, or C2 decodeKlass_not_null rule must be changed.
+void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
+  if (src == dst) {
+    decode_klass_not_null(src); // in-place case is handled by the one-register form
+    return;
+  }
+
+  assert(UseCompressedClassPointers, "should only be used for compressed headers");
+  assert(Universe::heap() != NULL, "java heap should be initialized");
+  assert(src != Rheap_base, "Decoding a klass in Rheap_base");
+  assert(dst != Rheap_base, "Decoding a klass into Rheap_base");
+  // Also do not verify_oop as this is called by verify_oop.
+  if (Universe::narrow_klass_base() != NULL) {
+    mov_slow(dst, Universe::narrow_klass_base()); // dst itself holds the base, so Rheap_base stays untouched
+    add(dst, dst, AsmOperand(src, lsl, Universe::narrow_klass_shift()));
+  } else {
+    _lsl(dst, src, Universe::narrow_klass_shift()); // emitted even when the shift is 0
+  }
+}
+
+
+void MacroAssembler::reinit_heapbase() {
+  if (!(UseCompressedOops || UseCompressedClassPointers)) return;
+  if (Universe::heap() == NULL) {
+    // No heap yet: load the base through its address instead of embedding the value.
+    ldr_global_ptr(Rheap_base, (address)Universe::narrow_ptrs_base_addr());
+  } else {
+    mov_slow(Rheap_base, Universe::narrow_ptrs_base());
+  }
+}
+
+#ifdef ASSERT
+void MacroAssembler::verify_heapbase(const char* msg) { // Emits a check that Rheap_base equals narrow_ptrs_base; stops with msg otherwise
+  // This code pattern is matched in NativeInstruction::skip_verify_heapbase.
+  // Update it at modifications.
+  assert (UseCompressedOops, "should be compressed");
+  assert (Universe::heap() != NULL, "java heap should be initialized");
+  if (CheckCompressedOops) {
+    Label ok;
+    str(Rthread, Address(Rthread, in_bytes(JavaThread::in_top_frame_unsafe_section_offset()))); // mark the unsafe section (Rthread itself is the marker value)
+    raw_push(Rtemp, ZR); // pushes a pair; the slot at SP + wordSize is then used to hold NZCV
+    mrs(Rtemp, Assembler::SysReg_NZCV);
+    str(Rtemp, Address(SP, 1 * wordSize)); // save the condition flags, which cmp below clobbers
+    mov_slow(Rtemp, Universe::narrow_ptrs_base());
+    cmp(Rheap_base, Rtemp);
+    b(ok, eq);
+    stop(msg);
+    bind(ok);
+    ldr(Rtemp, Address(SP, 1 * wordSize));
+    msr(Assembler::SysReg_NZCV, Rtemp); // restore the saved condition flags
+    raw_pop(Rtemp, ZR);
+    str(ZR, Address(Rthread, in_bytes(JavaThread::in_top_frame_unsafe_section_offset()))); // clear the marker again
+  }
+}
+#endif // ASSERT
+
+#endif // AARCH64
+
+#ifdef COMPILER2
+void MacroAssembler::fast_lock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2 AARCH64_ONLY_ARG(Register Rscratch3))
+{ // Emits the inline lock fast path for the object in Roop, using the BasicLock at Rbox
+  assert(VM_Version::supports_ldrex(), "unsupported, yet?");
+
+  Register Rmark      = Rscratch2; // Rscratch2 is used for the object's mark word
+
+  assert(Roop != Rscratch, "");
+  assert(Roop != Rmark, "");
+  assert(Rbox != Rscratch, "");
+  assert(Rbox != Rmark, "");
+
+  Label fast_lock, done;
+
+  if (UseBiasedLocking && !UseOptoBiasInlining) {
+    Label failed;
+#ifdef AARCH64
+    biased_locking_enter(Roop, Rmark, Rscratch, false, Rscratch3, done, failed);
+#else
+    biased_locking_enter(Roop, Rmark, Rscratch, false, noreg, done, failed);
+#endif
+    bind(failed); // biased locking did not apply: continue with the regular path below
+  }
+
+  ldr(Rmark, Address(Roop, oopDesc::mark_offset_in_bytes()));
+  tst(Rmark, markOopDesc::unlocked_value);
+  b(fast_lock, ne); // unlocked bit set: go try to install the lock
+
+  // Check for recursive lock
+  // See comments in InterpreterMacroAssembler::lock_object for
+  // explanations on the fast recursive locking check.
+#ifdef AARCH64
+  intptr_t mask = ((intptr_t)3) - ((intptr_t)os::vm_page_size()); // selects the low two bits plus every bit at or above the page size
+  Assembler::LogicalImmediate imm(mask, false);
+  mov(Rscratch, SP);
+  sub(Rscratch, Rmark, Rscratch); // mark - SP
+  ands(Rscratch, Rscratch, imm);
+  b(done, ne); // exit with failure
+  str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); // set to zero
+  b(done);
+
+#else
+  // -1- test low 2 bits
+  movs(Rscratch, AsmOperand(Rmark, lsl, 30));
+  // -2- test (hdr - SP) if the low two bits are 0
+  sub(Rscratch, Rmark, SP, eq);
+  movs(Rscratch, AsmOperand(Rscratch, lsr, exact_log2(os::vm_page_size())), eq);
+  // If still 'eq' then recursive locking OK
+  str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()), eq); // set to zero
+  b(done);
+#endif
+
+  bind(fast_lock);
+  str(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); // save the unlocked mark as the displaced header
+
+  bool allow_fallthrough_on_failure = true;
+  bool one_shot = true;
+  cas_for_lock_acquire(Rmark, Rbox, Roop, Rscratch, done, allow_fallthrough_on_failure, one_shot);
+
+  bind(done); // NOTE(review): the outcome appears to be left in the flags (eq = locked) - confirm against the C2 rule
+
+}
+
+void MacroAssembler::fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2  AARCH64_ONLY_ARG(Register Rscratch3))
+{ // Emits the inline unlock fast path for the object in Roop, using the BasicLock at Rbox
+  assert(VM_Version::supports_ldrex(), "unsupported, yet?");
+
+  Register Rmark      = Rscratch2; // Rscratch2 is used for the displaced header read from the box
+
+  assert(Roop != Rscratch, "");
+  assert(Roop != Rmark, "");
+  assert(Rbox != Rscratch, "");
+  assert(Rbox != Rmark, "");
+
+  Label done;
+
+  if (UseBiasedLocking && !UseOptoBiasInlining) {
+    biased_locking_exit(Roop, Rscratch, done);
+  }
+
+  ldr(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));
+  // If hdr is NULL, we've got recursive locking and there's nothing more to do
+  cmp(Rmark, 0);
+  b(done, eq);
+
+  // Restore the object header
+  bool allow_fallthrough_on_failure = true;
+  bool one_shot = true;
+  cas_for_lock_release(Rmark, Rbox, Roop, Rscratch, done, allow_fallthrough_on_failure, one_shot);
+
+  bind(done); // NOTE(review): the outcome appears to be left in the flags - confirm against the C2 rule
+
+}
+#endif // COMPILER2
+
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/macroAssembler_arm.hpp	2016-12-02 11:21:55.745870426 -0500
@@ -0,0 +1,1390 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_MACROASSEMBLER_ARM_HPP
+#define CPU_ARM_VM_MACROASSEMBLER_ARM_HPP
+
+#include "code/relocInfo.hpp"
+#include "code/relocInfo_ext.hpp"
+
+class BiasedLockingCounters;
+
+// Introduced AddressLiteral and its subclasses to ease portability from
+// x86 and avoid relocation issues
+class AddressLiteral VALUE_OBJ_CLASS_SPEC {
+  RelocationHolder _rspec;
+  // Typically, when we use AddressLiterals, we want to use their rval.
+  // However, in some situations we want the lval (effective address) of the
+  // item. addr() is the special factory for making those lvals.
+  bool _is_lval;
+
+  address          _target;
+
+ private:
+  static relocInfo::relocType reloc_for_target(address target) {
+    // Used for ExternalAddress or when the type is not specified
+    // Sometimes ExternalAddress is used for values which aren't
+    // exactly addresses, like the card table base.
+    // external_word_type can't be used for values in the first page
+    // so just skip the reloc in that case.
+    return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
+  }
+
+  void set_rspec(relocInfo::relocType rtype); // defined out of line
+
+ protected:
+  // creation
+  AddressLiteral()
+    : _is_lval(false),
+      _target(NULL)
+  {}
+
+  public:
+
+  AddressLiteral(address target, relocInfo::relocType rtype) {
+    _is_lval = false;
+    _target = target; // NOTE(review): assigned before set_rspec(), which may read it - keep this order
+    set_rspec(rtype);
+  }
+
+  AddressLiteral(address target, RelocationHolder const& rspec)
+    : _rspec(rspec),
+      _is_lval(false),
+      _target(target)
+  {}
+
+  AddressLiteral(address target) {
+    _is_lval = false;
+    _target = target; // NOTE(review): assigned before set_rspec(), which may read it - keep this order
+    set_rspec(reloc_for_target(target));
+  }
+
+  AddressLiteral addr() const { // returns a copy flagged as an lval; *this is not modified
+    AddressLiteral ret = *this;
+    ret._is_lval = true;
+    return ret;
+  }
+
+ private:
+
+  address target() const { return _target; } // const, like reloc()/rspec() below
+  bool is_lval() const { return _is_lval; }
+
+  relocInfo::relocType reloc() const { return _rspec.type(); }
+  const RelocationHolder& rspec() const { return _rspec; }
+
+  friend class Assembler;
+  friend class MacroAssembler;
+  friend class Address;
+  friend class LIR_Assembler;
+  friend class InlinedAddress;
+};
+
+class ExternalAddress: public AddressLiteral { // literal whose relocation type is derived from the target
+
+  public:
+
+  ExternalAddress(address target) : AddressLiteral(target) {} // AddressLiteral(address) picks the type via reloc_for_target()
+
+};
+
+class InternalAddress: public AddressLiteral { // literal that always carries an internal_word relocation
+
+  public:
+
+  InternalAddress(address target) : AddressLiteral(target, relocInfo::internal_word_type) {}
+
+};
+
+// Inlined constants, for use with ldr_literal / bind_literal
+// Note: InlinedInteger not supported (use mov_slow(Register,int[,cond]))
+class InlinedLiteral: StackObj {
+ public:
+  Label label; // need to be public for direct access with &
+  InlinedLiteral() {
+  }
+};
+
+class InlinedMetadata: public InlinedLiteral { // inlined literal carrying a Metadata pointer
+ private:
+  Metadata *_data; // set once at construction
+
+ public:
+  InlinedMetadata(Metadata *data)
+    : InlinedLiteral(), _data(data)
+  {}
+  Metadata *data() const { return _data; } // read-only accessor, usable on const objects
+};
+
+// Currently unused
+// class InlinedOop: public InlinedLiteral {
+//  private:
+//   jobject _jobject;
+//
+//  public:
+//   InlinedOop(jobject target): InlinedLiteral() {
+//     _jobject = target;
+//   }
+//   jobject jobject() { return _jobject; }
+// };
+
+class InlinedAddress: public InlinedLiteral { // inlined literal wrapping an AddressLiteral (target + relocation spec)
+ private:
+  AddressLiteral _literal;
+
+ public:
+
+  InlinedAddress(jobject object): InlinedLiteral(), _literal((address)object, relocInfo::oop_type) {
+    ShouldNotReachHere(); // use mov_oop (or implement InlinedOop)
+  }
+
+  InlinedAddress(Metadata *data): InlinedLiteral(), _literal((address)data, relocInfo::metadata_type) {
+    ShouldNotReachHere(); // use InlinedMetadata or mov_metadata
+  }
+
+  InlinedAddress(address target, const RelocationHolder &rspec): InlinedLiteral(), _literal(target, rspec) {
+    assert(rspec.type() != relocInfo::oop_type, "Do not use InlinedAddress for oops");
+    assert(rspec.type() != relocInfo::metadata_type, "Do not use InlinedAddress for metadatas");
+  }
+
+  InlinedAddress(address target, relocInfo::relocType rtype): InlinedLiteral(), _literal(target, rtype) {
+    assert(rtype != relocInfo::oop_type, "Do not use InlinedAddress for oops");
+    assert(rtype != relocInfo::metadata_type, "Do not use InlinedAddress for metadatas");
+  }
+
+  // Note: default is relocInfo::none for InlinedAddress (AddressLiteral(address) would derive a type instead)
+  InlinedAddress(address target): InlinedLiteral(), _literal(target, relocInfo::none) {
+  }
+
+  address target() { return _literal.target(); } // reaches the private accessor through AddressLiteral's friend declaration
+
+  const RelocationHolder& rspec() const { return _literal.rspec(); }
+};
+
+class InlinedString: public InlinedLiteral { // inlined literal carrying a C string pointer
+ private:
+  const char* _msg; // not copied: only the pointer is stored
+
+ public:
+  InlinedString(const char* msg)
+    : InlinedLiteral(), _msg(msg)
+  {}
+  const char* msg() const { return _msg; } // read-only accessor, usable on const objects
+};
+
+class MacroAssembler: public Assembler {
+protected:
+
+  // Support for VM calls
+  //
+
+  // This is the base routine called by the different versions of call_VM_leaf.
+  void call_VM_leaf_helper(address entry_point, int number_of_arguments);
+
+  // This is the base routine called by the different versions of call_VM. The interpreter
+  // may customize this version by overriding it for its purposes (e.g., to save/restore
+  // additional registers when doing a VM call).
+  virtual void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions);
+
+  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
+  // The implementation is only non-empty for the InterpreterMacroAssembler,
+  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
+  virtual void check_and_handle_popframe() {}
+  virtual void check_and_handle_earlyret() {}
+
+public:
+
+  MacroAssembler(CodeBuffer* code) : Assembler(code) {}
+
+  // By default, we do not need relocation information for non
+  // patchable absolute addresses. However, when needed by some
+  // extensions, ignore_non_patchable_relocations can be modified,
+  // returning false to preserve all relocation information.
+  inline bool ignore_non_patchable_relocations() { return true; }
+
+  // Initially added to the Assembler interface as a pure virtual:
+  //   RegisterConstant delayed_value(..)
+  // for:
+  //   6812678 macro assembler needs delayed binding of a few constants (for 6655638)
+  // this was subsequently modified to its present name and return type
+  virtual RegisterOrConstant delayed_value_impl(intptr_t* delayed_value_addr, Register tmp, int offset);
+
+#ifdef AARCH64
+# define NOT_IMPLEMENTED() unimplemented("NYI at " __FILE__ ":" XSTR(__LINE__))
+# define NOT_TESTED()      warn("Not tested at " __FILE__ ":" XSTR(__LINE__))
+#endif
+
+  void align(int modulus);
+
+  // Support for VM calls
+  //
+  // It is imperative that all calls into the VM are handled via the call_VM methods.
+  // They make sure that the stack linkage is setup correctly. call_VM's correspond
+  // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.
+
+  void call_VM(Register oop_result, address entry_point, bool check_exceptions = true);
+  void call_VM(Register oop_result, address entry_point, Register arg_1, bool check_exceptions = true);
+  void call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
+  void call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
+
+  // The following methods are required by templateTable.cpp,
+  // but not used on ARM.
+  void call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
+  void call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
+  void call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
+  void call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
+
+  // Note: The super_call_VM calls are not used on ARM
+
+  // Raw call, without saving/restoring registers, exception handling, etc.
+  // Mainly used from various stubs.
+  // Note: if 'save_R9_if_scratched' is true, call_VM may on some
+  // platforms save values on the stack. Set it to false (and handle
+  // R9 in the callers) if the top of the stack must not be modified
+  // by call_VM.
+  void call_VM(address entry_point, bool save_R9_if_scratched);
+
+  void call_VM_leaf(address entry_point);
+  void call_VM_leaf(address entry_point, Register arg_1);
+  void call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
+  void call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
+  void call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);
+
+  void get_vm_result(Register oop_result, Register tmp);
+  void get_vm_result_2(Register metadata_result, Register tmp);
+
+  // Always sets/resets sp, which default to SP if (last_sp == noreg)
+  // Optionally sets/resets fp (use noreg to avoid setting it)
+  // Always sets/resets pc on AArch64; optionally sets/resets pc on 32-bit ARM depending on save_last_java_pc flag
+  // Note: when saving PC, set_last_Java_frame returns PC's offset in the code section
+  //       (for oop_maps offset computation)
+  int set_last_Java_frame(Register last_sp, Register last_fp, bool save_last_java_pc, Register tmp);
+  void reset_last_Java_frame(Register tmp);
+  // status set in set_last_Java_frame for reset_last_Java_frame
+  bool _fp_saved;
+  bool _pc_saved;
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#define STOP(error) __ stop(error)
+#else
+#define BLOCK_COMMENT(str) __ block_comment(str)
+#define STOP(error) __ block_comment(error); __ stop(error)
+#endif
+
+  void lookup_virtual_method(Register recv_klass,
+                             Register vtable_index,
+                             Register method_result);
+
+  // Test sub_klass against super_klass, with fast and slow paths.
+
+  // The fast path produces a tri-state answer: yes / no / maybe-slow.
+  // One of the three labels can be NULL, meaning take the fall-through.
+  // No registers are killed, except temp_regs.
+  void check_klass_subtype_fast_path(Register sub_klass,
+                                     Register super_klass,
+                                     Register temp_reg,
+                                     Register temp_reg2,
+                                     Label* L_success,
+                                     Label* L_failure,
+                                     Label* L_slow_path);
+
+  // The rest of the type check; must be wired to a corresponding fast path.
+  // It does not repeat the fast path logic, so don't use it standalone.
+  // temp_reg3 can be noreg, if no temps are available.
+  // Updates the sub's secondary super cache as necessary.
+  // If set_cond_codes:
+  // - condition codes will be Z on success, NZ on failure.
+  // - temp_reg will be 0 on success, non-0 on failure
+  void check_klass_subtype_slow_path(Register sub_klass,
+                                     Register super_klass,
+                                     Register temp_reg,
+                                     Register temp_reg2,
+                                     Register temp_reg3, // auto assigned if noreg
+                                     Label* L_success,
+                                     Label* L_failure,
+                                     bool set_cond_codes = false);
+
+  // Simplified, combined version, good for typical uses.
+  // temp_reg3 can be noreg, if no temps are available. It is used only on slow path.
+  // Falls through on failure.
+  void check_klass_subtype(Register sub_klass,
+                           Register super_klass,
+                           Register temp_reg,
+                           Register temp_reg2,
+                           Register temp_reg3, // auto assigned on slow path if noreg
+                           Label& L_success);
+
+  // Returns address of receiver parameter, using tmp as base register. tmp and params_count can be the same.
+  Address receiver_argument_address(Register params_base, Register params_count, Register tmp);
+
+  void _verify_oop(Register reg, const char* s, const char* file, int line);
+  void _verify_oop_addr(Address addr, const char * s, const char* file, int line);
+
+  // TODO: verify method and klass metadata (compare against vptr?)
+  void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
+  void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line) {}
+
+#define verify_oop(reg) _verify_oop(reg, "broken oop " #reg, __FILE__, __LINE__)
+#define verify_oop_addr(addr) _verify_oop_addr(addr, "broken oop addr " #addr, __FILE__, __LINE__)
+#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
+#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)
+
+  void null_check(Register reg, Register tmp, int offset = -1);
+  inline void null_check(Register reg) { null_check(reg, noreg, -1); } // for C1 lir_null_check
+
+  // Puts address of allocated object into register `obj` and end of allocated object into register `obj_end`.
+  void eden_allocate(Register obj, Register obj_end, Register tmp1, Register tmp2,
+                     RegisterOrConstant size_expression, Label& slow_case);
+  void tlab_allocate(Register obj, Register obj_end, Register tmp1,
+                     RegisterOrConstant size_expression, Label& slow_case);
+
+  void tlab_refill(Register top, Register tmp1, Register tmp2, Register tmp3, Register tmp4,
+                   Label& try_eden, Label& slow_case);
+  void zero_memory(Register start, Register end, Register tmp);
+
+  void incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register tmp);
+
+  static bool needs_explicit_null_check(intptr_t offset);
+
+  void arm_stack_overflow_check(int frame_size_in_bytes, Register tmp);
+  void arm_stack_overflow_check(Register Rsize, Register tmp);
+
+  void bang_stack_with_offset(int offset) {
+    ShouldNotReachHere();
+  }
+
+  // Biased locking support
+  // lock_reg and obj_reg must be loaded up with the appropriate values.
+  // swap_reg must be supplied.
+  // tmp_reg must be supplied.
+  // Optional slow case is for implementations (interpreter and C1) which branch to
+  // slow case directly. If slow_case is NULL, then leaves condition
+  // codes set (for C2's Fast_Lock node) and jumps to done label.
+  // Falls through for the fast locking attempt.
+  // Returns offset of first potentially-faulting instruction for null
+  // check info (currently consumed only by C1). If
+  // swap_reg_contains_mark is true then returns -1 as it is assumed
+  // the calling code has already passed any potential faults.
+  // Notes:
+  // - swap_reg and tmp_reg are scratched
+  // - Rtemp was (implicitly) scratched and can now be specified as the tmp2
+  int biased_locking_enter(Register obj_reg, Register swap_reg, Register tmp_reg,
+                           bool swap_reg_contains_mark,
+                           Register tmp2,
+                           Label& done, Label& slow_case,
+                           BiasedLockingCounters* counters = NULL);
+  void biased_locking_exit(Register obj_reg, Register temp_reg, Label& done);
+
+  // Building block for CAS cases of biased locking: makes CAS and records statistics.
+  // Optional slow_case label is used to transfer control if CAS fails. Otherwise leaves condition codes set.
+  void biased_locking_enter_with_cas(Register obj_reg, Register old_mark_reg, Register new_mark_reg,
+                                     Register tmp, Label& slow_case, int* counter_addr);
+
+#ifndef AARCH64
+  void nop() {
+    mov(R0, R0);  // the no-op is emitted as a move of R0 onto itself
+  }
+
+  void push(Register rd, AsmCondition cond = al) {  // single-register push, optionally conditional
+    assert(rd != SP, "unpredictable instruction");  // pushing SP itself is rejected
+    str(rd, Address(SP, -wordSize, pre_indexed), cond);  // pre-indexed store one word below SP
+  }
+
+  void push(RegisterSet reg_set, AsmCondition cond = al) {
+    assert(!reg_set.contains(SP), "unpredictable instruction");
+    stmdb(SP, reg_set, writeback, cond);
+  }
+
+  void pop(Register rd, AsmCondition cond = al) {  // single-register pop, optionally conditional
+    assert(rd != SP, "unpredictable instruction");  // popping into SP itself is rejected
+    ldr(rd, Address(SP, wordSize, post_indexed), cond);  // post-indexed load from SP, one word step
+  }
+
+  void pop(RegisterSet reg_set, AsmCondition cond = al) {
+    assert(!reg_set.contains(SP), "unpredictable instruction");
+    ldmia(SP, reg_set, writeback, cond);
+  }
+
+  void fpushd(FloatRegister fd, AsmCondition cond = al) {
+    fstmdbd(SP, FloatRegisterSet(fd), writeback, cond);
+  }
+
+  void fpushs(FloatRegister fd, AsmCondition cond = al) {
+    fstmdbs(SP, FloatRegisterSet(fd), writeback, cond);
+  }
+
+  void fpopd(FloatRegister fd, AsmCondition cond = al) {
+    fldmiad(SP, FloatRegisterSet(fd), writeback, cond);
+  }
+
+  void fpops(FloatRegister fd, AsmCondition cond = al) {
+    fldmias(SP, FloatRegisterSet(fd), writeback, cond);
+  }
+#endif // !AARCH64
+
+  // Order access primitives
+  enum Membar_mask_bits {
+    StoreStore = 1 << 3,
+    LoadStore  = 1 << 2,
+    StoreLoad  = 1 << 1,
+    LoadLoad   = 1 << 0
+  };
+
+#ifdef AARCH64
+  // tmp register is not used on AArch64, this parameter is provided solely for better compatibility with 32-bit ARM
+  void membar(Membar_mask_bits order_constraint, Register tmp = noreg);
+#else
+  void membar(Membar_mask_bits mask,
+              Register tmp,
+              bool preserve_flags = true,
+              Register load_tgt = noreg);
+#endif
+
+  void breakpoint(AsmCondition cond = al);
+  void stop(const char* msg);
+  // prints msg and continues
+  void warn(const char* msg);
+  void unimplemented(const char* what = "");
+  void should_not_reach_here()                   { stop("should not reach here"); }
+  static void debug(const char* msg, const intx* registers);
+
+  // Create a walkable frame to help tracking down who called this code.
+  // Returns the frame size in words.
+  int should_not_call_this() {
+    raw_push(FP, LR);
+    should_not_reach_here();
+    flush();
+    return 2; // frame_size_in_words (FP+LR)
+  }
+
+  int save_all_registers();
+  void restore_all_registers();
+  int save_caller_save_registers();
+  void restore_caller_save_registers();
+
+  void add_rc(Register dst, Register arg1, RegisterOrConstant arg2);
+
+  // add_slow and mov_slow are used to manipulate offsets larger than 1024,
+  // these functions are not expected to handle all possible constants,
+  // only those that can really occur during compilation
+  void add_slow(Register rd, Register rn, int c);
+  void sub_slow(Register rd, Register rn, int c);
+
+#ifdef AARCH64
+  static int mov_slow_helper(Register rd, intptr_t c, MacroAssembler* masm /* optional */);
+#endif
+
+  void mov_slow(Register rd, intptr_t c NOT_AARCH64_ARG(AsmCondition cond = al));
+  void mov_slow(Register rd, const char *string);
+  void mov_slow(Register rd, address addr);
+
+  void patchable_mov_oop(Register rd, jobject o, int oop_index) {
+    mov_oop(rd, o, oop_index AARCH64_ONLY_ARG(true));
+  }
+  void mov_oop(Register rd, jobject o, int index = 0
+               AARCH64_ONLY_ARG(bool patchable = false)
+               NOT_AARCH64_ARG(AsmCondition cond = al));
+
+
+  void patchable_mov_metadata(Register rd, Metadata* o, int index) {
+    mov_metadata(rd, o, index AARCH64_ONLY_ARG(true));
+  }
+  void mov_metadata(Register rd, Metadata* o, int index = 0 AARCH64_ONLY_ARG(bool patchable = false));
+
+  void mov_float(FloatRegister fd, jfloat c NOT_AARCH64_ARG(AsmCondition cond = al));
+  void mov_double(FloatRegister fd, jdouble c NOT_AARCH64_ARG(AsmCondition cond = al));
+
+#ifdef AARCH64
+  int mov_pc_to(Register rd) {
+    Label L;
+    adr(rd, L);
+    bind(L);
+    return offset();
+  }
+#endif
+
+  // Note: this variant of mov_address assumes the address moves with
+  // the code. Do *not* implement it with non-relocated instructions,
+  // unless PC-relative.
+#ifdef AARCH64
+  void mov_relative_address(Register rd, address addr) {
+    adr(rd, addr);
+  }
+#else
+  void mov_relative_address(Register rd, address addr, AsmCondition cond = al) {
+    const int disp = addr - pc() - 8;            // displacement of addr from the current pc, biased by 8
+    assert((disp & 3) == 0, "bad alignment");
+    const bool forward = (disp >= 0);            // picks add (forward) or sub (backward) below
+    assert(AsmOperand::is_rotated_imm(forward ? disp : -disp), "addr too far");
+    if (forward) {
+      add(rd, PC, disp, cond);
+    } else {
+      sub(rd, PC, -disp, cond);
+    }
+  }
+#endif // AARCH64
+
+  // Runtime address that may vary from one execution to another. The
+  // symbolic_reference describes what the address is, allowing
+  // the address to be resolved in a different execution context.
+  // Warning: do not implement as a PC relative address.
+  void mov_address(Register rd, address addr, symbolic_Relocation::symbolic_reference t) {
+    mov_address(rd, addr, RelocationHolder::none);
+  }
+
+  // rspec can be RelocationHolder::none (for ignored symbolic_Relocation).
+  // In that case, the address is absolute and the generated code need
+  // not be relocable.
+  void mov_address(Register rd, address addr, RelocationHolder const& rspec) {
+    assert(rspec.type() != relocInfo::runtime_call_type, "do not use mov_address for runtime calls");
+    assert(rspec.type() != relocInfo::static_call_type, "do not use mov_address for relocable calls");
+    if (rspec.type() == relocInfo::none) {
+      // absolute address, no relocation needed: materialize it as a plain constant
+      mov_slow(rd, (intptr_t)addr);
+      return;
+    }
+#ifndef AARCH64
+    if (VM_Version::supports_movw()) {  // 32-bit ARM only: build the address with movw/movt when available
+      relocate(rspec);                  // relocation is recorded before the movw/movt pair is emitted
+      int c = (int)addr;
+      movw(rd, c & 0xffff);             // low 16 bits of the address
+      if ((unsigned int)c >> 16) {      // movt is emitted only when the upper 16 bits are non-zero
+        movt(rd, (unsigned int)c >> 16);
+      }
+      return;
+    }
+#endif
+    Label skip_literal;                 // fallback: load the address from a literal inlined in the code
+    InlinedAddress addr_literal(addr, rspec);
+    ldr_literal(rd, addr_literal);
+    b(skip_literal);                    // branch over the literal emitted by bind_literal below
+    bind_literal(addr_literal);
+    // AARCH64 WARNING: because of alignment padding, extra padding
+    // may be required to get a consistent size for C2, or rules must
+    // overestimate the size; see MachEpilogNode::size
+    bind(skip_literal);
+  }
+
+  // Note: Do not define mov_address for a Label
+  //
+  // Loads from addresses potentially within the code are now handled
+  // by InlinedLiteral subclasses (to allow more flexibility on how the
+  // ldr_literal is performed).
+
+  void ldr_literal(Register rd, InlinedAddress& L) {
+    assert(L.rspec().type() != relocInfo::runtime_call_type, "avoid ldr_literal for calls");
+    assert(L.rspec().type() != relocInfo::static_call_type, "avoid ldr_literal for calls");
+    relocate(L.rspec());
+#ifdef AARCH64
+    ldr(rd, target(L.label));
+#else
+    ldr(rd, Address(PC, target(L.label) - pc() - 8));
+#endif
+  }
+
+  void ldr_literal(Register rd, InlinedString& L) {
+    const char* msg = L.msg();
+    if (code()->consts()->contains((address)msg)) {
+      // string address moves with the code
+#ifdef AARCH64
+      ldr(rd, (address)msg);
+#else
+      ldr(rd, Address(PC, ((address)msg) - pc() - 8));
+#endif
+      return;
+    }
+    // Warning: use external strings with care. They are not relocated
+    // if the code moves. If needed, use code_string to move them
+    // to the consts section.
+#ifdef AARCH64
+    ldr(rd, target(L.label));
+#else
+    ldr(rd, Address(PC, target(L.label) - pc() - 8));
+#endif
+  }
+
+  void ldr_literal(Register rd, InlinedMetadata& L) {
+    // relocation done in the bind_literal for metadatas
+#ifdef AARCH64
+    ldr(rd, target(L.label));
+#else
+    ldr(rd, Address(PC, target(L.label) - pc() - 8));
+#endif
+  }
+
+  void bind_literal(InlinedAddress& L) {
+    AARCH64_ONLY(align(wordSize));
+    bind(L.label);
+    assert(L.rspec().type() != relocInfo::metadata_type, "Must use InlinedMetadata");
+    // We currently do not use oop 'bound' literals.
+    // If the code evolves and the following assert is triggered,
+    // we need to implement InlinedOop (see InlinedMetadata).
+    assert(L.rspec().type() != relocInfo::oop_type, "Inlined oops not supported");
+    // Note: relocation is handled by relocate calls in ldr_literal
+    AbstractAssembler::emit_address((address)L.target());
+  }
+
+  void bind_literal(InlinedString& L) {
+    const char* msg = L.msg();
+    if (code()->consts()->contains((address)msg)) {
+      // The Label should not be used; avoid binding it
+      // to detect errors.
+      return;
+    }
+    AARCH64_ONLY(align(wordSize));
+    bind(L.label);
+    AbstractAssembler::emit_address((address)L.msg());
+  }
+
+  void bind_literal(InlinedMetadata& L) {
+    AARCH64_ONLY(align(wordSize));
+    bind(L.label);
+    relocate(metadata_Relocation::spec_for_immediate());
+    AbstractAssembler::emit_address((address)L.data());
+  }
+
+  void load_mirror(Register mirror, Register method, Register tmp);
+
+  // Porting layer between 32-bit ARM and AArch64
+
+#define COMMON_INSTR_1(common_mnemonic, aarch64_mnemonic, arm32_mnemonic, arg_type) \
+  void common_mnemonic(arg_type arg) { \
+      AARCH64_ONLY(aarch64_mnemonic) NOT_AARCH64(arm32_mnemonic) (arg); \
+  }
+
+#define COMMON_INSTR_2(common_mnemonic, aarch64_mnemonic, arm32_mnemonic, arg1_type, arg2_type) \
+  void common_mnemonic(arg1_type arg1, arg2_type arg2) { \
+      AARCH64_ONLY(aarch64_mnemonic) NOT_AARCH64(arm32_mnemonic) (arg1, arg2); \
+  }
+
+#define COMMON_INSTR_3(common_mnemonic, aarch64_mnemonic, arm32_mnemonic, arg1_type, arg2_type, arg3_type) \
+  void common_mnemonic(arg1_type arg1, arg2_type arg2, arg3_type arg3) { \
+      AARCH64_ONLY(aarch64_mnemonic) NOT_AARCH64(arm32_mnemonic) (arg1, arg2, arg3); \
+  }
+
+  COMMON_INSTR_1(jump, br,  bx,  Register)
+  COMMON_INSTR_1(call, blr, blx, Register)
+
+  COMMON_INSTR_2(cbz_32,  cbz_w,  cbz,  Register, Label&)
+  COMMON_INSTR_2(cbnz_32, cbnz_w, cbnz, Register, Label&)
+
+  COMMON_INSTR_2(ldr_u32, ldr_w,  ldr,  Register, Address)
+  COMMON_INSTR_2(ldr_s32, ldrsw,  ldr,  Register, Address)
+  COMMON_INSTR_2(str_32,  str_w,  str,  Register, Address)
+
+  COMMON_INSTR_2(mvn_32,  mvn_w,  mvn,  Register, Register)
+  COMMON_INSTR_2(cmp_32,  cmp_w,  cmp,  Register, Register)
+  COMMON_INSTR_2(neg_32,  neg_w,  neg,  Register, Register)
+  COMMON_INSTR_2(clz_32,  clz_w,  clz,  Register, Register)
+  COMMON_INSTR_2(rbit_32, rbit_w, rbit, Register, Register)
+
+  COMMON_INSTR_2(cmp_32,  cmp_w,  cmp,  Register, int)
+  COMMON_INSTR_2(cmn_32,  cmn_w,  cmn,  Register, int)
+
+  COMMON_INSTR_3(add_32,  add_w,  add,  Register, Register, Register)
+  COMMON_INSTR_3(sub_32,  sub_w,  sub,  Register, Register, Register)
+  COMMON_INSTR_3(subs_32, subs_w, subs, Register, Register, Register)
+  COMMON_INSTR_3(mul_32,  mul_w,  mul,  Register, Register, Register)
+  COMMON_INSTR_3(and_32,  andr_w, andr, Register, Register, Register)
+  COMMON_INSTR_3(orr_32,  orr_w,  orr,  Register, Register, Register)
+  COMMON_INSTR_3(eor_32,  eor_w,  eor,  Register, Register, Register)
+
+  COMMON_INSTR_3(add_32,  add_w,  add,  Register, Register, AsmOperand)
+  COMMON_INSTR_3(sub_32,  sub_w,  sub,  Register, Register, AsmOperand)
+  COMMON_INSTR_3(orr_32,  orr_w,  orr,  Register, Register, AsmOperand)
+  COMMON_INSTR_3(eor_32,  eor_w,  eor,  Register, Register, AsmOperand)
+  COMMON_INSTR_3(and_32,  andr_w, andr, Register, Register, AsmOperand)
+
+
+  COMMON_INSTR_3(add_32,  add_w,  add,  Register, Register, int)
+  COMMON_INSTR_3(adds_32, adds_w, adds, Register, Register, int)
+  COMMON_INSTR_3(sub_32,  sub_w,  sub,  Register, Register, int)
+  COMMON_INSTR_3(subs_32, subs_w, subs, Register, Register, int)
+
+  COMMON_INSTR_2(tst_32,  tst_w,  tst,  Register, unsigned int)
+  COMMON_INSTR_2(tst_32,  tst_w,  tst,  Register, AsmOperand)
+
+  COMMON_INSTR_3(and_32,  andr_w, andr, Register, Register, uint)
+  COMMON_INSTR_3(orr_32,  orr_w,  orr,  Register, Register, uint)
+  COMMON_INSTR_3(eor_32,  eor_w,  eor,  Register, Register, uint)
+
+  COMMON_INSTR_1(cmp_zero_float,  fcmp0_s, fcmpzs, FloatRegister)
+  COMMON_INSTR_1(cmp_zero_double, fcmp0_d, fcmpzd, FloatRegister)
+
+  COMMON_INSTR_2(ldr_float,   ldr_s,   flds,   FloatRegister, Address)
+  COMMON_INSTR_2(str_float,   str_s,   fsts,   FloatRegister, Address)
+  COMMON_INSTR_2(mov_float,   fmov_s,  fcpys,  FloatRegister, FloatRegister)
+  COMMON_INSTR_2(neg_float,   fneg_s,  fnegs,  FloatRegister, FloatRegister)
+  COMMON_INSTR_2(abs_float,   fabs_s,  fabss,  FloatRegister, FloatRegister)
+  COMMON_INSTR_2(sqrt_float,  fsqrt_s, fsqrts, FloatRegister, FloatRegister)
+  COMMON_INSTR_2(cmp_float,   fcmp_s,  fcmps,  FloatRegister, FloatRegister)
+
+  COMMON_INSTR_3(add_float,   fadd_s,  fadds,  FloatRegister, FloatRegister, FloatRegister)
+  COMMON_INSTR_3(sub_float,   fsub_s,  fsubs,  FloatRegister, FloatRegister, FloatRegister)
+  COMMON_INSTR_3(mul_float,   fmul_s,  fmuls,  FloatRegister, FloatRegister, FloatRegister)
+  COMMON_INSTR_3(div_float,   fdiv_s,  fdivs,  FloatRegister, FloatRegister, FloatRegister)
+
+  COMMON_INSTR_2(ldr_double,  ldr_d,   fldd,   FloatRegister, Address)
+  COMMON_INSTR_2(str_double,  str_d,   fstd,   FloatRegister, Address)
+  COMMON_INSTR_2(mov_double,  fmov_d,  fcpyd,  FloatRegister, FloatRegister)
+  COMMON_INSTR_2(neg_double,  fneg_d,  fnegd,  FloatRegister, FloatRegister)
+  COMMON_INSTR_2(cmp_double,  fcmp_d,  fcmpd,  FloatRegister, FloatRegister)
+  COMMON_INSTR_2(abs_double,  fabs_d,  fabsd,  FloatRegister, FloatRegister)
+  COMMON_INSTR_2(sqrt_double, fsqrt_d, fsqrtd, FloatRegister, FloatRegister)
+
+  COMMON_INSTR_3(add_double,  fadd_d,  faddd,  FloatRegister, FloatRegister, FloatRegister)
+  COMMON_INSTR_3(sub_double,  fsub_d,  fsubd,  FloatRegister, FloatRegister, FloatRegister)
+  COMMON_INSTR_3(mul_double,  fmul_d,  fmuld,  FloatRegister, FloatRegister, FloatRegister)
+  COMMON_INSTR_3(div_double,  fdiv_d,  fdivd,  FloatRegister, FloatRegister, FloatRegister)
+
+  COMMON_INSTR_2(convert_f2d, fcvt_ds, fcvtds, FloatRegister, FloatRegister)
+  COMMON_INSTR_2(convert_d2f, fcvt_sd, fcvtsd, FloatRegister, FloatRegister)
+
+  COMMON_INSTR_2(mov_fpr2gpr_float, fmov_ws, fmrs, Register, FloatRegister)
+
+#undef COMMON_INSTR_1
+#undef COMMON_INSTR_2
+#undef COMMON_INSTR_3
+
+
+#ifdef AARCH64
+
+  void mov(Register dst, Register src, AsmCondition cond) {
+    if (cond != al) {
+      csel(dst, src, dst, cond);  // conditional form: select between src and the old dst
+      return;
+    }
+    mov(dst, src);                // 'al' needs no select, use the plain register move
+  }
+
+  // Propagate other overloaded "mov" methods from Assembler.
+  void mov(Register dst, Register src)    { Assembler::mov(dst, src); }
+  void mov(Register rd, int imm)          { Assembler::mov(rd, imm);  }
+
+  void mov(Register dst, int imm, AsmCondition cond) {  // conditional move of a tiny immediate
+    assert(imm == 0 || imm == 1 || imm == -1, "only 0, 1 and -1 can be moved conditionally");
+    if (imm == 0) {
+      mov(dst, ZR, cond);                  // select ZR via mov(Register, Register, AsmCondition)
+    } else if (imm == 1) {
+      csinc(dst, dst, ZR, inverse(cond));  // per A64 CSINC: keeps dst on !cond, else ZR + 1
+    } else if (imm == -1) {
+      csinv(dst, dst, ZR, inverse(cond));  // per A64 CSINV: keeps dst on !cond, else ~ZR
+    } else {
+      fatal("illegal mov(R%d,%d,cond)", dst->encoding(), imm);  // product-build guard for other values
+    }
+  }
+
+  void movs(Register dst, Register src)    { adds(dst, src, 0); }
+
+#else // AARCH64
+
+  void tbz(Register rt, int bit, Label& L) {
+    assert(0 <= bit && bit < BitsPerWord, "bit number is out of range");
+    tst(rt, 1 << bit);
+    b(L, eq);
+  }
+
+  void tbnz(Register rt, int bit, Label& L) {
+    assert(0 <= bit && bit < BitsPerWord, "bit number is out of range");
+    tst(rt, 1 << bit);
+    b(L, ne);
+  }
+
+  void cbz(Register rt, Label& L) {
+    cmp(rt, 0);
+    b(L, eq);
+  }
+
+  void cbz(Register rt, address target) {
+    cmp(rt, 0);
+    b(target, eq);
+  }
+
+  void cbnz(Register rt, Label& L) {
+    cmp(rt, 0);
+    b(L, ne);
+  }
+
+  void ret(Register dst = LR) {
+    bx(dst);
+  }
+
+#endif // AARCH64
+
+  Register zero_register(Register tmp) {
+#ifdef AARCH64
+    return ZR;
+#else
+    mov(tmp, 0);
+    return tmp;
+#endif
+  }
+
+  void logical_shift_left(Register dst, Register src, int shift) {
+#ifdef AARCH64
+    _lsl(dst, src, shift);
+#else
+    mov(dst, AsmOperand(src, lsl, shift));
+#endif
+  }
+
+  void logical_shift_left_32(Register dst, Register src, int shift) {
+#ifdef AARCH64
+    _lsl_w(dst, src, shift);
+#else
+    mov(dst, AsmOperand(src, lsl, shift));
+#endif
+  }
+
+  void logical_shift_right(Register dst, Register src, int shift) {
+#ifdef AARCH64
+    _lsr(dst, src, shift);
+#else
+    mov(dst, AsmOperand(src, lsr, shift));
+#endif
+  }
+
+  void arith_shift_right(Register dst, Register src, int shift) {
+#ifdef AARCH64
+    _asr(dst, src, shift);
+#else
+    mov(dst, AsmOperand(src, asr, shift));
+#endif
+  }
+
+  void asr_32(Register dst, Register src, int shift) {
+#ifdef AARCH64
+    _asr_w(dst, src, shift);
+#else
+    mov(dst, AsmOperand(src, asr, shift));
+#endif
+  }
+
+  // If <cond> holds, compares r1 and r2. Otherwise, flags are set so that <cond> does not hold.
+  void cond_cmp(Register r1, Register r2, AsmCondition cond) {
+#ifdef AARCH64
+    ccmp(r1, r2, flags_for_condition(inverse(cond)), cond);
+#else
+    cmp(r1, r2, cond);
+#endif
+  }
+
+  // If <cond> holds, compares r and imm. Otherwise, flags are set so that <cond> does not hold.
+  void cond_cmp(Register r, int imm, AsmCondition cond) {
+#ifdef AARCH64
+    ccmp(r, imm, flags_for_condition(inverse(cond)), cond);
+#else
+    cmp(r, imm, cond);
+#endif
+  }
+
+  void align_reg(Register dst, Register src, int align) {
+    assert (is_power_of_2(align), "should be");
+#ifdef AARCH64
+    andr(dst, src, ~(uintx)(align-1));
+#else
+    bic(dst, src, align-1);
+#endif
+  }
+
+  void prefetch_read(Address addr) {
+#ifdef AARCH64
+    prfm(pldl1keep, addr);
+#else
+    pld(addr);
+#endif
+  }
+
+  void raw_push(Register r1, Register r2) {
+#ifdef AARCH64
+    stp(r1, r2, Address(SP, -2*wordSize, pre_indexed));
+#else
+    assert(r1->encoding() < r2->encoding(), "should be ordered");
+    push(RegisterSet(r1) | RegisterSet(r2));
+#endif
+  }
+
+  void raw_pop(Register r1, Register r2) {
+#ifdef AARCH64
+    ldp(r1, r2, Address(SP, 2*wordSize, post_indexed));
+#else
+    assert(r1->encoding() < r2->encoding(), "should be ordered");
+    pop(RegisterSet(r1) | RegisterSet(r2));
+#endif
+  }
+
+  void raw_push(Register r1, Register r2, Register r3) {
+#ifdef AARCH64
+    raw_push(r1, r2);
+    raw_push(r3, ZR);
+#else
+    assert(r1->encoding() < r2->encoding() && r2->encoding() < r3->encoding(), "should be ordered");
+    push(RegisterSet(r1) | RegisterSet(r2) | RegisterSet(r3));
+#endif
+  }
+
+  void raw_pop(Register r1, Register r2, Register r3) {
+#ifdef AARCH64
+    raw_pop(r3, ZR);
+    raw_pop(r1, r2);
+#else
+    assert(r1->encoding() < r2->encoding() && r2->encoding() < r3->encoding(), "should be ordered");
+    pop(RegisterSet(r1) | RegisterSet(r2) | RegisterSet(r3));
+#endif
+  }
+
+  // Restores registers r1 and r2 previously saved by raw_push(r1, r2, ret_addr) and returns by ret_addr. Clobbers LR.
+  void raw_pop_and_ret(Register r1, Register r2) {
+#ifdef AARCH64
+    raw_pop(r1, r2, LR);
+    ret();
+#else
+    raw_pop(r1, r2, PC);
+#endif
+  }
+
+  void indirect_jump(Address addr, Register scratch) {
+#ifdef AARCH64
+    ldr(scratch, addr);
+    br(scratch);
+#else
+    ldr(PC, addr);
+#endif
+  }
+
+  void indirect_jump(InlinedAddress& literal, Register scratch) {
+#ifdef AARCH64
+    ldr_literal(scratch, literal);
+    br(scratch);
+#else
+    ldr_literal(PC, literal);
+#endif
+  }
+
+#ifndef AARCH64
+  void neg(Register dst, Register src) {
+    rsb(dst, src, 0);
+  }
+#endif
+
+  void branch_if_negative_32(Register r, Label& L) {
+    // Note about branch_if_negative_32() / branch_if_any_negative_32() implementation for AArch64:
+    // tbnz is not used instead of tst & b.mi because destination may be out of tbnz range (+-32KB)
+    // since these methods are used in LIR_Assembler::emit_arraycopy() to jump to stub entry.
+    tst_32(r, r);
+    b(L, mi);
+  }
+
+  void branch_if_any_negative_32(Register r1, Register r2, Register tmp, Label& L) {
+#ifdef AARCH64
+    orr_32(tmp, r1, r2);
+    tst_32(tmp, tmp);
+#else
+    orrs(tmp, r1, r2);
+#endif
+    b(L, mi);
+  }
+
+  void branch_if_any_negative_32(Register r1, Register r2, Register r3, Register tmp, Label& L) {
+    orr_32(tmp, r1, r2);
+#ifdef AARCH64
+    orr_32(tmp, tmp, r3);
+    tst_32(tmp, tmp);
+#else
+    orrs(tmp, tmp, r3);
+#endif
+    b(L, mi);
+  }
+
+  void add_ptr_scaled_int32(Register dst, Register r1, Register r2, int shift) {
+#ifdef AARCH64
+      add(dst, r1, r2, ex_sxtw, shift);
+#else
+      add(dst, r1, AsmOperand(r2, lsl, shift));
+#endif
+  }
+
+  void sub_ptr_scaled_int32(Register dst, Register r1, Register r2, int shift) {
+#ifdef AARCH64
+    sub(dst, r1, r2, ex_sxtw, shift);
+#else
+    sub(dst, r1, AsmOperand(r2, lsl, shift));
+#endif
+  }
+
+
+    // klass oop manipulations if compressed
+
+#ifdef AARCH64
+  void load_klass(Register dst_klass, Register src_oop);
+#else
+  void load_klass(Register dst_klass, Register src_oop, AsmCondition cond = al);
+#endif // AARCH64
+
+  void store_klass(Register src_klass, Register dst_oop);
+
+#ifdef AARCH64
+  void store_klass_gap(Register dst);
+#endif // AARCH64
+
+    // oop manipulations
+
+  void load_heap_oop(Register dst, Address src);
+  void store_heap_oop(Register src, Address dst);
+  void store_heap_oop(Address dst, Register src) {
+    store_heap_oop(src, dst);
+  }
+  void store_heap_oop_null(Register src, Address dst);
+
+#ifdef AARCH64
+  void encode_heap_oop(Register dst, Register src);
+  void encode_heap_oop(Register r) {
+    encode_heap_oop(r, r);
+  }
+  void decode_heap_oop(Register dst, Register src);
+  void decode_heap_oop(Register r) {
+      decode_heap_oop(r, r);
+  }
+
+#ifdef COMPILER2
+  void encode_heap_oop_not_null(Register dst, Register src);
+  void decode_heap_oop_not_null(Register dst, Register src);
+
+  void set_narrow_klass(Register dst, Klass* k);
+  void set_narrow_oop(Register dst, jobject obj);
+#endif
+
+  void encode_klass_not_null(Register r);
+  void encode_klass_not_null(Register dst, Register src);
+  void decode_klass_not_null(Register r);
+  void decode_klass_not_null(Register dst, Register src);
+
+  void reinit_heapbase();
+
+#ifdef ASSERT
+  void verify_heapbase(const char* msg);
+#endif // ASSERT
+
+  static int instr_count_for_mov_slow(intptr_t c);
+  static int instr_count_for_mov_slow(address addr);
+  static int instr_count_for_decode_klass_not_null();
+#endif // AARCH64
+
+  void ldr_global_ptr(Register reg, address address_of_global);
+  void ldr_global_s32(Register reg, address address_of_global);
+  void ldrb_global(Register reg, address address_of_global);
+
+  // address_placeholder_instruction is invalid instruction and is used
+  // as placeholder in code for address of label
+  enum { address_placeholder_instruction = 0xFFFFFFFF };
+
+  void emit_address(Label& L) {  // emits a placeholder for the address of a not-yet-bound label
+    assert(!L.is_bound(), "otherwise address will not be patched");
+    target(L);       // creates relocation which will be patched later
+
+    assert ((offset() & (wordSize-1)) == 0, "should be aligned by word size");
+
+#ifdef AARCH64
+    emit_int32(address_placeholder_instruction);  // AArch64: the placeholder is written as
+    emit_int32(address_placeholder_instruction);  // two consecutive 32-bit words
+#else
+    AbstractAssembler::emit_address((address)address_placeholder_instruction);  // ARM32: one address-sized placeholder
+#endif
+  }
+
+  void b(address target, AsmCondition cond = al) {
+    Assembler::b(target, cond);  // plain delegation to the base Assembler branch
+  }
+  void b(Label& L, AsmCondition cond = al) {
+    // internal jumps
+    Assembler::b(target(L), cond);
+  }
+
+  void bl(address target NOT_AARCH64_ARG(AsmCondition cond = al)) {
+    Assembler::bl(target NOT_AARCH64_ARG(cond));
+  }
+  void bl(Label& L NOT_AARCH64_ARG(AsmCondition cond = al)) {
+    // internal calls
+    Assembler::bl(target(L)  NOT_AARCH64_ARG(cond));
+  }
+
+#ifndef AARCH64
+  void adr(Register dest, Label& L, AsmCondition cond = al) {  // dest = address of L, computed relative to PC
+    int delta = target(L) - pc() - 8;  // NOTE(review): the 8 is presumably the ARM-state PC read-ahead - confirm
+    if (delta >= 0) {
+      add(dest, PC, delta, cond);      // label at or after PC + 8
+    } else {
+      sub(dest, PC, -delta, cond);     // label before PC + 8: subtract the magnitude
+    }
+  }
+#endif // !AARCH64
+
+  // Variable-length jump and calls. We now distinguish only the
+  // patchable case from the other cases. Patchable must be
+  // distinguished from relocable. Relocable means the generated code
+  // containing the jump/call may move. Patchable means that the
+  // targeted address may be changed later.
+
+  // Non patchable versions.
+  // - used only for relocInfo::runtime_call_type and relocInfo::none
+  // - may use relative or absolute format (do not use relocInfo::none
+  //   if the generated code may move)
+  // - the implementation takes into account switch to THUMB mode if the
+  //   destination is a THUMB address
+  // - the implementation supports far targets
+  //
+  // To reduce regression risk, scratch still defaults to noreg on
+  // arm32. This results in patchable instructions. However, if
+  // patching really matters, the call sites should be modified and
+  // use patchable_call or patchable_jump. If patching is not required
+  // and if a register can be clobbered, it should be explicitly
+  // specified to allow future optimizations.
+  void jump(address target,
+            relocInfo::relocType rtype = relocInfo::runtime_call_type,
+            Register scratch = AARCH64_ONLY(Rtemp) NOT_AARCH64(noreg)
+#ifndef AARCH64
+            , AsmCondition cond = al
+#endif
+            );
+
+  void call(address target,
+            RelocationHolder rspec
+            NOT_AARCH64_ARG(AsmCondition cond = al));
+
+  void call(address target,
+            relocInfo::relocType rtype = relocInfo::runtime_call_type
+            NOT_AARCH64_ARG(AsmCondition cond = al)) {
+    call(target, Relocation::spec_simple(rtype) NOT_AARCH64_ARG(cond));
+  }
+
+  void jump(AddressLiteral dest) {
+    jump(dest.target(), dest.reloc());
+  }
+#ifndef AARCH64
+  void jump(address dest, relocInfo::relocType rtype, AsmCondition cond) {
+    jump(dest, rtype, Rtemp, cond);
+  }
+#endif
+
+  void call(AddressLiteral dest) {
+    call(dest.target(), dest.reloc());
+  }
+
+  // Patchable version:
+  // - set_destination can be used to atomically change the target
+  //
+  // The targets for patchable_jump and patchable_call must be in the
+  // code cache.
+  // [ including possible extensions of the code cache, like AOT code ]
+  //
+  // To reduce regression risk, scratch still defaults to noreg on
+  // arm32. If a register can be clobbered, it should be explicitly
+  // specified to allow future optimizations.
+  void patchable_jump(address target,
+                      relocInfo::relocType rtype = relocInfo::runtime_call_type,
+                      Register scratch = AARCH64_ONLY(Rtemp) NOT_AARCH64(noreg)
+#ifndef AARCH64
+                      , AsmCondition cond = al
+#endif
+                      );
+
+  // patchable_call may scratch Rtemp
+  int patchable_call(address target,
+                     RelocationHolder const& rspec,
+                     bool c2 = false);
+
+  int patchable_call(address target,
+                     relocInfo::relocType rtype,
+                     bool c2 = false) {
+    return patchable_call(target, Relocation::spec_simple(rtype), c2);
+  }
+
+#if defined(AARCH64) && defined(COMPILER2)
+  static int call_size(address target, bool far, bool patchable);
+#endif
+
+#ifdef AARCH64
+  static bool page_reachable_from_cache(address target);
+#endif
+  static bool _reachable_from_cache(address target);
+  static bool _cache_fully_reachable();
+  bool cache_fully_reachable();
+  bool reachable_from_cache(address target);
+
+  void zero_extend(Register rd, Register rn, int bits);
+  void sign_extend(Register rd, Register rn, int bits);
+
+  inline void zap_high_non_significant_bits(Register r) {  // debugging aid; emits nothing on 32-bit ARM
+#ifdef AARCH64
+    if(ZapHighNonSignificantBits) {  // only active when the ZapHighNonSignificantBits flag is set
+      movk(r, 0xBAAD, 48);           // marker halfword 0xBAAD placed at shift 48
+      movk(r, 0xF00D, 32);           // marker halfword 0xF00D placed at shift 32
+    }
+#endif
+  }
+
+#ifndef AARCH64
+  void long_move(Register rd_lo, Register rd_hi,
+                 Register rn_lo, Register rn_hi,
+                 AsmCondition cond = al);
+  void long_shift(Register rd_lo, Register rd_hi,
+                  Register rn_lo, Register rn_hi,
+                  AsmShift shift, Register count);
+  void long_shift(Register rd_lo, Register rd_hi,
+                  Register rn_lo, Register rn_hi,
+                  AsmShift shift, int count);
+
+  void atomic_cas(Register tmpreg1, Register tmpreg2, Register oldval, Register newval, Register base, int offset);
+  void atomic_cas_bool(Register oldval, Register newval, Register base, int offset, Register tmpreg);
+  void atomic_cas64(Register temp_lo, Register temp_hi, Register temp_result, Register oldval_lo, Register oldval_hi, Register newval_lo, Register newval_hi, Register base, int offset);
+#endif // !AARCH64
+
+  void cas_for_lock_acquire(Register oldval, Register newval, Register base, Register tmp, Label &slow_case, bool allow_fallthrough_on_failure = false, bool one_shot = false);
+  void cas_for_lock_release(Register oldval, Register newval, Register base, Register tmp, Label &slow_case, bool allow_fallthrough_on_failure = false, bool one_shot = false);
+
+#ifndef PRODUCT
+  // Preserves flags and all registers.
+  // On SMP the updated value might not be visible to external observers without a synchronization barrier
+  void cond_atomic_inc32(AsmCondition cond, int* counter_addr);
+#endif // !PRODUCT
+
+  // unconditional non-atomic increment
+  void inc_counter(address counter_addr, Register tmpreg1, Register tmpreg2);
+  void inc_counter(int* counter_addr, Register tmpreg1, Register tmpreg2) {
+    inc_counter(reinterpret_cast<address>(counter_addr), tmpreg1, tmpreg2);  // forward to the address overload
+  }
+
+  void pd_patch_instruction(address branch, address target);
+
+  // Loading and storing values by size and signed-ness;
+  // size must not exceed wordSize (i.e. 8-byte values are not supported on 32-bit ARM);
+  // each of these calls generates exactly one load or store instruction,
+  // so src can be pre- or post-indexed address.
+#ifdef AARCH64
+  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed);
+  void store_sized_value(Register src, Address dst, size_t size_in_bytes);
+#else
+  // 32-bit ARM variants also support conditional execution
+  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, AsmCondition cond = al);
+  void store_sized_value(Register src, Address dst, size_t size_in_bytes, AsmCondition cond = al);
+#endif
+
+  void lookup_interface_method(Register recv_klass,
+                               Register intf_klass,
+                               Register itable_index,
+                               Register method_result,
+                               Register temp_reg1,
+                               Register temp_reg2,
+                               Label& L_no_such_interface);
+
+  // Compare char[] arrays aligned to 4 bytes.
+  void char_arrays_equals(Register ary1, Register ary2,
+                          Register limit, Register result,
+                          Register chr1, Register chr2, Label& Ldone);
+
+
+  void floating_cmp(Register dst);
+
+  // improved x86 portability (minimizing source code changes)
+
+  void ldr_literal(Register rd, AddressLiteral addr) {  // emits a relocated ldr of rd addressed by addr.target()
+    relocate(addr.rspec());  // record the literal's relocation spec before the load is emitted
+#ifdef AARCH64
+    ldr(rd, addr.target());
+#else
+    ldr(rd, Address(PC, addr.target() - pc() - 8));  // PC-relative; the -8 presumably accounts for ARM's PC read-ahead -- confirm
+#endif
+  }
+
+  void lea(Register Rd, AddressLiteral addr) {  // x86-style name; implemented by forwarding to mov_address
+    // Never dereferenced, as on x86 (lval status ignored)
+    mov_address(Rd, addr.target(), addr.rspec());  // passes the literal's target together with its relocation spec
+  }
+
+  void restore_default_fp_mode();
+
+#ifdef COMPILER2
+#ifdef AARCH64
+  // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
+  void fast_lock(Register obj, Register box, Register scratch, Register scratch2, Register scratch3);
+  void fast_unlock(Register obj, Register box, Register scratch, Register scratch2, Register scratch3);
+#else
+  void fast_lock(Register obj, Register box, Register scratch, Register scratch2);
+  void fast_unlock(Register obj, Register box, Register scratch, Register scratch2);
+#endif
+#endif
+
+#ifdef AARCH64
+  // F(m) defines two wrappers for m: one forwarding (rt, address) to Assembler::m, one taking a Label resolved through target(L).
+#define F(mnemonic)                                             \
+  void mnemonic(Register rt, address target) {                  \
+    Assembler::mnemonic(rt, target);                            \
+  }                                                             \
+  void mnemonic(Register rt, Label& L) {                        \
+    Assembler::mnemonic(rt, target(L));                         \
+  }
+
+  F(cbz_w);
+  F(cbnz_w);
+  F(cbz);
+  F(cbnz);
+
+#undef F
+  // Same wrapper pair for the mnemonics that take an additional 'bit' argument.
+#define F(mnemonic)                                             \
+  void mnemonic(Register rt, int bit, address target) {         \
+    Assembler::mnemonic(rt, bit, target);                       \
+  }                                                             \
+  void mnemonic(Register rt, int bit, Label& L) {               \
+    Assembler::mnemonic(rt, bit, target(L));                    \
+  }
+
+  F(tbz);
+  F(tbnz);
+#undef F
+
+#endif // AARCH64
+
+};
+
+
+// The purpose of this class is to build several code fragments of the same size
+// in order to allow fast table branch.
+
+class FixedSizeCodeBlock VALUE_OBJ_CLASS_SPEC {
+public:
+  FixedSizeCodeBlock(MacroAssembler* masm, int size_in_instrs, bool enabled);
+  ~FixedSizeCodeBlock();  // NOTE(review): defined elsewhere; presumably where the fixed size is enforced -- confirm
+
+private:
+  MacroAssembler* _masm;  // presumably the assembler passed to the constructor
+  address _start;  // presumably the code position where the block begins -- confirm in the .cpp
+  int _size_in_instrs;  // block size, counted in instructions
+  bool _enabled;  // presumably lets callers switch the fixed-size handling off -- confirm
+};
+
+
+#endif // CPU_ARM_VM_MACROASSEMBLER_ARM_HPP
+
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/macroAssembler_arm.inline.hpp	2016-12-02 11:22:01.110174630 -0500
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_MACROASSEMBLER_ARM_INLINE_HPP
+#define CPU_ARM_VM_MACROASSEMBLER_ARM_INLINE_HPP
+
+#include "asm/assembler.inline.hpp"
+#include "asm/codeBuffer.hpp"
+#include "code/codeCache.hpp"
+#include "runtime/handles.inline.hpp"
+
+inline void MacroAssembler::pd_patch_instruction(address branch, address target) {  // rewrites the word(s) at 'branch' so they refer to 'target'
+  int instr = *(int*)branch;  // current 32-bit word at the patch site
+  int new_offset = (int)(target - branch NOT_AARCH64(- 8));  // byte distance to target; non-AArch64 builds subtract a further 8
+  assert((new_offset & 3) == 0, "bad alignment");
+  // Dispatch on the encoding found at the patch site.
+#ifdef AARCH64
+  if ((instr & (0x1f << 26)) == (0b00101 << 26)) {
+    // Unconditional B or BL
+    assert (is_offset_in_range(new_offset, 26), "offset is too large");
+    *(int*)branch = (instr & ~right_n_bits(26)) | encode_offset(new_offset, 26, 0);  // 26-bit offset field at bit 0
+  } else if ((instr & (0xff << 24)) == (0b01010100 << 24) && (instr & (1 << 4)) == 0) {
+    // Conditional B
+    assert (is_offset_in_range(new_offset, 19), "offset is too large");
+    *(int*)branch = (instr & ~(right_n_bits(19) << 5)) | encode_offset(new_offset, 19, 5);  // 19-bit offset field at bit 5
+  } else if ((instr & (0b111111 << 25)) == (0b011010 << 25)) {
+    // Compare & branch CBZ/CBNZ
+    assert (is_offset_in_range(new_offset, 19), "offset is too large");
+    *(int*)branch = (instr & ~(right_n_bits(19) << 5)) | encode_offset(new_offset, 19, 5);  // 19-bit offset field at bit 5
+  } else if ((instr & (0b111111 << 25)) == (0b011011 << 25)) {
+    // Test & branch TBZ/TBNZ
+    assert (is_offset_in_range(new_offset, 14), "offset is too large");
+    *(int*)branch = (instr & ~(right_n_bits(14) << 5)) | encode_offset(new_offset, 14, 5);  // 14-bit offset field at bit 5
+  } else if ((instr & (0b111011 << 24)) == (0b011000 << 24)) {
+    // LDR (literal)
+    unsigned opc = ((unsigned)instr >> 30);  // top two bits of the encoding
+    assert (opc != 0b01 || ((uintx)target & 7) == 0, "ldr target should be aligned");
+    assert (is_offset_in_range(new_offset, 19), "offset is too large");
+    *(int*)branch = (instr & ~(right_n_bits(19) << 5)) | encode_offset(new_offset, 19, 5);  // 19-bit offset field at bit 5
+  } else if (((instr & (1 << 31)) == 0) && ((instr & (0b11111 << 24)) == (0b10000 << 24))) {
+    // ADR
+    assert (is_imm_in_range(new_offset, 21, 0), "offset is too large");
+    instr = (instr & ~(right_n_bits(2) << 29)) | (new_offset & 3) << 29;  // low two offset bits go to bits 30..29
+    *(int*)branch = (instr & ~(right_n_bits(19) << 5)) | encode_imm(new_offset >> 2, 19, 0, 5);  // remaining bits go to the 19-bit field at bit 5
+  } else if((unsigned int)instr == address_placeholder_instruction) {
+    // address
+    assert (*(unsigned int *)(branch + InstructionSize) == address_placeholder_instruction, "address placeholder occupies two instructions");
+    *(intx*)branch = (intx)target;  // stores the absolute target as an intx over the placeholder
+  } else {
+    ::tty->print_cr("=============== instruction: 0x%x ================\n", instr);  // NOTE(review): if print_cr already ends the line, the "\n" yields an extra blank line
+    Unimplemented(); // TODO-AARCH64
+  }
+#else
+  if ((instr & 0x0e000000) == 0x0a000000) {
+    // B or BL instruction
+    assert(new_offset < 0x2000000 && new_offset > -0x2000000, "encoding constraint");
+    *(int*)branch = (instr & 0xff000000) | ((unsigned int)new_offset << 6 >> 8);  // keep the top byte; bits 25..2 of the offset fill the low 24 bits
+  } else if((unsigned int)instr == address_placeholder_instruction) {
+    // address
+    *(int*)branch = (int)target;  // the placeholder word is overwritten with the absolute address
+  } else if ((instr & 0x0fff0000) == 0x028f0000 || ((instr & 0x0fff0000) == 0x024f0000)) {
+    // ADR
+    int encoding = 0x8 << 20; // ADD
+    if (new_offset < 0) {
+      encoding = 0x4 << 20; // SUB
+      new_offset = -new_offset;  // from here on new_offset holds the magnitude
+    }
+    AsmOperand o(new_offset);  // presumably encodes the magnitude as an immediate operand -- confirm
+    *(int*)branch = (instr & 0xff0ff000) | encoding | o.encoding();  // mask clears bits 23..20 and 11..0 before merging
+  } else {
+    // LDR Rd, [PC, offset] instruction
+    assert((instr & 0x0f7f0000) == 0x051f0000, "Must be ldr_literal");
+    assert(new_offset < 4096 && new_offset > -4096, "encoding constraint");
+    if (new_offset >= 0) {
+      *(int*)branch = (instr & 0xff0ff000) | 9 << 20 | new_offset;  // 9 << 20 sets bits 23 and 20; offset goes in the low 12 bits
+    } else {
+      *(int*)branch = (instr & 0xff0ff000) | 1 << 20 | -new_offset;  // 1 << 20 sets bit 20 only; magnitude goes in the low 12 bits
+    }
+  }
+#endif // AARCH64
+}
+
+#endif // CPU_ARM_VM_MACROASSEMBLER_ARM_INLINE_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/metaspaceShared_arm.cpp	2016-12-02 11:22:06.350471801 -0500
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "assembler_arm.inline.hpp"
+#include "memory/metaspaceShared.hpp"
+
+// Generate the self-patching vtable method:
+//
+// This method will be called (as any other Klass virtual method) with
+// the Klass itself as the first argument.  Example:
+//
+//      oop obj;
+//      int size = obj->klass()->oop_size(this);
+//
+// for which the virtual method call is Klass::oop_size();
+//
+// The dummy method is called with the Klass object as the first
+// operand, and an object as the second argument.
+//
+
+//=====================================================================
+
+// All of the dummy methods in the vtable are essentially identical,
+// differing only by an ordinal constant, and they bear no relationship
+// to the original method which the caller intended. Also, there needs
+// to be 'vtbl_list_size' instances of the vtable in order to
+// differentiate between the 'vtbl_list_size' original Klass objects.
+
+#define __ masm->
+
+void MetaspaceShared::generate_vtable_methods(void** vtbl_list,
+                                                   void** vtable,
+                                                   char** md_top,
+                                                   char* md_end,
+                                                   char** mc_top,
+                                                   char* mc_end) {  // NOTE(review): md_end is never consulted in this body
+  intptr_t vtable_bytes = (num_virtuals * vtbl_list_size) * sizeof(void*);  // size of the dummy vtable area in bytes
+  *(intptr_t *)(*md_top) = vtable_bytes;  // the byte count is stored just ahead of the table
+  *md_top += sizeof(intptr_t);
+  void** dummy_vtable = (void**)*md_top;  // the table itself starts right after the count
+  *vtable = dummy_vtable;  // hand the table start back to the caller
+  *md_top += vtable_bytes;  // advance past the table
+  // Stubs are emitted directly into the region from *mc_top up to mc_end.
+  CodeBuffer cb((unsigned char*)*mc_top, mc_end - *mc_top);
+  MacroAssembler* masm = new MacroAssembler(&cb);
+  // One group of num_virtuals stubs per vtbl_list entry; all stubs of a group branch to a shared tail.
+  for (int i = 0; i < vtbl_list_size; ++i) {
+    Label common_code;
+    for (int j = 0; j < num_virtuals; ++j) {
+      dummy_vtable[num_virtuals * i + j] = (void*) __ pc();  // table slot points at the stub emitted next
+      __ mov(Rtemp, j);  // Rtemp contains an index of a virtual method in the table
+      __ b(common_code);
+    }
+    // Shared tail for group i: reads vtbl_list[i], stores it through R0 and jumps to the method at index Rtemp.
+    InlinedAddress vtable_address((address)&vtbl_list[i]);
+    __ bind(common_code);
+    const Register tmp2 = AARCH64_ONLY(Rtemp2) NOT_AARCH64(R4);
+    assert_different_registers(Rtemp, tmp2);
+#ifndef AARCH64
+    __ push(tmp2);  // non-AArch64 builds save tmp2 here and restore it before the jump
+#endif // !AARCH64
+    // Do not use ldr_global since the code must be portable across all ARM architectures
+    __ ldr_literal(tmp2, vtable_address);
+    __ ldr(tmp2, Address(tmp2));                              // get correct vtable address
+    __ ldr(Rtemp, Address::indexed_ptr(tmp2, Rtemp));         // get real method pointer
+    __ str(tmp2, Address(R0));                                // update vtable. R0 = "this"
+#ifndef AARCH64
+    __ pop(tmp2);
+#endif // !AARCH64
+    __ jump(Rtemp);
+    __ bind_literal(vtable_address);  // the inlined literal is bound after the jump
+  }
+  // Report the end of the emitted code back through mc_top.
+  __ flush();
+  *mc_top = (char*) __ pc();
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/methodHandles_arm.cpp	2016-12-02 11:22:11.362756039 -0500
@@ -0,0 +1,587 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+// This file mirrors as much as possible methodHandles_x86.cpp to ease
+// cross platform development for JSR292.
+// Last synchronization: changeset f8c9417e3571
+
+#include "precompiled.hpp"
+#include "classfile/javaClasses.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "memory/allocation.inline.hpp"
+#include "memory/resourceArea.hpp"
+#include "prims/methodHandles.hpp"
+
+#define __ _masm->
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) __ block_comment(str)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+void MethodHandles::load_klass_from_Class(MacroAssembler* _masm, Register klass_reg, Register temp1, Register temp2) {
+  if (VerifyMethodHandles) {
+    verify_klass(_masm, klass_reg, temp1, temp2, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_Class), "MH argument is a Class");
+  }
+  const Address klass_field(klass_reg, java_lang_Class::klass_offset_in_bytes());  // slot at klass_offset_in_bytes() of the Class
+  __ ldr(klass_reg, klass_field);  // klass_reg is overwritten with the loaded word
+}
+
+#ifdef ASSERT
+static int check_nonzero(const char* xname, int x) {  // asserts x != 0 (naming xname in the message) and returns x unchanged
+  assert(x != 0, "%s should be nonzero", xname);
+  return x;
+}
+#define NONZERO(x) check_nonzero(#x, x)  // the stringified expression becomes the name shown on failure
+#else //ASSERT
+#define NONZERO(x) (x)  // without ASSERT: plain pass-through
+#endif //ASSERT
+
+#ifdef ASSERT
+void MethodHandles::verify_klass(MacroAssembler* _masm,
+                                 Register obj, Register temp1, Register temp2, SystemDictionary::WKID klass_id,
+                                 const char* error_message) {  // emits code that stops with error_message unless obj passes the klass checks below
+  InstanceKlass** klass_addr = SystemDictionary::well_known_klass_addr(klass_id);
+  KlassHandle klass = SystemDictionary::well_known_klass(klass_id);
+  Label L_ok, L_bad;
+  BLOCK_COMMENT("verify_klass {");
+  __ verify_oop(obj);
+  __ cbz(obj, L_bad);  // a zero obj fails the check
+  __ load_klass(temp1, obj);
+  __ lea(temp2, ExternalAddress((address) klass_addr));
+  __ ldr(temp2, temp2); // the cmpptr on x86 dereferences the AddressLiteral (not lea)
+  __ cmp(temp1, temp2);
+  __ b(L_ok, eq);  // obj's klass equals the well-known klass
+  intptr_t super_check_offset = klass->super_check_offset();  // read at code-generation time, baked into the load below
+  __ ldr(temp1, Address(temp1, super_check_offset));
+  __ cmp(temp1, temp2);
+  __ b(L_ok, eq);  // the word at super_check_offset in obj's klass equals the well-known klass
+  // Reached by fall-through, or directly when obj is zero: report the failure.
+  __ bind(L_bad);
+  __ stop(error_message);
+  __ BIND(L_ok);
+  BLOCK_COMMENT("} verify_klass");
+}
+
+void MethodHandles::verify_ref_kind(MacroAssembler* _masm, int ref_kind, Register member_reg, Register temp) {
+  Label L_kind_ok;  // bound at the end; reached when the extracted reference kind equals ref_kind
+  BLOCK_COMMENT("verify_ref_kind {");
+  __ ldr_u32(temp, Address(member_reg, NONZERO(java_lang_invoke_MemberName::flags_offset_in_bytes())));
+  __ logical_shift_right(temp, temp, java_lang_invoke_MemberName::MN_REFERENCE_KIND_SHIFT);
+  __ andr(temp, temp, (unsigned)java_lang_invoke_MemberName::MN_REFERENCE_KIND_MASK);
+  __ cmp(temp, ref_kind);
+  __ b(L_kind_ok, eq);
+  char* msg = NEW_C_HEAP_ARRAY(char, 100, mtInternal);  // not freed here: handed to stop() as its message
+  jio_snprintf(msg, 100, "verify_ref_kind expected %x", ref_kind);
+  // could do this for all ref_kinds, but would explode assembly code size
+  const bool trace_failure = (ref_kind == JVM_REF_invokeVirtual || ref_kind == JVM_REF_invokeSpecial);
+  if (trace_failure) {
+    trace_method_handle(_masm, msg);
+  }
+  __ stop(msg);
+  BLOCK_COMMENT("} verify_ref_kind");
+  __ bind(L_kind_ok);
+}
+
+#endif //ASSERT
+
+void MethodHandles::jump_from_method_handle(MacroAssembler* _masm, bool for_compiler_entry) {  // emits the final jump into the method held in Rmethod
+  Label L_no_such_method;
+  __ cbz(Rmethod, L_no_such_method);  // zero Rmethod: take the AbstractMethodError path at the end
+
+  // Note: JVMTI overhead seems small enough compared to invocation
+  // cost and is not worth the complexity or code size overhead of
+  // supporting several variants of each adapter.
+  if (!for_compiler_entry && (JvmtiExport::can_post_interpreter_events())) {
+    // JVMTI events, such as single-stepping, are implemented partly by avoiding running
+    // compiled code in threads for which the event is enabled.  Check here for
+    // interp_only_mode if these events CAN be enabled.
+    __ ldr_s32(Rtemp, Address(Rthread, JavaThread::interp_only_mode_offset()));
+#ifdef AARCH64
+    Label L;
+    __ cbz(Rtemp, L);  // interp_only_mode == 0: skip the interpreter-entry jump
+    __ indirect_jump(Address(Rmethod, Method::interpreter_entry_offset()), Rtemp);
+    __ bind(L);
+#else
+    __ cmp(Rtemp, 0);
+    __ ldr(PC, Address(Rmethod, Method::interpreter_entry_offset()), ne);  // conditional load into PC, taken only when interp_only_mode != 0
+#endif // AARCH64
+  }
+  const ByteSize entry_offset = for_compiler_entry ? Method::from_compiled_offset() :
+                                                     Method::from_interpreted_offset();
+  // Regular path: jump through the entry selected by for_compiler_entry.
+  __ indirect_jump(Address(Rmethod, entry_offset), Rtemp);
+
+  __ bind(L_no_such_method);
+  // throw exception
+  __ jump(StubRoutines::throw_AbstractMethodError_entry(), relocInfo::runtime_call_type, Rtemp);
+}
+
+void MethodHandles::jump_to_lambda_form(MacroAssembler* _masm,
+                                        Register recv, Register tmp,
+                                        bool for_compiler_entry) {  // loads Rmethod from recv's form/vmentry/vmtarget chain, then jumps to it
+  BLOCK_COMMENT("jump_to_lambda_form {");
+  // This is the initial entry point of a lazy method handle.
+  // After type checking, it picks up the invoker from the LambdaForm.
+  assert_different_registers(recv, tmp, Rmethod);
+
+  // Load the invoker, as MH -> MH.form -> LF.vmentry
+  __ load_heap_oop(tmp, Address(recv, NONZERO(java_lang_invoke_MethodHandle::form_offset_in_bytes())));
+  __ verify_oop(tmp);
+
+  __ load_heap_oop(tmp, Address(tmp, NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes())));
+  __ verify_oop(tmp);
+
+  // the following assumes that a Method* is normally compressed in the vmtarget field:
+  __ ldr(Rmethod, Address(tmp, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes())));  // NOTE(review): plain ldr here; reconcile with the "compressed" remark above
+
+  if (VerifyMethodHandles && !for_compiler_entry) {
+    // make sure recv is already on stack
+    __ ldr(tmp, Address(Rmethod, Method::const_offset()));  // tmp = value at Method::const_offset() of Rmethod
+    __ load_sized_value(tmp,
+                        Address(tmp, ConstMethod::size_of_parameters_offset()),
+                        sizeof(u2), /*is_signed*/ false);
+    // assert(sizeof(u2) == sizeof(Method::_size_of_parameters), "");
+    Label L;
+    __ ldr(tmp, __ receiver_argument_address(Rparams, tmp, tmp));  // tmp is passed as both of the last two arguments, then overwritten by the load
+    __ cmp(tmp, recv);
+    __ b(L, eq);  // the loaded receiver slot matches recv
+    __ stop("receiver not on stack");
+    __ bind(L);
+  }
+
+  jump_from_method_handle(_masm, for_compiler_entry);
+  BLOCK_COMMENT("} jump_to_lambda_form");
+}
+
+
+// Code generation
+address MethodHandles::generate_method_handle_interpreter_entry(MacroAssembler* _masm,
+                                                                vmIntrinsics::ID iid) {  // returns the emitted entry address, or NULL for the two iids handled first
+  const bool not_for_compiler_entry = false;  // this is the interpreter entry
+  assert(is_signature_polymorphic(iid), "expected invoke iid");
+  if (iid == vmIntrinsics::_invokeGeneric ||
+      iid == vmIntrinsics::_compiledLambdaForm) {
+    // Perhaps surprisingly, the user-visible names, and linkToCallSite, are not directly used.
+    // They are linked to Java-generated adapters via MethodHandleNatives.linkMethod.
+    // They all require an extra argument.
+    __ should_not_reach_here();           // empty stubs make SG sick
+    return NULL;  // no entry address is reported for these iids
+  }
+
+  // Rmethod: Method*
+  // Rparams (SP on 32-bit ARM): pointer to parameters
+  // Rsender_sp (R4/R19): sender SP (must preserve; see prepare_to_jump_from_interpreted)
+  // R5_mh: receiver method handle (must load from sp[MethodTypeForm.vmslots])
+  // R1, R2, Rtemp: garbage temp, blown away
+
+  // Use same name as x86 to ease future merges
+  Register rdx_temp       = R2_tmp;
+  Register rdx_param_size = rdx_temp;  // size of parameters
+  Register rax_temp       = R1_tmp;
+  Register rcx_mh         = R5_mh;     // MH receiver; dies quickly and is recycled
+  Register rbx_method     = Rmethod;   // eventual target of this invocation
+  Register rdi_temp       = Rtemp;
+
+  // here's where control starts out:
+  __ align(CodeEntryAlignment);
+  address entry_point = __ pc();
+
+  if (VerifyMethodHandles) {
+    Label L;
+    BLOCK_COMMENT("verify_intrinsic_id {");
+    __ ldrh(rdi_temp, Address(rbx_method, Method::intrinsic_id_offset_in_bytes()));
+    __ sub_slow(rdi_temp, rdi_temp, (int) iid);  // rdi_temp = loaded intrinsic id minus the expected iid
+    __ cbz(rdi_temp, L);  // zero difference: the ids match
+    if (iid == vmIntrinsics::_linkToVirtual ||
+        iid == vmIntrinsics::_linkToSpecial) {
+      // could do this for all kinds, but would explode assembly code size
+      trace_method_handle(_masm, "bad Method*::intrinsic_id");
+    }
+    __ stop("bad Method*::intrinsic_id");
+    __ bind(L);
+    BLOCK_COMMENT("} verify_intrinsic_id");
+  }
+
+  // First task:  Find out how big the argument list is.
+  Address rdx_first_arg_addr;
+  int ref_kind = signature_polymorphic_intrinsic_ref_kind(iid);
+  assert(ref_kind != 0 || iid == vmIntrinsics::_invokeBasic, "must be _invokeBasic or a linkTo intrinsic");
+  if (ref_kind == 0 || MethodHandles::ref_kind_has_receiver(ref_kind)) {
+    __ ldr(rdx_param_size, Address(rbx_method, Method::const_offset()));
+    __ load_sized_value(rdx_param_size,
+                        Address(rdx_param_size, ConstMethod::size_of_parameters_offset()),
+                        sizeof(u2), /*is_signed*/ false);
+    // assert(sizeof(u2) == sizeof(Method::_size_of_parameters), "");
+    rdx_first_arg_addr = __ receiver_argument_address(Rparams, rdx_param_size, rdi_temp);
+  } else {
+    DEBUG_ONLY(rdx_param_size = noreg);  // debug builds only: the C++ alias is reset to noreg
+  }
+
+  if (!is_signature_polymorphic_static(iid)) {
+    __ ldr(rcx_mh, rdx_first_arg_addr);  // fetch the first argument into rcx_mh
+    DEBUG_ONLY(rdx_param_size = noreg);
+  }
+
+  // rdx_first_arg_addr is live!
+
+  trace_method_handle_interpreter_entry(_masm, iid);
+
+  if (iid == vmIntrinsics::_invokeBasic) {
+    generate_method_handle_dispatch(_masm, iid, rcx_mh, noreg, not_for_compiler_entry);
+
+  } else {
+    // Adjust argument list by popping the trailing MemberName argument.
+    Register rcx_recv = noreg;
+    if (MethodHandles::ref_kind_has_receiver(ref_kind)) {
+      // Load the receiver (not the MH; the actual MemberName's receiver) up from the interpreter stack.
+      __ ldr(rcx_recv = rcx_mh, rdx_first_arg_addr);  // rcx_recv aliases rcx_mh from here on
+      DEBUG_ONLY(rdx_param_size = noreg);
+    }
+    Register rbx_member = rbx_method;  // MemberName ptr; incoming method ptr is dead now
+#ifdef AARCH64
+    __ ldr(rbx_member, Address(Rparams, Interpreter::stackElementSize, post_indexed));  // post-indexed load: Rparams advances by one stack element
+#else
+    __ pop(rbx_member);
+#endif
+    generate_method_handle_dispatch(_masm, iid, rcx_recv, rbx_member, not_for_compiler_entry);
+  }
+  return entry_point;
+}
+
+void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
+                                                    vmIntrinsics::ID iid,
+                                                    Register receiver_reg,
+                                                    Register member_reg,
+                                                    bool for_compiler_entry) {
+  assert(is_signature_polymorphic(iid), "expected invoke iid");
+  // Use same name as x86 to ease future merges
+  Register rbx_method = Rmethod;   // eventual target of this invocation
+  // temps used in this code are not used in *either* compiled or interpreted calling sequences
+  Register temp1 = (for_compiler_entry ? saved_last_sp_register() : R1_tmp);
+  Register temp2 = AARCH64_ONLY(R9) NOT_AARCH64(R8);
+  Register temp3 = Rtemp; // R12/R16
+  Register temp4 = AARCH64_ONLY(Rtemp2) NOT_AARCH64(R5);
+  if (for_compiler_entry) {
+    assert(receiver_reg == (iid == vmIntrinsics::_linkToStatic ? noreg : j_rarg0), "only valid assignment");
+#ifdef AARCH64
+    assert_different_registers(temp1, j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5, j_rarg6, j_rarg7);
+    assert_different_registers(temp2, j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5, j_rarg6, j_rarg7);
+    assert_different_registers(temp3, j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5, j_rarg6, j_rarg7);
+    assert_different_registers(temp4, j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5, j_rarg6, j_rarg7);
+#else
+    assert_different_registers(temp1, j_rarg0, j_rarg1, j_rarg2, j_rarg3);
+    assert_different_registers(temp2, j_rarg0, j_rarg1, j_rarg2, j_rarg3);
+    assert_different_registers(temp3, j_rarg0, j_rarg1, j_rarg2, j_rarg3);
+    assert_different_registers(temp4, j_rarg0, j_rarg1, j_rarg2, j_rarg3);
+#endif // AARCH64
+  }
+  assert_different_registers(temp1, temp2, temp3, receiver_reg);
+  assert_different_registers(temp1, temp2, temp3, temp4, member_reg);
+  if (!for_compiler_entry)
+    assert_different_registers(temp1, temp2, temp3, temp4, saved_last_sp_register());  // don't trash lastSP
+
+  if (iid == vmIntrinsics::_invokeBasic) {
+    // indirect through MH.form.exactInvoker.vmtarget
+    jump_to_lambda_form(_masm, receiver_reg, temp3, for_compiler_entry);
+
+  } else {
+    // The method is a member invoker used by direct method handles.
+    if (VerifyMethodHandles) {
+      // make sure the trailing argument really is a MemberName (caller responsibility)
+      verify_klass(_masm, member_reg, temp2, temp3, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_invoke_MemberName),
+                   "MemberName required for invokeVirtual etc.");
+    }
+
+    Address member_clazz(   member_reg, NONZERO(java_lang_invoke_MemberName::clazz_offset_in_bytes()));
+    Address member_vmindex( member_reg, NONZERO(java_lang_invoke_MemberName::vmindex_offset_in_bytes()));
+    Address member_vmtarget(member_reg, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes()));
+
+    Register temp1_recv_klass = temp1;
+    if (iid != vmIntrinsics::_linkToStatic) {
+      if (iid == vmIntrinsics::_linkToSpecial) {
+        // Don't actually load the klass; just null-check the receiver.
+        __ null_check(receiver_reg, temp3);
+      } else {
+        // load receiver klass itself
+        __ null_check(receiver_reg, temp3, oopDesc::klass_offset_in_bytes());
+        __ load_klass(temp1_recv_klass, receiver_reg);
+        __ verify_klass_ptr(temp1_recv_klass);
+      }
+      BLOCK_COMMENT("check_receiver {");
+      // The receiver for the MemberName must be in receiver_reg.
+      // Check the receiver against the MemberName.clazz
+      if (VerifyMethodHandles && iid == vmIntrinsics::_linkToSpecial) {
+        // Did not load it above...
+        __ load_klass(temp1_recv_klass, receiver_reg);
+        __ verify_klass_ptr(temp1_recv_klass);
+      }
+      // Check the receiver against the MemberName.clazz
+      if (VerifyMethodHandles && iid != vmIntrinsics::_linkToInterface) {
+        Label L_ok;
+        Register temp2_defc = temp2;
+        __ load_heap_oop(temp2_defc, member_clazz);
+        load_klass_from_Class(_masm, temp2_defc, temp3, temp4);
+        __ verify_klass_ptr(temp2_defc);
+#ifdef AARCH64
+        // TODO-AARCH64
+        __ b(L_ok);
+#else
+        __ check_klass_subtype(temp1_recv_klass, temp2_defc, temp3, temp4, noreg, L_ok);
+#endif
+        // If we get here, the type check failed!
+        __ stop("receiver class disagrees with MemberName.clazz");
+        __ bind(L_ok);
+      }
+      BLOCK_COMMENT("} check_receiver");
+    }
+    if (iid == vmIntrinsics::_linkToSpecial ||
+        iid == vmIntrinsics::_linkToStatic) {
+      DEBUG_ONLY(temp1_recv_klass = noreg);  // these guys didn't load the recv_klass
+    }
+
+    // Live registers at this point:
+    //  member_reg - MemberName that was the extra argument
+    //  temp1_recv_klass - klass of stacked receiver, if needed
+
+    Label L_incompatible_class_change_error;
+    switch (iid) {
+    case vmIntrinsics::_linkToSpecial:
+      if (VerifyMethodHandles) {
+        verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3);
+      }
+      __ ldr(Rmethod, member_vmtarget);
+      break;
+
+    case vmIntrinsics::_linkToStatic:
+      if (VerifyMethodHandles) {
+        verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3);
+      }
+      __ ldr(Rmethod, member_vmtarget);
+      break;
+
+    case vmIntrinsics::_linkToVirtual:
+    {
+      // same as TemplateTable::invokevirtual,
+      // minus the CP setup and profiling:
+
+      if (VerifyMethodHandles) {
+        verify_ref_kind(_masm, JVM_REF_invokeVirtual, member_reg, temp3);
+      }
+
+      // pick out the vtable index from the MemberName, and then we can discard it:
+      Register temp2_index = temp2;
+      __ ldr(temp2_index, member_vmindex);
+
+      if (VerifyMethodHandles) {
+        Label L_index_ok;
+        __ cmp(temp2_index, 0);
+        __ b(L_index_ok, ge);
+        __ stop("no virtual index");
+        __ bind(L_index_ok);
+      }
+
+      // Note:  The verifier invariants allow us to ignore MemberName.clazz and vmtarget
+      // at this point.  And VerifyMethodHandles has already checked clazz, if needed.
+
+      // get target Method* & entry point
+      __ lookup_virtual_method(temp1_recv_klass, temp2_index, Rmethod);
+      break;
+    }
+
+    case vmIntrinsics::_linkToInterface:
+    {
+      // same as TemplateTable::invokeinterface
+      // (minus the CP setup and profiling, with different argument motion)
+      if (VerifyMethodHandles) {
+        verify_ref_kind(_masm, JVM_REF_invokeInterface, member_reg, temp3);
+      }
+
+      Register temp3_intf = temp3;
+      __ load_heap_oop(temp3_intf, member_clazz);
+      load_klass_from_Class(_masm, temp3_intf, temp2, temp4);
+      __ verify_klass_ptr(temp3_intf);
+
+      Register rbx_index = rbx_method;
+      __ ldr(rbx_index, member_vmindex);
+      if (VerifyMethodHandles) {
+        Label L;
+        __ cmp(rbx_index, 0);
+        __ b(L, ge);
+        __ stop("invalid vtable index for MH.invokeInterface");
+        __ bind(L);
+      }
+
+      // given intf, index, and recv klass, dispatch to the implementation method
+      Label L_no_such_interface;
+      __ lookup_interface_method(temp1_recv_klass, temp3_intf,
+                                 // note: next two args must be the same:
+                                 rbx_index, rbx_method,
+                                 temp2, temp4,
+                                 L_incompatible_class_change_error);
+      break;
+    }
+
+    default:
+      fatal("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid));
+      break;
+    }
+
+    // Live at this point:
+    //   Rmethod (target method)
+    //   Rsender_sp, Rparams (if interpreted)
+    //   register arguments (if compiled)
+
+    // After figuring out which concrete method to call, jump into it.
+    __ verify_method_ptr(Rmethod);
+    jump_from_method_handle(_masm, for_compiler_entry);
+
+    if (iid == vmIntrinsics::_linkToInterface) {
+      __ bind(L_incompatible_class_change_error);
+      __ jump(StubRoutines::throw_IncompatibleClassChangeError_entry(), relocInfo::runtime_call_type, Rtemp);
+    }
+  }
+}
+
+
+#ifndef PRODUCT
+enum { // limits used by trace_method_handle_stub when sanity-checking traced stack pointers
+  ARG_LIMIT = 255, SLOP = 4,
+  // use this parameter for checking for garbage stack movements:
+  UNREASONABLE_STACK_MOVE = (ARG_LIMIT + SLOP)
+  // the slop defends against false alarms due to fencepost errors
+};
+
+#ifdef AARCH64
+const int trace_mh_nregs = 32; // R0-R30, PC
+#else
+const int trace_mh_nregs = 15; // R0-R12, LR, PC (same order as trace_mh_regs)
+const Register trace_mh_regs[trace_mh_nregs] =
+  {R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, PC};
+#endif // AARCH64
+
+void trace_method_handle_stub(const char* adaptername,
+                              intptr_t* saved_regs, // trace_mh_nregs saved register values; indexed by register encoding below
+                              intptr_t* saved_bp,   // frame pointer passed by the tracing stub
+                              oop mh) {             // method handle candidate; only inspected when has_mh is true
+  // Prints one trace record for a method handle stub. Called as a leaf from native code: do not block the JVM!
+  bool has_mh = (strstr(adaptername, "/static") == NULL &&
+                 strstr(adaptername, "linkTo") == NULL);    // static linkers don't have MH
+  intptr_t* entry_sp = (intptr_t*) &saved_regs[trace_mh_nregs]; // just after the saved regs
+  intptr_t* saved_sp = (intptr_t*)  saved_regs[Rsender_sp->encoding()]; // save of Rsender_sp
+  intptr_t* last_sp  = (intptr_t*)  saved_bp[AARCH64_ONLY(frame::interpreter_frame_stack_top_offset) NOT_AARCH64(frame::interpreter_frame_last_sp_offset)];
+  intptr_t* base_sp  = last_sp; // never adjusted afterwards, so the stack_size printed below is always 0
+
+  intptr_t    mh_reg = (intptr_t)saved_regs[R5_mh->encoding()];
+  const char* mh_reg_name = "R5_mh";
+  if (!has_mh)  mh_reg_name = "R5"; // the register does not hold a method handle for static linkers
+  tty->print_cr("MH %s %s=" PTR_FORMAT " sp=(" PTR_FORMAT "+" INTX_FORMAT ") stack_size=" INTX_FORMAT " bp=" PTR_FORMAT,
+                adaptername, mh_reg_name, mh_reg,
+                (intptr_t)entry_sp, (intptr_t)saved_sp - (intptr_t)entry_sp, (intptr_t)(base_sp - last_sp), (intptr_t)saved_bp);
+
+  if (last_sp != saved_sp && last_sp != NULL)
+    tty->print_cr("*** last_sp=" INTPTR_FORMAT, p2i(last_sp));
+  if (Verbose) { // 1st verbose section: raw dump of every saved register
+    tty->print(" reg dump: ");
+    int i;
+    for (i = 0; i < trace_mh_nregs; i++) {
+      if (i > 0 && i % AARCH64_ONLY(2) NOT_AARCH64(4) == 0) // new output line every 2 (AArch64) or 4 (32-bit) registers
+        tty->print("\n   + dump: ");
+#ifdef AARCH64
+      const char* reg_name = (i == trace_mh_nregs-1) ? "pc" : as_Register(i)->name();
+#else
+      const char* reg_name = trace_mh_regs[i]->name();
+#endif
+      tty->print(" %s: " INTPTR_FORMAT, reg_name, p2i((void *)saved_regs[i]));
+    }
+    tty->cr();
+  }
+
+  if (Verbose) { // 2nd verbose section: layout of the frame described by entry_sp/saved_bp/LR
+    // dump last frame (from JavaThread::print_frame_layout)
+
+    // Note: code is robust but the dumped information may not be
+    // 100% correct, particularly with respect to the dumped
+    // "unextended_sp". Getting it right for all trace_method_handle
+    // call paths is not worth the complexity/risk. The correct slot
+    // will be identified by *Rsender_sp anyway in the dump.
+    JavaThread* p = JavaThread::active();
+
+    ResourceMark rm;
+    PRESERVE_EXCEPTION_MARK;
+    FrameValues values;
+
+    intptr_t* dump_fp = (intptr_t *) saved_bp;
+    address dump_pc = (address) saved_regs[trace_mh_nregs-2]; // LR (with LR,PC last in saved_regs)
+    frame dump_frame((intptr_t *)entry_sp, dump_fp, dump_pc);
+
+    dump_frame.describe(values, 1);
+    // mark Rsender_sp if seems valid
+    if (has_mh) {
+      if ((saved_sp >= entry_sp - UNREASONABLE_STACK_MOVE) && (saved_sp < dump_fp)) {
+        values.describe(-1, saved_sp, "*Rsender_sp");
+      }
+    }
+
+    // Note: the unextended_sp may not be correct
+    tty->print_cr("  stack layout:");
+    values.print(p);
+  }
+  if (Verbose) { // 3rd verbose section: print the method handle object itself
+    if (has_mh && mh->is_oop()) {
+      mh->print();
+      if (java_lang_invoke_MethodHandle::is_instance(mh)) {
+        if (java_lang_invoke_MethodHandle::form_offset_in_bytes() != 0) // skip the form while its offset is still 0
+          java_lang_invoke_MethodHandle::form(mh)->print();
+      }
+    }
+  }
+}
+
+void MethodHandles::trace_method_handle(MacroAssembler* _masm, const char* adaptername) { // emits a call to trace_method_handle_stub
+  if (!TraceMethodHandles)  return; // nothing is emitted unless tracing is enabled
+  BLOCK_COMMENT("trace_method_handle {");
+  // register saving
+  //  must correspond to trace_mh_nregs and trace_mh_regs defined above
+  int push_size = __ save_all_registers();
+  assert(trace_mh_nregs*wordSize == push_size,"saved register count mismatch");
+
+  __ mov_slow(R0, adaptername); // arg 0: adaptername
+  __ mov(R1, SP); // entry_sp (after pushes)
+  __ mov(R2, FP); // arg 2: saved_bp
+  if (R5_mh != R3) {
+    assert_different_registers(R0, R1, R2, R5_mh); // R5_mh must not have been clobbered by the moves above
+    __ mov(R3, R5_mh); // arg 3: mh
+  }
+
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, trace_method_handle_stub), R0, R1, R2, R3);
+
+  __ restore_all_registers();
+  BLOCK_COMMENT("} trace_method_handle");
+}
+#endif //PRODUCT
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/methodHandles_arm.hpp	2016-12-02 11:22:16.723060015 -0500
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+// Platform-specific definitions for method handles.
+// These definitions are inlined into class MethodHandles.
+
+// Adapters
+enum /* platform_dependent_constants */ {
+  adapter_code_size = 18000 NOT_PRODUCT(+ 30000) // the NOT_PRODUCT term enlarges the budget; presumably only in non-product builds
+};
+
+// Additional helper methods for MethodHandles code generation:
+public:
+  static void load_klass_from_Class(MacroAssembler* _masm, Register klass_reg, Register temp1, Register temp2);
+
+  static void verify_klass(MacroAssembler* _masm,
+                           Register obj, Register temp1, Register temp2, SystemDictionary::WKID klass_id,
+                           const char* error_message = "wrong klass") NOT_DEBUG_RETURN; // NOTE(review): presumably an empty body in non-debug builds
+
+  static void verify_ref_kind(MacroAssembler* _masm, int ref_kind, Register member_reg, Register temp) NOT_DEBUG_RETURN;
+
+  // Similar to InterpreterMacroAssembler::jump_from_interpreted.
+  // Takes care of special dispatch from single stepping too.
+  // Rmethod should contain the target Method*.
+  static void jump_from_method_handle(MacroAssembler* _masm, bool for_compiler_entry);
+
+  static void jump_to_lambda_form(MacroAssembler* _masm,
+                                  Register recv, Register tmp,
+                                  bool for_compiler_entry);
+
+  static Register saved_last_sp_register() {
+    // Should be in sharedRuntime, not here.
+    return Rsender_sp;
+  }
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/nativeInst_arm.hpp	2016-12-02 11:22:22.051362178 -0500
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_NATIVEINST_ARM_HPP
+#define CPU_ARM_VM_NATIVEINST_ARM_HPP
+
+#include "asm/macroAssembler.hpp"
+#include "memory/allocation.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/os.hpp"
+
+
+#ifdef AARCH64
+#include "nativeInst_arm_64.hpp"
+#else
+#include "nativeInst_arm_32.hpp"
+#endif
+
+
+#endif // CPU_ARM_VM_NATIVEINST_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/nativeInst_arm_32.cpp	2016-12-02 11:22:27.323661161 -0500
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "assembler_arm.inline.hpp"
+#include "code/codeCache.hpp"
+#include "memory/resourceArea.hpp"
+#include "nativeInst_arm.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/handles.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "utilities/ostream.hpp"
+#ifdef COMPILER1
+#include "c1/c1_Runtime1.hpp"
+#endif
+#include "code/icBuffer.hpp"
+
+int NativeMovRegMem::offset() const { // decodes the immediate displacement from this single instruction's encoding
+  switch (kind()) {
+    case instr_ldr_str:
+      return encoding() & 0xfff; // low 12 bits
+    case instr_ldrh_strh:
+      return (encoding() & 0x0f) | ((encoding() >> 4) & 0xf0); // low nibble in bits 3:0, high nibble in bits 11:8
+    case instr_fld_fst:
+      return (encoding() & 0xff) << 2; // low 8 bits, scaled by 4
+    default:
+      ShouldNotReachHere();
+      return 0;
+  }
+}
+
+void NativeMovRegMem::set_offset(int x) { // patches the displacement of this load/store to x
+  assert(x >= 0 && x < 65536, "encoding constraint");
+  const int Rt = Rtemp->encoding(); // scratch register used by the two-instruction form
+
+  // If offset is too large to be placed into single ldr/str instruction, we replace
+  //   ldr  Rd, [Rn, #offset]
+  //   nop
+  // with
+  //   add  Rtemp, Rn, #offset_hi
+  //   ldr  Rd, [Rtemp, #offset_lo]
+  switch (kind()) {
+    case instr_ldr_str:
+      if (x < 4096) { // fits in the 12-bit field: patch in place
+        set_encoding((encoding() & 0xfffff000) | x);
+      } else {
+        NativeInstruction* next = nativeInstruction_at(next_raw_instruction_address());
+        assert(next->is_nop(), "must be");
+        next->set_encoding((encoding() & 0xfff0f000) | Rt << 16 | (x & 0xfff)); // original access, rebased on Rtemp with the low 12 bits
+        this->set_encoding((encoding() & 0x000f0000) | Rt << 12 | x >> 12 | 0xe2800a00); // add Rtemp, Rn, #offset_hi
+      }
+      break;
+    case instr_ldrh_strh:
+      if (x < 256) { // fits in the split 8-bit field: patch in place
+        set_encoding((encoding() & 0xfffff0f0) | (x & 0x0f) | (x & 0xf0) << 4);
+      } else {
+        NativeInstruction* next = nativeInstruction_at(next_raw_instruction_address());
+        assert(next->is_nop(), "must be");
+        next->set_encoding((encoding() & 0xfff0f0f0) | Rt << 16 | (x & 0x0f) | (x & 0xf0) << 4); // original access, rebased on Rtemp with the low 8 bits
+        this->set_encoding((encoding() & 0x000f0000) | Rt << 12 | x >> 8 | 0xe2800c00); // add Rtemp, Rn, #offset_hi
+      }
+      break;
+    case instr_fld_fst:
+      if (x < 1024) { // fits in the 8-bit field (stored as x / 4): patch in place
+        set_encoding((encoding() & 0xffffff00) | (x >> 2));
+      } else {
+        NativeInstruction* next = nativeInstruction_at(next_raw_instruction_address());
+        assert(next->is_nop(), "must be");
+        next->set_encoding((encoding() & 0xfff0ff00) | Rt << 16 | ((x >> 2) & 0xff)); // original access, rebased on Rtemp with the low bits
+        this->set_encoding((encoding() & 0x000f0000) | Rt << 12 | x >> 10 | 0xe2800b00); // add Rtemp, Rn, #offset_hi
+      }
+      break;
+    default:
+      ShouldNotReachHere();
+  }
+}
+
+intptr_t NativeMovConstReg::data() const { // returns the constant loaded by this instruction (pair)
+  RawNativeInstruction* next = next_raw();
+  if (is_movw()) {
+    // Oop embedded in movw/movt instructions
+    assert(VM_Version::supports_movw(), "must be");
+    return (this->encoding() & 0x00000fff)       | (this->encoding() & 0x000f0000) >> 4 |
+           (next->encoding() & 0x00000fff) << 16 | (next->encoding() & 0x000f0000) << 12;
+  } else {
+    // Oop is loaded from oops section or inlined in the code
+    int oop_offset;
+    if (is_ldr_literal()) {
+      //   ldr  Rd, [PC, #offset]
+      oop_offset = ldr_offset();
+    } else {
+      assert(next->is_ldr(), "must be");
+      oop_offset = (this->encoding() & 0xff) << 12 | (next->encoding() & 0xfff); // hi bits from the add/sub, lo bits from the ldr
+      if (is_add_pc()) {
+        //   add  Rd, PC, #offset_hi
+        //   ldr  Rd, [Rd, #offset_lo]
+        assert(next->encoding() & (1 << 23), "sign mismatch"); // bit 23 set: ldr adds its offset
+        // offset OK (both positive)
+      } else {
+        assert(is_sub_pc(), "must be");
+        //   sub  Rd, PC, #offset_hi
+        //   ldr  Rd, [Rd, -#offset_lo]
+        assert(!(next->encoding() & (1 << 23)), "sign mismatch"); // bit 23 clear: ldr subtracts its offset
+        // negative offsets
+        oop_offset = -oop_offset;
+      }
+    }
+    return *(int*)(instruction_address() + 8 + oop_offset); // offsets are relative to instruction address + 8
+  }
+}
+
+void NativeMovConstReg::set_data(intptr_t x, address pc) { // pc: if non-NULL, used instead of instruction_address() as the pc-relative base
+  // Find and replace the oop (or metadata) corresponding to this instruction in the nmethod's relocated slots
+  RawNativeInstruction* next = next_raw();
+  oop* oop_addr = NULL;
+  Metadata** metadata_addr = NULL;
+  CodeBlob* cb = CodeCache::find_blob(instruction_address());
+  if (cb != NULL) {
+    nmethod* nm = cb->as_nmethod_or_null();
+    if (nm != NULL) {
+      RelocIterator iter(nm, instruction_address(), next->instruction_address());
+      while (iter.next()) { // only the first oop/metadata relocation found is patched
+        if (iter.type() == relocInfo::oop_type) {
+          oop_addr = iter.oop_reloc()->oop_addr();
+          *oop_addr = cast_to_oop(x);
+          break;
+        } else if (iter.type() == relocInfo::metadata_type) {
+          metadata_addr = iter.metadata_reloc()->metadata_addr();
+          *metadata_addr = (Metadata*)x;
+          break;
+        }
+      }
+    }
+  }
+
+  if (is_movw()) {
+    // data embedded in movw/movt instructions
+    assert(VM_Version::supports_movw(), "must be");
+    unsigned int lo = (unsigned int)x;
+    unsigned int hi = (unsigned int)(x >> 16);
+    this->set_encoding((this->encoding() & 0xfff0f000) | (lo & 0xf000) << 4 | (lo & 0xfff));
+    next->set_encoding((next->encoding() & 0xfff0f000) | (hi & 0xf000) << 4 | (hi & 0xfff));
+  } else if (oop_addr == NULL && metadata_addr == NULL) { // logical AND (was a bitwise '&' on two bools)
+    // A static ldr_literal (without oop or metadata relocation)
+    assert(is_ldr_literal(), "must be");
+    int offset = ldr_offset();
+    oop_addr = (oop*)(instruction_address() + 8 + offset);
+    *oop_addr = cast_to_oop(x);
+  } else {
+    // data is loaded from oop or metadata section
+    int offset;
+
+    address addr = oop_addr != NULL ? (address)oop_addr : (address)metadata_addr;
+
+    if (pc == NULL) {
+      offset = addr - instruction_address() - 8;
+    } else {
+      offset = addr - pc - 8;
+    }
+
+    int sign = (offset >= 0) ? (1 << 23) : 0; // bit 23 of the ldr: add (set) or subtract (clear) the offset
+    int delta = (offset >= 0) ? offset : (-offset);
+    assert(delta < 0x100000, "within accessible range");
+    if (is_ldr_literal()) {
+      // fix the ldr with the real offset to the oop/metadata table
+      assert(next->is_nop(), "must be");
+      if (delta < 4096) {
+        //   ldr  Rd, [PC, #offset]
+        set_encoding((encoding() & 0xff7ff000) | delta | sign);
+        assert(ldr_offset() == offset, "check encoding");
+      } else {
+        int cc = encoding() & 0xf0000000;
+        int Rd = (encoding() >> 12) & 0xf;
+        int Rt = Rd;
+        assert(Rt != 0xf, "Illegal destination register"); // or fix by using Rtemp
+        // move the ldr, fixing delta_lo and the source register
+        next->set_encoding((encoding() & 0xff70f000) | (Rt << 16) | (delta & 0xfff) | sign);
+        assert(next->is_ldr(), "must be");
+        if (offset >= 0) { // same test as 'sign', so add/sub always agrees with the ldr's bit 23
+          //   add  Rt, PC, #delta_hi
+          //   ldr  Rd, [Rt, #delta_lo]
+          this->set_encoding((Rt << 12) | (delta >> 12) | 0x028f0a00 | cc);
+          assert(is_add_pc(), "must be");
+        } else {
+          //   sub Rt, PC, #delta_hi
+          //   ldr  Rd, [Rt, -#delta_lo]
+          this->set_encoding((Rt << 12) | (delta >> 12) | 0x024f0a00 | cc);
+          assert(is_sub_pc(), "must be");
+        }
+      }
+    } else {
+      assert(is_pc_rel(), "must be");
+      assert(next->is_ldr(), "must be");
+      if (offset >= 0) { // offset == 0 must pick 'add': data() asserts that add/sub matches the ldr's bit 23
+        //   add Rt, PC, #delta_hi
+        this->set_encoding((this->encoding() & 0xf00ff000) | 0x02800a00 | (delta >> 12));
+        assert(is_add_pc(), "must be");
+      } else {
+        //   sub Rt, PC, #delta_hi
+        this->set_encoding((this->encoding() & 0xf00ff000) | 0x02400a00 | (delta >> 12));
+        assert(is_sub_pc(), "must be");
+      }
+      //    ldr Rd, Rt, #delta_lo (or -#delta_lo)
+      next->set_encoding((next->encoding() & 0xff7ff000) | (delta & 0xfff) | sign);
+    }
+  }
+}
+
+void NativeMovConstReg::set_pc_relative_offset(address addr, address pc) { // re-targets the pc-relative load at addr
+  int offset;
+  if (pc == NULL) { // no explicit base given: measure from this instruction
+    offset = addr - instruction_address() - 8;
+  } else {
+    offset = addr - pc - 8;
+  }
+
+  RawNativeInstruction* next = next_raw();
+
+  int sign = (offset >= 0) ? (1 << 23) : 0; // bit 23 of the ldr: add (set) or subtract (clear) the offset
+  int delta = (offset >= 0) ? offset : (-offset);
+  assert(delta < 0x100000, "within accessible range");
+  if (is_ldr_literal()) {
+    if (delta < 4096) {
+      //   ldr  Rd, [PC, #offset]
+      set_encoding((encoding() & 0xff7ff000) | delta | sign);
+      assert(ldr_offset() == offset, "check encoding");
+    } else {
+      assert(next->is_nop(), "must be");
+      int cc = encoding() & 0xf0000000;
+      int Rd = (encoding() >> 12) & 0xf;
+      int Rt = Rd;
+      assert(Rt != 0xf, "Illegal destination register"); // or fix by using Rtemp
+      // move the ldr, fixing delta_lo and the source register
+      next->set_encoding((encoding() & 0xff70f000) | (Rt << 16) | (delta & 0xfff) | sign);
+      assert(next->is_ldr(), "must be");
+      if (offset >= 0) { // same test as 'sign', so add/sub always agrees with the ldr's bit 23
+        //   add  Rt, PC, #delta_hi
+        //   ldr  Rd, [Rt, #delta_lo]
+        this->set_encoding((Rt << 12) | (delta >> 12) | 0x028f0a00 | cc);
+        assert(is_add_pc(), "must be");
+      } else {
+        //   sub Rt, PC, #delta_hi
+        //   ldr Rd, [Rt, -#delta_lo]
+        this->set_encoding((Rt << 12) | (delta >> 12) | 0x024f0a00 | cc);
+        assert(is_sub_pc(), "must be");
+      }
+    }
+  } else {
+    assert(is_pc_rel(), "must be");
+    assert(next->is_ldr(), "must be");
+    if (offset >= 0) { // offset == 0 must pick 'add' so that it matches the ldr's set bit 23
+      //   add Rt, PC, #delta_hi
+      this->set_encoding((this->encoding() & 0xf00ff000) | 0x02800a00 | (delta >> 12));
+      assert(is_add_pc(), "must be");
+    } else {
+      //   sub Rt, PC, #delta_hi
+      this->set_encoding((this->encoding() & 0xf00ff000) | 0x02400a00 | (delta >> 12));
+      assert(is_sub_pc(), "must be");
+    }
+    //    ldr Rd, Rt, #delta_lo (or -#delta_lo)
+    next->set_encoding((next->encoding() & 0xff7ff000) | (delta & 0xfff) | sign);
+  }
+}
+
+void RawNativeJump::check_verified_entry_alignment(address entry, address verified_entry) {
+} // empty body: no alignment check is performed here
+
+void RawNativeJump::patch_verified_entry(address entry, address verified_entry, address dest) { // entry is unused; dest is only checked by the assert
+  assert(dest == SharedRuntime::get_handle_wrong_method_stub(), "should be");
+  int *a = (int *)verified_entry;
+  a[0] = zombie_illegal_instruction; // always illegal
+  ICache::invalidate_range((address)&a[0], sizeof a[0]); // invalidate the i-cache for the single word just written
+}
+
+void NativeGeneralJump::insert_unconditional(address code_pos, address entry) { // writes a branch to entry at code_pos
+  int offset = (int)(entry - code_pos - 8); // branch offsets are relative to the instruction address + 8
+  assert(offset < 0x2000000 && offset > -0x2000000, "encoding constraint");
+  nativeInstruction_at(code_pos)->set_encoding(0xea000000 | ((unsigned int)offset << 6 >> 8)); // keeps bits 25:2 of offset as the 24-bit field
+}
+
+static address raw_call_for(address return_address) { // returns the call whose return address is return_address, or NULL
+  CodeBlob* cb = CodeCache::find_blob(return_address);
+  nmethod* nm = (cb != NULL) ? cb->as_nmethod_or_null() : NULL; // find_blob may return NULL: do not dereference it
+  if (nm == NULL) {
+    ShouldNotReachHere();
+    return NULL;
+  }
+  // Look back 4 instructions, to allow for ic_call
+  address begin = MAX2(return_address - 4*NativeInstruction::instruction_size, nm->code_begin());
+  RelocIterator iter(nm, begin, return_address);
+  while (iter.next()) {
+    Relocation* reloc = iter.reloc();
+    if (reloc->is_call()) {
+      address call = reloc->addr();
+      if (nativeInstruction_at(call)->is_call()) {
+        if (nativeCall_at(call)->return_address() == return_address) {
+          return call;
+        }
+      } else {
+        // Some "calls" are really jumps
+        assert(nativeInstruction_at(call)->is_jump(), "must be call or jump");
+      }
+    }
+  }
+  return NULL; // no call relocation in the window returns to return_address
+}
+
+bool RawNativeCall::is_call_before(address return_address) { // true if raw_call_for finds a call returning to return_address
+  return (raw_call_for(return_address) != NULL);
+}
+
+NativeCall* rawNativeCall_before(address return_address) { // returns the call that returns to return_address
+  address call = raw_call_for(return_address);
+  assert(call != NULL, "must be"); // callers are expected to pass a real return address
+  return nativeCall_at(call);
+}
+
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/nativeInst_arm_32.hpp	2016-12-02 11:22:32.923978750 -0500
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_NATIVEINST_ARM_32_HPP
+#define CPU_ARM_VM_NATIVEINST_ARM_32_HPP
+
+#include "asm/macroAssembler.hpp"
+#include "code/codeCache.hpp"
+#include "memory/allocation.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/os.hpp"
+#include "runtime/thread.hpp"
+#include "register_arm.hpp"
+
+// -------------------------------------------------------------------
+
+// Some experimental projects extend the ARM back-end by implementing
+// what the front-end usually assumes is a single native instruction
+// with a sequence of instructions.
+//
+// The 'Raw' variants are the low level initial code (usually one
+// instruction wide but some of them were already composed
+// instructions). They should be used only by the back-end.
+//
+// The non-raw classes are the front-end entry point, hiding potential
+// back-end extensions or the actual instructions size.
+class NativeInstruction;
+
+class RawNativeInstruction VALUE_OBJ_CLASS_SPEC { // view of one 32-bit instruction word located at 'this'
+ public:
+
+  enum ARM_specific {
+    instruction_size = Assembler::InstructionSize
+  };
+
+  enum InstructionKind { // values returned by kind(): (encoding() >> 20) & 0xf2
+    instr_ldr_str    = 0x50,
+    instr_ldrh_strh  = 0x10,
+    instr_fld_fst    = 0xd0
+  };
+
+  // illegal instruction used by NativeJump::patch_verified_entry
+  // permanently undefined (UDF): 0xe << 28 | 0b1111111 << 20 | 0b1111 << 4
+  static const int zombie_illegal_instruction = 0xe7f000f0;
+
+  static int decode_rotated_imm12(int encoding) { // value of the low 8 bits rotated right by twice the 4-bit field in bits 11:8
+    unsigned int base = encoding & 0xff;            // unsigned, so the shifts below are well defined
+    int right_rotation = (encoding & 0xf00) >> 7;   // rotate field times 2: an even amount in 0..30
+    int left_rotation = (32 - right_rotation) & 31; // masked: shifting by 32 (rotation == 0) is undefined
+    int val = (int)((base >> right_rotation) | (base << left_rotation));
+    return val;
+  }
+
+  address addr_at(int offset)        const { return (address)this + offset; }
+  address instruction_address()      const { return addr_at(0); }
+  address next_raw_instruction_address() const { return addr_at(instruction_size); }
+
+  static RawNativeInstruction* at(address address) {
+    return (RawNativeInstruction*)address;
+  }
+  RawNativeInstruction* next_raw() const {
+    return at(next_raw_instruction_address());
+  }
+
+ public:
+  int encoding()                     const { return *(int*)this; }
+
+  void set_encoding(int value) { // writes the word and invalidates the i-cache only when the value changes
+    int old = *(int*)this;
+    if (old != value) {
+      *(int*)this = value;
+      ICache::invalidate_word((address)this);
+    }
+  }
+
+  InstructionKind kind() const {
+    return (InstructionKind) ((encoding() >> 20) & 0xf2);
+  }
+
+  bool is_nop()            const { return encoding() == (int)0xe1a00000; }
+  bool is_b()              const { return (encoding() & 0x0f000000) == 0x0a000000; }
+  bool is_bx()             const { return (encoding() & 0x0ffffff0) == 0x012fff10; }
+  bool is_bl()             const { return (encoding() & 0x0f000000) == 0x0b000000; }
+  bool is_blx()            const { return (encoding() & 0x0ffffff0) == 0x012fff30; }
+  bool is_fat_call()       const {
+    return (is_add_lr() && next_raw()->is_jump());
+  }
+  bool is_ldr_call()       const {
+    return (is_add_lr() && next_raw()->is_ldr_pc());
+  }
+  bool is_jump()           const { return is_b() || is_ldr_pc(); }
+  bool is_call()           const { return is_bl() || is_fat_call(); }
+  bool is_branch()         const { return is_b() || is_bl(); }
+  bool is_far_branch()     const { return is_movw() || is_ldr_literal(); }
+  bool is_ldr_literal()    const {
+    // ldr Rx, [PC, #offset] for positive or negative offsets
+    return (encoding() & 0x0f7f0000) == 0x051f0000;
+  }
+  bool is_ldr()    const {
+    // ldr Rd, [Rn, #offset] for positive or negative offsets
+    return (encoding() & 0x0f700000) == 0x05100000;
+  }
+  int ldr_offset() const { // signed 12-bit displacement of an ldr; bit 23 selects add or subtract
+    assert(is_ldr(), "must be");
+    int offset = encoding() & 0xfff;
+    if (encoding() & (1 << 23)) {
+      // positive offset
+    } else {
+      // negative offset
+      offset = -offset;
+    }
+    return offset;
+  }
+  // is_ldr_pc: ldr PC, PC, #offset
+  bool is_ldr_pc()         const { return (encoding() & 0x0f7ff000) == 0x051ff000; }
+  // is_setting_pc(): ldr PC, Rxx, #offset
+  bool is_setting_pc()         const { return (encoding() & 0x0f70f000) == 0x0510f000; }
+  bool is_add_lr()         const { return (encoding() & 0x0ffff000) == 0x028fe000; }
+  bool is_add_pc()         const { return (encoding() & 0x0fff0000) == 0x028f0000; }
+  bool is_sub_pc()         const { return (encoding() & 0x0fff0000) == 0x024f0000; }
+  bool is_pc_rel()         const { return is_add_pc() || is_sub_pc(); }
+  bool is_movw()           const { return (encoding() & 0x0ff00000) == 0x03000000; }
+  bool is_movt()           const { return (encoding() & 0x0ff00000) == 0x03400000; }
+  // c2 doesn't use fixed registers for safepoint poll address
+  bool is_safepoint_poll() const { return (encoding() & 0xfff0ffff) == 0xe590c000; }
+  // For unit tests
+  static void test() {}
+
+};
+
+inline RawNativeInstruction* rawNativeInstruction_at(address address) { // plain cast, no validation of the instruction
+  return (RawNativeInstruction*)address;
+}
+
+// Base class exported to the front-end
+class NativeInstruction: public RawNativeInstruction {
+public:
+  static NativeInstruction* at(address address) { // plain cast, no validation of the instruction
+    return (NativeInstruction*)address;
+  }
+
+public:
+  // No need to consider indirections while parsing NativeInstruction
+  address next_instruction_address() const {
+    return next_raw_instruction_address(); // in this class, identical to the raw successor address
+  }
+
+  // next() is no longer defined to avoid confusion.
+  //
+  // The front end and most classes except for those defined in nativeInst_arm
+  // or relocInfo_arm should only use next_instruction_address(), skipping
+  // over composed instruction and ignoring back-end extensions.
+  //
+  // The back-end can use next_raw() when it knows the instruction sequence
+  // and only wants to skip a single native instruction.
+};
+
+inline NativeInstruction* nativeInstruction_at(address address) { // plain cast, no validation of the instruction
+  return (NativeInstruction*)address;
+}
+
+// -------------------------------------------------------------------
+// Raw b() or bl() instructions, not used by the front-end.
+class RawNativeBranch: public RawNativeInstruction {
+ public:
+
+  address destination(int adj = 0) const { // branch target; adj is added to the decoded address
+    return instruction_address() + ((int)((unsigned int)encoding() << 8) >> 6) + 8 + adj; // unsigned left shift avoids UB; the arithmetic right shift sign-extends the 24-bit field and scales it by 4
+  }
+
+  void set_destination(address dest) { // rewrites only the 24-bit offset field; the top 8 bits of the encoding are kept
+    int new_offset = (int)(dest - instruction_address() - 8);
+    assert(new_offset < 0x2000000 && new_offset > -0x2000000, "encoding constraint");
+    set_encoding((encoding() & 0xff000000) | ((unsigned int)new_offset << 6 >> 8));
+  }
+};
+
+inline RawNativeBranch* rawNativeBranch_at(address address) { // checked cast: asserts that a b or bl is at address
+  assert(rawNativeInstruction_at(address)->is_branch(), "must be");
+  return (RawNativeBranch*)address;
+}
+
+class NativeBranch: public RawNativeBranch { // non-raw counterpart of RawNativeBranch; adds no members
+};
+
+inline NativeBranch* nativeBranch_at(address address) { // same is_branch() assert as rawNativeBranch_at
+  return (NativeBranch *) rawNativeBranch_at(address);
+}
+
+// -------------------------------------------------------------------
+// NativeGeneralJump models a patchable internal (near) jump.
+// The front-end uses it directly, so it must stay exactly one instruction
+// wide (to support patching to other kinds of instructions).
+class NativeGeneralJump: public RawNativeInstruction {
+ public:
+
+  address jump_destination() const {
+    return rawNativeBranch_at(instruction_address())->destination();
+  }
+
+  void set_jump_destination(address dest) {
+    rawNativeBranch_at(instruction_address())->set_destination(dest);
+  }
+
+  static void insert_unconditional(address code_pos, address entry);
+
+  static void replace_mt_safe(address instr_addr, address code_buffer) {
+    assert((((int)instr_addr | (int)code_buffer) & 3) == 0, "must be aligned");
+    // A word store is atomic on ARM, so a single set_encoding() is enough
+    rawNativeInstruction_at(instr_addr)->set_encoding(*(int*)code_buffer);
+  }
+};
+
+inline NativeGeneralJump* nativeGeneralJump_at(address address) {
+  assert(rawNativeInstruction_at(address)->is_jump(), "must be");  // only is_jump() words qualify
+  return reinterpret_cast<NativeGeneralJump*>(address);
+}
+
+// -------------------------------------------------------------------
+// A jump that is either a plain b, or an ldr-pc that loads its target from memory.
+class RawNativeJump: public NativeInstruction {
+ public:
+  address jump_destination(int adj = 0) const {  // adj is only forwarded in the b form
+    address a;
+    if (is_b()) {
+      a = rawNativeBranch_at(instruction_address())->destination(adj);
+      // Jump destination -1 is encoded as a jump to self
+      if (a == instruction_address()) {
+        return (address)-1;
+      }
+    } else {
+      assert(is_ldr_pc(), "must be");
+      int offset = this->ldr_offset();
+      a = *(address*)(instruction_address() + 8 + offset);  // target is stored in the word at pc + 8 + offset
+    }
+    return a;
+  }
+
+  void set_jump_destination(address dest) {
+    // b form: re-encode the branch; ldr-pc form: overwrite the stored target word.
+    if (is_b()) {
+      // Jump destination -1 is encoded as a jump to self
+      if (dest == (address)-1) {
+        dest = instruction_address();
+      }
+      rawNativeBranch_at(instruction_address())->set_destination(dest);
+    } else {
+      assert(is_ldr_pc(), "must be");
+      int offset = this->ldr_offset();
+      *(address*)(instruction_address() + 8 + offset) = dest;
+      OrderAccess::storeload(); // overkill if caller holds lock?
+    }
+  }
+
+  static void check_verified_entry_alignment(address entry, address verified_entry);
+
+  static void patch_verified_entry(address entry, address verified_entry, address dest);
+
+};
+
+inline RawNativeJump* rawNativeJump_at(address address) {
+  assert(rawNativeInstruction_at(address)->is_jump(), "must be");  // only is_jump() words qualify
+  return reinterpret_cast<RawNativeJump*>(address);
+}
+
+// -------------------------------------------------------------------
+class RawNativeCall: public NativeInstruction {
+  // See IC calls in LIR_Assembler::ic_call(): ARM v5/v6 doesn't use a
+  // single bl for IC calls.
+
+ public:
+
+  address return_address() const {
+    if (is_bl()) {
+      // single bl: the return address is the next instruction
+      return addr_at(instruction_size);
+    }
+    assert(is_fat_call(), "must be");
+    const int lr_adjust = encoding() & 0xff;
+    return addr_at(lr_adjust + 8);
+  }
+
+  address destination(int adj = 0) const {
+    if (is_bl()) {
+      return rawNativeBranch_at(instruction_address())->destination(adj);
+    }
+    // fat_call: the target belongs to the jump that follows this instruction
+    assert(is_add_lr(), "must be");
+    RawNativeJump* jump = rawNativeJump_at(next_raw_instruction_address());
+    return jump->jump_destination(adj);
+  }
+
+  void set_destination(address dest) {
+    if (is_bl()) {
+      rawNativeBranch_at(instruction_address())->set_destination(dest);
+      return;
+    }
+    // fat_call: retarget the jump that follows this instruction
+    assert(is_add_lr(), "must be");
+    rawNativeJump_at(next_raw_instruction_address())->set_jump_destination(dest);
+  }
+
+  void set_destination_mt_safe(address dest) {
+    assert(CodeCache::contains(dest), "external destination might be too far");
+    set_destination(dest);
+  }
+
+  void verify() {
+    assert(RawNativeInstruction::is_call() || (!VM_Version::supports_movw() && RawNativeInstruction::is_jump()), "must be");
+  }
+
+  void verify_alignment() {
+    // No alignment requirement to verify on ARM
+  }
+
+  static bool is_call_before(address return_address);
+};
+
+inline RawNativeCall* rawNativeCall_at(address address) {
+  assert(rawNativeInstruction_at(address)->is_call(), "must be");  // only is_call() words qualify
+  return reinterpret_cast<RawNativeCall*>(address);
+}
+
+NativeCall* rawNativeCall_before(address return_address);
+
+// -------------------------------------------------------------------
+// NativeMovRegMem does not need to be extended with indirection support
+// (field access patching is handled differently in that case).
+class NativeMovRegMem: public NativeInstruction {
+ public:
+
+  int offset() const;
+  void set_offset(int x);
+
+  void add_offset_in_bytes(int add_offset) {
+    const int shifted = offset() + add_offset;  // new offset = current offset + delta
+    set_offset(shifted);
+  }
+};
+
+inline NativeMovRegMem* nativeMovRegMem_at(address address) {
+  NativeMovRegMem* mov = reinterpret_cast<NativeMovRegMem*>(address);
+  assert(mov->kind() == NativeInstruction::instr_ldr_str   ||
+         mov->kind() == NativeInstruction::instr_ldrh_strh ||
+         mov->kind() == NativeInstruction::instr_fld_fst, "must be");
+  return mov;  // accepted kinds: ldr/str, ldrh/strh, fld/fst
+}
+
+// -------------------------------------------------------------------
+// NativeMovConstReg is primarily for loading oops and metadata
+class NativeMovConstReg: public NativeInstruction {
+ public:
+
+  intptr_t data() const;
+  void set_data(intptr_t x, address pc = 0);
+  bool is_pc_relative() {  // every form other than movw is treated as pc-relative
+    return !is_movw();
+  }
+  void set_pc_relative_offset(address addr, address pc);
+  address next_instruction_address() const {
+    // NOTE: CompiledStaticCall::set_to_interpreted() calls this, but it is
+    // restricted to a single-instruction ldr, so there is no need to jump
+    // over several instructions.
+    assert(is_ldr_literal(), "Should only use single-instructions load");
+    return next_raw_instruction_address();
+  }
+};
+
+inline NativeMovConstReg* nativeMovConstReg_at(address address) {
+  NativeInstruction* insn = nativeInstruction_at(address);
+  assert(insn->is_ldr_literal() || insn->is_pc_rel() ||
+         (insn->is_movw() && VM_Version::supports_movw()), "must be");
+  return reinterpret_cast<NativeMovConstReg*>(address);
+}
+
+// -------------------------------------------------------------------
+// Front end classes, hiding experimental back-end extensions.
+
+// Extension to support indirections (declares no members of its own here)
+class NativeJump: public RawNativeJump {
+ public:
+};
+
+inline NativeJump* nativeJump_at(address address) {
+  assert(nativeInstruction_at(address)->is_jump(), "must be");  // only is_jump() words qualify
+  return reinterpret_cast<NativeJump*>(address);
+}
+
+class NativeCall: public RawNativeCall {
+public:
+  // NativeCall::next_instruction_address() is used only to define the
+  // range in which to look for the relocation information. There is no need
+  // to walk over composed instructions (as long as the relocation information
+  // is associated with the first instruction).
+  address next_instruction_address() const {
+    return next_raw_instruction_address();
+  }
+
+};
+
+inline NativeCall* nativeCall_at(address address) {
+  assert(nativeInstruction_at(address)->is_call() ||
+         (!VM_Version::supports_movw() && nativeInstruction_at(address)->is_jump()), "must be");
+  return reinterpret_cast<NativeCall*>(address);  // a jump is also accepted when movw is unsupported
+}
+
+inline NativeCall* nativeCall_before(address return_address) {
+  return rawNativeCall_before(return_address);  // already declared to return NativeCall*
+}
+
+#endif // CPU_ARM_VM_NATIVEINST_ARM_32_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/nativeInst_arm_64.cpp	2016-12-02 11:22:38.000266619 -0500
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "assembler_arm.inline.hpp"
+#include "code/codeCache.hpp"
+#include "memory/resourceArea.hpp"
+#include "nativeInst_arm.hpp"
+#include "oops/klass.inline.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/handles.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "utilities/ostream.hpp"
+#ifdef COMPILER1
+#include "c1/c1_Runtime1.hpp"
+#endif
+
+void RawNativeInstruction::verify() {
+  const address pc = instruction_address();  // must be non-NULL and instruction_size aligned
+  const bool misaligned = ((intptr_t)pc & (instruction_size - 1)) != 0;
+  if (pc == NULL || misaligned) {
+    fatal("not an instruction address");
+  }
+}
+
+void NativeMovRegMem::set_offset(int x) {
+  const int scale = get_offset_scale();
+  assert((x & right_n_bits(scale)) == 0, "offset should be aligned");
+  guarantee((x >> 24) == 0, "encoding constraint");
+
+  // Fast path: x fits the scaled 12-bit immediate of this single ldr/str.
+  if (Assembler::is_unsigned_imm_in_range(x, 12, scale)) {
+    set_unsigned_imm(x, 12, scale, 10);
+    return;
+  }
+
+  // Otherwise the offset does not fit a single ldr/str instruction, so
+  //   ldr/str  Rt, [Rn, #offset]
+  //   nop
+  // is rewritten as
+  //   add  LR, Rn, #offset_hi
+  //   ldr/str  Rt, [LR, #offset_lo]
+  // Rtemp cannot serve as the temporary because it may hold the value being
+  // stored (see LIR_Assembler::reg2mem). Patchable NativeMovRegMem instructions
+  // come from LIR_Assembler::mem2reg and LIR_Assembler::reg2mem, which do not
+  // use LR, so LR is free; it also does not clash with c1_LIRGenerator_arm.cpp.
+  const int scratch = LR->encoding();
+  const int base = (encoding() >> 5) & 0x1f;
+
+  NativeInstruction* nop_slot = nativeInstruction_at(next_raw_instruction_address());
+  assert(nop_slot->is_nop(), "must be");
+
+  nop_slot->set_encoding((encoding() & 0xffc0001f) | Assembler::encode_unsigned_imm((x & 0xfff), 12, scale, 10) | (scratch << 5));
+  this->set_encoding(0x91400000 | Assembler::encode_unsigned_imm((x >> 12), 12, 0, 10) | (base << 5) | scratch);
+}
+
+intptr_t NativeMovConstReg::_data() const {
+#ifdef COMPILER2
+  if (is_movz()) {
+    // narrow constant or ic call cached value: the movz immediate is taken as
+    // bits 0..15, each following movk (at most three) adds 16 bits at hw * 16
+    RawNativeInstruction* movk = next_raw();
+    assert(movk->is_movk(), "movz;movk expected");
+    intptr_t value = (uint)((encoding() >> 5) & 0xffff);
+    for (int seen = 0; movk->is_movk() && seen < 3; ++seen) {
+      const int insn = movk->encoding();
+      const uint imm16 = (insn >> 5) & 0xffff;
+      const int shift = ((insn >> 21) & 0x3) << 4;
+      value |= (intptr_t)imm16 << shift;
+      movk = movk->next_raw();
+    }
+    return value;
+  }
+#endif
+  // every other form is read back through the ldr-literal accessor
+  return (intptr_t)(nativeLdrLiteral_at(instruction_address())->literal_value());
+}
+
+static void raw_set_data(RawNativeInstruction* si, intptr_t x, oop* oop_addr, Metadata** metadata_addr) {
+#ifdef COMPILER2
+  if (si->is_movz()) {
+    // movz/movk form (narrow constant or ic call cached value): 16 bits per instruction
+    uintptr_t nx = 0;
+    int val_size = 32;
+    if (oop_addr != NULL) {
+      narrowOop encoded_oop = oopDesc::encode_heap_oop(*oop_addr);
+      nx = encoded_oop;
+    } else if (metadata_addr != NULL) {
+      assert((*metadata_addr)->is_klass(), "expected Klass");
+      narrowKlass encoded_k = Klass::encode_klass((Klass *)*metadata_addr);
+      nx = encoded_k;
+    } else {
+      nx = x;         // no relocation slot: x itself is the constant,
+      val_size = 64;  // and all 64 bits of it are patched
+    }
+    RawNativeInstruction* ni = si->next_raw();
+    uint lo16 = nx & 0xffff;
+    int shift = 16;
+    int imm16 = 0xffff << 5;  // mask of the 16-bit immediate field (bits 5..20)
+    si->set_encoding((si->encoding() & ~imm16) | (lo16 << 5));
+    while (shift < val_size) {
+      assert(ni->is_movk(), "movk expected");
+      assert((((ni->encoding() >> 21) & 0x3) << 4) == shift, "wrong shift");
+      uint hi16 = (nx >> shift) & 0xffff;
+      ni->set_encoding((ni->encoding() & ~imm16) | (hi16 << 5));
+      shift += 16;
+      ni = ni->next_raw();
+    }
+    return;
+  }
+#endif
+  // ldr-literal form: patch either the literal value or the address it is loaded from.
+  assert(si->is_ldr_literal(), "should be");
+
+  if (oop_addr == NULL && metadata_addr == NULL) {
+    // A static ldr_literal without oop_relocation
+    nativeLdrLiteral_at(si->instruction_address())->set_literal_value((address)x);
+  } else {
+    // Oop is loaded from oops section
+    address addr = oop_addr != NULL ? (address)oop_addr : (address)metadata_addr;
+    intx offset = addr - si->instruction_address();  // intx, not int: the range check must see the untruncated difference
+
+    assert((((intptr_t)addr) & 0x7) == 0, "target address should be aligned");
+    assert((offset & 0x3) == 0, "offset should be aligned");
+
+    guarantee(Assembler::is_offset_in_range(offset, 19), "offset is not in range");
+    nativeLdrLiteral_at(si->instruction_address())->set_literal_address(si->instruction_address() + offset);
+  }
+}
+
+void NativeMovConstReg::set_data(intptr_t x) {
+  // Find the oop or metadata slot referenced by this instruction's relocation and store x there
+  oop* oop_addr = NULL;
+  Metadata** metadata_addr = NULL;
+  CodeBlob* cb = CodeCache::find_blob(instruction_address());
+  {
+    nmethod* nm = cb->as_nmethod_or_null();
+    if (nm != NULL) {
+      RelocIterator iter(nm, instruction_address(), next_raw()->instruction_address());  // range: this one instruction
+      while (iter.next()) {
+        if (iter.type() == relocInfo::oop_type) {
+          oop_addr = iter.oop_reloc()->oop_addr();
+          *oop_addr = cast_to_oop(x);  // update the slot itself
+          break;
+        } else if (iter.type() == relocInfo::metadata_type) {
+          metadata_addr = iter.metadata_reloc()->metadata_addr();
+          *metadata_addr = (Metadata*)x;  // update the slot itself
+          break;
+        }
+      }
+    }
+  }
+  raw_set_data(adjust(this), x, oop_addr,  metadata_addr);  // then patch the instruction(s); both addrs stay NULL if no relocation matched
+}
+
+void NativeJump::check_verified_entry_alignment(address entry, address verified_entry) {
+}  // empty body: no alignment check is performed here
+
+void NativeJump::patch_verified_entry(address entry, address verified_entry, address dest) {
+  assert(dest == SharedRuntime::get_handle_wrong_method_stub(), "should be");
+  // Overwrite the first instruction at verified_entry with zombie_illegal_instruction.
+  NativeInstruction* first_insn = nativeInstruction_at(verified_entry);
+  assert(first_insn->is_nop() || first_insn->encoding() == zombie_illegal_instruction, "required for MT-safe patching");
+  first_insn->set_encoding(zombie_illegal_instruction);
+}
+
+void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer) {
+  assert (nativeInstruction_at(instr_addr)->is_b(), "MT-safe patching of arbitrary instructions is not allowed");
+  assert (nativeInstruction_at(code_buffer)->is_nop(), "MT-safe patching of arbitrary instructions is not allowed");
+  nativeInstruction_at(instr_addr)->set_encoding(*(int*)code_buffer);  // only b -> nop is accepted (see asserts); one word is written
+}
+
+void NativeGeneralJump::insert_unconditional(address code_pos, address entry) {
+  // Overwrite the instruction at code_pos with an unconditional B to entry
+  const intx displacement = entry - code_pos;
+  assert (Assembler::is_offset_in_range(displacement, 26), "offset is out of range");
+
+  NativeInstruction* slot = nativeInstruction_at(code_pos);
+  assert (slot->is_b() || slot->is_nop(), "MT-safe patching of arbitrary instructions is not allowed");
+  const int b_opcode = 0x5 << 26;  // 0x14000000, the pattern is_b() tests for
+  slot->set_encoding(b_opcode | Assembler::encode_offset(displacement, 26, 0));
+}
+
+static address call_for(address return_address) {
+  nmethod* nm = CodeCache::find_blob(return_address)->as_nmethod_or_null();
+  if (nm == NULL) {
+    ShouldNotReachHere();
+    return NULL;
+  }
+
+  // Scan at most 8 instructions back (for LIR_Assembler::ic_call and
+  // MacroAssembler::patchable_call), clamped to the start of the nmethod code.
+  address scan_start = return_address - 8*NativeInstruction::instruction_size;
+  if (scan_start < nm->code_begin()) {
+    scan_start = nm->code_begin();
+  }
+
+  RelocIterator iter(nm, scan_start, return_address);
+  while (iter.next()) {
+    Relocation* reloc = iter.reloc();
+    if (!reloc->is_call()) {
+      continue;
+    }
+    address candidate = reloc->addr();
+    if (nativeInstruction_at(candidate)->is_call() &&
+        nativeCall_at(candidate)->return_address() == return_address) {
+      return candidate;
+    }
+  }
+  return NULL;
+}
+
+bool NativeCall::is_call_before(address return_address) {
+  return (call_for(return_address) != NULL);  // true when call_for() finds a call returning to return_address
+}
+
+NativeCall* nativeCall_before(address return_address) {
+  assert(NativeCall::is_call_before(return_address), "must be");  // debug builds therefore run call_for() twice
+  return nativeCall_at(call_for(return_address));
+}
+
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/nativeInst_arm_64.hpp	2016-12-02 11:22:43.952604165 -0500
@@ -0,0 +1,772 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_NATIVEINST_ARM_64_HPP
+#define CPU_ARM_VM_NATIVEINST_ARM_64_HPP
+
+#include "asm/macroAssembler.hpp"
+#include "code/codeCache.hpp"
+#include "memory/allocation.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/os.hpp"
+
+// -------------------------------------------------------------------
+
+// Some experimental projects extend the ARM back-end by implementing
+// what the front-end usually assumes is a single native instruction
+// with a sequence of instructions.
+//
+// The 'Raw' variants are the low level initial code (usually one
+// instruction wide but some of them were already composed
+// instructions). They should be used only by the back-end.
+//
+// The non-raw classes are the front-end entry point, hiding potential
+// back-end extensions or the actual instructions size.
+class NativeInstruction;
+
+class RawNativeInstruction VALUE_OBJ_CLASS_SPEC {
+ public:
+
+  enum ARM_specific {
+    instruction_size = Assembler::InstructionSize,
+    instruction_size_in_bits = instruction_size * BitsPerByte,
+  };
+
+  // illegal instruction used by NativeJump::patch_verified_entry
+  static const int zombie_illegal_instruction = 0xd4000542; // hvc #42
+
+  address addr_at(int offset)        const { return (address)this + offset; }
+  address instruction_address()      const { return addr_at(0); }
+  address next_raw_instruction_address() const { return addr_at(instruction_size); }
+
+  static RawNativeInstruction* at(address address) {
+    return (RawNativeInstruction*)address;
+  }
+
+  RawNativeInstruction* next_raw() const {
+    return at(next_raw_instruction_address());
+  }
+
+  int encoding() const {
+    return *(int*)this;
+  }
+
+  void set_encoding(int value) {
+    int old = encoding();
+    if (old != value) {
+      *(int*)this = value;
+      ICache::invalidate_word((address)this);
+    }
+  }
+
+  bool is_nop()                      const { return encoding() == (int)0xd503201f; }
+  bool is_b()                        const { return (encoding() & 0xfc000000) == 0x14000000; } // unconditional branch
+  bool is_b_cond()                   const { return (encoding() & 0xff000010) == 0x54000000; } // conditional branch
+  bool is_bl()                       const { return (encoding() & 0xfc000000) == 0x94000000; }
+  bool is_br()                       const { return (encoding() & 0xfffffc1f) == 0xd61f0000; }
+  bool is_blr()                      const { return (encoding() & 0xfffffc1f) == 0xd63f0000; }
+  bool is_ldr_literal()              const { return (encoding() & 0xff000000) == 0x58000000; }
+  bool is_adr_aligned()              const { return (encoding() & 0xff000000) == 0x10000000; } // adr Xn, <label>, where label is aligned to 4 bytes (address of instruction).
+  bool is_adr_aligned_lr()           const { return (encoding() & 0xff00001f) == 0x1000001e; } // adr LR, <label>, where label is aligned to 4 bytes (address of instruction).
+
+  bool is_ldr_str_gp_reg_unsigned_imm()   const { return (encoding() & 0x3f000000) == 0x39000000; } // ldr/str{b, sb, h, sh, _w, sw} Rt, [Rn, #imm]
+  bool is_ldr_str_fp_reg_unsigned_imm()   const { return (encoding() & 0x3f000000) == 0x3D000000; } // ldr/str Rt(SIMD), [Rn, #imm]
+  bool is_ldr_str_reg_unsigned_imm()      const { return is_ldr_str_gp_reg_unsigned_imm() || is_ldr_str_fp_reg_unsigned_imm(); }
+
+  bool is_stp_preindex()             const { return (encoding() & 0xffc00000) == 0xa9800000; } // stp Xt1, Xt2, [Xn, #imm]!
+  bool is_ldp_postindex()            const { return (encoding() & 0xffc00000) == 0xa8c00000; } // ldp Xt1, Xt2, [Xn] #imm
+  bool is_mov_sp()                   const { return (encoding() & 0xfffffc00) == 0x91000000; } // mov <Xn|SP>, <Xm|SP>
+  bool is_movn()                     const { return (encoding() & 0x7f800000) == 0x12800000; }
+  bool is_movz()                     const { return (encoding() & 0x7f800000) == 0x52800000; }
+  bool is_movk()                     const { return (encoding() & 0x7f800000) == 0x72800000; }
+  bool is_orr_imm()                  const { return (encoding() & 0x7f800000) == 0x32000000; }
+  bool is_cmp_rr()                   const { return (encoding() & 0x7fe00000) == 0x6b000000; }
+  bool is_csel()                     const { return (encoding() & 0x7fe00000) == 0x1a800000; }
+  bool is_sub_shift()                const { return (encoding() & 0x7f200000) == 0x4b000000; } // sub Rd, Rn, shift (Rm, imm)
+  bool is_mov()                      const { return (encoding() & 0x7fe0ffe0) == 0x2a0003e0; } // mov Rd, Rm (orr Rd, ZR, shift (Rm, 0))
+  bool is_tst()                      const { return (encoding() & 0x7f20001f) == 0x6a00001f; } // tst Rn, shift (Rm, imm) (ands ZR, Rn, shift(Rm, imm))
+  bool is_lsr_imm()                  const { return (encoding() & 0x7f807c00) == 0x53007c00; } // lsr Rd, Rn, imm (ubfm Rd, Rn, imm, 31/63)
+
+  bool is_far_jump()                 const { return is_ldr_literal() && next_raw()->is_br(); }
+  bool is_fat_call()                 const {
+    return
+#ifdef COMPILER2
+      (is_blr() && next_raw()->is_b()) ||
+#endif
+      (is_adr_aligned_lr() && next_raw()->is_br());
+  }
+  bool is_far_call()                 const {
+    return is_ldr_literal() && next_raw()->is_fat_call();
+  }
+
+  bool is_ic_near_call()             const { return is_adr_aligned_lr() && next_raw()->is_b(); }
+  bool is_ic_far_call()              const { return is_adr_aligned_lr() && next_raw()->is_ldr_literal() && next_raw()->next_raw()->is_br(); }
+  bool is_ic_call()                  const { return is_ic_near_call() || is_ic_far_call(); }
+
+  bool is_jump()                     const { return is_b() || is_far_jump(); }
+  bool is_call()                     const { return is_bl() || is_far_call() || is_ic_call(); }
+  bool is_branch()                   const { return is_b() || is_bl(); }
+
+  // c2 doesn't use fixed registers for safepoint poll address
+  bool is_safepoint_poll() const {
+    return true;
+  }
+
+  bool is_save_all_registers(const RawNativeInstruction** next) const {
+    const RawNativeInstruction* current = this;
+
+    if (!current->is_stp_preindex()) return false; current = current->next_raw();
+    for (int i = 28; i >= 0; i -= 2) {
+      if (!current->is_stp_preindex()) return false; current = current->next_raw();
+    }
+
+    if (!current->is_adr_aligned())                 return false; current = current->next_raw();
+    if (!current->is_ldr_str_gp_reg_unsigned_imm()) return false; current = current->next_raw();
+    if (!current->is_ldr_str_gp_reg_unsigned_imm()) return false; current = current->next_raw();
+
+    *next = (RawNativeInstruction*) current;
+    return true;
+  }
+
+  bool is_restore_all_registers(const RawNativeInstruction** next) const {
+    const RawNativeInstruction* current = this;
+
+    for (int i = 0; i <= 28; i += 2) {
+      if (!current->is_ldp_postindex()) return false; current = current->next_raw();
+    }
+    if (!current->is_ldp_postindex()) return false; current = current->next_raw();
+
+    *next = (RawNativeInstruction*) current;
+    return true;
+  }
+
+  const RawNativeInstruction* skip_bind_literal() const {
+    const RawNativeInstruction* current = this;
+    if (((uintptr_t)current) % wordSize != 0) {
+      assert(current->is_nop(), "should be");
+      current = current->next_raw();
+    }
+    assert(((uintptr_t)current) % wordSize == 0, "should be"); // bound literal should be aligned
+    current = current->next_raw()->next_raw();
+    return current;
+  }
+
+  bool is_stop(const RawNativeInstruction** next) const {
+    const RawNativeInstruction* current = this;
+
+    if (!current->is_save_all_registers(&current)) return false;
+    if (!current->is_ldr_literal())                return false; current = current->next_raw();
+    if (!current->is_mov_sp())                     return false; current = current->next_raw();
+    if (!current->is_ldr_literal())                return false; current = current->next_raw();
+    if (!current->is_br())                         return false; current = current->next_raw();
+
+    current = current->skip_bind_literal();
+    current = current->skip_bind_literal();
+
+    *next = (RawNativeInstruction*) current;
+    return true;
+  }
+
+  bool is_mov_slow(const RawNativeInstruction** next = NULL) const {
+    const RawNativeInstruction* current = this;
+
+    if (current->is_orr_imm()) {
+      current = current->next_raw();
+
+    } else if (current->is_movn() || current->is_movz()) {
+      current = current->next_raw();
+      int movkCount = 0;
+      while (current->is_movk()) {
+        movkCount++;
+        if (movkCount > 3) return false;
+        current = current->next_raw();
+      }
+
+    } else {
+      return false;
+    }
+
+    if (next != NULL) {
+      *next = (RawNativeInstruction*)current;
+    }
+    return true;
+  }
+
+#ifdef ASSERT
+  void skip_verify_heapbase(const RawNativeInstruction** next) const {
+    const RawNativeInstruction* current = this;
+
+    if (CheckCompressedOops) {
+      if (!current->is_ldr_str_gp_reg_unsigned_imm()) return; current = current->next_raw();
+      if (!current->is_stp_preindex())      return; current = current->next_raw();
+      // NOTE: temporary workaround, remove with m6-01?
+      // skip saving condition flags
+      current = current->next_raw();
+      current = current->next_raw();
+
+      if (!current->is_mov_slow(&current))  return;
+      if (!current->is_cmp_rr())            return; current = current->next_raw();
+      if (!current->is_b_cond())            return; current = current->next_raw();
+      if (!current->is_stop(&current))      return;
+
+#ifdef COMPILER2
+      if (current->is_nop()) current = current->next_raw();
+#endif
+      // NOTE: temporary workaround, remove with m6-01?
+      // skip restoring condition flags
+      current = current->next_raw();
+      current = current->next_raw();
+
+      if (!current->is_ldp_postindex())     return; current = current->next_raw();
+      if (!current->is_ldr_str_gp_reg_unsigned_imm()) return; current = current->next_raw();
+    }
+
+    *next = (RawNativeInstruction*) current;
+  }
+#endif // ASSERT
+
+  bool is_ldr_global_ptr(const RawNativeInstruction** next) const {
+    const RawNativeInstruction* current = this;
+
+    if (!current->is_mov_slow(&current))            return false;
+    if (!current->is_ldr_str_gp_reg_unsigned_imm()) return false; current = current->next_raw();
+
+    *next = (RawNativeInstruction*) current;
+    return true;
+  }
+
+  void skip_verify_oop(const RawNativeInstruction** next) const {
+    const RawNativeInstruction* current = this;
+
+    if (VerifyOops) {
+      if (!current->is_save_all_registers(&current)) return;
+
+      if (current->is_mov()) {
+        current = current->next_raw();
+      }
+
+      if (!current->is_mov_sp())                        return; current = current->next_raw();
+      if (!current->is_ldr_literal())                   return; current = current->next_raw();
+      if (!current->is_ldr_global_ptr(&current))        return;
+      if (!current->is_blr())                           return; current = current->next_raw();
+      if (!current->is_restore_all_registers(&current)) return;
+      if (!current->is_b())                             return; current = current->next_raw();
+
+      current = current->skip_bind_literal();
+    }
+
+    *next = (RawNativeInstruction*) current;
+  }
+
+  void skip_encode_heap_oop(const RawNativeInstruction** next) const {
+    const RawNativeInstruction* current = this;
+
+    assert (Universe::heap() != NULL, "java heap should be initialized");
+#ifdef ASSERT
+    current->skip_verify_heapbase(&current);
+#endif // ASSERT
+    current->skip_verify_oop(&current);
+
+    if (Universe::narrow_oop_base() == NULL) {
+      if (Universe::narrow_oop_shift() != 0) {
+        if (!current->is_lsr_imm()) return; current = current->next_raw();
+      } else {
+        if (current->is_mov()) {
+          current = current->next_raw();
+        }
+      }
+    } else {
+      if (!current->is_tst())       return; current = current->next_raw();
+      if (!current->is_csel())      return; current = current->next_raw();
+      if (!current->is_sub_shift()) return; current = current->next_raw();
+      if (Universe::narrow_oop_shift() != 0) {
+        if (!current->is_lsr_imm())  return; current = current->next_raw();
+      }
+    }
+
+    *next = (RawNativeInstruction*) current;
+  }
+
+  void verify();
+
+  // For unit tests
+  static void test() {}
+
+ private:
+
+  void check_bits_range(int bits, int scale, int low_bit) const {
+    assert((0 <= low_bit) && (0 < bits) && (low_bit + bits <= instruction_size_in_bits), "invalid bits range");
+    assert((0 <= scale) && (scale <= 4), "scale is out of range");
+  }
+
+  void set_imm(int imm_encoding, int bits, int low_bit) {
+    int imm_mask = right_n_bits(bits) << low_bit;
+    assert((imm_encoding & ~imm_mask) == 0, "invalid imm encoding");
+    set_encoding((encoding() & ~imm_mask) | imm_encoding);
+  }
+
+ protected:
+
+  // Returns signed immediate from [low_bit .. low_bit + bits - 1] bits of this instruction, scaled by given scale.
+  int get_signed_imm(int bits, int scale, int low_bit) const {
+    check_bits_range(bits, scale, low_bit);
+    int high_bits_to_clean = (instruction_size_in_bits - (low_bit + bits));
+    return encoding() << high_bits_to_clean >> (high_bits_to_clean + low_bit) << scale;
+  }
+
+  // Puts given signed immediate into the [low_bit .. low_bit + bits - 1] bits of this instruction.
+  void set_signed_imm(int value, int bits, int scale, int low_bit) {
+    set_imm(Assembler::encode_imm(value, bits, scale, low_bit), bits, low_bit);
+  }
+
+  // Returns the unsigned immediate held in bits [low_bit .. low_bit + bits - 1] of this instruction, shifted left by 'scale'.
+  int get_unsigned_imm(int bits, int scale, int low_bit) const {
+    check_bits_range(bits, scale, low_bit);
+    return ((encoding() >> low_bit) & right_n_bits(bits)) << scale;  // move the field to bit 0, mask it, apply scale
+  }
+
+  // Stores the given unsigned immediate into bits [low_bit .. low_bit + bits - 1] of this instruction.
+  void set_unsigned_imm(int value, int bits, int scale, int low_bit) {
+    set_imm(Assembler::encode_unsigned_imm(value, bits, scale, low_bit), bits, low_bit);  // encode first, then merge into the instruction word
+  }
+
+  int get_signed_offset(int bits, int low_bit) const {  // signed field read with scale 2, i.e. multiplied by 4
+    return get_signed_imm(bits, 2, low_bit);
+  }
+
+  void set_signed_offset(int offset, int bits, int low_bit) {  // counterpart of get_signed_offset(): stored with scale 2
+    set_signed_imm(offset, bits, 2, low_bit);
+  }
+};
+
+inline RawNativeInstruction* rawNativeInstruction_at(address address) {
+  RawNativeInstruction* const raw = RawNativeInstruction::at(address);
+#ifdef ASSERT
+  raw->verify();  // sanity check, compiled in only when ASSERT is defined
+#endif // ASSERT
+  return raw;
+}
+
+// -------------------------------------------------------------------
+
+// Load/store register (unsigned scaled immediate)
+class NativeMovRegMem: public RawNativeInstruction {
+ private:
+  // Two-bit field at bits [30..31]; offset() uses it as the left shift applied to the immediate.
+  int get_offset_scale() const { return get_unsigned_imm(2, 0, 30); }
+
+ public:
+  // Offset encoded in the 12-bit unsigned field at bits [10..21], shifted left by get_offset_scale().
+  int offset() const {
+    const int shift = get_offset_scale();
+    return get_unsigned_imm(12, shift, 10);
+  }
+
+  void set_offset(int x);
+
+  // Calls set_offset() with offset() + add_offset.
+  void add_offset_in_bytes(int add_offset) { set_offset(offset() + add_offset); }
+};
+
+inline NativeMovRegMem* nativeMovRegMem_at(address address) {
+  const RawNativeInstruction* insn = rawNativeInstruction_at(address);
+
+#ifdef COMPILER1
+  // NOP required for C1 patching
+  if (insn->is_nop()) {
+    insn = insn->next_raw();
+  }
+#endif
+  // Let skip_encode_heap_oop() advance insn past an encode-heap-oop sequence, if it matches one here.
+  insn->skip_encode_heap_oop(&insn);
+
+  assert(insn->is_ldr_str_reg_unsigned_imm(), "must be");
+  return (NativeMovRegMem*)insn;
+}
+
+// -------------------------------------------------------------------
+
+class NativeInstruction : public RawNativeInstruction {
+ public:
+  // Views the given address as a NativeInstruction.
+  // No verification is done here (nativeInstruction_at() is the checked variant).
+  static NativeInstruction* at(address address) { return (NativeInstruction*)address; }
+
+  // No need to consider indirections while parsing NativeInstruction,
+  // so this is simply the address of the next raw instruction.
+  address next_instruction_address() const {
+    return next_raw_instruction_address();
+  }
+
+  // Note: next() is intentionally not defined, to avoid confusion.
+  //
+  // Outside of nativeInst_arm and relocInfo_arm (the front end and most
+  // other classes), only next_instruction_address() should be used; it
+  // skips over composed instructions and ignores back-end extensions.
+  //
+  // The back end may use next_raw() when it knows the instruction
+  // sequence and only wants to step over a single native instruction.
+};
+
+inline NativeInstruction* nativeInstruction_at(address address) {
+  NativeInstruction* const insn = NativeInstruction::at(address);
+#ifdef ASSERT
+  insn->verify();  // sanity check, compiled in only when ASSERT is defined
+#endif // ASSERT
+  return insn;
+}
+
+// -------------------------------------------------------------------
+class NativeInstructionLdrLiteral: public NativeInstruction {
+ public:
+  // Address of the literal: instruction address plus the signed 19-bit field at bits [5..23], scaled by 4.
+  address literal_address() {
+    address la = instruction_address() + get_signed_offset(19, 5);
+    assert(la != instruction_address(), "literal points to instruction");
+    return la;
+  }
+
+  // Address just past the literal (literal_address() + wordSize).
+  address after_literal_address() { return literal_address() + wordSize; }
+
+  // Re-targets the load: encodes (addr - pc) into the signed 19-bit offset field.
+  // 'pc' is the address the offset is taken relative to.
+  void set_literal_address(address addr, address pc) {
+    assert(is_ldr_literal(), "must be");
+    // When opc (bits [30..31]) is 1 the target must be 8-byte aligned, unless addr == pc.
+    // (0x1 rather than a binary literal: binary literals are a compiler extension before C++14.)
+    assert((((encoding() >> 30) & 0x3) != 0x1) || (addr == pc) || (((uintx)addr & 7) == 0), "ldr target should be aligned");
+    set_signed_offset(addr - pc, 19, 5);
+  }
+
+  // Re-targets the load with the offset taken relative to this instruction's own address.
+  void set_literal_address(address addr) { set_literal_address(addr, instruction_address()); }
+
+  // Reads the address-sized value stored at literal_address().
+  address literal_value() { return *(address*)literal_address(); }
+
+  // Stores 'dest' into the slot at literal_address().
+  void set_literal_value(address dest) { *(address*)literal_address() = dest; }
+};
+
+inline NativeInstructionLdrLiteral* nativeLdrLiteral_at(address address) {
+  assert(nativeInstruction_at(address)->is_ldr_literal(), "must be");  // only an ldr-literal may be viewed through this type
+  return (NativeInstructionLdrLiteral*)address;
+}
+
+// -------------------------------------------------------------------
+// Common class for branch instructions with 26-bit immediate offset: B (unconditional) and BL
+class NativeInstructionBranchImm26: public NativeInstruction {
+ public:
+  // Branch target: instruction address + signed 26-bit field at bits [0..25] (scaled by 4) + adj.
+  address destination(int adj = 0) const { return instruction_address() + get_signed_offset(26, 0) + adj; }
+  // Re-targets the branch; the distance to 'dest' must be a multiple of 4 and must fit in an int.
+  void set_destination(address dest) {
+    const intptr_t offset = (intptr_t)(dest - instruction_address());
+    assert((offset & 0x3) == 0, "should be aligned");
+    assert(offset == (intptr_t)(int)offset, "branch offset does not fit in int");  // set_signed_offset() takes an int
+    set_signed_offset((int)offset, 26, 0);
+  }
+};
+
+inline NativeInstructionBranchImm26* nativeB_at(address address) {
+  assert(nativeInstruction_at(address)->is_b(), "must be");  // the instruction at 'address' has to be a B
+  return (NativeInstructionBranchImm26*)address;
+}
+
+inline NativeInstructionBranchImm26* nativeBL_at(address address) {
+  assert(nativeInstruction_at(address)->is_bl(), "must be");  // the instruction at 'address' has to be a BL
+  return (NativeInstructionBranchImm26*)address;
+}
+
+// -------------------------------------------------------------------
+class NativeInstructionAdrLR: public NativeInstruction {
+ public:
+  // Returns the address which is loaded into LR by this instruction.
+  address target_lr_value() {
+    return instruction_address() + get_signed_offset(19, 5);  // own address + signed 19-bit field at bits [5..23], scaled by 4
+  }
+};
+
+inline NativeInstructionAdrLR* nativeAdrLR_at(address address) {
+  assert(nativeInstruction_at(address)->is_adr_aligned_lr(), "must be");  // checked through is_adr_aligned_lr()
+  return (NativeInstructionAdrLR*)address;
+}
+
+// -------------------------------------------------------------------
+class RawNativeCall: public NativeInstruction {
+ public:
+  // Return address of this call; how it is found depends on which call form starts here (bl, far call, ic call).
+  address return_address() const {
+    if (is_bl()) {
+      return next_raw_instruction_address();
+
+    } else if (is_far_call()) {
+#ifdef COMPILER2
+      if (next_raw()->is_blr()) {
+        // ldr_literal; blr; ret_addr: b skip_literal;
+        return addr_at(2 * instruction_size);
+      }
+#endif
+      assert(next_raw()->is_adr_aligned_lr() && next_raw()->next_raw()->is_br(), "must be");
+      return nativeLdrLiteral_at(instruction_address())->after_literal_address();
+
+    } else if (is_ic_call()) {
+      return nativeAdrLR_at(instruction_address())->target_lr_value();
+
+    } else {
+      ShouldNotReachHere();
+      return NULL;
+    }
+  }
+  // Call target. 'adj' is forwarded to the B/BL decoders only; targets read from a literal are returned unadjusted.
+  address destination(int adj = 0) const {
+    if (is_bl()) {
+      return nativeBL_at(instruction_address())->destination(adj);
+
+    } else if (is_far_call()) {
+      return nativeLdrLiteral_at(instruction_address())->literal_value();
+
+    } else if (is_adr_aligned_lr()) {
+      RawNativeInstruction *next = next_raw();
+      if (next->is_b()) {
+        // ic_near_call
+        return nativeB_at(next->instruction_address())->destination(adj);
+      } else if (next->is_far_jump()) {
+        // ic_far_call
+        return nativeLdrLiteral_at(next->instruction_address())->literal_value();
+      }
+    }
+    ShouldNotReachHere();
+    return NULL;
+  }
+  // Re-targets the call: patches the BL/B offset, or overwrites the literal (followed by a storeload barrier).
+  void set_destination(address dest) {
+    if (is_bl()) {
+      nativeBL_at(instruction_address())->set_destination(dest);
+      return;
+    }
+    if (is_far_call()) {
+      nativeLdrLiteral_at(instruction_address())->set_literal_value(dest);
+      OrderAccess::storeload(); // overkill if caller holds lock?
+      return;
+    }
+    if (is_adr_aligned_lr()) {
+      RawNativeInstruction *next = next_raw();
+      if (next->is_b()) {
+        // ic_near_call
+        nativeB_at(next->instruction_address())->set_destination(dest);
+        return;
+      }
+      if (next->is_far_jump()) {
+        // ic_far_call
+        nativeLdrLiteral_at(next->instruction_address())->set_literal_value(dest);
+        OrderAccess::storeload(); // overkill if caller holds lock?
+        return;
+      }
+    }
+    ShouldNotReachHere();
+  }
+  // Asserts that 'dest' lies in the code cache, then delegates to set_destination().
+  void set_destination_mt_safe(address dest) {
+    assert(CodeCache::contains(dest), "call target should be from code cache (required by ic_call and patchable_call)");
+    set_destination(dest);
+  }
+  // Asserts that the instruction here is a call, as reported by RawNativeInstruction::is_call().
+  void verify() {
+    assert(RawNativeInstruction::is_call(), "should be");
+  }
+
+  void verify_alignment() {
+    // Nothing to do on ARM
+  }
+};
+
+inline RawNativeCall* rawNativeCall_at(address address) {
+  RawNativeCall* const raw_call = (RawNativeCall*)address;
+  raw_call->verify();  // asserts that a call starts at this address
+  return raw_call;
+}
+
+class NativeCall: public RawNativeCall {
+ public:
+
+  // NativeCall::next_instruction_address() is used only to define the
+  // range in which to look for the relocation information. We need not
+  // walk over composed instructions (as long as the relocation information
+  // is associated with the first instruction).
+  address next_instruction_address() const {
+    return next_raw_instruction_address();
+  }
+
+  static bool is_call_before(address return_address);  // declared only; no definition in this class body
+};
+
+inline NativeCall* nativeCall_at(address address) {
+  NativeCall* const call_site = (NativeCall*)address;
+  call_site->verify();  // asserts that a call starts at this address
+  return call_site;
+}
+
+NativeCall* nativeCall_before(address return_address);
+
+// -------------------------------------------------------------------
+class NativeGeneralJump: public NativeInstruction {
+ public:
+  // Target of this jump; the instruction must be a B (nativeB_at() asserts it).
+  address jump_destination() const {
+    return nativeB_at(instruction_address())->destination();
+  }
+
+  static void replace_mt_safe(address instr_addr, address code_buffer);
+
+  static void insert_unconditional(address code_pos, address entry);
+
+};
+
+inline NativeGeneralJump* nativeGeneralJump_at(address address) {
+  assert(nativeInstruction_at(address)->is_b(), "must be");  // a general jump is viewed over a B instruction
+  return (NativeGeneralJump*)address;
+}
+
+// -------------------------------------------------------------------
+class RawNativeJump: public NativeInstruction {
+ public:
+
+  // Returns the jump target. For a B the target is decoded from the
+  // instruction (with adj added); otherwise the instruction must be a far
+  // jump and the target is the value of the literal it loads. A B that
+  // targets itself stands for the destination -1.
+  address jump_destination(int adj = 0) const {
+    if (!is_b()) {
+      assert(is_far_jump(), "should be");
+      return nativeLdrLiteral_at(instruction_address())->literal_value();
+    }
+    const address target = nativeB_at(instruction_address())->destination(adj);
+    // Jump destination -1 is encoded as a jump to self
+    return (target == instruction_address()) ? (address)-1 : target;
+  }
+
+  // Sets the jump target: patches the B offset (mapping -1 to a jump to
+  // self) or, for a far jump, overwrites the literal it loads.
+  void set_jump_destination(address dest) {
+    if (!is_b()) {
+      assert(is_far_jump(), "should be");
+      nativeLdrLiteral_at(instruction_address())->set_literal_value(dest);
+      return;
+    }
+    // Jump destination -1 is encoded as a jump to self
+    const address branch_target = (dest == (address)-1) ? instruction_address() : dest;
+    nativeB_at(instruction_address())->set_destination(branch_target);
+  }
+};
+
+inline RawNativeJump* rawNativeJump_at(address address) {
+  assert(rawNativeInstruction_at(address)->is_jump(), "must be");  // checked on the raw instruction via is_jump()
+  return (RawNativeJump*)address;
+}
+
+// -------------------------------------------------------------------
+class NativeMovConstReg: public NativeInstruction {
+  // Instance form of the static adjust(): the NativeMovConstReg that the accessors actually operate on.
+  NativeMovConstReg *adjust() const {
+    return (NativeMovConstReg *)adjust(this);
+  }
+
+ public:
+  // Under COMPILER1, returns the instruction after 'ni' when 'ni' is a NOP; in every other case returns 'ni' itself.
+  static RawNativeInstruction *adjust(const RawNativeInstruction *ni) {
+#ifdef COMPILER1
+    // NOP required for C1 patching
+    if (ni->is_nop()) {
+      return ni->next_raw();
+    }
+#endif
+    return (RawNativeInstruction *)ni;
+  }
+
+  intptr_t _data() const;
+  void set_data(intptr_t x);
+  // Reads the data through adjust(), i.e. from the adjusted instruction.
+  intptr_t data() const {
+    return adjust()->_data();
+  }
+  // True when the adjusted instruction is an ldr-literal.
+  bool is_pc_relative() {
+    return adjust()->is_ldr_literal();
+  }
+  // Unadjusted form: this instruction itself must be the ldr-literal.
+  void _set_pc_relative_offset(address addr, address pc) {
+    assert(is_ldr_literal(), "must be");
+    nativeLdrLiteral_at(instruction_address())->set_literal_address(addr, pc);
+  }
+  // Adjusted form: 'pc' is moved by the distance between this instruction and the adjusted one.
+  void set_pc_relative_offset(address addr, address pc) {
+    NativeMovConstReg *ni = adjust();
+    int dest_adj = ni->instruction_address() - instruction_address();
+    ni->_set_pc_relative_offset(addr, pc + dest_adj);
+  }
+  // Unadjusted form: under COMPILER2 a movz;movk pair is stepped over as a unit; otherwise expects an ldr-literal.
+  address _next_instruction_address() const {
+#ifdef COMPILER2
+    if (is_movz()) {
+      // narrow constant
+      RawNativeInstruction* ni = next_raw();
+      assert(ni->is_movk(), "movz;movk expected");
+      return ni->next_raw_instruction_address();
+    }
+#endif
+    assert(is_ldr_literal(), "must be");
+    return NativeInstruction::next_raw_instruction_address();
+  }
+  // Address following the adjusted instruction (or instruction pair).
+  address next_instruction_address() const {
+    return adjust()->_next_instruction_address();
+  }
+};
+
+inline NativeMovConstReg* nativeMovConstReg_at(address address) {
+  // Check the instruction that NativeMovConstReg::adjust() selects, not necessarily the first one.
+  RawNativeInstruction* const first = rawNativeInstruction_at(address);
+  RawNativeInstruction* const checked = NativeMovConstReg::adjust(first);
+  assert(checked->is_mov_slow() || checked->is_ldr_literal(), "must be");
+  // The result still starts at 'address'; data(), is_pc_relative() etc. call adjust() themselves.
+  return (NativeMovConstReg*)address;
+}
+
+// -------------------------------------------------------------------
+class NativeJump: public RawNativeJump {
+ public:
+  // Both helpers are only declared here; no definition appears in this class body.
+  static void check_verified_entry_alignment(address entry, address verified_entry);
+
+  static void patch_verified_entry(address entry, address verified_entry, address dest);
+};
+
+inline NativeJump* nativeJump_at(address address) {
+  assert(nativeInstruction_at(address)->is_jump(), "must be");  // the instruction at 'address' has to satisfy is_jump()
+  return (NativeJump*)address;
+}
+
+#endif // CPU_ARM_VM_NATIVEINST_ARM_64_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/registerMap_arm.hpp	2016-12-02 11:22:49.360910864 -0500
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_REGISTERMAP_ARM_HPP
+#define CPU_ARM_VM_REGISTERMAP_ARM_HPP
+
+// machine-dependent implementation for register maps
+  friend class frame;
+
+ private:
+  // Hook for locating a register that lives in a "well-known" place,
+  // for example a register block with a predetermined format.
+  // No such place exists here, so the answer is always NULL.
+  // registerMap_sparc.hpp shows how registers can be fetched
+  // from register save areas that follow a standard layout.
+  address pd_location(VMReg reg) const { return NULL; }
+
+  // No platform-dependent (PD) state to clear or copy, so all three hooks are empty:
+  void pd_clear() {}
+  void pd_initialize() {}
+  void pd_initialize_from(const RegisterMap* map) {}
+
+#endif // CPU_ARM_VM_REGISTERMAP_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/register_arm.cpp	2016-12-02 11:22:54.693213251 -0500
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "register_arm.hpp"
+#include "utilities/debug.hpp"
+
+const int ConcreteRegisterImpl::max_gpr = ConcreteRegisterImpl::num_gpr;  // GPR entries are counted first
+const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::num_fpr +  // FPR entries follow the GPR ones
+                                          ConcreteRegisterImpl::max_gpr;
+
+const char* RegisterImpl::name() const {  // printable name of this register, "noreg" when it is not valid
+  static const char* const names[number_of_registers] = {  // static: the table is not rebuilt on every call
+#ifdef AARCH64
+    "x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7",
+    "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15",
+    "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+    "x24", "x25", "x26", "x27", "x28", "fp",  "lr",  "xzr", "sp"
+#else
+    "r0", "r1", "r2", "r3", "r4", "r5", "r6",
+#if (FP_REG_NUM == 7)
+    "fp",
+#else
+    "r7",
+#endif
+    "r8", "r9", "r10",
+#if (FP_REG_NUM == 11)
+    "fp",
+#else
+    "r11",
+#endif
+    "r12", "sp", "lr", "pc"
+#endif // AARCH64
+  };
+  return is_valid() ? names[encoding()] : "noreg";
+}
+
+const char* FloatRegisterImpl::name() const {  // printable name of this register, "fnoreg" when it is not valid
+  static const char* const names[number_of_registers] = {  // static: the table is not rebuilt on every call
+#ifdef AARCH64
+    "v0",  "v1",  "v2",  "v3",  "v4",  "v5",  "v6",  "v7",
+    "v8",  "v9",  "v10", "v11", "v12", "v13", "v14", "v15",
+    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+    "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+#else
+     "s0",  "s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7",
+     "s8",  "s9", "s10", "s11", "s12", "s13", "s14", "s15",
+    "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
+    "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31"
+#ifdef COMPILER2
+   ,"s32", "s33?","s34", "s35?","s36", "s37?","s38", "s39?",
+    "s40", "s41?","s42", "s43?","s44", "s45?","s46", "s47?",
+    "s48", "s49?","s50", "s51?","s52", "s53?","s54", "s55?",
+    "s56", "s57?","s58", "s59?","s60", "s61?","s62", "s63?"
+#endif
+#endif // AARCH64
+  };
+  return is_valid() ? names[encoding()] : "fnoreg";
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/register_arm.hpp	2016-12-02 11:22:59.993513823 -0500
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_REGISTER_ARM_HPP
+#define CPU_ARM_VM_REGISTER_ARM_HPP
+
+#include "asm/register.hpp"
+#include "vm_version_arm.hpp"
+
+class VMRegImpl;
+typedef VMRegImpl* VMReg;
+
+// These are declared in ucontext.h
+#undef R0
+#undef R1
+#undef R2
+#undef R3
+#undef R4
+#undef R5
+#undef R6
+#undef R7
+#undef R8
+#undef R9
+#undef R10
+#undef R11
+#undef R12
+#undef R13
+#undef R14
+#undef R15
+
+#define R(r)   ((Register)(r))
+
+/////////////////////////////////
+// Support for different ARM ABIs
+// Note: default ABI is for linux
+
+
+// R9_IS_SCRATCHED
+//
+// The ARM ABI does not guarantee that R9 is callee saved.
+// Set R9_IS_SCRATCHED to 1 to ensure it is properly saved/restored by
+// the caller.
+#ifndef R9_IS_SCRATCHED
+// Default: R9 is callee saved
+#define R9_IS_SCRATCHED 0
+#endif
+
+#ifndef AARCH64
+// FP_REG_NUM
+//
+// The ARM ABI does not state which register is used for the frame pointer.
+// Note: for the ABIs we are currently aware of, FP is
+// either R7 or R11. Code may have to be extended if a third
+// register must be supported (see altFP_7_11).
+#ifndef FP_REG_NUM
+// Default: FP is R11
+#define FP_REG_NUM 11
+#endif
+#endif // AARCH64
+
+// ALIGN_WIDE_ARGUMENTS
+//
+// The ARM ABI requires 64-bit arguments to be aligned on 4 words
+// or on even registers. Set ALIGN_WIDE_ARGUMENTS to 1 for that behavior.
+//
+// Unfortunately, some platforms do not endorse that part of the ABI.
+//
+// We are aware of one which expects 64-bit arguments to only be 4
+// bytes aligned and can for instance use R3 + a stack slot for such
+// an argument.
+//
+// This is the behavior implemented if (ALIGN_WIDE_ARGUMENTS == 0)
+#ifndef  ALIGN_WIDE_ARGUMENTS
+// Default: align on 8 bytes and avoid using <r3+stack>
+#define ALIGN_WIDE_ARGUMENTS 1
+#endif
+
+#define R0     ((Register)0)
+#define R1     ((Register)1)
+#define R2     ((Register)2)
+#define R3     ((Register)3)
+#define R4     ((Register)4)
+#define R5     ((Register)5)
+#define R6     ((Register)6)
+#define R7     ((Register)7)
+#define R8     ((Register)8)
+#define R9     ((Register)9)
+#define R10    ((Register)10)
+#define R11    ((Register)11)
+#define R12    ((Register)12)
+#define R13    ((Register)13)
+#define R14    ((Register)14)
+#define R15    ((Register)15)
+
+#ifdef AARCH64
+
+#define R16    ((Register)16)
+#define R17    ((Register)17)
+#define R18    ((Register)18)
+#define R19    ((Register)19)
+#define R20    ((Register)20)
+#define R21    ((Register)21)
+#define R22    ((Register)22)
+#define R23    ((Register)23)
+#define R24    ((Register)24)
+#define R25    ((Register)25)
+#define R26    ((Register)26)
+#define R27    ((Register)27)
+#define R28    ((Register)28)
+#define R29    ((Register)29)
+#define R30    ((Register)30)
+#define ZR     ((Register)31)
+#define SP     ((Register)32)
+
+#define FP     R29
+#define LR     R30
+
+#define altFP_7_11 R7
+
+#else // !AARCH64
+
+#define FP     ((Register)FP_REG_NUM)
+
+// Safe use of registers which may be FP on some platforms.
+//
+// altFP_7_11: R7 if not equal to FP, else R11 (the default FP)
+//
+// Note: add additional altFP_#_11 for each register potentially used
+// as FP on supported ABIs (and replace R# by altFP_#_11). altFP_#_11
+// must be #define to R11 if and only if # is FP_REG_NUM.
+#if (FP_REG_NUM == 7)
+#define altFP_7_11     ((Register)11)
+#else
+#define altFP_7_11     ((Register)7)
+#endif
+#define SP     R13
+#define LR     R14
+#define PC     R15
+
+#endif // !AARCH64
+
+
+class RegisterImpl;
+typedef RegisterImpl* Register;
+
+inline Register as_Register(int encoding) {
+  return (Register)(intptr_t)encoding;  // the encoding number itself is stored in the pointer-typed Register value
+}
+
+class RegisterImpl : public AbstractRegisterImpl {
+ public:
+  enum {
+#ifdef AARCH64
+    number_of_gprs = 31,
+    zr_sp_encoding = 31,
+#endif
+    number_of_registers = AARCH64_ONLY(number_of_gprs + 2) NOT_AARCH64(16)  // AArch64: the GPRs plus ZR and SP
+  };
+  // Register with the next higher encoding; the result itself is not range-checked.
+  Register successor() const      { return as_Register(encoding() + 1); }
+
+  inline friend Register as_Register(int encoding);
+
+  VMReg as_VMReg();
+
+  // accessors
+  int   encoding() const          { assert(is_valid(), "invalid register"); return value(); }
+  const char* name() const;
+  // AArch64 only: ZR and SP are distinct Register values but both map to zr_sp_encoding here.
+#ifdef AARCH64
+  int encoding_with_zr() const   { assert (is_valid_gpr_or_zr(), "invalid register"); return (this == ZR) ? zr_sp_encoding : value(); }
+  int encoding_with_sp() const   { assert (is_valid_gpr_or_sp(), "invalid register"); return (this == SP) ? zr_sp_encoding : value(); }
+#endif
+
+  // testers
+  bool is_valid() const           { return 0 <= value() && value() < number_of_registers; }
+
+#ifdef AARCH64
+  bool is_valid_gpr()       const  { return (0 <= value() && value() < number_of_gprs); }
+  bool is_valid_gpr_or_zr() const  { return is_valid_gpr() || (this == ZR); }
+  bool is_valid_gpr_or_sp() const  { return is_valid_gpr() || (this == SP); }
+#endif
+};
+
+CONSTANT_REGISTER_DECLARATION(Register, noreg, (-1));
+
+
+// Use FloatRegister as shortcut
+class FloatRegisterImpl;
+typedef FloatRegisterImpl* FloatRegister;
+
+inline FloatRegister as_FloatRegister(int encoding) {
+  return (FloatRegister)(intptr_t)encoding;  // the encoding number itself is stored in the pointer-typed FloatRegister value
+}
+
+class FloatRegisterImpl : public AbstractRegisterImpl {
+ public:
+  enum {
+#ifdef AARCH64
+    number_of_registers = 32
+#else
+    number_of_registers = NOT_COMPILER2(32) COMPILER2_PRESENT(64)
+#endif
+  };
+
+  inline friend FloatRegister as_FloatRegister(int encoding);
+
+  VMReg as_VMReg();
+  // encoding() asserts validity; is_valid() tests the raw pointer value against number_of_registers.
+  int   encoding() const          { assert(is_valid(), "invalid register"); return value(); }
+  bool  is_valid() const          { return 0 <= (intx)this && (intx)this < number_of_registers; }
+  FloatRegister successor() const { return as_FloatRegister(encoding() + 1); }
+
+  const char* name() const;
+  // Pieces of the encoding: hi_bits() = bits [1..4], lo_bit() = bit 0, hi_bit() = encoding shifted right by 5.
+#ifndef AARCH64
+  int hi_bits() const {
+    return (encoding() >> 1) & 0xf;
+  }
+
+  int lo_bit() const {
+    return encoding() & 1;
+  }
+
+  int hi_bit() const {
+    return encoding() >> 5;
+  }
+#endif // !AARCH64
+};
+
+CONSTANT_REGISTER_DECLARATION(FloatRegister, fnoreg, (-1));
+
+#ifdef AARCH64
+
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V0,     ( 0));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V1,     ( 1));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V2,     ( 2));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V3,     ( 3));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V4,     ( 4));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V5,     ( 5));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V6,     ( 6));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V7,     ( 7));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V8,     ( 8));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V9,     ( 9));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V10,    (10));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V11,    (11));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V12,    (12));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V13,    (13));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V14,    (14));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V15,    (15));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V16,    (16));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V17,    (17));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V18,    (18));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V19,    (19));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V20,    (20));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V21,    (21));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V22,    (22));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V23,    (23));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V24,    (24));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V25,    (25));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V26,    (26));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V27,    (27));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V28,    (28));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V29,    (29));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V30,    (30));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, V31,    (31));
+
+#define S0       V0
+#define S1_reg   V1
+#define Stemp    V31
+
+#define D0       V0
+#define D1       V1
+
+#else // !AARCH64
+
+/*
+ * S1-S6 are named with "_reg" suffix to avoid conflict with
+ * constants defined in sharedRuntimeTrig.cpp
+ */
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S0,     ( 0));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S1_reg, ( 1));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S2_reg, ( 2));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S3_reg, ( 3));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S4_reg, ( 4));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S5_reg, ( 5));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S6_reg, ( 6));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S7,     ( 7));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S8,     ( 8));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S9,     ( 9));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S10,    (10));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S11,    (11));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S12,    (12));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S13,    (13));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S14,    (14));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S15,    (15));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S16,    (16));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S17,    (17));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S18,    (18));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S19,    (19));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S20,    (20));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S21,    (21));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S22,    (22));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S23,    (23));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S24,    (24));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S25,    (25));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S26,    (26));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S27,    (27));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S28,    (28));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S29,    (29));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S30,    (30));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, S31,    (31));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, Stemp,  (30));
+
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D0,     ( 0));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D1,     ( 2));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D2,     ( 4));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D3,     ( 6));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D4,     ( 8));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D5,     ( 10));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D6,     ( 12));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D7,     ( 14));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D8,     ( 16));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D9,     ( 18));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D10,    ( 20));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D11,    ( 22));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D12,    ( 24));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D13,    ( 26));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D14,    ( 28));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D15,    (30));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D16,    (32));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D17,    (34));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D18,    (36));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D19,    (38));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D20,    (40));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D21,    (42));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D22,    (44));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D23,    (46));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D24,    (48));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D25,    (50));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D26,    (52));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D27,    (54));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D28,    (56));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D29,    (58));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D30,    (60));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, D31,    (62));
+
+#endif // AARCH64
+
+class ConcreteRegisterImpl : public AbstractRegisterImpl {
+ public:
+  enum {
+    log_vmregs_per_word = LogBytesPerWord - LogBytesPerInt, // VMRegs are of 4-byte size
+#ifdef COMPILER2
+    log_bytes_per_fpr  = AARCH64_ONLY(4) NOT_AARCH64(2), // AArch64: 16 bytes (quad vectors); otherwise 4 bytes
+#else
+    log_bytes_per_fpr  = AARCH64_ONLY(3) NOT_AARCH64(2), // AArch64: 8 bytes (double vectors); otherwise 4 bytes
+#endif
+    log_words_per_fpr  = log_bytes_per_fpr - LogBytesPerWord,
+    words_per_fpr      = 1 << log_words_per_fpr,
+    log_vmregs_per_fpr = log_bytes_per_fpr - LogBytesPerInt,
+    log_vmregs_per_gpr = log_vmregs_per_word,
+    vmregs_per_gpr = 1 << log_vmregs_per_gpr,
+    vmregs_per_fpr = 1 << log_vmregs_per_fpr,
+
+    num_gpr  = RegisterImpl::number_of_registers << log_vmregs_per_gpr,  // GPR count times vmregs_per_gpr
+    max_gpr0 = num_gpr,
+    num_fpr  = FloatRegisterImpl::number_of_registers << log_vmregs_per_fpr,  // FPR count times vmregs_per_fpr
+    max_fpr0 = max_gpr0 + num_fpr,
+    number_of_registers = num_gpr + num_fpr +
+                          // TODO-AARCH64 revise
+                          1+1 // APSR and FPSCR so that c2's REG_COUNT <= ConcreteRegisterImpl::number_of_registers
+  };
+
+  static const int max_gpr;
+  static const int max_fpr;
+};
+
+// TODO-AARCH64 revise the following definitions
+
+class VFPSystemRegisterImpl;                        // forward declaration needed by the pointer typedef below
+typedef VFPSystemRegisterImpl* VFPSystemRegister;   // handle type; FPSID/FPSCR/MVFRx are small integers cast to it
+class VFPSystemRegisterImpl : public AbstractRegisterImpl {
+ public:
+  int   encoding() const          { return value(); } // NOTE(review): presumably value() is the integer behind the handle - confirm in AbstractRegisterImpl
+};
+
+#define FPSID     ((VFPSystemRegister)0)    // VMRS/VMSR register field 0b0000
+#define FPSCR     ((VFPSystemRegister)1)    // VMRS/VMSR register field 0b0001
+#define MVFR0     ((VFPSystemRegister)0x7)  // VMRS register field 0b0111 (MVFR0 is 7; 6 selects MVFR1)
+#define MVFR1     ((VFPSystemRegister)0x6)  // VMRS register field 0b0110
+
+/*
+ * Register definitions shared across interpreter and compiler
+ */
+#define Rexception_obj   AARCH64_ONLY(R19) NOT_AARCH64(R4)  // exception oop; same register as Rsender_sp and Rtmp_save0
+#define Rexception_pc    AARCH64_ONLY(R20) NOT_AARCH64(R5)  // exception pc; same register as Rtmp_save1
+
+#ifdef AARCH64
+#define Rheap_base       R27  // AArch64 only. NOTE(review): presumably the heap base used for compressed oops - confirm at use sites
+#endif // AARCH64
+
+/*
+ * Interpreter register definitions common to C++ and template interpreters.
+ */
+#ifdef AARCH64
+#define Rlocals          R23
+#define Rmethod          R26
+#define Rthread          R28  // base register for JavaThread field accesses; callee-saved in the C ABI
+#define Rtemp            R16
+#define Rtemp2           R17  // second scratch alias; defined for AArch64 only
+#else
+#define Rlocals          R8
+#define Rmethod          R9
+#define Rthread          R10  // base register for JavaThread field accesses; callee-saved in the C ABI
+#define Rtemp            R12  // R12 (IP), the scratch register of the 32-bit C ABI
+#endif // AARCH64
+
+// Interpreter calling conventions
+
+#define Rparams          AARCH64_ONLY(R8)  NOT_AARCH64(SP)  // 32-bit ARM uses SP itself (same as Rstack_top there)
+#define Rsender_sp       AARCH64_ONLY(R19) NOT_AARCH64(R4)  // same register as Rexception_obj and Rtmp_save0
+
+// JSR292
+//  Note: R5_mh is needed only during the call setup, including adapters.
+//  This does not seem to conflict with Rexception_pc (also R5 on 32-bit ARM, R20 on AArch64).
+//  In case of issues, R3 might be OK but adapters calling the runtime would have to save it.
+#define R5_mh            R5 // MethodHandle register, used during the call setup
+#define Rmh_SP_save      FP // SP saved around MethodHandle calls; for C1, also read back by the C2 exception blob
+
+/*
+ * C++ Interpreter Register Defines
+ */
+#define Rsave0   R4         // R4-R6 are callee-saved in the 32-bit C ABI
+#define Rsave1   R5
+#define Rsave2   R6
+#define Rstate   altFP_7_11 // whichever of R7 and R11 is not FP; Rbcp uses the same register on 32-bit ARM
+#define Ricklass R8
+
+/*
+ * TemplateTable Interpreter Register Usage
+ */
+
+// Temporary registers
+#define R0_tmp                 R0
+#define R1_tmp                 R1
+#define R2_tmp                 R2
+#define R3_tmp                 R3
+#define R4_tmp                 R4
+#define R5_tmp                 R5
+#define R12_tmp                R12
+#define LR_tmp                 LR
+
+#define S0_tmp                 S0
+#define S1_tmp                 S1_reg  // this register is defined under the name S1_reg
+
+#define D0_tmp                 D0
+#define D1_tmp                 D1
+
+// Temporary registers saved across VM calls (according to C calling conventions)
+#define Rtmp_save0             AARCH64_ONLY(R19) NOT_AARCH64(R4)  // same register as Rexception_obj and Rsender_sp
+#define Rtmp_save1             AARCH64_ONLY(R20) NOT_AARCH64(R5)  // same register as Rexception_pc
+
+// Cached TOS value
+#define R0_tos                 R0
+
+#ifndef AARCH64
+#define R0_tos_lo              R0  // 32-bit ARM only. NOTE(review): presumably the low word of a two-word TOS value - confirm
+#define R1_tos_hi              R1  // 32-bit ARM only. NOTE(review): presumably the high word of a two-word TOS value - confirm
+#endif
+
+#define S0_tos                 S0
+#define D0_tos                 D0
+
+// Dispatch table
+#define RdispatchTable         AARCH64_ONLY(R22) NOT_AARCH64(R6)
+
+// Bytecode pointer (on 32-bit ARM this is the same register as Rstate)
+#define Rbcp                   AARCH64_ONLY(R24) NOT_AARCH64(altFP_7_11)
+
+// Pre-loaded next bytecode for the dispatch
+#define R3_bytecode            R3
+
+// Conventions between bytecode templates and stubs
+#define R2_ClassCastException_obj        R2
+#define R4_ArrayIndexOutOfBounds_index   R4
+
+// Interpreter expression stack top (the machine SP itself on 32-bit ARM)
+#define Rstack_top             AARCH64_ONLY(R25) NOT_AARCH64(SP)
+
+/*
+ * Linux 32-bit ARM C ABI Register calling conventions
+ *
+ *   REG         use                     callee/caller saved
+ *
+ *   R0         First argument reg            caller
+ *              result register
+ *   R1         Second argument reg           caller
+ *              result register
+ *   R2         Third argument reg            caller
+ *   R3         Fourth argument reg           caller
+ *
+ *   R4 - R8    Local variable registers      callee
+ *   R9
+ *   R10, R11   Local variable registers      callee
+ *
+ *   R12 (IP)   Scratch register used in inter-procedural calling
+ *   R13 (SP)   Stack Pointer                 callee
+ *   R14 (LR)   Link register
+ *   R15 (PC)   Program Counter
+ *
+ * TODO-AARCH64: document AArch64 ABI
+ *
+ */
+#define c_rarg0  R0  // C argument registers: R0-R3 on both architectures
+#define c_rarg1  R1
+#define c_rarg2  R2
+#define c_rarg3  R3
+
+#ifdef AARCH64
+#define c_rarg4  R4  // AArch64 adds four more argument registers, R4-R7
+#define c_rarg5  R5
+#define c_rarg6  R6
+#define c_rarg7  R7
+#endif
+
+#ifdef AARCH64
+#define GPR_PARAMS    8  // matches c_rarg0..c_rarg7
+#define FPR_PARAMS    8  // defined for AArch64 only
+#else
+#define GPR_PARAMS    4  // matches c_rarg0..c_rarg3; FPR_PARAMS is not defined for 32-bit ARM
+#endif
+
+
+// Java ABI
+// XXX Is this correct?
+#define j_rarg0  c_rarg0  // Java argument registers alias the C argument registers one-to-one
+#define j_rarg1  c_rarg1
+#define j_rarg2  c_rarg2
+#define j_rarg3  c_rarg3
+
+#ifdef AARCH64
+#define j_rarg4  c_rarg4  // AArch64 only, like c_rarg4..c_rarg7
+#define j_rarg5  c_rarg5
+#define j_rarg6  c_rarg6
+#define j_rarg7  c_rarg7
+#endif
+
+#endif // CPU_ARM_VM_REGISTER_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/register_definitions_arm.cpp	2016-12-02 11:23:05.769841390 -0500
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/register.hpp"
+#include "interp_masm_arm.hpp"
+#include "register_arm.hpp"
+
+REGISTER_DEFINITION(Register, noreg);        // definitions for the register constants declared in register_arm.hpp
+REGISTER_DEFINITION(FloatRegister, fnoreg);
+
+#ifdef AARCH64  // AArch64 build: floating-point/vector registers V0-V31
+
+REGISTER_DEFINITION(FloatRegister, V0);
+REGISTER_DEFINITION(FloatRegister, V1);
+REGISTER_DEFINITION(FloatRegister, V2);
+REGISTER_DEFINITION(FloatRegister, V3);
+REGISTER_DEFINITION(FloatRegister, V4);
+REGISTER_DEFINITION(FloatRegister, V5);
+REGISTER_DEFINITION(FloatRegister, V6);
+REGISTER_DEFINITION(FloatRegister, V7);
+REGISTER_DEFINITION(FloatRegister, V8);
+REGISTER_DEFINITION(FloatRegister, V9);
+REGISTER_DEFINITION(FloatRegister, V10);
+REGISTER_DEFINITION(FloatRegister, V11);
+REGISTER_DEFINITION(FloatRegister, V12);
+REGISTER_DEFINITION(FloatRegister, V13);
+REGISTER_DEFINITION(FloatRegister, V14);
+REGISTER_DEFINITION(FloatRegister, V15);
+REGISTER_DEFINITION(FloatRegister, V16);
+REGISTER_DEFINITION(FloatRegister, V17);
+REGISTER_DEFINITION(FloatRegister, V18);
+REGISTER_DEFINITION(FloatRegister, V19);
+REGISTER_DEFINITION(FloatRegister, V20);
+REGISTER_DEFINITION(FloatRegister, V21);
+REGISTER_DEFINITION(FloatRegister, V22);
+REGISTER_DEFINITION(FloatRegister, V23);
+REGISTER_DEFINITION(FloatRegister, V24);
+REGISTER_DEFINITION(FloatRegister, V25);
+REGISTER_DEFINITION(FloatRegister, V26);
+REGISTER_DEFINITION(FloatRegister, V27);
+REGISTER_DEFINITION(FloatRegister, V28);
+REGISTER_DEFINITION(FloatRegister, V29);
+REGISTER_DEFINITION(FloatRegister, V30);
+REGISTER_DEFINITION(FloatRegister, V31);
+
+#else // !AARCH64: 32-bit ARM registers S0-S31, Stemp and D0-D31
+
+REGISTER_DEFINITION(FloatRegister, S0);
+REGISTER_DEFINITION(FloatRegister, S1_reg);
+REGISTER_DEFINITION(FloatRegister, S2_reg);
+REGISTER_DEFINITION(FloatRegister, S3_reg);
+REGISTER_DEFINITION(FloatRegister, S4_reg);
+REGISTER_DEFINITION(FloatRegister, S5_reg);
+REGISTER_DEFINITION(FloatRegister, S6_reg);
+REGISTER_DEFINITION(FloatRegister, S7);
+REGISTER_DEFINITION(FloatRegister, S8);
+REGISTER_DEFINITION(FloatRegister, S9);
+REGISTER_DEFINITION(FloatRegister, S10);
+REGISTER_DEFINITION(FloatRegister, S11);
+REGISTER_DEFINITION(FloatRegister, S12);
+REGISTER_DEFINITION(FloatRegister, S13);
+REGISTER_DEFINITION(FloatRegister, S14);
+REGISTER_DEFINITION(FloatRegister, S15);
+REGISTER_DEFINITION(FloatRegister, S16);
+REGISTER_DEFINITION(FloatRegister, S17);
+REGISTER_DEFINITION(FloatRegister, S18);
+REGISTER_DEFINITION(FloatRegister, S19);
+REGISTER_DEFINITION(FloatRegister, S20);
+REGISTER_DEFINITION(FloatRegister, S21);
+REGISTER_DEFINITION(FloatRegister, S22);
+REGISTER_DEFINITION(FloatRegister, S23);
+REGISTER_DEFINITION(FloatRegister, S24);
+REGISTER_DEFINITION(FloatRegister, S25);
+REGISTER_DEFINITION(FloatRegister, S26);
+REGISTER_DEFINITION(FloatRegister, S27);
+REGISTER_DEFINITION(FloatRegister, S28);
+REGISTER_DEFINITION(FloatRegister, S29);
+REGISTER_DEFINITION(FloatRegister, S30);
+REGISTER_DEFINITION(FloatRegister, S31);
+REGISTER_DEFINITION(FloatRegister, Stemp);
+REGISTER_DEFINITION(FloatRegister, D0);
+REGISTER_DEFINITION(FloatRegister, D1);
+REGISTER_DEFINITION(FloatRegister, D2);
+REGISTER_DEFINITION(FloatRegister, D3);
+REGISTER_DEFINITION(FloatRegister, D4);
+REGISTER_DEFINITION(FloatRegister, D5);
+REGISTER_DEFINITION(FloatRegister, D6);
+REGISTER_DEFINITION(FloatRegister, D7);
+REGISTER_DEFINITION(FloatRegister, D8);
+REGISTER_DEFINITION(FloatRegister, D9);
+REGISTER_DEFINITION(FloatRegister, D10);
+REGISTER_DEFINITION(FloatRegister, D11);
+REGISTER_DEFINITION(FloatRegister, D12);
+REGISTER_DEFINITION(FloatRegister, D13);
+REGISTER_DEFINITION(FloatRegister, D14);
+REGISTER_DEFINITION(FloatRegister, D15);
+REGISTER_DEFINITION(FloatRegister, D16);
+REGISTER_DEFINITION(FloatRegister, D17);
+REGISTER_DEFINITION(FloatRegister, D18);
+REGISTER_DEFINITION(FloatRegister, D19);
+REGISTER_DEFINITION(FloatRegister, D20);
+REGISTER_DEFINITION(FloatRegister, D21);
+REGISTER_DEFINITION(FloatRegister, D22);
+REGISTER_DEFINITION(FloatRegister, D23);
+REGISTER_DEFINITION(FloatRegister, D24);
+REGISTER_DEFINITION(FloatRegister, D25);
+REGISTER_DEFINITION(FloatRegister, D26);
+REGISTER_DEFINITION(FloatRegister, D27);
+REGISTER_DEFINITION(FloatRegister, D28);
+REGISTER_DEFINITION(FloatRegister, D29);
+REGISTER_DEFINITION(FloatRegister, D30);
+REGISTER_DEFINITION(FloatRegister, D31);
+
+#endif // AARCH64
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/relocInfo_arm.cpp	2016-12-02 11:23:10.974136517 -0500
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.inline.hpp"
+#include "assembler_arm.inline.hpp"
+#include "code/relocInfo.hpp"
+#include "nativeInst_arm.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/safepoint.hpp"
+
+void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) {
+  // Writes x + o into the constant-load at addr(), or only checks it when verify_only is set.
+  NativeMovConstReg* mov = nativeMovConstReg_at(addr());
+#if defined(AARCH64) && defined(COMPILER2)
+  if (mov->is_movz()) {
+    assert(type() == relocInfo::oop_type, "!");
+    if (!verify_only) {
+      mov->set_data((intptr_t)x);
+      return;
+    }
+    uintptr_t encoded = mov->data();
+    guarantee((encoded >> 32) == 0, "not narrow oop");
+    narrowOop narrow = encoded;
+    oop decoded = oopDesc::decode_heap_oop(narrow);
+    guarantee(cast_from_oop<intptr_t>(decoded) == (intptr_t)x, "instructions must match");
+    return;
+  }
+#endif
+  if (!verify_only) {
+    mov->set_data((intptr_t)(x + o));
+    return;
+  }
+  guarantee(mov->data() == (intptr_t)(x + o), "instructions must match");
+}
+
+address Relocation::pd_call_destination(address orig_addr) {
+  // Returns the target of the call or jump found at addr(). A non-NULL orig_addr is the
+  // place the instruction was moved from and is used to correct relative targets.
+  address insn_pc = addr();
+
+  // We just moved this call instruction from orig_addr to addr().
+  // This means that, when relative, its target will appear to have grown by addr() - orig_addr.
+  int adj = (orig_addr != NULL) ? (orig_addr - insn_pc) : 0;
+
+  RawNativeInstruction* insn = rawNativeInstruction_at(insn_pc);
+
+#ifndef AARCH64
+  // On arm32, skip the optional 'add LR, PC, #offset'
+  // (Allowing the jump support code to handle fat_call)
+  if (insn->is_add_lr()) {
+    insn_pc = insn->next_raw_instruction_address();
+    insn = nativeInstruction_at(insn_pc);
+  }
+#endif
+
+  // For arm32, fat_call are handled by is_jump for the new 'insn',
+  // requiring only to support is_bl.
+  //
+  // For AARCH64, skipping a leading adr is not sufficient
+  // to reduce calls to a simple bl.
+  const bool is_call_insn = AARCH64_ONLY(insn->is_call()) NOT_AARCH64(insn->is_bl());
+  if (is_call_insn) {
+    return rawNativeCall_at(insn_pc)->destination(adj);
+  }
+
+  if (!insn->is_jump()) {
+    ShouldNotReachHere();
+    return NULL;
+  }
+  return rawNativeJump_at(insn_pc)->jump_destination(adj);
+}
+
+void Relocation::pd_set_call_destination(address x) {
+  // Retargets the call or jump instruction found at addr() so that it transfers to x.
+  address insn_pc = addr();
+  NativeInstruction* insn = nativeInstruction_at(insn_pc);
+
+#ifndef AARCH64
+  // On arm32, skip the optional 'add LR, PC, #offset'
+  // (Allowing the jump support code to handle fat_call)
+  if (insn->is_add_lr()) {
+    insn_pc = insn->next_raw_instruction_address();
+    insn = nativeInstruction_at(insn_pc);
+  }
+#endif
+
+  // For arm32, fat_call are handled by is_jump for the new 'insn',
+  // requiring only to support is_bl.
+  //
+  // For AARCH64, skipping a leading adr is not sufficient
+  // to reduce calls to a simple bl.
+  const bool is_call_insn = AARCH64_ONLY(insn->is_call()) NOT_AARCH64(insn->is_bl());
+
+  if (is_call_insn) {
+    rawNativeCall_at(insn_pc)->set_destination(x);
+  } else if (insn->is_jump()) { // raw jump
+    rawNativeJump_at(insn_pc)->set_jump_destination(x);
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+
+address* Relocation::pd_address_in_code() {
+  return reinterpret_cast<address*>(addr());  // view the relocation site as a slot holding an address
+}
+
+address Relocation::pd_get_address_from_code() {
+  return pd_address_in_code()[0];  // read the address stored at the relocation site
+}
+
+void poll_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) {
+}  // empty body: no fix-up is performed for poll relocations on this port; src and dest are unused
+
+void metadata_Relocation::pd_fix_value(address x) {  // x is forwarded to set_value() on the paths that rewrite the site
+  assert(! addr_in_const(), "Do not use");
+#ifdef AARCH64
+#ifdef COMPILER2
+  NativeMovConstReg* ni = nativeMovConstReg_at(addr());
+  if (ni->is_movz()) {
+    return;  // movz form: the instruction is left untouched
+  }
+#endif
+  set_value(x);
+#else  // 32-bit ARM
+  if (!VM_Version::supports_movw()) {  // without movw the value is rewritten; with movw nothing is written
+    set_value(x);
+#ifdef ASSERT
+  } else {
+    // the movw/movt data should be correct
+    NativeMovConstReg* ni = nativeMovConstReg_at(addr());
+    assert(ni->is_movw(), "not a movw");
+    // The following assert should be correct but the shared code
+    // currently 'fixes' the metadata instructions before the
+    // metadata_table is copied in the new method (see
+    // JDK-8042845). This means that 'x' (which comes from the table)
+    // does not match the value inlined in the code (which is
+    // correct). Failure can be temporarily ignored since the code is
+    // correct and the table is copied shortly afterward.
+    //
+    // assert(ni->data() == (int)x, "metadata relocation mismatch");
+#endif
+  }
+#endif // !AARCH64
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/relocInfo_arm.hpp	2016-12-02 11:23:15.930417580 -0500
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_RELOCINFO_ARM_HPP
+#define CPU_ARM_VM_RELOCINFO_ARM_HPP
+
+ private:
+
+  enum {
+    offset_unit  = 4,  // NOTE(review): presumably relocation offsets are counted in 4-byte units - confirm in shared relocInfo.hpp
+    format_width = 0   // NOTE(review): presumably no format bits are used on ARM - confirm in shared relocInfo.hpp
+  };
+
+#endif // CPU_ARM_VM_RELOCINFO_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/runtime_arm.cpp	2016-12-02 11:23:20.922700684 -0500
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#ifdef COMPILER2
+#include "asm/assembler.hpp"
+#include "assembler_arm.inline.hpp"
+#include "classfile/systemDictionary.hpp"
+#include "code/vmreg.hpp"
+#include "interpreter/interpreter.hpp"
+#include "memory/resourceArea.hpp"
+#include "nativeInst_arm.hpp"
+#include "opto/runtime.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/vframeArray.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "vmreg_arm.inline.hpp"
+#endif
+
+#define __ masm->
+
+//------------------------------ generate_exception_blob ---------------------------
+// creates exception blob at the end
+// Using exception blob, this code is jumped from a compiled method.
+// (see emit_exception_handler in the platform .ad file)
+//
+// Given an exception pc at a call we call into the runtime for the
+// handler in this method. This handler might merely restore state
+// (i.e. callee save registers) unwind the frame and jump to the
+// exception handler for the nmethod if there is no Java level handler
+// for the nmethod.
+//
+// This code is entered with a jmp.
+//
+// Arguments:
+//   Rexception_obj (R4/R19): exception oop
+//   Rexception_pc  (R5/R20): exception pc
+//
+// Results:
+//   Rexception_obj (R4/R19): exception oop
+//   Rexception_pc  (R5/R20): exception pc, reloaded from the thread after the runtime call
+//   destination: exception handler of caller
+//
+// Note: the exception pc MUST be at a call (precise debug information)
+//
+void OptoRuntime::generate_exception_blob() {
+  // allocate space for code
+  ResourceMark rm;
+  int pad = VerifyThread ? 256 : 0;// Extra slop space for more verify code
+
+  // setup code generation tools
+  // Measured 8/7/03 at 256 in 32bit debug build (no VerifyThread)
+  // Measured 8/7/03 at 528 in 32bit debug build (VerifyThread)
+  CodeBuffer buffer("exception_blob", 600+pad, 512);
+  MacroAssembler* masm     = new MacroAssembler(&buffer);
+
+  int framesize_in_words = 2; // FP + LR
+  int framesize_in_bytes = framesize_in_words * wordSize;
+  int framesize_in_slots = framesize_in_bytes / sizeof(jint);  // frame size in jint-sized slots, used for the OopMap
+
+  int start = __ offset();  // blob start; the oop map pc below is recorded relative to it
+
+  __ str(Rexception_obj, Address(Rthread, JavaThread::exception_oop_offset()));  // stash the exception oop in the thread
+  __ str(Rexception_pc, Address(Rthread, JavaThread::exception_pc_offset()));    // stash the exception pc in the thread
+
+  // This call does all the hard work. It checks if an exception catch
+  // exists in the method.
+  // If so, it returns the handler address.
+  // If the nmethod has been deoptimized and it had a handler the handler
+  // address is the deopt blob unpack_with_exception entry.
+  //
+  // If no handler exists it prepares for stack-unwinding, restoring the callee-save
+  // registers of the frame being removed.
+  //
+  __ mov(LR, Rexception_pc);  // the exception pc becomes the return address of the frame pushed next
+  __ raw_push(FP, LR);
+  int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp);
+
+  __ mov(R0, Rthread);  // argument for handle_exception_C: the current thread
+
+  // This call can block at exit and nmethod can be deoptimized at that
+  // point. If the nmethod had a catch point we would jump to the
+  // now deoptimized catch point and fall thru the vanilla deopt
+  // path and lose the exception
+  // Sure would be simpler if this call didn't block!
+  __ call(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C));
+  if (pc_offset == -1) {
+    pc_offset = __ offset();  // set_last_Java_frame recorded no pc: use the offset right after the call
+  }
+
+  // Set an oopmap for the call site.  This oopmap will only be used if we
+  // are unwinding the stack.  Hence, all locations will be dead.
+  // Callee-saved registers will be the same as the frame above (i.e.,
+  // handle_exception_stub), since they were restored when we got the
+  // exception.
+
+  OopMapSet *oop_maps = new OopMapSet();
+  oop_maps->add_gc_map(pc_offset - start, new OopMap(framesize_in_slots, 0));
+
+  __ reset_last_Java_frame(Rtemp);
+
+  __ raw_pop(FP, LR);
+
+  // Restore SP from its saved reg (FP) if the exception PC is a MethodHandle call site.
+  __ ldr(Rtemp, Address(Rthread, JavaThread::is_method_handle_return_offset()));
+#ifdef AARCH64
+  Label skip;
+  __ cbz(Rtemp, skip);       // flag is zero: keep SP as is
+  __ mov(SP, Rmh_SP_save);
+  __ bind(skip);
+#else
+  __ cmp(Rtemp, 0);
+  __ mov(SP, Rmh_SP_save, ne);  // conditional move: executed only when the flag is non-zero
+#endif
+
+  // R0 contains handler address
+  // Since this may be the deopt blob we must set Rexception_pc (R5/R20) to look like we returned
+  // from the original pc that threw the exception
+
+  __ ldr(Rexception_pc,  Address(Rthread, JavaThread::exception_pc_offset()));  // R5/R20
+
+  __ ldr(Rexception_obj, Address(Rthread, JavaThread::exception_oop_offset())); // R4/R19
+  __ mov(Rtemp, 0);  // zero used to clear the thread fields below
+#ifdef ASSERT
+  __ str(Rtemp, Address(Rthread, JavaThread::exception_handler_pc_offset()));
+  __ str(Rtemp, Address(Rthread, JavaThread::exception_pc_offset()));
+#endif
+  // Clear the exception oop so GC no longer processes it as a root.
+  __ str(Rtemp, Address(Rthread, JavaThread::exception_oop_offset()));
+  __ jump(R0);  // continue at the handler address returned by handle_exception_C
+
+  // -------------
+  // make sure all code is generated
+  masm->flush();
+
+  _exception_blob = ExceptionBlob::create(&buffer, oop_maps, framesize_in_words);
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/sharedRuntime_arm.cpp	2016-12-02 11:23:25.946985603 -0500
@@ -0,0 +1,2501 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "assembler_arm.inline.hpp"
+#include "code/debugInfoRec.hpp"
+#include "code/icBuffer.hpp"
+#include "code/vtableStubs.hpp"
+#include "interpreter/interpreter.hpp"
+#include "logging/log.hpp"
+#include "memory/resourceArea.hpp"
+#include "oops/compiledICHolder.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/vframeArray.hpp"
+#include "vmreg_arm.inline.hpp"
+#ifdef COMPILER1
+#include "c1/c1_Runtime1.hpp"
+#endif
+#ifdef COMPILER2
+#include "opto/runtime.hpp"
+#endif
+#ifdef SHARK
+#include "compiler/compileBroker.hpp"
+#include "shark/sharkCompiler.hpp"
+#endif
+
+#define __ masm->
+
+class RegisterSaver {  // layout of, and helpers for, the register save area built by save_live_registers
+public:
+
+  // Special registers:
+  //              32-bit ARM     64-bit ARM
+  //  Rthread:       R10            R28
+  //  LR:            R14            R30
+
+  // Rthread is callee saved in the C ABI and never changed by compiled code:
+  // no need to save it.
+
+  // 2 slots for LR: the one at LR_offset and another one at R14/R30_offset.
+  // The one at LR_offset is a return address that is needed by stack walking.
+  // A c2 method uses LR as a standard register so it may be live when we
+  // branch to the runtime. The slot at R14/R30_offset is for the value of LR
+  // in case it's live in the method we are coming from.
+
+#ifdef AARCH64
+
+  //
+  // On AArch64 registers save area has the following layout:
+  //
+  // |---------------------|
+  // | return address (LR) |
+  // | FP                  |
+  // |---------------------|
+  // | V31                 |
+  // | ...                 |
+  // | V0                  |
+  // |---------------------|
+  // | padding             |
+  // | R30 (LR live value) |
+  // |---------------------|
+  // | R27                 |
+  // | ...                 |
+  // | R0                  |
+  // |---------------------| <-- SP
+  //
+
+  enum RegisterLayout {  // all offsets are in words from SP
+    number_of_saved_gprs = 28,  // R0..R27
+    number_of_saved_fprs = FloatRegisterImpl::number_of_registers,
+    words_per_fpr = ConcreteRegisterImpl::words_per_fpr,
+
+    R0_offset  = 0,
+    R30_offset = R0_offset + number_of_saved_gprs,
+    D0_offset  = R30_offset + 2,  // +2: the R30 slot plus one padding word
+    FP_offset  = D0_offset + number_of_saved_fprs * words_per_fpr,
+    LR_offset  = FP_offset + 1,
+
+    reg_save_size = LR_offset + 1,  // total size of the save area in words
+  };
+
+  static const int Rmethod_offset;
+  static const int Rtemp_offset;
+
+#else
+
+  enum RegisterLayout {  // all offsets are in words from SP
+    fpu_save_size = FloatRegisterImpl::number_of_registers,  // words reserved for the FPU registers
+#ifndef __SOFTFP__
+    D0_offset = 0,
+#endif
+    R0_offset = fpu_save_size,
+    R1_offset,
+    R2_offset,
+    R3_offset,
+    R4_offset,
+    R5_offset,
+    R6_offset,
+#if (FP_REG_NUM != 7)
+    // if not saved as FP
+    R7_offset,
+#endif
+    R8_offset,
+    R9_offset,
+#if (FP_REG_NUM != 11)
+    // if not saved as FP
+    R11_offset,
+#endif
+    R12_offset,
+    R14_offset,  // live value of LR
+    FP_offset,
+    LR_offset,   // return address
+    reg_save_size,  // total size of the save area in words
+
+    Rmethod_offset = R9_offset,
+    Rtemp_offset = R12_offset,
+  };
+
+  // all regs but Rthread (R10), FP (R7 or R11), SP and PC
+  // (altFP_7_11 is the one among R7 and R11 which is not FP)
+#define SAVED_BASE_REGS (RegisterSet(R0, R6) | RegisterSet(R8, R9) | RegisterSet(R12) | R14 | altFP_7_11)
+
+#endif // AARCH64
+
+  //  When LR may be live in the nmethod from which we are coming
+  //  then lr_saved is true, the return address is saved before the
+  //  call to save_live_registers by the caller and LR contains the
+  //  live value.
+
+  static OopMap* save_live_registers(MacroAssembler* masm,
+                                     int* total_frame_words,
+                                     bool lr_saved = false);
+  static void restore_live_registers(MacroAssembler* masm, bool restore_lr = true);
+
+};
+
+
+#if defined(AARCH64)
+const int RegisterSaver::Rmethod_offset = R0_offset + Rmethod->encoding();  // word slot of Rmethod in the save area
+const int RegisterSaver::Rtemp_offset   = R0_offset + Rtemp->encoding();    // word slot of Rtemp in the save area
+#endif // AARCH64
+
+
+OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm,     // assembler the save sequence is emitted into
+                                           int* total_frame_words,   // out: set to reg_save_size
+                                           bool lr_saved) {          // true: caller already stored the return address
+  *total_frame_words = reg_save_size;
+  // Emits code that pushes the save area described by RegisterLayout and returns an OopMap
+  // recording the stack slot of each saved register. The map is the only object allocated here.
+  OopMap* map = new OopMap(VMRegImpl::slots_per_word * (*total_frame_words), 0);
+
+#ifdef AARCH64
+  assert((reg_save_size * wordSize) % StackAlignmentInBytes == 0, "SP should be aligned");
+
+  if (lr_saved) {
+    // LR was stashed here, so that jump could use it as a scratch reg
+    __ ldr(LR, Address(SP, 0));
+    // There are two words on the stack top:
+    //  [SP + 0]: placeholder for FP
+    //  [SP + wordSize]: saved return address
+    __ str(FP, Address(SP, 0));
+  } else {
+    __ raw_push(FP, LR);
+  }
+
+  __ sub(SP, SP, (reg_save_size - 2) * wordSize);  // -2: the FP and return-address words are already on the stack
+
+  for (int i = 0; i < number_of_saved_gprs; i += 2) {
+    int offset = R0_offset + i;
+    __ stp(as_Register(i), as_Register(i+1), Address(SP, offset * wordSize));
+    map->set_callee_saved(VMRegImpl::stack2reg((offset + 0) * VMRegImpl::slots_per_word), as_Register(i)->as_VMReg());
+    map->set_callee_saved(VMRegImpl::stack2reg((offset + 1) * VMRegImpl::slots_per_word), as_Register(i+1)->as_VMReg());
+  }
+
+  __ str(R30, Address(SP, R30_offset * wordSize));  // live value of LR, separate from the return-address slot
+  map->set_callee_saved(VMRegImpl::stack2reg(R30_offset * VMRegImpl::slots_per_word), R30->as_VMReg());
+
+  for (int i = 0; i < number_of_saved_fprs; i += 2) {
+    int offset1 = D0_offset + i * words_per_fpr;
+    int offset2 = offset1 + words_per_fpr;
+    Address base(SP, offset1 * wordSize);
+    if (words_per_fpr == 2) {
+      // pair of "wide" quad vector registers
+      __ stp_q(as_FloatRegister(i), as_FloatRegister(i+1), base);
+    } else {
+      // pair of double vector registers
+      __ stp_d(as_FloatRegister(i), as_FloatRegister(i+1), base);
+    }
+    map->set_callee_saved(VMRegImpl::stack2reg(offset1 * VMRegImpl::slots_per_word), as_FloatRegister(i)->as_VMReg());
+    map->set_callee_saved(VMRegImpl::stack2reg(offset2 * VMRegImpl::slots_per_word), as_FloatRegister(i+1)->as_VMReg());
+  }
+#else
+  if (lr_saved) {
+    __ push(RegisterSet(FP));  // the return address is already on the stack
+  } else {
+    __ push(RegisterSet(FP) | RegisterSet(LR));
+  }
+  __ push(SAVED_BASE_REGS);
+  if (HaveVFP) {
+    if (VM_Version::has_vfp3_32()) {
+      __ fstmdbd(SP, FloatRegisterSet(D16, 16), writeback);
+    } else {
+      if (FloatRegisterImpl::number_of_registers > 32) {
+        assert(FloatRegisterImpl::number_of_registers == 64, "nb fp registers should be 64");
+        __ sub(SP, SP, 32 * wordSize);  // keep the layout: reserve the words D16-D31 would have used
+      }
+    }
+    __ fstmdbd(SP, FloatRegisterSet(D0, 16), writeback);
+  } else {
+    __ sub(SP, SP, fpu_save_size * wordSize);  // no VFP: reserve the FPU area without storing anything
+  }
+
+  int i;
+  int j=0;
+  for (i = R0_offset; i <= R9_offset; i++) {
+    if (j == FP_REG_NUM) {
+      // skip the FP register, managed below.
+      j++;
+    }
+    map->set_callee_saved(VMRegImpl::stack2reg(i), as_Register(j)->as_VMReg());
+    j++;
+  }
+  assert(j == R10->encoding(), "must be");
+#if (FP_REG_NUM != 11)
+  // add R11, if not managed as FP
+  map->set_callee_saved(VMRegImpl::stack2reg(R11_offset), R11->as_VMReg());
+#endif
+  map->set_callee_saved(VMRegImpl::stack2reg(R12_offset), R12->as_VMReg());
+  map->set_callee_saved(VMRegImpl::stack2reg(R14_offset), R14->as_VMReg());
+  if (HaveVFP) {
+    for (i = 0; i < (VM_Version::has_vfp3_32() ? 64 : 32); i+=2) {
+      map->set_callee_saved(VMRegImpl::stack2reg(i), as_FloatRegister(i)->as_VMReg());
+      map->set_callee_saved(VMRegImpl::stack2reg(i + 1), as_FloatRegister(i)->as_VMReg()->next());
+    }
+  }
+#endif // AARCH64
+
+  return map;
+}
+
+void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_lr) {  // Emits code that reloads the saved registers from the area at SP and releases it; LR is reloaded only if restore_lr.
+#ifdef AARCH64
+  for (int i = 0; i < number_of_saved_gprs; i += 2) {  // general registers are reloaded two at a time
+    __ ldp(as_Register(i), as_Register(i+1), Address(SP, (R0_offset + i) * wordSize));
+  }
+
+  __ ldr(R30, Address(SP, R30_offset * wordSize));  // R30 is reloaded on its own from the slot at R30_offset
+
+  for (int i = 0; i < number_of_saved_fprs; i += 2) {  // FP/vector registers are also reloaded in pairs
+    Address base(SP, (D0_offset + i * words_per_fpr) * wordSize);  // register i sits i * words_per_fpr words past D0_offset
+    if (words_per_fpr == 2) {
+      // pair of "wide" quad vector registers
+      __ ldp_q(as_FloatRegister(i), as_FloatRegister(i+1), base);
+    } else {
+      // pair of double vector registers
+      __ ldp_d(as_FloatRegister(i), as_FloatRegister(i+1), base);
+    }
+  }
+
+  __ add(SP, SP, (reg_save_size - 2) * wordSize);  // release everything except the last two words, which are handled below
+
+  if (restore_lr) {
+    __ raw_pop(FP, LR);  // pop both remaining words into FP and LR
+  } else {
+    __ ldr(FP, Address(SP, 0));  // reload FP only; plain offset addressing, so SP is not adjusted and both words stay on the stack
+  }
+#else
+  if (HaveVFP) {
+    __ fldmiad(SP, FloatRegisterSet(D0, 16), writeback);  // reload the 16-register set starting at D0, advancing SP
+    if (VM_Version::has_vfp3_32()) {
+      __ fldmiad(SP, FloatRegisterSet(D16, 16), writeback);  // second set of 16, starting at D16
+    } else {
+      if (FloatRegisterImpl::number_of_registers > 32) {
+        assert(FloatRegisterImpl::number_of_registers == 64, "nb fp registers should be 64");
+        __ add(SP, SP, 32 * wordSize);  // nothing to load for the second set: just step SP over 32 words
+      }
+    }
+  } else {
+    __ add(SP, SP, fpu_save_size * wordSize);  // no VFP: step over the whole FPU area without loading anything
+  }
+  __ pop(SAVED_BASE_REGS);
+  if (restore_lr) {
+    __ pop(RegisterSet(FP) | RegisterSet(LR));
+  } else {
+    __ pop(RegisterSet(FP));  // only FP is popped; LR is not touched on this path
+  }
+#endif // AARCH64
+}
+
+#ifdef AARCH64
+
+static void push_result_registers(MacroAssembler* masm, BasicType ret_type) {  // AArch64: emits code pushing the result register chosen by ret_type (D0 for float/double, R0 otherwise).
+  if (ret_type == T_DOUBLE || ret_type == T_FLOAT) {
+    __ str_d(D0, Address(SP, -2*wordSize, pre_indexed));  // pre-indexed store: SP moves down two words, D0 is written at the new SP
+  } else {
+    __ raw_push(R0, ZR);  // R0 is pushed together with ZR, so two registers go onto the stack here as well
+  }
+}
+
+static void pop_result_registers(MacroAssembler* masm, BasicType ret_type) {  // AArch64: emits the inverse of push_result_registers for the same ret_type.
+  if (ret_type == T_DOUBLE || ret_type == T_FLOAT) {
+    __ ldr_d(D0, Address(SP, 2*wordSize, post_indexed));  // post-indexed load: D0 is read at SP, then SP moves up two words
+  } else {
+    __ raw_pop(R0, ZR);  // R0 gets its value back; the second popped word is directed at ZR
+  }
+}
+
+static void push_param_registers(MacroAssembler* masm, int fp_regs_in_arguments) {  // AArch64: emits code pushing R0-R7 and, depending on fp_regs_in_arguments, pairs of V0-V7.
+  __ raw_push(R0, R1);  // R0..R7 are always pushed, one pair per instruction
+  __ raw_push(R2, R3);
+  __ raw_push(R4, R5);
+  __ raw_push(R6, R7);
+
+  assert(FPR_PARAMS == 8, "adjust this code");  // the four conditional stores below are written for exactly 8 FP parameter registers
+  assert((0 <= fp_regs_in_arguments) && (fp_regs_in_arguments <= FPR_PARAMS), "should be");
+
+  if (fp_regs_in_arguments > 6) __ stp_d(V6, V7, Address(SP, -2 * wordSize, pre_indexed));  // highest pair first, so the V0/V1 pair (if any) is pushed last
+  if (fp_regs_in_arguments > 4) __ stp_d(V4, V5, Address(SP, -2 * wordSize, pre_indexed));
+  if (fp_regs_in_arguments > 2) __ stp_d(V2, V3, Address(SP, -2 * wordSize, pre_indexed));
+  if (fp_regs_in_arguments > 0) __ stp_d(V0, V1, Address(SP, -2 * wordSize, pre_indexed));  // an odd count still stores a whole pair
+}
+
+static void pop_param_registers(MacroAssembler* masm, int fp_regs_in_arguments) {  // AArch64: emits the inverse of push_param_registers; must be given the same fp_regs_in_arguments.
+  assert(FPR_PARAMS == 8, "adjust this code");  // the four conditional loads below are written for exactly 8 FP parameter registers
+  assert((0 <= fp_regs_in_arguments) && (fp_regs_in_arguments <= FPR_PARAMS), "should be");
+
+  if (fp_regs_in_arguments > 0) __ ldp_d(V0, V1, Address(SP, 2 * wordSize, post_indexed));  // same conditions as the push, in the opposite order
+  if (fp_regs_in_arguments > 2) __ ldp_d(V2, V3, Address(SP, 2 * wordSize, post_indexed));
+  if (fp_regs_in_arguments > 4) __ ldp_d(V4, V5, Address(SP, 2 * wordSize, post_indexed));
+  if (fp_regs_in_arguments > 6) __ ldp_d(V6, V7, Address(SP, 2 * wordSize, post_indexed));
+
+  __ raw_pop(R6, R7);  // general registers come back in reverse push order
+  __ raw_pop(R4, R5);
+  __ raw_pop(R2, R3);
+  __ raw_pop(R0, R1);
+}
+
+#else // AARCH64
+
+static void push_result_registers(MacroAssembler* masm, BasicType ret_type) {  // 32-bit ARM: emits code saving the result registers on the stack (D0 or the R0/R1 pair).
+#ifdef __ABI_HARD__
+  if (ret_type == T_DOUBLE || ret_type == T_FLOAT) {
+    __ sub(SP, SP, 8);  // make room for D0 (8 bytes) and store it at the new SP
+    __ fstd(D0, Address(SP));
+    return;
+  }
+#endif // __ABI_HARD__
+  __ raw_push(R0, R1);  // every other case, and all cases when __ABI_HARD__ is not defined
+}
+
+static void pop_result_registers(MacroAssembler* masm, BasicType ret_type) {  // 32-bit ARM: emits the inverse of push_result_registers for the same ret_type.
+#ifdef __ABI_HARD__
+  if (ret_type == T_DOUBLE || ret_type == T_FLOAT) {
+    __ fldd(D0, Address(SP));  // reload D0 from the top of the stack, then give the 8 bytes back
+    __ add(SP, SP, 8);
+    return;
+  }
+#endif // __ABI_HARD__
+  __ raw_pop(R0, R1);  // every other case, and all cases when __ABI_HARD__ is not defined
+}
+
+static void push_param_registers(MacroAssembler* masm, int fp_regs_in_arguments) {  // 32-bit ARM: emits code pushing R0-R3 and, with __ABI_HARD__, the FP argument registers in use.
+  // R1-R3 arguments need to be saved, but we push 4 registers for 8-byte alignment
+  __ push(RegisterSet(R0, R3));
+
+#ifdef __ABI_HARD__
+  // preserve arguments
+  // Likely not needed as the locking code won't probably modify volatile FP registers,
+  // but there is no way to guarantee that
+  if (fp_regs_in_arguments) {
+    // convert fp_regs_in_arguments to a number of double registers
+    int double_regs_num = (fp_regs_in_arguments + 1) >> 1;  // halve, rounding up
+    __ fstmdbd(SP, FloatRegisterSet(D0, double_regs_num), writeback);  // store that many double registers starting at D0, updating SP
+  }
+#endif // __ABI_HARD__
+}
+
+static void pop_param_registers(MacroAssembler* masm, int fp_regs_in_arguments) {  // 32-bit ARM: emits the inverse of push_param_registers; must be given the same fp_regs_in_arguments.
+#ifdef __ABI_HARD__
+  if (fp_regs_in_arguments) {
+    int double_regs_num = (fp_regs_in_arguments + 1) >> 1;  // same rounding-up conversion to double registers as in the push
+    __ fldmiad(SP, FloatRegisterSet(D0, double_regs_num), writeback);  // FP registers were pushed last, so they are reloaded first
+  }
+#endif // __ABI_HARD__
+
+  __ pop(RegisterSet(R0, R3));
+}
+
+#endif // AARCH64
+
+
+// Is the vector's size (in bytes) bigger than the size saved by default?
+// All vector registers are saved by default on ARM.
+bool SharedRuntime::is_wide_vector(int size) {  // size is not examined
+  return false;  // constant answer, for the reason given above
+}
+
+size_t SharedRuntime::trampoline_size() {  // Reports a fixed size for a trampoline.
+  return 16;  // NOTE(review): presumably bytes, and presumably covering what generate_trampoline emits - confirm
+}
+
+void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) {  // Emits an indirect jump to destination via an inlined address literal.
+  InlinedAddress dest(destination);  // literal wrapping the target address
+  __ indirect_jump(dest, Rtemp);  // Rtemp is handed over as the scratch register for the jump
+  __ bind_literal(dest);  // the literal is bound right after the jump instruction(s)
+}
+
+int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
+                                        VMRegPair *regs,
+                                        VMRegPair *regs2,
+                                        int total_args_passed) {  // Fills regs[i] with the register or stack location chosen for each sig_bt[i]; returns the stack slot count used.
+  assert(regs2 == NULL, "not needed on arm");
+#ifdef AARCH64
+  int slot = 0; // counted in 32-bit VMReg slots
+  int reg = 0;  // next free general-purpose argument register
+  int fp_reg = 0;  // next free floating-point argument register
+  for (int i = 0; i < total_args_passed; i++) {
+    switch (sig_bt[i]) {
+    case T_SHORT:
+    case T_CHAR:
+    case T_BYTE:
+    case T_BOOLEAN:
+    case T_INT:
+      if (reg < GPR_PARAMS) {
+        Register r = as_Register(reg);
+        regs[i].set1(r->as_VMReg());
+        reg++;
+      } else {
+        regs[i].set1(VMRegImpl::stack2reg(slot));
+        slot+=2;  // a stack argument advances by two slots even for these narrow types
+      }
+      break;
+    case T_LONG:
+      assert(sig_bt[i+1] == T_VOID, "missing Half" );
+      // fall through
+    case T_ARRAY:
+    case T_OBJECT:
+    case T_ADDRESS:
+      if (reg < GPR_PARAMS) {
+        Register r = as_Register(reg);
+        regs[i].set2(r->as_VMReg());  // set2: recorded as a two-slot value held in one register
+        reg++;
+      } else {
+        regs[i].set2(VMRegImpl::stack2reg(slot));
+        slot+=2;
+      }
+      break;
+    case T_FLOAT:
+      if (fp_reg < FPR_PARAMS) {
+        FloatRegister r = as_FloatRegister(fp_reg);
+        regs[i].set1(r->as_VMReg());
+        fp_reg++;
+      } else {
+        regs[i].set1(VMRegImpl::stack2reg(slot));
+        slot+=2;
+      }
+      break;
+    case T_DOUBLE:
+      assert(sig_bt[i+1] == T_VOID, "missing Half" );
+      if (fp_reg < FPR_PARAMS) {
+        FloatRegister r = as_FloatRegister(fp_reg);
+        regs[i].set2(r->as_VMReg());
+        fp_reg++;
+      } else {
+        regs[i].set2(VMRegImpl::stack2reg(slot));
+        slot+=2;
+      }
+      break;
+    case T_VOID:
+      assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
+      regs[i].set_bad();  // the trailing half of a long/double gets no location of its own
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+  }
+  return slot;
+
+#else // AARCH64
+
+  int slot = 0;  // next free outgoing stack slot
+  int ireg = 0;  // next free core argument register; only values below 4 (R0-R3) are handed out
+#ifdef __ABI_HARD__
+  int fp_slot = 0;  // next free pair of single-precision registers (always advanced by 2)
+  int single_fpr_slot = 0;  // next single register for a float; odd means half of an already claimed pair is still free
+#endif // __ABI_HARD__
+  for (int i = 0; i < total_args_passed; i++) {
+    switch (sig_bt[i]) {
+    case T_SHORT:
+    case T_CHAR:
+    case T_BYTE:
+    case T_BOOLEAN:
+    case T_INT:
+    case T_ARRAY:
+    case T_OBJECT:
+    case T_ADDRESS:
+#ifndef __ABI_HARD__
+    case T_FLOAT:
+#endif // !__ABI_HARD__
+      if (ireg < 4) {
+        Register r = as_Register(ireg);
+        regs[i].set1(r->as_VMReg());
+        ireg++;
+      } else {
+        regs[i].set1(VMRegImpl::stack2reg(slot));
+        slot++;
+      }
+      break;
+    case T_LONG:
+#ifndef __ABI_HARD__
+    case T_DOUBLE:
+#endif // !__ABI_HARD__
+      assert(sig_bt[i+1] == T_VOID, "missing Half" );
+      if (ireg <= 2) {
+#if (ALIGN_WIDE_ARGUMENTS == 1)
+        if(ireg & 1) ireg++;  // Aligned location required
+#endif
+        Register r1 = as_Register(ireg);
+        Register r2 = as_Register(ireg + 1);
+        regs[i].set_pair(r2->as_VMReg(), r1->as_VMReg());  // two consecutive core registers hold the value
+        ireg += 2;
+#if (ALIGN_WIDE_ARGUMENTS == 0)
+      } else if (ireg == 3) {
+        // uses R3 + one stack slot
+        Register r = as_Register(ireg);
+        regs[i].set_pair(VMRegImpl::stack2reg(slot), r->as_VMReg());
+        ireg += 1;
+        slot += 1;
+#endif
+      } else {
+        if (slot & 1) slot++; // Aligned location required
+        regs[i].set_pair(VMRegImpl::stack2reg(slot+1), VMRegImpl::stack2reg(slot));
+        slot += 2;
+        ireg = 4;  // once a wide value is on the stack, no core register is handed out to later arguments
+      }
+      break;
+    case T_VOID:
+      regs[i].set_bad();  // second half of a long/double: no location of its own
+      break;
+#ifdef __ABI_HARD__
+    case T_FLOAT:
+      if ((fp_slot < 16)||(single_fpr_slot & 1)) {  // a fresh pair is available, or a leftover odd single can be back-filled
+        if ((single_fpr_slot & 1) == 0) {
+          single_fpr_slot = fp_slot;  // claim a new pair and use its first single register
+          fp_slot += 2;
+        }
+        FloatRegister r = as_FloatRegister(single_fpr_slot);
+        single_fpr_slot++;
+        regs[i].set1(r->as_VMReg());
+      } else {
+        regs[i].set1(VMRegImpl::stack2reg(slot));
+        slot++;
+      }
+      break;
+    case T_DOUBLE:
+      assert(ALIGN_WIDE_ARGUMENTS == 1, "ABI_HARD not supported with unaligned wide arguments");
+      if (fp_slot <= 14) {
+        FloatRegister r1 = as_FloatRegister(fp_slot);
+        FloatRegister r2 = as_FloatRegister(fp_slot+1);
+        regs[i].set_pair(r2->as_VMReg(), r1->as_VMReg());  // the double occupies a whole pair of single registers
+        fp_slot += 2;
+      } else {
+        if(slot & 1) slot++;  // doubles on the stack start at an even slot
+        regs[i].set_pair(VMRegImpl::stack2reg(slot+1), VMRegImpl::stack2reg(slot));
+        slot += 2;
+        single_fpr_slot = 16;  // even value: later floats can no longer back-fill a leftover single register
+      }
+      break;
+#endif // __ABI_HARD__
+    default:
+      ShouldNotReachHere();
+    }
+  }
+  return slot;
+#endif // AARCH64
+}
+
+int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
+                                           VMRegPair *regs,
+                                           int total_args_passed,
+                                           int is_outgoing) {  // Fills regs[i] with the location of each Java argument in sig_bt; returns the stack slot count used.
+#ifdef AARCH64
+  // C calling convention on AArch64 is good enough.
+  return c_calling_convention(sig_bt, regs, NULL, total_args_passed);
+#else
+#ifdef __SOFTFP__
+  // soft float is the same as the C calling convention.
+  return c_calling_convention(sig_bt, regs, NULL, total_args_passed);
+#endif // __SOFTFP__
+  (void) is_outgoing;  // parameter is not used by the code below
+  int slot = 0;  // next free stack slot
+  int ireg = 0;  // next free core register; only values below 4 (R0-R3) are handed out
+  int freg = 0;  // next free pair of single-precision registers (always advanced by 2)
+  int single_fpr = 0;  // next single register for a float; odd means half of an already claimed pair is still free
+
+  for (int i = 0; i < total_args_passed; i++) {
+    switch (sig_bt[i]) {
+    case T_SHORT:
+    case T_CHAR:
+    case T_BYTE:
+    case T_BOOLEAN:
+    case T_INT:
+    case T_ARRAY:
+    case T_OBJECT:
+    case T_ADDRESS:
+      if (ireg < 4) {
+        Register r = as_Register(ireg++);
+        regs[i].set1(r->as_VMReg());
+      } else {
+        regs[i].set1(VMRegImpl::stack2reg(slot++));
+      }
+      break;
+    case T_FLOAT:
+      // C2 utilizes S14/S15 for mem-mem moves
+      if ((freg < 16 COMPILER2_PRESENT(-2)) || (single_fpr & 1)) {  // limit is 16, or 16-2 where COMPILER2_PRESENT expands its argument
+        if ((single_fpr & 1) == 0) {
+          single_fpr = freg;  // claim a new pair and use its first single register
+          freg += 2;
+        }
+        FloatRegister r = as_FloatRegister(single_fpr++);
+        regs[i].set1(r->as_VMReg());
+      } else {
+        regs[i].set1(VMRegImpl::stack2reg(slot++));
+      }
+      break;
+    case T_DOUBLE:
+      // C2 utilizes S14/S15 for mem-mem moves
+      if (freg <= 14 COMPILER2_PRESENT(-2)) {
+        FloatRegister r1 = as_FloatRegister(freg);
+        FloatRegister r2 = as_FloatRegister(freg + 1);
+        regs[i].set_pair(r2->as_VMReg(), r1->as_VMReg());  // the double occupies a whole pair of single registers
+        freg += 2;
+      } else {
+        // Keep internally the aligned calling convention,
+        // ignoring ALIGN_WIDE_ARGUMENTS
+        if (slot & 1) slot++;
+        regs[i].set_pair(VMRegImpl::stack2reg(slot + 1), VMRegImpl::stack2reg(slot));
+        slot += 2;
+        single_fpr = 16;  // even value: later floats can no longer back-fill a leftover single register
+      }
+      break;
+    case T_LONG:
+      // Keep internally the aligned calling convention,
+      // ignoring ALIGN_WIDE_ARGUMENTS
+      if (ireg <= 2) {
+        if (ireg & 1) ireg++;  // start the pair at an even register number
+        Register r1 = as_Register(ireg);
+        Register r2 = as_Register(ireg + 1);
+        regs[i].set_pair(r2->as_VMReg(), r1->as_VMReg());
+        ireg += 2;
+      } else {
+        if (slot & 1) slot++;  // start the pair at an even stack slot
+        regs[i].set_pair(VMRegImpl::stack2reg(slot + 1), VMRegImpl::stack2reg(slot));
+        slot += 2;
+        ireg = 4;  // once a long is on the stack, no core register is handed out to later arguments
+      }
+      break;
+    case T_VOID:
+      regs[i].set_bad();  // second half of a long/double: no location of its own
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+  }
+
+  if (slot & 1) slot++;  // the returned slot count is rounded up to an even number
+  return slot;
+#endif // AARCH64
+}
+
+static void patch_callers_callsite(MacroAssembler *masm) {  // Emits a call to SharedRuntime::fixup_callers_callsite, made only when Rmethod's code field is non-zero.
+  Label skip;
+
+  __ ldr(Rtemp, Address(Rmethod, Method::code_offset()));
+  __ cbz(Rtemp, skip);  // code field is zero: nothing to fix up
+
+#ifdef AARCH64
+  push_param_registers(masm, FPR_PARAMS);  // save all argument registers, including every FP parameter register
+  __ raw_push(LR, ZR);  // LR is pushed together with ZR, so two registers go onto the stack
+#else
+  // Pushing an even number of registers for stack alignment.
+  // Selecting R9, which had to be saved anyway for some platforms.
+  __ push(RegisterSet(R0, R3) | R9 | LR);
+#endif // AARCH64
+
+  __ mov(R0, Rmethod);  // first argument: the method
+  __ mov(R1, LR);  // second argument: the value of LR on entry (not modified since)
+  __ call(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite));
+
+#ifdef AARCH64
+  __ raw_pop(LR, ZR);  // restore in reverse order of the pushes above
+  pop_param_registers(masm, FPR_PARAMS);
+#else
+  __ pop(RegisterSet(R0, R3) | R9 | LR);
+#endif // AARCH64
+
+  __ bind(skip);
+}
+
+void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm,
+                                    int total_args_passed, int comp_args_on_stack,
+                                    const BasicType *sig_bt, const VMRegPair *regs) {  // Emits code that moves interpreter arguments into the locations given by regs, then jumps to the method's from_compiled entry.
+  // TODO: ARM - Maybe ldm can be used to load arguments
+  const Register tmp = Rtemp; // avoid erasing R5_mh
+
+  // Next assert may not be needed but safer. Extra analysis required
+  // if there are not enough free registers and we need to use R5 here.
+  assert_different_registers(tmp, R5_mh);
+
+  // 6243940 We might end up in handle_wrong_method if
+  // the callee is deoptimized as we race thru here. If that
+  // happens we don't want to take a safepoint because the
+  // caller frame will look interpreted and arguments are now
+  // "compiled" so it is much better to make this transition
+  // invisible to the stack walking code. Unfortunately if
+  // we try and find the callee by normal means a safepoint
+  // is possible. So we stash the desired callee in the thread
+  // and the vm will find there should this case occur.
+  Address callee_target_addr(Rthread, JavaThread::callee_target_offset());
+  __ str(Rmethod, callee_target_addr);
+
+#ifdef AARCH64
+
+  assert_different_registers(tmp, R0, R1, R2, R3, R4, R5, R6, R7, Rsender_sp, Rmethod);
+  assert_different_registers(tmp, R0, R1, R2, R3, R4, R5, R6, R7, Rsender_sp, Rparams);
+
+  if (comp_args_on_stack) {
+    __ sub_slow(SP, SP, round_to(comp_args_on_stack * VMRegImpl::stack_slot_size, StackAlignmentInBytes));  // room for stack-passed arguments, rounded to the stack alignment
+  }
+
+  for (int i = 0; i < total_args_passed; i++) {
+    if (sig_bt[i] == T_VOID) {
+      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
+      continue;  // second half of a long/double: already handled with the first half
+    }
+    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), "must be ordered");
+
+    int expr_slots_count = (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) ? 2 : 1;  // long/double take two interpreter slots
+    Address source_addr(Rparams, Interpreter::expr_offset_in_bytes(total_args_passed - expr_slots_count - i));  // interpreter-side location of argument i, relative to Rparams
+
+    VMReg r = regs[i].first();
+    bool full_word = regs[i].second()->is_valid();  // a valid second half selects the full-width load/store below
+
+    if (r->is_stack()) {
+      if (full_word) {
+        __ ldr(tmp, source_addr);  // interpreter slot -> outgoing stack slot, via tmp
+        __ str(tmp, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
+      } else {
+        __ ldr_w(tmp, source_addr);
+        __ str_w(tmp, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
+      }
+    } else if (r->is_Register()) {
+      if (full_word) {
+        __ ldr(r->as_Register(), source_addr);
+      } else {
+        __ ldr_w(r->as_Register(), source_addr);
+      }
+    } else if (r->is_FloatRegister()) {
+      if (sig_bt[i] == T_DOUBLE) {
+        __ ldr_d(r->as_FloatRegister(), source_addr);
+      } else {
+        __ ldr_s(r->as_FloatRegister(), source_addr);
+      }
+    } else {
+      assert(!r->is_valid() && !regs[i].second()->is_valid(), "must be");
+    }
+  }
+
+  __ ldr(tmp, Address(Rmethod, Method::from_compiled_offset()));  // jump to the entry stored at Method::from_compiled_offset()
+  __ br(tmp);
+
+#else
+
+  assert_different_registers(tmp, R0, R1, R2, R3, Rsender_sp, Rmethod);
+
+  const Register initial_sp = Rmethod; // temporarily scratched
+
+  // Old code was modifying R4 but this looks unsafe (particularly with JSR292)
+  assert_different_registers(tmp, R0, R1, R2, R3, Rsender_sp, initial_sp);
+
+  __ mov(initial_sp, SP);  // remember the incoming SP: arguments are read relative to it after SP moves
+
+  if (comp_args_on_stack) {
+    __ sub_slow(SP, SP, comp_args_on_stack * VMRegImpl::stack_slot_size);
+  }
+  __ bic(SP, SP, StackAlignmentInBytes - 1);  // clear the low bits of SP, aligning it down to StackAlignmentInBytes
+
+  for (int i = 0; i < total_args_passed; i++) {
+    if (sig_bt[i] == T_VOID) {
+      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
+      continue;  // second half of a long/double: already handled with the first half
+    }
+    assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(), "must be ordered");
+    int arg_offset = Interpreter::expr_offset_in_bytes(total_args_passed - 1 - i);  // interpreter-side offset of argument i from initial_sp
+
+    VMReg r_1 = regs[i].first();
+    VMReg r_2 = regs[i].second();
+    if (r_1->is_stack()) {
+      int stack_offset = r_1->reg2stack() * VMRegImpl::stack_slot_size;
+      if (!r_2->is_valid()) {
+        __ ldr(tmp, Address(initial_sp, arg_offset));  // single word: copy through tmp
+        __ str(tmp, Address(SP, stack_offset));
+      } else {
+        __ ldr(tmp, Address(initial_sp, arg_offset - Interpreter::stackElementSize));  // two words: the first half is read one stack element below arg_offset
+        __ str(tmp, Address(SP, stack_offset));
+        __ ldr(tmp, Address(initial_sp, arg_offset));
+        __ str(tmp, Address(SP, stack_offset + wordSize));
+      }
+    } else if (r_1->is_Register()) {
+      if (!r_2->is_valid()) {
+        __ ldr(r_1->as_Register(), Address(initial_sp, arg_offset));
+      } else {
+        __ ldr(r_1->as_Register(), Address(initial_sp, arg_offset - Interpreter::stackElementSize));  // same two-word layout as the stack case
+        __ ldr(r_2->as_Register(), Address(initial_sp, arg_offset));
+      }
+    } else if (r_1->is_FloatRegister()) {
+#ifdef __SOFTFP__
+      ShouldNotReachHere();
+#endif // __SOFTFP__
+      if (!r_2->is_valid()) {
+        __ flds(r_1->as_FloatRegister(), Address(initial_sp, arg_offset));
+      } else {
+        __ fldd(r_1->as_FloatRegister(), Address(initial_sp, arg_offset - Interpreter::stackElementSize));  // one double load starting at the lower of the two elements
+      }
+    } else {
+      assert(!r_1->is_valid() && !r_2->is_valid(), "must be");
+    }
+  }
+
+  // restore Rmethod (scratched for initial_sp)
+  __ ldr(Rmethod, callee_target_addr);
+  __ ldr(PC, Address(Rmethod, Method::from_compiled_offset()));  // loading PC performs the jump
+
+#endif // AARCH64
+}
+
+static void gen_c2i_adapter(MacroAssembler *masm,
+                            int total_args_passed,  int comp_args_on_stack,
+                            const BasicType *sig_bt, const VMRegPair *regs,
+                            Label& skip_fixup) {  // Emits code that spills arguments from the locations in regs to interpreter stack slots, then jumps to the method's interpreter entry.
+  // TODO: ARM - May be can use stm to deoptimize arguments
+  const Register tmp = Rtemp;
+
+  patch_callers_callsite(masm);
+  __ bind(skip_fixup);  // callers that branch to skip_fixup enter here, after the call-site patching code
+
+  __ mov(Rsender_sp, SP); // not yet saved
+
+#ifdef AARCH64
+
+  int extraspace = round_to(total_args_passed * Interpreter::stackElementSize, StackAlignmentInBytes);  // one stack element per argument slot, rounded to the stack alignment
+  if (extraspace) {
+    __ sub(SP, SP, extraspace);
+  }
+
+  for (int i = 0; i < total_args_passed; i++) {
+    if (sig_bt[i] == T_VOID) {
+      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
+      continue;  // second half of a long/double: already handled with the first half
+    }
+
+    int expr_slots_count = (sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) ? 2 : 1;  // long/double take two interpreter slots
+    Address dest_addr(SP, Interpreter::expr_offset_in_bytes(total_args_passed - expr_slots_count - i));  // interpreter-side destination of argument i
+
+    VMReg r = regs[i].first();
+    bool full_word = regs[i].second()->is_valid();  // a valid second half selects the full-width load/store below
+
+    if (r->is_stack()) {
+      if (full_word) {
+        __ ldr(tmp, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size + extraspace));  // incoming stack arguments now lie extraspace bytes above the lowered SP
+        __ str(tmp, dest_addr);
+      } else {
+        __ ldr_w(tmp, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size + extraspace));
+        __ str_w(tmp, dest_addr);
+      }
+    } else if (r->is_Register()) {
+      if (full_word) {
+        __ str(r->as_Register(), dest_addr);
+      } else {
+        __ str_w(r->as_Register(), dest_addr);
+      }
+    } else if (r->is_FloatRegister()) {
+      if (sig_bt[i] == T_DOUBLE) {
+        __ str_d(r->as_FloatRegister(), dest_addr);
+      } else {
+        __ str_s(r->as_FloatRegister(), dest_addr);
+      }
+    } else {
+      assert(!r->is_valid() && !regs[i].second()->is_valid(), "must be");
+    }
+  }
+
+  __ mov(Rparams, SP);  // Rparams is set to the base of the argument area just filled
+
+  __ ldr(tmp, Address(Rmethod, Method::interpreter_entry_offset()));  // jump to the entry stored at Method::interpreter_entry_offset()
+  __ br(tmp);
+
+#else
+
+  int extraspace = total_args_passed * Interpreter::stackElementSize;  // one stack element per argument slot
+  if (extraspace) {
+    __ sub_slow(SP, SP, extraspace);
+  }
+
+  for (int i = 0; i < total_args_passed; i++) {
+    if (sig_bt[i] == T_VOID) {
+      assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
+      continue;  // second half of a long/double: already handled with the first half
+    }
+    int stack_offset = (total_args_passed - 1 - i) * Interpreter::stackElementSize;  // argument 0 lands highest, the last argument at offset 0
+
+    VMReg r_1 = regs[i].first();
+    VMReg r_2 = regs[i].second();
+    if (r_1->is_stack()) {
+      int arg_offset = r_1->reg2stack() * VMRegImpl::stack_slot_size + extraspace;  // incoming stack arguments now lie extraspace bytes above the lowered SP
+      if (!r_2->is_valid()) {
+        __ ldr(tmp, Address(SP, arg_offset));
+        __ str(tmp, Address(SP, stack_offset));
+      } else {
+        __ ldr(tmp, Address(SP, arg_offset));
+        __ str(tmp, Address(SP, stack_offset - Interpreter::stackElementSize));  // two words: the first half is written one stack element below stack_offset
+        __ ldr(tmp, Address(SP, arg_offset + wordSize));
+        __ str(tmp, Address(SP, stack_offset));
+      }
+    } else if (r_1->is_Register()) {
+      if (!r_2->is_valid()) {
+        __ str(r_1->as_Register(), Address(SP, stack_offset));
+      } else {
+        __ str(r_1->as_Register(), Address(SP, stack_offset - Interpreter::stackElementSize));  // same two-word layout as the stack case
+        __ str(r_2->as_Register(), Address(SP, stack_offset));
+      }
+    } else if (r_1->is_FloatRegister()) {
+#ifdef __SOFTFP__
+      ShouldNotReachHere();
+#endif // __SOFTFP__
+      if (!r_2->is_valid()) {
+        __ fsts(r_1->as_FloatRegister(), Address(SP, stack_offset));
+      } else {
+        __ fstd(r_1->as_FloatRegister(), Address(SP, stack_offset - Interpreter::stackElementSize));  // one double store starting at the lower of the two elements
+      }
+    } else {
+      assert(!r_1->is_valid() && !r_2->is_valid(), "must be");
+    }
+  }
+
+  __ ldr(PC, Address(Rmethod, Method::interpreter_entry_offset()));  // loading PC performs the jump
+
+#endif // AARCH64
+}
+
+AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
+                                                            int total_args_passed,
+                                                            int comp_args_on_stack,
+                                                            const BasicType *sig_bt,
+                                                            const VMRegPair *regs,
+                                                            AdapterFingerPrint* fingerprint) {  // Emits the i2c adapter, the unverified c2i check and the c2i adapter back to back; returns an entry recording the three addresses.
+  address i2c_entry = __ pc();
+  gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
+
+  address c2i_unverified_entry = __ pc();  // entry that first checks the receiver's klass against the inline-cache holder
+  Label skip_fixup;  // bound inside gen_c2i_adapter, after its call-site patching code
+  const Register receiver       = R0;
+  const Register holder_klass   = Rtemp; // XXX should be OK for C2 but not 100% sure
+  const Register receiver_klass = AARCH64_ONLY(R8) NOT_AARCH64(R4);
+
+  __ load_klass(receiver_klass, receiver);
+  __ ldr(holder_klass, Address(Ricklass, CompiledICHolder::holder_klass_offset()));
+  __ ldr(Rmethod, Address(Ricklass, CompiledICHolder::holder_method_offset()));  // Rmethod is taken from the same holder object
+  __ cmp(receiver_klass, holder_klass);
+
+#ifdef AARCH64
+  Label ic_miss;
+  __ b(ic_miss, ne);  // klass mismatch
+  __ ldr(Rtemp, Address(Rmethod, Method::code_offset()));
+  __ cbz(Rtemp, skip_fixup);  // klass matches and the code field is zero: enter the c2i adapter past the fixup
+  __ bind(ic_miss);  // also reached by fall-through when the code field is non-zero
+  __ jump(SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type, Rtemp);
+#else
+  __ ldr(Rtemp, Address(Rmethod, Method::code_offset()), eq);  // the next three instructions take effect only when the klasses matched
+  __ cmp(Rtemp, 0, eq);
+  __ b(skip_fixup, eq);  // klass matches and the code field is zero: enter the c2i adapter past the fixup
+  __ jump(SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type, noreg, ne);  // klass mismatch, or code field non-zero
+#endif // AARCH64
+
+  address c2i_entry = __ pc();  // not reached by fall-through: every path above branches away
+  gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
+
+  __ flush();
+  return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
+}
+
+
+static int reg2offset_in(VMReg r) {  // Byte offset of stack-based VMReg r, shifted past two extra words.
+  // Account for saved FP and LR
+  return r->reg2stack() * VMRegImpl::stack_slot_size + 2*wordSize;  // slot index scaled to bytes, plus the two saved words
+}
+
+static int reg2offset_out(VMReg r) {  // Byte offset of stack-based VMReg r, shifted past the out-preserve slots.
+  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;  // the preserve slots are added before scaling to bytes
+}
+
+
+static void verify_oop_args(MacroAssembler* masm,
+                            methodHandle method,
+                            const BasicType* sig_bt,
+                            const VMRegPair* regs) {  // When VerifyOops is set, emits a verify_oop for every T_OBJECT/T_ARRAY argument; otherwise emits nothing.
+  Register temp_reg = Rmethod;  // not part of any compiled calling seq
+  if (VerifyOops) {
+    for (int i = 0; i < method->size_of_parameters(); i++) {
+      if (sig_bt[i] == T_OBJECT || sig_bt[i] == T_ARRAY) {
+        VMReg r = regs[i].first();
+        assert(r->is_valid(), "bad oop arg");
+        if (r->is_stack()) {
+          __ ldr(temp_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));  // stack argument: load it into temp_reg before checking
+          __ verify_oop(temp_reg);
+        } else {
+          __ verify_oop(r->as_Register());  // register argument: checked in place
+        }
+      }
+    }
+  }
+}
+
+static void gen_special_dispatch(MacroAssembler* masm,
+                                 methodHandle method,
+                                 const BasicType* sig_bt,
+                                 const VMRegPair* regs) {  // Picks the receiver and MemberName registers for the method's intrinsic id and emits the method-handle dispatch.
+  verify_oop_args(masm, method, sig_bt, regs);
+  vmIntrinsics::ID iid = method->intrinsic_id();
+
+  // Now write the args into the outgoing interpreter space
+  bool     has_receiver   = false;
+  Register receiver_reg   = noreg;
+  int      member_arg_pos = -1;
+  Register member_reg     = noreg;
+  int      ref_kind       = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
+  if (ref_kind != 0) {  // non-zero ref_kind: the last parameter is treated as the MemberName
+    member_arg_pos = method->size_of_parameters() - 1;  // trailing MemberName argument
+    member_reg = Rmethod;  // known to be free at this point
+    has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
+  } else if (iid == vmIntrinsics::_invokeBasic) {
+    has_receiver = true;  // no MemberName here: member_reg stays noreg
+  } else {
+    fatal("unexpected intrinsic id %d", iid);
+  }
+
+  if (member_reg != noreg) {
+    // Load the member_arg into register, if necessary.
+    SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
+    VMReg r = regs[member_arg_pos].first();
+    if (r->is_stack()) {
+      __ ldr(member_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
+    } else {
+      // no data motion is needed
+      member_reg = r->as_Register();  // use the register the argument already lives in, instead of Rmethod
+    }
+  }
+
+  if (has_receiver) {
+    // Make sure the receiver is loaded into a register.
+    assert(method->size_of_parameters() > 0, "oob");
+    assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
+    VMReg r = regs[0].first();
+    assert(r->is_valid(), "bad receiver arg");
+    if (r->is_stack()) {
+      // Porting note:  This assumes that compiled calling conventions always
+      // pass the receiver oop in a register.  If this is not true on some
+      // platform, pick a temp and load the receiver from stack.
+      assert(false, "receiver always in a register");  // asserts on this path where asserts are enabled; the load below remains as the fallback
+      receiver_reg = j_rarg0;  // known to be free at this point
+      __ ldr(receiver_reg, Address(SP, r->reg2stack() * VMRegImpl::stack_slot_size));
+    } else {
+      // no data motion is needed
+      receiver_reg = r->as_Register();
+    }
+  }
+
+  // Figure out which address we are really jumping to:
+  MethodHandles::generate_method_handle_dispatch(masm, iid,
+                                                 receiver_reg, member_reg, /*for_compiler_entry:*/ true);  // either register may still be noreg here
+}
+
+// ---------------------------------------------------------------------------
+// Generate a native wrapper for a given method.  The method takes arguments
+// in the Java compiled code convention, marshals them to the native
+// convention (handlizes oops, etc), transitions to native, makes the call,
+// returns to java state (possibly blocking), unhandlizes any result and
+// returns.
+nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
+                                                const methodHandle& method,
+                                                int compile_id,
+                                                BasicType* in_sig_bt,
+                                                VMRegPair* in_regs,
+                                                BasicType ret_type) {
+  if (method->is_method_handle_intrinsic()) {
+    vmIntrinsics::ID iid = method->intrinsic_id();
+    intptr_t start = (intptr_t)__ pc();
+    int vep_offset = ((intptr_t)__ pc()) - start;
+    gen_special_dispatch(masm,
+                         method,
+                         in_sig_bt,
+                         in_regs);
+    int frame_complete = ((intptr_t)__ pc()) - start;  // not complete, period
+    __ flush();
+    int stack_slots = SharedRuntime::out_preserve_stack_slots();  // no out slots at all, actually
+    return nmethod::new_native_nmethod(method,
+                                       compile_id,
+                                       masm->code(),
+                                       vep_offset,
+                                       frame_complete,
+                                       stack_slots / VMRegImpl::slots_per_word,
+                                       in_ByteSize(-1),
+                                       in_ByteSize(-1),
+                                       (OopMapSet*)NULL);
+  }
+  // Arguments for JNI method include JNIEnv and Class if static
+
+  // Usage of Rtemp should be OK since scratched by native call
+
+  bool is_static = method->is_static();
+
+  const int total_in_args = method->size_of_parameters();
+  int total_c_args = total_in_args + 1;
+  if (is_static) {
+    total_c_args++;
+  }
+  // C signature: JNIEnv*, then the class handle (static methods only), then the Java arguments
+  BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
+  VMRegPair* out_regs   = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
+
+  int argc = 0;
+  out_sig_bt[argc++] = T_ADDRESS;
+  if (is_static) {
+    out_sig_bt[argc++] = T_OBJECT;
+  }
+
+  int i;
+  for (i = 0; i < total_in_args; i++) {
+    out_sig_bt[argc++] = in_sig_bt[i];
+  }
+
+  int out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
+  int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
+  // Since object arguments need to be wrapped, we must preserve space
+  // for those object arguments which come in registers (GPR_PARAMS maximum)
+  // plus one more slot for Klass handle (for static methods)
+  int oop_handle_offset = stack_slots;
+  stack_slots += (GPR_PARAMS + 1) * VMRegImpl::slots_per_word;
+
+  // Plus a lock if needed
+  int lock_slot_offset = 0;
+  if (method->is_synchronized()) {
+    lock_slot_offset = stack_slots;
+    assert(sizeof(BasicLock) == wordSize, "adjust this code");
+    stack_slots += VMRegImpl::slots_per_word;
+  }
+
+  // Space to save return address and FP
+  stack_slots += 2 * VMRegImpl::slots_per_word;
+
+  // Calculate the final stack size taking account of alignment
+  stack_slots = round_to(stack_slots, StackAlignmentInBytes / VMRegImpl::stack_slot_size);
+  int stack_size = stack_slots * VMRegImpl::stack_slot_size;
+  int lock_slot_fp_offset = stack_size - 2 * wordSize -
+    lock_slot_offset * VMRegImpl::stack_slot_size;
+
+  // Unverified entry point
+  address start = __ pc();
+
+  // Inline cache check, same as in C1_MacroAssembler::inline_cache_check()
+  const Register receiver = R0; // see receiverOpr()
+  __ load_klass(Rtemp, receiver);
+  __ cmp(Rtemp, Ricklass);
+  Label verified;
+
+  __ b(verified, eq); // jump over alignment no-ops too
+  __ jump(SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type, Rtemp);
+  __ align(CodeEntryAlignment);
+
+  // Verified entry point
+  __ bind(verified);
+  int vep_offset = __ pc() - start;
+
+#ifdef AARCH64
+  // Extra nop for MT-safe patching in NativeJump::patch_verified_entry
+  __ nop();
+#endif // AARCH64
+
+  if ((InlineObjectHash && method->intrinsic_id() == vmIntrinsics::_hashCode) || (method->intrinsic_id() == vmIntrinsics::_identityHashCode)) {
+    // Object.hashCode, System.identityHashCode can pull the hashCode from the header word
+    // instead of doing a full VM transition once it's been computed.
+    Label slow_case;
+    const Register obj_reg = R0;
+
+    // Unlike Object.hashCode, System.identityHashCode is a static method and
+    // gets the object as an argument instead of as the receiver.
+    if (method->intrinsic_id() == vmIntrinsics::_identityHashCode) {
+      assert(method->is_static(), "method should be static");
+      // return 0 for null reference input, return val = R0 = obj_reg = 0
+#ifdef AARCH64
+      Label Continue;
+      __ cbnz(obj_reg, Continue);
+      __ ret();
+      __ bind(Continue);
+#else
+      __ cmp(obj_reg, 0);
+      __ bx(LR, eq);
+#endif
+    }
+
+    __ ldr(Rtemp, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
+
+    assert(markOopDesc::unlocked_value == 1, "adjust this code");
+    __ tbz(Rtemp, exact_log2(markOopDesc::unlocked_value), slow_case);
+
+    if (UseBiasedLocking) {
+      assert(is_power_of_2(markOopDesc::biased_lock_bit_in_place), "adjust this code");
+      __ tbnz(Rtemp, exact_log2(markOopDesc::biased_lock_bit_in_place), slow_case);
+    }
+
+#ifdef AARCH64
+    __ ands(Rtemp, Rtemp, (uintx)markOopDesc::hash_mask_in_place);
+    __ b(slow_case, eq);
+    __ logical_shift_right(R0, Rtemp, markOopDesc::hash_shift);
+    __ ret();
+#else
+    __ bics(Rtemp, Rtemp, ~markOopDesc::hash_mask_in_place);
+    __ mov(R0, AsmOperand(Rtemp, lsr, markOopDesc::hash_shift), ne);
+    __ bx(LR, ne);
+#endif // AARCH64
+
+    __ bind(slow_case);
+  }
+
+  // Bang stack pages
+  __ arm_stack_overflow_check(stack_size, Rtemp);
+
+  // Setup frame linkage
+  __ raw_push(FP, LR);
+  __ mov(FP, SP);
+  __ sub_slow(SP, SP, stack_size - 2*wordSize);
+
+  int frame_complete = __ pc() - start;
+
+  OopMapSet* oop_maps = new OopMapSet();
+  OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
+  const int extra_args = is_static ? 2 : 1;
+  int receiver_offset = -1;
+  int fp_regs_in_arguments = 0;
+  // Move each argument from its Java location to its C location, last argument first
+  for (i = total_in_args; --i >= 0; ) {
+    switch (in_sig_bt[i]) {
+    case T_ARRAY:
+    case T_OBJECT: {
+      VMReg src = in_regs[i].first();
+      VMReg dst = out_regs[i + extra_args].first();
+      if (src->is_stack()) {
+        assert(dst->is_stack(), "must be");
+        assert(i != 0, "Incoming receiver is always in a register");
+        __ ldr(Rtemp, Address(FP, reg2offset_in(src)));
+        __ cmp(Rtemp, 0);
+#ifdef AARCH64
+        __ add(Rtemp, FP, reg2offset_in(src));
+        __ csel(Rtemp, ZR, Rtemp, eq);
+#else
+        __ add(Rtemp, FP, reg2offset_in(src), ne);
+#endif // AARCH64
+        __ str(Rtemp, Address(SP, reg2offset_out(dst)));
+        int offset_in_older_frame = src->reg2stack() + SharedRuntime::out_preserve_stack_slots();
+        map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + stack_slots));
+      } else {
+        int offset = oop_handle_offset * VMRegImpl::stack_slot_size;
+        __ str(src->as_Register(), Address(SP, offset));
+        map->set_oop(VMRegImpl::stack2reg(oop_handle_offset));
+        if ((i == 0) && (!is_static)) {
+          receiver_offset = offset;
+        }
+        oop_handle_offset += VMRegImpl::slots_per_word;
+
+#ifdef AARCH64
+        __ cmp(src->as_Register(), 0);
+        __ add(Rtemp, SP, offset);
+        __ csel(dst->is_stack() ? Rtemp : dst->as_Register(), ZR, Rtemp, eq);
+        if (dst->is_stack()) {
+          __ str(Rtemp, Address(SP, reg2offset_out(dst)));
+        }
+#else
+        if (dst->is_stack()) {
+          __ movs(Rtemp, src->as_Register());
+          __ add(Rtemp, SP, offset, ne);
+          __ str(Rtemp, Address(SP, reg2offset_out(dst)));
+        } else {
+          __ movs(dst->as_Register(), src->as_Register());
+          __ add(dst->as_Register(), SP, offset, ne);
+        }
+#endif // AARCH64
+      }
+    }
+    // fall through: T_VOID below just breaks
+    case T_VOID:
+      break;
+
+#ifdef AARCH64
+    case T_FLOAT:
+    case T_DOUBLE: {
+      VMReg src = in_regs[i].first();
+      VMReg dst = out_regs[i + extra_args].first();
+      if (src->is_stack()) {
+        assert(dst->is_stack(), "must be");
+        __ ldr(Rtemp, Address(FP, reg2offset_in(src)));
+        __ str(Rtemp, Address(SP, reg2offset_out(dst)));
+      } else {
+        assert(src->is_FloatRegister() && dst->is_FloatRegister(), "must be");
+        assert(src->as_FloatRegister() == dst->as_FloatRegister(), "must be");
+        fp_regs_in_arguments++;
+      }
+      break;
+    }
+#else // AARCH64
+
+#ifdef __SOFTFP__
+    case T_DOUBLE:
+#endif
+    case T_LONG: {
+      VMReg src_1 = in_regs[i].first();
+      VMReg src_2 = in_regs[i].second();
+      VMReg dst_1 = out_regs[i + extra_args].first();
+      VMReg dst_2 = out_regs[i + extra_args].second();
+#if (ALIGN_WIDE_ARGUMENTS == 0)
+      // C convention can mix a register and a stack slot for a
+      // 64-bit native argument.
+
+      // Note: following code should work independently of whether
+      // the Java calling convention follows C convention or whether
+      // it aligns 64-bit values.
+      if (dst_2->is_Register()) {
+        if (src_1->as_Register() != dst_1->as_Register()) {
+          assert(src_1->as_Register() != dst_2->as_Register() &&
+                 src_2->as_Register() != dst_2->as_Register(), "must be");
+          __ mov(dst_2->as_Register(), src_2->as_Register());
+          __ mov(dst_1->as_Register(), src_1->as_Register());
+        } else {
+          assert(src_2->as_Register() == dst_2->as_Register(), "must be");
+        }
+      } else if (src_2->is_Register()) {
+        if (dst_1->is_Register()) {
+          // dst mixes a register and a stack slot
+          assert(dst_2->is_stack() && src_1->is_Register() && src_2->is_Register(), "must be");
+          assert(src_1->as_Register() != dst_1->as_Register(), "must be");
+          __ str(src_2->as_Register(), Address(SP, reg2offset_out(dst_2)));
+          __ mov(dst_1->as_Register(), src_1->as_Register());
+        } else {
+          // registers to stack slots
+          assert(dst_2->is_stack() && src_1->is_Register() && src_2->is_Register(), "must be");
+          __ str(src_1->as_Register(), Address(SP, reg2offset_out(dst_1)));
+          __ str(src_2->as_Register(), Address(SP, reg2offset_out(dst_2)));
+        }
+      } else if (src_1->is_Register()) {
+        if (dst_1->is_Register()) {
+          // src and dst must be R3 + stack slot
+          assert(dst_1->as_Register() == src_1->as_Register(), "must be");
+          __ ldr(Rtemp,    Address(FP, reg2offset_in(src_2)));
+          __ str(Rtemp,    Address(SP, reg2offset_out(dst_2)));
+        } else {
+          // <R3,stack> -> <stack,stack>
+          assert(dst_2->is_stack() && src_2->is_stack(), "must be");
+          __ ldr(LR, Address(FP, reg2offset_in(src_2)));
+          __ str(src_1->as_Register(), Address(SP, reg2offset_out(dst_1)));
+          __ str(LR, Address(SP, reg2offset_out(dst_2)));
+        }
+      } else {
+        assert(src_2->is_stack() && dst_1->is_stack() && dst_2->is_stack(), "must be");
+        __ ldr(Rtemp, Address(FP, reg2offset_in(src_1)));
+        __ ldr(LR,    Address(FP, reg2offset_in(src_2)));
+        __ str(Rtemp, Address(SP, reg2offset_out(dst_1)));
+        __ str(LR,    Address(SP, reg2offset_out(dst_2)));
+      }
+#else // ALIGN_WIDE_ARGUMENTS
+      if (src_1->is_stack()) {
+        assert(src_2->is_stack() && dst_1->is_stack() && dst_2->is_stack(), "must be");
+        __ ldr(Rtemp, Address(FP, reg2offset_in(src_1)));
+        __ ldr(LR,    Address(FP, reg2offset_in(src_2)));
+        __ str(Rtemp, Address(SP, reg2offset_out(dst_1)));
+        __ str(LR,    Address(SP, reg2offset_out(dst_2)));
+      } else if (dst_1->is_stack()) {
+        assert(dst_2->is_stack() && src_1->is_Register() && src_2->is_Register(), "must be");
+        __ str(src_1->as_Register(), Address(SP, reg2offset_out(dst_1)));
+        __ str(src_2->as_Register(), Address(SP, reg2offset_out(dst_2)));
+      } else if (src_1->as_Register() == dst_1->as_Register()) {
+        assert(src_2->as_Register() == dst_2->as_Register(), "must be");
+      } else {
+        assert(src_1->as_Register() != dst_2->as_Register() &&
+               src_2->as_Register() != dst_2->as_Register(), "must be");
+        __ mov(dst_2->as_Register(), src_2->as_Register());
+        __ mov(dst_1->as_Register(), src_1->as_Register());
+      }
+#endif // ALIGN_WIDE_ARGUMENTS
+      break;
+    }
+
+#if (!defined __SOFTFP__ && !defined __ABI_HARD__)
+    case T_FLOAT: {
+      VMReg src = in_regs[i].first();
+      VMReg dst = out_regs[i + extra_args].first();
+      if (src->is_stack()) {
+        assert(dst->is_stack(), "must be");
+        __ ldr(Rtemp, Address(FP, reg2offset_in(src)));
+        __ str(Rtemp, Address(SP, reg2offset_out(dst)));
+      } else if (dst->is_stack()) {
+        __ fsts(src->as_FloatRegister(), Address(SP, reg2offset_out(dst)));
+      } else {
+        assert(src->is_FloatRegister() && dst->is_Register(), "must be");
+        __ fmrs(dst->as_Register(), src->as_FloatRegister());
+      }
+      break;
+    }
+
+    case T_DOUBLE: {
+      VMReg src_1 = in_regs[i].first();
+      VMReg src_2 = in_regs[i].second();
+      VMReg dst_1 = out_regs[i + extra_args].first();
+      VMReg dst_2 = out_regs[i + extra_args].second();
+      if (src_1->is_stack()) {
+        assert(src_2->is_stack() && dst_1->is_stack() && dst_2->is_stack(), "must be");
+        __ ldr(Rtemp, Address(FP, reg2offset_in(src_1)));
+        __ ldr(LR,    Address(FP, reg2offset_in(src_2)));
+        __ str(Rtemp, Address(SP, reg2offset_out(dst_1)));
+        __ str(LR,    Address(SP, reg2offset_out(dst_2)));
+      } else if (dst_1->is_stack()) {
+        assert(dst_2->is_stack() && src_1->is_FloatRegister(), "must be");
+        __ fstd(src_1->as_FloatRegister(), Address(SP, reg2offset_out(dst_1)));
+#if (ALIGN_WIDE_ARGUMENTS == 0)
+      } else if (dst_2->is_stack()) {
+        assert(! src_2->is_stack(), "must be"); // assuming internal java convention is aligned
+        // double register must go into R3 + one stack slot
+        __ fmrrd(dst_1->as_Register(), Rtemp, src_1->as_FloatRegister());
+        __ str(Rtemp, Address(SP, reg2offset_out(dst_2)));
+#endif
+      } else {
+        assert(src_1->is_FloatRegister() && dst_1->is_Register() && dst_2->is_Register(), "must be");
+        __ fmrrd(dst_1->as_Register(), dst_2->as_Register(), src_1->as_FloatRegister());
+      }
+      break;
+    }
+#endif // !__SOFTFP__ && !__ABI_HARD__
+
+#ifdef __ABI_HARD__
+    case T_FLOAT: {
+      VMReg src = in_regs[i].first();
+      VMReg dst = out_regs[i + extra_args].first();
+      if (src->is_stack()) {
+        if (dst->is_stack()) {
+          __ ldr(Rtemp, Address(FP, reg2offset_in(src)));
+          __ str(Rtemp, Address(SP, reg2offset_out(dst)));
+        } else {
+          // C2 Java calling convention does not populate S14 and S15, therefore
+          // those need to be loaded from stack here
+          __ flds(dst->as_FloatRegister(), Address(FP, reg2offset_in(src)));
+          fp_regs_in_arguments++;
+        }
+      } else {
+        assert(src->is_FloatRegister(), "must be");
+        fp_regs_in_arguments++;
+      }
+      break;
+    }
+    case T_DOUBLE: {
+      VMReg src_1 = in_regs[i].first();
+      VMReg src_2 = in_regs[i].second();
+      VMReg dst_1 = out_regs[i + extra_args].first();
+      VMReg dst_2 = out_regs[i + extra_args].second();
+      if (src_1->is_stack()) {
+        if (dst_1->is_stack()) {
+          assert(dst_2->is_stack(), "must be");
+          __ ldr(Rtemp, Address(FP, reg2offset_in(src_1)));
+          __ ldr(LR,    Address(FP, reg2offset_in(src_2)));
+          __ str(Rtemp, Address(SP, reg2offset_out(dst_1)));
+          __ str(LR,    Address(SP, reg2offset_out(dst_2)));
+        } else {
+          // C2 Java calling convention does not populate S14 and S15, therefore
+          // those need to be loaded from stack here
+          __ fldd(dst_1->as_FloatRegister(), Address(FP, reg2offset_in(src_1)));
+          fp_regs_in_arguments += 2;
+        }
+      } else {
+        assert(src_1->is_FloatRegister() && src_2->is_FloatRegister(), "must be");
+        fp_regs_in_arguments += 2;
+      }
+      break;
+    }
+#endif // __ABI_HARD__
+#endif // AARCH64
+
+    default: {
+      assert(in_sig_bt[i] != T_ADDRESS, "found T_ADDRESS in java args");
+      VMReg src = in_regs[i].first();
+      VMReg dst = out_regs[i + extra_args].first();
+      if (src->is_stack()) {
+        assert(dst->is_stack(), "must be");
+        __ ldr(Rtemp, Address(FP, reg2offset_in(src)));
+        __ str(Rtemp, Address(SP, reg2offset_out(dst)));
+      } else if (dst->is_stack()) {
+        __ str(src->as_Register(), Address(SP, reg2offset_out(dst)));
+      } else {
+        assert(src->is_Register() && dst->is_Register(), "must be");
+        __ mov(dst->as_Register(), src->as_Register());
+      }
+    }
+    }
+  }
+
+  // Get Klass mirror
+  int klass_offset = -1;
+  if (is_static) {
+    klass_offset = oop_handle_offset * VMRegImpl::stack_slot_size;
+    __ mov_oop(Rtemp, JNIHandles::make_local(method->method_holder()->java_mirror()));
+    __ add(c_rarg1, SP, klass_offset);
+    __ str(Rtemp, Address(SP, klass_offset));
+    map->set_oop(VMRegImpl::stack2reg(oop_handle_offset));
+  }
+
+  // the PC offset given to add_gc_map must match the PC saved in set_last_Java_frame
+  int pc_offset = __ set_last_Java_frame(SP, FP, true, Rtemp);
+  assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin");
+  oop_maps->add_gc_map(pc_offset, map);
+
+#ifndef AARCH64
+  // Order last_Java_pc store with the thread state transition (to _thread_in_native)
+  __ membar(MacroAssembler::StoreStore, Rtemp);
+#endif // !AARCH64
+
+  // RedefineClasses() tracing support for obsolete method entry
+  if (log_is_enabled(Trace, redefine, class, obsolete)) {
+#ifdef AARCH64
+    __ NOT_TESTED();
+#endif
+    __ save_caller_save_registers();
+    __ mov(R0, Rthread);
+    __ mov_metadata(R1, method());
+    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), R0, R1);
+    __ restore_caller_save_registers();
+  }
+
+  const Register sync_handle = AARCH64_ONLY(R20) NOT_AARCH64(R5);
+  const Register sync_obj    = AARCH64_ONLY(R21) NOT_AARCH64(R6);
+  const Register disp_hdr    = AARCH64_ONLY(R22) NOT_AARCH64(altFP_7_11);
+  const Register tmp         = AARCH64_ONLY(R23) NOT_AARCH64(R8);
+
+  Label slow_lock, slow_lock_biased, lock_done, fast_lock, leave;
+  if (method->is_synchronized()) {
+    // The first argument is a handle to sync object (a class or an instance)
+    __ ldr(sync_obj, Address(R1));
+    // Remember the handle for the unlocking code
+    __ mov(sync_handle, R1);
+
+    if(UseBiasedLocking) {
+      __ biased_locking_enter(sync_obj, tmp, disp_hdr/*scratched*/, false, Rtemp, lock_done, slow_lock_biased);
+    }
+
+    const Register mark = tmp;
+#ifdef AARCH64
+    __ sub(disp_hdr, FP, lock_slot_fp_offset);
+    assert(oopDesc::mark_offset_in_bytes() == 0, "Required by atomic instructions");
+
+    __ ldr(mark, sync_obj);
+
+    // Test if object is already locked
+    assert(markOopDesc::unlocked_value == 1, "adjust this code");
+    __ tbnz(mark, exact_log2(markOopDesc::unlocked_value), fast_lock);
+
+    // Check for recursive lock
+    // See comments in InterpreterMacroAssembler::lock_object for
+    // explanations on the fast recursive locking check.
+    __ mov(Rtemp, SP);
+    __ sub(Rtemp, mark, Rtemp);
+    intptr_t mask = ((intptr_t)3) - ((intptr_t)os::vm_page_size());
+    Assembler::LogicalImmediate imm(mask, false);
+    __ ands(Rtemp, Rtemp, imm);
+    __ b(slow_lock, ne);
+
+    // Recursive locking: store 0 into a lock record
+    __ str(ZR, Address(disp_hdr, BasicLock::displaced_header_offset_in_bytes()));
+    __ b(lock_done);
+
+    __ bind(fast_lock);
+    __ str(mark, Address(disp_hdr, BasicLock::displaced_header_offset_in_bytes()));
+
+    __ cas_for_lock_acquire(mark, disp_hdr, sync_obj, Rtemp, slow_lock);
+#else
+    // On MP platforms the next load could return a 'stale' value if the memory location has been modified by another thread.
+    // That would be acceptable as either CAS or slow case path is taken in that case
+
+    __ ldr(mark, Address(sync_obj, oopDesc::mark_offset_in_bytes()));
+    __ sub(disp_hdr, FP, lock_slot_fp_offset);
+    __ tst(mark, markOopDesc::unlocked_value);
+    __ b(fast_lock, ne);
+
+    // Check for recursive lock
+    // See comments in InterpreterMacroAssembler::lock_object for
+    // explanations on the fast recursive locking check.
+    // Check independently the low bits and the distance to SP
+    // -1- test low 2 bits
+    __ movs(Rtemp, AsmOperand(mark, lsl, 30));
+    // -2- test (hdr - SP) if the low two bits are 0
+    __ sub(Rtemp, mark, SP, eq);
+    __ movs(Rtemp, AsmOperand(Rtemp, lsr, exact_log2(os::vm_page_size())), eq);
+    // If still 'eq' then recursive locking OK: set displaced header to 0
+    __ str(Rtemp, Address(disp_hdr, BasicLock::displaced_header_offset_in_bytes()), eq);
+    __ b(lock_done, eq);
+    __ b(slow_lock);
+
+    __ bind(fast_lock);
+    __ str(mark, Address(disp_hdr, BasicLock::displaced_header_offset_in_bytes()));
+
+    __ cas_for_lock_acquire(mark, disp_hdr, sync_obj, Rtemp, slow_lock);
+#endif // AARCH64
+
+    __ bind(lock_done);
+  }
+
+  // Get JNIEnv*
+  __ add(c_rarg0, Rthread, in_bytes(JavaThread::jni_environment_offset()));
+
+  // Perform thread state transition
+  __ mov(Rtemp, _thread_in_native);
+#ifdef AARCH64
+  // stlr instruction is used to force all preceding writes to be observed prior to thread state change
+  __ add(Rtemp2, Rthread, in_bytes(JavaThread::thread_state_offset()));
+  __ stlr_w(Rtemp, Rtemp2);
+#else
+  __ str(Rtemp, Address(Rthread, JavaThread::thread_state_offset()));
+#endif // AARCH64
+
+  // Finally, call the native method
+  __ call(method->native_function());
+
+  // Set FPSCR/FPCR to a known state
+  if (AlwaysRestoreFPU) {
+    __ restore_default_fp_mode();
+  }
+
+  // Do a safepoint check while thread is in transition state
+  InlinedAddress safepoint_state(SafepointSynchronize::address_of_state());
+  Label call_safepoint_runtime, return_to_java;
+  __ mov(Rtemp, _thread_in_native_trans);
+  __ ldr_literal(R2, safepoint_state);
+  __ str_32(Rtemp, Address(Rthread, JavaThread::thread_state_offset()));
+
+  // make sure the store is observed before reading the SafepointSynchronize state and further mem refs
+  __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad | MacroAssembler::StoreStore), Rtemp);
+
+  __ ldr_s32(R2, Address(R2));
+  __ ldr_u32(R3, Address(Rthread, JavaThread::suspend_flags_offset()));
+  __ cmp(R2, SafepointSynchronize::_not_synchronized);
+  __ cond_cmp(R3, 0, eq);
+  __ b(call_safepoint_runtime, ne);
+  __ bind(return_to_java);
+
+  // Perform thread state transition and reguard stack yellow pages if needed
+  Label reguard, reguard_done;
+  __ mov(Rtemp, _thread_in_Java);
+  __ ldr_s32(R2, Address(Rthread, JavaThread::stack_guard_state_offset()));
+  __ str_32(Rtemp, Address(Rthread, JavaThread::thread_state_offset()));
+
+  __ cmp(R2, JavaThread::stack_guard_yellow_reserved_disabled);
+  __ b(reguard, eq);
+  __ bind(reguard_done);
+
+  Label slow_unlock, unlock_done, retry;
+  if (method->is_synchronized()) {
+    __ ldr(sync_obj, Address(sync_handle));
+
+    if(UseBiasedLocking) {
+      __ biased_locking_exit(sync_obj, Rtemp, unlock_done);
+      // disp_hdr may not have been saved on entry with biased locking
+      __ sub(disp_hdr, FP, lock_slot_fp_offset);
+    }
+
+    // See C1_MacroAssembler::unlock_object() for more comments
+    __ ldr(R2, Address(disp_hdr, BasicLock::displaced_header_offset_in_bytes()));
+    __ cbz(R2, unlock_done);
+
+    __ cas_for_lock_release(disp_hdr, R2, sync_obj, Rtemp, slow_unlock);
+
+    __ bind(unlock_done);
+  }
+
+  // Set last java frame and handle block to zero
+  __ ldr(LR, Address(Rthread, JavaThread::active_handles_offset()));
+  __ reset_last_Java_frame(Rtemp); // sets Rtemp to 0 on 32-bit ARM
+
+#ifdef AARCH64
+  __ str_32(ZR, Address(LR, JNIHandleBlock::top_offset_in_bytes()));
+  if (CheckJNICalls) {
+    __ str(ZR, Address(Rthread, JavaThread::pending_jni_exception_check_fn_offset()));
+  }
+
+  // Normalize sub-int results and unhandle an object result
+  switch (ret_type) {
+  case T_BOOLEAN:
+    __ tst(R0, 0xff);
+    __ cset(R0, ne);
+    break;
+  case T_CHAR   : __ zero_extend(R0, R0, 16);  break;
+  case T_BYTE   : __ sign_extend(R0, R0,  8);  break;
+  case T_SHORT  : __ sign_extend(R0, R0, 16);  break;
+  case T_INT    : // fall through
+  case T_LONG   : // fall through
+  case T_VOID   : // fall through
+  case T_FLOAT  : // fall through
+  case T_DOUBLE : /* nothing to do */          break;
+  case T_OBJECT : // fall through
+  case T_ARRAY  : {
+    Label L;
+    __ cbz(R0, L);
+    __ ldr(R0, Address(R0));
+    __ verify_oop(R0);
+    __ bind(L);
+    break;
+  }
+  default:
+    ShouldNotReachHere();
+  }
+#else
+  __ str_32(Rtemp, Address(LR, JNIHandleBlock::top_offset_in_bytes()));
+  if (CheckJNICalls) {
+    __ str(__ zero_register(Rtemp), Address(Rthread, JavaThread::pending_jni_exception_check_fn_offset()));
+  }
+
+  // Unhandle the result
+  if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
+    __ cmp(R0, 0);
+    __ ldr(R0, Address(R0), ne);
+  }
+#endif // AARCH64
+
+  // Any exception pending?
+  __ ldr(Rtemp, Address(Rthread, Thread::pending_exception_offset()));
+  __ mov(SP, FP);
+
+#ifdef AARCH64
+  Label except;
+  __ cbnz(Rtemp, except);
+  __ raw_pop(FP, LR);
+  __ ret();
+
+  __ bind(except);
+  // Pop the frame and forward the exception. Rexception_pc contains return address.
+  __ raw_pop(FP, Rexception_pc);
+#else
+  __ cmp(Rtemp, 0);
+  // Pop the frame and return if no exception pending
+  __ pop(RegisterSet(FP) | RegisterSet(PC), eq);
+  // Pop the frame and forward the exception. Rexception_pc contains return address.
+  __ ldr(FP, Address(SP, wordSize, post_indexed), ne);
+  __ ldr(Rexception_pc, Address(SP, wordSize, post_indexed), ne);
+#endif // AARCH64
+  __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp);
+
+  // Safepoint operation and/or pending suspend request is in progress.
+  // Save the return values and call the runtime function by hand.
+  __ bind(call_safepoint_runtime);
+  push_result_registers(masm, ret_type);
+  __ mov(R0, Rthread);
+  __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans));
+  pop_result_registers(masm, ret_type);
+  __ b(return_to_java);
+
+  __ bind_literal(safepoint_state);
+
+  // Reguard stack pages. Save native results around a call to C runtime.
+  __ bind(reguard);
+  push_result_registers(masm, ret_type);
+  __ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages));
+  pop_result_registers(masm, ret_type);
+  __ b(reguard_done);
+
+  if (method->is_synchronized()) {
+    // Locking slow case
+    if(UseBiasedLocking) {
+      __ bind(slow_lock_biased);
+      __ sub(disp_hdr, FP, lock_slot_fp_offset);
+    }
+
+    __ bind(slow_lock);
+
+    push_param_registers(masm, fp_regs_in_arguments);
+
+    // last_Java_frame is already set, so do call_VM manually; no exception can occur
+    __ mov(R0, sync_obj);
+    __ mov(R1, disp_hdr);
+    __ mov(R2, Rthread);
+    __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C));
+
+    pop_param_registers(masm, fp_regs_in_arguments);
+
+    __ b(lock_done);
+
+    // Unlocking slow case
+    __ bind(slow_unlock);
+
+    push_result_registers(masm, ret_type);
+
+    // Clear pending exception before reentering VM.
+    // Can store the oop in register since it is a leaf call.
+    assert_different_registers(Rtmp_save1, sync_obj, disp_hdr);
+    __ ldr(Rtmp_save1, Address(Rthread, Thread::pending_exception_offset()));
+    Register zero = __ zero_register(Rtemp);
+    __ str(zero, Address(Rthread, Thread::pending_exception_offset()));
+    __ mov(R0, sync_obj);
+    __ mov(R1, disp_hdr);
+    __ mov(R2, Rthread);
+    __ call(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C));
+    __ str(Rtmp_save1, Address(Rthread, Thread::pending_exception_offset()));
+
+    pop_result_registers(masm, ret_type);
+
+    __ b(unlock_done);
+  }
+
+  __ flush();
+  return nmethod::new_native_nmethod(method,
+                                     compile_id,
+                                     masm->code(),
+                                     vep_offset,
+                                     frame_complete,
+                                     stack_slots / VMRegImpl::slots_per_word,
+                                     in_ByteSize(is_static ? klass_offset : receiver_offset),
+                                     in_ByteSize(lock_slot_offset * VMRegImpl::stack_slot_size),
+                                     oop_maps);
+}
+
+// Adjustment, in words, applied to a c2i adapter activation during deoptimization (callee's non-parameter locals).
+int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
+  const int extra_words = (callee_locals - callee_parameters) * Interpreter::stackElementWords;
+#ifdef AARCH64
+  return round_to(extra_words, StackAlignmentInBytes/BytesPerWord);
+#else
+  return extra_words;
+#endif // AARCH64
+}
+
+
+uint SharedRuntime::out_preserve_stack_slots() {
+  return 0;  // no preserved slots: callers add this to the outgoing argument slot count
+}
+
+
+//------------------------------generate_deopt_blob----------------------------
+void SharedRuntime::generate_deopt_blob() {
+  ResourceMark rm;
+#ifdef AARCH64
+  CodeBuffer buffer("deopt_blob", 1024+256, 1);
+#else
+  CodeBuffer buffer("deopt_blob", 1024, 1024);
+#endif
+  int frame_size_in_words;
+  OopMapSet* oop_maps;
+  int reexecute_offset;
+  int exception_in_tls_offset;
+  int exception_offset;
+
+  MacroAssembler* masm = new MacroAssembler(&buffer);
+  Label cont;
+  const Register Rkind   = AARCH64_ONLY(R21) NOT_AARCH64(R9); // caller-saved on 32bit
+  const Register Rublock = AARCH64_ONLY(R22) NOT_AARCH64(R6);
+  const Register Rsender = AARCH64_ONLY(R23) NOT_AARCH64(altFP_7_11);
+  assert_different_registers(Rkind, Rublock, Rsender, Rexception_obj, Rexception_pc, R0, R1, R2, R3, R8, Rtemp);
+
+  address start = __ pc();
+
+  oop_maps = new OopMapSet();
+  // LR saved by caller (can be live in c2 method)
+
+  // A deopt is a case where LR may be live in the c2 nmethod. So it's
+  // not possible to call the deopt blob from the nmethod and pass the
+  // address of the deopt handler of the nmethod in LR. What happens
+  // now is that the caller of the deopt blob pushes the current
+  // address so the deopt blob doesn't have to do it. This way LR can
+  // be preserved, contains the live value from the nmethod and is
+  // saved at R14/R30_offset here.
+  OopMap* map = RegisterSaver::save_live_registers(masm, &frame_size_in_words, true);
+  __ mov(Rkind, Deoptimization::Unpack_deopt);
+  __ b(cont);
+
+  exception_offset = __ pc() - start;
+
+  // Transfer Rexception_obj & Rexception_pc in TLS and fall thru to the
+  // exception_in_tls_offset entry point.
+  __ str(Rexception_obj, Address(Rthread, JavaThread::exception_oop_offset()));
+  __ str(Rexception_pc, Address(Rthread, JavaThread::exception_pc_offset()));
+  // Force return value to NULL to avoid confusing the escape analysis
+  // logic. Everything is dead here anyway.
+  __ mov(R0, 0);
+
+  exception_in_tls_offset = __ pc() - start;
+
+  // Exception data is in JavaThread structure
+  // Patch the return address of the current frame
+  __ ldr(LR, Address(Rthread, JavaThread::exception_pc_offset()));
+  (void) RegisterSaver::save_live_registers(masm, &frame_size_in_words);
+  {
+    const Register Rzero = __ zero_register(Rtemp); // XXX should be OK for C2 but not 100% sure
+    __ str(Rzero, Address(Rthread, JavaThread::exception_pc_offset()));
+  }
+  __ mov(Rkind, Deoptimization::Unpack_exception);
+  __ b(cont);
+
+  reexecute_offset = __ pc() - start;
+
+  (void) RegisterSaver::save_live_registers(masm, &frame_size_in_words);
+  __ mov(Rkind, Deoptimization::Unpack_reexecute);
+
+  // Calculate UnrollBlock and save the result in Rublock
+  __ bind(cont);
+  __ mov(R0, Rthread);
+  __ mov(R1, Rkind);
+
+  int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp); // note: FP may not need to be saved (not on x86)
+  assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin");
+  __ call(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info));
+  if (pc_offset == -1) {
+    pc_offset = __ offset();
+  }
+  oop_maps->add_gc_map(pc_offset, map);
+  __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call
+
+  __ mov(Rublock, R0);
+
+  // Reload Rkind from the UnrollBlock (might have changed)
+  __ ldr_s32(Rkind, Address(Rublock, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
+  Label noException;
+  __ cmp_32(Rkind, Deoptimization::Unpack_exception);   // Was exception pending?
+  __ b(noException, ne);
+  // handle exception case
+#ifdef ASSERT
+  // assert that exception_pc is zero in tls
+  { Label L;
+    __ ldr(Rexception_pc, Address(Rthread, JavaThread::exception_pc_offset()));
+    __ cbz(Rexception_pc, L);
+    __ stop("exception pc should be null");
+    __ bind(L);
+  }
+#endif
+  __ ldr(Rexception_obj, Address(Rthread, JavaThread::exception_oop_offset()));
+  __ verify_oop(Rexception_obj);
+  {
+    const Register Rzero = __ zero_register(Rtemp);
+    __ str(Rzero, Address(Rthread, JavaThread::exception_oop_offset()));
+  }
+
+  __ bind(noException);
+
+  // This frame is going away.  Fetch return value, so we can move it to
+  // a new frame.
+  __ ldr(R0, Address(SP, RegisterSaver::R0_offset * wordSize));
+#ifndef AARCH64
+  __ ldr(R1, Address(SP, RegisterSaver::R1_offset * wordSize));
+#endif // !AARCH64
+#ifndef __SOFTFP__
+  __ ldr_double(D0, Address(SP, RegisterSaver::D0_offset * wordSize));
+#endif
+  // pop frame
+  __ add(SP, SP, RegisterSaver::reg_save_size * wordSize);
+
+  // Set initial stack state before pushing interpreter frames
+  __ ldr_s32(Rtemp, Address(Rublock, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
+  __ ldr(R2, Address(Rublock, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
+  __ ldr(R3, Address(Rublock, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
+
+#ifdef AARCH64
+  // Pop deoptimized frame. Make sure to restore the initial saved FP/LR of the caller.
+  // They are needed for correct stack walking during stack overflow handling.
+  // Also, restored FP is saved in the bottom interpreter frame (LR is reloaded from unroll block).
+  __ sub(Rtemp, Rtemp, 2*wordSize);
+  __ add(SP, SP, Rtemp, ex_uxtx);
+  __ raw_pop(FP, LR);
+
+#ifdef ASSERT
+  { Label L;
+    __ ldr(Rtemp, Address(Rublock, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
+    __ cmp(FP, Rtemp);
+    __ b(L, eq);
+    __ stop("FP restored from deoptimized frame does not match FP stored in unroll block");
+    __ bind(L);
+  }
+  { Label L;
+    __ ldr(Rtemp, Address(R2));
+    __ cmp(LR, Rtemp);
+    __ b(L, eq);
+    __ stop("LR restored from deoptimized frame does not match the 1st PC in unroll block");
+    __ bind(L);
+  }
+#endif // ASSERT
+
+#else
+  __ add(SP, SP, Rtemp);
+#endif // AARCH64
+
+#ifdef ASSERT
+  // Compilers generate code that bang the stack by as much as the
+  // interpreter would need. So this stack banging should never
+  // trigger a fault. Verify that it does not on non product builds.
+  // See if it is enough stack to push deoptimized frames
+  if (UseStackBanging) {
+#ifndef AARCH64
+    // The compiled method that we are deoptimizing was popped from the stack.
+    // If the stack bang results in a stack overflow, we don't return to the
+    // method that is being deoptimized. The stack overflow exception is
+    // propagated to the caller of the deoptimized method. Need to get the pc
+    // from the caller in LR and restore FP.
+    __ ldr(LR, Address(R2, 0));
+    __ ldr(FP, Address(Rublock, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
+#endif // !AARCH64
+    __ ldr_s32(R8, Address(Rublock, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
+    __ arm_stack_overflow_check(R8, Rtemp);
+  }
+#endif
+  __ ldr_s32(R8, Address(Rublock, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
+
+#ifndef AARCH64
+  // Pick up the initial fp we should save
+  // XXX Note: was ldr(FP, Address(FP));
+
+  // The compiler no longer uses FP as a frame pointer for the
+  // compiled code. It can be used by the allocator in C2 or to
+  // memorize the original SP for JSR292 call sites.
+
+  // Hence, ldr(FP, Address(FP)) is probably not correct. For x86,
+  // Deoptimization::fetch_unroll_info computes the right FP value and
+  // stores it in Rublock.initial_info. This has been activated for ARM.
+  __ ldr(FP, Address(Rublock, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
+#endif // !AARCH64
+
+  __ ldr_s32(Rtemp, Address(Rublock, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes()));
+  __ mov(Rsender, SP);
+#ifdef AARCH64
+  __ sub(SP, SP, Rtemp, ex_uxtx);
+#else
+  __ sub(SP, SP, Rtemp);
+#endif // AARCH64
+
+  // Push interpreter frames in a loop
+  Label loop;
+  __ bind(loop);
+  __ ldr(LR, Address(R2, wordSize, post_indexed));         // load frame pc
+  __ ldr(Rtemp, Address(R3, wordSize, post_indexed));      // load frame size
+
+  __ raw_push(FP, LR);                                     // create new frame
+  __ mov(FP, SP);
+  __ sub(Rtemp, Rtemp, 2*wordSize);
+
+#ifdef AARCH64
+  __ sub(SP, SP, Rtemp, ex_uxtx);
+#else
+  __ sub(SP, SP, Rtemp);
+#endif // AARCH64
+
+  __ str(Rsender, Address(FP, frame::interpreter_frame_sender_sp_offset * wordSize));
+#ifdef AARCH64
+  __ str(ZR, Address(FP, frame::interpreter_frame_stack_top_offset * wordSize));
+#else
+  __ mov(LR, 0);
+  __ str(LR, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+#endif // AARCH64
+
+  __ subs(R8, R8, 1);                               // decrement counter
+  __ mov(Rsender, SP);
+  __ b(loop, ne);
+
+  // Re-push self-frame
+  __ ldr(LR, Address(R2));
+  __ raw_push(FP, LR);
+  __ mov(FP, SP);
+  __ sub(SP, SP, (frame_size_in_words - 2) * wordSize);
+
+  // Restore frame locals after moving the frame
+  __ str(R0, Address(SP, RegisterSaver::R0_offset * wordSize));
+#ifndef AARCH64
+  __ str(R1, Address(SP, RegisterSaver::R1_offset * wordSize));
+#endif // !AARCH64
+
+#ifndef __SOFTFP__
+  __ str_double(D0, Address(SP, RegisterSaver::D0_offset * wordSize));
+#endif // !__SOFTFP__
+
+#ifndef AARCH64
+#ifdef ASSERT
+  // Reload Rkind from the UnrollBlock and check that it was not overwritten (Rkind is not callee-saved)
+  { Label L;
+    __ ldr_s32(Rtemp, Address(Rublock, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
+    __ cmp_32(Rkind, Rtemp);
+    __ b(L, eq);
+    __ stop("Rkind was overwritten");
+    __ bind(L);
+  }
+#endif
+#endif
+
+  // Call unpack_frames with proper arguments
+  __ mov(R0, Rthread);
+  __ mov(R1, Rkind);
+
+  pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp);
+  assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin");
+  __ call(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames));
+  if (pc_offset == -1) {
+    pc_offset = __ offset();
+  }
+  oop_maps->add_gc_map(pc_offset, new OopMap(frame_size_in_words * VMRegImpl::slots_per_word, 0));
+  __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call
+
+  // Collect return values, pop self-frame and jump to interpreter
+  __ ldr(R0, Address(SP, RegisterSaver::R0_offset * wordSize));
+#ifndef AARCH64
+  __ ldr(R1, Address(SP, RegisterSaver::R1_offset * wordSize));
+#endif // !AARCH64
+  // Interpreter floats controlled by __SOFTFP__, but compiler
+  // float return value registers controlled by __ABI_HARD__
+  // This matters for vfp-sflt builds.
+#ifndef __SOFTFP__
+  // Interpreter hard float
+#ifdef __ABI_HARD__
+  // Compiler float return value in FP registers
+  __ ldr_double(D0, Address(SP, RegisterSaver::D0_offset * wordSize));
+#else
+  // Compiler float return value in integer registers,
+  // copy to D0 for interpreter (S0 <-- R0)
+  __ fmdrr(D0_tos, R0, R1);
+#endif
+#endif // !__SOFTFP__
+  __ mov(SP, FP);
+
+#ifdef AARCH64
+  __ raw_pop(FP, LR);
+  __ ret();
+#else
+  __ pop(RegisterSet(FP) | RegisterSet(PC));
+#endif // AARCH64
+
+  __ flush();
+
+  _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset,
+                                           reexecute_offset, frame_size_in_words);
+  _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
+}
+
+#ifdef COMPILER2
+
+//------------------------------generate_uncommon_trap_blob--------------------
+// Ought to generate an ideal graph & compile, but here's some ARM ASM
+// instead.
+void SharedRuntime::generate_uncommon_trap_blob() {
+  // allocate space for the code
+  ResourceMark rm;
+
+  // setup code generation tools
+  int pad = VerifyThread ? 512 : 0;
+#ifdef _LP64
+  CodeBuffer buffer("uncommon_trap_blob", 2700+pad, 512);
+#else
+  // Measured 8/7/03 at 660 in 32bit debug build (no VerifyThread)
+  // Measured 8/7/03 at 1028 in 32bit debug build (VerifyThread)
+  CodeBuffer buffer("uncommon_trap_blob", 2000+pad, 512);
+#endif
+  // bypassed when code generation useless
+  MacroAssembler* masm               = new MacroAssembler(&buffer);
+  const Register Rublock = AARCH64_ONLY(R22) NOT_AARCH64(R6);
+  const Register Rsender = AARCH64_ONLY(R23) NOT_AARCH64(altFP_7_11);
+  assert_different_registers(Rublock, Rsender, Rexception_obj, R0, R1, R2, R3, R8, Rtemp);
+
+  //
+  // This is the entry point for all traps the compiler takes when it thinks
+  // it cannot handle further execution of compilation code. The frame is
+  // deoptimized in these cases and converted into interpreter frames for
+  // execution
+  // The steps taken by this frame are as follows:
+  //   - push a fake "unpack_frame"
+  //   - call the C routine Deoptimization::uncommon_trap (this function
+  //     packs the current compiled frame into vframe arrays and returns
+  //     information about the number and size of interpreter frames which
+  //     are equivalent to the frame which is being deoptimized)
+  //   - deallocate the "unpack_frame"
+  //   - deallocate the deoptimization frame
+  //   - in a loop using the information returned in the previous step
+  //     push interpreter frames;
+  //   - create a dummy "unpack_frame"
+  //   - call the C routine: Deoptimization::unpack_frames (this function
+  //     lays out values on the interpreter frame which was just created)
+  //   - deallocate the dummy unpack_frame
+  //   - return to the interpreter entry point
+  //
+  //  Refer to the following methods for more information:
+  //   - Deoptimization::uncommon_trap
+  //   - Deoptimization::unpack_frames
+
+  // the unloaded class index is in R0 (first parameter to this blob)
+
+  __ raw_push(FP, LR);                                     // fake "unpack_frame": only FP and LR
+  __ set_last_Java_frame(SP, FP, false, Rtemp);
+  __ mov(R2, Deoptimization::Unpack_uncommon_trap);        // R2 = unpack kind
+  __ mov(R1, R0);                                          // R1 = incoming R0 (unloaded class index)
+  __ mov(R0, Rthread);                                     // R0 = current thread
+  __ call(CAST_FROM_FN_PTR(address, Deoptimization::uncommon_trap));
+  __ mov(Rublock, R0);                                     // keep the call result; used below as the UnrollBlock base
+  __ reset_last_Java_frame(Rtemp);
+  __ raw_pop(FP, LR);                                      // deallocate the fake "unpack_frame"
+
+#ifdef ASSERT
+  { Label L;
+    __ ldr_s32(Rtemp, Address(Rublock, Deoptimization::UnrollBlock::unpack_kind_offset_in_bytes()));
+    __ cmp_32(Rtemp, Deoptimization::Unpack_uncommon_trap);
+    __ b(L, eq);
+    __ stop("SharedRuntime::generate_uncommon_trap_blob: expected Unpack_uncommon_trap");
+    __ bind(L);
+  }
+#endif
+
+
+  // Set initial stack state before pushing interpreter frames
+  __ ldr_s32(Rtemp, Address(Rublock, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
+  __ ldr(R2, Address(Rublock, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
+  __ ldr(R3, Address(Rublock, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
+
+#ifdef AARCH64
+  // Pop deoptimized frame. Make sure to restore the initial saved FP/LR of the caller.
+  // They are needed for correct stack walking during stack overflow handling.
+  // Also, restored FP is saved in the bottom interpreter frame (LR is reloaded from unroll block).
+  __ sub(Rtemp, Rtemp, 2*wordSize);
+  __ add(SP, SP, Rtemp, ex_uxtx);
+  __ raw_pop(FP, LR);
+
+#ifdef ASSERT
+  { Label L;
+    __ ldr(Rtemp, Address(Rublock, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
+    __ cmp(FP, Rtemp);
+    __ b(L, eq);
+    __ stop("FP restored from deoptimized frame does not match FP stored in unroll block");
+    __ bind(L);
+  }
+  { Label L;
+    __ ldr(Rtemp, Address(R2));
+    __ cmp(LR, Rtemp);
+    __ b(L, eq);
+    __ stop("LR restored from deoptimized frame does not match the 1st PC in unroll block");
+    __ bind(L);
+  }
+#endif // ASSERT
+
+#else
+  __ add(SP, SP, Rtemp);
+#endif //AARCH64
+
+  // See if it is enough stack to push deoptimized frames
+#ifdef ASSERT
+  // Compilers generate code that bang the stack by as much as the
+  // interpreter would need. So this stack banging should never
+  // trigger a fault. Verify that it does not on non product builds.
+  if (UseStackBanging) {
+#ifndef AARCH64
+    // The compiled method that we are deoptimizing was popped from the stack.
+    // If the stack bang results in a stack overflow, we don't return to the
+    // method that is being deoptimized. The stack overflow exception is
+    // propagated to the caller of the deoptimized method. Need to get the pc
+    // from the caller in LR and restore FP.
+    __ ldr(LR, Address(R2, 0));
+    __ ldr(FP, Address(Rublock, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
+#endif // !AARCH64
+    __ ldr_s32(R8, Address(Rublock, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
+    __ arm_stack_overflow_check(R8, Rtemp);
+  }
+#endif
+  __ ldr_s32(R8, Address(Rublock, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
+  __ ldr_s32(Rtemp, Address(Rublock, Deoptimization::UnrollBlock::caller_adjustment_offset_in_bytes()));
+  __ mov(Rsender, SP);
+#ifdef AARCH64
+  __ sub(SP, SP, Rtemp, ex_uxtx);
+#else
+  __ sub(SP, SP, Rtemp);
+#endif
+#ifndef AARCH64
+  //  __ ldr(FP, Address(FP));
+  __ ldr(FP, Address(Rublock, Deoptimization::UnrollBlock::initial_info_offset_in_bytes()));
+#endif // !AARCH64
+
+  // Push interpreter frames in a loop
+  Label loop;
+  __ bind(loop);
+  __ ldr(LR, Address(R2, wordSize, post_indexed));         // load frame pc
+  __ ldr(Rtemp, Address(R3, wordSize, post_indexed));      // load frame size
+
+  __ raw_push(FP, LR);                                     // create new frame
+  __ mov(FP, SP);
+  __ sub(Rtemp, Rtemp, 2*wordSize);                        // FP and LR were already pushed
+
+#ifdef AARCH64
+  __ sub(SP, SP, Rtemp, ex_uxtx);
+#else
+  __ sub(SP, SP, Rtemp);
+#endif // AARCH64
+
+  __ str(Rsender, Address(FP, frame::interpreter_frame_sender_sp_offset * wordSize));
+#ifdef AARCH64
+  __ str(ZR, Address(FP, frame::interpreter_frame_stack_top_offset * wordSize));
+#else
+  __ mov(LR, 0);
+  __ str(LR, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+#endif // AARCH64
+  __ subs(R8, R8, 1);                               // decrement counter
+  __ mov(Rsender, SP);
+  __ b(loop, ne);
+
+  // Re-push self-frame
+  __ ldr(LR, Address(R2));
+  __ raw_push(FP, LR);
+  __ mov(FP, SP);
+
+  // Call unpack_frames with proper arguments
+  __ mov(R0, Rthread);
+  __ mov(R1, Deoptimization::Unpack_uncommon_trap);
+  __ set_last_Java_frame(SP, FP, false, Rtemp);
+  __ call(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames));
+  //  oop_maps->add_gc_map(__ pc() - start, new OopMap(frame_size_in_words, 0));
+  __ reset_last_Java_frame(Rtemp);
+
+  __ mov(SP, FP);
+#ifdef AARCH64
+  __ raw_pop(FP, LR);
+  __ ret();
+#else
+  __ pop(RegisterSet(FP) | RegisterSet(PC));
+#endif
+
+  masm->flush();
+  _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, NULL, 2 /* LR+FP */);
+}
+
+#endif // COMPILER2
+
+//------------------------------generate_handler_blob------
+//
+// Generate a special Compile2Runtime blob that saves all registers,
+// setup oopmap, and calls safepoint code to stop the compiled code for
+// a safepoint.
+//
+SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
+  assert(StubRoutines::forward_exception_entry() != NULL, "must be generated before");
+
+  ResourceMark rm;
+  CodeBuffer buffer("handler_blob", 256, 256);
+  int frame_size_words;
+  OopMapSet* oop_maps;
+
+  bool cause_return = (poll_type == POLL_AT_RETURN); // when false, an extra return-address slot is reserved below
+
+  MacroAssembler* masm = new MacroAssembler(&buffer);
+  address start = __ pc();
+  oop_maps = new OopMapSet();
+
+  if (!cause_return) {
+#ifdef AARCH64
+    __ raw_push(LR, LR);
+#else
+    __ sub(SP, SP, 4); // make room for LR which may still be live
+                       // here if we are coming from a c2 method
+#endif // AARCH64
+  }
+
+  OopMap* map = RegisterSaver::save_live_registers(masm, &frame_size_words, !cause_return);
+  if (!cause_return) {
+    // update saved PC with correct value
+    // need 2 steps because LR can be live in c2 method
+    __ ldr(LR, Address(Rthread, JavaThread::saved_exception_pc_offset()));
+    __ str(LR, Address(SP, RegisterSaver::LR_offset * wordSize));
+  }
+
+  __ mov(R0, Rthread);
+  int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp); // note: FP may not need to be saved (not on x86)
+  assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin");
+  __ call(call_ptr);
+  if (pc_offset == -1) {
+    pc_offset = __ offset(); // no offset reported by set_last_Java_frame: use the offset right after the call
+  }
+  oop_maps->add_gc_map(pc_offset, map);
+  __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call
+
+  // Check for pending exception
+  __ ldr(Rtemp, Address(Rthread, Thread::pending_exception_offset()));
+  __ cmp(Rtemp, 0); // result is consumed by the conditional branch/return emitted after the register restore
+
+#ifdef AARCH64
+  RegisterSaver::restore_live_registers(masm, cause_return);
+  Register ret_addr = cause_return ? LR : Rtemp;
+  if (!cause_return) {
+    __ raw_pop(FP, ret_addr);
+  }
+
+  Label throw_exception;
+  __ b(throw_exception, ne);
+  __ br(ret_addr); // no pending exception: resume at the return address
+
+  __ bind(throw_exception);
+  __ mov(Rexception_pc, ret_addr);
+#else // AARCH64
+  if (!cause_return) {
+    RegisterSaver::restore_live_registers(masm, false);
+    __ pop(PC, eq);            // no pending exception: return by popping into PC
+    __ pop(Rexception_pc);     // pending exception: the same stack slot becomes the exception pc
+  } else {
+    RegisterSaver::restore_live_registers(masm);
+    __ bx(LR, eq);             // no pending exception: return
+    __ mov(Rexception_pc, LR); // pending exception: LR is the exception pc
+  }
+#endif // AARCH64
+
+  __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp);
+
+  __ flush();
+
+  return SafepointBlob::create(&buffer, oop_maps, frame_size_words);
+}
+
+RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
+  assert(StubRoutines::forward_exception_entry() != NULL, "must be generated before");
+
+  ResourceMark rm;
+  CodeBuffer buffer(name, 1000, 512);
+  int frame_size_words;
+  OopMapSet *oop_maps;
+  int frame_complete;
+
+  MacroAssembler* masm = new MacroAssembler(&buffer);
+  Label pending_exception;
+
+  int start = __ offset();
+
+  oop_maps = new OopMapSet();
+  OopMap* map = RegisterSaver::save_live_registers(masm, &frame_size_words);
+
+  frame_complete = __ offset(); // code offset right after the register save; passed to new_runtime_stub below
+
+  __ mov(R0, Rthread);
+
+  int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp);
+  assert(start == 0, "warning: start differs from code_begin");
+  __ call(destination);
+  if (pc_offset == -1) {
+    pc_offset = __ offset(); // no offset reported by set_last_Java_frame: use the offset right after the call
+  }
+  oop_maps->add_gc_map(pc_offset, map);
+  __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call
+
+  __ ldr(R1, Address(Rthread, Thread::pending_exception_offset()));
+  __ cbnz(R1, pending_exception);
+
+  // Overwrite saved register values (they are reloaded by restore_live_registers below)
+
+  // Place metadata result of VM call into the save slot of Rmethod
+  __ get_vm_result_2(R1, Rtemp);
+  __ str(R1, Address(SP, RegisterSaver::Rmethod_offset * wordSize));
+
+  // Place target address (VM call result) into the save slot of Rtemp
+  __ str(R0, Address(SP, RegisterSaver::Rtemp_offset * wordSize));
+
+  RegisterSaver::restore_live_registers(masm);
+  __ jump(Rtemp); // continue at the target address stored above
+
+  __ bind(pending_exception);
+
+  RegisterSaver::restore_live_registers(masm);
+  const Register Rzero = __ zero_register(Rtemp);
+  __ str(Rzero, Address(Rthread, JavaThread::vm_result_2_offset())); // clear vm_result_2 on the exception path
+  __ mov(Rexception_pc, LR);
+  __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp);
+
+  __ flush();
+
+  return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_words, oop_maps, true);
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/stubGenerator_arm.cpp	2016-12-02 11:23:31.763315438 -0500
@@ -0,0 +1,4510 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "assembler_arm.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "nativeInst_arm.hpp"
+#include "oops/instanceOop.hpp"
+#include "oops/method.hpp"
+#include "oops/objArrayKlass.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/handles.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubCodeGenerator.hpp"
+#include "runtime/stubRoutines.hpp"
+#ifdef COMPILER2
+#include "opto/runtime.hpp"
+#endif
+
+// Declaration and definition of StubGenerator (no .hpp file).
+// For a more detailed description of the stub routine structure
+// see the comment in stubRoutines.hpp
+
+#define __ _masm->
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) __ block_comment(str)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+// -------------------------------------------------------------------------------------------------------------------------
+// Stub Code definitions
+
+// Platform dependent parameters for array copy stubs
+
+// Note: we have noticed a huge change in behavior on a microbenchmark
+// from platform to platform depending on the configuration.
+
+// Instead of adding a series of command line options (which
+// unfortunately have to be done in the shared file and cannot appear
+// only in the ARM port), the tested results are hard-coded here in a set
+// of options, selected by specifying 'ArmCopyPlatform'
+
+// Currently, this 'platform' is hardcoded to a value that is a good
+// enough trade-off.  However, one can easily modify this file to test
+// the hard-coded configurations or create new ones. If the gain is
+// significant, we could decide to either add command line options or
+// add code to automatically choose a configuration.
+
+// see comments below for the various configurations created
+#define DEFAULT_ARRAYCOPY_CONFIG 0
+#define TEGRA2_ARRAYCOPY_CONFIG 1
+#define IMX515_ARRAYCOPY_CONFIG 2
+
+// Hard coded choices (XXX: could be changed to a command line option)
+#define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG
+
+#ifdef AARCH64
+#define ArmCopyCacheLineSize 64
+#else
+#define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains
+#endif // AARCH64
+
+// TODO-AARCH64: tune and revise AArch64 arraycopy optimizations
+
+// configuration for each kind of loop
+typedef struct {
+  int pld_distance;       // prefetch distance (0 => no prefetch, <0: prefetch_before, >0: presumably prefetch after)
+#ifndef AARCH64
+  bool split_ldm;         // if true, split each LDM into LDMs with fewer registers
+  bool split_stm;         // if true, split each STM into STMs with fewer registers
+#endif // !AARCH64
+} arraycopy_loop_config;
+
+// configuration for all loops of one platform: one arraycopy_loop_config per direction (forward/backward) and kind (aligned/shifted)
+typedef struct {
+  // const char *description;
+  arraycopy_loop_config forward_aligned;
+  arraycopy_loop_config backward_aligned;
+  arraycopy_loop_config forward_shifted;
+  arraycopy_loop_config backward_shifted;
+} arraycopy_platform_config;
+
+// configured platforms
+static arraycopy_platform_config arraycopy_configurations[] = {
+  // configuration parameters for arraycopy loops (entry order presumably matches the *_ARRAYCOPY_CONFIG values - confirm at the use site)
+#ifdef AARCH64
+  {
+    {-256 }, // forward aligned
+    {-128 }, // backward aligned
+    {-256 }, // forward shifted
+    {-128 }  // backward shifted
+  }
+#else
+
+  // Configurations were chosen based on manual analysis of benchmark
+  // results, minimizing overhead with respect to best results on the
+  // different test cases.
+
+  // Prefetch before is always favored since it avoids dirtying the
+  // cache uselessly for small copies. Code for prefetch after has
+  // been kept in case the difference is significant for some
+  // platforms but we might consider dropping it.
+
+  // distance, ldm, stm
+  {
+    // default: tradeoff tegra2/imx515/nv-tegra2,
+    // Notes on benchmarking:
+    // - not far from optimal configuration on nv-tegra2
+    // - within 5% of optimal configuration except for backward aligned on IMX
+    // - up to 40% from optimal configuration for backward shifted and backward align for tegra2
+    //   but still on par with the operating system copy
+    {-256, true,  true  }, // forward aligned
+    {-256, true,  true  }, // backward aligned
+    {-256, false, false }, // forward shifted
+    {-256, true,  true  } // backward shifted
+  },
+  {
+    // configuration tuned on tegra2-4.
+    // Warning: should not be used on nv-tegra2 !
+    // Notes:
+    // - prefetch after gives 40% gain on backward copies on tegra2-4,
+    //   resulting in better number than the operating system
+    //   copy. However, this can lead to a 300% loss on nv-tegra and has
+    //   more impact on the cache (fetches further than what is
+    //   copied). Use this configuration with care, in case it improves
+    //   reference benchmarks.
+    {-256, true,  true  }, // forward aligned
+    {96,   false, false }, // backward aligned
+    {-256, false, false }, // forward shifted
+    {96,   false, false } // backward shifted
+  },
+  {
+    // configuration tuned on imx515
+    // Notes:
+    // - smaller prefetch distance is sufficient to get good result and might be more stable
+    // - refined backward aligned options within 5% of optimal configuration except for
+    //   tests where the arrays fit in the cache
+    {-160, false, false }, // forward aligned
+    {-160, false, false }, // backward aligned
+    {-160, false, false }, // forward shifted
+    {-160, true,  true  } // backward shifted
+  }
+#endif // AARCH64
+};
+
+class StubGenerator: public StubCodeGenerator {
+
+#ifdef PRODUCT
+#define inc_counter_np(a,b,c) ((void)0)
+#else
+#define inc_counter_np(counter, t1, t2) \
+  BLOCK_COMMENT("inc_counter " #counter); \
+  __ inc_counter(&counter, t1, t2);
+#endif
+
+ private:
+
+  address generate_call_stub(address& return_address) {
+    StubCodeMark mark(this, "StubRoutines", "call_stub");
+    address start = __ pc();
+
+#ifdef AARCH64
+    const int saved_regs_size = 192; // FP/LR pair plus the eleven 16-byte pairs stored below
+
+    __ stp(FP, LR, Address(SP, -saved_regs_size, pre_indexed));
+    __ mov(FP, SP);
+
+    int sp_offset = 16;
+    assert(frame::entry_frame_call_wrapper_offset * wordSize == sp_offset, "adjust this code");
+    __ stp(R0,  ZR,  Address(SP, sp_offset)); sp_offset += 16; // R0 lands in the call wrapper slot checked above
+
+    const int saved_result_and_result_type_offset = sp_offset;
+    __ stp(R1,  R2,  Address(SP, sp_offset)); sp_offset += 16; // reloaded after the call as result address / result type
+    __ stp(R19, R20, Address(SP, sp_offset)); sp_offset += 16;
+    __ stp(R21, R22, Address(SP, sp_offset)); sp_offset += 16;
+    __ stp(R23, R24, Address(SP, sp_offset)); sp_offset += 16;
+    __ stp(R25, R26, Address(SP, sp_offset)); sp_offset += 16;
+    __ stp(R27, R28, Address(SP, sp_offset)); sp_offset += 16;
+
+    __ stp_d(V8,  V9,  Address(SP, sp_offset)); sp_offset += 16;
+    __ stp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16;
+    __ stp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16;
+    __ stp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16;
+    assert (sp_offset == saved_regs_size, "adjust this code");
+
+    __ mov(Rmethod, R3);
+    __ mov(Rthread, R7);
+    __ reinit_heapbase();
+
+    { // Pass parameters
+      Label done_parameters, pass_parameters;
+
+      __ mov(Rparams, SP);
+      __ cbz_w(R6, done_parameters); // R6 counts the parameters; nothing to copy when it is zero
+
+      __ sub(Rtemp, SP, R6, ex_uxtw, LogBytesPerWord);
+      __ align_reg(SP, Rtemp, StackAlignmentInBytes);
+      __ add(Rparams, SP, R6, ex_uxtw, LogBytesPerWord);
+
+      __ bind(pass_parameters);
+      __ subs_w(R6, R6, 1);
+      __ ldr(Rtemp, Address(R5, wordSize, post_indexed));      // R5 walks the source parameters upwards
+      __ str(Rtemp, Address(Rparams, -wordSize, pre_indexed)); // Rparams walks down towards SP
+      __ b(pass_parameters, ne);
+
+      __ bind(done_parameters);
+
+#ifdef ASSERT
+      {
+        Label L;
+        __ cmp(SP, Rparams);
+        __ b(L, eq);
+        __ stop("SP does not match Rparams");
+        __ bind(L);
+      }
+#endif
+    }
+
+    __ mov(Rsender_sp, SP);
+    __ blr(R4); // call the address passed in R4
+    return_address = __ pc();
+
+    __ mov(SP, FP);
+
+    __ ldp(R1, R2, Address(SP, saved_result_and_result_type_offset));
+
+    { // Handle return value
+      Label cont;
+      __ str(R0, Address(R1));
+
+      __ cmp_w(R2, T_DOUBLE);
+      __ ccmp_w(R2, T_FLOAT, Assembler::flags_for_condition(eq), ne);
+      __ b(cont, ne); // skip the FP store unless the result type is T_DOUBLE or T_FLOAT
+
+      __ str_d(V0, Address(R1)); // floating-point result overwrites the R0 value stored above
+      __ bind(cont);
+    }
+
+    sp_offset = saved_result_and_result_type_offset + 16;
+    __ ldp(R19, R20, Address(SP, sp_offset)); sp_offset += 16;
+    __ ldp(R21, R22, Address(SP, sp_offset)); sp_offset += 16;
+    __ ldp(R23, R24, Address(SP, sp_offset)); sp_offset += 16;
+    __ ldp(R25, R26, Address(SP, sp_offset)); sp_offset += 16;
+    __ ldp(R27, R28, Address(SP, sp_offset)); sp_offset += 16;
+
+    __ ldp_d(V8,  V9,  Address(SP, sp_offset)); sp_offset += 16;
+    __ ldp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16;
+    __ ldp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16;
+    __ ldp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16;
+    assert (sp_offset == saved_regs_size, "adjust this code");
+
+    __ ldp(FP, LR, Address(SP, saved_regs_size, post_indexed));
+    __ ret();
+
+#else // AARCH64
+
+    assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code");
+
+    __ mov(Rtemp, SP); // remember the incoming SP to reach the stacked arguments below
+    __ push(RegisterSet(FP) | RegisterSet(LR));
+#ifndef __SOFTFP__
+    __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback);
+#endif
+    __ stmdb(SP, RegisterSet(R0, R2) | RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11, writeback);
+    __ mov(Rmethod, R3);
+    __ ldmia(Rtemp, RegisterSet(R1, R3) | Rthread); // stacked arguments
+
+    // XXX: TODO
+    // Would be better with respect to native tools if the following
+    // setting of FP was changed to conform to the native ABI, with FP
+    // pointing to the saved FP slot (and the corresponding modifications
+    // for entry_frame_call_wrapper_offset and frame::real_fp).
+    __ mov(FP, SP);
+
+    {
+      Label no_parameters, pass_parameters;
+      __ cmp(R3, 0); // R3 counts the parameters; nothing to push when it is zero
+      __ b(no_parameters, eq);
+
+      __ bind(pass_parameters);
+      __ ldr(Rtemp, Address(R2, wordSize, post_indexed)); // Rtemp OK, unused and scratchable
+      __ subs(R3, R3, 1);
+      __ push(Rtemp);
+      __ b(pass_parameters, ne);
+      __ bind(no_parameters);
+    }
+
+    __ mov(Rsender_sp, SP);
+    __ blx(R1); // call the address loaded into R1 from the stacked arguments
+    return_address = __ pc();
+
+    __ add(SP, FP, wordSize); // Skip link to JavaCallWrapper
+    __ pop(RegisterSet(R2, R3)); // R2 = result address, R3 = result type (as used below)
+#ifndef __ABI_HARD__
+    __ cmp(R3, T_LONG);
+    __ cmp(R3, T_DOUBLE, ne);
+    __ str(R0, Address(R2));
+    __ str(R1, Address(R2, wordSize), eq); // second word only for T_LONG / T_DOUBLE
+#else
+    Label cont, l_float, l_double;
+
+    __ cmp(R3, T_DOUBLE);
+    __ b(l_double, eq);
+
+    __ cmp(R3, T_FLOAT);
+    __ b(l_float, eq);
+
+    __ cmp(R3, T_LONG);
+    __ str(R0, Address(R2));
+    __ str(R1, Address(R2, wordSize), eq); // second word only for T_LONG
+    __ b(cont);
+
+
+    __ bind(l_double);
+    __ fstd(D0, Address(R2));
+    __ b(cont);
+
+    __ bind(l_float);
+    __ fsts(S0, Address(R2));
+
+    __ bind(cont);
+#endif
+
+    __ pop(RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11);
+#ifndef __SOFTFP__
+    __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback);
+#endif
+    __ pop(RegisterSet(FP) | RegisterSet(PC)); // restore FP and return by popping the saved LR into PC
+
+#endif // AARCH64
+    return start;
+  }
+
+
+  // (in) Rexception_obj: exception oop
+  address generate_catch_exception() {
+    StubCodeMark mark(this, "StubRoutines", "catch_exception");
+    address start = __ pc();
+
+    __ str(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); // record the exception as pending in the thread
+    __ b(StubRoutines::_call_stub_return_address);                                // continue at the call stub's return address
+
+    return start;
+  }
+
+
+  // (in) Rexception_pc: return address
+  address generate_forward_exception() { // emits the "forward exception" stub and returns its entry address
+    StubCodeMark mark(this, "StubRoutines", "forward exception");
+    address start = __ pc();
+
+    __ mov(c_rarg0, Rthread);       // arg 0: current thread
+    __ mov(c_rarg1, Rexception_pc); // arg 1: return address for which a handler is looked up
+    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
+                         SharedRuntime::exception_handler_for_return_address),
+                         c_rarg0, c_rarg1);
+    __ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); // fetch the pending exception oop
+    const Register Rzero = __ zero_register(Rtemp); // Rtemp OK (cleared by above call)
+    __ str(Rzero, Address(Rthread, Thread::pending_exception_offset())); // reset the thread's pending_exception field
+
+#ifdef ASSERT
+    // make sure exception is set
+    { Label L;
+      __ cbnz(Rexception_obj, L);
+      __ stop("StubRoutines::forward exception: no pending exception (2)");
+      __ bind(L);
+    }
+#endif
+
+    // Verify that there is really a valid exception in Rexception_obj.
+    __ verify_oop(Rexception_obj);
+
+    __ jump(R0); // handler is returned in R0 by runtime function
+    return start;
+  }
+
+
+#ifndef AARCH64
+
+  // Integer division shared routine
+  //   Input:
+  //     R0  - dividend
+  //     R2  - divisor
+  //   Output:
+  //     R0  - remainder
+  //     R1  - quotient
+  //   Destroys:
+  //     R2
+  //     LR
+  address generate_idiv_irem() { // emits the shared 32-bit divide/remainder routine and returns its entry address
+    Label positive_arguments, negative_or_zero, call_slow_path;
+    Register dividend  = R0;
+    Register divisor   = R2;
+    Register remainder = R0; // same register as dividend: it is reduced in place
+    Register quotient  = R1;
+    Register tmp       = LR; // LR is pushed before it is reused as a temporary
+    assert(dividend == remainder, "must be");
+
+    address start = __ pc();
+
+    // Check for special cases: divisor <= 0 or dividend < 0
+    __ cmp(divisor, 0);
+    __ orrs(quotient, dividend, divisor, ne); // only when divisor != 0: sign bit of the OR is set if either operand is negative
+    __ b(negative_or_zero, le);               // taken when divisor == 0 (flags from cmp) or when the OR is negative
+
+    __ bind(positive_arguments);
+    // Save return address on stack to free one extra register
+    __ push(LR);
+    // Approximate the maximum order of the quotient
+    __ clz(tmp, dividend);
+    __ clz(quotient, divisor);
+    __ subs(tmp, quotient, tmp); // tmp = clz(divisor) - clz(dividend); negative means divisor > dividend
+    __ mov(quotient, 0);
+    // Jump to the appropriate place in the unrolled loop below
+    __ ldr(PC, Address(PC, tmp, lsl, 2), pl); // loads offset_table[tmp]; relies on PC reading as this instruction + 8, i.e. the table start
+    // If divisor is greater than dividend, return immediately
+    __ pop(PC);
+
+    // Offset table
+    Label offset_table[32]; // offset_table[i] is the address of division step i
+    int i;
+    for (i = 0; i <= 31; i++) {
+      __ emit_address(offset_table[i]);
+    }
+
+    // Unrolled loop of 32 division steps
+    for (i = 31; i >= 0; i--) { // steps are emitted from 31 down to 0, so entering at step k runs steps k..0
+      __ bind(offset_table[i]);
+      __ cmp(remainder, AsmOperand(divisor, lsl, i));
+      __ sub(remainder, remainder, AsmOperand(divisor, lsl, i), hs); // if remainder >= (divisor << i) (unsigned): subtract it ...
+      __ add(quotient, quotient, 1 << i, hs);                        // ... and set bit i of the quotient
+    }
+    __ pop(PC); // return to the address pushed at positive_arguments
+
+    __ bind(negative_or_zero);
+    // Find the combination of argument signs and jump to corresponding handler
+    __ andr(quotient, dividend, 0x80000000, ne);                  // bit 31 = sign of dividend (skipped when divisor == 0)
+    __ orr(quotient, quotient, AsmOperand(divisor, lsr, 31), ne); // bit 0  = sign of divisor
+    __ add(PC, PC, AsmOperand(quotient, ror, 26), ne);            // ror 26 maps bit 31 -> 32 and bit 0 -> 64: byte offset of the handler below
+    __ str(LR, Address(Rthread, JavaThread::saved_exception_pc_offset())); // reached only for divisor == 0: record the caller's return address
+
+    // The leaf runtime function can destroy R0-R3 and R12 registers which are still alive
+    RegisterSet saved_registers = RegisterSet(R3) | RegisterSet(R12);
+#if R9_IS_SCRATCHED
+    // Safer to save R9 here since callers may have been written
+    // assuming R9 survives. This is suboptimal but may not be worth
+    // revisiting for this slow case.
+
+    // save also R10 for alignment
+    saved_registers = saved_registers | RegisterSet(R9, R10);
+#endif
+    {
+      // divisor == 0
+      FixedSizeCodeBlock zero_divisor(_masm, 8, true); // presumably fixes the handler at 8 instructions, matching the 32-byte jump offsets above
+      __ push(saved_registers);
+      __ mov(R0, Rthread);
+      __ mov(R1, LR);
+      __ mov(R2, SharedRuntime::IMPLICIT_DIVIDE_BY_ZERO);
+      __ b(call_slow_path);
+    }
+
+    {
+      // divisor > 0 && dividend < 0
+      FixedSizeCodeBlock positive_divisor_negative_dividend(_masm, 8, true);
+      __ push(LR);
+      __ rsb(dividend, dividend, 0);   // dividend = -dividend
+      __ bl(positive_arguments);       // divide the absolute values
+      __ rsb(remainder, remainder, 0); // negate both results
+      __ rsb(quotient, quotient, 0);
+      __ pop(PC);
+    }
+
+    {
+      // divisor < 0 && dividend > 0
+      FixedSizeCodeBlock negative_divisor_positive_dividend(_masm, 8, true);
+      __ push(LR);
+      __ rsb(divisor, divisor, 0);   // divisor = -divisor
+      __ bl(positive_arguments);
+      __ rsb(quotient, quotient, 0); // only the quotient is negated
+      __ pop(PC);
+    }
+
+    {
+      // divisor < 0 && dividend < 0
+      FixedSizeCodeBlock negative_divisor_negative_dividend(_masm, 8, true);
+      __ push(LR);
+      __ rsb(dividend, dividend, 0);
+      __ rsb(divisor, divisor, 0);
+      __ bl(positive_arguments);
+      __ rsb(remainder, remainder, 0); // only the remainder is negated
+      __ pop(PC);
+    }
+
+    __ bind(call_slow_path);
+    __ call(CAST_FROM_FN_PTR(address, SharedRuntime::continuation_for_implicit_exception)); // arguments were set up in R0-R2 by the zero-divisor handler
+    __ pop(saved_registers);
+    __ bx(R0); // continue at the address returned by the runtime call
+
+    return start;
+  }
+
+
+ // As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as:
+ //  <fence>; <op>; <membar StoreLoad|StoreStore>
+ // But for load-linked/store-conditional based systems a fence here simply means
+ // no load/store can be reordered with respect to the initial load-linked, so we have:
+ // <membar storeload|loadload> ; load-linked; <op>; store-conditional; <membar storeload|storestore>
+ // There are no memory actions in <op> so nothing further is needed.
+ //
+ // So we define the following for convenience:
+#define MEMBAR_ATOMIC_OP_PRE \
+    MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::LoadLoad)
+#define MEMBAR_ATOMIC_OP_POST \
+    MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::StoreStore)
+
+  // Note: JDK 9 only supports ARMv7+ so we always have ldrexd available even though the
+  // code below allows for it to be otherwise. The else clause indicates an ARMv5 system
+  // for which we do not support MP and so membars are not necessary. This ARMv5 code will
+  // be removed in the future.
+
+  // Support for jint Atomic::add(jint add_value, volatile jint *dest)
+  //
+  // Arguments :
+  //
+  //      add_value:      R0
+  //      dest:           R1
+  //
+  // Results:
+  //
+  //     R0: the new value stored in dest
+  //
+  // Overwrites:
+  //
+  //     R1, R2, R3
+  //
+  address generate_atomic_add() { // emits the "atomic_add" stub and returns its entry address
+    address start;
+
+    StubCodeMark mark(this, "StubRoutines", "atomic_add");
+    Label retry;
+    start = __ pc();
+    Register addval    = R0;
+    Register dest      = R1;
+    Register prev      = R2;
+    Register ok        = R2; // shares R2 with prev
+    Register newval    = R3;
+
+    if (VM_Version::supports_ldrex()) {
+      __ membar(MEMBAR_ATOMIC_OP_PRE, prev); // prev is presumably just the barrier's scratch register here
+      __ bind(retry);
+      __ ldrex(newval, Address(dest));       // load-linked the current value
+      __ add(newval, addval, newval);
+      __ strex(ok, newval, Address(dest));   // store-conditional; status goes to ok
+      __ cmp(ok, 0);
+      __ b(retry, ne);                       // non-zero status: try again
+      __ mov (R0, newval);                   // result is the newly stored value
+      __ membar(MEMBAR_ATOMIC_OP_POST, prev);
+    } else {
+      __ bind(retry);
+      __ ldr (prev, Address(dest));
+      __ add(newval, addval, prev);
+      __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/);
+      __ b(retry, ne); // presumably ne signals that the compare-and-swap did not succeed - confirm against atomic_cas_bool
+      __ mov (R0, newval);
+    }
+    __ bx(LR);
+
+    return start;
+  }
+
+  // Support for jint Atomic::xchg(jint exchange_value, volatile jint *dest)
+  //
+  // Arguments :
+  //
+  //      exchange_value: R0
+  //      dest:           R1
+  //
+  // Results:
+  //
+  //     R0: the value previously stored in dest
+  //
+  // Overwrites:
+  //
+  //     R1, R2, R3
+  //
+  address generate_atomic_xchg() { // emits the "atomic_xchg" stub and returns its entry address
+    address start;
+
+    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
+    start = __ pc();
+    Register newval    = R0;
+    Register dest      = R1;
+    Register prev      = R2;
+
+    Label retry;
+
+    if (VM_Version::supports_ldrex()) {
+      Register ok=R3;
+      __ membar(MEMBAR_ATOMIC_OP_PRE, prev); // prev is presumably just the barrier's scratch register here
+      __ bind(retry);
+      __ ldrex(prev, Address(dest));         // load-linked the old value
+      __ strex(ok, newval, Address(dest));   // store-conditional; status goes to ok
+      __ cmp(ok, 0);
+      __ b(retry, ne);                       // non-zero status: try again
+      __ mov (R0, prev);                     // result is the previous value
+      __ membar(MEMBAR_ATOMIC_OP_POST, prev);
+    } else {
+      __ bind(retry);
+      __ ldr (prev, Address(dest));
+      __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/);
+      __ b(retry, ne); // presumably ne signals that the compare-and-swap did not succeed - confirm against atomic_cas_bool
+      __ mov (R0, prev);
+    }
+    __ bx(LR);
+
+    return start;
+  }
+
+  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value)
+  //
+  // Arguments :
+  //
+  //      compare_value:  R0
+  //      exchange_value: R1
+  //      dest:           R2
+  //
+  // Results:
+  //
+  //     R0: the value previously stored in dest
+  //
+  // Overwrites:
+  //
+  //     R0, R1, R2, R3, Rtemp
+  //
+  address generate_atomic_cmpxchg() { // emits the "atomic_cmpxchg" stub and returns its entry address
+    address start;
+
+    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
+    start = __ pc();
+    Register cmp       = R0;
+    Register newval    = R1;
+    Register dest      = R2;
+    Register temp1     = R3;
+    Register temp2     = Rtemp; // Rtemp free (native ABI)
+
+    __ membar(MEMBAR_ATOMIC_OP_PRE, temp1); // barrier before the operation
+
+    // atomic_cas returns previous value in R0
+    __ atomic_cas(temp1, temp2, cmp, newval, dest, 0);
+
+    __ membar(MEMBAR_ATOMIC_OP_POST, temp1); // barrier after the operation
+
+    __ bx(LR);
+
+    return start;
+  }
+
+  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
+  // reordered before by a wrapper to (jlong compare_value, jlong exchange_value, volatile jlong *dest)
+  //
+  // Arguments :
+  //
+  //      compare_value:  R1 (High), R0 (Low)
+  //      exchange_value: R3 (High), R2 (Low)
+  //      dest:           SP+0
+  //
+  // Results:
+  //
+  //     R0:R1: the value previously stored in dest
+  //
+  // Overwrites:
+  //
+  address generate_atomic_cmpxchg_long() { // emits the "atomic_cmpxchg_long" stub and returns its entry address
+    address start;
+
+    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
+    start = __ pc();
+    Register cmp_lo      = R0;
+    Register cmp_hi      = R1;
+    Register newval_lo   = R2;
+    Register newval_hi   = R3;
+    Register addr        = Rtemp;  /* After load from stack */
+    Register temp_lo     = R4;
+    Register temp_hi     = R5;
+    Register temp_result = R8;
+    assert_different_registers(cmp_lo, newval_lo, temp_lo, addr, temp_result, R7);
+    assert_different_registers(cmp_hi, newval_hi, temp_hi, addr, temp_result, R7);
+
+    __ membar(MEMBAR_ATOMIC_OP_PRE, Rtemp); // Rtemp free (native ABI)
+
+    // Stack is unaligned, maintain double word alignment by pushing
+    // odd number of regs.
+    __ push(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); // R4, R5 and R8 are restored before returning
+    __ ldr(addr, Address(SP, 12)); // dest was at SP+0 on entry; the three words pushed above moved it to SP+12
+
+    // atomic_cas64 returns previous value in temp_lo, temp_hi
+    __ atomic_cas64(temp_lo, temp_hi, temp_result, cmp_lo, cmp_hi,
+                    newval_lo, newval_hi, addr, 0);
+    __ mov(R0, temp_lo); // move the previous value into the R0:R1 result pair
+    __ mov(R1, temp_hi);
+
+    __ pop(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi));
+
+    __ membar(MEMBAR_ATOMIC_OP_POST, Rtemp); // Rtemp free (native ABI)
+    __ bx(LR);
+
+    return start;
+  }
+
+  address generate_atomic_load_long() { // emits the "atomic_load_long" stub: loads the 64-bit value at R0 into R0:R1
+    address start;
+
+    StubCodeMark mark(this, "StubRoutines", "atomic_load_long");
+    start = __ pc();
+    Register result_lo = R0;
+    Register result_hi = R1;
+    Register src       = R0; // same register as result_lo: the address is overwritten by the loaded value
+
+    if (!os::is_MP()) {
+      __ ldmia(src, RegisterSet(result_lo, result_hi)); // not MP: a plain two-register load is used
+      __ bx(LR);
+    } else if (VM_Version::supports_ldrexd()) {
+      __ ldrexd(result_lo, Address(src)); // only result_lo is named; result_hi is presumably the implicit second register of the pair
+      __ clrex(); // FIXME: safe to remove?
+      __ bx(LR);
+    } else {
+      __ stop("Atomic load(jlong) unsupported on this platform");
+      __ bx(LR);
+    }
+
+    return start;
+  }
+
+  address generate_atomic_store_long() { // emits the "atomic_store_long" stub: stores R0:R1 to the address in R2
+    address start;
+
+    StubCodeMark mark(this, "StubRoutines", "atomic_store_long");
+    start = __ pc();
+    Register newval_lo = R0;
+    Register newval_hi = R1;
+    Register dest      = R2;
+    Register scratch_lo    = R2; // same register as dest, hence the copy of dest to Rtemp below
+    Register scratch_hi    = R3;  /* never referenced by name; presumably the implicit second register of the ldrexd below. NOTE(review): nothing is loaded from the stack in this stub */
+    Register result    = R3; // strexd status; shares R3 with scratch_hi
+
+    if (!os::is_MP()) {
+      __ stmia(dest, RegisterSet(newval_lo, newval_hi)); // not MP: a plain two-register store is used
+      __ bx(LR);
+    } else if (VM_Version::supports_ldrexd()) {
+      __ mov(Rtemp, dest);  // get dest to Rtemp
+      Label retry;
+      __ bind(retry);
+      __ ldrexd(scratch_lo, Address(Rtemp)); // loaded value is discarded; the load is only the load-linked half of the pair
+      __ strexd(result, R0, Address(Rtemp)); // R0 is newval_lo; status goes to result
+      __ rsbs(result, result, 1);            // result = 1 - result, setting the flags
+      __ b(retry, eq);                       // eq <=> status was 1: try again
+      __ bx(LR);
+    } else {
+      __ stop("Atomic store(jlong) unsupported on this platform");
+      __ bx(LR);
+    }
+
+    return start;
+  }
+
+
+#endif // AARCH64
+
+#ifdef COMPILER2
+  // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super );
+  // Arguments :
+  //
+  //      ret  : R0, returned
+  //      icc/xcc: set as R0 (depending on wordSize)
+  //      sub  : R1, argument, not changed
+  //      super: R2, argument, not changed
+  //      raddr: LR, blown by call
+  address generate_partial_subtype_check() { // emits the "partial_subtype_check" stub and returns its entry address
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
+    address start = __ pc();
+
+    // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops)
+
+    // R0 used as tmp_reg (in addition to return reg)
+    Register sub_klass = R1;
+    Register super_klass = R2;
+    Register tmp_reg2 = R3;
+    Register tmp_reg3 = R4;
+#define saved_set tmp_reg2, tmp_reg3
+
+    Label L_loop, L_fail;
+
+    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+
+    // fast check should be redundant
+
+    // slow check
+    {
+      __ raw_push(saved_set); // the two temporaries are restored on both exits
+
+      // a couple of useful fields in sub_klass:
+      int ss_offset = in_bytes(Klass::secondary_supers_offset());
+
+      // Do a linear scan of the secondary super-klass chain.
+      // This code is rarely used, so simplicity is a virtue here.
+
+      inc_counter_np(SharedRuntime::_partial_subtype_ctr, tmp_reg2, tmp_reg3);
+
+      Register scan_temp = tmp_reg2;
+      Register count_temp = tmp_reg3;
+
+      // We will consult the secondary-super array.
+      __ ldr(scan_temp, Address(sub_klass, ss_offset));
+
+      Register search_key = super_klass;
+
+      // Load the array length.
+      __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
+      __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); // point scan_temp at the first element
+
+      __ add(count_temp, count_temp, 1); // the loop decrements before testing, so start at length + 1
+
+      // Top of search loop
+      __ bind(L_loop);
+      // Notes:
+      //  scan_temp starts at the array elements
+      //  count_temp is 1+size
+      __ subs(count_temp, count_temp, 1);
+      __ b(L_fail, eq); // not found in the array
+
+      // Load next super to check
+      // In the array of super classes elements are pointer sized.
+      int element_size = wordSize;
+      __ ldr(R0, Address(scan_temp, element_size, post_indexed)); // load the element and advance scan_temp
+
+      // Look for super_klass (search_key) on sub_klass's secondary super-class-overflow list
+      __ subs(R0, R0, search_key); // set R0 to 0 on success (and flags to eq)
+
+      // A miss means we are NOT a subtype and need to keep looping
+      __ b(L_loop, ne);
+
+      // Falling out the bottom means we found a hit; we ARE a subtype
+
+      // Success.  Cache the super we found and proceed in triumph.
+      __ str(super_klass, Address(sub_klass, sc_offset));
+
+      // Return success
+      // R0 is already 0 and flags are already set to eq
+      __ raw_pop(saved_set);
+      __ ret();
+
+      // Return failure
+      __ bind(L_fail);
+#ifdef AARCH64
+      // count_temp is 0, can't use ZR here
+      __ adds(R0, count_temp, 1); // sets the flags
+#else
+      __ movs(R0, 1); // sets the flags
+#endif
+      __ raw_pop(saved_set);
+      __ ret();
+    }
+    return start;
+  }
+#undef saved_set
+#endif // COMPILER2
+
+
+  //----------------------------------------------------------------------------------------------------
+  // Non-destructive plausibility checks for oops
+
+  address generate_verify_oop() { // emits the "verify_oop" stub and returns its entry address
+    StubCodeMark mark(this, "StubRoutines", "verify_oop");
+    address start = __ pc();
+
+    // Incoming arguments:
+    //
+    // R0: error message (char* )
+    // R1: address of register save area
+    // R2: oop to verify
+    //
+    // All registers are saved before calling this stub. However, condition flags should be saved here.
+
+    const Register oop   = R2;
+    const Register klass = R3;
+    const Register tmp1  = R6;
+    const Register tmp2  = R8;
+
+    const Register flags     = Rtmp_save0; // R4/R19
+    const Register ret_addr  = Rtmp_save1; // R5/R20
+    assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7);
+
+    Label exit, error;
+    InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr());
+
+#ifdef AARCH64
+    __ mrs(flags, Assembler::SysReg_NZCV);
+#else
+    __ mrs(Assembler::CPSR, flags);
+#endif // AARCH64
+
+    __ ldr_literal(tmp1, verify_oop_count); // tmp1 = address of the verify_oop counter
+    __ ldr_s32(tmp2, Address(tmp1));
+    __ add(tmp2, tmp2, 1);                  // increment the 32-bit counter (plain load/add/store)
+    __ str_32(tmp2, Address(tmp1));
+
+    // make sure object is 'reasonable'
+    __ cbz(oop, exit);                           // if obj is NULL it is ok
+
+    // Check if the oop is in the right area of memory
+    // Note: oop_mask and oop_bits must be updated if the code is saved/reused
+    const address oop_mask = (address) Universe::verify_oop_mask();
+    const address oop_bits = (address) Universe::verify_oop_bits();
+    __ mov_address(tmp1, oop_mask, symbolic_Relocation::oop_mask_reference);
+    __ andr(tmp2, oop, tmp1);
+    __ mov_address(tmp1, oop_bits, symbolic_Relocation::oop_bits_reference);
+    __ cmp(tmp2, tmp1);                          // (oop & oop_mask) must equal oop_bits
+    __ b(error, ne);
+
+    // make sure klass is 'reasonable'
+    __ load_klass(klass, oop);                   // get klass
+    __ cbz(klass, error);                        // if klass is NULL it is broken
+
+    // return if everything seems ok
+    __ bind(exit);
+
+#ifdef AARCH64
+    __ msr(Assembler::SysReg_NZCV, flags);
+#else
+    __ msr(Assembler::CPSR_f, flags);
+#endif // AARCH64
+
+    __ ret();
+
+    // handle errors
+    __ bind(error);
+
+    __ mov(ret_addr, LR);                      // save return address
+
+    // R0: error message
+    // R1: register save area
+    __ call(CAST_FROM_FN_PTR(address, MacroAssembler::debug));
+
+    __ mov(LR, ret_addr);                      // restore return address
+    __ b(exit);                                // leave through the common exit, which writes 'flags' back
+
+    __ bind_literal(verify_oop_count);         // literal holding the counter address, emitted after the code
+
+    return start;
+  }
+
+  //----------------------------------------------------------------------------------------------------
+  // Array copy stubs
+
+  //
+  //  Generate overlap test for array copy stubs
+  //
+  //  Input:
+  //    R0    -  array1
+  //    R1    -  array2
+  //    R2    -  element count, 32-bit int
+  //
+  //  input registers are preserved
+  //
+  void array_overlap_test(address no_overlap_target, int log2_elem_size, Register tmp1, Register tmp2) { // variant that branches to a code address
+    assert(no_overlap_target != NULL, "must be generated");
+    array_overlap_test(no_overlap_target, NULL, log2_elem_size, tmp1, tmp2); // NULL label selects the address target
+  }
+  void array_overlap_test(Label& L_no_overlap, int log2_elem_size, Register tmp1, Register tmp2) { // variant that branches to a label
+    array_overlap_test(NULL, &L_no_overlap, log2_elem_size, tmp1, tmp2); // non-NULL label takes precedence over the address
+  }
+  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) { // common implementation: uses *NOLp when non-NULL, else no_overlap_target
+    const Register from       = R0;
+    const Register to         = R1;
+    const Register count      = R2;
+    const Register to_from    = tmp1; // to - from
+#ifndef AARCH64
+    const Register byte_count = (log2_elem_size == 0) ? count : tmp2; // count << log2_elem_size
+#endif // AARCH64
+    assert_different_registers(from, to, count, tmp1, tmp2);
+
+    // no_overlap version works if 'to' is lower (unsigned) than 'from',
+    // or if 'to' is at least (count*size) bytes above 'from'
+
+    BLOCK_COMMENT("Array Overlap Test:");
+    __ subs(to_from, to, from);
+#ifndef AARCH64
+    if (log2_elem_size != 0) {
+      __ mov(byte_count, AsmOperand(count, lsl, log2_elem_size)); // emitted between subs and the branch that consumes its flags
+    }
+#endif // !AARCH64
+    if (NOLp == NULL)
+      __ b(no_overlap_target,lo); // to < from (unsigned)
+    else
+      __ b((*NOLp), lo);
+#ifdef AARCH64
+    __ subs(ZR, to_from, count, ex_sxtw, log2_elem_size);
+#else
+    __ cmp(to_from, byte_count);
+#endif // AARCH64
+    if (NOLp == NULL)
+      __ b(no_overlap_target, ge); // to - from >= count << log2_elem_size
+    else
+      __ b((*NOLp), ge);
+  }
+
+#ifdef AARCH64
+  // TODO-AARCH64: revise usages of bulk_* methods (probably ldp`s and stp`s should interlace)
+
+  // Loads [from, from + count*wordSize) into regs[0], regs[1], ..., regs[count-1]
+  // and increases 'from' by count*wordSize.
+  void bulk_load_forward(Register from, const Register regs[], int count) { // emits count/2 ldp instructions
+    assert (count > 0 && count % 2 == 0, "count must be positive even number");
+    int bytes = count * wordSize;
+
+    int offset = 0;
+    __ ldp(regs[0], regs[1], Address(from, bytes, post_indexed)); // first pair; 'from' is advanced by the whole block here
+    offset += 2*wordSize;
+
+    for (int i = 2; i < count; i += 2) {
+      __ ldp(regs[i], regs[i+1], Address(from, -bytes + offset)); // remaining pairs, addressed back from the advanced 'from'
+      offset += 2*wordSize;
+    }
+
+    assert (offset == bytes, "must be");
+  }
+
+  // Stores regs[0], regs[1], ..., regs[count-1] to [to, to + count*wordSize)
+  // and increases 'to' by count*wordSize.
+  void bulk_store_forward(Register to, const Register regs[], int count) { // emits count/2 stp instructions
+    assert (count > 0 && count % 2 == 0, "count must be positive even number");
+    int bytes = count * wordSize;
+
+    int offset = 0;
+    __ stp(regs[0], regs[1], Address(to, bytes, post_indexed)); // first pair; 'to' is advanced by the whole block here
+    offset += 2*wordSize;
+
+    for (int i = 2; i < count; i += 2) {
+      __ stp(regs[i], regs[i+1], Address(to, -bytes + offset)); // remaining pairs, addressed back from the advanced 'to'
+      offset += 2*wordSize;
+    }
+
+    assert (offset == bytes, "must be");
+  }
+
+  // Loads [from - count*wordSize, from) into regs[0], regs[1], ..., regs[count-1]
+  // and decreases 'from' by count*wordSize.
+  // Note that the word with lowest address goes to regs[0].
+  void bulk_load_backward(Register from, const Register regs[], int count) { // emits count/2 ldp instructions
+    assert (count > 0 && count % 2 == 0, "count must be positive even number");
+    int bytes = count * wordSize;
+
+    int offset = 0;
+
+    for (int i = count - 2; i > 0; i -= 2) {
+      offset += 2*wordSize;
+      __ ldp(regs[i], regs[i+1], Address(from, -offset)); // highest pairs first, addressed below the unchanged 'from'
+    }
+
+    offset += 2*wordSize;
+    __ ldp(regs[0], regs[1], Address(from, -bytes, pre_indexed)); // lowest pair last; 'from' is moved down by the whole block here
+
+    assert (offset == bytes, "must be");
+  }
+
+  // Stores regs[0], regs[1], ..., regs[count-1] into [to - count*wordSize, to)
+  // and decreases 'to' by count*wordSize.
+  // Note that regs[0] value goes into the memory with lowest address.
+  void bulk_store_backward(Register to, const Register regs[], int count) { // emits count/2 stp instructions
+    assert (count > 0 && count % 2 == 0, "count must be positive even number");
+    int bytes = count * wordSize;
+
+    int offset = 0;
+
+    for (int i = count - 2; i > 0; i -= 2) {
+      offset += 2*wordSize;
+      __ stp(regs[i], regs[i+1], Address(to, -offset)); // highest pairs first, addressed below the unchanged 'to'
+    }
+
+    offset += 2*wordSize;
+    __ stp(regs[0], regs[1], Address(to, -bytes, pre_indexed)); // lowest pair last; 'to' is moved down by the whole block here
+
+    assert (offset == bytes, "must be");
+  }
+#endif // AARCH64
+
+  // TODO-AARCH64: rearrange in-loop prefetches:
+  //   probably we should choose between "prefetch-store before or after store", not "before or after load".
+  void prefetch(Register from, Register to, int offset, int to_delta = 0) { // emits a read prefetch of [from + offset]
+    __ prefetch_read(Address(from, offset));
+#ifdef AARCH64
+  // Next line commented out to avoid significant loss of performance in memory copy - JDK-8078120
+  // (as a result 'to' and 'to_delta' are currently unused):  __ prfm(pstl1keep, Address(to, offset + to_delta));
+#endif // AARCH64
+  }
+
+  // Generate the inner loop for forward aligned array copy
+  //
+  // Arguments
+  //      from:      src address, 64 bits  aligned
+  //      to:        dst address, wordSize aligned
+  //      count:     number of elements (32-bit int)
+  //      bytes_per_count: number of bytes for each unit of 'count'
+  //
+  // Return the minimum initial value for count
+  //
+  // Notes:
+  // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
+  // - 'to' aligned on wordSize
+  // - 'count' must be greater than or equal to the returned value
+  //
+  // Increases 'from' and 'to' by count*bytes_per_count.
+  //
+  // Scratches 'count', R3.
+  // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored).
+  //
+  int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) {
+    assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
+
+    const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
+    arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned;
+    int pld_offset = config->pld_distance; // sign selects prefetch placement (see prefetch_before/prefetch_after); magnitude is the distance
+    const int count_per_loop = bytes_per_loop / bytes_per_count; // elements copied per iteration; also the returned minimum count
+
+#ifndef AARCH64
+    bool split_read= config->split_ldm;
+    bool split_write= config->split_stm;
+
+    // XXX optim: use VLDM/VSTM when available (Neon) with PLD
+    //  NEONCopyPLD
+    //      PLD [r1, #0xC0]
+    //      VLDM r1!,{d0-d7}
+    //      VSTM r0!,{d0-d7}
+    //      SUBS r2,r2,#0x40
+    //      BGE NEONCopyPLD
+
+    __ push(RegisterSet(R4,R10)); // restored by the pop at the end of the 32-bit path
+#endif // !AARCH64
+
+    const bool prefetch_before = pld_offset < 0;
+    const bool prefetch_after = pld_offset > 0;
+
+    Label L_skip_pld;
+
+    // predecrease to exit when there is less than count_per_loop
+    __ sub_32(count, count, count_per_loop);
+
+    if (pld_offset != 0) {
+      pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; // from here on pld_offset is the absolute distance
+
+      prefetch(from, to, 0);
+
+      if (prefetch_before) {
+        // If prefetch is done ahead, final PLDs that overflow the
+        // copied area can be easily avoided. 'count' is predecreased
+        // by the prefetch distance to optimize the inner loop and the
+        // outer loop skips the PLD.
+        __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count);
+
+        // skip prefetch for small copies
+        __ b(L_skip_pld, lt);
+      }
+
+      int offset = ArmCopyCacheLineSize;
+      while (offset <= pld_offset) { // one prefetch per cache line up to the prefetch distance
+        prefetch(from, to, offset);
+        offset += ArmCopyCacheLineSize;
+      };
+    }
+
+#ifdef AARCH64
+    const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10};
+#endif // AARCH64
+    {
+      // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes
+
+      // 32-bit ARM note: we have tried implementing loop unrolling to skip one
+      // PLD with 64 bytes cache line but the gain was not significant.
+
+      Label L_copy_loop;
+      __ align(OptoLoopAlignment);
+      __ BIND(L_copy_loop);
+
+      if (prefetch_before) {
+        prefetch(from, to, bytes_per_loop + pld_offset);
+        __ BIND(L_skip_pld); // entry point that bypasses the prefetch above
+      }
+
+#ifdef AARCH64
+      bulk_load_forward(from, data_regs, 8);
+#else
+      if (split_read) {
+        // Split the register set in two sets so that there is less
+        // latency between LDM and STM (R3-R6 available while R7-R10
+        // still loading) and less register locking issue when iterating
+        // on the first LDM.
+        __ ldmia(from, RegisterSet(R3, R6), writeback);
+        __ ldmia(from, RegisterSet(R7, R10), writeback);
+      } else {
+        __ ldmia(from, RegisterSet(R3, R10), writeback);
+      }
+#endif // AARCH64
+
+      __ subs_32(count, count, count_per_loop); // flags set here are consumed by the loop branch after the stores
+
+      if (prefetch_after) {
+        prefetch(from, to, pld_offset, bytes_per_loop);
+      }
+
+#ifdef AARCH64
+      bulk_store_forward(to, data_regs, 8);
+#else
+      if (split_write) {
+        __ stmia(to, RegisterSet(R3, R6), writeback);
+        __ stmia(to, RegisterSet(R7, R10), writeback);
+      } else {
+        __ stmia(to, RegisterSet(R3, R10), writeback);
+      }
+#endif // AARCH64
+
+      __ b(L_copy_loop, ge);
+
+      if (prefetch_before) {
+        // the inner loop may end earlier, allowing to skip PLD for the last iterations
+        __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
+        __ b(L_skip_pld, ge);
+      }
+    }
+    BLOCK_COMMENT("Remaining bytes:");
+    // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
+
+    // __ add(count, count, ...); // addition useless for the bit tests
+    assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
+
+#ifdef AARCH64
+    assert (bytes_per_loop == 64, "adjust the code below");
+    assert (bytes_per_count <= 8, "adjust the code below");
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(32/bytes_per_count), L); // skip unless the bit worth 32 bytes is set in count
+
+      bulk_load_forward(from, data_regs, 4);
+      bulk_store_forward(to, data_regs, 4);
+
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(16/bytes_per_count), L); // 16-byte tail
+
+      bulk_load_forward(from, data_regs, 2);
+      bulk_store_forward(to, data_regs, 2);
+
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(8/bytes_per_count), L); // 8-byte tail
+
+      __ ldr(R3, Address(from, 8, post_indexed));
+      __ str(R3, Address(to,   8, post_indexed));
+
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 4) {
+      Label L;
+      __ tbz(count, exact_log2(4/bytes_per_count), L); // 4-byte tail
+
+      __ ldr_w(R3, Address(from, 4, post_indexed));
+      __ str_w(R3, Address(to,   4, post_indexed));
+
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 2) {
+      Label L;
+      __ tbz(count, exact_log2(2/bytes_per_count), L); // 2-byte tail
+
+      __ ldrh(R3, Address(from, 2, post_indexed));
+      __ strh(R3, Address(to,   2, post_indexed));
+
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 1) {
+      Label L;
+      __ tbz(count, 0, L); // 1-byte tail
+
+      __ ldrb(R3, Address(from, 1, post_indexed));
+      __ strb(R3, Address(to,   1, post_indexed));
+
+      __ bind(L);
+    }
+#else
+    __ tst(count, 16 / bytes_per_count); // each tail copy below is predicated on the matching bit of count
+    __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
+    __ stmia(to, RegisterSet(R3, R6), writeback, ne);
+
+    __ tst(count, 8 / bytes_per_count);
+    __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
+    __ stmia(to, RegisterSet(R3, R4), writeback, ne);
+
+    if (bytes_per_count <= 4) {
+      __ tst(count, 4 / bytes_per_count);
+      __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes
+      __ str(R3, Address(to, 4, post_indexed), ne);
+    }
+
+    if (bytes_per_count <= 2) {
+      __ tst(count, 2 / bytes_per_count);
+      __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes
+      __ strh(R3, Address(to, 2, post_indexed), ne);
+    }
+
+    if (bytes_per_count == 1) {
+      __ tst(count, 1);
+      __ ldrb(R3, Address(from, 1, post_indexed), ne); // copy 1 byte
+      __ strb(R3, Address(to, 1, post_indexed), ne);
+    }
+
+    __ pop(RegisterSet(R4,R10));
+#endif // AARCH64
+
+    return count_per_loop;
+  }
+
+
+  // Generate the inner loop for backward aligned array copy
+  //
+  // Arguments
+  //      end_from:      src end address, 64 bits  aligned
+  //      end_to:        dst end address, wordSize aligned
+  //      count:         number of elements (32-bit int)
+  //      bytes_per_count: number of bytes for each unit of 'count'
+  //
+  // Return the minimum initial value for count
+  //
+  // Notes:
+  // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
+  // - 'end_to' aligned on wordSize
+  // - 'count' must be greater than or equal to the returned value
+  //
+  // Decreases 'end_from' and 'end_to' by count*bytes_per_count.
+  //
+  // Scratches 'count', R3.
+  // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored).
+  //
+  int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) {
+    assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
+
+    const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
+    const int count_per_loop = bytes_per_loop / bytes_per_count;
+
+    arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned;
+    int pld_offset = config->pld_distance;
+
+#ifndef AARCH64
+    bool split_read= config->split_ldm;
+    bool split_write= config->split_stm;
+
+    // See the forward copy variant for additional comments.
+
+    __ push(RegisterSet(R4,R10));
+#endif // !AARCH64
+
+    __ sub_32(count, count, count_per_loop);
+
+    const bool prefetch_before = pld_offset < 0;
+    const bool prefetch_after = pld_offset > 0;
+
+    Label L_skip_pld;
+
+    if (pld_offset != 0) {
+      pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
+
+      prefetch(end_from, end_to, -wordSize);
+
+      if (prefetch_before) {
+        __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count);
+        __ b(L_skip_pld, lt);
+      }
+
+      int offset = ArmCopyCacheLineSize;
+      while (offset <= pld_offset) {
+        prefetch(end_from, end_to, -(wordSize + offset));
+        offset += ArmCopyCacheLineSize;
+      };
+    }
+
+#ifdef AARCH64
+    const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10};
+#endif // AARCH64
+    {
+      // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes
+
+      // 32-bit ARM note: we have tried implementing loop unrolling to skip one
+      // PLD with 64 bytes cache line but the gain was not significant.
+
+      Label L_copy_loop;
+      __ align(OptoLoopAlignment);
+      __ BIND(L_copy_loop);
+
+      if (prefetch_before) {
+        prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
+        __ BIND(L_skip_pld);
+      }
+
+#ifdef AARCH64
+      bulk_load_backward(end_from, data_regs, 8);
+#else
+      if (split_read) {
+        __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
+        __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
+      } else {
+        __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
+      }
+#endif // AARCH64
+
+      __ subs_32(count, count, count_per_loop);
+
+      if (prefetch_after) {
+        prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
+      }
+
+#ifdef AARCH64
+      bulk_store_backward(end_to, data_regs, 8);
+#else
+      if (split_write) {
+        __ stmdb(end_to, RegisterSet(R7, R10), writeback);
+        __ stmdb(end_to, RegisterSet(R3, R6), writeback);
+      } else {
+        __ stmdb(end_to, RegisterSet(R3, R10), writeback);
+      }
+#endif // AARCH64
+
+      __ b(L_copy_loop, ge);
+
+      if (prefetch_before) {
+        __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
+        __ b(L_skip_pld, ge);
+      }
+    }
+    BLOCK_COMMENT("Remaining bytes:");
+    // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
+
+    // __ add(count, count, ...); // addition useless for the bit tests
+    assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
+
+#ifdef AARCH64
+    assert (bytes_per_loop == 64, "adjust the code below");
+    assert (bytes_per_count <= 8, "adjust the code below");
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(32/bytes_per_count), L);
+
+      bulk_load_backward(end_from, data_regs, 4);
+      bulk_store_backward(end_to, data_regs, 4);
+
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(16/bytes_per_count), L);
+
+      bulk_load_backward(end_from, data_regs, 2);
+      bulk_store_backward(end_to, data_regs, 2);
+
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(8/bytes_per_count), L);
+
+      __ ldr(R3, Address(end_from, -8, pre_indexed));
+      __ str(R3, Address(end_to,   -8, pre_indexed));
+
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 4) {
+      Label L;
+      __ tbz(count, exact_log2(4/bytes_per_count), L);
+
+      __ ldr_w(R3, Address(end_from, -4, pre_indexed));
+      __ str_w(R3, Address(end_to,   -4, pre_indexed));
+
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 2) {
+      Label L;
+      __ tbz(count, exact_log2(2/bytes_per_count), L);
+
+      __ ldrh(R3, Address(end_from, -2, pre_indexed));
+      __ strh(R3, Address(end_to,   -2, pre_indexed));
+
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 1) {
+      Label L;
+      __ tbz(count, 0, L);
+
+      __ ldrb(R3, Address(end_from, -1, pre_indexed));
+      __ strb(R3, Address(end_to,   -1, pre_indexed));
+
+      __ bind(L);
+    }
+#else
+    __ tst(count, 16 / bytes_per_count);
+    __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
+    __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne);
+
+    __ tst(count, 8 / bytes_per_count);
+    __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
+    __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne);
+
+    if (bytes_per_count <= 4) {
+      __ tst(count, 4 / bytes_per_count);
+      __ ldr(R3, Address(end_from, -4, pre_indexed), ne); // copy 4 bytes
+      __ str(R3, Address(end_to, -4, pre_indexed), ne);
+    }
+
+    if (bytes_per_count <= 2) {
+      __ tst(count, 2 / bytes_per_count);
+      __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes
+      __ strh(R3, Address(end_to, -2, pre_indexed), ne);
+    }
+
+    if (bytes_per_count == 1) {
+      __ tst(count, 1);
+      __ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
+      __ strb(R3, Address(end_to, -1, pre_indexed), ne);
+    }
+
+    __ pop(RegisterSet(R4,R10));
+#endif // AARCH64
+
+    return count_per_loop;
+  }
+
+
+  // Generate the inner loop for shifted forward array copy (unaligned copy).
+  // It can be used when bytes_per_count < wordSize, i.e.
+  //  byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
+  //
+  // Arguments
+  //      from:      start src address, 64 bits aligned
+  //      to:        start dst address, (now) wordSize aligned
+  //      count:     number of elements (32-bit int)
+  //      bytes_per_count: number of bytes for each unit of 'count'
+  //      lsr_shift: shift applied to 'old' value to skip already written bytes
+  //      lsl_shift: shift applied to 'new' value to set the high bytes of the next write
+  //
+  // Returns the minimum initial value for count (always 0: this loop has no minimum)
+  //
+  // Notes:
+  // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
+  // - 'to' aligned on wordSize
+  // - 'count' must be greater than or equal to the returned value
+  // - 'lsr_shift' + 'lsl_shift' = BitsPerWord
+  // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
+  //
+  // Increases 'to' by count*bytes_per_count.
+  //
+  // Scratches 'from' and 'count', R3-R10, R12
+  //
+  // On entry:
+  // - R12 is preloaded with the first 'BitsPerWord' bits read just before 'from'
+  // - (R12 >> lsr_shift) is the part not yet written (just before 'to')
+  // --> (*to) = (R12 >> lsr_shift) | ((*from) << lsl_shift); ...
+  //
+  // This implementation may read more bytes than required.
+  // Actually, it always reads exactly all data from the copied region with upper bound aligned up by wordSize,
+  // so the excess read does not cross a word boundary and is thus harmless.
+  //
+  int generate_forward_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) {
+    assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
+
+    const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter
+    const int count_per_loop = bytes_per_loop / bytes_per_count;
+
+    arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_shifted;
+    int pld_offset = config->pld_distance;
+
+#ifndef AARCH64
+    bool split_read= config->split_ldm;
+    bool split_write= config->split_stm;
+#endif // !AARCH64
+
+    const bool prefetch_before = pld_offset < 0;
+    const bool prefetch_after = pld_offset > 0;
+    Label L_skip_pld, L_last_read, L_done;
+    if (pld_offset != 0) {
+
+      pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; // keep only the magnitude; the sign was captured in prefetch_before/prefetch_after
+
+      prefetch(from, to, 0);
+
+      if (prefetch_before) {
+        __ cmp_32(count, count_per_loop);
+        __ b(L_last_read, lt); // less than one full iteration: go straight to the tail code
+        // skip prefetch for small copies
+        // warning: count is predecreased by the prefetch distance to optimize the inner loop
+        __ subs_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
+        __ b(L_skip_pld, lt);
+      }
+
+      int offset = ArmCopyCacheLineSize;
+      while (offset <= pld_offset) {
+        prefetch(from, to, offset);
+        offset += ArmCopyCacheLineSize;
+      };
+    }
+
+    Label L_shifted_loop;
+
+    __ align(OptoLoopAlignment);
+    __ BIND(L_shifted_loop);
+
+    if (prefetch_before) {
+      // do it early if there might be register locking issues
+      prefetch(from, to, bytes_per_loop + pld_offset);
+      __ BIND(L_skip_pld);
+    } else {
+      __ cmp_32(count, count_per_loop);
+      __ b(L_last_read, lt); // less than one full iteration left: go to the tail code
+    }
+
+#ifdef AARCH64
+    const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
+    __ logical_shift_right(R3, R12, lsr_shift); // part of R12 not yet written
+    __ subs_32(count, count, count_per_loop);
+    bulk_load_forward(from, &data_regs[1], 8);
+#else
+    // read 32 bytes
+    if (split_read) {
+      // if write is not split, use less registers in first set to reduce locking
+      RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5);
+      RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12;
+      __ ldmia(from, set1, writeback);
+      __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written
+      __ ldmia(from, set2, writeback);
+      __ subs(count, count, count_per_loop); // XXX: should it be before the 2nd LDM ? (latency vs locking)
+    } else {
+      __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written
+      __ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4
+      __ subs(count, count, count_per_loop); // sets the flags used by the conditional stores and the loop branch below
+    }
+#endif // AARCH64
+
+    if (prefetch_after) {
+      // do it after the 1st ldm/ldp anyway  (no locking issues with early STM/STP)
+      prefetch(from, to, pld_offset, bytes_per_loop);
+    }
+
+    // prepare (shift) the values in R3..R10
+    __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); // merged below low bytes of next val
+    __ logical_shift_right(R4, R4, lsr_shift); // unused part of next val
+    __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ...
+    __ logical_shift_right(R5, R5, lsr_shift);
+    __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift));
+    __ logical_shift_right(R6, R6, lsr_shift);
+    __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift));
+#ifndef AARCH64
+    if (split_write) {
+      // write the first half as soon as possible to reduce stm locking
+      __ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge);
+    }
+#endif // !AARCH64
+    __ logical_shift_right(R7, R7, lsr_shift);
+    __ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift));
+    __ logical_shift_right(R8, R8, lsr_shift);
+    __ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift));
+    __ logical_shift_right(R9, R9, lsr_shift);
+    __ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift));
+    __ logical_shift_right(R10, R10, lsr_shift);
+    __ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift)); // R12 itself is left unshifted: it is the 'old' value for the next round
+
+#ifdef AARCH64
+    bulk_store_forward(to, data_regs, 8);
+#else
+    if (split_write) {
+      __ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge);
+    } else {
+      __ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? gt : ge);
+    }
+#endif // AARCH64
+    __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop)
+
+    if (prefetch_before) {
+      // the first loop may end earlier, allowing to skip pld at the end
+      __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
+#ifndef AARCH64
+      __ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped
+#endif // !AARCH64
+      __ b(L_skip_pld, ge);
+      __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); // undo the pre-decrement done before the loop
+    }
+
+    __ BIND(L_last_read);
+    __ b(L_done, eq); // eq: count dropped to exactly 0, no tail to copy
+
+#ifdef AARCH64
+    assert(bytes_per_count < 8, "adjust the code below");
+
+    __ logical_shift_right(R3, R12, lsr_shift);
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(32/bytes_per_count), L);
+      bulk_load_forward(from, &data_regs[1], 4);
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
+      __ logical_shift_right(R4, R4, lsr_shift);
+      __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift));
+      __ logical_shift_right(R5, R5, lsr_shift);
+      __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift));
+      __ logical_shift_right(R6, R6, lsr_shift);
+      __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift));
+      bulk_store_forward(to, data_regs, 4);
+      __ logical_shift_right(R3, R7, lsr_shift);
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(16/bytes_per_count), L);
+      bulk_load_forward(from, &data_regs[1], 2);
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
+      __ logical_shift_right(R4, R4, lsr_shift);
+      __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift));
+      bulk_store_forward(to, data_regs, 2);
+      __ logical_shift_right(R3, R5, lsr_shift);
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(8/bytes_per_count), L);
+      __ ldr(R4, Address(from, 8, post_indexed));
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
+      __ str(R3, Address(to, 8, post_indexed));
+      __ logical_shift_right(R3, R4, lsr_shift);
+      __ bind(L);
+    }
+
+    const int have_bytes = lsl_shift/BitsPerByte; // number of already read bytes in R3
+
+    // It remains less than wordSize to write.
+    // Do not check count if R3 already has maximal number of loaded elements (one less than wordSize).
+    if (have_bytes < wordSize - bytes_per_count) {
+      Label L;
+      __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact
+      __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store?
+      __ b(L, le);
+      __ ldr(R4, Address(from, 8, post_indexed));
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(4/bytes_per_count), L);
+      __ str_w(R3, Address(to, 4, post_indexed));
+      if (bytes_per_count < 4) {
+        __ logical_shift_right(R3, R3, 4*BitsPerByte);
+      }
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 2) {
+      Label L;
+      __ tbz(count, exact_log2(2/bytes_per_count), L);
+      __ strh(R3, Address(to, 2, post_indexed));
+      if (bytes_per_count < 2) {
+        __ logical_shift_right(R3, R3, 2*BitsPerByte);
+      }
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 1) {
+      Label L;
+      __ tbz(count, exact_log2(1/bytes_per_count), L);
+      __ strb(R3, Address(to, 1, post_indexed));
+      __ bind(L);
+    }
+#else
+    switch (bytes_per_count) {
+    case 2:
+      __ mov(R3, AsmOperand(R12, lsr, lsr_shift));
+      __ tst(count, 8);
+      __ ldmia(from, RegisterSet(R4, R7), writeback, ne);
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
+      __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
+      __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
+      __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne);
+      __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne);
+      __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne);
+      __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne);
+      __ stmia(to, RegisterSet(R3, R6), writeback, ne);
+      __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne);
+
+      __ tst(count, 4);
+      __ ldmia(from, RegisterSet(R4, R5), writeback, ne);
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
+      __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
+      __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
+      __ stmia(to, RegisterSet(R3, R4), writeback, ne);
+      __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne);
+
+      __ tst(count, 2);
+      __ ldr(R4, Address(from, 4, post_indexed), ne);
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne);
+      __ str(R3, Address(to, 4, post_indexed), ne);
+      __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne);
+
+      __ tst(count, 1);
+      __ strh(R3, Address(to, 2, post_indexed), ne); // one last short
+      break;
+
+    case 1:
+      __ mov(R3, AsmOperand(R12, lsr, lsr_shift));
+      __ tst(count, 16);
+      __ ldmia(from, RegisterSet(R4, R7), writeback, ne);
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
+      __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
+      __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
+      __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne);
+      __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne);
+      __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne);
+      __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne);
+      __ stmia(to, RegisterSet(R3, R6), writeback, ne);
+      __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne);
+
+      __ tst(count, 8);
+      __ ldmia(from, RegisterSet(R4, R5), writeback, ne);
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
+      __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
+      __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
+      __ stmia(to, RegisterSet(R3, R4), writeback, ne);
+      __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne);
+
+      __ tst(count, 4);
+      __ ldr(R4, Address(from, 4, post_indexed), ne);
+      __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne);
+      __ str(R3, Address(to, 4, post_indexed), ne);
+      __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne);
+
+      __ andr(count, count, 3); // only 0..3 bytes remain
+      __ cmp(count, 2); // ge: at least 2 bytes left, gt: exactly 3 bytes left
+
+      // Note: R3 might contain enough bytes ready to write (3 needed at most),
+      // thus load on lsl_shift==24 is not needed (in fact forces reading
+      // beyond source buffer end boundary)
+      if (lsl_shift == 8) {
+        __ ldr(R4, Address(from, 4, post_indexed), ge);
+        __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ge);
+      } else if (lsl_shift == 16) {
+        __ ldr(R4, Address(from, 4, post_indexed), gt);
+        __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), gt);
+      }
+
+      __ strh(R3, Address(to, 2, post_indexed), ge); // two last bytes
+      __ mov(R3, AsmOperand(R3, lsr, 16), gt);
+
+      __ tst(count, 1);
+      __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte
+      break;
+    }
+#endif // AARCH64
+
+    __ BIND(L_done);
+    return 0; // no minimum
+  }
+
+  // Generate the inner loop for shifted backward array copy (unaligned copy).
+  // It can be used when bytes_per_count < wordSize, i.e.
+  //  byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
+  //
+  // Arguments
+  //      end_from:  end src address, 64 bits aligned
+  //      end_to:    end dst address, (now) wordSize aligned
+  //      count:     number of elements (32-bit int)
+  //      bytes_per_count: number of bytes for each unit of 'count'
+  //      lsl_shift: shift applied to 'old' value to skip already written bytes
+  //      lsr_shift: shift applied to 'new' value to set the low bytes of the next write
+  //
+  // Returns the minimum initial value for count (always 0: this loop has no minimum)
+  //
+  // Notes:
+  // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
+  // - 'end_to' aligned on wordSize
+  // - 'count' must be greater than or equal to the returned value
+  // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord'
+  // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
+  //
+  // Decreases 'end_to' by count*bytes_per_count.
+  //
+  // Scratches 'end_from', 'count', R3-R10, R12
+  //
+  // On entry:
+  // - R3 is preloaded with the first 'BitsPerWord' bits read just after 'from'
+  // - (R3 << lsl_shift) is the part not yet written
+  // --> (*--to) = (R3 << lsl_shift) | ((*--from) >> lsr_shift); ...
+  //
+  // This implementation may read more bytes than required.
+  // Actually, it always reads exactly all data from the copied region with beginning aligned down by wordSize,
+  // so the excess read does not cross a word boundary and is thus harmless.
+  //
+  int generate_backward_shifted_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) {
+    assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
+
+    const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter
+    const int count_per_loop = bytes_per_loop / bytes_per_count;
+
+    arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_shifted;
+    int pld_offset = config->pld_distance;
+
+#ifndef AARCH64
+    bool split_read= config->split_ldm;
+    bool split_write= config->split_stm;
+#endif // !AARCH64
+
+
+    const bool prefetch_before = pld_offset < 0;
+    const bool prefetch_after = pld_offset > 0;
+
+    Label L_skip_pld, L_done, L_last_read;
+    if (pld_offset != 0) {
+
+      pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; // keep only the magnitude; the sign was captured in prefetch_before/prefetch_after
+
+      prefetch(end_from, end_to, -wordSize);
+
+      if (prefetch_before) {
+        __ cmp_32(count, count_per_loop);
+        __ b(L_last_read, lt); // less than one full iteration: go straight to the tail code
+
+        // skip prefetch for small copies
+        // warning: count is predecreased by the prefetch distance to optimize the inner loop
+        __ subs_32(count, count, ((bytes_per_loop + pld_offset)/bytes_per_count) + count_per_loop);
+        __ b(L_skip_pld, lt);
+      }
+
+      int offset = ArmCopyCacheLineSize;
+      while (offset <= pld_offset) {
+        prefetch(end_from, end_to, -(wordSize + offset));
+        offset += ArmCopyCacheLineSize;
+      };
+    }
+
+    Label L_shifted_loop;
+    __ align(OptoLoopAlignment);
+    __ BIND(L_shifted_loop);
+
+    if (prefetch_before) {
+      // prefetch is emitted ahead of the loads on this path (NOTE(review): the original comment said the 1st ldm/ldp is done first - confirm intent)
+      prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
+      __ BIND(L_skip_pld);
+    } else {
+      __ cmp_32(count, count_per_loop);
+      __ b(L_last_read, lt); // less than one full iteration left: go to the tail code
+    }
+
+#ifdef AARCH64
+    __ logical_shift_left(R12, R3, lsl_shift);
+    const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
+    bulk_load_backward(end_from, data_regs, 8);
+#else
+    if (split_read) {
+      __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
+      __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
+      __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
+    } else {
+      __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
+      __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
+    }
+#endif // AARCH64
+
+    __ subs_32(count, count, count_per_loop); // sets the flags used by the conditional stores and the loop branch below
+
+    if (prefetch_after) { // do prefetch during ldm/ldp latency
+      prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
+    }
+
+    // prepare the values in R4..R10,R12
+    __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); // merged above high  bytes of prev val
+    __ logical_shift_left(R10, R10, lsl_shift); // unused part of prev val
+    __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); // ...
+    __ logical_shift_left(R9, R9, lsl_shift);
+    __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift));
+    __ logical_shift_left(R8, R8, lsl_shift);
+    __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift));
+    __ logical_shift_left(R7, R7, lsl_shift);
+    __ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift));
+    __ logical_shift_left(R6, R6, lsl_shift);
+    __ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift));
+#ifndef AARCH64
+    if (split_write) {
+      // store early to reduce locking issues
+      __ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? gt : ge);
+    }
+#endif // !AARCH64
+    __ logical_shift_left(R5, R5, lsl_shift);
+    __ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift));
+    __ logical_shift_left(R4, R4, lsl_shift);
+    __ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift)); // R3 itself is left unshifted: it is the 'old' value for the next round
+
+#ifdef AARCH64
+    bulk_store_backward(end_to, &data_regs[1], 8);
+#else
+    if (split_write) {
+      __ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge);
+    } else {
+      __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? gt : ge);
+    }
+#endif // AARCH64
+
+    __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop)
+
+    if (prefetch_before) {
+      // the first loop may end earlier, allowing to skip pld at the end
+      __ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count));
+#ifndef AARCH64
+      __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped
+#endif // !AARCH64
+      __ b(L_skip_pld, ge);
+      __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); // undo the pre-decrement done before the loop
+    }
+
+    __ BIND(L_last_read);
+    __ b(L_done, eq); // eq: count dropped to exactly 0, no tail to copy
+
+#ifdef AARCH64
+    assert(bytes_per_count < 8, "adjust the code below");
+
+    __ logical_shift_left(R12, R3, lsl_shift);
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(32/bytes_per_count), L);
+      bulk_load_backward(end_from, &data_regs[4], 4);
+
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
+      __ logical_shift_left(R10, R10, lsl_shift);
+      __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift));
+      __ logical_shift_left(R9, R9, lsl_shift);
+      __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift));
+      __ logical_shift_left(R8, R8, lsl_shift);
+      __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift));
+
+      bulk_store_backward(end_to, &data_regs[5], 4);
+      __ logical_shift_left(R12, R7, lsl_shift);
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(16/bytes_per_count), L);
+      bulk_load_backward(end_from, &data_regs[6], 2);
+
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
+      __ logical_shift_left(R10, R10, lsl_shift);
+      __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift));
+
+      bulk_store_backward(end_to, &data_regs[7], 2);
+      __ logical_shift_left(R12, R9, lsl_shift);
+      __ bind(L);
+    }
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(8/bytes_per_count), L);
+      __ ldr(R10, Address(end_from, -8, pre_indexed));
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
+      __ str(R12, Address(end_to, -8, pre_indexed));
+      __ logical_shift_left(R12, R10, lsl_shift);
+      __ bind(L);
+    }
+
+    const int have_bytes = lsr_shift/BitsPerByte; // number of already read bytes in R12
+
+    // It remains less than wordSize to write.
+    // Do not check count if R12 already has maximal number of loaded elements (one less than wordSize).
+    if (have_bytes < wordSize - bytes_per_count) {
+      Label L;
+      __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact
+      __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store?
+      __ b(L, le);
+      __ ldr(R10, Address(end_from, -8, pre_indexed));
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
+      __ bind(L);
+    }
+
+    assert (bytes_per_count <= 4, "must be");
+
+    {
+      Label L;
+      __ tbz(count, exact_log2(4/bytes_per_count), L);
+      __ logical_shift_right(R9, R12, (wordSize-4)*BitsPerByte);
+      __ str_w(R9, Address(end_to, -4, pre_indexed)); // Write 4 MSB
+      if (bytes_per_count < 4) {
+        __ logical_shift_left(R12, R12, 4*BitsPerByte); // Promote remaining bytes to MSB
+      }
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 2) {
+      Label L;
+      __ tbz(count, exact_log2(2/bytes_per_count), L);
+      __ logical_shift_right(R9, R12, (wordSize-2)*BitsPerByte);
+      __ strh(R9, Address(end_to, -2, pre_indexed)); // Write 2 MSB
+      if (bytes_per_count < 2) {
+        __ logical_shift_left(R12, R12, 2*BitsPerByte); // Promote remaining bytes to MSB
+      }
+      __ bind(L);
+    }
+
+    if (bytes_per_count <= 1) {
+      Label L;
+      __ tbz(count, exact_log2(1/bytes_per_count), L);
+      __ logical_shift_right(R9, R12, (wordSize-1)*BitsPerByte);
+      __ strb(R9, Address(end_to, -1, pre_indexed)); // Write 1 MSB
+      __ bind(L);
+    }
+#else
+      switch(bytes_per_count) {
+      case 2:
+      __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
+      __ tst(count, 8);
+      __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne);
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
+      __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
+      __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
+      __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne);
+      __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne);
+      __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne);
+      __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne);
+      __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne);
+      __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne);
+
+      __ tst(count, 4);
+      __ ldmdb(end_from, RegisterSet(R9, R10), writeback, ne);
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
+      __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
+      __ orr(R10, R10, AsmOperand(R9, lsr,lsr_shift),ne); // ...
+      __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne);
+      __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne);
+
+      __ tst(count, 2);
+      __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
+      __ str(R12, Address(end_to, -4, pre_indexed), ne);
+      __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne);
+
+      __ tst(count, 1);
+      __ mov(R12, AsmOperand(R12, lsr, lsr_shift),ne);
+      __ strh(R12, Address(end_to, -2, pre_indexed), ne); // one last short
+      break;
+
+      case 1:
+      __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
+      __ tst(count, 16);
+      __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne);
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
+      __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
+      __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
+      __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne);
+      __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne);
+      __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne);
+      __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne);
+      __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne);
+      __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne);
+
+      __ tst(count, 8);
+      __ ldmdb(end_from, RegisterSet(R9,R10), writeback, ne);
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
+      __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
+      __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
+      __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne);
+      __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne);
+
+      __ tst(count, 4);
+      __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
+      __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
+      __ str(R12, Address(end_to, -4, pre_indexed), ne);
+      __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne);
+
+      __ tst(count, 2);
+      if (lsr_shift != 24) {
+        // avoid useless reading R10 when we already have 3 bytes ready in R12
+        __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
+        __ orr(R12, R12, AsmOperand(R10, lsr,lsr_shift), ne);
+      }
+
+      // Note: R12 contains enough bytes ready to write (3 needed at most)
+      // write the 2 MSBs
+      __ mov(R9, AsmOperand(R12, lsr, 16), ne);
+      __ strh(R9, Address(end_to, -2, pre_indexed), ne);
+      // promote remaining to MSB
+      __ mov(R12, AsmOperand(R12, lsl, 16), ne);
+
+      __ tst(count, 1);
+      // write the MSB of R12
+      __ mov(R12, AsmOperand(R12, lsr, 24), ne);
+      __ strb(R12, Address(end_to, -1, pre_indexed), ne);
+
+      break;
+      }
+#endif // AARCH64
+
+    __ BIND(L_done);
+    return 0; // no minimum
+  }
+
+  // This method is very useful for merging forward/backward implementations
+  Address get_addr_with_indexing(Register base, int delta, bool forward) {
+    // forward => post-indexed by +delta; backward => pre-indexed by -delta
+    if (!forward) {
+      return Address(base, -delta, pre_indexed);
+    }
+    return Address(base, delta, post_indexed);
+  }
+
+#ifdef AARCH64
+  // Loads one 'size_in_bytes'-sized value from 'from' in given direction, i.e.
+  //   if forward:  loads value at from and increases from by size
+  //   if !forward: loads value at from-size_in_bytes and decreases from by size
+  void load_one(Register rd, Register from, int size_in_bytes, bool forward) {
+    assert_different_registers(from, rd);
+    __ load_sized_value(rd, get_addr_with_indexing(from, size_in_bytes, forward),
+                        size_in_bytes, false);
+  }
+
+  // Stores one 'size_in_bytes'-sized value to 'to' in given direction (see load_one)
+  void store_one(Register rd, Register to, int size_in_bytes, bool forward) {
+    assert_different_registers(to, rd);
+    __ store_sized_value(rd, get_addr_with_indexing(to, size_in_bytes, forward),
+                         size_in_bytes);
+  }
+#else
+  // load_one and store_one are the same as for AArch64 except for
+  //   *) Support for condition execution
+  //   *) Second value register argument for 8-byte values
+
+  void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) {
+    assert_different_registers(from, rd, rd2);
+    if (size_in_bytes >= 8) {
+      // 8-byte value: loaded into the rd/rd2 pair with a single LDM
+      assert (rd2 != noreg, "second value register must be specified");
+      assert (rd->encoding() < rd2->encoding(), "wrong value register set");
+      if (forward) {
+        __ ldmia(from, RegisterSet(rd) | rd2, writeback, cond);
+      } else {
+        __ ldmdb(from, RegisterSet(rd) | rd2, writeback, cond);
+      }
+      return;
+    }
+    // smaller values go through a single indexed load
+    __ load_sized_value(rd, get_addr_with_indexing(from, size_in_bytes, forward), size_in_bytes, false, cond);
+  }
+
+  void store_one(Register rd, Register to, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) {
+    assert_different_registers(to, rd, rd2);
+    if (size_in_bytes >= 8) {
+      // 8-byte value: stored from the rd/rd2 pair with a single STM
+      assert (rd2 != noreg, "second value register must be specified");
+      assert (rd->encoding() < rd2->encoding(), "wrong value register set");
+      if (forward) {
+        __ stmia(to, RegisterSet(rd) | rd2, writeback, cond);
+      } else {
+        __ stmdb(to, RegisterSet(rd) | rd2, writeback, cond);
+      }
+      return;
+    }
+    // smaller values go through a single indexed store
+    __ store_sized_value(rd, get_addr_with_indexing(to, size_in_bytes, forward), size_in_bytes, cond);
+  }
+#endif // AARCH64
+
+  // Copies data from 'from' to 'to' in specified direction to align 'from' by 64 bits.
+  // (on 32-bit ARM 64-bit alignment is better for LDM).
+  //
+  // Arguments:
+  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
+  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
+  //     count:             32-bit int, maximum number of elements which can be copied
+  //     bytes_per_count:   size of an element
+  //     forward:           specifies copy direction
+  //
+  // Notes:
+  //   'from' and 'to' must be aligned by 'bytes_per_count'
+  //   'count' must not be less than the returned value
+  //   shifts 'from' and 'to' by the number of copied bytes in corresponding direction
+  //   decreases 'count' by the number of elements copied
+  //
+  // Returns the maximum number of elements (not bytes) which may be copied, i.e. 7/bytes_per_count.
+  int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) {
+    assert_different_registers(from, to, count, tmp);
+#ifdef AARCH64
+    // TODO-AARCH64: replace by simple loop?
+    Label Laligned_by_2, Laligned_by_4, Laligned_by_8;
+
+    if (bytes_per_count == 1) { // only byte arrays can start at an odd address
+      __ tbz(from, 0, Laligned_by_2); // bit 0 of 'from' clear => skip the 1-byte copy
+      __ sub_32(count, count, 1); // one 1-byte element is consumed
+      load_one(tmp, from, 1, forward);
+      store_one(tmp, to, 1, forward);
+    }
+
+    __ BIND(Laligned_by_2);
+
+    if (bytes_per_count <= 2) {
+      __ tbz(from, 1, Laligned_by_4); // bit 1 of 'from' clear => skip the 2-byte copy
+      __ sub_32(count, count, 2/bytes_per_count); // 2 bytes expressed in elements
+      load_one(tmp, from, 2, forward);
+      store_one(tmp, to, 2, forward);
+    }
+
+    __ BIND(Laligned_by_4);
+
+    if (bytes_per_count <= 4) {
+      __ tbz(from, 2, Laligned_by_8); // bit 2 of 'from' clear => skip the 4-byte copy
+      __ sub_32(count, count, 4/bytes_per_count); // 4 bytes expressed in elements
+      load_one(tmp, from, 4, forward);
+      store_one(tmp, to, 4, forward);
+    }
+    __ BIND(Laligned_by_8);
+#else // AARCH64
+    if (bytes_per_count < 8) { // nothing is emitted for 8-byte elements
+      Label L_align_src;
+      __ BIND(L_align_src);
+      __ tst(from, 7); // tests the low three bits of 'from'
+      // ne => not aligned: copy one element and (if bytes_per_count < 4) loop
+      __ sub(count, count, 1, ne);
+      load_one(tmp, from, bytes_per_count, forward, ne);
+      store_one(tmp, to, bytes_per_count, forward, ne);
+      if (bytes_per_count < 4) {
+        __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
+      }
+    }
+#endif // AARCH64
+    return 7/bytes_per_count; // an element count: at most 7 bytes are copied by the code above
+  }
+
+  // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction.
+  //
+  // Arguments:
+  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
+  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
+  //     count:             32-bit int, number of elements to be copied
+  //     entry:             copy loop entry point
+  //     bytes_per_count:   size of an element
+  //     forward:           specifies copy direction
+  //
+  // Notes:
+  //     shifts 'from' and 'to'
+  void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry) {
+    assert_different_registers(from, to, count, tmp); // NOTE(review): tmp2 is not part of this check
+
+    __ align(OptoLoopAlignment);
+#ifdef AARCH64
+    Label L_small_array_done, L_small_array_loop; // tmp2 is not used in this variant
+    __ BIND(entry);
+    __ cbz_32(count, L_small_array_done); // skip the loop entirely when count is zero
+
+    __ BIND(L_small_array_loop);
+    __ subs_32(count, count, 1);
+    load_one(tmp, from, bytes_per_count, forward);
+    store_one(tmp, to, bytes_per_count, forward);
+    __ b(L_small_array_loop, gt); // relies on the flags set by subs_32 above
+
+    __ BIND(L_small_array_done);
+#else
+    Label L_small_loop;
+    __ BIND(L_small_loop);
+    store_one(tmp, to, bytes_per_count, forward, al, tmp2); // stores what the previous pass loaded; sits before 'entry'
+    __ BIND(entry); // entry point: execution starts here, after the store above
+    __ subs(count, count, 1);
+    load_one(tmp, from, bytes_per_count, forward, ge, tmp2); // conditional on ge, like the branch below
+    __ b(L_small_loop, ge); // go back to store the element just loaded
+#endif // AARCH64
+  }
+
+  // Aligns 'to' by reading one word from 'from' and writing part of it to 'to'.
+  //
+  // Arguments:
+  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
+  //     count:             32-bit int, number of elements allowed to be copied
+  //     to_remainder:      remainder of dividing 'to' by wordSize
+  //     bytes_per_count:   size of an element
+  //     forward:           specifies copy direction
+  //     Rval:              contains an already read but not yet written word;
+  //                        its LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'.
+  //
+  // Notes:
+  //     'count' must not be less than the returned value
+  //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
+  //     shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
+  //     decreases 'count' by the number of elements written
+  //     Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
+  int align_dst(Register to, Register count, Register Rval, Register tmp,
+                                        int to_remainder, int bytes_per_count, bool forward) {
+    assert_different_registers(to, count, tmp, Rval);
+
+    assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is not valid");
+    assert (to_remainder % bytes_per_count == 0, "to must be aligned by bytes_per_count");
+
+    int bytes_to_write = forward ? (wordSize - to_remainder) : to_remainder; // distance from 'to' to the word boundary in copy direction
+
+    int offset = 0; // number of bits of Rval already written
+
+    for (int l = 0; l < LogBytesPerWord; ++l) { // one store per set bit of bytes_to_write
+      int s = (1 << l); // store size in bytes for this step
+      if (bytes_to_write & s) {
+        int new_offset = offset + s*BitsPerByte;
+        if (forward) {
+          if (offset == 0) {
+            store_one(Rval, to, s, forward); // lowest s bytes of Rval, no shift needed
+          } else {
+            __ logical_shift_right(tmp, Rval, offset); // drop the bits written so far
+            store_one(tmp, to, s, forward);
+          }
+        } else {
+          __ logical_shift_right(tmp, Rval, BitsPerWord - new_offset); // keep only the top new_offset bits of Rval
+          store_one(tmp, to, s, forward);
+        }
+
+        offset = new_offset;
+      }
+    }
+
+    assert (offset == bytes_to_write * BitsPerByte, "all bytes must be copied");
+
+    __ sub_32(count, count, bytes_to_write/bytes_per_count); // same element count as the value returned below
+
+    return bytes_to_write / bytes_per_count; // number of elements written
+  }
+
+  // Copies 'count' of elements using shifted copy loop
+  //
+  // Arguments:
+  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
+  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
+  //     count:             32-bit int, number of elements to be copied
+  //     to_remainder:      remainder of dividing 'to' by wordSize
+  //     bytes_per_count:   size of an element
+  //     forward:           specifies copy direction
+  //     Rval:              contains an already read but not yet written word
+  //
+  //
+  // Notes:
+  //     'count' must not be less than the returned value
+  //     'from' must be aligned by wordSize
+  //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
+  //     shifts 'to' by the number of copied bytes
+  //
+  // Scratches R3-R10, R12
+  int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
+                                                        int to_remainder, int bytes_per_count, bool forward) {
+
+    assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");
+
+    // scratch register handed to align_dst; the choice depends on the direction
+    const Register tmp  = forward ? R3 : R12; // TODO-AARCH64: on conjoint_short R4 was used for tmp
+    assert_different_registers(from, to, count, Rval, tmp);
+
+    // Step 1: write the part of Rval needed to bring 'to' to a word boundary.
+    const int elements_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);
+
+    // Step 2: emit the main loop; both shift amounts follow from the 'to' remainder.
+    const int shift_right_bits = (wordSize - to_remainder) * BitsPerByte;
+    const int shift_left_bits  = to_remainder * BitsPerByte;
+
+    const int loop_min_copy = forward
+        ? generate_forward_shifted_copy_loop (from, to, count, bytes_per_count, shift_right_bits, shift_left_bits)
+        : generate_backward_shifted_copy_loop(from, to, count, bytes_per_count, shift_right_bits, shift_left_bits);
+
+    return loop_min_copy + elements_to_align;
+  }
+
+  // Copies 'count' of elements using shifted copy loop
+  //
+  // Arguments:
+  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
+  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
+  //     count:             32-bit int, number of elements to be copied
+  //     bytes_per_count:   size of an element
+  //     forward:           specifies copy direction
+  //
+  // Notes:
+  //     'count' must not be less than the returned value
+  //     'from' must be aligned by wordSize
+  //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
+  //     shifts 'to' by the number of copied bytes
+  //
+  // Scratches 'from', 'count', R3 and R12.
+  // On AArch64 also scratches R4-R10, on 32-bit ARM saves them to use.
+  int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) {
+
+    const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
+
+    int min_copy = 0; // returned value: maximum over all variants emitted below
+
+    // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
+    // then the remainder of 'to' divided by wordSize is one of elements of {seq}.
+
+#ifdef AARCH64
+    // TODO-AARCH64: simplify, tune
+
+    load_one(Rval, from, wordSize, forward); // pre-read one source word into Rval
+
+    Label L_loop_finished; // common join point of all per-remainder variants
+
+    switch (bytes_per_count) {
+      case 4:
+        min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); // single variant: remainder 4
+        break;
+      case 2:
+      {
+        Label L2, L4, L6;
+
+        __ tbz(to, 1, L4); // bit 1 clear => remainder 4
+        __ tbz(to, 2, L2); // bit 1 set, bit 2 clear => remainder 2
+
+        __ BIND(L6);
+        int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward);
+        __ b(L_loop_finished);
+
+        __ BIND(L2);
+        int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
+        __ b(L_loop_finished);
+
+        __ BIND(L4);
+        int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); // last variant falls through to L_loop_finished
+
+        min_copy = MAX2(MAX2(min_copy2, min_copy4), min_copy6);
+        break;
+      }
+      case 1:
+      {
+        Label L1, L2, L3, L4, L5, L6, L7;
+        Label L15, L26;
+        Label L246;
+
+        __ tbz(to, 0, L246); // bit 0 clear => even remainder
+        __ tbz(to, 1, L15); // bit 0 set, bit 1 clear => remainder 1 or 5
+        __ tbz(to, 2, L3); // bits 0 and 1 set, bit 2 clear => remainder 3
+
+        __ BIND(L7);
+        int min_copy7 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 7, bytes_per_count, forward);
+        __ b(L_loop_finished);
+
+        __ BIND(L246);
+        __ tbnz(to, 1, L26); // bit 1 set => remainder 2 or 6, otherwise fall into L4
+
+        __ BIND(L4);
+        int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
+        __ b(L_loop_finished);
+
+        __ BIND(L15);
+        __ tbz(to, 2, L1); // bit 2 clear => remainder 1, otherwise fall into L5
+
+        __ BIND(L5);
+        int min_copy5 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 5, bytes_per_count, forward);
+        __ b(L_loop_finished);
+
+        __ BIND(L3);
+        int min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
+        __ b(L_loop_finished);
+
+        __ BIND(L26);
+        __ tbz(to, 2, L2); // bit 2 clear => remainder 2, otherwise fall into L6
+
+        __ BIND(L6);
+        int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward);
+        __ b(L_loop_finished);
+
+        __ BIND(L1);
+        int min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
+        __ b(L_loop_finished);
+
+        __ BIND(L2);
+        int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); // last variant falls through to L_loop_finished
+
+
+        min_copy = MAX2(min_copy1, min_copy2);
+        min_copy = MAX2(min_copy,  min_copy3);
+        min_copy = MAX2(min_copy,  min_copy4);
+        min_copy = MAX2(min_copy,  min_copy5);
+        min_copy = MAX2(min_copy,  min_copy6);
+        min_copy = MAX2(min_copy,  min_copy7);
+        break;
+      }
+      default:
+        ShouldNotReachHere();
+        break;
+    }
+    __ BIND(L_loop_finished);
+
+#else
+    __ push(RegisterSet(R4,R10)); // saved here and restored by the pop at the end of this branch
+    load_one(Rval, from, wordSize, forward); // pre-read one source word into Rval
+
+    switch (bytes_per_count) {
+      case 2:
+        min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); // single variant: remainder 2
+        break;
+      case 1:
+      {
+        Label L1, L2, L3;
+        int min_copy1, min_copy2, min_copy3; // each one is assigned in both branches below
+
+        Label L_loop_finished;
+
+        if (forward) {
+            __ tbz(to, 0, L2); // bit 0 clear => remainder 2
+            __ tbz(to, 1, L1); // bit 0 set, bit 1 clear => remainder 1
+
+            __ BIND(L3);
+            min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
+            __ b(L_loop_finished);
+
+            __ BIND(L1);
+            min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
+            __ b(L_loop_finished);
+
+            __ BIND(L2);
+            min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
+        } else {
+            __ tbz(to, 0, L2); // bit 0 clear => remainder 2
+            __ tbnz(to, 1, L3); // bits 0 and 1 set => remainder 3
+
+            __ BIND(L1);
+            min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
+            __ b(L_loop_finished);
+
+             __ BIND(L3);
+            min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
+            __ b(L_loop_finished);
+
+           __ BIND(L2);
+            min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
+        }
+
+        min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3);
+
+        __ BIND(L_loop_finished);
+
+        break;
+      }
+      default:
+        ShouldNotReachHere();
+        break;
+    }
+
+    __ pop(RegisterSet(R4,R10)); // restores the registers pushed at the top of this branch
+#endif // AARCH64
+
+    return min_copy;
+  }
+
+#ifndef PRODUCT
+  int * get_arraycopy_counter(int bytes_per_count) {
+    // Maps an element size in bytes to the matching SharedRuntime copy counter.
+    if (bytes_per_count == 1) {
+      return &SharedRuntime::_jbyte_array_copy_ctr;
+    } else if (bytes_per_count == 2) {
+      return &SharedRuntime::_jshort_array_copy_ctr;
+    } else if (bytes_per_count == 4) {
+      return &SharedRuntime::_jint_array_copy_ctr;
+    } else if (bytes_per_count == 8) {
+      return &SharedRuntime::_jlong_array_copy_ctr;
+    }
+    // no counter exists for any other element size
+    ShouldNotReachHere();
+    return NULL;
+  }
+#endif // !PRODUCT
+
+  //
+  //  Generate stub for primitive array copy.  If "aligned" is true, the
+  //  "from" and "to" addresses are assumed to be heapword aligned.
+  //
+  //  If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
+  //  "nooverlap_target" must be specified as the address to jump if they don't.
+  //
+  // Arguments for generated stub:
+  //      from:  R0
+  //      to:    R1
+  //      count: R2 treated as signed 32-bit int
+  //
+  address generate_primitive_copy(bool aligned, const char * name, bool status, int bytes_per_count, bool disjoint, address nooverlap_target = NULL) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc(); // stub entry address; this is the value returned to the caller
+
+    const Register from  = R0;   // source array address
+    const Register to    = R1;   // destination array address
+    const Register count = R2;   // elements count
+    const Register tmp1  = R3;
+    const Register tmp2  = R12;
+
+    if (!aligned)  {
+      BLOCK_COMMENT("Entry:");
+    }
+
+    __ zap_high_non_significant_bits(R2); // R2 is 'count'
+
+    if (!disjoint) {
+      assert (nooverlap_target != NULL, "must be specified for conjoint case");
+      array_overlap_test(nooverlap_target, exact_log2(bytes_per_count), tmp1, tmp2);
+    }
+
+    inc_counter_np(*get_arraycopy_counter(bytes_per_count), tmp1, tmp2);
+
+    // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy
+    // Disjoint case: perform forward copy
+    bool forward = disjoint;
+
+
+    if (!forward) {
+      // Set 'from' and 'to' to upper bounds
+      int log_bytes_per_count = exact_log2(bytes_per_count);
+      __ add_ptr_scaled_int32(to,   to,   count, log_bytes_per_count);
+      __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count);
+    }
+
+    // There are two main copy loop implementations:
+    //  *) The huge and complex one applicable only for large enough arrays
+    //  *) The small and simple one applicable for any array (but not efficient for large arrays).
+    // Currently "small" implementation is used if and only if the "large" one could not be used.
+    // XXX optim: tune the limit higher ?
+    // Large implementation lower applicability bound is actually determined by
+    // aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop.
+    const int small_copy_limit = (8*wordSize + 7) / bytes_per_count; // expressed in elements
+
+    Label L_small_array; // bound inside copy_small_array (passed as its 'entry')
+    __ cmp_32(count, small_copy_limit);
+    __ b(L_small_array, le); // TODO-AARCH64: le vs lt
+
+    // Otherwise proceed with large implementation.
+
+    bool from_is_aligned = (bytes_per_count >= 8);
+    if (aligned && forward && (HeapWordSize % 8 == 0)) {
+        // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
+        //  then from is aligned by 8
+        from_is_aligned = true;
+    }
+
+    int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward); // align_src emits code only when called
+    assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");
+
+    // now 'from' is aligned
+
+    bool to_is_aligned = false; // true when 'to' alignment is known at stub generation time
+
+    if (bytes_per_count >= wordSize) {
+      // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
+      to_is_aligned = true;
+    } else {
+      if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
+        // Originally 'from' and 'to' were heapword aligned;
+        // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned,
+        //  so 'to' is also heapword aligned and thus aligned by wordSize.
+        to_is_aligned = true;
+      }
+    }
+
+    Label L_unaligned_dst; // bound at the end of the stub, only if !to_is_aligned
+
+    if (!to_is_aligned) {
+      BLOCK_COMMENT("Check dst alignment:");
+      __ tst(to, wordSize - 1);
+      __ b(L_unaligned_dst, ne); // 'to' is not aligned
+    }
+
+    // 'from' and 'to' are properly aligned
+
+    int min_copy;
+    if (forward) {
+      min_copy = generate_forward_aligned_copy_loop (from, to, count, bytes_per_count);
+    } else {
+      min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count);
+    }
+    assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
+
+    if (status) {
+      __ mov(R0, 0); // OK
+    }
+
+    __ ret(); // end of the aligned ("large") path
+
+    {
+      copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */);
+
+      if (status) {
+        __ mov(R0, 0); // OK
+      }
+
+      __ ret(); // end of the small-array path
+    }
+
+    if (! to_is_aligned) {
+      __ BIND(L_unaligned_dst);
+      int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
+      assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
+
+      if (status) {
+        __ mov(R0, 0); // OK
+      }
+
+      __ ret(); // end of the unaligned-destination path
+    }
+
+    return start;
+  }
+
+#if INCLUDE_ALL_GCS
+  //
+  //  Generate pre-write barrier for array.
+  //
+  //  Input:
+  //     addr     - register containing starting address
+  //     count    - register containing element count, 32-bit int
+  //     callee_saved_regs -
+  //                the call must preserve this number of registers: R0, R1, ..., R[callee_saved_regs-1]
+  //
+  //  callee_saved_regs must include addr and count
+  //  Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) except for callee_saved_regs.
+  void gen_write_ref_array_pre_barrier(Register addr, Register count, int callee_saved_regs) {
+    BarrierSet* bs = Universe::heap()->barrier_set();
+    if (bs->has_write_ref_pre_barrier()) { // otherwise no code is emitted at all
+      assert(bs->has_write_ref_array_pre_opt(),
+             "Else unsupported barrier set.");
+
+      assert( addr->encoding() < callee_saved_regs, "addr must be saved");
+      assert(count->encoding() < callee_saved_regs, "count must be saved");
+
+      BLOCK_COMMENT("PreBarrier");
+
+#ifdef AARCH64
+      callee_saved_regs = round_to(callee_saved_regs, 2); // registers are pushed in pairs below
+      for (int i = 0; i < callee_saved_regs; i += 2) {
+        __ raw_push(as_Register(i), as_Register(i+1));
+      }
+#else
+      RegisterSet saved_regs = RegisterSet(R0, as_Register(callee_saved_regs-1)); // R0 .. R[callee_saved_regs-1]
+      __ push(saved_regs | R9ifScratched);
+#endif // AARCH64
+
+      if (addr != R0) { // set up the first call argument in R0
+        assert_different_registers(count, R0); // the mov below must not overwrite 'count'
+        __ mov(R0, addr);
+      }
+#ifdef AARCH64
+      __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_pre takes size_t
+#else
+      if (count != R1) { // set up the second call argument in R1
+        __ mov(R1, count);
+      }
+#endif // AARCH64
+
+      __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
+
+#ifdef AARCH64
+      for (int i = callee_saved_regs - 2; i >= 0; i -= 2) { // pop the pairs in reverse push order
+        __ raw_pop(as_Register(i), as_Register(i+1));
+      }
+#else
+      __ pop(saved_regs | R9ifScratched); // same register set as the push above
+#endif // AARCH64
+    }
+  }
+#endif // INCLUDE_ALL_GCS
+
+  //
+  //  Generate post-write barrier for array.
+  //
+  //  Input:
+  //     addr     - register containing starting address (can be scratched)
+  //     count    - register containing element count, 32-bit int (can be scratched)
+  //     tmp      - scratch register
+  //
+  //  Note: LR can be scratched but might be equal to addr, count or tmp
+  //  Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
+  void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp) {
+    assert_different_registers(addr, count, tmp);
+    BarrierSet* bs = Universe::heap()->barrier_set();
+
+    switch (bs->kind()) { // the barrier set kind selects which code is emitted
+    case BarrierSet::G1SATBCTLogging:
+      {
+        BLOCK_COMMENT("G1PostBarrier");
+        if (addr != R0) { // set up the first call argument in R0
+          assert_different_registers(count, R0); // the mov below must not overwrite 'count'
+          __ mov(R0, addr);
+        }
+#ifdef AARCH64
+        __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_post takes size_t
+#else
+        if (count != R1) { // set up the second call argument in R1
+          __ mov(R1, count);
+        }
+#if R9_IS_SCRATCHED
+        // Safer to save R9 here since callers may have been written
+        // assuming R9 survives. This is suboptimal but is not in
+        // general worth optimizing for the few platforms where R9
+        // is scratched. Note that the optimization might not be too
+        // difficult for this particular call site.
+        __ push(R9);
+#endif
+#endif // !AARCH64
+        __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
+#ifndef AARCH64
+#if R9_IS_SCRATCHED
+        __ pop(R9);
+#endif
+#endif // !AARCH64
+      }
+      break;
+    case BarrierSet::CardTableForRS:
+    case BarrierSet::CardTableExtension:
+      {
+        BLOCK_COMMENT("CardTablePostBarrier");
+        CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
+        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+        Label L_cardtable_loop;
+
+        __ add_ptr_scaled_int32(count, addr, count, LogBytesPerHeapOop); // presumably addr + (count << LogBytesPerHeapOop), i.e. end of region
+        __ sub(count, count, BytesPerHeapOop);                            // last addr
+
+        __ logical_shift_right(addr, addr, CardTableModRefBS::card_shift); // addr  -> card index of the first element
+        __ logical_shift_right(count, count, CardTableModRefBS::card_shift); // count -> card index of the last element
+        __ sub(count, count, addr); // nb of cards
+
+        // warning: Rthread has not been preserved
+        __ mov_address(tmp, (address) ct->byte_map_base, symbolic_Relocation::card_table_reference);
+        __ add(addr,tmp, addr); // addr = byte_map_base + first card index
+
+        Register zero = __ zero_register(tmp); // NOTE(review): takes tmp, so tmp may be overwritten here - confirm
+
+        __ BIND(L_cardtable_loop);
+        __ strb(zero, Address(addr, 1, post_indexed)); // store one byte from 'zero', then advance addr by 1
+        __ subs(count, count, 1);
+        __ b(L_cardtable_loop, ge); // count is (last - first), so the loop body runs count + 1 times
+      }
+      break;
+    case BarrierSet::ModRef:
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+  }
+
+  // Generates pattern of code to be placed after raw data copying in generate_oop_copy
+  // Includes return from arraycopy stub.
+  //
+  // Arguments:
+  //     to:       destination pointer after copying.
+  //               if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
+  //     count:    total number of copied elements, 32-bit int
+  //
+  // Blows all volatile (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) and 'to', 'count', 'tmp' registers.
+  void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward) {
+    assert_different_registers(to, count, tmp);
+
+    if (forward) {
+      // 'to' is upper bound of the modified region
+      // restore initial dst:
+      __ sub_ptr_scaled_int32(to, to, count, LogBytesPerHeapOop); // presumably to -= count << LogBytesPerHeapOop
+    }
+
+    // 'to' is the beginning of the region
+
+    gen_write_ref_array_post_barrier(to, count, tmp); // may scratch 'to', 'count' and 'tmp'
+
+    if (status) {
+      __ mov(R0, 0); // OK
+    }
+
+#ifdef AARCH64
+    __ raw_pop(LR, ZR); // counterpart of raw_push(LR, ZR) in generate_oop_copy
+    __ ret();
+#else
+    __ pop(PC); // counterpart of push(LR) in generate_oop_copy: the saved LR is popped into PC
+#endif // AARCH64
+  }
+
+
+  //  Generate stub for assign-compatible oop copy.  If "aligned" is true, the
+  //  "from" and "to" addresses are assumed to be heapword aligned.
+  //  "name" labels the StubCodeMark; "status" is passed through to the epilogue helper.
+  //  If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
+  //  "nooverlap_target" must be specified as the address to jump to if they don't.
+  //
+  // Arguments for generated stub:
+  //      from:  R0
+  //      to:    R1
+  //      count: R2 treated as signed 32-bit int
+  //
+  address generate_oop_copy(bool aligned, const char * name, bool status, bool disjoint, address nooverlap_target = NULL) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    Register from  = R0;
+    Register to    = R1;
+    Register count = R2;
+    Register tmp1  = R3;
+    Register tmp2  = R12;
+
+
+    if (!aligned) {
+      BLOCK_COMMENT("Entry:");
+    }
+
+    __ zap_high_non_significant_bits(R2); // R2 is 'count'
+
+    if (!disjoint) {
+      assert (nooverlap_target != NULL, "must be specified for conjoint case");
+      array_overlap_test(nooverlap_target, LogBytesPerHeapOop, tmp1, tmp2);
+    }
+
+    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, tmp1, tmp2);
+
+    // Conjoint case: since execution reached this point, the arrays overlap, so perform a backward copy
+    // Disjoint case: perform forward copy
+    bool forward = disjoint;
+
+    const int bytes_per_count = BytesPerHeapOop;
+    const int log_bytes_per_count = LogBytesPerHeapOop;
+
+    const Register saved_count = LR;
+    const int callee_saved_regs = 3; // R0-R2
+
+    // LR doubles as saved_count below, so the return address is pushed first
+#ifdef AARCH64
+    __ raw_push(LR, ZR);
+#else
+    __ push(LR);
+#endif // AARCH64
+
+#if INCLUDE_ALL_GCS
+    gen_write_ref_array_pre_barrier(to, count, callee_saved_regs);
+#endif // INCLUDE_ALL_GCS
+
+    // save arguments for barrier generation (after the pre barrier)
+    __ mov(saved_count, count);
+
+    if (!forward) {
+      // backward copy: advance both pointers by count elements
+      __ add_ptr_scaled_int32(to,   to,   count, log_bytes_per_count);
+      __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count);
+    }
+
+    // for short arrays, just do single element copy
+    Label L_small_array;
+    const int small_copy_limit = (8*wordSize + 7)/bytes_per_count; // XXX optim: tune the limit higher ?
+    __ cmp_32(count, small_copy_limit);
+    __ b(L_small_array, le);
+
+    bool from_is_aligned = (bytes_per_count >= 8);
+    if (aligned && forward && (HeapWordSize % 8 == 0)) {
+        // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
+        //  then from is aligned by 8
+        from_is_aligned = true;
+    }
+
+    int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward);
+    assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");
+
+    // now 'from' is aligned
+
+    bool to_is_aligned = false;
+
+    if (bytes_per_count >= wordSize) {
+      // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
+      to_is_aligned = true;
+    } else {
+      if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
+        // Originally 'from' and 'to' were heapword aligned;
+        // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned,
+        //  so 'to' is also heapword aligned and thus aligned by wordSize.
+        to_is_aligned = true;
+      }
+    }
+
+    Label L_unaligned_dst;
+
+    if (!to_is_aligned) {
+      // alignment of 'to' is not known at generation time: test it at run time
+      BLOCK_COMMENT("Check dst alignment:");
+      __ tst(to, wordSize - 1);
+      __ b(L_unaligned_dst, ne); // 'to' is not aligned
+    }
+
+    int min_copy;
+    if (forward) {
+      min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count);
+    } else {
+      min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count);
+    }
+    assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
+
+    oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
+
+    {
+      // short-array path, entered through L_small_array; it gets its own epilogue
+      copy_small_array(from, to, count, tmp1, noreg, bytes_per_count, forward, L_small_array);
+
+      oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
+    }
+
+    if (!to_is_aligned) {
+      // !to_is_aligned <=> UseCompressedOops && AArch64
+      __ BIND(L_unaligned_dst);
+#ifdef AARCH64
+      assert (UseCompressedOops, "unaligned oop array copy may be requested only with UseCompressedOops");
+#else
+      ShouldNotReachHere();
+#endif // AARCH64
+      int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
+      assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
+
+      oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
+    }
+
+    return start;
+  }
+
+  //  Generate the 'unsafe' array copy stub.
+  //  Unlike the typed stubs, its length argument is an unscaled byte count
+  //  rather than a number of elements.
+  //
+  // Arguments for generated stub:
+  //      from:  R0
+  //      to:    R1
+  //      count: R2 byte count, treated as ssize_t, can be zero
+  //
+  // ORs together both addresses and the byte count, then jumps to the first of
+  // the long, int, short or byte arraycopy stubs whose alignment test passes.
+  //
+  address generate_unsafe_copy(const char* name) {
+
+    const Register from     = R0;      // source address
+    const Register to       = R1;      // destination address
+    const Register count    = R2;      // byte count on entry, element count at dispatch
+
+    const Register low_bits = R3;      // from | to | count, used for the alignment tests
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+#ifdef AARCH64
+    __ NOT_IMPLEMENTED();
+    start = NULL;
+#else
+    const Register scratch = Rtemp;
+
+    // count the call up front rather than on the way out:
+    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, low_bits, scratch);
+
+    __ orr(low_bits, from, to);
+    __ orr(low_bits, count, low_bits);
+
+    __ tst(low_bits, BytesPerLong - 1);
+    __ mov(count, AsmOperand(count, asr, LogBytesPerLong), eq);
+    __ jump(StubRoutines::_jlong_arraycopy, relocInfo::runtime_call_type, scratch, eq);
+
+    __ tst(low_bits, BytesPerInt - 1);
+    __ mov(count, AsmOperand(count, asr, LogBytesPerInt), eq);
+    __ jump(StubRoutines::_jint_arraycopy, relocInfo::runtime_call_type, scratch, eq);
+
+    __ tst(low_bits, BytesPerShort - 1);
+    __ mov(count, AsmOperand(count, asr, LogBytesPerShort), eq);
+    __ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, scratch, eq);
+
+    __ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, scratch);
+#endif // AARCH64
+    return start;
+  }
+
+  // Helper for generating a dynamic type check: branches to L_success on a hit,
+  // falls through on a miss.  Smashes only the given temp registers.
+  void generate_type_check(Register sub_klass,
+                           Register super_check_offset,
+                           Register super_klass,
+                           Register tmp1,
+                           Register tmp2,
+                           Register tmp3,
+                           Label& L_success) {
+    assert_different_registers(sub_klass, super_check_offset, super_klass, tmp1, tmp2, tmp3);
+
+    BLOCK_COMMENT("type_check:");
+
+    // If the pointers are equal, we are done (e.g., String[] elements).
+
+    __ cmp(super_klass, sub_klass);
+    __ b(L_success, eq); // fast success
+
+
+    Label L_loop, L_fail;
+
+    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+
+    // Check the supertype display:
+    __ ldr(tmp1, Address(sub_klass, super_check_offset));
+    __ cmp(tmp1, super_klass);
+    __ b(L_success, eq);
+
+    __ cmp(super_check_offset, sc_offset);
+    __ b(L_fail, ne); // failure: the slot probed was not the secondary super cache
+
+    BLOCK_COMMENT("type_check_slow_path:");
+
+    // offset of the secondary supers array pointer in sub_klass:
+    int ss_offset = in_bytes(Klass::secondary_supers_offset());
+
+    // Do a linear scan of the secondary super-klass chain.
+
+#ifndef PRODUCT
+    int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
+    __ inc_counter((address) pst_counter, tmp1, tmp2);
+#endif
+
+    Register scan_temp = tmp1;
+    Register count_temp = tmp2;
+
+    // We will consult the secondary-super array.
+    __ ldr(scan_temp, Address(sub_klass, ss_offset));
+
+    Register search_key = super_klass;
+
+    // Load the array length.
+    __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
+    __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());
+
+    __ add(count_temp, count_temp, 1); // the loop decrements before it tests
+
+    // Top of search loop
+    __ bind(L_loop);
+    // Notes:
+    //  scan_temp starts at the array elements
+    //  count_temp is 1+size
+
+    __ subs(count_temp, count_temp, 1);
+    __ b(L_fail, eq); // not found
+
+    // Load next super to check
+    // In the array of super classes elements are pointer sized.
+    int element_size = wordSize;
+    __ ldr(tmp3, Address(scan_temp, element_size, post_indexed));
+
+    // Compare the element just loaded against search_key (super_klass)
+    __ cmp(tmp3, search_key);
+
+    // A miss means we are NOT a subtype and need to keep looping
+    __ b(L_loop, ne);
+
+    // Falling out the bottom means we found a hit; we ARE a subtype
+
+    // Success.  Cache the super we found and proceed in triumph.
+    __ str(super_klass, Address(sub_klass, sc_offset));
+
+    // Jump to success
+    __ b(L_success);
+
+    // Fall through on failure!
+    __ bind(L_fail);
+  }
+
+  //  Generate stub for checked oop copy.
+  //  "name" labels the stub's StubCodeMark.
+  // Arguments for generated stub:
+  //      from:  R0
+  //      to:    R1
+  //      count: R2 treated as signed 32-bit int
+  //      ckoff: R3 (super_check_offset)
+  //      ckval: R4 (AArch64) / SP[0] (32-bit ARM) (super_klass)
+  //      ret:   R0 zero for success; (-1^K) where K is partial transfer count (32-bit)
+  //
+  address generate_checkcast_copy(const char * name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    const Register from  = R0;  // source array address
+    const Register to    = R1;  // destination array address
+    const Register count = R2;  // elements count
+
+    const Register R3_ckoff  = R3;      // super_check_offset
+    const Register R4_ckval  = R4;      // super_klass
+
+    const int callee_saved_regs = AARCH64_ONLY(5) NOT_AARCH64(4); // LR saved differently
+
+    Label load_element, store_element, do_card_marks, fail;
+
+    BLOCK_COMMENT("Entry:");
+
+    __ zap_high_non_significant_bits(R2); // R2 is 'count'
+
+#ifdef AARCH64
+    __ raw_push(LR, ZR);
+    __ raw_push(R19, R20);
+#else
+    int pushed = 0; // number of words pushed so far, used to locate the stack argument
+    __ push(LR);
+    pushed+=1;
+#endif // AARCH64
+
+#if INCLUDE_ALL_GCS
+    gen_write_ref_array_pre_barrier(to, count, callee_saved_regs);
+#endif // INCLUDE_ALL_GCS
+
+#ifndef AARCH64
+    const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
+    __ push(caller_saved_regs);
+    assert(caller_saved_regs.size() == 6, "check the count");
+    pushed+=6;
+
+    __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack
+#endif // !AARCH64
+
+    // Save arguments for barrier generation (after the pre barrier):
+    // - must be a register this stub pushed above (and pops before returning) and not LR
+    // - ARM32: avoid R10 in case RThread is needed
+    const Register saved_count = AARCH64_ONLY(R19) NOT_AARCH64(altFP_7_11);
+#ifdef AARCH64
+    __ mov_w(saved_count, count);
+    __ cbnz_w(count, load_element); // and test count
+#else
+    __ movs(saved_count, count); // and test count
+    __ b(load_element,ne);
+#endif // AARCH64
+
+    // nothing to copy
+    __ mov(R0, 0);
+
+#ifdef AARCH64
+    __ raw_pop(R19, R20);
+    __ raw_pop(LR, ZR);
+    __ ret();
+#else
+    __ pop(caller_saved_regs);
+    __ pop(PC);
+#endif // AARCH64
+
+    // ======== begin loop ========
+    // (Loop is rotated; its entry is load_element.)
+    __ align(OptoLoopAlignment);
+    __ BIND(store_element);
+    if (UseCompressedOops) {
+      __ store_heap_oop(R5, Address(to, BytesPerHeapOop, post_indexed));  // store the oop, changes flags
+      __ subs_32(count,count,1);
+    } else {
+      __ subs_32(count,count,1);
+      __ str(R5, Address(to, BytesPerHeapOop, post_indexed));             // store the oop
+    }
+    __ b(do_card_marks, eq); // count exhausted
+
+    // ======== loop entry is here ========
+    __ BIND(load_element);
+    __ load_heap_oop(R5, Address(from, BytesPerHeapOop, post_indexed));  // load the oop
+    __ cbz(R5, store_element); // NULL
+
+    __ load_klass(R6, R5);
+
+    generate_type_check(R6, R3_ckoff, R4_ckval, /*tmps*/ R12, R8, R9,
+                        // branch to this on success:
+                        store_element);
+    // ======== end loop ========
+
+    // It was a real error; we must depend on the caller to finish the job.
+    // Register count has number of *remaining* oops, saved_count number of *total* oops.
+    // Emit GC store barriers for the oops we have copied
+    // and report their number to the caller (0 or (-1^n))
+    __ BIND(fail);
+
+    // Note: fail marked by the fact that count differs from saved_count
+
+    __ BIND(do_card_marks);
+
+    Register copied = AARCH64_ONLY(R20) NOT_AARCH64(R4); // saved
+    Label L_not_copied;
+
+    __ subs_32(copied, saved_count, count); // copied count (in saved reg)
+    __ b(L_not_copied, eq); // nothing was copied, skip post barrier
+    __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value
+    __ mov(R12, copied); // count arg scratched by post barrier
+
+    gen_write_ref_array_post_barrier(to, R12, R3);
+
+    assert_different_registers(R3,R12,LR,copied,saved_count);
+    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12); // skipped when nothing was copied
+
+    __ BIND(L_not_copied);
+    __ cmp_32(copied, saved_count); // values preserved in saved registers
+
+#ifdef AARCH64
+    __ csinv(R0, ZR, copied, eq); // 0 if all copied else NOT(copied)
+    __ raw_pop(R19, R20);
+    __ raw_pop(LR, ZR);
+    __ ret();
+#else
+    __ mov(R0, 0, eq); // 0 if all copied
+    __ mvn(R0, copied, ne); // else NOT(copied)
+    __ pop(caller_saved_regs);
+    __ pop(PC);
+#endif // AARCH64
+
+    return start;
+  }
+
+  // Emits the bounds checks for a proposed arraycopy.
+  // Only temp1 and temp2 are overwritten.
+  void arraycopy_range_checks(Register src,     // source array oop
+                              Register src_pos, // source position (32-bit int)
+                              Register dst,     // destination array oop
+                              Register dst_pos, // destination position (32-bit int)
+                              Register length,  // length of copy (32-bit int)
+                              Register temp1, Register temp2,
+                              Label& L_failed) {
+
+    BLOCK_COMMENT("arraycopy_range_checks:");
+
+    const int len_offset = arrayOopDesc::length_offset_in_bytes();
+    const Register limit = temp1;  // length field of the array under test
+    const Register last  = temp2;  // position + length
+
+    // branch to L_failed if (src_pos + length > arrayOop(src)->length())
+    __ add_32(last, length, src_pos);
+    __ ldr_s32(limit, Address(src, len_offset));
+    __ cmp_32(last, limit);
+    __ b(L_failed, hi);
+
+    // branch to L_failed if (dst_pos + length > arrayOop(dst)->length())
+    __ add_32(last, length, dst_pos);
+    __ ldr_s32(limit, Address(dst, len_offset));
+    __ cmp_32(last, limit);
+    __ b(L_failed, hi);
+
+    BLOCK_COMMENT("arraycopy_range_checks done");
+  }
+
+  //
+  //  Generate generic array copy stubs
+  //  "name" labels the stub's StubCodeMark.
+  //  Input:
+  //    R0    -  src oop
+  //    R1    -  src_pos (32-bit int)
+  //    R2    -  dst oop
+  //    R3    -  dst_pos (32-bit int)
+  //    R4 (AArch64) / SP[0] (32-bit ARM) -  element count (32-bit int)
+  //
+  //  Output: (32-bit int)
+  //    R0 ==  0  -  success
+  //    R0 <   0  -  need to call System.arraycopy
+  //
+  address generate_generic_copy(const char *name) {
+    Label L_failed, L_objArray;
+
+    // Input registers
+    const Register src      = R0;  // source array oop
+    const Register src_pos  = R1;  // source position
+    const Register dst      = R2;  // destination array oop
+    const Register dst_pos  = R3;  // destination position
+
+    // registers used as temp
+    const Register R5_src_klass = R5; // source array klass
+    const Register R6_dst_klass = R6; // destination array klass
+    const Register R_lh         = AARCH64_ONLY(R7) NOT_AARCH64(altFP_7_11); // layout helper
+    const Register R8_temp      = R8;
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ pc();
+
+    __ zap_high_non_significant_bits(R1);
+    __ zap_high_non_significant_bits(R3);
+    __ zap_high_non_significant_bits(R4);
+
+#ifndef AARCH64
+    int pushed = 0; // number of words pushed, used to locate the stack argument
+    const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
+    __ push(saved_regs);
+    assert(saved_regs.size() == 6, "check the count");
+    pushed+=6;
+#endif // !AARCH64
+
+    // bump this on entry, not on exit:
+    inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12);
+
+    const Register length   = R4;  // elements count
+#ifndef AARCH64
+    __ ldr(length, Address(SP, wordSize*pushed)); // read the argument that was on the stack
+#endif // !AARCH64
+
+
+    //-----------------------------------------------------------------------
+    // Assembler stubs will be used for this call to arraycopy
+    // if the following conditions are met:
+    //
+    // (1) src and dst must not be null.
+    // (2) src_pos must not be negative.
+    // (3) dst_pos must not be negative.
+    // (4) length  must not be negative.
+    // (5) src klass and dst klass should be the same and not NULL.
+    // (6) src and dst should be arrays.
+    // (7) src_pos + length must not exceed length of src.
+    // (8) dst_pos + length must not exceed length of dst.
+    BLOCK_COMMENT("arraycopy initial argument checks");
+
+    //  if (src == NULL) return -1;
+    __ cbz(src, L_failed);
+
+    //  if (src_pos < 0) return -1;
+    __ cmp_32(src_pos, 0);
+    __ b(L_failed, lt);
+
+    //  if (dst == NULL) return -1;
+    __ cbz(dst, L_failed);
+
+    //  if (dst_pos < 0) return -1;
+    __ cmp_32(dst_pos, 0);
+    __ b(L_failed, lt);
+
+    //  if (length < 0) return -1;
+    __ cmp_32(length, 0);
+    __ b(L_failed, lt);
+
+    BLOCK_COMMENT("arraycopy argument klass checks");
+    //  get src->klass()
+    __ load_klass(R5_src_klass, src);
+
+    // Load layout helper
+    //
+    //  |array_tag|     | header_size | element_type |     |log2_element_size|
+    // 32        30    24            16              8     2                 0
+    //
+    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
+    //
+
+    int lh_offset = in_bytes(Klass::layout_helper_offset());
+    __ ldr_u32(R_lh, Address(R5_src_klass, lh_offset));
+
+    __ load_klass(R6_dst_klass, dst);
+
+    // Handle objArrays completely differently...
+    juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
+    __ mov_slow(R8_temp, objArray_lh);
+    __ cmp_32(R_lh, R8_temp);
+    __ b(L_objArray,eq);
+
+    //  if (src->klass() != dst->klass()) return -1;
+    __ cmp(R5_src_klass, R6_dst_klass);
+    __ b(L_failed, ne);
+
+    //  if (!src->is_Array()) return -1;
+    __ cmp_32(R_lh, Klass::_lh_neutral_value); // < 0
+    __ b(L_failed, ge);
+
+    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
+                           R8_temp, R6_dst_klass, L_failed);
+
+    {
+      // TypeArrayKlass
+      //
+      // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
+      // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
+      //
+
+      const Register R6_offset = R6_dst_klass;    // array offset
+      const Register R12_elsize = R12;            // log2 element size
+
+      __ logical_shift_right(R6_offset, R_lh, Klass::_lh_header_size_shift);
+      __ andr(R6_offset, R6_offset, (unsigned int)Klass::_lh_header_size_mask); // array_offset
+      __ add(src, src, R6_offset);       // src array offset
+      __ add(dst, dst, R6_offset);       // dst array offset
+      __ andr(R12_elsize, R_lh, (unsigned int)Klass::_lh_log2_element_size_mask); // log2 element size
+
+      // next registers should be set before the jump to corresponding stub
+      const Register from     = R0;  // source array address
+      const Register to       = R1;  // destination array address
+      const Register count    = R2;  // elements count
+
+      // 'from', 'to', 'count' registers should be set in this order
+      // since they are the same as 'src', 'src_pos', 'dst'.
+
+#ifdef AARCH64
+
+      BLOCK_COMMENT("choose copy loop based on element size and scale indexes");
+      Label Lbyte, Lshort, Lint, Llong;
+
+      __ cbz(R12_elsize, Lbyte);
+
+      assert (LogBytesPerShort < LogBytesPerInt && LogBytesPerInt < LogBytesPerLong, "must be");
+      __ cmp(R12_elsize, LogBytesPerInt);
+      __ b(Lint,  eq);
+      __ b(Llong, gt);
+
+      __ BIND(Lshort); // reached by fall-through: not byte, int or long
+      __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerShort);
+      __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerShort);
+      __ mov(count, length);
+      __ b(StubRoutines::_jshort_arraycopy);
+
+      __ BIND(Lint);
+      __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerInt);
+      __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerInt);
+      __ mov(count, length);
+      __ b(StubRoutines::_jint_arraycopy);
+
+      __ BIND(Lbyte);
+      __ add_ptr_scaled_int32(from, src, src_pos, 0);
+      __ add_ptr_scaled_int32(to,   dst, dst_pos, 0);
+      __ mov(count, length);
+      __ b(StubRoutines::_jbyte_arraycopy);
+
+      __ BIND(Llong);
+      __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerLong);
+      __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerLong);
+      __ mov(count, length);
+      __ b(StubRoutines::_jlong_arraycopy);
+
+#else // AARCH64
+
+      BLOCK_COMMENT("scale indexes to element size");
+      __ add(from, src, AsmOperand(src_pos, lsl, R12_elsize));       // src_addr
+      __ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize));         // dst_addr
+
+      __ mov(count, length);  // length
+
+      // XXX optim: avoid later push in arraycopy variants ?
+
+      __ pop(saved_regs);
+
+      BLOCK_COMMENT("choose copy loop based on element size");
+      __ cmp(R12_elsize, 0);
+      __ b(StubRoutines::_jbyte_arraycopy,eq);
+
+      __ cmp(R12_elsize, LogBytesPerShort);
+      __ b(StubRoutines::_jshort_arraycopy,eq);
+
+      __ cmp(R12_elsize, LogBytesPerInt);
+      __ b(StubRoutines::_jint_arraycopy,eq);
+
+      __ b(StubRoutines::_jlong_arraycopy);
+
+#endif // AARCH64
+    }
+
+    // ObjArrayKlass
+    __ BIND(L_objArray);
+    // live at this point:  R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length
+
+    Label L_plain_copy, L_checkcast_copy;
+    //  test array classes for subtyping
+    __ cmp(R5_src_klass, R6_dst_klass);         // usual case is exact equality
+    __ b(L_checkcast_copy, ne);
+
+    BLOCK_COMMENT("Identically typed arrays");
+    {
+      // Identically typed arrays can be copied without element-wise checks.
+      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
+                             R8_temp, R_lh, L_failed);
+
+      // next registers should be set before the jump to corresponding stub
+      const Register from     = R0;  // source array address
+      const Register to       = R1;  // destination array address
+      const Register count    = R2;  // elements count
+
+      __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
+      __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
+      __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop);         // src_addr
+      __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop);           // dst_addr
+      __ BIND(L_plain_copy); // also the success target of the type check in the checkcast path
+      __ mov(count, length);
+
+#ifndef AARCH64
+      __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
+#endif // !AARCH64
+      __ b(StubRoutines::_oop_arraycopy);
+    }
+
+    {
+      __ BIND(L_checkcast_copy);
+      // live at this point:  R5_src_klass, R6_dst_klass
+
+      // Before looking at dst.length, make sure dst is also an objArray.
+      __ ldr_u32(R8_temp, Address(R6_dst_klass, lh_offset));
+      __ cmp_32(R_lh, R8_temp);
+      __ b(L_failed, ne);
+
+      // It is safe to examine both src.length and dst.length.
+
+      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
+                             R8_temp, R_lh, L_failed);
+
+      // next registers should be set before the jump to corresponding stub
+      const Register from     = R0;  // source array address
+      const Register to       = R1;  // destination array address
+      const Register count    = R2;  // elements count
+
+      // Marshal the base address arguments now, freeing registers.
+      __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
+      __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
+      __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop);         // src_addr
+      __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop);           // dst_addr
+
+      __ mov(count, length); // length (reloaded)
+
+      Register sco_temp = R3;                   // this register is free now
+      assert_different_registers(from, to, count, sco_temp,
+                                 R6_dst_klass, R5_src_klass);
+
+      // Generate the type check.
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
+      __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset));
+      generate_type_check(R5_src_klass, sco_temp, R6_dst_klass,
+                          R8_temp, R9,
+                          AARCH64_ONLY(R10) NOT_AARCH64(R12),
+                          L_plain_copy);
+
+      // Fetch destination element klass from the ObjArrayKlass header.
+      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
+
+      // the checkcast_copy loop needs two extra arguments:
+      const Register Rdst_elem_klass = AARCH64_ONLY(R4) NOT_AARCH64(R3);
+      __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset));   // dest elem klass
+#ifndef AARCH64
+      __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
+      __ str(Rdst_elem_klass, Address(SP,0));    // dest elem klass argument
+#endif // !AARCH64
+      __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset));  // sco of elem klass
+      __ b(StubRoutines::_checkcast_arraycopy);
+    }
+
+    __ BIND(L_failed);
+
+#ifndef AARCH64
+    __ pop(saved_regs);
+#endif // !AARCH64
+    __ mvn(R0, 0); // failure, with 0 copied
+    __ ret();
+
+    return start;
+  }
+
+  // Safefetch stubs.
+  void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
+    // C signatures served by the generated code:
+    //   int      SafeFetch32(int*      adr, int      errValue);
+    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
+    //
+    // in:
+    //   R0 = adr
+    //   R1 = errValue
+    //
+    // out:
+    //   R0  = *adr or errValue
+
+    const Register adr   = R0;  // address to probe
+    const Register value = R1;  // errValue on entry; target of the load below
+
+    StubCodeMark mark(this, "StubRoutines", name);
+
+    // The load is the first thing emitted, so the entry point and the pc of
+    // the possibly faulting instruction are recorded as the same address.
+    // The load targets R1 (value), may fault.
+    const address load_pc = __ pc();
+    *entry    = load_pc;
+    *fault_pc = load_pc;
+
+    if (size == 4) {          // int32_t
+      __ ldr_s32(value, Address(adr));
+    } else if (size == 8) {   // int64_t
+#ifdef AARCH64
+      __ ldr(value, Address(adr));
+#else
+      Unimplemented();
+#endif // AARCH64
+    } else {
+      ShouldNotReachHere();
+    }
+
+    // The pc following the load is recorded as the continuation point;
+    // from there, return whatever R1 holds (errValue or *adr).
+    *continuation_pc = __ pc();
+    __ mov(R0, value);
+    __ ret();
+  }
+
+  void generate_arraycopy_stubs() {
+    // Generates every arraycopy stub and stores its entry point in StubRoutines.
+    // Note:  the disjoint stubs must be generated first, some of
+    //        the conjoint stubs use them.
+
+    bool status = false; // non failing C2 stubs need not return a status in R0
+
+#ifdef TEST_C2_GENERIC_ARRAYCOPY /* Internal development flag */
+    // With this flag, the C2 stubs are tested by generating calls to
+    // generic_arraycopy instead of Runtime1::arraycopy
+
+    // Runtime1::arraycopy returns a status in R0 (0 if OK, else ~copied)
+    // and the result is tested to see whether the arraycopy stub should
+    // be called.
+
+    // When we test arraycopy this way, we must generate extra code in the
+    // arraycopy methods callable from C2 generic_arraycopy to set the
+    // status to 0 for those that always succeed (calling the slow path stub might
+    // lead to errors since the copy has already been performed).
+
+    status = true; // generate a status compatible with C1 calls
+#endif
+
+    // these always need a status in case they are called from generic_arraycopy
+    StubRoutines::_jbyte_disjoint_arraycopy  = generate_primitive_copy(false, "jbyte_disjoint_arraycopy",  true, 1, true);
+    StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(false, "jshort_disjoint_arraycopy", true, 2, true);
+    StubRoutines::_jint_disjoint_arraycopy   = generate_primitive_copy(false, "jint_disjoint_arraycopy",   true, 4, true);
+    StubRoutines::_jlong_disjoint_arraycopy  = generate_primitive_copy(false, "jlong_disjoint_arraycopy",  true, 8, true);
+    StubRoutines::_oop_disjoint_arraycopy    = generate_oop_copy      (false, "oop_disjoint_arraycopy",    true,    true);
+
+    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jbyte_disjoint_arraycopy", status, 1, true);
+    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jshort_disjoint_arraycopy",status, 2, true);
+    StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_primitive_copy(true, "arrayof_jint_disjoint_arraycopy",  status, 4, true);
+    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jlong_disjoint_arraycopy", status, 8, true);
+    StubRoutines::_arrayof_oop_disjoint_arraycopy    = generate_oop_copy      (true, "arrayof_oop_disjoint_arraycopy",   status,    true);
+
+    // these always need a status in case they are called from generic_arraycopy
+    StubRoutines::_jbyte_arraycopy  = generate_primitive_copy(false, "jbyte_arraycopy",  true, 1, false, StubRoutines::_jbyte_disjoint_arraycopy);
+    StubRoutines::_jshort_arraycopy = generate_primitive_copy(false, "jshort_arraycopy", true, 2, false, StubRoutines::_jshort_disjoint_arraycopy);
+    StubRoutines::_jint_arraycopy   = generate_primitive_copy(false, "jint_arraycopy",   true, 4, false, StubRoutines::_jint_disjoint_arraycopy);
+    StubRoutines::_jlong_arraycopy  = generate_primitive_copy(false, "jlong_arraycopy",  true, 8, false, StubRoutines::_jlong_disjoint_arraycopy);
+    StubRoutines::_oop_arraycopy    = generate_oop_copy      (false, "oop_arraycopy",    true,    false, StubRoutines::_oop_disjoint_arraycopy);
+
+    StubRoutines::_arrayof_jbyte_arraycopy    = generate_primitive_copy(true, "arrayof_jbyte_arraycopy",  status, 1, false, StubRoutines::_arrayof_jbyte_disjoint_arraycopy);
+    StubRoutines::_arrayof_jshort_arraycopy   = generate_primitive_copy(true, "arrayof_jshort_arraycopy", status, 2, false, StubRoutines::_arrayof_jshort_disjoint_arraycopy);
+#ifdef _LP64
+    // since sizeof(jint) < sizeof(HeapWord), there's a different flavor:
+    StubRoutines::_arrayof_jint_arraycopy     = generate_primitive_copy(true, "arrayof_jint_arraycopy",   status, 4, false, StubRoutines::_arrayof_jint_disjoint_arraycopy);
+#else
+    StubRoutines::_arrayof_jint_arraycopy     = StubRoutines::_jint_arraycopy;
+#endif
+    if (BytesPerHeapOop < HeapWordSize) {
+      StubRoutines::_arrayof_oop_arraycopy    = generate_oop_copy      (true, "arrayof_oop_arraycopy",    status,    false, StubRoutines::_arrayof_oop_disjoint_arraycopy);
+    } else {
+      StubRoutines::_arrayof_oop_arraycopy    = StubRoutines::_oop_arraycopy;
+    }
+    StubRoutines::_arrayof_jlong_arraycopy    = StubRoutines::_jlong_arraycopy;
+
+    StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy");
+    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy");
+    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy");
+
+
+  }
+
+#ifndef AARCH64
+#define COMPILE_CRYPTO
+#include "stubRoutinesCrypto_arm.cpp"
+#else
+
+#ifdef COMPILER2
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_encryptBlock() {  // AArch64: emits the stub that AES-encrypts a single 16-byte block
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+
+    Label L_doLast;  // common tail: last two aese rounds plus the XOR with the final round key
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = R8;       // length field of the key array, loaded below and compared with 44 / 52
+
+    address start = __ pc();  // entry point of the generated stub; this is what the generator returns
+    __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));  // push FP/LR
+    __ mov(FP, SP);
+
+    __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));  // NOTE(review): assumes 'key' points at the first int element, so this offset reaches the array length field
+
+    __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input
+
+    __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // first four round keys; key advances by 64 bytes
+
+    int quad = 1;  // passed to every rev32/eor below; presumably selects the full 128-bit register form (TODO confirm)
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);  // reverse the bytes inside each 32-bit word of the loaded round keys
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
+    __ aese(V0, V1);   // each aese/aesmc pair applies one full round with the given round key
+    __ aesmc(V0, V0);
+    __ aese(V0, V2);
+    __ aesmc(V0, V0);
+    __ aese(V0, V3);
+    __ aesmc(V0, V0);
+    __ aese(V0, V4);
+    __ aesmc(V0, V0);
+
+    __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // next four round keys
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
+    __ aese(V0, V1);
+    __ aesmc(V0, V0);
+    __ aese(V0, V2);
+    __ aesmc(V0, V0);
+    __ aese(V0, V3);
+    __ aesmc(V0, V0);
+    __ aese(V0, V4);
+    __ aesmc(V0, V0);
+
+    __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // two more round keys, consumed either here or at L_doLast
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ cmp_w(keylen, 44);      // 44-int key array: V1/V2 are already the keys for the last two rounds
+    __ b(L_doLast, eq);
+
+    __ aese(V0, V1);
+    __ aesmc(V0, V0);
+    __ aese(V0, V2);
+    __ aesmc(V0, V0);
+
+    __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ cmp_w(keylen, 52);      // 52-int key array: two rounds more than the 44-int case
+    __ b(L_doLast, eq);
+
+    __ aese(V0, V1);           // any other key length falls through and runs two further rounds
+    __ aesmc(V0, V0);
+    __ aese(V0, V2);
+    __ aesmc(V0, V0);
+
+    __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ BIND(L_doLast);
+
+    __ aese(V0, V1);
+    __ aesmc(V0, V0);
+    __ aese(V0, V2);           // last round: no aesmc after it
+
+    __ vld1(V1, Address(key), MacroAssembler::VELEM_SIZE_8, 128);  // final round key, XORed in below
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128);  // store the 16-byte result
+
+    __ mov(R0, 0);             // the stub returns 0 in R0
+
+    __ mov(SP, FP);            // tear down the frame and return to the caller
+    __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
+    __ ret(LR);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //
+  address generate_aescrypt_decryptBlock() {  // AArch64: emits the stub that AES-decrypts a single 16-byte block
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+    Label L_doLast;  // common tail: last two aesd rounds plus the XOR with V5
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register keylen      = R8;       // length field of the key array, loaded below and compared with 44 / 52
+
+    address start = __ pc();  // entry point of the generated stub; this is what the generator returns
+    __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));  // push FP/LR
+    __ mov(FP, SP);
+
+    __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));  // NOTE(review): assumes 'key' points at the first int element, so this offset reaches the array length field
+
+    __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input
+
+    __ vld1(V5, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // first 16 bytes of the key array; kept in V5 and applied last (eor before the store)
+
+    int quad = 1;  // passed to every rev32/eor below; presumably selects the full 128-bit register form (TODO confirm)
+    __ rev32(V5, V5, MacroAssembler::VELEM_SIZE_8, quad);  // reverse the bytes inside each 32-bit word of the key
+
+    __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // next four round keys; key advances by 64 bytes
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
+    __ aesd(V0, V1);   // each aesd/aesimc pair applies one full inverse round with the given round key
+    __ aesimc(V0, V0);
+    __ aesd(V0, V2);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V3);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V4);
+    __ aesimc(V0, V0);
+
+    __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // four more round keys
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
+    __ aesd(V0, V1);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V2);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V3);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V4);
+    __ aesimc(V0, V0);
+
+    __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // two more round keys, consumed either here or at L_doLast
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ cmp_w(keylen, 44);      // 44-int key array: V1/V2 are already the keys for the last two rounds
+    __ b(L_doLast, eq);
+
+    __ aesd(V0, V1);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V2);
+    __ aesimc(V0, V0);
+
+    __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ cmp_w(keylen, 52);      // 52-int key array: two rounds more than the 44-int case
+    __ b(L_doLast, eq);
+
+    __ aesd(V0, V1);           // any other key length falls through and runs two further rounds
+    __ aesimc(V0, V0);
+    __ aesd(V0, V2);
+    __ aesimc(V0, V0);
+
+    __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ BIND(L_doLast);
+
+    __ aesd(V0, V1);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V2);           // last round: no aesimc after it
+
+    __ eor(V0, V0, V5, MacroAssembler::VELEM_SIZE_8, quad);  // final XOR with the key saved in V5 at the start
+
+    __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128);  // store the 16-byte result
+
+    __ mov(R0, 0);             // the stub returns 0 in R0
+
+    __ mov(SP, FP);            // tear down the frame and return to the caller
+    __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
+    __ ret(LR);
+
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   x0        - input length
+  //
+  address generate_cipherBlockChaining_encryptAESCrypt() {  // AArch64: emits the AES-CBC encryption stub (all round keys preloaded, then a per-block loop)
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+
+    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;  // *_44 / *_52 are the entry points for the shorter key schedules
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the results of the last encryption block
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+    const Register keylen      = R8;       // length field of the key array, loaded below and compared with 52
+
+    address start = __ pc();  // entry point of the generated stub; this is what the generator returns
+    __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));  // push FP/LR
+    __ mov(FP, SP);
+
+    __ mov(R9, len_reg);  // keep the original length; it is the stub's return value (moved to R0 at the end)
+    __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));  // NOTE(review): assumes 'key' points at the first int element
+
+    __ vld1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);  // V0 = chaining value (initially the contents of rvec)
+
+    __ cmp_w(keylen, 52);          // these flags are consumed here AND again by the branches inside L_aes_loop
+    __ b(L_loadkeys_44, cc);       // keylen < 52 (unsigned): skip V17-V20
+    __ b(L_loadkeys_52, eq);       // keylen == 52: skip V17/V18 only
+
+    __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // only reached when keylen > 52
+
+    int quad = 1;  // passed to every rev32/eor below; presumably selects the full 128-bit register form (TODO confirm)
+    __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad);  // reverse the bytes inside each 32-bit word of the round key
+    __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad);
+    __ BIND(L_loadkeys_52);
+    __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad);
+    __ BIND(L_loadkeys_44);
+    __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
+    __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad);
+    __ vld1(V29, V30, V31, Address(key), MacroAssembler::VELEM_SIZE_8, 128);  // last three round keys; V31 is applied with eor, not aese
+    __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ BIND(L_aes_loop);           // one iteration per 16-byte block; the body runs before len_reg is tested
+    __ vld1(V1, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // next input block
+    __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad);                             // XOR it into the chaining value
+
+    __ b(L_rounds_44, cc);         // reuses the flags of cmp_w(keylen, 52) above; NOTE(review): assumes nothing emitted inside the loop modifies the condition flags
+    __ b(L_rounds_52, eq);
+
+    __ aese(V0, V17);
+    __ aesmc(V0, V0);
+    __ aese(V0, V18);
+    __ aesmc(V0, V0);
+    __ BIND(L_rounds_52);
+    __ aese(V0, V19);
+    __ aesmc(V0, V0);
+    __ aese(V0, V20);
+    __ aesmc(V0, V0);
+    __ BIND(L_rounds_44);
+    __ aese(V0, V21);
+    __ aesmc(V0, V0);
+    __ aese(V0, V22);
+    __ aesmc(V0, V0);
+    __ aese(V0, V23);
+    __ aesmc(V0, V0);
+    __ aese(V0, V24);
+    __ aesmc(V0, V0);
+    __ aese(V0, V25);
+    __ aesmc(V0, V0);
+    __ aese(V0, V26);
+    __ aesmc(V0, V0);
+    __ aese(V0, V27);
+    __ aesmc(V0, V0);
+    __ aese(V0, V28);
+    __ aesmc(V0, V0);
+    __ aese(V0, V29);
+    __ aesmc(V0, V0);
+    __ aese(V0, V30);              // last round: no aesmc after it
+    __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad);  // final round key
+
+    __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // store the output block; V0 stays live as the next chaining value
+    __ sub(len_reg, len_reg, 16);
+    __ cbnz(len_reg, L_aes_loop);  // loop until the remaining length reaches zero
+
+    __ vst1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);  // write the last output block back to rvec
+
+    __ mov(R0, R9);                // return the original input length
+
+    __ mov(SP, FP);                // tear down the frame and return to the caller
+    __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
+    __ ret(LR);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   R0        - input length
+  //
+  address generate_cipherBlockChaining_decryptAESCrypt() {  // AArch64: emits the AES-CBC decryption stub (all round keys preloaded, then a per-block loop)
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+
+    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;  // *_44 / *_52 are the entry points for the shorter key schedules
+
+    const Register from        = c_rarg0;  // source array address
+    const Register to          = c_rarg1;  // destination array address
+    const Register key         = c_rarg2;  // key array address
+    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
+                                           // and left with the last input (ciphertext) block
+    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
+    const Register keylen      = R8;       // length field of the key array, loaded below and compared with 52
+
+    address start = __ pc();  // entry point of the generated stub; this is what the generator returns
+    __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));  // push FP/LR
+    __ mov(FP, SP);
+
+    __ mov(R9, len_reg);  // keep the original length; it is the stub's return value (moved to R0 at the end)
+    __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));  // NOTE(review): assumes 'key' points at the first int element
+
+    __ vld1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);  // V2 = chaining value (initially the contents of rvec)
+
+    __ vld1(V31, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // first 16 bytes of the key array; applied last, with eor
+
+    int quad = 1;  // passed to every rev32/eor/orr below; presumably selects the full 128-bit register form (TODO confirm)
+    __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad);  // reverse the bytes inside each 32-bit word of the key
+
+    __ cmp_w(keylen, 52);          // these flags are consumed here AND again by the branches inside L_aes_loop
+    __ b(L_loadkeys_44, cc);       // keylen < 52 (unsigned): skip V17-V20
+    __ b(L_loadkeys_52, eq);       // keylen == 52: skip V17/V18 only
+
+    __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // only reached when keylen > 52
+    __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad);
+    __ BIND(L_loadkeys_52);
+    __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad);
+    __ BIND(L_loadkeys_44);
+    __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
+    __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
+    __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad);
+    __ vld1(V29, V30, Address(key), MacroAssembler::VELEM_SIZE_8, 128);  // last two round keys
+    __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad);
+    __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad);
+
+    __ BIND(L_aes_loop);           // one iteration per 16-byte block; the body runs before len_reg is tested
+    __ vld1(V0, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // next input block
+    __ orr(V1, V0, V0, MacroAssembler::VELEM_SIZE_8, quad);                             // V1 = copy of the input block; becomes the next chaining value
+
+    __ b(L_rounds_44, cc);         // reuses the flags of cmp_w(keylen, 52) above; NOTE(review): assumes nothing emitted inside the loop modifies the condition flags
+    __ b(L_rounds_52, eq);
+
+    __ aesd(V0, V17);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V18);              // second long-key round uses V18, mirroring the V17/V18 pair in the CBC encrypt stub
+    __ aesimc(V0, V0);
+    __ BIND(L_rounds_52);
+    __ aesd(V0, V19);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V20);
+    __ aesimc(V0, V0);
+    __ BIND(L_rounds_44);
+    __ aesd(V0, V21);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V22);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V23);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V24);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V25);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V26);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V27);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V28);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V29);
+    __ aesimc(V0, V0);
+    __ aesd(V0, V30);              // last round: no aesimc after it
+    __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad);  // final XOR with the key saved in V31
+    __ eor(V0, V0, V2, MacroAssembler::VELEM_SIZE_8, quad);   // CBC: XOR with the previous chaining value
+
+    __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);  // store the output block
+    __ orr(V2, V1, V1, MacroAssembler::VELEM_SIZE_8, quad);   // chaining value for the next block = this iteration's input block
+
+    __ sub(len_reg, len_reg, 16);
+    __ cbnz(len_reg, L_aes_loop);  // loop until the remaining length reaches zero
+
+    __ vst1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);  // write the last input block back to rvec
+
+    __ mov(R0, R9);                // return the original input length
+
+    __ mov(SP, FP);                // tear down the frame and return to the caller
+    __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
+    __ ret(LR);
+
+    return start;
+  }
+
+#endif // COMPILER2
+#endif // AARCH64
+
+ private:
+
+#undef  __
+#define __ masm->
+
+  //------------------------------------------------------------------------------------------------------------------------
+  // Continuation point for throwing of implicit exceptions that are not handled in
+  // the current activation. Fabricates an exception oop and initiates normal
+  // exception dispatching in this frame.
+  address generate_throw_exception(const char* name, address runtime_entry) {
+    // Assembles a stand-alone RuntimeStub: it calls runtime_entry with the
+    // current thread in R0, then jumps to the forward-exception entry.
+    const int code_bytes  = 128;
+    const int reloc_bytes = 32;
+    CodeBuffer code(name, code_bytes, reloc_bytes);
+    OopMapSet* gc_maps = new OopMapSet();
+    MacroAssembler* masm = new MacroAssembler(&code);
+
+    // The frame consists only of the FP/LR pair pushed below, i.e. two words.
+    const int frame_words = 2;
+
+    address entry = __ pc();
+    __ mov(Rexception_pc, LR);
+    __ raw_push(FP, LR);
+    const int frame_complete_offset = __ pc() - entry;
+
+    // R1 and R2 are expected to hold any additional arguments already.
+    __ mov(R0, Rthread);
+
+    int call_pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp);
+    assert(((__ pc()) - entry) == __ offset(), "warning: start differs from code_begin");
+    __ call(runtime_entry);
+    // When set_last_Java_frame returned -1, use the offset right after the call.
+    call_pc_offset = (call_pc_offset == -1) ? __ offset() : call_pc_offset;
+
+    // Record an OopMap without entries for the call site.
+    gc_maps->add_gc_map(call_pc_offset,
+                        new OopMap(frame_words*VMRegImpl::slots_per_word, 0));
+    __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call
+
+    // Pop the frame and hand over to the shared exception forwarding code.
+    __ raw_pop(FP, LR);
+    __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp);
+
+    // Wrap the assembled code into a RuntimeStub and expose its entry point.
+    RuntimeStub* stub =
+      RuntimeStub::new_runtime_stub(name, &code, frame_complete_offset,
+                                    frame_words, gc_maps, false);
+    return stub->entry_point();
+  }
+
+  //---------------------------------------------------------------------------
+  // Initialization
+
+  void generate_initial() {
+    // Generates the first set of stubs (the StubGenerator constructor calls this when 'all' is false) and stores their entry points in StubRoutines
+
+    //------------------------------------------------------------------------------------------------------------------------
+    // entry points that exist in all platforms
+    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
+    //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
+    StubRoutines::_forward_exception_entry      = generate_forward_exception();
+
+    StubRoutines::_call_stub_entry              =
+      generate_call_stub(StubRoutines::_call_stub_return_address);  // the return address field is passed in for the generator to fill in (presumably; defined outside this view)
+    // is referenced by megamorphic call
+    StubRoutines::_catch_exception_entry        = generate_catch_exception();
+
+    // stub for throwing stack overflow error used both by interpreter and compiler
+    StubRoutines::_throw_StackOverflowError_entry  = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
+
+#ifndef AARCH64
+    // integer division used both by interpreter and compiler
+    StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem();
+
+    // atomic operation stubs; generated for 32-bit ARM only (this whole section is compiled out on AARCH64)
+    StubRoutines::_atomic_add_entry = generate_atomic_add();
+    StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
+    StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
+    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
+    StubRoutines::_atomic_load_long_entry = generate_atomic_load_long();
+    StubRoutines::_atomic_store_long_entry = generate_atomic_store_long();
+#endif // !AARCH64
+  }
+
+  void generate_all() {
+    // Generates the second set of stubs (the StubGenerator constructor calls this when 'all' is true) and stores their entry points in StubRoutines
+
+#ifdef COMPILER2
+    // Generate partial_subtype_check first here since its code depends on
+    // UseZeroBaseCompressedOops which is defined after heap initialization.
+    StubRoutines::Arm::_partial_subtype_check                = generate_partial_subtype_check();
+#endif
+    // These entry points require SharedInfo::stack0 to be set up in non-core builds
+    // and need to be relocatable, so they each fabricate a RuntimeStub internally.
+    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
+    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
+    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
+
+    //------------------------------------------------------------------------------------------------------------------------
+    // entry points that are platform specific
+
+    // support for verify_oop (must happen after universe_init)
+    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
+
+    // arraycopy stubs used by compilers
+    generate_arraycopy_stubs();
+
+    // Safefetch stubs: each call fills in the entry, fault pc and continuation pc through the three pointers.
+    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
+                                                   &StubRoutines::_safefetch32_fault_pc,
+                                                   &StubRoutines::_safefetch32_continuation_pc);
+#ifdef AARCH64
+    generate_safefetch("SafeFetchN", wordSize, &StubRoutines::_safefetchN_entry,
+                                               &StubRoutines::_safefetchN_fault_pc,
+                                               &StubRoutines::_safefetchN_continuation_pc);
+#ifdef COMPILER2
+    if (UseAESIntrinsics) {  // AArch64 AES stubs; their generators are defined in this file under AARCH64 && COMPILER2
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+    }
+#endif
+#else
+    assert (sizeof(int) == wordSize, "32-bit architecture");  // with int-sized words the SafeFetchN entries can alias the SafeFetch32 stub
+    StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
+    StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
+    StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
+#endif // AARCH64
+
+#ifdef COMPILE_CRYPTO
+    // generate AES intrinsics code (COMPILE_CRYPTO is defined in this file only in the !AARCH64 branch that includes stubRoutinesCrypto_arm.cpp)
+    if (UseAESIntrinsics) {
+      aes_init();  // sets the SBox / SInvBox table pointers before the AES stubs are generated
+      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+    }
+#endif // COMPILE_CRYPTO
+  }
+
+
+ public:
+  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
+    if (!all) {
+      generate_initial();  // 'all' is false: emit only the initial stub set
+      return;
+    }
+    generate_all();        // 'all' is true: emit the remaining stubs
+  }
+}; // end class declaration
+
+void StubGenerator_generate(CodeBuffer* code, bool all) {
+  StubGenerator generator(code, all);  // all the work happens in the constructor; the object itself is not used afterwards
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/stubRoutinesCrypto_arm.cpp	2016-12-02 11:23:36.871605117 -0500
@@ -0,0 +1,1033 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifdef COMPILE_CRYPTO
+
+// The Rijndael S-box and inverted S-box are embedded here for a faster access.
+//
+// Note about lookup tables (T1...T4 and T5...T8):
+// The tables (boxes) combine ahead-of-time precalculated transposition and mixing steps as
+// an alternative to a runtime calculation.
+// The tables are statically generated in the com/sun/crypto/provider/AESCrypt class.
+// Only the first table reference is passed to the AES methods below. The other 3 tables
+// in encryption and decryption are calculated at runtime by rotating the T1 result accordingly.
+// It is a free operation on ARM with embedded register-shifted-register EOR capability.
+// The table reference is passed as the last argument in the parameter list.
+// The table lookup method proves to perform better than a runtime Galois Field calculation,
+// due to a lack of HW acceleration for the latter.
+
+unsigned char * SBox;
+unsigned char * SInvBox;
+
+void  aes_init() {  // points SBox / SInvBox at the static 256-entry lookup tables defined below
+
+  const static unsigned char Si[256] =  // table published through SInvBox (inverse S-box)
+    {
+      0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38,
+      0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
+      0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
+      0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
+      0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D,
+      0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
+      0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2,
+      0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
+      0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16,
+      0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
+      0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA,
+      0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
+      0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A,
+      0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
+      0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02,
+      0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
+      0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA,
+      0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
+      0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85,
+      0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
+      0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89,
+      0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
+      0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20,
+      0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
+      0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31,
+      0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
+      0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D,
+      0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
+      0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0,
+      0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
+      0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26,
+      0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D
+    };
+
+  static const unsigned char S[256]={  // table published through SBox (forward S-box)
+      0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
+      0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+      0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
+      0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+      0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
+      0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+      0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
+      0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+      0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
+      0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+      0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
+      0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+      0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
+      0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+      0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
+      0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+      0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
+      0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+      0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
+      0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+      0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
+      0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+      0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
+      0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+      0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
+      0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+      0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
+      0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+      0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
+      0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+      0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
+      0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
+  };
+
+  SBox = (unsigned char*)S;      // the casts drop const; the tables are declared const, so they must only be read through these pointers
+  SInvBox = (unsigned char*)Si;
+}
+
+address generate_aescrypt_encryptBlock() {
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");
+
+  address start = __ pc();
+
+  //    Register from = R0; // source byte array
+  //    Register to = R1;   // destination byte array
+  //    Register key = R2;  // expanded key array
+  //    Register tbox = R3; // transposition box reference
+
+  __ push (RegisterSet(R4, R12) | LR);
+  __ fstmdbd(SP, FloatRegisterSet(D0, 4), writeback);
+  __ sub(SP, SP, 32);
+
+  // preserve TBox references
+  __ add(R3, R3, arrayOopDesc::base_offset_in_bytes(T_INT));
+  __ str(R3, Address(SP, 16));
+
+  // retrieve key length. The length is used to determine the number of subsequent rounds (10, 12 or 14)
+  __ ldr(R9, Address(R2, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+  __ ldr(R5, Address(R0));
+  __ ldr(R10, Address(R2, 4, post_indexed));
+  __ rev(R5, R5);
+  __ eor(R5, R5, R10);
+  __ ldr(R6, Address(R0, 4));
+  __ ldr(R10, Address(R2, 4, post_indexed));
+  __ rev(R6, R6);
+  __ eor(R6, R6, R10);
+  __ ldr(R7, Address(R0, 8));
+  __ ldr(R10, Address(R2, 4, post_indexed));
+  __ rev(R7, R7);
+  __ eor(R7, R7, R10);
+  __ ldr(R8, Address(R0, 12));
+  __ ldr(R10, Address(R2, 4, post_indexed));
+  __ rev(R8, R8);
+  __ eor(R8, R8, R10);
+
+  // Store the key size; However before doing that adjust the key to compensate for the Initial and Last rounds
+  __ sub(R9, R9, 8);
+  __ fmsr(S7, R1);
+
+  // load first transporistion box (T1)
+  __ ldr(R0, Address(SP, 16));
+
+  __ mov(LR, R2);
+
+  Label round;
+
+  __ bind(round);
+
+  // Utilize a Transposition Box lookup along with subsequent shift and EOR with a round key.
+  // instructions ordering is rearranged to minimize ReadAferWrite dependency. Not that important on A15 target
+  // with register renaming but performs ~10% better on A9.
+  __ mov(R12, AsmOperand(R5, lsr, 24));
+  __ ubfx(R4, R6, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R7, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R8);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R10, R1, R12);
+
+  __ mov(R12, AsmOperand(R6, lsr, 24));
+  __ ubfx(R4, R7, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R8, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R5);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R11, R1, R12);
+
+  __ mov(R12, AsmOperand(R7, lsr, 24));
+  __ ubfx(R4, R8, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R5, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R6);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R3, R1, R12);
+  __ str(R3, Address(SP, 0));
+
+  __ mov(R12, AsmOperand(R8, lsr, 24));
+  __ ubfx(R4, R5, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R6, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R7);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R8, R1, R12);
+
+  // update round count
+  __ subs(R9, R9, 4);
+
+  __ mov(R5, R10);
+  __ mov(R6, R11);
+  __ ldr(R7, Address(SP, 0));
+
+  __ b(round, gt);
+
+
+  // last round - a special case, no MixColumn
+  __ mov_slow(R10, (int)SBox);
+
+
+  // output buffer pointer
+  __ fmrs(R9, S7);
+
+  __ ldr(R11, Address(LR, 4, post_indexed));
+  __ ldrb(R0, Address(R10, R5, lsr, 24));
+  __ ubfx(R12, R6, 16, 8);
+  __ ldrb(R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R7, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb (R12, R8);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ eor(R0, R0, R11);
+  __ rev(R0, R0);
+  __ str(R0, Address(R9, 4, post_indexed));
+
+  __ ldr(R11, Address(LR, 4, post_indexed));
+  __ ldrb(R0, Address(R10, R6, lsr, 24));
+  __ ubfx(R12, R7, 16, 8);
+  __ ldrb(R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R8, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb (R12, R5);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ eor(R0, R0, R11);
+  __ rev(R0, R0);
+
+  __ str(R0, Address(R9, 4, post_indexed));
+  __ ldr(R11, Address(LR, 4, post_indexed));
+  __ ldrb(R0, Address(R10, R7, lsr, 24));
+  __ ubfx(R12, R8, 16, 8);
+  __ ldrb(R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R5, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb (R12, R6);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ eor(R0, R0, R11);
+  __ rev(R0, R0);
+
+  __ str(R0, Address(R9, 4, post_indexed));
+  __ ldr(R11, Address(LR));
+  __ ldrb(R0, Address(R10, R8, lsr, 24));
+  __ ubfx(R12, R5, 16, 8);
+  __ ldrb(R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R6, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb (R12, R7);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ eor(R0, R0, R11);
+  __ rev(R0, R0);
+
+  __ str(R0, Address(R9));
+
+  __ add(SP, SP, 32);
+  __ fldmiad(SP, FloatRegisterSet(D0, 4), writeback);;
+
+  __ pop(RegisterSet(R4, R12) | PC);
+  return start;
+}
+
+address generate_aescrypt_decryptBlock() {
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");
+
+  address start = __ pc();
+  // Stub decrypting a single 16-byte AES block; 'start' (returned below) is its entry point.
+  //    R0 - from: source byte array
+  //    R1 - to:   destination byte array
+  //    R2 - key:  expanded key array (addresses the first int element, see the length load below)
+  //    R3 - tbox: transposition box reference (the array base offset is added below)
+  // save R4-R12 and LR plus FloatRegisterSet(D0, 4), then reserve a 32-byte scratch frame
+  __ push (RegisterSet(R4, R12) | LR);
+  __ fstmdbd(SP, FloatRegisterSet(D0, 4), writeback);
+  __ sub(SP, SP, 32);
+
+  // retrieve key length (R2 addresses the first int element, so the length field lies at this relative offset)
+  __ ldr(R9, Address(R2, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+  // skip the array header and preserve the TBox data pointer in the scratch frame
+  __ add(R3, R3, arrayOopDesc::base_offset_in_bytes(T_INT));
+  __ str(R3, Address(SP, 16));
+
+
+  // Preserve the expanded key pointer (in S8; it is needed again for the last round)
+  __ fmsr(S8, R2);
+
+  // The first 16 bytes of the expanded key are applied in the last round, so start 16 bytes in
+  __ add(LR, R2, 16);
+
+  // initial round: load the four input words, byte-reverse each (rev) and XOR with a round key word
+  __ ldr(R5, Address(R0));
+  __ ldr(R10, Address(LR, 4, post_indexed));
+  __ rev(R5, R5);
+  __ eor(R5, R5, R10);
+  __ ldr(R6, Address(R0, 4));
+  __ ldr(R10, Address(LR, 4, post_indexed));
+  __ rev(R6, R6);
+  __ eor(R6, R6, R10);
+  __ ldr(R7, Address(R0, 8));
+  __ ldr(R10, Address(LR, 4, post_indexed));
+  __ rev(R7, R7);
+  __ eor(R7, R7, R10);
+  __ ldr(R8, Address(R0, 12));
+  __ ldr(R10, Address(LR, 4, post_indexed));
+  __ rev(R8, R8);
+  __ eor(R8, R8, R10);
+
+
+  // Turn the key length into the round-loop counter: compensate for the Initial and Last rounds
+  __ sub(R9, R9, 8);
+  __ fmsr(S7, R1); // keep the destination pointer in S7 (R1 is reused as a temporary below)
+
+  // load transposition box (T5)
+  __ ldr(R0, Address(SP, 16));
+
+  Label round;
+
+  __ bind(round);
+  // each sub-block is treated similarly:
+
+  // combine SubBytes|ShiftRows|MixColumn through a precalculated set of tables
+  // Utilize a Transposition Box lookup along with subsequent shift and EOR with a round key.
+  // instruction ordering is rearranged to minimize read-after-write dependencies. Not that important on A15 target
+  // with register renaming but performs ~10% better on A9.
+  __ mov(R12, AsmOperand(R5, lsr, 24));
+  __ ubfx(R4, R8, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R7, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R6);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R10, R1, R12);
+
+  __ mov(R12, AsmOperand(R6, lsr, 24));
+  __ ubfx(R4, R5, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R8, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R7);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R11, R1, R12);
+
+  __ mov(R12, AsmOperand(R7, lsr, 24));
+  __ ubfx(R4, R6, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R5, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R8);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R3, R1, R12);
+  __ str(R3, Address(SP, 0));
+
+  __ mov(R12, AsmOperand(R8, lsr, 24));
+  __ ubfx(R4, R7, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R6, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R5);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R8, R1, R12);
+
+  // update round count (four key words are consumed per round)
+  __ subs(R9, R9, 4);
+
+  __ mov(R5, R10);
+  __ mov(R6, R11);
+  __ ldr(R7, Address(SP, 0));
+
+  __ b(round, gt);
+
+  // last round - a special case, no MixColumn:
+
+  // Retrieve expanded key pointer
+  __ fmrs(LR, S8);
+
+  __ mov_slow(R10, (int)SInvBox);
+
+  // output buffer pointer
+  __ fmrs(R9, S7);
+
+  // process each sub-block in a similar manner:
+  // 1. load a corresponding round key
+  __ ldr(R11, Address(LR, 4, post_indexed));
+  // 2. combine SubBytes and ShiftRows stages
+  __ ldrb(R0, Address(R10, R5, lsr, 24));
+  __ ubfx(R12, R8, 16, 8);
+  __ ldrb(R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R7, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb (R12, R6);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R3, R3, AsmOperand(R0, lsl, 8));
+  // 3. AddRoundKey stage
+  __ eor(R0, R3, R11);
+  // 4. convert the result to LE representation
+  __ rev(R0, R0);
+  // 5. store in the output buffer
+  __ str(R0, Address(R9, 4, post_indexed));
+
+  __ ldr(R11, Address(LR, 4, post_indexed));
+  __ ldrb(R0, Address(R10, R6, lsr, 24));
+  __ ubfx(R12, R5, 16, 8);
+  __ ldrb(R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R8, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb (R12, R7);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ eor(R0, R0, R11);
+  __ rev(R0, R0);
+  __ str(R0, Address(R9, 4, post_indexed));
+
+  __ ldr(R11, Address(LR, 4, post_indexed));
+  __ ldrb(R0, Address(R10, R7, lsr, 24));
+  __ ubfx(R12, R6, 16, 8);
+  __ ldrb(R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R5, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb (R12, R8);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ eor(R0, R0, R11);
+  __ rev(R0, R0);
+  __ str(R0, Address(R9, 4, post_indexed));
+
+  __ ldr(R11, Address(LR));
+  __ ldrb(R0, Address(R10, R8, lsr, 24));
+  __ ubfx(R12, R7, 16, 8);
+  __ ldrb(R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R6, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb (R12, R5);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ eor(R0, R0, R11);
+  __ rev(R0, R0);
+  __ str(R0, Address(R9));
+  // tear down the frame; popping the saved LR slot into PC returns to the caller
+  __ add(SP, SP, 32);
+  __ fldmiad(SP, FloatRegisterSet(D0, 4), writeback);;
+  __ pop(RegisterSet(R4, R12) | PC);
+
+  return start;
+}
+
+address generate_cipherBlockChaining_encryptAESCrypt() {
+  // R0 - plain
+  // R1 - cipher
+  // R2 - expanded key
+  // R3 - Initialization Vector (IV)
+  // [sp+0] - cipher len
+  // [sp+4] - Transposition Box reference
+
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+
+  address start = __ pc();
+
+  __ push(RegisterSet(R4, R12) | LR);
+  // load cipher length: the 10 registers just pushed occupy 40 bytes, so the caller's [sp+0] is now at [sp+40]
+  __ ldr(R4, Address(SP, 40));
+
+  __ sub(SP, SP, 32);
+
+  // preserve some arguments
+  __ mov(R5, R1);
+  __ mov(R6, R2);
+
+  // load IV
+  __ ldmia(R3, RegisterSet(R9, R12), writeback);
+
+  // preserve original source buffer on stack
+  __ str(R0, Address(SP, 16));
+
+  Label loop;
+  __ bind(loop);
+  __ ldmia(R0, RegisterSet(R0, R1) | RegisterSet(R7, R8));
+  // XOR the plaintext block with the chaining value (R9-R12) and stage the result at [sp+0..15]
+  __ eor(R0, R0, R9);
+  __ eor(R1, R1, R10);
+  __ eor(R7, R7, R11);
+  __ eor(R8, R8, R12);
+  __ stmia(SP, RegisterSet(R0, R1) | RegisterSet(R7, R8));
+  // set up the encryptBlock call: from = staged block, to = cipher, key, tbox
+  __ mov(R0, SP);
+  __ mov(R1, R5);
+  __ mov(R2, R6);
+  __ ldr(R3, Address(SP, 40+32+4)); // 40 bytes of saved registers + 32 bytes of frame + 4: the caller's [sp+4]
+
+  // near call is sufficient since the target is also in the stubs
+  __ bl(StubRoutines::_aescrypt_encryptBlock);
+  // reload the cipher block just written as the next chaining value; advance source and destination by 16
+  __ subs(R4, R4, 16);
+  __ ldr(R0, Address(SP, 16), gt);
+  __ ldmia(R5, RegisterSet(R9, R12), writeback);
+  __ add(R0, R0, 16, gt);
+  __ str(R0, Address(SP, 16), gt);
+  __ b(loop, gt);
+  // NOTE(review): the final chaining value is not stored back through the IV pointer here - confirm the caller does not rely on it
+  __ add(SP, SP, 32);
+  __ pop(RegisterSet(R4, R12) | LR);
+  // return cipher len (copied from the original argument)
+  __ ldr(R0, Address(SP));
+  __ bx(LR);
+
+  return start;
+}
+
+
+// CBC decryption can benefit from parallel processing because the blocks can be
+// decrypted separately from each other.
+// NEON is utilized (if available) to process 8 blocks at a time.
+// Since a Transposition Box (tbox) is used, the parallel execution only applies to the
+// initial round and the last round. It is not practical to use NEON for a table lookup
+// larger than 128 bytes. It also appears to be faster to perform the tbox lookup
+// sequentially than to execute the Galois Field calculation in parallel.
+
+address generate_cipherBlockChaining_decryptAESCrypt() {
+  __ align(CodeEntryAlignment);
+  StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+
+  address start = __ pc();
+
+  Label single_block_done, single_block, cbc_done;
+  // R0 - cipher
+  // R1 - plain
+  // R2 - expanded key
+  // R3 - Initialization Vector (iv)
+  // [sp+0] - cipher len
+  // [sp+4] - Transposition Box reference
+
+  __ push(RegisterSet(R4, R12) | LR);
+
+  // load cipher len (now at [sp+40], above the 10 saved registers): must be a multiple of 16
+  __ ldr(R4, Address(SP, 40));
+  // with SIMD only the len % 128 remainder goes through the single-block loop; the rest is done 128 bytes at a time
+  if (VM_Version::has_simd()) {
+    __ andrs(R4, R4, 0x7f);
+  }
+
+  // preserve registers based arguments
+  __ mov(R7, R2);
+  __ mov(R8, R3);
+
+  if (VM_Version::has_simd()) {
+    __ b(single_block_done, eq);
+  }
+
+  __ bind(single_block);
+  // preserve args
+  __ mov(R5, R0);
+  __ mov(R6, R1);
+
+  // reload arguments
+  __ mov(R2, R7);
+  __ ldr(R3, Address(SP, 40+4));
+
+  // near call is sufficient as the method is part of the StubGenerator
+  __ bl((address)StubRoutines::_aescrypt_decryptBlock);
+
+  // check remaining cipher size (for individual block processing)
+  __ subs(R4, R4, 16);
+  if (VM_Version::has_simd()) {
+    __ tst(R4, 0x7f);
+  }
+
+  // load IV (changes based on a CBC schedule)
+  __ ldmia(R8, RegisterSet(R9, R12));
+
+  // load the block just written to the destination by the decryptBlock call above
+  __ ldmia(R6, RegisterSet(R0, R3));
+
+  // perform IV addition and save the plaintext for good now
+  __ eor(R0, R0, R9);
+  __ eor(R1, R1, R10);
+  __ eor(R2, R2, R11);
+  __ eor(R3, R3, R12);
+  __ stmia(R6, RegisterSet(R0, R3));
+  // NOTE(review): the next IV is read from the source block at R5; if source and destination overlap it is already overwritten - confirm
+  // adjust pointers for next block processing
+  __ mov(R8, R5);
+  __ add(R0, R5, 16);
+  __ add(R1, R6, 16);
+  __ b(single_block, ne);
+
+  __ bind(single_block_done);
+  if (!VM_Version::has_simd()) {
+    __ b(cbc_done);
+  } else {
+  // done with single blocks.
+  // check if any 8 block chunks are available for parallel processing
+  __ ldr(R4, Address(SP, 40));
+  __ bics(R4, R4, 0x7f);
+  __ b(cbc_done, eq);
+
+  Label decrypt_8_blocks;
+  int quad = 1;
+  // Process 8 blocks in parallel
+  __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback);
+  __ sub(SP, SP, 40);
+
+  // record output buffer end address (used as a block counter)
+  Address output_buffer_end(SP, 16);
+  __ add(R5, R1, R4);
+  __ str(R5, output_buffer_end);
+
+  // preserve key pointer
+  Address rounds_key(SP, 28);
+  __ str(R7, rounds_key);
+  // in decryption the first 16 bytes of expanded key are used in the last round
+  __ add(LR, R7, 16);
+
+
+  // Record the end of the key which is used to indicate a last round
+  __ ldr(R3, Address(R7, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+  __ add(R9, R7, AsmOperand(R3, lsl, 2));
+
+  // preserve IV pointer
+  Address iv(SP, 36);
+  __ str(R8, iv);
+
+  __ bind(decrypt_8_blocks);
+  __ mov(R5, R1);
+
+  // preserve original source pointer
+  Address original_src(SP, 32);
+  __ str(R0, original_src);
+
+  // Initial round for 8 blocks at once: byte-reverse each cipher block and XOR it with the round key at LR;
+  // use output buffer for a temp storage to preload it into cache.
+  // The byte-reversed cipher blocks stay in the vector registers; the first seven feed the CBC XOR further down.
+  __ vld1(D18, LR, MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vld1(D0, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D0, D0, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ veor(D20, D0, D18, quad);
+  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D2, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D2, D2, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ veor(D20, D2, D18, quad);
+  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D4, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D4, D4, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ veor(D20, D4, D18, quad);
+  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D6, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D6, D6, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ veor(D20, D6, D18, quad);
+  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D8, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D8, D8, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ veor(D20, D8, D18, quad);
+  __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D10, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D10, D10, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ veor(D20, D10, D18, quad);
+  __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D12, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D12, D12, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ veor(D20, D12, D18, quad);
+  __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D14, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D14, D14, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ veor(D20, D14, D18, quad);
+  __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  // Local frame map (40 bytes):
+  // sp+0  - scratch slot for the third state word within a round
+  // sp+12 - end of the current 128-byte chunk in the output buffer; sp+16 - end of the output buffer
+  // sp+20 - output buffer pointer
+  // sp+28 - key pointer
+  // sp+32 - original source; sp+36 - IV pointer
+
+  // preserve output buffer pointer
+  Address block_current_output_buffer(SP, 20);
+  __ str(R1, block_current_output_buffer);
+
+  // individual rounds in block processing are executed sequentially.
+  Label block_start;
+
+  // record end of the current 8-block (128-byte) chunk of the output buffer
+  __ add(R0, R1, 128);
+  __ str(R0, Address(SP, 12));
+
+  __ bind(block_start);
+
+  // load transposition box reference (T5)
+  // location of the reference (6th incoming argument, second slot on the stack):
+  // 10 scalar registers on stack
+  //  8 double-precision FP registers
+  // 40 bytes frame size for local storage
+  //  4 bytes offset to the original arguments list
+  __ ldr(R0, Address(SP, 40+64+40+4));
+  __ add(R0, R0, arrayOopDesc::base_offset_in_bytes(T_INT));
+
+  // load rounds key and compensate for the first and last rounds
+  __ ldr(LR, rounds_key);
+  __ add(LR, LR, 32);
+
+  // load block data from the output buffer
+  __ ldr(R2, block_current_output_buffer);
+  __ ldmia(R2, RegisterSet(R5, R8));
+
+  Label round;
+  __ bind(round);
+
+  // Utilize a Transposition Box lookup along with subsequent shift and EOR with a round key.
+  // instruction ordering is rearranged to minimize read-after-write dependencies. Not that important on A15 target
+  // with register renaming but performs ~10% better on A9.
+  __ mov(R12, AsmOperand(R5, lsr, 24));
+  __ ubfx(R4, R8, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R7, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R6);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R10, R1, R12);
+
+  __ mov(R12, AsmOperand(R6, lsr, 24));
+  __ ubfx(R4, R5, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R8, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R7);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R11, R1, R12);
+
+  __ mov(R12, AsmOperand(R7, lsr, 24));
+  __ ubfx(R4, R6, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R5, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R8);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R3, R1, R12);
+  __ str(R3, Address(SP, 0));
+
+  __ mov(R12, AsmOperand(R8, lsr, 24));
+  __ ubfx(R4, R7, 16, 8);
+  __ ldr (R1, Address(R0, R12, lsl, 2));
+  __ ldr(R2, Address(R0, R4, lsl, 2));
+  __ ubfx(R3, R6, 8, 8);
+  __ eor(R1, R1, AsmOperand(R2, ror, 8));
+  __ uxtb(R4, R5);
+  __ ldr(R3, Address(R0, R3, lsl, 2));
+  __ ldr(R4, Address(R0, R4, lsl, 2));
+  __ ldr(R12, Address(LR, 4, post_indexed));
+  __ eor(R1, R1, AsmOperand(R3, ror, 16));
+  __ eor(R12, R12, AsmOperand(R4, ror, 24));
+  __ eor(R8, R1, R12);
+
+  // see if we reached the key array end
+  __ cmp(R9, LR);
+
+  // load processed data
+  __ mov(R5, R10);
+  __ mov(R6, R11);
+  __ ldr(R7, Address(SP, 0));
+
+  __ b(round, gt);
+
+
+  // last round is special
+  // this round could be implemented through vtbl instruction in NEON. However vtbl is limited to a 32-byte wide table (4 vectors),
+  // thus it requires 8 lookup rounds to cover 256-byte wide Si table. On the other hand scalar lookup is independent of the
+  // lookup table size and thus proves to be faster.
+  __ ldr(LR, block_current_output_buffer);
+
+  // end of the current 128-byte chunk (loop bound for the block_start loop)
+  __ ldr(R11, Address(SP, 12));
+
+  __ mov_slow(R10, (int)SInvBox);
+  __ ldrb(R0, Address(R10, R5, lsr, 24));
+  __ ubfx(R12, R8, 16, 8);
+  __ ldrb (R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R7, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb(R12, R6);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ str(R0, Address(LR, 4, post_indexed));
+
+  __ ldrb(R0, Address(R10, R6, lsr, 24));
+  __ ubfx(R12, R5, 16, 8);
+  __ ldrb (R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R8, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb(R12, R7);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ str(R0, Address(LR, 4, post_indexed));
+
+
+  __ ldrb(R0, Address(R10, R7, lsr, 24));
+  __ ubfx(R12, R6, 16, 8);
+  __ ldrb (R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R5, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb(R12, R8);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ str(R0, Address(LR, 4, post_indexed));
+
+
+  __ ldrb(R0, Address(R10, R8, lsr, 24));
+  __ ubfx(R12, R7, 16, 8);
+  __ ldrb (R1, Address(R10, R12));
+  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
+  __ ubfx(R12, R6, 8, 8);
+  __ ldrb(R2, Address(R10, R12));
+  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
+  __ uxtb(R12, R5);
+  __ ldrb(R3, Address(R10, R12));
+  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
+  __ str(R0, Address(LR, 4, post_indexed));
+
+
+  // compare against the chunk end, then preserve the current output buffer pointer (str leaves the flags intact)
+  __ cmp(R11, LR);
+  __ str(LR, block_current_output_buffer);
+
+  // go to the next block processing
+  __ b(block_start, ne);
+
+
+
+  // Perform last round AddRoundKey stage on all 8 blocks
+
+  // load key pointer (rounds_key at [sp+28] holds the base of the expanded key)
+  // last round is processed with the key[0 ..3]
+  __ ldr(LR, rounds_key);
+
+  // retrieve original output buffer pointer
+  __ ldr(R1, block_current_output_buffer);
+  __ sub(R1, R1, 128);
+  __ mov(R5, R1);
+
+
+  // retrieve original cipher (source) pointer
+  __ ldr(R0, original_src);
+
+  // retrieve IV pointer (saved in the local frame at [sp+36])
+  __ ldr(R6, iv);
+
+  __ vld1(D20, R6, MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ vrev(D20, D20, quad, 32, MacroAssembler::VELEM_SIZE_8);
+
+  // perform last AddRoundKey and IV addition
+  __ vld1(D18, Address(LR, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ veor(D22, D22, D18, quad);
+  __ veor(D22, D22, D20, quad);
+  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+
+  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ veor(D22, D22, D18, quad);
+  __ veor(D22, D22, D0, quad);
+  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ veor(D22, D22, D18, quad);
+  __ veor(D22, D22, D2, quad);
+  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ veor(D22, D22, D18, quad);
+  __ veor(D22, D22, D4, quad);
+  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ veor(D22, D22, D18, quad);
+  __ veor(D22, D22, D6, quad);
+  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ veor(D22, D22, D18, quad);
+  __ veor(D22, D22, D8, quad);
+  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ veor(D22, D22, D18, quad);
+  __ veor(D22, D22, D10, quad);
+  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+  __ veor(D22, D22, D18, quad);
+  __ veor(D22, D22, D12, quad);
+  __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
+  __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
+
+  // the last cipher block of this chunk (source + 112) becomes the IV for the next chunk
+  // check if we're done
+  __ ldr(R4, output_buffer_end);
+  __ cmp(R4, R1);
+  __ add(R0, R0, 128-16);
+  __ str(R0, iv);
+  __ add(R0, R0, 16);
+
+  __ b(decrypt_8_blocks, ne);
+
+  __ add(SP, SP, 40);
+  __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback);;
+  }
+
+  __ bind(cbc_done);
+  __ pop(RegisterSet(R4, R12) | LR);
+  __ ldr(R0, Address(SP));
+  __ bx(LR);
+
+  return start;
+}
+#endif // USE_CRYPTO
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/stubRoutines_arm.cpp	2016-12-02 11:23:43.311970341 -0500
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "runtime/deoptimization.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+// Definitions of the ARM-specific StubRoutines static members; each starts out as NULL.
+#ifndef AARCH64
+address StubRoutines::Arm::_idiv_irem_entry = NULL;
+#endif // !AARCH64
+
+address StubRoutines::Arm::_partial_subtype_check = NULL;
+
+#ifndef AARCH64
+address StubRoutines::_atomic_load_long_entry = NULL;
+address StubRoutines::_atomic_store_long_entry = NULL;
+#endif // !AARCH64
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/stubRoutines_arm.hpp	2016-12-02 11:23:49.368313785 -0500
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_STUBROUTINES_ARM_HPP
+#define CPU_ARM_VM_STUBROUTINES_ARM_HPP
+
+// This file holds the platform specific parts of the StubRoutines
+// definition. See stubRoutines.hpp for a description on how to
+// extend it.
+
+enum platform_dependent_constants { // NOTE(review): presumably stub code buffer sizes - confirm against stubRoutines.hpp
+  code_size1 =  9000,           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 22000            // simply increase if too small (assembler will crash if too small)
+};
+
+class Arm { // holder for ARM-specific stub addresses, exposed through static accessors
+ friend class StubGenerator;
+ friend class VMStructs;
+
+ private:
+  // raw addresses; private, so only this class and the friends declared above can write them
+#ifndef AARCH64
+  static address _idiv_irem_entry;
+#endif // !AARCH64
+  static address _partial_subtype_check;
+
+ public:
+  // read-only accessors for the addresses above
+#ifndef AARCH64
+  static address idiv_irem_entry() { return _idiv_irem_entry; }
+#endif // !AARCH64
+  static address partial_subtype_check() { return _partial_subtype_check; }
+};
+
+  static bool returns_to_call_stub(address return_pc) { // true iff return_pc equals _call_stub_return_address
+    return return_pc == _call_stub_return_address;
+  }
+
+#ifndef AARCH64
+  static address _atomic_load_long_entry;
+  static address _atomic_store_long_entry;
+  // accessors for the atomic long load/store entries declared above
+  static address atomic_load_long_entry()                  { return _atomic_load_long_entry; }
+  static address atomic_store_long_entry()                 { return _atomic_store_long_entry; }
+#endif // !AARCH64
+
+
+#endif // CPU_ARM_VM_STUBROUTINES_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/templateInterpreterGenerator_arm.cpp	2016-12-02 11:23:55.432657681 -0500
@@ -0,0 +1,1976 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "interpreter/bytecodeHistogram.hpp"
+#include "interpreter/interp_masm.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "interpreter/templateInterpreterGenerator.hpp"
+#include "interpreter/templateTable.hpp"
+#include "oops/arrayOop.hpp"
+#include "oops/methodData.hpp"
+#include "oops/method.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/jvmtiExport.hpp"
+#include "prims/jvmtiThreadState.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/arguments.hpp"
+#include "runtime/deoptimization.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/synchronizer.hpp"
+#include "runtime/timer.hpp"
+#include "runtime/vframeArray.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/macros.hpp"
+
+// Size of interpreter code. If the value is too small, the interpreter
+// fails with a guarantee ("not enough space for interpreter generation");
+// increase it in that case.
+// Run with +PrintInterpreter to get the VM to print out the size.
+// Original note: "Max size with JVMTI" (presumably the largest size is reached with JVMTI - confirm).
+int TemplateInterpreter::InterpreterCodeSize = 180 * 1024;
+// Shorthand used throughout this file: "__ insn(...)" expands to "_masm->insn(...)".
+#define __ _masm->
+
+//------------------------------------------------------------------------------------------------------------------------
+
+address TemplateInterpreterGenerator::generate_slow_signature_handler() {   // returns the entry of the emitted stub
+  address entry = __ pc();
+  // The stub calls InterpreterRuntime::slow_signature_handler, reloads argument registers from the stack and returns via the saved LR.
+  // callee-save register for saving LR, shared with generate_native_entry
+  const Register Rsaved_ret_addr = AARCH64_ONLY(R21) NOT_AARCH64(Rtmp_save0);
+
+  __ mov(Rsaved_ret_addr, LR);
+
+  __ mov(R1, Rmethod);   // arguments for the runtime call: method,
+  __ mov(R2, Rlocals);   // locals pointer,
+  __ mov(R3, SP);        // and the current SP
+
+#ifdef AARCH64
+  // expand expr. stack and extended SP to avoid cutting SP in call_VM
+  __ mov(Rstack_top, SP);
+  __ str(Rstack_top, Address(FP, frame::interpreter_frame_extended_sp_offset * wordSize));
+  __ check_stack_top();
+
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::slow_signature_handler), R1, R2, R3, false);
+
+  __ ldp(ZR,      c_rarg1, Address(SP, 2*wordSize, post_indexed));   // first loaded word is discarded into ZR
+  __ ldp(c_rarg2, c_rarg3, Address(SP, 2*wordSize, post_indexed));
+  __ ldp(c_rarg4, c_rarg5, Address(SP, 2*wordSize, post_indexed));
+  __ ldp(c_rarg6, c_rarg7, Address(SP, 2*wordSize, post_indexed));
+
+  __ ldp_d(V0, V1, Address(SP, 2*wordSize, post_indexed));   // V0..V7 are always loaded
+  __ ldp_d(V2, V3, Address(SP, 2*wordSize, post_indexed));
+  __ ldp_d(V4, V5, Address(SP, 2*wordSize, post_indexed));
+  __ ldp_d(V6, V7, Address(SP, 2*wordSize, post_indexed));
+#else
+
+  // Safer to save R9 (when scratched) since callers may have been
+  // written assuming R9 survives. This is suboptimal but
+  // probably not important for this slow case call site.
+  // Note for R9 saving: slow_signature_handler may copy register
+  // arguments above the current SP (passed as R3). It is safe for
+  // call_VM to use push and pop to protect additional values on the
+  // stack if needed.
+  __ call_VM(CAST_FROM_FN_PTR(address, InterpreterRuntime::slow_signature_handler), true /* save R9 if needed*/);
+  __ add(SP, SP, wordSize);     // Skip R0
+  __ pop(RegisterSet(R1, R3));  // Load arguments passed in registers
+#ifdef __ABI_HARD__
+  // Few alternatives to an always-load-FP-registers approach:
+  // - parse method signature to detect FP arguments
+  // - keep a counter/flag on a stack indicating number of FP arguments in the method.
+  // The latter has been originally implemented and tested but a conditional path could
+  // eliminate any gain imposed by avoiding 8 double word loads.
+  __ fldmiad(SP, FloatRegisterSet(D0, 8), writeback);
+#endif // __ABI_HARD__
+#endif // AARCH64
+
+  __ ret(Rsaved_ret_addr);   // return through the copy of LR saved at entry
+
+  return entry;
+}
+
+
+//
+// Various method entries (that c++ and asm interpreter agree upon)
+//------------------------------------------------------------------------------------------------------------------------
+//
+//
+
+// Abstract method entry: reached on an attempt to execute an abstract method.
+// Empties the expression stack and calls InterpreterRuntime::throw_AbstractMethodError.
+address TemplateInterpreterGenerator::generate_abstract_entry(void) {
+  address entry_point = __ pc();   // entry address, returned to the caller
+
+#ifdef AARCH64
+  __ restore_sp_after_call(Rtemp);
+  __ restore_stack_top();
+#endif
+
+  __ empty_expression_stack();
+
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_AbstractMethodError));
+
+  DEBUG_ONLY(STOP("generate_abstract_entry");) // Should not reach here
+  return entry_point;
+}
+
+address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind) {
+  // Returns NULL in every case: no specialized math entry is emitted for any
+  // 'kind', so a vanilla entry is generated instead.
+  if (!InlineIntrinsics) return NULL; // Generate a vanilla entry
+
+  // TODO: ARM - math intrinsic entries are not implemented yet.
+  // The pc()/STOP("generate_math_entry") skeleton that used to follow this
+  // return could never execute and has been removed.
+  return NULL;
+}
+
+address TemplateInterpreterGenerator::generate_StackOverflowError_handler() {   // returns the handler's entry address
+  address entry = __ pc();
+
+  // Note: There should be a minimal interpreter frame set up when stack
+  // overflow occurs since we check explicitly for it now.
+  // Debug builds verify this below and stop if SP is above the maximal SP for the current FP.
+#ifdef ASSERT
+  { Label L;
+    __ sub(Rtemp, FP, - frame::interpreter_frame_monitor_block_top_offset * wordSize);
+    __ cmp(SP, Rtemp);  // Rtemp = maximal SP for current FP,
+                        //  (stack grows negative)
+    __ b(L, ls); // check if frame is complete
+    __ stop ("interpreter frame not set up");
+    __ bind(L);
+  }
+#endif // ASSERT
+
+  // Restore bcp under the assumption that the current frame is still
+  // interpreted
+  __ restore_bcp();
+
+  // expression stack must be empty before entering the VM if an exception
+  // happened
+  __ empty_expression_stack();
+
+  // throw exception
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_StackOverflowError));
+
+  __ should_not_reach_here();   // the runtime call above is not expected to return here
+
+  return entry;
+}
+
+address TemplateInterpreterGenerator::generate_ArrayIndexOutOfBounds_handler(const char* name) {   // returns the handler's entry
+  address entry = __ pc();
+
+  // index is in R4_ArrayIndexOutOfBounds_index
+
+  InlinedString Lname(name);   // 'name' becomes an inline literal, bound after the code below
+
+  // expression stack must be empty before entering the VM if an exception happened
+  __ empty_expression_stack();
+
+  // setup parameters
+  __ ldr_literal(R1, Lname);                    // R1 = name
+  __ mov(R2, R4_ArrayIndexOutOfBounds_index);   // R2 = offending index
+
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_ArrayIndexOutOfBoundsException), R1, R2);
+
+  __ nop(); // to avoid filling CPU pipeline with invalid instructions
+  __ nop();
+  __ should_not_reach_here();
+  __ bind_literal(Lname);   // place the literal for 'name' after the code
+
+  return entry;
+}
+
+address TemplateInterpreterGenerator::generate_ClassCastException_handler() {   // returns the handler's entry
+  address entry = __ pc();
+
+  // object is in R2_ClassCastException_obj
+
+  // expression stack must be empty before entering the VM if an exception
+  // happened
+  __ empty_expression_stack();
+
+  __ mov(R1, R2_ClassCastException_obj);   // pass the object as the runtime call argument
+  __ call_VM(noreg,
+             CAST_FROM_FN_PTR(address,
+                              InterpreterRuntime::throw_ClassCastException),
+             R1);
+
+  __ should_not_reach_here();   // the runtime call above is not expected to return here
+
+  return entry;
+}
+
+address TemplateInterpreterGenerator::generate_exception_handler_common(const char* name, const char* message, bool pass_oop) {
+  assert(!pass_oop || message == NULL, "either oop or message but not both");
+  address entry = __ pc();   // entry address of the emitted handler, returned to the caller
+  // name is passed to the runtime in R1; R2 carries either the object popped from TOS (pass_oop) or message (0 when NULL).
+  InlinedString Lname(name);
+  InlinedString Lmessage(message);
+
+  if (pass_oop) {
+    // object is at TOS
+    __ pop_ptr(R2);
+  }
+
+  // expression stack must be empty before entering the VM if an exception happened
+  __ empty_expression_stack();
+
+  // setup parameters
+  __ ldr_literal(R1, Lname);
+
+  if (pass_oop) {
+    __ call_VM(Rexception_obj, CAST_FROM_FN_PTR(address, InterpreterRuntime::create_klass_exception), R1, R2);
+  } else {
+    if (message != NULL) {
+      __ ldr_literal(R2, Lmessage);
+    } else {
+      __ mov(R2, 0);   // no message
+    }
+    __ call_VM(Rexception_obj, CAST_FROM_FN_PTR(address, InterpreterRuntime::create_exception), R1, R2);
+  }
+
+  // throw exception
+  __ b(Interpreter::throw_exception_entry());
+
+  __ nop(); // to avoid filling CPU pipeline with invalid instructions
+  __ nop();
+  __ bind_literal(Lname);
+  if (!pass_oop && (message != NULL)) {   // Lmessage is bound only on the path that loaded it
+    __ bind_literal(Lmessage);
+  }
+
+  return entry;
+}
+
+address TemplateInterpreterGenerator::generate_continuation_for(TosState state) {   // 'state' is ignored
+  // Not used.
+  STOP("generate_continuation_for");
+  return NULL;   // no entry is produced
+}
+
+address TemplateInterpreterGenerator::generate_return_entry_for(TosState state, int step, size_t index_size) {   // returns the entry address
+  address entry = __ pc();
+
+  __ interp_verify_oop(R0_tos, state, __FILE__, __LINE__);
+
+#ifdef AARCH64
+  __ restore_sp_after_call(Rtemp);  // Restore SP to extended SP
+  __ restore_stack_top();
+#else
+  // Restore stack bottom in case i2c adjusted stack
+  __ ldr(SP, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+  // and NULL it as marker that SP is now tos until next java call
+  __ mov(Rtemp, (int)NULL_WORD);
+  __ str(Rtemp, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+#endif // AARCH64
+
+  __ restore_method();
+  __ restore_bcp();
+  __ restore_dispatch();
+  __ restore_locals();
+
+  const Register Rcache = R2_tmp;
+  const Register Rindex = R3_tmp;
+  __ get_cache_and_index_at_bcp(Rcache, Rindex, 1, index_size);
+
+  __ add(Rtemp, Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord));   // address the cache entry selected by Rindex
+  __ ldrb(Rtemp, Address(Rtemp, ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::flags_offset()));   // one byte of the entry's flags
+  __ check_stack_top();
+  __ add(Rstack_top, Rstack_top, AsmOperand(Rtemp, lsl, Interpreter::logStackElementSize));   // advance Rstack_top by that many stack elements (presumably the call's parameters - confirm)
+
+#ifndef AARCH64
+  __ convert_retval_to_tos(state);
+#endif // !AARCH64
+
+  __ dispatch_next(state, step);   // continue with the bytecode 'step' bytes ahead
+
+  return entry;
+}
+
+
+address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state, int step) {   // returns the entry address
+  address entry = __ pc();
+
+  __ interp_verify_oop(R0_tos, state, __FILE__, __LINE__);
+
+#ifdef AARCH64
+  __ restore_sp_after_call(Rtemp);  // Restore SP to extended SP
+  __ restore_stack_top();
+#else
+  // The stack is not extended by deopt but we must NULL last_sp as this
+  // entry is like a "return".
+  __ mov(Rtemp, 0);
+  __ str(Rtemp, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+#endif // AARCH64
+
+  __ restore_method();
+  __ restore_bcp();
+  __ restore_dispatch();
+  __ restore_locals();
+
+  // handle exceptions
+  { Label L;
+    __ ldr(Rtemp, Address(Rthread, Thread::pending_exception_offset()));
+    __ cbz(Rtemp, L);   // nothing pending: skip the throw
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_pending_exception));
+    __ should_not_reach_here();
+    __ bind(L);
+  }
+
+  __ dispatch_next(state, step);   // continue with the bytecode 'step' bytes ahead
+
+  return entry;
+}
+
+address TemplateInterpreterGenerator::generate_result_handler_for(BasicType type) {   // AArch64: emitted handler entry; 32-bit ARM: marker value only
+#ifdef AARCH64
+  address entry = __ pc();
+  switch (type) {
+    case T_BOOLEAN:
+      __ tst(R0, 0xff);   // normalize: R0 = 1 if any of the low 8 bits is set, else 0
+      __ cset(R0, ne);
+      break;
+    case T_CHAR   : __ zero_extend(R0, R0, 16);  break;
+    case T_BYTE   : __ sign_extend(R0, R0,  8);  break;
+    case T_SHORT  : __ sign_extend(R0, R0, 16);  break;
+    case T_INT    : // fall through
+    case T_LONG   : // fall through
+    case T_VOID   : // fall through
+    case T_FLOAT  : // fall through
+    case T_DOUBLE : /* nothing to do */          break;
+    case T_OBJECT :
+      // retrieve result from frame
+      __ ldr(R0, Address(FP, frame::interpreter_frame_oop_temp_offset * wordSize));
+      // and verify it
+      __ verify_oop(R0);
+      break;
+    default       : ShouldNotReachHere();
+  }
+  __ ret();
+  return entry;
+#else
+  // Result handlers are not used on 32-bit ARM
+  // since the returned value is already in appropriate format.
+  __ should_not_reach_here();  // to avoid empty code block
+
+  // The result handler non-zero indicates an object is returned and this is
+  // used in the native entry code.
+  return type == T_OBJECT ? (address)(-1) : NULL;   // a marker, not a callable address
+#endif // AARCH64
+}
+
+address TemplateInterpreterGenerator::generate_safept_entry_for(TosState state, address runtime_entry) {   // returns the entry address
+  address entry = __ pc();
+  __ push(state);                     // push the cached top-of-stack value for 'state'
+  __ call_VM(noreg, runtime_entry);   // call the supplied runtime routine
+
+  // load current bytecode
+  __ ldrb(R3_bytecode, Address(Rbcp));
+  __ dispatch_only_normal(vtos);      // dispatch in vtos state, since the value was pushed above
+  return entry;
+}
+
+
+// Helpers for commoning out cases in the various type of method entries.
+//
+
+// increment invocation count & check for overflow
+//
+// Note: checking for negative value instead of overflow
+//       so we have a 'sticky' overflow test
+//
+// In: Rmethod.
+//
+// Uses R0, R1, Rtemp.
+//
+void TemplateInterpreterGenerator::generate_counter_incr(Label* overflow,
+                                                 Label* profile_method,
+                                                 Label* profile_method_continue) {   // the two profile labels may be NULL (see the check below)
+  Label done;
+  const Register Rcounters = Rtemp;
+  const Address invocation_counter(Rcounters,
+                MethodCounters::invocation_counter_offset() +
+                InvocationCounter::counter_offset());
+
+  // Note: In tiered we increment either counters in MethodCounters* or
+  // in MDO depending if we're profiling or not.
+  if (TieredCompilation) {
+    int increment = InvocationCounter::count_increment;
+    Label no_mdo;
+    if (ProfileInterpreter) {
+      // Are we profiling?
+      __ ldr(R1_tmp, Address(Rmethod, Method::method_data_offset()));
+      __ cbz(R1_tmp, no_mdo);   // no MDO: use the MethodCounters path below
+      // Increment counter in the MDO
+      const Address mdo_invocation_counter(R1_tmp,
+                    in_bytes(MethodData::invocation_counter_offset()) +
+                    in_bytes(InvocationCounter::counter_offset()));
+      const Address mask(R1_tmp, in_bytes(MethodData::invoke_mask_offset()));
+      __ increment_mask_and_jump(mdo_invocation_counter, increment, mask, R0_tmp, Rtemp, eq, overflow);
+      __ b(done);
+    }
+    __ bind(no_mdo);
+    __ get_method_counters(Rmethod, Rcounters, done);   // 'done' is passed as the helper's label argument - presumably taken when no counters are available; confirm
+    const Address mask(Rcounters, in_bytes(MethodCounters::invoke_mask_offset()));
+    __ increment_mask_and_jump(invocation_counter, increment, mask, R0_tmp, R1_tmp, eq, overflow);
+    __ bind(done);
+  } else { // not TieredCompilation
+    const Address backedge_counter(Rcounters,
+                  MethodCounters::backedge_counter_offset() +
+                  InvocationCounter::counter_offset());
+
+    const Register Ricnt = R0_tmp;  // invocation counter
+    const Register Rbcnt = R1_tmp;  // backedge counter
+
+    __ get_method_counters(Rmethod, Rcounters, done);
+
+    if (ProfileInterpreter) {
+      const Register Riic = R1_tmp;   // same register as Rbcnt; Rbcnt is only loaded after this block
+      __ ldr_s32(Riic, Address(Rcounters, MethodCounters::interpreter_invocation_counter_offset()));
+      __ add(Riic, Riic, 1);
+      __ str_32(Riic, Address(Rcounters, MethodCounters::interpreter_invocation_counter_offset()));
+    }
+
+    // Update standard invocation counters
+
+    __ ldr_u32(Ricnt, invocation_counter);
+    __ ldr_u32(Rbcnt, backedge_counter);
+
+    __ add(Ricnt, Ricnt, InvocationCounter::count_increment);
+
+#ifdef AARCH64
+    __ andr(Rbcnt, Rbcnt, (unsigned int)InvocationCounter::count_mask_value); // mask out the status bits
+#else
+    __ bic(Rbcnt, Rbcnt, ~InvocationCounter::count_mask_value); // mask out the status bits
+#endif // AARCH64
+
+    __ str_32(Ricnt, invocation_counter);            // save invocation count
+    __ add(Ricnt, Ricnt, Rbcnt);                     // add both counters
+
+    // profile_method is non-null only for interpreted method so
+    // profile_method != NULL == !native_call
+    // BytecodeInterpreter only calls for native so code is elided.
+
+    if (ProfileInterpreter && profile_method != NULL) {
+      assert(profile_method_continue != NULL, "should be non-null");
+
+      // Test to see if we should create a method data oop
+      // Reuse R1_tmp as we don't need backedge counters anymore.
+      Address profile_limit(Rcounters, in_bytes(MethodCounters::interpreter_profile_limit_offset()));
+      __ ldr_s32(R1_tmp, profile_limit);
+      __ cmp_32(Ricnt, R1_tmp);
+      __ b(*profile_method_continue, lt);   // combined count below the profile limit
+
+      // if no method data exists, go to profile_method
+      __ test_method_data_pointer(R1_tmp, *profile_method);
+    }
+
+    Address invoke_limit(Rcounters, in_bytes(MethodCounters::interpreter_invocation_limit_offset()));
+    __ ldr_s32(R1_tmp, invoke_limit);
+    __ cmp_32(Ricnt, R1_tmp);
+    __ b(*overflow, hs);   // combined count reached the invocation limit
+    __ bind(done);
+  }
+}
+
+void TemplateInterpreterGenerator::generate_counter_overflow(Label& do_continue) {   // do_continue: where to resume after the runtime call
+  // InterpreterRuntime::frequency_counter_overflow takes one argument
+  // indicating if the counter overflow occurs at a backwards branch (non-NULL bcp).
+  // The call returns the address of the verified entry point for the method or NULL
+  // if the compilation did not complete (either went background or bailed out).
+  __ mov(R1, (int)false);   // argument is false/0: not a backwards branch
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::frequency_counter_overflow), R1);   // result is discarded (noreg)
+
+  // jump to the interpreted entry.
+  __ b(do_continue);
+}
+
+void TemplateInterpreterGenerator::generate_stack_overflow_check(void) {
+  // Check if we've got enough room on the stack for
+  //  - overhead;
+  //  - locals;
+  //  - expression stack.
+  //
+  // Registers on entry:
+  //
+  // R3 = number of additional locals
+  // R11 = max expression stack slots (AArch64 only)
+  // Rthread
+  // Rmethod
+  // Registers used: R0, R1, R2, Rtemp.
+
+  const Register Radditional_locals = R3;
+  const Register RmaxStack = AARCH64_ONLY(R11) NOT_AARCH64(R2);
+
+  // monitor entry size
+  const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;
+
+  // total overhead size: entry_size + (saved registers, thru expr stack bottom).
+  // be sure to change this if you add/subtract anything to/from the overhead area
+  const int overhead_size = (frame::sender_sp_offset - frame::interpreter_frame_initial_sp_offset)*wordSize + entry_size;
+
+  // Pages reserved for VM runtime calls and subsequent Java calls.
+  const int reserved_pages = JavaThread::stack_shadow_zone_size();   // NOTE(review): added to byte counts below, so presumably a size rather than a page count - confirm
+
+  // Thread::stack_size() includes guard pages, and they should not be touched.
+  const int guard_pages = JavaThread::stack_guard_zone_size();       // NOTE(review): same remark as for reserved_pages
+
+  __ ldr(R0, Address(Rthread, Thread::stack_base_offset()));
+  __ ldr(R1, Address(Rthread, Thread::stack_size_offset()));
+#ifndef AARCH64
+  __ ldr(Rtemp, Address(Rmethod, Method::const_offset()));
+  __ ldrh(RmaxStack, Address(Rtemp, ConstMethod::max_stack_offset()));   // 32-bit ARM loads max stack here; AArch64 receives it in R11
+#endif // !AARCH64
+  __ sub_slow(Rtemp, SP, overhead_size + reserved_pages + guard_pages + Method::extra_stack_words());
+
+  // reserve space for additional locals
+  __ sub(Rtemp, Rtemp, AsmOperand(Radditional_locals, lsl, Interpreter::logStackElementSize));
+
+  // stack size
+  __ sub(R0, R0, R1);   // R0 = stack base - stack size
+
+  // reserve space for expression stack
+  __ sub(Rtemp, Rtemp, AsmOperand(RmaxStack, lsl, Interpreter::logStackElementSize));
+
+  __ cmp(Rtemp, R0);   // compare the projected SP against that bound
+
+#ifdef AARCH64
+  Label L;
+  __ b(L, hi);   // enough room: skip the throw
+  __ mov(SP, Rsender_sp);  // restore SP
+  __ b(StubRoutines::throw_StackOverflowError_entry());
+  __ bind(L);
+#else
+  __ mov(SP, Rsender_sp, ls);  // restore SP
+  __ b(StubRoutines::throw_StackOverflowError_entry(), ls);   // both instructions are conditional on 'ls'
+#endif // AARCH64
+}
+
+
+// Allocate monitor and lock method (asm interpreter)
+// Reserves one monitor entry below Rstack_top, stores the lock object in it and calls lock_object on it.
+void TemplateInterpreterGenerator::lock_method() {
+  // synchronize method
+
+  const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;
+  assert ((entry_size % StackAlignmentInBytes) == 0, "should keep stack alignment");
+
+  #ifdef ASSERT
+    { Label L;
+      __ ldr_u32(Rtemp, Address(Rmethod, Method::access_flags_offset()));
+      __ tbnz(Rtemp, JVM_ACC_SYNCHRONIZED_BIT, L);   // the method must have the synchronized flag set
+      __ stop("method doesn't need synchronization");
+      __ bind(L);
+    }
+  #endif // ASSERT
+
+  // get synchronization object
+  { Label done;
+    __ ldr_u32(Rtemp, Address(Rmethod, Method::access_flags_offset()));
+#ifdef AARCH64
+    __ ldr(R0, Address(Rlocals, Interpreter::local_offset_in_bytes(0))); // get receiver (assume this is frequent case)
+    __ tbz(Rtemp, JVM_ACC_STATIC_BIT, done);
+#else
+    __ tst(Rtemp, JVM_ACC_STATIC);
+    __ ldr(R0, Address(Rlocals, Interpreter::local_offset_in_bytes(0)), eq); // get receiver (assume this is frequent case)
+    __ b(done, eq);
+#endif // AARCH64
+    __ load_mirror(R0, Rmethod, Rtemp);   // static method: lock on the mirror instead of a receiver
+    __ bind(done);
+  }
+
+  // add space for monitor & lock
+
+#ifdef AARCH64
+  __ check_extended_sp(Rtemp);
+  __ sub(SP, SP, entry_size);                  // adjust extended SP
+  __ mov(Rtemp, SP);
+  __ str(Rtemp, Address(FP, frame::interpreter_frame_extended_sp_offset * wordSize));
+#endif // AARCH64
+
+  __ sub(Rstack_top, Rstack_top, entry_size);
+  __ check_stack_top_on_expansion();
+                                              // add space for a monitor entry
+  __ str(Rstack_top, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+                                              // set new monitor block top
+  __ str(R0, Address(Rstack_top, BasicObjectLock::obj_offset_in_bytes()));
+                                              // store object
+  __ mov(R1, Rstack_top);                     // monitor entry address
+  __ lock_object(R1);
+}
+
+#ifdef AARCH64
+
+//
+// Generate a fixed interpreter frame. This is identical setup for interpreted methods
+// and for native methods hence the shared code.
+//
+// On entry:
+//   R10 = ConstMethod
+//   R11 = max expr. stack (in slots), if !native_call
+//
+// On exit:
+//   Rbcp, Rstack_top are initialized, SP is extended
+//
+void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {   // AArch64 variant
+  // Incoming registers
+  const Register RconstMethod = R10;
+  const Register RmaxStack = R11;
+  // Temporary registers
+  const Register RextendedSP = R0;
+  const Register Rcache = R1;
+  const Register Rmdp = ProfileInterpreter ? R2 : ZR;   // without profiling the mdx slot is stored from ZR
+
+  // Generates the following stack layout (stack grows up in this picture):
+  //
+  // [ expr. stack bottom ]
+  // [ saved Rbcp         ]
+  // [ current Rlocals    ]
+  // [ cache              ]
+  // [ mdx                ]
+  // [ mirror             ]
+  // [ Method*            ]
+  // [ extended SP        ]
+  // [ expr. stack top    ]
+  // [ sender_sp          ]
+  // [ saved FP           ] <--- FP
+  // [ saved LR           ]
+
+  // initialize fixed part of activation frame
+  __ stp(FP, LR, Address(SP, -2*wordSize, pre_indexed));
+  __ mov(FP, SP);                                     // establish new FP
+
+  // setup Rbcp
+  if (native_call) {
+    __ mov(Rbcp, ZR);                                 // bcp = 0 for native calls
+  } else {
+    __ add(Rbcp, RconstMethod, in_bytes(ConstMethod::codes_offset())); // get codebase
+  }
+
+  // Rstack_top & RextendedSP
+  __ sub(Rstack_top, SP, 10*wordSize);   // skip the ten frame slots stored below FP further down
+  if (native_call) {
+    __ sub(RextendedSP, Rstack_top, round_to(wordSize, StackAlignmentInBytes));    // reserve 1 slot for exception handling
+  } else {
+    __ sub(RextendedSP, Rstack_top, AsmOperand(RmaxStack, lsl, Interpreter::logStackElementSize));   // room for RmaxStack expression stack slots
+    __ align_reg(RextendedSP, RextendedSP, StackAlignmentInBytes);
+  }
+  __ mov(SP, RextendedSP);
+  __ check_stack_top();
+
+  // Load Rmdp
+  if (ProfileInterpreter) {
+    __ ldr(Rtemp, Address(Rmethod, Method::method_data_offset()));
+    __ tst(Rtemp, Rtemp);
+    __ add(Rtemp, Rtemp, in_bytes(MethodData::data_offset()));
+    __ csel(Rmdp, ZR, Rtemp, eq);   // Rmdp = ZR when the method has no MethodData, else the data address
+  }
+
+  // Load Rcache
+  __ ldr(Rtemp, Address(RconstMethod, ConstMethod::constants_offset()));
+  __ ldr(Rcache, Address(Rtemp, ConstantPool::cache_offset_in_bytes()));
+  // Get mirror and store it in the frame as GC root for this Method*
+  __ load_mirror(Rtemp, Rmethod, Rtemp);
+
+  // Build fixed frame
+  __ stp(Rstack_top, Rbcp, Address(FP, -10*wordSize));
+  __ stp(Rlocals, Rcache,  Address(FP,  -8*wordSize));
+  __ stp(Rmdp, Rtemp,          Address(FP,  -6*wordSize));   // Rtemp holds the mirror loaded above
+  __ stp(Rmethod, RextendedSP, Address(FP,  -4*wordSize));
+  __ stp(ZR, Rsender_sp,   Address(FP,  -2*wordSize));       // the stack top slot starts out as zero
+  assert(frame::interpreter_frame_initial_sp_offset == -10, "interpreter frame broken");
+  assert(frame::interpreter_frame_stack_top_offset  == -2, "stack top broken");
+}
+
+#else // AARCH64
+
+//
+// Generate a fixed interpreter frame. This is identical setup for interpreted methods
+// and for native methods hence the shared code.
+
+void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {   // 32-bit ARM variant
+  // Generates the following stack layout:
+  // [ expr. stack bottom ]
+  // [ saved Rbcp         ]
+  // [ current Rlocals    ]
+  // [ cache              ]
+  // [ mdx                ]
+  // [ mirror             ]
+  // [ Method*            ]
+  // [ last_sp            ]
+  // [ sender_sp          ]
+  // [ saved FP           ] <--- FP
+  // [ saved LR           ]
+
+  // initialize fixed part of activation frame
+  __ push(LR);                                        // save return address
+  __ push(FP);                                        // save FP
+  __ mov(FP, SP);                                     // establish new FP
+
+  __ push(Rsender_sp);
+
+  __ mov(R0, 0);                                      // R0 stays zero; it is pushed again below
+  __ push(R0);                                        // leave last_sp as null
+
+  // setup Rbcp
+  if (native_call) {
+    __ mov(Rbcp, 0);                                  // bcp = 0 for native calls
+  } else {
+    __ ldr(Rtemp, Address(Rmethod, Method::const_offset())); // get ConstMethod*
+    __ add(Rbcp, Rtemp, ConstMethod::codes_offset()); // get codebase
+  }
+
+  __ push(Rmethod);                                    // save Method*
+  // Get mirror and store it in the frame as GC root for this Method*
+  __ load_mirror(Rtemp, Rmethod, Rtemp);
+  __ push(Rtemp);
+
+  if (ProfileInterpreter) {
+    __ ldr(Rtemp, Address(Rmethod, Method::method_data_offset()));
+    __ tst(Rtemp, Rtemp);
+    __ add(Rtemp, Rtemp, in_bytes(MethodData::data_offset()), ne);   // only when MethodData exists; otherwise Rtemp stays zero
+    __ push(Rtemp);                                    // set the mdp (method data pointer)
+  } else {
+    __ push(R0);                                       // no profiling: mdx slot is zero
+  }
+
+  __ ldr(Rtemp, Address(Rmethod, Method::const_offset()));
+  __ ldr(Rtemp, Address(Rtemp, ConstMethod::constants_offset()));
+  __ ldr(Rtemp, Address(Rtemp, ConstantPool::cache_offset_in_bytes()));
+  __ push(Rtemp);                                      // set constant pool cache
+  __ push(Rlocals);                                    // set locals pointer
+  __ push(Rbcp);                                       // set bcp
+  __ push(R0);                                         // reserve word for pointer to expression stack bottom
+  __ str(SP, Address(SP, 0));                          // set expression stack bottom
+}
+
+#endif // AARCH64
+
+// End of helpers
+
+//------------------------------------------------------------------------------------------------------------------------
+// Entry points
+//
+// Here we generate the various kind of entries into the interpreter.
+// The two main entry types are generic bytecode methods and native call methods.
+// These both come in synchronized and non-synchronized versions but the
+// frame layout they create is very similar. The other method entry
+// types are really just special purpose entries that are really entry
+// and interpretation all in one. These are for trivial methods like
+// accessor, empty, or special math methods.
+//
+// When control flow reaches any of the entry types for the interpreter
+// the following holds ->
+//
+// Arguments:
+//
+// Rmethod: Method*
+// Rthread: thread
+// Rsender_sp:  sender sp
+// Rparams (SP on 32-bit ARM): pointer to method parameters
+//
+// LR: return address
+//
+// Stack layout immediately at entry
+//
+// [ optional padding(*)] <--- SP (AArch64)
+// [ parameter n        ] <--- Rparams (SP on 32-bit ARM)
+//   ...
+// [ parameter 1        ]
+// [ expression stack   ] (caller's java expression stack)
+
+// Assuming that we don't go to one of the trivial specialized
+// entries the stack will look like below when we are ready to execute
+// the first bytecode (or call the native routine). The register usage
+// will be as the template based interpreter expects.
+//
+// local variables follow incoming parameters immediately; i.e.
+// the return address is saved at the end of the locals.
+//
+// [ reserved stack (*) ] <--- SP (AArch64)
+// [ expr. stack        ] <--- Rstack_top (SP on 32-bit ARM)
+// [ monitor entry      ]
+//   ...
+// [ monitor entry      ]
+// [ expr. stack bottom ]
+// [ saved Rbcp         ]
+// [ current Rlocals    ]
+// [ cache              ]
+// [ mdx                ]
+// [ mirror             ]
+// [ Method*            ]
+//
+// 32-bit ARM:
+// [ last_sp            ]
+//
+// AArch64:
+// [ extended SP (*)    ]
+// [ stack top (*)      ]
+//
+// [ sender_sp          ]
+// [ saved FP           ] <--- FP
+// [ saved LR           ]
+// [ optional padding(*)]
+// [ local variable m   ]
+//   ...
+// [ local variable 1   ]
+// [ parameter n        ]
+//   ...
+// [ parameter 1        ] <--- Rlocals
+//
+// (*) - AArch64 only
+//
+
+address TemplateInterpreterGenerator::generate_Reference_get_entry(void) {   // returns the entry, or NULL unless built with INCLUDE_ALL_GCS and UseG1GC is set
+#if INCLUDE_ALL_GCS
+  if (UseG1GC) {
+    // Code: _aload_0, _getfield, _areturn
+    // parameter size = 1
+    //
+    // The code that gets generated by this routine is split into 2 parts:
+    //    1. The "intrinsified" code for G1 (or any SATB based GC),
+    //    2. The slow path - which is an expansion of the regular method entry.
+    //
+    // Notes:-
+    // * In the G1 code we do not check whether we need to block for
+    //   a safepoint. If G1 is enabled then we must execute the specialized
+    //   code for Reference.get (except when the Reference object is null)
+    //   so that we can log the value in the referent field with an SATB
+    //   update buffer.
+    //   If the code for the getfield template is modified so that the
+    //   G1 pre-barrier code is executed when the current method is
+    //   Reference.get() then going through the normal method entry
+    //   will be fine.
+    // * The G1 code can, however, check the receiver object (the instance
+    //   of java.lang.Reference) and jump to the slow path if null. If the
+    //   Reference object is null then we obviously cannot fetch the referent
+    //   and so we don't need to call the G1 pre-barrier. Thus we can use the
+    //   regular method entry code to generate the NPE.
+    //
+    // This code is based on generate_accessor_entry.
+    //
+    // Rmethod: Method*
+    // Rthread: thread
+    // Rsender_sp: sender sp, must be preserved for slow path, set SP to it on fast path
+    // Rparams: parameters
+
+    address entry = __ pc();
+    Label slow_path;
+    const Register Rthis = R0;
+    const Register Rret_addr = Rtmp_save1;
+    assert_different_registers(Rthis, Rret_addr, Rsender_sp);
+
+    const int referent_offset = java_lang_ref_Reference::referent_offset;
+    guarantee(referent_offset > 0, "referent offset not initialized");
+
+    // Check if local 0 != NULL
+    // If the receiver is null then it is OK to jump to the slow path.
+    __ ldr(Rthis, Address(Rparams));
+    __ cbz(Rthis, slow_path);
+
+    // Generate the G1 pre-barrier code to log the value of
+    // the referent field in an SATB buffer.
+
+    // Load the value of the referent field.
+    __ load_heap_oop(R0, Address(Rthis, referent_offset));   // Rthis is R0, so the receiver is overwritten by the referent
+
+    // Preserve LR
+    __ mov(Rret_addr, LR);
+
+    __ g1_write_barrier_pre(noreg,   // store_addr
+                            noreg,   // new_val
+                            R0,      // pre_val
+                            Rtemp,   // tmp1
+                            R1_tmp); // tmp2
+
+    // _areturn
+    __ mov(SP, Rsender_sp);
+    __ ret(Rret_addr);   // return through the preserved LR; the referent is in R0
+
+    // generate a vanilla interpreter entry as the slow path
+    __ bind(slow_path);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
+    return entry;
+  }
+#endif // INCLUDE_ALL_GCS
+
+  // If G1 is not enabled then attempt to go through the normal entry point
+  return NULL;
+}
+
+// Not supported: the CRC32/CRC32C entry generators below emit no code and simply return NULL.
+address TemplateInterpreterGenerator::generate_CRC32_update_entry() { return NULL; }
+address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
+address TemplateInterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
+
+// Interpreter stub for calling a native method. (asm interpreter)
+// This sets up a somewhat different looking stack for calling the native method
+// than the typical interpreter frame setup.
+// synchronized: if true, the stub calls lock_method() before the native call and
+// unlock_object() after it. Returns the address of the first generated instruction.
+
+address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) {
+  // determine code generation flags
+  bool inc_counter  = UseCompiler || CountCompiledCalls || LogTouchedMethods;
+
+  // Incoming registers:
+  //
+  // Rmethod: Method*
+  // Rthread: thread
+  // Rsender_sp: sender sp
+  // Rparams: parameters
+
+  address entry_point = __ pc();
+
+  // Register allocation (Rresult_handler aliases Rsig_handler on AArch64 and Rsize_of_params (R6) on 32-bit ARM)
+  const Register Rsize_of_params = AARCH64_ONLY(R20) NOT_AARCH64(R6);
+  const Register Rsig_handler    = AARCH64_ONLY(R21) NOT_AARCH64(Rtmp_save0 /* R4 */);
+  const Register Rnative_code    = AARCH64_ONLY(R22) NOT_AARCH64(Rtmp_save1 /* R5 */);
+  const Register Rresult_handler = AARCH64_ONLY(Rsig_handler) NOT_AARCH64(R6);
+
+#ifdef AARCH64
+  const Register RconstMethod = R10; // also used in generate_fixed_frame (should match)
+  const Register Rsaved_result = Rnative_code;
+  const FloatRegister Dsaved_result = V8;
+#else
+  const Register Rsaved_result_lo = Rtmp_save0;  // R4
+  const Register Rsaved_result_hi = Rtmp_save1;  // R5
+  FloatRegister saved_result_fp;
+#endif // AARCH64
+
+  // Load size_of_parameters from the method's ConstMethod into Rsize_of_params
+#ifdef AARCH64
+  __ ldr(RconstMethod, Address(Rmethod, Method::const_offset()));
+  __ ldrh(Rsize_of_params,  Address(RconstMethod, ConstMethod::size_of_parameters_offset()));
+#else
+  __ ldr(Rsize_of_params, Address(Rmethod, Method::const_offset()));
+  __ ldrh(Rsize_of_params,  Address(Rsize_of_params, ConstMethod::size_of_parameters_offset()));
+#endif // AARCH64
+
+  // native calls don't need the stack size check since they have no expression stack
+  // and the arguments are already on the stack and we only add a handful of words
+  // to the stack
+
+  // compute beginning of parameters (Rlocals)
+  __ sub(Rlocals, Rparams, wordSize);
+  __ add(Rlocals, Rlocals, AsmOperand(Rsize_of_params, lsl, Interpreter::logStackElementSize));
+
+#ifdef AARCH64
+  int extra_stack_reserve = 2*wordSize; // extra space for oop_temp
+  if(__ can_post_interpreter_events()) {
+    // extra space for saved results
+    extra_stack_reserve += 2*wordSize;
+  }
+  // reserve extra stack space and nullify oop_temp slot
+  __ stp(ZR, ZR, Address(SP, -extra_stack_reserve, pre_indexed));
+#else
+  // reserve stack space for oop_temp and initialize it to zero
+  __ mov(R0, 0);
+  __ push(R0);
+#endif // AARCH64
+
+  generate_fixed_frame(true); // Note: R9 is now saved in the frame
+
+  // make sure method is native & not abstract
+#ifdef ASSERT
+  __ ldr_u32(Rtemp, Address(Rmethod, Method::access_flags_offset()));
+  {
+    Label L;
+    __ tbnz(Rtemp, JVM_ACC_NATIVE_BIT, L);
+    __ stop("tried to execute non-native method as native");
+    __ bind(L);
+  }
+  { Label L;
+    __ tbz(Rtemp, JVM_ACC_ABSTRACT_BIT, L);
+    __ stop("tried to execute abstract method in interpreter");
+    __ bind(L);
+  }
+#endif
+
+  // increment invocation count & check for overflow
+  Label invocation_counter_overflow;
+  if (inc_counter) {
+    if (synchronized) {
+      // Avoid unlocking method's monitor in case of exception, as it has not
+      // been locked yet.
+      __ set_do_not_unlock_if_synchronized(true, Rtemp);
+    }
+    generate_counter_incr(&invocation_counter_overflow, NULL, NULL);
+  }
+  // continue_after_compile is handed to generate_counter_overflow() at the end of this stub
+  Label continue_after_compile;
+  __ bind(continue_after_compile);
+
+  if (inc_counter && synchronized) {
+    __ set_do_not_unlock_if_synchronized(false, Rtemp);
+  }
+
+  // check for synchronized methods
+  // Must happen AFTER invocation_counter check and stack overflow check,
+  // so method is not locked if overflows.
+  //
+  if (synchronized) {
+    lock_method();
+  } else {
+    // no synchronization necessary
+#ifdef ASSERT
+      { Label L;
+        __ ldr_u32(Rtemp, Address(Rmethod, Method::access_flags_offset()));
+        __ tbz(Rtemp, JVM_ACC_SYNCHRONIZED_BIT, L);
+        __ stop("method needs synchronization");
+        __ bind(L);
+      }
+#endif
+  }
+
+  // start execution
+#ifdef ASSERT
+  { Label L;
+    __ ldr(Rtemp, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+    __ cmp(Rtemp, Rstack_top);
+    __ b(L, eq);
+    __ stop("broken stack frame setup in interpreter");
+    __ bind(L);
+  }
+#endif
+  __ check_extended_sp(Rtemp);
+
+  // jvmti/dtrace support
+  __ notify_method_entry();
+#if R9_IS_SCRATCHED
+  __ restore_method();
+#endif
+
+  {
+    Label L;
+    __ ldr(Rsig_handler, Address(Rmethod, Method::signature_handler_offset()));
+    __ cbnz(Rsig_handler, L);
+    __ mov(R1, Rmethod);
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::prepare_native_call), R1, true);
+    __ ldr(Rsig_handler, Address(Rmethod, Method::signature_handler_offset()));
+    __ bind(L);
+  }
+
+  {
+    Label L;
+    __ ldr(Rnative_code, Address(Rmethod, Method::native_function_offset()));
+    __ cbnz(Rnative_code, L);
+    __ mov(R1, Rmethod);
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::prepare_native_call), R1);
+    __ ldr(Rnative_code, Address(Rmethod, Method::native_function_offset()));
+    __ bind(L);
+  }
+
+  // Allocate stack space for arguments
+
+#ifdef AARCH64
+  __ sub(Rtemp, SP, Rsize_of_params, ex_uxtw, LogBytesPerWord);
+  __ align_reg(SP, Rtemp, StackAlignmentInBytes);
+
+  // Allocate more stack space to accommodate all arguments passed on GP and FP registers:
+  // 8 * wordSize for GPRs
+  // 8 * wordSize for FPRs
+  int reg_arguments = round_to(8*wordSize + 8*wordSize, StackAlignmentInBytes);
+#else
+
+  // C functions need aligned stack
+  __ bic(SP, SP, StackAlignmentInBytes - 1);
+  // Multiply by BytesPerLong instead of BytesPerWord, because calling convention
+  // may require empty slots due to long alignment, e.g. func(int, jlong, int, jlong)
+  __ sub(SP, SP, AsmOperand(Rsize_of_params, lsl, LogBytesPerLong));
+
+#ifdef __ABI_HARD__
+  // Allocate more stack space to accommodate all GP as well as FP registers:
+  // 4 * wordSize
+  // 8 * BytesPerLong
+  int reg_arguments = round_to((4*wordSize) + (8*BytesPerLong), StackAlignmentInBytes);
+#else
+  // Reserve at least 4 words on the stack for loading
+  // of parameters passed on registers (R0-R3).
+  // See generate_slow_signature_handler().
+  // It is also used for JNIEnv & class additional parameters.
+  int reg_arguments = 4 * wordSize;
+#endif // __ABI_HARD__
+#endif // AARCH64
+
+  __ sub(SP, SP, reg_arguments);
+
+
+  // Note: signature handler blows R4 (32-bit ARM) or R21 (AArch64) besides all scratch registers.
+  // See AbstractInterpreterGenerator::generate_slow_signature_handler().
+  __ call(Rsig_handler);
+#if R9_IS_SCRATCHED
+  __ restore_method();
+#endif
+  __ mov(Rresult_handler, R0);
+
+  // Pass JNIEnv and mirror for static methods
+  {
+    Label L;
+    __ ldr_u32(Rtemp, Address(Rmethod, Method::access_flags_offset()));
+    __ add(R0, Rthread, in_bytes(JavaThread::jni_environment_offset()));
+    __ tbz(Rtemp, JVM_ACC_STATIC_BIT, L);
+    __ load_mirror(Rtemp, Rmethod, Rtemp);
+    __ add(R1, FP, frame::interpreter_frame_oop_temp_offset * wordSize);
+    __ str(Rtemp, Address(R1, 0));
+    __ bind(L);
+  }
+
+  __ set_last_Java_frame(SP, FP, true, Rtemp);
+
+  // Changing state to _thread_in_native must be the last thing to do
+  // before the jump to native code. At this moment stack must be
+  // safepoint-safe and completely prepared for stack walking.
+#ifdef ASSERT
+  {
+    Label L;
+    __ ldr_u32(Rtemp, Address(Rthread, JavaThread::thread_state_offset()));
+    __ cmp_32(Rtemp, _thread_in_Java);
+    __ b(L, eq);
+    __ stop("invalid thread state");
+    __ bind(L);
+  }
+#endif
+
+#ifdef AARCH64
+  __ mov(Rtemp, _thread_in_native);
+  __ add(Rtemp2, Rthread, in_bytes(JavaThread::thread_state_offset()));
+  // STLR is used to force all preceding writes to be observed prior to thread state change
+  __ stlr_w(Rtemp, Rtemp2);
+#else
+  // Force all preceding writes to be observed prior to thread state change
+  __ membar(MacroAssembler::StoreStore, Rtemp);
+
+  __ mov(Rtemp, _thread_in_native);
+  __ str(Rtemp, Address(Rthread, JavaThread::thread_state_offset()));
+#endif // AARCH64
+
+  __ call(Rnative_code);
+#if R9_IS_SCRATCHED
+  __ restore_method();
+#endif
+
+  // Set FPSCR/FPCR to a known state
+  if (AlwaysRestoreFPU) {
+    __ restore_default_fp_mode();
+  }
+
+  // Do safepoint check
+  __ mov(Rtemp, _thread_in_native_trans);
+  __ str_32(Rtemp, Address(Rthread, JavaThread::thread_state_offset()));
+
+  // Force this write out before the read below
+  __ membar(MacroAssembler::StoreLoad, Rtemp);
+
+  __ ldr_global_s32(Rtemp, SafepointSynchronize::address_of_state());
+
+  // Protect the return value in the interleaved code: save it to callee-save registers.
+#ifdef AARCH64
+  __ mov(Rsaved_result, R0);
+  __ fmov_d(Dsaved_result, D0);
+#else
+  __ mov(Rsaved_result_lo, R0);
+  __ mov(Rsaved_result_hi, R1);
+#ifdef __ABI_HARD__
+  // preserve native FP result in a callee-saved register
+  saved_result_fp = D8;
+  __ fcpyd(saved_result_fp, D0);
+#else
+  saved_result_fp = fnoreg;
+#endif // __ABI_HARD__
+#endif // AARCH64
+  // Call check_special_condition_for_native_trans unless state is _not_synchronized and suspend_flags == 0
+  {
+    __ ldr_u32(R3, Address(Rthread, JavaThread::suspend_flags_offset()));
+    __ cmp(Rtemp, SafepointSynchronize::_not_synchronized);
+    __ cond_cmp(R3, 0, eq);
+
+#ifdef AARCH64
+    Label L;
+    __ b(L, eq);
+    __ mov(R0, Rthread);
+    __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans), relocInfo::none);
+    __ bind(L);
+#else
+  __ mov(R0, Rthread, ne);
+  __ call(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans), relocInfo::none, ne);
+#if R9_IS_SCRATCHED
+  __ restore_method();
+#endif
+#endif // AARCH64
+  }
+
+  // Perform Native->Java thread transition
+  __ mov(Rtemp, _thread_in_Java);
+  __ str_32(Rtemp, Address(Rthread, JavaThread::thread_state_offset()));
+
+  // Zero handles and last_java_sp
+  __ reset_last_Java_frame(Rtemp);
+  __ ldr(R3, Address(Rthread, JavaThread::active_handles_offset()));
+  __ str_32(__ zero_register(Rtemp), Address(R3, JNIHandleBlock::top_offset_in_bytes()));
+  if (CheckJNICalls) {
+    __ str(__ zero_register(Rtemp), Address(Rthread, JavaThread::pending_jni_exception_check_fn_offset()));
+  }
+
+  // Unbox (load through the result) if it is a non-zero object; the 32-bit ARM path keys on Rresult_handler != 0
+#ifdef AARCH64
+  {
+    Label L, Lnull;
+    __ mov_slow(Rtemp, AbstractInterpreter::result_handler(T_OBJECT));
+    __ cmp(Rresult_handler, Rtemp);
+    __ b(L, ne);
+    __ cbz(Rsaved_result, Lnull);
+    __ ldr(Rsaved_result, Address(Rsaved_result));
+    __ bind(Lnull);
+    // Store oop on the stack for GC
+    __ str(Rsaved_result, Address(FP, frame::interpreter_frame_oop_temp_offset * wordSize));
+    __ bind(L);
+  }
+#else
+  __ tst(Rsaved_result_lo, Rresult_handler);
+  __ ldr(Rsaved_result_lo, Address(Rsaved_result_lo), ne);
+
+  // Store oop on the stack for GC
+  __ cmp(Rresult_handler, 0);
+  __ str(Rsaved_result_lo, Address(FP, frame::interpreter_frame_oop_temp_offset * wordSize), ne);
+#endif // AARCH64
+
+#ifdef AARCH64
+  // Restore SP (drop native parameters area), to keep SP in sync with extended_sp in frame
+  __ restore_sp_after_call(Rtemp);
+  __ check_stack_top();
+#endif // AARCH64
+
+  // reguard stack if StackOverflow exception happened while in native.
+  {
+    __ ldr_u32(Rtemp, Address(Rthread, JavaThread::stack_guard_state_offset()));
+    __ cmp_32(Rtemp, JavaThread::stack_guard_yellow_reserved_disabled);
+#ifdef AARCH64
+    Label L;
+    __ b(L, ne);
+    __ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages), relocInfo::none);
+    __ bind(L);
+#else
+  __ call(CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages), relocInfo::none, eq);
+#if R9_IS_SCRATCHED
+  __ restore_method();
+#endif
+#endif // AARCH64
+  }
+
+  // check pending exceptions
+  {
+    __ ldr(Rtemp, Address(Rthread, Thread::pending_exception_offset()));
+#ifdef AARCH64
+    Label L;
+    __ cbz(Rtemp, L);
+    __ mov_pc_to(Rexception_pc);
+    __ b(StubRoutines::forward_exception_entry());
+    __ bind(L);
+#else
+    __ cmp(Rtemp, 0);
+    __ mov(Rexception_pc, PC, ne);
+    __ b(StubRoutines::forward_exception_entry(), ne);
+#endif // AARCH64
+  }
+
+  if (synchronized) {
+    // address of first monitor
+    __ sub(R1, FP, - (frame::interpreter_frame_monitor_block_bottom_offset - frame::interpreter_frame_monitor_size()) * wordSize);
+    __ unlock_object(R1);
+  }
+
+  // jvmti/dtrace support
+  // Note: This must happen _after_ handling/throwing any exceptions since
+  //       the exception handler code notifies the runtime of method exits
+  //       too. If this happens before, method entry/exit notifications are
+  //       not properly paired (was bug - gri 11/22/99).
+#ifdef AARCH64
+  __ notify_method_exit(vtos, InterpreterMacroAssembler::NotifyJVMTI, true, Rsaved_result, noreg, Dsaved_result);
+#else
+  __ notify_method_exit(vtos, InterpreterMacroAssembler::NotifyJVMTI, true, Rsaved_result_lo, Rsaved_result_hi, saved_result_fp);
+#endif // AARCH64
+
+  // Restore the result. Oop result is restored from the stack.
+#ifdef AARCH64
+  __ mov(R0, Rsaved_result);
+  __ fmov_d(D0, Dsaved_result);
+
+  __ blr(Rresult_handler);
+#else
+  __ cmp(Rresult_handler, 0);
+  __ ldr(R0, Address(FP, frame::interpreter_frame_oop_temp_offset * wordSize), ne);
+  __ mov(R0, Rsaved_result_lo, eq);
+  __ mov(R1, Rsaved_result_hi);
+
+#ifdef __ABI_HARD__
+  // reload native FP result
+  __ fcpyd(D0, D8);
+#endif // __ABI_HARD__
+
+#ifdef ASSERT
+  if (VerifyOops) {
+    Label L;
+    __ cmp(Rresult_handler, 0);
+    __ b(L, eq);
+    __ verify_oop(R0);
+    __ bind(L);
+  }
+#endif // ASSERT
+#endif // AARCH64
+
+  // Restore FP/LR, sender_sp and return
+#ifdef AARCH64
+  __ ldr(Rtemp, Address(FP, frame::interpreter_frame_sender_sp_offset * wordSize));
+  __ ldp(FP, LR, Address(FP));
+  __ mov(SP, Rtemp);
+#else
+  __ mov(Rtemp, FP);
+  __ ldmia(FP, RegisterSet(FP) | RegisterSet(LR));
+  __ ldr(SP, Address(Rtemp, frame::interpreter_frame_sender_sp_offset * wordSize));
+#endif // AARCH64
+
+  __ ret();
+
+  if (inc_counter) {
+    // Handle overflow of counter and compile method
+    __ bind(invocation_counter_overflow);
+    generate_counter_overflow(continue_after_compile);
+  }
+
+  return entry_point;
+}
+
+// Generic interpreted method entry to (asm) interpreter.
+// synchronized: if true, the entry calls lock_method() before dispatching the first
+// bytecode. Returns the address of the first generated instruction.
+address TemplateInterpreterGenerator::generate_normal_entry(bool synchronized) {
+  // determine code generation flags
+  bool inc_counter  = UseCompiler || CountCompiledCalls || LogTouchedMethods;
+
+  // Rmethod: Method*
+  // Rthread: thread
+  // Rsender_sp: sender sp (could differ from SP if we were called via c2i)
+  // Rparams: pointer to the last parameter in the stack
+
+  address entry_point = __ pc();
+
+  const Register RconstMethod = AARCH64_ONLY(R10) NOT_AARCH64(R3);
+
+#ifdef AARCH64
+  const Register RmaxStack = R11;
+  const Register RlocalsBase = R12;
+#endif // AARCH64
+
+  __ ldr(RconstMethod, Address(Rmethod, Method::const_offset()));
+
+  __ ldrh(R2, Address(RconstMethod, ConstMethod::size_of_parameters_offset()));
+  __ ldrh(R3, Address(RconstMethod, ConstMethod::size_of_locals_offset()));
+
+  // setup Rlocals
+  __ sub(Rlocals, Rparams, wordSize);
+  __ add(Rlocals, Rlocals, AsmOperand(R2, lsl, Interpreter::logStackElementSize));
+
+  __ sub(R3, R3, R2); // number of additional locals (size_of_locals - size_of_parameters)
+
+#ifdef AARCH64
+  // setup RmaxStack
+  __ ldrh(RmaxStack, Address(RconstMethod, ConstMethod::max_stack_offset()));
+  __ add(RmaxStack, RmaxStack, MAX2(1, Method::extra_stack_entries())); // reserve slots for exception handler and JSR292 appendix argument
+#endif // AARCH64
+
+  // see if we've got enough room on the stack for locals plus overhead.
+  generate_stack_overflow_check();
+
+#ifdef AARCH64
+
+  // allocate space for locals
+  {
+    __ sub(RlocalsBase, Rparams, AsmOperand(R3, lsl, Interpreter::logStackElementSize));
+    __ align_reg(SP, RlocalsBase, StackAlignmentInBytes);
+  }
+
+  // explicitly initialize locals: zero a single slot first if the count is odd, then two slots per iteration
+  {
+    Label zero_loop, done;
+    __ cbz(R3, done);
+
+    __ tbz(R3, 0, zero_loop);
+    __ subs(R3, R3, 1);
+    __ str(ZR, Address(RlocalsBase, wordSize, post_indexed));
+    __ b(done, eq);
+
+    __ bind(zero_loop);
+    __ subs(R3, R3, 2);
+    __ stp(ZR, ZR, Address(RlocalsBase, 2*wordSize, post_indexed));
+    __ b(zero_loop, ne);
+
+    __ bind(done);
+  }
+
+#else
+  // allocate space for locals
+  // explicitly initialize locals
+
+  // Loop is unrolled 4 times; each step decrements R3 and pushes a zero only while the result is >= 0
+  Label loop;
+  __ mov(R0, 0);
+  __ bind(loop);
+
+  // #1
+  __ subs(R3, R3, 1);
+  __ push(R0, ge);
+
+  // #2
+  __ subs(R3, R3, 1, ge);
+  __ push(R0, ge);
+
+  // #3
+  __ subs(R3, R3, 1, ge);
+  __ push(R0, ge);
+
+  // #4
+  __ subs(R3, R3, 1, ge);
+  __ push(R0, ge);
+
+  __ b(loop, gt);
+#endif // AARCH64
+
+  // initialize fixed part of activation frame
+  generate_fixed_frame(false);
+
+  __ restore_dispatch();
+
+  // make sure method is not native & not abstract
+#ifdef ASSERT
+  __ ldr_u32(Rtemp, Address(Rmethod, Method::access_flags_offset()));
+  {
+    Label L;
+    __ tbz(Rtemp, JVM_ACC_NATIVE_BIT, L);
+    __ stop("tried to execute native method as non-native");
+    __ bind(L);
+  }
+  { Label L;
+    __ tbz(Rtemp, JVM_ACC_ABSTRACT_BIT, L);
+    __ stop("tried to execute abstract method in interpreter");
+    __ bind(L);
+  }
+#endif
+
+  // increment invocation count & check for overflow
+  Label invocation_counter_overflow;
+  Label profile_method;
+  Label profile_method_continue;
+  if (inc_counter) {
+    if (synchronized) {
+      // Avoid unlocking method's monitor in case of exception, as it has not
+      // been locked yet.
+      __ set_do_not_unlock_if_synchronized(true, Rtemp);
+    }
+    generate_counter_incr(&invocation_counter_overflow, &profile_method, &profile_method_continue);
+    if (ProfileInterpreter) {
+      __ bind(profile_method_continue);
+    }
+  }
+  Label continue_after_compile;
+  __ bind(continue_after_compile);
+
+  if (inc_counter && synchronized) {
+    __ set_do_not_unlock_if_synchronized(false, Rtemp);
+  }
+#if R9_IS_SCRATCHED
+  __ restore_method();
+#endif
+
+  // check for synchronized methods
+  // Must happen AFTER invocation_counter check and stack overflow check,
+  // so method is not locked if overflows.
+  //
+  if (synchronized) {
+    // Allocate monitor and lock method
+    lock_method();
+  } else {
+    // no synchronization necessary
+#ifdef ASSERT
+      { Label L;
+        __ ldr_u32(Rtemp, Address(Rmethod, Method::access_flags_offset()));
+        __ tbz(Rtemp, JVM_ACC_SYNCHRONIZED_BIT, L);
+        __ stop("method needs synchronization");
+        __ bind(L);
+      }
+#endif
+  }
+
+  // start execution
+#ifdef ASSERT
+  { Label L;
+    __ ldr(Rtemp, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+    __ cmp(Rtemp, Rstack_top);
+    __ b(L, eq);
+    __ stop("broken stack frame setup in interpreter");
+    __ bind(L);
+  }
+#endif
+  __ check_extended_sp(Rtemp);
+
+  // jvmti support
+  __ notify_method_entry();
+#if R9_IS_SCRATCHED
+  __ restore_method();
+#endif
+
+  __ dispatch_next(vtos);
+
+  // out-of-line code: profiling request and invocation counter overflow
+  if (inc_counter) {
+    if (ProfileInterpreter) {
+      // We have decided to profile this method in the interpreter
+      __ bind(profile_method);
+
+      __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method));
+      __ set_method_data_pointer_for_bcp();
+
+      __ b(profile_method_continue);
+    }
+
+    // Handle overflow of counter and compile method
+    __ bind(invocation_counter_overflow);
+    generate_counter_overflow(continue_after_compile);
+  }
+
+  return entry_point;
+}
+
+//------------------------------------------------------------------------------------------------------------------------
+// Exceptions: emits the code for Interpreter::_rethrow_exception_entry, _throw_exception_entry,
+// _remove_activation_preserving_args_entry (JVMTI PopFrame) and _remove_activation_entry.
+void TemplateInterpreterGenerator::generate_throw_exception() {
+  // Entry point in previous activation (i.e., if the caller was interpreted)
+  Interpreter::_rethrow_exception_entry = __ pc();
+  // Rexception_obj: exception
+
+#ifndef AARCH64
+  // Clear interpreter_frame_last_sp.
+  __ mov(Rtemp, 0);
+  __ str(Rtemp, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+#endif // !AARCH64
+
+#if R9_IS_SCRATCHED
+  __ restore_method();
+#endif
+  __ restore_bcp();
+  __ restore_dispatch();
+  __ restore_locals();
+
+#ifdef AARCH64
+  __ restore_sp_after_call(Rtemp);
+#endif // AARCH64
+
+  // Entry point for exceptions thrown within interpreter code
+  Interpreter::_throw_exception_entry = __ pc();
+
+  // expression stack is undefined here
+  // Rexception_obj: exception
+  // Rbcp: exception bcp
+  __ verify_oop(Rexception_obj);
+
+  // expression stack must be empty before entering the VM in case of an exception
+  __ empty_expression_stack();
+  // find exception handler address and preserve exception oop
+  __ mov(R1, Rexception_obj);
+  __ call_VM(Rexception_obj, CAST_FROM_FN_PTR(address, InterpreterRuntime::exception_handler_for_exception), R1);
+  // R0: exception handler entry point
+  // Rexception_obj: preserved exception oop
+  // Rbcp: bcp for exception handler
+  __ push_ptr(Rexception_obj);                    // push exception which is now the only value on the stack
+  __ jump(R0);                                    // jump to exception handler (may be _remove_activation_entry!)
+
+  // If the exception is not handled in the current frame the frame is removed and
+  // the exception is rethrown (i.e. exception continuation is _rethrow_exception).
+  //
+  // Note: At this point the bci is still the bci for the instruction which caused
+  //       the exception and the expression stack is empty. Thus, for any VM calls
+  //       at this point, GC will find a legal oop map (with empty expression stack).
+
+  // In current activation
+  // tos: exception
+  // Rbcp: exception bcp
+
+  //
+  // JVMTI PopFrame support
+  //
+   Interpreter::_remove_activation_preserving_args_entry = __ pc();
+
+#ifdef AARCH64
+  __ restore_sp_after_call(Rtemp); // restore SP to extended SP
+#endif // AARCH64
+
+  __ empty_expression_stack();
+
+  // Set the popframe_processing bit in _popframe_condition indicating that we are
+  // currently handling popframe, so that call_VMs that may happen later do not trigger new
+  // popframe handling cycles.
+
+  __ ldr_s32(Rtemp, Address(Rthread, JavaThread::popframe_condition_offset()));
+  __ orr(Rtemp, Rtemp, (unsigned)JavaThread::popframe_processing_bit);
+  __ str_32(Rtemp, Address(Rthread, JavaThread::popframe_condition_offset()));
+
+  {
+    // Check to see whether we are returning to a deoptimized frame.
+    // (The PopFrame call ensures that the caller of the popped frame is
+    // either interpreted or compiled and deoptimizes it if compiled.)
+    // In this case, we can't call dispatch_next() after the frame is
+    // popped, but instead must save the incoming arguments and restore
+    // them after deoptimization has occurred.
+    //
+    // Note that we don't compare the return PC against the
+    // deoptimization blob's unpack entry because of the presence of
+    // adapter frames in C2.
+    Label caller_not_deoptimized;
+    __ ldr(R0, Address(FP, frame::return_addr_offset * wordSize));
+    __ call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::interpreter_contains), R0);
+    __ cbnz_32(R0, caller_not_deoptimized);
+#ifdef AARCH64
+    __ NOT_TESTED();
+#endif
+
+    // Compute size of arguments for saving when returning to deoptimized caller
+    __ restore_method();
+    __ ldr(R0, Address(Rmethod, Method::const_offset()));
+    __ ldrh(R0, Address(R0, ConstMethod::size_of_parameters_offset()));
+
+    __ logical_shift_left(R1, R0, Interpreter::logStackElementSize);
+    // Save these arguments
+    __ restore_locals();
+    __ sub(R2, Rlocals, R1);
+    __ add(R2, R2, wordSize);
+    __ mov(R0, Rthread);
+    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Deoptimization::popframe_preserve_args), R0, R1, R2);
+
+    __ remove_activation(vtos, LR,
+                         /* throw_monitor_exception */ false,
+                         /* install_monitor_exception */ false,
+                         /* notify_jvmdi */ false);
+
+    // Inform deoptimization that it is responsible for restoring these arguments
+    __ mov(Rtemp, JavaThread::popframe_force_deopt_reexecution_bit);
+    __ str_32(Rtemp, Address(Rthread, JavaThread::popframe_condition_offset()));
+
+    // Continue in deoptimization handler
+    __ ret();
+
+    __ bind(caller_not_deoptimized);
+  }
+
+  __ remove_activation(vtos, R4,
+                       /* throw_monitor_exception */ false,
+                       /* install_monitor_exception */ false,
+                       /* notify_jvmdi */ false);
+
+#ifndef AARCH64
+  // Finish with popframe handling
+  // A previous I2C followed by a deoptimization might have moved the
+  // outgoing arguments further up the stack. PopFrame expects the
+  // mutations to those outgoing arguments to be preserved and other
+  // constraints basically require this frame to look exactly as
+  // though it had previously invoked an interpreted activation with
+  // no space between the top of the expression stack (current
+  // last_sp) and the top of stack. Rather than force deopt to
+  // maintain this kind of invariant all the time we call a small
+  // fixup routine to move the mutated arguments onto the top of our
+  // expression stack if necessary.
+  __ mov(R1, SP);
+  __ ldr(R2, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+  // PC must point into interpreter here
+  __ set_last_Java_frame(SP, FP, true, Rtemp);
+  __ mov(R0, Rthread);
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::popframe_move_outgoing_args), R0, R1, R2);
+  __ reset_last_Java_frame(Rtemp);
+#endif // !AARCH64
+
+#ifdef AARCH64
+  __ restore_sp_after_call(Rtemp);
+  __ restore_stack_top();
+#else
+  // Restore the last_sp and null it out
+  __ ldr(SP, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+  __ mov(Rtemp, (int)NULL_WORD);
+  __ str(Rtemp, Address(FP, frame::interpreter_frame_last_sp_offset * wordSize));
+#endif // AARCH64
+
+  __ restore_bcp();
+  __ restore_dispatch();
+  __ restore_locals();
+  __ restore_method();
+
+  // The method data pointer was incremented already during
+  // call profiling. We have to restore the mdp for the current bcp.
+  if (ProfileInterpreter) {
+    __ set_method_data_pointer_for_bcp();
+  }
+
+  // Clear the popframe condition flag
+  assert(JavaThread::popframe_inactive == 0, "adjust this code");
+  __ str_32(__ zero_register(Rtemp), Address(Rthread, JavaThread::popframe_condition_offset()));
+
+#if INCLUDE_JVMTI
+  {
+    Label L_done;
+
+    __ ldrb(Rtemp, Address(Rbcp, 0));
+    __ cmp(Rtemp, Bytecodes::_invokestatic);
+    __ b(L_done, ne);
+
+    // The member name argument must be restored if _invokestatic is re-executed after a PopFrame call.
+    // Detect such a case in the InterpreterRuntime function and return the member name argument, or NULL.
+
+    // get local0
+    __ ldr(R1, Address(Rlocals, 0));
+    __ mov(R2, Rmethod);
+    __ mov(R3, Rbcp);
+    __ call_VM(R0, CAST_FROM_FN_PTR(address, InterpreterRuntime::member_name_arg_or_null), R1, R2, R3);
+
+    __ cbz(R0, L_done);
+    // non-NULL result: store the returned member name argument at Rstack_top
+    __ str(R0, Address(Rstack_top));
+    __ bind(L_done);
+  }
+#endif // INCLUDE_JVMTI
+
+  __ dispatch_next(vtos);
+  // end of PopFrame support
+
+  Interpreter::_remove_activation_entry = __ pc();
+
+  // preserve exception over this code sequence
+  __ pop_ptr(R0_tos);
+  __ str(R0_tos, Address(Rthread, JavaThread::vm_result_offset()));
+  // remove the activation (without doing throws on illegalMonitorExceptions)
+  __ remove_activation(vtos, Rexception_pc, false, true, false);
+  // restore exception
+  __ get_vm_result(Rexception_obj, Rtemp);
+
+  // In between activations - previous activation type unknown yet
+  // compute continuation point - the continuation point expects
+  // the following registers set up:
+  //
+  // Rexception_obj: exception
+  // Rexception_pc: return address/pc that threw exception
+  // SP: expression stack of caller
+  // FP: frame pointer of caller
+  __ mov(c_rarg0, Rthread);
+  __ mov(c_rarg1, Rexception_pc);
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), c_rarg0, c_rarg1);
+  // Note that an "issuing PC" is actually the next PC after the call
+
+  __ jump(R0);                             // jump to exception handler of caller
+}
+
+
+// JVMTI ForceEarlyReturn support.
+// state: TosState of the early-return value to load. Returns the address of the generated
+// entry, which loads that value, clears the earlyret state and removes the activation.
+address TemplateInterpreterGenerator::generate_earlyret_entry_for(TosState state) {
+  address entry = __ pc();
+
+#ifdef AARCH64
+  __ restore_sp_after_call(Rtemp); // restore SP to extended SP
+#endif // AARCH64
+
+  __ restore_bcp();
+  __ restore_dispatch();
+  __ restore_locals();
+
+  __ empty_expression_stack();
+
+  __ load_earlyret_value(state);
+
+  // Clear the earlyret state
+  __ ldr(Rtemp, Address(Rthread, JavaThread::jvmti_thread_state_offset()));
+
+  assert(JvmtiThreadState::earlyret_inactive == 0, "adjust this code");
+  __ str_32(__ zero_register(R2), Address(Rtemp, JvmtiThreadState::earlyret_state_offset()));
+
+  __ remove_activation(state, LR,
+                       false, /* throw_monitor_exception */
+                       false, /* install_monitor_exception */
+                       true); /* notify_jvmdi */
+
+#ifndef AARCH64
+  // According to interpreter calling conventions, result is returned in R0/R1,
+  // so ftos (S0) and dtos (D0) are moved to R0/R1.
+  // This conversion should be done after remove_activation, as it uses
+  // push(state) & pop(state) to preserve return value.
+  __ convert_tos_to_retval(state);
+#endif // !AARCH64
+  __ ret();
+
+  return entry;
+} // end of ForceEarlyReturn support
+
+
+//------------------------------------------------------------------------------------------------------------------------
+// Helper for vtos entry point generation: for a template t with tos_in() == vtos, sets each
+// entry point (bep..vep) to code that pushes the corresponding tos state before running t.
+void TemplateInterpreterGenerator::set_vtos_entry_points (Template* t, address& bep, address& cep, address& sep, address& aep, address& iep, address& lep, address& fep, address& dep, address& vep) {
+  assert(t->is_valid() && t->tos_in() == vtos, "illegal template");
+  Label L;
+
+#ifdef __SOFTFP__
+  dep = __ pc();                // fall through
+#else
+  fep = __ pc(); __ push(ftos); __ b(L);
+  dep = __ pc(); __ push(dtos); __ b(L);
+#endif // __SOFTFP__
+
+  lep = __ pc(); __ push(ltos); __ b(L);
+
+  if (AARCH64_ONLY(true) NOT_AARCH64(VerifyOops)) {  // can't share atos entry with itos on AArch64 or if VerifyOops
+    aep = __ pc(); __ push(atos); __ b(L);
+  } else {
+    aep = __ pc();              // fall through
+  }
+
+#ifdef __SOFTFP__
+  fep = __ pc();                // fall through
+#endif // __SOFTFP__
+
+  bep = cep = sep =             // fall through
+  iep = __ pc(); __ push(itos); // fall through
+  vep = __ pc(); __ bind(L);    // fall through
+  generate_and_dispatch(t);
+}
+
+//------------------------------------------------------------------------------------------------------------------------
+
+// Non-product code
+#ifndef PRODUCT
+address TemplateInterpreterGenerator::generate_trace_code(TosState state) {
+  address entry = __ pc();  // start of the emitted trace stub for this tos state; returned to the caller
+
+  // prepare expression stack
+  __ push(state);       // save tosca
+
+  // pass tosca registers as arguments
+  __ mov(R2, R0_tos);
+#ifdef AARCH64
+  __ mov(R3, ZR);       // AArch64 passes zero as the third tracer argument
+#else
+  __ mov(R3, R1_tos_hi);
+#endif // AARCH64
+  __ mov(R1, LR);       // save return address: it is handed to the tracer as its first argument
+
+  // call tracer
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::trace_bytecode), R1, R2, R3);
+
+  __ mov(LR, R0);       // restore return address from R0 (presumably the tracer's result - confirm)
+  __ pop(state);        // restore tosca
+
+  // return
+  __ ret();
+
+  return entry;
+}
+
+
+void TemplateInterpreterGenerator::count_bytecode() {  // emits an increment of BytecodeCounter::_counter_value (offset 0)
+  __ inc_global_counter((address) &BytecodeCounter::_counter_value, 0, Rtemp, R2_tmp, true);
+}
+
+
+void TemplateInterpreterGenerator::histogram_bytecode(Template* t) {  // emits an increment of the histogram counter selected by t->bytecode()
+  __ inc_global_counter((address)&BytecodeHistogram::_counters[0], sizeof(BytecodeHistogram::_counters[0]) * t->bytecode(), Rtemp, R2_tmp, true);
+}
+
+
+void TemplateInterpreterGenerator::histogram_bytecode_pair(Template* t) {
+  const Register Rindex_addr = R2_tmp;     // holds &BytecodePairHistogram::_index
+  Label Lcontinue;
+  InlinedAddress Lcounters((address)BytecodePairHistogram::_counters);
+  InlinedAddress Lindex((address)&BytecodePairHistogram::_index);
+  const Register Rcounters_addr = R2_tmp;  // same register as Rindex_addr; loaded only after the last use of Rindex_addr
+  const Register Rindex = R4_tmp;
+
+  // calculate new index for counter:
+  // index = (_index >> log2_number_of_codes) | (bytecode << log2_number_of_codes).
+  // (_index >> log2_number_of_codes) is previous bytecode
+
+  __ ldr_literal(Rindex_addr, Lindex);
+  __ ldr_s32(Rindex, Address(Rindex_addr));
+  __ mov_slow(Rtemp, ((int)t->bytecode()) << BytecodePairHistogram::log2_number_of_codes);
+  __ orr(Rindex, Rtemp, AsmOperand(Rindex, lsr, BytecodePairHistogram::log2_number_of_codes));
+  __ str_32(Rindex, Address(Rindex_addr));  // write the new index back to _index
+
+  // Rindex (R4) contains index of counter
+
+  __ ldr_literal(Rcounters_addr, Lcounters);
+  __ ldr_s32(Rtemp, Address::indexed_32(Rcounters_addr, Rindex));
+  __ adds_32(Rtemp, Rtemp, 1);
+  __ b(Lcontinue, mi);                           // avoid overflow: skip the store when the incremented value is negative
+  __ str_32(Rtemp, Address::indexed_32(Rcounters_addr, Rindex));
+
+  __ b(Lcontinue);                               // jump over the inlined address literals bound below
+
+  __ bind_literal(Lindex);
+  __ bind_literal(Lcounters);
+
+  __ bind(Lcontinue);
+}
+
+
+void TemplateInterpreterGenerator::trace_bytecode(Template* t) {
+  // Call a little run-time stub to avoid blow-up for each bytecode.
+  // The run-time stub saves the right registers, depending on
+  // the tosca in-state for the given template.
+  assert(Interpreter::trace_code(t->tos_in()) != NULL,
+         "entry must have been generated");
+  address trace_entry = Interpreter::trace_code(t->tos_in());  // stub address selected by the template's tos in-state
+  __ call(trace_entry, relocInfo::none);
+}
+
+
+void TemplateInterpreterGenerator::stop_interpreter_at() {
+  Label Lcontinue;
+  const Register stop_at = R2_tmp;
+
+  __ ldr_global_s32(Rtemp, (address) &BytecodeCounter::_counter_value);
+  __ mov_slow(stop_at, StopInterpreterAt);  // StopInterpreterAt is read at code-generation time and embedded as a constant
+
+  // test bytecode counter: continue normally unless it equals StopInterpreterAt
+  __ cmp(Rtemp, stop_at);
+  __ b(Lcontinue, ne);
+
+  __ trace_state("stop_interpreter_at");
+  __ breakpoint();
+
+  __ bind(Lcontinue);
+}
+#endif // !PRODUCT
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/templateTable_arm.cpp	2016-12-02 11:24:00.892967326 -0500
@@ -0,0 +1,5030 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "interpreter/interp_masm.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "interpreter/templateTable.hpp"
+#include "memory/universe.inline.hpp"
+#include "oops/cpCache.hpp"
+#include "oops/methodData.hpp"
+#include "oops/objArrayKlass.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/synchronizer.hpp"
+
+#define __ _masm->
+
+//----------------------------------------------------------------------------------------------------
+// Platform-dependent initialization
+
+void TemplateTable::pd_initialize() {
+  // No ARM-specific initialization is needed; the body is intentionally empty.
+}
+
+//----------------------------------------------------------------------------------------------------
+// Address computation
+
+// local variables
+static inline Address iaddress(int n)            {  // address of local slot n, relative to Rlocals
+  return Address(Rlocals, Interpreter::local_offset_in_bytes(n));
+}
+
+static inline Address laddress(int n)            { return iaddress(n + 1); }  // slot n + 1 of a two-slot local starting at n
+#ifndef AARCH64
+static inline Address haddress(int n)            { return iaddress(n + 0); }  // slot n of a two-slot local; holds the word loaded into R1_tos_hi
+#endif // !AARCH64
+
+static inline Address faddress(int n)            { return iaddress(n); }  // float local: same slot address as an int local
+static inline Address daddress(int n)            { return laddress(n); }  // double local: same address as a long local
+static inline Address aaddress(int n)            { return iaddress(n); }  // reference local: same slot address as an int local
+
+
+void TemplateTable::get_local_base_addr(Register r, Register index) {  // emits r = Rlocals - (index << logStackElementSize)
+  __ sub(r, Rlocals, AsmOperand(index, lsl, Interpreter::logStackElementSize));
+}
+
+Address TemplateTable::load_iaddress(Register index, Register scratch) {
+#ifdef AARCH64
+  get_local_base_addr(scratch, index);  // AArch64: materialize the slot address in scratch first
+  return Address(scratch);
+#else
+  return Address(Rlocals, index, lsl, Interpreter::logStackElementSize, basic_offset, sub_offset);  // 32-bit: scaled index subtracted in the addressing mode; scratch is unused
+#endif // AARCH64
+}
+
+Address TemplateTable::load_aaddress(Register index, Register scratch) {  // reference locals are addressed exactly like int locals
+  return load_iaddress(index, scratch);
+}
+
+Address TemplateTable::load_faddress(Register index, Register scratch) {
+#ifdef __SOFTFP__
+  return load_iaddress(index, scratch);  // soft-float: the value is loaded into a core register, so address it like an int
+#else
+  get_local_base_addr(scratch, index);   // otherwise always materialize the address (presumably because FP loads need a plain base register - confirm)
+  return Address(scratch);
+#endif // __SOFTFP__
+}
+
+Address TemplateTable::load_daddress(Register index, Register scratch) {
+  get_local_base_addr(scratch, index);  // scratch = address of local slot 'index'
+  return Address(scratch, Interpreter::local_offset_in_bytes(1));  // displaced by the offset of slot 1, matching laddress(n) = iaddress(n + 1)
+}
+
+// At top of Java expression stack which may be different than SP.
+// It isn't for category 1 objects.
+static inline Address at_tos() {  // expression stack element 0, relative to Rstack_top
+  return Address(Rstack_top, Interpreter::expr_offset_in_bytes(0));
+}
+
+static inline Address at_tos_p1() {  // expression stack element 1, relative to Rstack_top
+  return Address(Rstack_top, Interpreter::expr_offset_in_bytes(1));
+}
+
+static inline Address at_tos_p2() {  // expression stack element 2, relative to Rstack_top
+  return Address(Rstack_top, Interpreter::expr_offset_in_bytes(2));
+}
+
+
+// 32-bit ARM:
+// Loads double/long local into R0_tos_lo/R1_tos_hi with two
+// separate ldr instructions (supports nonadjacent values).
+// Used for longs in all modes, and for doubles in SOFTFP mode.
+//
+// AArch64: loads long local into R0_tos.
+//
+void TemplateTable::load_category2_local(Register Rlocal_index, Register tmp) {
+  assert_different_registers(Rlocal_index, tmp);
+  // tmp receives the address of the indexed local slot and is the base of every load below
+  get_local_base_addr(tmp, Rlocal_index);
+
+#ifdef AARCH64
+  __ ldr(R0_tos, Address(tmp, Interpreter::local_offset_in_bytes(1)));
+#else
+  __ ldr(R0_tos_lo, Address(tmp, Interpreter::local_offset_in_bytes(1)));
+  __ ldr(R1_tos_hi, Address(tmp, Interpreter::local_offset_in_bytes(0)));
+#endif // AARCH64
+}
+
+
+// 32-bit ARM:
+// Stores R0_tos_lo/R1_tos_hi to double/long local with two
+// separate str instructions (supports nonadjacent values).
+// Used for longs in all modes, and for doubles in SOFTFP mode
+//
+// AArch64: stores R0_tos to long local.
+//
+void TemplateTable::store_category2_local(Register Rlocal_index, Register tmp) {
+  assert_different_registers(Rlocal_index, tmp);
+  // tmp receives the address of the indexed local slot and is the base of every store below
+  get_local_base_addr(tmp, Rlocal_index);
+
+#ifdef AARCH64
+  __ str(R0_tos, Address(tmp, Interpreter::local_offset_in_bytes(1)));
+#else
+  __ str(R0_tos_lo, Address(tmp, Interpreter::local_offset_in_bytes(1)));
+  __ str(R1_tos_hi, Address(tmp, Interpreter::local_offset_in_bytes(0)));
+#endif // AARCH64
+}
+
+// Returns address of Java array element using temp register as address base.
+Address TemplateTable::get_array_elem_addr(BasicType elemType, Register array, Register index, Register temp) {
+  // The scale is log2 of the element size; add_ptr_scaled_int32 presumably leaves
+  // array + (index << scale) in temp. The array header offset becomes the displacement.
+  __ add_ptr_scaled_int32(temp, array, index, exact_log2(type2aelembytes(elemType)));
+  return Address(temp, arrayOopDesc::base_offset_in_bytes(elemType));
+}
+
+//----------------------------------------------------------------------------------------------------
+// Condition conversion
+AsmCondition convNegCond(TemplateTable::Condition cc) {
+  // Returns the assembler condition that is the logical negation of cc.
+  // Written as guard clauses: each known template condition returns immediately.
+  if (cc == TemplateTable::equal)         return ne;
+  if (cc == TemplateTable::not_equal)     return eq;
+  if (cc == TemplateTable::less)          return ge;
+  if (cc == TemplateTable::less_equal)    return gt;
+  if (cc == TemplateTable::greater)       return le;
+  if (cc == TemplateTable::greater_equal) return lt;
+  ShouldNotReachHere();  // cc was none of the six conditions above
+  return nv;
+}
+
+//----------------------------------------------------------------------------------------------------
+// Miscellaneous helper routines
+
+// Store an oop (or NULL) at the address described by obj, emitting the GC barrier selected by 'barrier'.
+// Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
+// Also destroys new_val and obj.base().
+static void do_oop_store(InterpreterMacroAssembler* _masm,  // assembler the code is emitted into
+                         Address obj,                        // destination of the store
+                         Register new_val,                   // value to store
+                         Register tmp1,                      // scratch registers, all distinct from
+                         Register tmp2,                      // obj.base() and new_val (asserted below)
+                         Register tmp3,
+                         BarrierSet::Name barrier,           // barrier kind to emit
+                         bool precise,                       // card-table case: obj must then have no index or displacement
+                         bool is_null) {                     // true: store NULL via store_heap_oop_null
+
+  assert_different_registers(obj.base(), new_val, tmp1, tmp2, tmp3, noreg);
+  switch (barrier) {
+#if INCLUDE_ALL_GCS
+    case BarrierSet::G1SATBCTLogging:
+      {
+        // flatten object address if needed
+        assert (obj.mode() == basic_offset, "pre- or post-indexing is not supported here");
+
+        const Register store_addr = obj.base();  // the flattened address overwrites obj.base()
+        if (obj.index() != noreg) {
+          assert (obj.disp() == 0, "index or displacement, not both");
+#ifdef AARCH64
+          __ add(store_addr, obj.base(), obj.index(), obj.extend(), obj.shift_imm());
+#else
+          assert(obj.offset_op() == add_offset, "addition is expected");
+          __ add(store_addr, obj.base(), AsmOperand(obj.index(), obj.shift(), obj.shift_imm()));
+#endif // AARCH64
+        } else if (obj.disp() != 0) {
+          __ add(store_addr, obj.base(), obj.disp());
+        }
+
+        __ g1_write_barrier_pre(store_addr, new_val, tmp1, tmp2, tmp3);
+        if (is_null) {
+          __ store_heap_oop_null(new_val, Address(store_addr));
+        } else {
+          // G1 barrier needs uncompressed oop for region cross check.
+          Register val_to_store = new_val;
+          if (UseCompressedOops) {
+            val_to_store = tmp1;  // store from a copy so new_val stays intact for the post barrier
+            __ mov(val_to_store, new_val);
+          }
+          __ store_heap_oop(val_to_store, Address(store_addr)); // blows val_to_store:
+          val_to_store = noreg;  // invalidate the C++ variable so it cannot be used again by mistake
+          __ g1_write_barrier_post(store_addr, new_val, tmp1, tmp2, tmp3);
+        }
+      }
+      break;
+#endif // INCLUDE_ALL_GCS
+    case BarrierSet::CardTableForRS:
+    case BarrierSet::CardTableExtension:
+      {
+        if (is_null) {
+          __ store_heap_oop_null(new_val, obj);  // no store check is emitted for a NULL store
+        } else {
+          assert (!precise || (obj.index() == noreg && obj.disp() == 0),
+                  "store check address should be calculated beforehand");
+
+          __ store_check_part1(tmp1);
+          __ store_heap_oop(new_val, obj); // blows new_val:
+          new_val = noreg;  // invalidate the C++ variable so it cannot be used again by mistake
+          __ store_check_part2(obj.base(), tmp1, tmp2);
+        }
+      }
+      break;
+    case BarrierSet::ModRef:
+      ShouldNotReachHere();
+      break;
+    default:
+      ShouldNotReachHere();
+      break;
+  }
+}
+
+Address TemplateTable::at_bcp(int offset) {  // address 'offset' bytes past the bytecode pointer Rbcp
+  assert(_desc->uses_bcp(), "inconsistent uses_bcp information");
+  return Address(Rbcp, offset);
+}
+
+
+// Emits code that overwrites the bytecode at Rbcp with 'bc'. Blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR.
+void TemplateTable::patch_bytecode(Bytecodes::Code bc, Register bc_reg,
+                                   Register temp_reg, bool load_bc_into_bc_reg/*=true*/,
+                                   int byte_no) {
+  assert_different_registers(bc_reg, temp_reg);
+  if (!RewriteBytecodes)  return;  // rewriting disabled: emit nothing
+  Label L_patch_done;
+
+  switch (bc) {
+  case Bytecodes::_fast_aputfield:
+  case Bytecodes::_fast_bputfield:
+  case Bytecodes::_fast_zputfield:
+  case Bytecodes::_fast_cputfield:
+  case Bytecodes::_fast_dputfield:
+  case Bytecodes::_fast_fputfield:
+  case Bytecodes::_fast_iputfield:
+  case Bytecodes::_fast_lputfield:
+  case Bytecodes::_fast_sputfield:
+    {
+      // We skip bytecode quickening for putfield instructions when
+      // the put_code written to the constant pool cache is zero.
+      // This is required so that every execution of this instruction
+      // calls out to InterpreterRuntime::resolve_get_put to do
+      // additional, required work.
+      assert(byte_no == f1_byte || byte_no == f2_byte, "byte_no out of range");
+      assert(load_bc_into_bc_reg, "we use bc_reg as temp");
+      __ get_cache_and_index_and_bytecode_at_bcp(bc_reg, temp_reg, temp_reg, byte_no, 1, sizeof(u2));
+      __ mov(bc_reg, bc);
+      __ cbz(temp_reg, L_patch_done);  // test if bytecode is zero
+    }
+    break;
+  default:
+    assert(byte_no == -1, "sanity");
+    // the pair bytecodes have already done the load.
+    if (load_bc_into_bc_reg) {
+      __ mov(bc_reg, bc);
+    }
+  }
+
+  if (__ can_post_breakpoint()) {
+    Label L_fast_patch;
+    // if a breakpoint is present we can't rewrite the stream directly
+    __ ldrb(temp_reg, at_bcp(0));
+    __ cmp(temp_reg, Bytecodes::_breakpoint);
+    __ b(L_fast_patch, ne);
+    if (bc_reg != R3) {
+      __ mov(R3, bc_reg);  // the new bytecode is passed as the third argument
+    }
+    __ mov(R1, Rmethod);
+    __ mov(R2, Rbcp);
+    // Let breakpoint table handling rewrite to quicker bytecode
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::set_original_bytecode_at), R1, R2, R3);
+    __ b(L_patch_done);
+    __ bind(L_fast_patch);
+  }
+
+#ifdef ASSERT
+  Label L_okay;  // debug check: the byte being patched must be the Java code for bc or already the value in bc_reg
+  __ ldrb(temp_reg, at_bcp(0));
+  __ cmp(temp_reg, (int)Bytecodes::java_code(bc));
+  __ b(L_okay, eq);
+  __ cmp(temp_reg, bc_reg);
+  __ b(L_okay, eq);
+  __ stop("patching the wrong bytecode");
+  __ bind(L_okay);
+#endif
+
+  // patch bytecode
+  __ strb(bc_reg, at_bcp(0));
+  __ bind(L_patch_done);
+}
+
+//----------------------------------------------------------------------------------------------------
+// Individual instructions
+
+void TemplateTable::nop() {
+  transition(vtos, vtos);
+  // nothing to do beyond declaring the vtos -> vtos transition
+}
+
+void TemplateTable::shouldnotreachhere() {
+  transition(vtos, vtos);
+  __ stop("shouldnotreachhere bytecode");  // the template body is just a stop with this message
+}
+
+
+
+void TemplateTable::aconst_null() {
+  transition(vtos, atos);
+  __ mov(R0_tos, 0);  // the null reference is the value 0 in the tos register
+}
+
+
+void TemplateTable::iconst(int value) {
+  transition(vtos, itos);
+  __ mov_slow(R0_tos, value);  // 'value' is fixed at code-generation time and embedded in the template
+}
+
+
+void TemplateTable::lconst(int value) {
+  transition(vtos, ltos);
+  assert((value == 0) || (value == 1), "unexpected long constant");
+  __ mov(R0_tos, value);  // on 32-bit ARM this is the low word of the long
+#ifndef AARCH64
+  __ mov(R1_tos_hi, 0);   // high word is zero for both permitted constants
+#endif // !AARCH64
+}
+
+
+void TemplateTable::fconst(int value) {
+  transition(vtos, ftos);
+#ifdef AARCH64
+  switch(value) {
+  case 0:   __ fmov_sw(S0_tos, ZR);    break;  // 0.0f: copy the zero register
+  case 1:   __ fmov_s (S0_tos, 0x70);  break;  // presumably the 8-bit FMOV immediate encoding of 1.0f - confirm
+  case 2:   __ fmov_s (S0_tos, 0x00);  break;  // presumably the 8-bit FMOV immediate encoding of 2.0f - confirm
+  default:  ShouldNotReachHere();      break;
+  }
+#else
+  const int zero = 0;         // 0.0f
+  const int one = 0x3f800000; // 1.0f
+  const int two = 0x40000000; // 2.0f
+
+  switch(value) {
+  case 0:   __ mov(R0_tos, zero);   break;
+  case 1:   __ mov(R0_tos, one);    break;
+  case 2:   __ mov(R0_tos, two);    break;
+  default:  ShouldNotReachHere();   break;
+  }
+
+#ifndef __SOFTFP__
+  __ fmsr(S0_tos, R0_tos);  // with hardware FP the bit pattern is then moved into S0_tos
+#endif // !__SOFTFP__
+#endif // AARCH64
+}
+
+
+void TemplateTable::dconst(int value) {
+  transition(vtos, dtos);
+#ifdef AARCH64
+  switch(value) {
+  case 0:   __ fmov_dx(D0_tos, ZR);    break;  // 0.0: copy the zero register
+  case 1:   __ fmov_d (D0_tos, 0x70);  break;  // presumably the 8-bit FMOV immediate encoding of 1.0 - confirm
+  default:  ShouldNotReachHere();      break;
+  }
+#else
+  const int one_lo = 0;            // low part of 1.0
+  const int one_hi = 0x3ff00000;   // high part of 1.0
+
+  if (value == 0) {
+#ifdef __SOFTFP__
+    __ mov(R0_tos_lo, 0);
+    __ mov(R1_tos_hi, 0);
+#else
+    __ mov(R0_tmp, 0);
+    __ fmdrr(D0_tos, R0_tmp, R0_tmp);  // both halves of D0_tos come from the same zeroed register
+#endif // __SOFTFP__
+  } else if (value == 1) {
+    __ mov(R0_tos_lo, one_lo);
+    __ mov_slow(R1_tos_hi, one_hi);
+#ifndef __SOFTFP__
+    __ fmdrr(D0_tos, R0_tos_lo, R1_tos_hi);  // with hardware FP the lo/hi pair is then moved into D0_tos
+#endif // !__SOFTFP__
+  } else {
+    ShouldNotReachHere();
+  }
+#endif // AARCH64
+}
+
+
+void TemplateTable::bipush() {
+  transition(vtos, itos);
+  __ ldrsb(R0_tos, at_bcp(1));  // signed byte load of the operand at bcp + 1
+}
+
+
+void TemplateTable::sipush() {
+  transition(vtos, itos);
+  __ ldrsb(R0_tmp, at_bcp(1));  // first operand byte, loaded signed: becomes the high part
+  __ ldrb(R1_tmp, at_bcp(2));   // second operand byte, loaded unsigned: becomes the low part
+  __ orr(R0_tos, R1_tmp, AsmOperand(R0_tmp, lsl, BitsPerByte));  // result = (first << 8) | second
+}
+
+
+void TemplateTable::ldc(bool wide) {
+  transition(vtos, vtos);
+  Label fastCase, Done;
+
+  const Register Rindex = R1_tmp;
+  const Register Rcpool = R2_tmp;
+  const Register Rtags  = R3_tmp;
+  const Register RtagType = R3_tmp;  // same register as Rtags: the tag byte replaces the tags pointer
+
+  if (wide) {
+    __ get_unsigned_2_byte_index_at_bcp(Rindex, 1);
+  } else {
+    __ ldrb(Rindex, at_bcp(1));
+  }
+  __ get_cpool_and_tags(Rcpool, Rtags);
+
+  const int base_offset = ConstantPool::header_size() * wordSize;
+  const int tags_offset = Array<u1>::base_offset_in_bytes();
+
+  // get const type
+  __ add(Rtemp, Rtags, tags_offset);
+#ifdef AARCH64
+  __ add(Rtemp, Rtemp, Rindex);
+  __ ldarb(RtagType, Rtemp);  // TODO-AARCH64 figure out if barrier is needed here, or control dependency is enough
+#else
+  __ ldrb(RtagType, Address(Rtemp, Rindex));
+  volatile_barrier(MacroAssembler::LoadLoad, Rtemp);
+#endif // AARCH64
+
+  // unresolved class - get the resolved class
+  __ cmp(RtagType, JVM_CONSTANT_UnresolvedClass);
+
+  // unresolved class in error (resolution failed) - call into runtime
+  // so that the same error from first resolution attempt is thrown.
+#ifdef AARCH64
+  __ mov(Rtemp, JVM_CONSTANT_UnresolvedClassInError); // this constant does not fit into 5-bit immediate constraint
+  __ cond_cmp(RtagType, Rtemp, ne);
+#else
+  __ cond_cmp(RtagType, JVM_CONSTANT_UnresolvedClassInError, ne);
+#endif // AARCH64
+
+  // resolved class - need to call vm to get java mirror of the class
+  __ cond_cmp(RtagType, JVM_CONSTANT_Class, ne);
+
+  __ b(fastCase, ne);  // 'ne' here means the tag matched none of the three class tags tested above
+
+  // slow case - call runtime
+  __ mov(R1, wide);
+  call_VM(R0_tos, CAST_FROM_FN_PTR(address, InterpreterRuntime::ldc), R1);
+  __ push(atos);
+  __ b(Done);
+
+  // int, float (the debug check below accepts only these two tags)
+  __ bind(fastCase);
+#ifdef ASSERT
+  { Label L;
+    __ cmp(RtagType, JVM_CONSTANT_Integer);
+    __ cond_cmp(RtagType, JVM_CONSTANT_Float, ne);
+    __ b(L, eq);
+    __ stop("unexpected tag type in ldc");
+    __ bind(L);
+  }
+#endif // ASSERT
+  // itos, ftos
+  __ add(Rtemp, Rcpool, AsmOperand(Rindex, lsl, LogBytesPerWord));
+  __ ldr_u32(R0_tos, Address(Rtemp, base_offset));
+
+  // floats and ints are placed on stack in the same way, so
+  // we can use push(itos) to transfer float value without VFP
+  __ push(itos);
+  __ bind(Done);
+}
+
+// Fast path for caching oop constants: use the resolved-reference entry when it is non-null, otherwise call the runtime.
+void TemplateTable::fast_aldc(bool wide) {
+  transition(vtos, atos);
+  int index_size = wide ? sizeof(u2) : sizeof(u1);  // width of the operand read from the bytecode stream
+  Label resolved;
+
+  // We are resolved if the resolved reference cache entry contains a
+  // non-null object (CallSite, etc.)
+  assert_different_registers(R0_tos, R2_tmp);
+  __ get_index_at_bcp(R2_tmp, 1, R0_tos, index_size);
+  __ load_resolved_reference_at_index(R0_tos, R2_tmp);
+  __ cbnz(R0_tos, resolved);
+
+  address entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_ldc);
+
+  // first time invocation - must resolve first
+  __ mov(R1, (int)bytecode());  // the current bytecode is passed to resolve_ldc
+  __ call_VM(R0_tos, entry, R1);
+  __ bind(resolved);
+
+  if (VerifyOops) {
+    __ verify_oop(R0_tos);
+  }
+}
+
+void TemplateTable::ldc2_w() {
+  transition(vtos, vtos);
+  const Register Rtags  = R2_tmp;
+  const Register Rindex = R3_tmp;
+  const Register Rcpool = R4_tmp;
+  const Register Rbase  = R5_tmp;  // constant pool address plus the scaled index
+
+  __ get_unsigned_2_byte_index_at_bcp(Rindex, 1);
+
+  __ get_cpool_and_tags(Rcpool, Rtags);
+  const int base_offset = ConstantPool::header_size() * wordSize;
+  const int tags_offset = Array<u1>::base_offset_in_bytes();
+
+  __ add(Rbase, Rcpool, AsmOperand(Rindex, lsl, LogBytesPerWord));
+
+#ifdef __ABI_HARD__
+  Label Long, exit;  // only this configuration distinguishes double from long by tag
+  // get type from tags
+  __ add(Rtemp, Rtags, tags_offset);
+  __ ldrb(Rtemp, Address(Rtemp, Rindex));
+  __ cmp(Rtemp, JVM_CONSTANT_Double);
+  __ b(Long, ne);
+  __ ldr_double(D0_tos, Address(Rbase, base_offset));
+
+  __ push(dtos);
+  __ b(exit);
+  __ bind(Long);
+#endif
+
+#ifdef AARCH64
+  __ ldr(R0_tos, Address(Rbase, base_offset));
+#else
+  __ ldr(R0_tos_lo, Address(Rbase, base_offset + 0 * wordSize));
+  __ ldr(R1_tos_hi, Address(Rbase, base_offset + 1 * wordSize));
+#endif // AARCH64
+  __ push(ltos);  // without __ABI_HARD__ every constant, double included, takes this path
+
+#ifdef __ABI_HARD__
+  __ bind(exit);
+#endif
+}
+
+
+void TemplateTable::locals_index(Register reg, int offset) {  // loads the unsigned byte at bcp + offset into reg
+  __ ldrb(reg, at_bcp(offset));
+}
+
+void TemplateTable::iload() {
+  iload_internal();  // uses iload_internal's default RewriteControl
+}
+
+void TemplateTable::nofast_iload() {
+  iload_internal(may_not_rewrite);  // same load, but the bytecode-rewriting prologue is not emitted
+}
+
+void TemplateTable::iload_internal(RewriteControl rc) {
+  transition(vtos, itos);
+
+  if ((rc == may_rewrite) && __ rewrite_frequent_pairs()) {
+    Label rewrite, done;
+    const Register next_bytecode = R1_tmp;
+    const Register target_bytecode = R2_tmp;
+
+    // get next byte
+    __ ldrb(next_bytecode, at_bcp(Bytecodes::length_for(Bytecodes::_iload)));
+    // if _iload, wait to rewrite to iload2.  We only want to rewrite the
+    // last two iloads in a pair.  Comparing against fast_iload means that
+    // the next bytecode is neither an iload or a caload, and therefore
+    // an iload pair.
+    __ cmp(next_bytecode, Bytecodes::_iload);
+    __ b(done, eq);
+
+    __ cmp(next_bytecode, Bytecodes::_fast_iload);
+    __ mov(target_bytecode, Bytecodes::_fast_iload2);  // emitted between cmp and branch; assumes mov leaves the flags intact
+    __ b(rewrite, eq);
+
+    // if _caload, rewrite to fast_icaload
+    __ cmp(next_bytecode, Bytecodes::_caload);
+    __ mov(target_bytecode, Bytecodes::_fast_icaload);
+    __ b(rewrite, eq);
+
+    // rewrite so iload doesn't check again.
+    __ mov(target_bytecode, Bytecodes::_fast_iload);
+
+    // rewrite
+    // R2: fast bytecode
+    __ bind(rewrite);
+    patch_bytecode(Bytecodes::_iload, target_bytecode, Rtemp, false);  // false: target_bytecode already holds the bytecode to write
+    __ bind(done);
+  }
+
+  // Get the local value into tos
+  const Register Rlocal_index = R1_tmp;
+  locals_index(Rlocal_index);
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ ldr_s32(R0_tos, local);
+}
+
+
+void TemplateTable::fast_iload2() {
+  transition(vtos, itos);
+  const Register Rlocal_index = R1_tmp;
+
+  locals_index(Rlocal_index);  // index operand of the first iload
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ ldr_s32(R0_tos, local);
+  __ push(itos);               // first value goes onto the expression stack
+
+  locals_index(Rlocal_index, 3);  // index operand of the second iload, read at bcp + 3
+  local = load_iaddress(Rlocal_index, Rtemp);
+  __ ldr_s32(R0_tos, local);   // second value stays in tos
+}
+
+void TemplateTable::fast_iload() {
+  transition(vtos, itos);
+  const Register Rlocal_index = R1_tmp;
+
+  locals_index(Rlocal_index);  // read the local index operand
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ ldr_s32(R0_tos, local);   // load the int local into tos
+}
+
+
+void TemplateTable::lload() {
+  transition(vtos, ltos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index(Rlocal_index);
+  load_category2_local(Rlocal_index, R3_tmp);  // R3_tmp is the scratch for the local's base address
+}
+
+
+void TemplateTable::fload() {
+  transition(vtos, ftos);
+  const Register Rlocal_index = R2_tmp;
+
+  // Get the local value into tos
+  locals_index(Rlocal_index);
+  Address local = load_faddress(Rlocal_index, Rtemp);
+#ifdef __SOFTFP__
+  __ ldr(R0_tos, local);        // soft-float: the float lives in a core register
+#else
+  __ ldr_float(S0_tos, local);  // otherwise it is loaded into S0_tos
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::dload() {
+  transition(vtos, dtos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index(Rlocal_index);
+
+#ifdef __SOFTFP__
+  load_category2_local(Rlocal_index, R3_tmp);  // soft-float: load the two words exactly as for a long
+#else
+  __ ldr_double(D0_tos, load_daddress(Rlocal_index, Rtemp));
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::aload() {
+  transition(vtos, atos);
+  const Register Rlocal_index = R1_tmp;
+
+  locals_index(Rlocal_index);  // read the local index operand
+  Address local = load_aaddress(Rlocal_index, Rtemp);
+  __ ldr(R0_tos, local);       // load the reference local into tos
+}
+
+
+void TemplateTable::locals_index_wide(Register reg) {
+  assert_different_registers(reg, Rtemp);
+  __ ldrb(Rtemp, at_bcp(2));  // byte at bcp + 2 becomes the high part of the index
+  __ ldrb(reg, at_bcp(3));    // byte at bcp + 3 becomes the low part
+  __ orr(reg, reg, AsmOperand(Rtemp, lsl, 8));  // reg = (high << 8) | low
+}
+
+
+void TemplateTable::wide_iload() {
+  transition(vtos, itos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index_wide(Rlocal_index);  // 16-bit local index assembled from two operand bytes
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ ldr_s32(R0_tos, local);
+}
+
+
+void TemplateTable::wide_lload() {
+  transition(vtos, ltos);
+  const Register Rlocal_index = R2_tmp;  // 16-bit local index assembled from two operand bytes
+  const Register Rlocal_base = R3_tmp;   // scratch that receives the local's base address
+
+  locals_index_wide(Rlocal_index);
+  load_category2_local(Rlocal_index, Rlocal_base);  // pass the named alias; it was previously declared but unused
+}
+
+
+void TemplateTable::wide_fload() {
+  transition(vtos, ftos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index_wide(Rlocal_index);  // 16-bit local index assembled from two operand bytes
+  Address local = load_faddress(Rlocal_index, Rtemp);
+#ifdef __SOFTFP__
+  __ ldr(R0_tos, local);        // soft-float: the float lives in a core register
+#else
+  __ ldr_float(S0_tos, local);  // otherwise it is loaded into S0_tos
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::wide_dload() {
+  transition(vtos, dtos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index_wide(Rlocal_index);  // 16-bit local index assembled from two operand bytes
+#ifdef __SOFTFP__
+  load_category2_local(Rlocal_index, R3_tmp);  // soft-float: load the two words exactly as for a long
+#else
+  __ ldr_double(D0_tos, load_daddress(Rlocal_index, Rtemp));
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::wide_aload() {
+  transition(vtos, atos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index_wide(Rlocal_index);  // 16-bit local index assembled from two operand bytes
+  Address local = load_aaddress(Rlocal_index, Rtemp);
+  __ ldr(R0_tos, local);
+}
+
+void TemplateTable::index_check(Register array, Register index) {
+  // Pop the array reference from the expression stack into 'array', then run the checks
+  __ pop_ptr(array);
+  index_check_without_pop(array, index);
+}
+
+void TemplateTable::index_check_without_pop(Register array, Register index) {
+  assert_different_registers(array, index, Rtemp);
+  // check array
+  __ null_check(array, Rtemp, arrayOopDesc::length_offset_in_bytes());
+  // check index against the array length using the 'hs' condition
+  __ ldr_s32(Rtemp, Address(array, arrayOopDesc::length_offset_in_bytes()));
+  __ cmp_32(index, Rtemp);
+  if (index != R4_ArrayIndexOutOfBounds_index) {
+    // convention with generate_ArrayIndexOutOfBounds_handler()
+    __ mov(R4_ArrayIndexOutOfBounds_index, index, hs);  // executed only on the failing ('hs') path
+  }
+  __ b(Interpreter::_throw_ArrayIndexOutOfBoundsException_entry, hs);
+}
+
+
+void TemplateTable::iaload() {
+  transition(itos, itos);
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R0_tos;  // the index is the incoming tos value
+
+  index_check(Rarray, Rindex);     // pops the array into Rarray, then null- and bounds-checks
+  __ ldr_s32(R0_tos, get_array_elem_addr(T_INT, Rarray, Rindex, Rtemp));
+}
+
+
+void TemplateTable::laload() {
+  transition(itos, ltos);
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R0_tos;  // the index is the incoming tos value
+
+  index_check(Rarray, Rindex);     // pops the array into Rarray, then null- and bounds-checks
+
+#ifdef AARCH64
+  __ ldr(R0_tos, get_array_elem_addr(T_LONG, Rarray, Rindex, Rtemp));
+#else
+  __ add(Rtemp, Rarray, AsmOperand(Rindex, lsl, LogBytesPerLong));   // Rtemp = array + index * 8
+  __ add(Rtemp, Rtemp, arrayOopDesc::base_offset_in_bytes(T_LONG));  // skip the array header
+  __ ldmia(Rtemp, RegisterSet(R0_tos_lo, R1_tos_hi));                // load both words with one instruction
+#endif // AARCH64
+}
+
+
+void TemplateTable::faload() {
+  transition(itos, ftos);
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R0_tos;  // the index is the incoming tos value
+
+  index_check(Rarray, Rindex);     // pops the array into Rarray, then null- and bounds-checks
+
+  Address addr = get_array_elem_addr(T_FLOAT, Rarray, Rindex, Rtemp);
+#ifdef __SOFTFP__
+  __ ldr(R0_tos, addr);            // soft-float: the float lives in a core register
+#else
+  __ ldr_float(S0_tos, addr);
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::daload() {
+  transition(itos, dtos);
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R0_tos;  // the index is the incoming tos value
+
+  index_check(Rarray, Rindex);     // pops the array into Rarray, then null- and bounds-checks
+
+#ifdef __SOFTFP__
+  __ add(Rtemp, Rarray, AsmOperand(Rindex, lsl, LogBytesPerLong));     // Rtemp = array + index * 8
+  __ add(Rtemp, Rtemp, arrayOopDesc::base_offset_in_bytes(T_DOUBLE));  // skip the array header
+  __ ldmia(Rtemp, RegisterSet(R0_tos_lo, R1_tos_hi));                  // load both words with one instruction
+#else
+  __ ldr_double(D0_tos, get_array_elem_addr(T_DOUBLE, Rarray, Rindex, Rtemp));
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::aaload() {
+  transition(itos, atos);
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R0_tos;  // the index is the incoming tos value
+
+  index_check(Rarray, Rindex);     // pops the array into Rarray, then null- and bounds-checks
+  __ load_heap_oop(R0_tos, get_array_elem_addr(T_OBJECT, Rarray, Rindex, Rtemp));
+}
+
+
+void TemplateTable::baload() {
+  transition(itos, itos);
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R0_tos;  // the index is the incoming tos value
+
+  index_check(Rarray, Rindex);     // pops the array into Rarray, then null- and bounds-checks
+  __ ldrsb(R0_tos, get_array_elem_addr(T_BYTE, Rarray, Rindex, Rtemp));  // signed byte load
+}
+
+
+void TemplateTable::caload() {
+  transition(itos, itos);
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R0_tos;  // the index is the incoming tos value
+
+  index_check(Rarray, Rindex);     // pops the array into Rarray, then null- and bounds-checks
+  __ ldrh(R0_tos, get_array_elem_addr(T_CHAR, Rarray, Rindex, Rtemp));  // unsigned halfword load
+}
+
+
+// iload followed by caload frequent pair
+void TemplateTable::fast_icaload() {
+  transition(vtos, itos);
+  const Register Rlocal_index = R1_tmp;  // shares R1_tmp with Rarray: the local index is dead once Rindex is loaded
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R4_tmp; // index_check prefers index on R4
+  assert_different_registers(Rlocal_index, Rindex);
+  assert_different_registers(Rarray, Rindex);
+
+  // load index out of locals
+  locals_index(Rlocal_index);
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ ldr_s32(Rindex, local);
+
+  // get array element
+  index_check(Rarray, Rindex);  // pops the array into Rarray (overwriting the local index), then checks
+  __ ldrh(R0_tos, get_array_elem_addr(T_CHAR, Rarray, Rindex, Rtemp));
+}
+
+
+void TemplateTable::saload() {
+  transition(itos, itos);
+  const Register Rarray = R1_tmp;
+  const Register Rindex = R0_tos;  // the index is the incoming tos value
+
+  index_check(Rarray, Rindex);     // pops the array into Rarray, then null- and bounds-checks
+  __ ldrsh(R0_tos, get_array_elem_addr(T_SHORT, Rarray, Rindex, Rtemp));  // signed halfword load
+}
+
+
+void TemplateTable::iload(int n) {
+  transition(vtos, itos);
+  __ ldr_s32(R0_tos, iaddress(n));  // n is fixed at code-generation time, so the address is a constant offset from Rlocals
+}
+
+
+void TemplateTable::lload(int n) {  // lload with a fixed local slot n known at generation time
+  transition(vtos, ltos);
+#ifdef AARCH64
+  __ ldr(R0_tos, laddress(n));  // one register holds the whole long
+#else
+  __ ldr(R0_tos_lo, laddress(n));  // 32-bit ARM: the long is split across the lo/hi tos registers
+  __ ldr(R1_tos_hi, haddress(n));
+#endif // AARCH64
+}
+
+
+void TemplateTable::fload(int n) {  // fload with a fixed local slot n known at generation time
+  transition(vtos, ftos);
+#ifdef __SOFTFP__
+  __ ldr(R0_tos, faddress(n));  // soft-float build: the float value is kept in a core register
+#else
+  __ ldr_float(S0_tos, faddress(n));  // otherwise it is loaded into the FP register S0_tos
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::dload(int n) {  // dload with a fixed local slot n known at generation time
+  transition(vtos, dtos);
+#ifdef __SOFTFP__
+  __ ldr(R0_tos_lo, laddress(n));  // soft-float build: the double is kept in the lo/hi core register pair
+  __ ldr(R1_tos_hi, haddress(n));
+#else
+  __ ldr_double(D0_tos, daddress(n));  // otherwise it is loaded into the FP register D0_tos
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::aload(int n) {  // aload with a fixed local slot n known at generation time
+  transition(vtos, atos);
+  __ ldr(R0_tos, aaddress(n));  // load the reference in local n into R0_tos
+}
+
+void TemplateTable::aload_0() {  // aload_0 template entry point
+  aload_0_internal();  // uses the default RewriteControl argument (declared outside this chunk)
+}
+
+void TemplateTable::nofast_aload_0() {  // aload_0 variant that never rewrites the bytecode
+  aload_0_internal(may_not_rewrite);  // with may_not_rewrite the rewriting block in aload_0_internal is skipped
+}
+
+void TemplateTable::aload_0_internal(RewriteControl rc) {  // shared body of aload_0 / nofast_aload_0
+  transition(vtos, atos);
+  // According to bytecode histograms, the pairs:
+  //
+  // _aload_0, _fast_igetfield
+  // _aload_0, _fast_agetfield
+  // _aload_0, _fast_fgetfield
+  //
+  // occur frequently. If RewriteFrequentPairs is set, the (slow) _aload_0
+  // bytecode checks if the next bytecode is either _fast_igetfield,
+  // _fast_agetfield or _fast_fgetfield and then rewrites the
+  // current bytecode into a pair bytecode; otherwise it rewrites the current
+  // bytecode into _fast_aload_0 that doesn't do the pair check anymore.
+  //
+  // Note: If the next bytecode is _getfield, the rewrite must be delayed,
+  //       otherwise we may miss an opportunity for a pair.
+  //
+  // Also rewrite frequent pairs
+  //   aload_0, aload_1
+  //   aload_0, iload_1
+  // These bytecodes with a small amount of code are most profitable to rewrite
+  if ((rc == may_rewrite) && __ rewrite_frequent_pairs()) {
+    Label rewrite, done;
+    const Register next_bytecode = R1_tmp;
+    const Register target_bytecode = R2_tmp;
+
+    // get next byte
+    __ ldrb(next_bytecode, at_bcp(Bytecodes::length_for(Bytecodes::_aload_0)));
+
+    // if _getfield then wait with rewrite
+    __ cmp(next_bytecode, Bytecodes::_getfield);
+    __ b(done, eq);
+
+    // if _fast_igetfield then rewrite to _fast_iaccess_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_iaccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ cmp(next_bytecode, Bytecodes::_fast_igetfield);
+    __ mov(target_bytecode, Bytecodes::_fast_iaccess_0);  // emitted between cmp and branch: relies on mov leaving the flags intact
+    __ b(rewrite, eq);
+
+    // if _fast_agetfield then rewrite to _fast_aaccess_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_aaccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    __ cmp(next_bytecode, Bytecodes::_fast_agetfield);
+    __ mov(target_bytecode, Bytecodes::_fast_aaccess_0);  // same cmp / mov / conditional-branch pattern as above
+    __ b(rewrite, eq);
+
+    // if _fast_fgetfield then rewrite to _fast_faccess_0, else rewrite to _fast_aload_0
+    assert(Bytecodes::java_code(Bytecodes::_fast_faccess_0) == Bytecodes::_aload_0, "fix bytecode definition");
+    assert(Bytecodes::java_code(Bytecodes::_fast_aload_0) == Bytecodes::_aload_0, "fix bytecode definition");
+
+    __ cmp(next_bytecode, Bytecodes::_fast_fgetfield);
+#ifdef AARCH64
+    __ mov(Rtemp, Bytecodes::_fast_faccess_0);
+    __ mov(target_bytecode, Bytecodes::_fast_aload_0);
+    __ mov(target_bytecode, Rtemp, eq);  // conditional register move selects _fast_faccess_0 on 'eq'
+#else
+    __ mov(target_bytecode, Bytecodes::_fast_faccess_0, eq);  // 32-bit ARM: two conditionally executed immediate moves
+    __ mov(target_bytecode, Bytecodes::_fast_aload_0, ne);
+#endif // AARCH64
+
+    // rewrite (no branch precedes this bind, so the final case above falls through to here)
+    __ bind(rewrite);
+    patch_bytecode(Bytecodes::_aload_0, target_bytecode, Rtemp, false);  // NOTE(review): meaning of the trailing 'false' is not visible here
+
+    __ bind(done);
+  }
+
+  aload(0);  // in every case, emit the plain load of local 0
+}
+
+void TemplateTable::istore() {  // istore template: store the int in R0_tos into a local variable
+  transition(itos, vtos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index(Rlocal_index);  // presumably reads the local index operand of the bytecode -- TODO confirm
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ str_32(R0_tos, local);
+}
+
+
+void TemplateTable::lstore() {  // lstore template: store the long tos value into a local variable
+  transition(ltos, vtos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index(Rlocal_index);
+  store_category2_local(Rlocal_index, R3_tmp);  // helper defined outside this chunk; R3_tmp is handed over as a second register
+}
+
+
+void TemplateTable::fstore() {  // fstore template: store the float tos value into a local variable
+  transition(ftos, vtos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index(Rlocal_index);
+  Address local = load_faddress(Rlocal_index, Rtemp);
+#ifdef __SOFTFP__
+  __ str(R0_tos, local);  // soft-float build: the value lives in a core register
+#else
+  __ str_float(S0_tos, local);  // otherwise it lives in the FP register S0_tos
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::dstore() {  // dstore template: store the double tos value into a local variable
+  transition(dtos, vtos);
+  const Register Rlocal_index = R2_tmp;
+
+  locals_index(Rlocal_index);
+  // soft-float builds reuse the two-slot store helper that lstore() uses; otherwise store D0_tos directly
+#ifdef __SOFTFP__
+  store_category2_local(Rlocal_index, R3_tmp);
+#else
+  __ str_double(D0_tos, load_daddress(Rlocal_index, Rtemp));
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::astore() {  // astore template: pop a reference and store it into a local variable
+  transition(vtos, vtos);  // declared vtos -> vtos: the value is popped explicitly below
+  const Register Rlocal_index = R1_tmp;
+
+  __ pop_ptr(R0_tos);
+  locals_index(Rlocal_index);
+  Address local = load_aaddress(Rlocal_index, Rtemp);
+  __ str(R0_tos, local);
+}
+
+
+void TemplateTable::wide_istore() {  // wide istore: pop an int and store it into a local selected by a wide index
+  transition(vtos, vtos);  // declared vtos -> vtos: the value is popped explicitly below
+  const Register Rlocal_index = R2_tmp;
+
+  __ pop_i(R0_tos);
+  locals_index_wide(Rlocal_index);  // wide counterpart of locals_index (defined outside this chunk)
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ str_32(R0_tos, local);
+}
+
+
+void TemplateTable::wide_lstore() {  // wide lstore: pop a long and store it into a local selected by a wide index
+  transition(vtos, vtos);  // declared vtos -> vtos: the value is popped explicitly below
+  const Register Rlocal_index = R2_tmp;
+  const Register Rlocal_base = R3_tmp;
+
+#ifdef AARCH64
+  __ pop_l(R0_tos);
+#else
+  __ pop_l(R0_tos_lo, R1_tos_hi);
+#endif // AARCH64
+
+  locals_index_wide(Rlocal_index);
+  store_category2_local(Rlocal_index, Rlocal_base);  // Rlocal_base is R3_tmp; the alias was previously declared but unused
+}
+
+
+void TemplateTable::wide_fstore() {  // wide fstore shares the wide istore template
+  wide_istore();  // wide_istore pops with pop_i and stores with str_32, i.e. it moves the 32-bit value unchanged
+}
+
+
+void TemplateTable::wide_dstore() {  // wide dstore shares the wide lstore template
+  wide_lstore();  // wide_lstore pops with pop_l and stores through store_category2_local
+}
+
+
+void TemplateTable::wide_astore() {  // wide astore: pop a reference and store it into a local selected by a wide index
+  transition(vtos, vtos);  // declared vtos -> vtos: the value is popped explicitly below
+  const Register Rlocal_index = R2_tmp;
+
+  __ pop_ptr(R0_tos);
+  locals_index_wide(Rlocal_index);
+  Address local = load_aaddress(Rlocal_index, Rtemp);
+  __ str(R0_tos, local);
+}
+
+
+void TemplateTable::iastore() {  // iastore template: store the int in R0_tos into an int array element
+  transition(itos, vtos);
+  const Register Rindex = R4_tmp; // index_check prefers index in R4
+  const Register Rarray = R3_tmp;
+  // R0_tos: value
+
+  __ pop_i(Rindex);  // the index sits below the value on the expression stack
+  index_check(Rarray, Rindex);  // Rarray is not written here beforehand; presumably loaded by index_check -- TODO confirm
+  __ str_32(R0_tos, get_array_elem_addr(T_INT, Rarray, Rindex, Rtemp));
+}
+
+
+void TemplateTable::lastore() {  // lastore template: store the long tos value into a long array element
+  transition(ltos, vtos);
+  const Register Rindex = R4_tmp; // index_check prefers index in R4
+  const Register Rarray = R3_tmp;
+  // R0_tos_lo:R1_tos_hi: value
+
+  __ pop_i(Rindex);
+  index_check(Rarray, Rindex);
+  // AArch64 stores one register; 32-bit ARM forms the element address by hand and stores the register pair
+#ifdef AARCH64
+  __ str(R0_tos, get_array_elem_addr(T_LONG, Rarray, Rindex, Rtemp));
+#else
+  __ add(Rtemp, Rarray, AsmOperand(Rindex, lsl, LogBytesPerLong));  // Rtemp = array + (index << LogBytesPerLong)
+  __ add(Rtemp, Rtemp, arrayOopDesc::base_offset_in_bytes(T_LONG));  // skip the array header
+  __ stmia(Rtemp, RegisterSet(R0_tos_lo, R1_tos_hi));  // store-multiple of both halves
+#endif // AARCH64
+}
+
+
+void TemplateTable::fastore() {  // fastore template: store the float tos value into a float array element
+  transition(ftos, vtos);
+  const Register Rindex = R4_tmp; // index_check prefers index in R4
+  const Register Rarray = R3_tmp;
+  // S0_tos/R0_tos: value
+
+  __ pop_i(Rindex);
+  index_check(Rarray, Rindex);
+  Address addr = get_array_elem_addr(T_FLOAT, Rarray, Rindex, Rtemp);
+  // the value register depends on the float ABI of the build
+#ifdef __SOFTFP__
+  __ str(R0_tos, addr);
+#else
+  __ str_float(S0_tos, addr);
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::dastore() {  // dastore template: store the double tos value into a double array element
+  transition(dtos, vtos);
+  const Register Rindex = R4_tmp; // index_check prefers index in R4
+  const Register Rarray = R3_tmp;
+  // D0_tos / R0_tos_lo:R1_tos_hi: value
+
+  __ pop_i(Rindex);
+  index_check(Rarray, Rindex);
+  // soft-float: form the element address by hand and store the core register pair; otherwise store D0_tos
+#ifdef __SOFTFP__
+  __ add(Rtemp, Rarray, AsmOperand(Rindex, lsl, LogBytesPerLong));  // LogBytesPerLong is used as the element scale for the double
+  __ add(Rtemp, Rtemp, arrayOopDesc::base_offset_in_bytes(T_DOUBLE));
+  __ stmia(Rtemp, RegisterSet(R0_tos_lo, R1_tos_hi));
+#else
+  __ str_double(D0_tos, get_array_elem_addr(T_DOUBLE, Rarray, Rindex, Rtemp));
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::aastore() {  // aastore template: type-checked store of a reference into an object array
+  transition(vtos, vtos);  // all three operands stay on the stack until the final pop at 'done'
+  Label is_null, throw_array_store, done;
+
+  const Register Raddr_1   = R1_tmp;
+  const Register Rvalue_2  = R2_tmp;
+  const Register Rarray_3  = R3_tmp;
+  const Register Rindex_4  = R4_tmp;   // preferred by index_check_without_pop()
+  const Register Rsub_5    = R5_tmp;
+  const Register Rsuper_LR = LR_tmp;
+
+  // stack: ..., array, index, value
+  __ ldr(Rvalue_2, at_tos());     // Value
+  __ ldr_s32(Rindex_4, at_tos_p1());  // Index
+  __ ldr(Rarray_3, at_tos_p2());  // Array
+
+  index_check_without_pop(Rarray_3, Rindex_4);
+
+  // Compute the array base
+  __ add(Raddr_1, Rarray_3, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
+
+  // do array store check - check for NULL value first
+  __ cbz(Rvalue_2, is_null);
+
+  // Load subklass
+  __ load_klass(Rsub_5, Rvalue_2);
+  // Load superklass
+  __ load_klass(Rtemp, Rarray_3);
+  __ ldr(Rsuper_LR, Address(Rtemp, ObjArrayKlass::element_klass_offset()));
+  // R3_tmp passed below is Rarray_3; Rarray_3 is not read again after this call (the base is already in Raddr_1)
+  __ gen_subtype_check(Rsub_5, Rsuper_LR, throw_array_store, R0_tmp, R3_tmp);
+  // Come here on success
+
+  // Store value
+  __ add(Raddr_1, Raddr_1, AsmOperand(Rindex_4, lsl, LogBytesPerHeapOop));
+
+  // Now store using the appropriate barrier
+  do_oop_store(_masm, Raddr_1, Rvalue_2, Rtemp, R0_tmp, R3_tmp, _bs->kind(), true, false);
+  __ b(done);
+
+  __ bind(throw_array_store);
+
+  // Come here on failure of subtype check
+  __ profile_typecheck_failed(R0_tmp);
+
+  // object is at TOS
+  __ b(Interpreter::_throw_ArrayStoreException_entry);
+
+  // Have a NULL in Rvalue_2, store NULL at array[index].
+  __ bind(is_null);
+  __ profile_null_seen(R0_tmp);
+
+  // Store a NULL (this path did not scale the index into Raddr_1, so an indexed address is used; last argument is true here, false above)
+  do_oop_store(_masm, Address::indexed_oop(Raddr_1, Rindex_4), Rvalue_2, Rtemp, R0_tmp, R3_tmp, _bs->kind(), true, true);
+
+  // Pop stack arguments
+  __ bind(done);
+  __ add(Rstack_top, Rstack_top, 3 * Interpreter::stackElementSize);
+}
+
+
+void TemplateTable::bastore() {  // bastore template: store a byte (or boolean) into an array element
+  transition(itos, vtos);
+  const Register Rindex = R4_tmp; // index_check prefers index in R4
+  const Register Rarray = R3_tmp;
+  // R0_tos: value
+
+  __ pop_i(Rindex);
+  index_check(Rarray, Rindex);
+
+  // Need to check whether array is boolean or byte
+  // since both types share the bastore bytecode.
+  __ load_klass(Rtemp, Rarray);
+  __ ldr_u32(Rtemp, Address(Rtemp, Klass::layout_helper_offset()));
+  Label L_skip;
+  __ tst(Rtemp, Klass::layout_helper_boolean_diffbit());
+  __ b(L_skip, eq);  // diffbit clear: skip the masking
+  __ and_32(R0_tos, R0_tos, 1); // if it is a T_BOOLEAN array, mask the stored value to 0/1
+  __ bind(L_skip);
+  __ strb(R0_tos, get_array_elem_addr(T_BYTE, Rarray, Rindex, Rtemp));  // Rtemp is reused as scratch after the layout-helper test
+}
+
+
+void TemplateTable::castore() {  // castore template: store the low halfword of R0_tos into a char array element
+  transition(itos, vtos);
+  const Register Rindex = R4_tmp; // index_check prefers index in R4
+  const Register Rarray = R3_tmp;
+  // R0_tos: value
+
+  __ pop_i(Rindex);
+  index_check(Rarray, Rindex);
+
+  __ strh(R0_tos, get_array_elem_addr(T_CHAR, Rarray, Rindex, Rtemp));  // strh: halfword store
+}
+
+
+void TemplateTable::sastore() {  // sastore template: reuses castore, whose element address is computed with T_CHAR
+  assert(arrayOopDesc::base_offset_in_bytes(T_CHAR) ==
+           arrayOopDesc::base_offset_in_bytes(T_SHORT),
+         "base offsets for char and short should be equal");
+  castore();  // valid only while the assertion above holds
+}
+
+
+void TemplateTable::istore(int n) {  // istore with a fixed local slot n known at generation time
+  transition(itos, vtos);
+  __ str_32(R0_tos, iaddress(n));  // 32-bit store of R0_tos into local n
+}
+
+
+void TemplateTable::lstore(int n) {  // lstore with a fixed local slot n known at generation time
+  transition(ltos, vtos);
+#ifdef AARCH64
+  __ str(R0_tos, laddress(n));  // one register holds the whole long
+#else
+  __ str(R0_tos_lo, laddress(n));  // 32-bit ARM: the two halves are stored separately
+  __ str(R1_tos_hi, haddress(n));
+#endif // AARCH64
+}
+
+
+void TemplateTable::fstore(int n) {  // fstore with a fixed local slot n known at generation time
+  transition(ftos, vtos);
+#ifdef __SOFTFP__
+  __ str(R0_tos, faddress(n));  // soft-float build: the value lives in a core register
+#else
+  __ str_float(S0_tos, faddress(n));  // otherwise it lives in the FP register S0_tos
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::dstore(int n) {  // dstore with a fixed local slot n known at generation time
+  transition(dtos, vtos);
+#ifdef __SOFTFP__
+  __ str(R0_tos_lo, laddress(n));  // soft-float build: the double is held in the lo/hi core register pair
+  __ str(R1_tos_hi, haddress(n));
+#else
+  __ str_double(D0_tos, daddress(n));  // otherwise it lives in the FP register D0_tos
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::astore(int n) {  // astore with a fixed local slot n known at generation time
+  transition(vtos, vtos);  // declared vtos -> vtos: the reference is popped explicitly
+  __ pop_ptr(R0_tos);
+  __ str(R0_tos, aaddress(n));
+}
+
+
+void TemplateTable::pop() {  // pop template: discard one expression-stack slot
+  transition(vtos, vtos);
+  __ add(Rstack_top, Rstack_top, Interpreter::stackElementSize);  // popping advances Rstack_top by one element size
+}
+
+
+void TemplateTable::pop2() {  // pop2 template: discard two expression-stack slots
+  transition(vtos, vtos);
+  __ add(Rstack_top, Rstack_top, 2*Interpreter::stackElementSize);  // advances Rstack_top by two element sizes
+}
+
+
+void TemplateTable::dup() {  // dup template: duplicate the top stack slot
+  transition(vtos, vtos);
+  // stack: ..., a
+  __ load_ptr(0, R0_tmp);  // slot 0 is the top of stack
+  __ push_ptr(R0_tmp);
+  // stack: ..., a, a
+}
+
+
+void TemplateTable::dup_x1() {  // dup_x1 template: copy the top slot and insert it beneath the second slot
+  transition(vtos, vtos);
+  // stack: ..., a, b
+  __ load_ptr(0, R0_tmp);  // load b
+  __ load_ptr(1, R2_tmp);  // load a
+  __ store_ptr(1, R0_tmp); // store b
+  __ store_ptr(0, R2_tmp); // store a
+  __ push_ptr(R0_tmp);     // push b
+  // stack: ..., b, a, b
+}
+
+
+void TemplateTable::dup_x2() {  // dup_x2 template: copy the top slot and insert it beneath the third slot
+  transition(vtos, vtos);
+  // stack: ..., a, b, c
+  __ load_ptr(0, R0_tmp);   // load c
+  __ load_ptr(1, R2_tmp);   // load b
+  __ load_ptr(2, R4_tmp);   // load a
+
+  __ push_ptr(R0_tmp);      // push c
+
+  // stack: ..., a, b, c, c   (slot numbers below are relative to the new top)
+  __ store_ptr(1, R2_tmp);  // store b
+  __ store_ptr(2, R4_tmp);  // store a
+  __ store_ptr(3, R0_tmp);  // store c
+  // stack: ..., c, a, b, c
+}
+
+
+void TemplateTable::dup2() {  // dup2 template: duplicate the top two stack slots
+  transition(vtos, vtos);
+  // stack: ..., a, b
+  __ load_ptr(1, R0_tmp);  // load a
+  __ push_ptr(R0_tmp);     // push a
+  __ load_ptr(1, R0_tmp);  // load b (after the push, b is at slot 1)
+  __ push_ptr(R0_tmp);     // push b
+  // stack: ..., a, b, a, b
+}
+
+
+void TemplateTable::dup2_x1() {  // dup2_x1 template: copy the top two slots and insert them beneath the third slot
+  transition(vtos, vtos);
+
+  // stack: ..., a, b, c
+  __ load_ptr(0, R4_tmp);  // load c
+  __ load_ptr(1, R2_tmp);  // load b
+  __ load_ptr(2, R0_tmp);  // load a
+
+  __ push_ptr(R2_tmp);     // push b
+  __ push_ptr(R4_tmp);     // push c
+
+  // stack: ..., a, b, c, b, c
+  // slot numbers below are relative to the new top after the two pushes
+  __ store_ptr(2, R0_tmp);  // store a
+  __ store_ptr(3, R4_tmp);  // store c
+  __ store_ptr(4, R2_tmp);  // store b
+
+  // stack: ..., b, c, a, b, c
+}
+
+
+void TemplateTable::dup2_x2() {  // dup2_x2 template: copy the top two slots and insert them beneath the fourth slot
+  transition(vtos, vtos);
+  // stack: ..., a, b, c, d
+  __ load_ptr(0, R0_tmp);  // load d
+  __ load_ptr(1, R2_tmp);  // load c
+  __ push_ptr(R2_tmp);     // push c
+  __ push_ptr(R0_tmp);     // push d
+  // stack: ..., a, b, c, d, c, d
+  __ load_ptr(4, R4_tmp);  // load b
+  __ store_ptr(4, R0_tmp); // store d in b
+  __ store_ptr(2, R4_tmp); // store b in d
+  // stack: ..., a, d, c, b, c, d
+  __ load_ptr(5, R4_tmp);  // load a
+  __ store_ptr(5, R2_tmp); // store c in a
+  __ store_ptr(3, R4_tmp); // store a in c
+  // stack: ..., c, d, a, b, c, d
+}
+
+
+void TemplateTable::swap() {  // swap template: exchange the top two stack slots
+  transition(vtos, vtos);
+  // stack: ..., a, b
+  __ load_ptr(1, R0_tmp);  // load a
+  __ load_ptr(0, R2_tmp);  // load b
+  __ store_ptr(0, R0_tmp); // store a in b
+  __ store_ptr(1, R2_tmp); // store b in a
+  // stack: ..., b, a
+}
+
+
+void TemplateTable::iop2(Operation op) {  // binary int operation: R0_tos = arg1 <op> arg2
+  transition(itos, itos);
+  const Register arg1 = R1_tmp;  // first operand, popped from the stack
+  const Register arg2 = R0_tos;  // second operand, already in the tos register
+
+  __ pop_i(arg1);
+  switch (op) {
+    case add  : __ add_32 (R0_tos, arg1, arg2); break;
+    case sub  : __ sub_32 (R0_tos, arg1, arg2); break;
+    case mul  : __ mul_32 (R0_tos, arg1, arg2); break;
+    case _and : __ and_32 (R0_tos, arg1, arg2); break;
+    case _or  : __ orr_32 (R0_tos, arg1, arg2); break;
+    case _xor : __ eor_32 (R0_tos, arg1, arg2); break;
+#ifdef AARCH64
+    case shl  : __ lslv_w (R0_tos, arg1, arg2); break;
+    case shr  : __ asrv_w (R0_tos, arg1, arg2); break;
+    case ushr : __ lsrv_w (R0_tos, arg1, arg2); break;
+#else
+    case shl  : __ andr(arg2, arg2, 0x1f); __ mov (R0_tos, AsmOperand(arg1, lsl, arg2)); break;  // 32-bit ARM: count is masked to 5 bits first
+    case shr  : __ andr(arg2, arg2, 0x1f); __ mov (R0_tos, AsmOperand(arg1, asr, arg2)); break;
+    case ushr : __ andr(arg2, arg2, 0x1f); __ mov (R0_tos, AsmOperand(arg1, lsr, arg2)); break;
+#endif // AARCH64
+    default   : ShouldNotReachHere();
+  }
+}
+
+
+void TemplateTable::lop2(Operation op) {  // binary long operation (add, sub, and, or, xor): tos = arg1 <op> arg2
+  transition(ltos, ltos);
+#ifdef AARCH64
+  const Register arg1 = R1_tmp;  // first operand, popped from the stack
+  const Register arg2 = R0_tos;  // second operand, already in the tos register
+
+  __ pop_l(arg1);
+  switch (op) {
+    case add  : __ add (R0_tos, arg1, arg2); break;
+    case sub  : __ sub (R0_tos, arg1, arg2); break;
+    case _and : __ andr(R0_tos, arg1, arg2); break;
+    case _or  : __ orr (R0_tos, arg1, arg2); break;
+    case _xor : __ eor (R0_tos, arg1, arg2); break;
+    default   : ShouldNotReachHere();
+  }
+#else
+  const Register arg1_lo = R2_tmp;
+  const Register arg1_hi = R3_tmp;
+  const Register arg2_lo = R0_tos_lo;
+  const Register arg2_hi = R1_tos_hi;
+  // 32-bit ARM: operate on the two halves; add/sub chain the carry from the low half into the high half (adds+adc, subs+sbc)
+  __ pop_l(arg1_lo, arg1_hi);
+  switch (op) {
+    case add : __ adds(R0_tos_lo, arg1_lo, arg2_lo); __ adc (R1_tos_hi, arg1_hi, arg2_hi); break;
+    case sub : __ subs(R0_tos_lo, arg1_lo, arg2_lo); __ sbc (R1_tos_hi, arg1_hi, arg2_hi); break;
+    case _and: __ andr(R0_tos_lo, arg1_lo, arg2_lo); __ andr(R1_tos_hi, arg1_hi, arg2_hi); break;
+    case _or : __ orr (R0_tos_lo, arg1_lo, arg2_lo); __ orr (R1_tos_hi, arg1_hi, arg2_hi); break;
+    case _xor: __ eor (R0_tos_lo, arg1_lo, arg2_lo); __ eor (R1_tos_hi, arg1_hi, arg2_hi); break;
+    default : ShouldNotReachHere();
+  }
+#endif // AARCH64
+}
+
+
+void TemplateTable::idiv() {  // idiv template: int division, dividend popped from the stack, divisor in R0_tos
+  transition(itos, itos);
+#ifdef AARCH64
+  const Register divisor = R0_tos;
+  const Register dividend = R1_tmp;
+
+  __ cbz_w(divisor, Interpreter::_throw_ArithmeticException_entry);  // zero divisor: branch to the exception entry
+  __ pop_i(dividend);
+  __ sdiv_w(R0_tos, dividend, divisor);
+#else
+  __ mov(R2, R0_tos);  // 32-bit ARM: no zero test is emitted here; NOTE(review): presumably handled inside the stub -- confirm
+  __ pop_i(R0);
+  // R0 - dividend
+  // R2 - divisor
+  __ call(StubRoutines::Arm::idiv_irem_entry(), relocInfo::none);
+  // R1 - result
+  __ mov(R0_tos, R1);
+#endif // AARCH64
+}
+
+
+void TemplateTable::irem() {  // irem template: int remainder, dividend popped from the stack, divisor in R0_tos
+  transition(itos, itos);
+#ifdef AARCH64
+  const Register divisor = R0_tos;
+  const Register dividend = R1_tmp;
+  const Register quotient = R2_tmp;
+
+  __ cbz_w(divisor, Interpreter::_throw_ArithmeticException_entry);  // zero divisor: branch to the exception entry
+  __ pop_i(dividend);
+  __ sdiv_w(quotient, dividend, divisor);
+  __ msub_w(R0_tos, divisor, quotient, dividend);  // remainder = dividend - divisor * quotient
+#else
+  __ mov(R2, R0_tos);  // 32-bit ARM: same stub as idiv; no zero test is emitted here (presumably done in the stub -- confirm)
+  __ pop_i(R0);
+  // R0 - dividend
+  // R2 - divisor
+  __ call(StubRoutines::Arm::idiv_irem_entry(), relocInfo::none);
+  // R0 - remainder
+#endif // AARCH64
+}
+
+
+void TemplateTable::lmul() {  // lmul template: long multiplication
+  transition(ltos, ltos);
+#ifdef AARCH64
+  const Register arg1 = R0_tos;
+  const Register arg2 = R1_tmp;
+
+  __ pop_l(arg2);
+  __ mul(R0_tos, arg1, arg2);
+#else
+  const Register arg1_lo = R0_tos_lo;
+  const Register arg1_hi = R1_tos_hi;
+  const Register arg2_lo = R2_tmp;
+  const Register arg2_hi = R3_tmp;
+
+  __ pop_l(arg2_lo, arg2_hi);
+  // 32-bit ARM: delegate to the runtime routine; no move of the result is emitted afterwards
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::lmul), arg1_lo, arg1_hi, arg2_lo, arg2_hi);
+#endif // AARCH64
+}
+
+
+void TemplateTable::ldiv() {  // ldiv template: long division, dividend popped from the stack, divisor in tos
+  transition(ltos, ltos);
+#ifdef AARCH64
+  const Register divisor = R0_tos;
+  const Register dividend = R1_tmp;
+
+  __ cbz(divisor, Interpreter::_throw_ArithmeticException_entry);  // zero divisor: branch to the exception entry
+  __ pop_l(dividend);
+  __ sdiv(R0_tos, dividend, divisor);
+#else
+  const Register x_lo = R2_tmp;  // x: value popped from the stack
+  const Register x_hi = R3_tmp;
+  const Register y_lo = R0_tos_lo;  // y: value in the tos registers; this is the one tested for zero
+  const Register y_hi = R1_tos_hi;
+
+  __ pop_l(x_lo, x_hi);
+
+  // check if y = 0 (orrs sets the Z flag only when both halves are zero)
+  __ orrs(Rtemp, y_lo, y_hi);
+  __ call(Interpreter::_throw_ArithmeticException_entry, relocInfo::none, eq);
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::ldiv), y_lo, y_hi, x_lo, x_hi);  // y is passed before x
+#endif // AARCH64
+}
+
+
+void TemplateTable::lrem() {  // lrem template: long remainder, dividend popped from the stack, divisor in tos
+  transition(ltos, ltos);
+#ifdef AARCH64
+  const Register divisor = R0_tos;
+  const Register dividend = R1_tmp;
+  const Register quotient = R2_tmp;
+
+  __ cbz(divisor, Interpreter::_throw_ArithmeticException_entry);  // zero divisor: branch to the exception entry
+  __ pop_l(dividend);
+  __ sdiv(quotient, dividend, divisor);
+  __ msub(R0_tos, divisor, quotient, dividend);  // remainder = dividend - divisor * quotient
+#else
+  const Register x_lo = R2_tmp;  // x: value popped from the stack
+  const Register x_hi = R3_tmp;
+  const Register y_lo = R0_tos_lo;  // y: value in the tos registers; this is the one tested for zero
+  const Register y_hi = R1_tos_hi;
+
+  __ pop_l(x_lo, x_hi);
+
+  // check if y = 0 (orrs sets the Z flag only when both halves are zero)
+  __ orrs(Rtemp, y_lo, y_hi);
+  __ call(Interpreter::_throw_ArithmeticException_entry, relocInfo::none, eq);
+  __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::lrem), y_lo, y_hi, x_lo, x_hi);  // y is passed before x
+#endif // AARCH64
+}
+
+
+void TemplateTable::lshl() {  // lshl template: long shift left
+  transition(itos, ltos);  // tos in is the int shift count; the long value is popped from the stack
+#ifdef AARCH64
+  const Register val = R1_tmp;
+  const Register shift_cnt = R0_tos;
+  __ pop_l(val);
+  __ lslv(R0_tos, val, shift_cnt);
+#else
+  const Register shift_cnt = R4_tmp;
+  const Register val_lo = R2_tmp;
+  const Register val_hi = R3_tmp;
+
+  __ pop_l(val_lo, val_hi);
+  __ andr(shift_cnt, R0_tos, 63);  // only the low 6 bits of the count are used
+  __ long_shift(R0_tos_lo, R1_tos_hi, val_lo, val_hi, lsl, shift_cnt);
+#endif // AARCH64
+}
+
+
+void TemplateTable::lshr() {  // lshr template: long arithmetic shift right
+  transition(itos, ltos);  // tos in is the int shift count; the long value is popped from the stack
+#ifdef AARCH64
+  const Register val = R1_tmp;
+  const Register shift_cnt = R0_tos;
+  __ pop_l(val);
+  __ asrv(R0_tos, val, shift_cnt);
+#else
+  const Register shift_cnt = R4_tmp;
+  const Register val_lo = R2_tmp;
+  const Register val_hi = R3_tmp;
+
+  __ pop_l(val_lo, val_hi);
+  __ andr(shift_cnt, R0_tos, 63);  // only the low 6 bits of the count are used
+  __ long_shift(R0_tos_lo, R1_tos_hi, val_lo, val_hi, asr, shift_cnt);
+#endif // AARCH64
+}
+
+
+void TemplateTable::lushr() {  // lushr template: long logical (unsigned) shift right
+  transition(itos, ltos);  // tos in is the int shift count; the long value is popped from the stack
+#ifdef AARCH64
+  const Register val = R1_tmp;
+  const Register shift_cnt = R0_tos;
+  __ pop_l(val);
+  __ lsrv(R0_tos, val, shift_cnt);
+#else
+  const Register shift_cnt = R4_tmp;
+  const Register val_lo = R2_tmp;
+  const Register val_hi = R3_tmp;
+
+  __ pop_l(val_lo, val_hi);
+  __ andr(shift_cnt, R0_tos, 63);  // only the low 6 bits of the count are used
+  __ long_shift(R0_tos_lo, R1_tos_hi, val_lo, val_hi, lsr, shift_cnt);
+#endif // AARCH64
+}
+
+
+void TemplateTable::fop2(Operation op) {  // binary float operation: tos = popped value <op> tos value
+  transition(ftos, ftos);
+#ifdef __SOFTFP__
+  __ mov(R1, R0_tos);  // soft-float: tos value becomes the second argument (R1)
+  __ pop_i(R0);        // popped value becomes the first argument (R0)
+  switch (op) {
+    case add: __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_fadd_glibc), R0, R1); break;
+    case sub: __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_fsub_glibc), R0, R1); break;
+    case mul: __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_fmul), R0, R1); break;
+    case div: __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_fdiv), R0, R1); break;
+    case rem: __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), R0, R1); break;
+    default : ShouldNotReachHere();
+  }
+#else
+  const FloatRegister arg1 = S1_tmp;  // first operand, popped from the stack
+  const FloatRegister arg2 = S0_tos;  // second operand, already in the tos register
+
+  switch (op) {
+    case add: __ pop_f(arg1); __ add_float(S0_tos, arg1, arg2); break;
+    case sub: __ pop_f(arg1); __ sub_float(S0_tos, arg1, arg2); break;
+    case mul: __ pop_f(arg1); __ mul_float(S0_tos, arg1, arg2); break;
+    case div: __ pop_f(arg1); __ div_float(S0_tos, arg1, arg2); break;
+    case rem:  // remainder always goes through SharedRuntime::frem; only the argument passing differs
+#ifndef __ABI_HARD__
+      __ pop_f(arg1);
+      __ fmrs(R0, arg1);  // without __ABI_HARD__: operands are copied to core registers for the call
+      __ fmrs(R1, arg2);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem), R0, R1);
+      __ fmsr(S0_tos, R0);  // and the result is copied back from R0
+#else
+      __ mov_float(S1_reg, arg2);  // with __ABI_HARD__: operands are placed in S0/S1_reg and the call takes no register arguments
+      __ pop_f(S0);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::frem));
+#endif // !__ABI_HARD__
+      break;
+    default : ShouldNotReachHere();
+  }
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::dop2(Operation op) {  // binary double operation: tos = popped value <op> tos value
+  transition(dtos, dtos);
+#ifdef __SOFTFP__
+  __ mov(R2, R0_tos_lo);  // soft-float: tos value becomes the second argument (R2:R3)
+  __ mov(R3, R1_tos_hi);
+  __ pop_l(R0, R1);       // popped value becomes the first argument (R0:R1)
+  switch (op) {
+    // __aeabi_XXXX_glibc: Imported code from glibc soft-fp bundle for calculation accuracy improvement. See CR 6757269.
+    case add: __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_dadd_glibc), R0, R1, R2, R3); break;
+    case sub: __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_dsub_glibc), R0, R1, R2, R3); break;
+    case mul: __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_dmul), R0, R1, R2, R3); break;
+    case div: __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_ddiv), R0, R1, R2, R3); break;
+    case rem: __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), R0, R1, R2, R3); break;
+    default : ShouldNotReachHere();
+  }
+#else
+  const FloatRegister arg1 = D1_tmp;  // first operand, popped from the stack
+  const FloatRegister arg2 = D0_tos;  // second operand, already in the tos register
+
+  switch (op) {
+    case add: __ pop_d(arg1); __ add_double(D0_tos, arg1, arg2); break;
+    case sub: __ pop_d(arg1); __ sub_double(D0_tos, arg1, arg2); break;
+    case mul: __ pop_d(arg1); __ mul_double(D0_tos, arg1, arg2); break;
+    case div: __ pop_d(arg1); __ div_double(D0_tos, arg1, arg2); break;
+    case rem:  // remainder always goes through SharedRuntime::drem; only the argument passing differs
+#ifndef __ABI_HARD__
+      __ pop_d(arg1);
+      __ fmrrd(R0, R1, arg1);  // without __ABI_HARD__: operands are copied to core register pairs for the call
+      __ fmrrd(R2, R3, arg2);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem), R0, R1, R2, R3);
+      __ fmdrr(D0_tos, R0, R1);  // and the result is copied back from R0:R1
+#else
+      __ mov_double(D1, arg2);  // with __ABI_HARD__: operands are placed in D0/D1 and the call takes no register arguments
+      __ pop_d(D0);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::drem));
+#endif // !__ABI_HARD__
+      break;
+    default : ShouldNotReachHere();
+  }
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::ineg() {  // ineg template: negate the int in R0_tos
+  transition(itos, itos);
+  __ neg_32(R0_tos, R0_tos);  // in-place 32-bit negate
+}
+
+
+void TemplateTable::lneg() {  // lneg template: negate the long tos value
+  transition(ltos, ltos);
+#ifdef AARCH64
+  __ neg(R0_tos, R0_tos);
+#else
+  __ rsbs(R0_tos_lo, R0_tos_lo, 0);  // low half: 0 - lo, setting the carry flag for the next instruction
+  __ rsc (R1_tos_hi, R1_tos_hi, 0);  // high half: 0 - hi with carry, completing the 64-bit negate
+#endif // AARCH64
+}
+
+
+void TemplateTable::fneg() {  // fneg template: negate the float tos value
+  transition(ftos, ftos);
+#ifdef __SOFTFP__
+  // Invert sign bit
+  const int sign_mask = 0x80000000;  // bit 31 of the 32-bit float held in R0_tos
+  __ eor(R0_tos, R0_tos, sign_mask);
+#else
+  __ neg_float(S0_tos, S0_tos);
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::dneg() {  // dneg template: negate the double tos value
+  transition(dtos, dtos);
+#ifdef __SOFTFP__
+  // Invert sign bit in the high part of the double
+  const int sign_mask_hi = 0x80000000;  // bit 31 of the high word; the low word is left untouched
+  __ eor(R1_tos_hi, R1_tos_hi, sign_mask_hi);
+#else
+  __ neg_double(D0_tos, D0_tos);
+#endif // __SOFTFP__
+}
+
+
+void TemplateTable::iinc() {  // iinc template: add a signed byte constant from the bytecode stream to an int local
+  transition(vtos, vtos);
+  const Register Rconst = R2_tmp;
+  const Register Rlocal_index = R1_tmp;
+  const Register Rval = R0_tmp;
+
+  __ ldrsb(Rconst, at_bcp(2));  // sign-extending load of the byte at bcp + 2
+  locals_index(Rlocal_index);
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ ldr_s32(Rval, local);
+  __ add(Rval, Rval, Rconst);
+  __ str_32(Rval, local);  // write the incremented value back to the same local
+}
+
+
+void TemplateTable::wide_iinc() {  // wide iinc: add a signed 16-bit constant from the bytecode stream to an int local
+  transition(vtos, vtos);
+  const Register Rconst = R2_tmp;
+  const Register Rlocal_index = R1_tmp;
+  const Register Rval = R0_tmp;
+
+  // get constant in Rconst
+  __ ldrsb(R2_tmp, at_bcp(4));  // byte at bcp + 4, sign-extended: becomes the upper byte
+  __ ldrb(R3_tmp, at_bcp(5));   // byte at bcp + 5, zero-extended: becomes the lower byte
+  __ orr(Rconst, R3_tmp, AsmOperand(R2_tmp, lsl, 8));  // Rconst (R2_tmp) = (upper << 8) | lower
+
+  locals_index_wide(Rlocal_index);
+  Address local = load_iaddress(Rlocal_index, Rtemp);
+  __ ldr_s32(Rval, local);
+  __ add(Rval, Rval, Rconst);
+  __ str_32(Rval, local);  // write the incremented value back to the same local
+}
+
+
+void TemplateTable::convert() {  // shared template for the primitive conversion bytecodes (i2l ... d2f), selected by bytecode()
+  // Checking (ASSERT builds only): derive the tos-in and tos-out states from the bytecode and pass them to transition()
+#ifdef ASSERT
+  { TosState tos_in  = ilgl;
+    TosState tos_out = ilgl;
+    switch (bytecode()) {
+      case Bytecodes::_i2l: // fall through
+      case Bytecodes::_i2f: // fall through
+      case Bytecodes::_i2d: // fall through
+      case Bytecodes::_i2b: // fall through
+      case Bytecodes::_i2c: // fall through
+      case Bytecodes::_i2s: tos_in = itos; break;
+      case Bytecodes::_l2i: // fall through
+      case Bytecodes::_l2f: // fall through
+      case Bytecodes::_l2d: tos_in = ltos; break;
+      case Bytecodes::_f2i: // fall through
+      case Bytecodes::_f2l: // fall through
+      case Bytecodes::_f2d: tos_in = ftos; break;
+      case Bytecodes::_d2i: // fall through
+      case Bytecodes::_d2l: // fall through
+      case Bytecodes::_d2f: tos_in = dtos; break;
+      default             : ShouldNotReachHere();
+    }
+    switch (bytecode()) {
+      case Bytecodes::_l2i: // fall through
+      case Bytecodes::_f2i: // fall through
+      case Bytecodes::_d2i: // fall through
+      case Bytecodes::_i2b: // fall through
+      case Bytecodes::_i2c: // fall through
+      case Bytecodes::_i2s: tos_out = itos; break;
+      case Bytecodes::_i2l: // fall through
+      case Bytecodes::_f2l: // fall through
+      case Bytecodes::_d2l: tos_out = ltos; break;
+      case Bytecodes::_i2f: // fall through
+      case Bytecodes::_l2f: // fall through
+      case Bytecodes::_d2f: tos_out = ftos; break;
+      case Bytecodes::_i2d: // fall through
+      case Bytecodes::_l2d: // fall through
+      case Bytecodes::_f2d: tos_out = dtos; break;
+      default             : ShouldNotReachHere();
+    }
+    transition(tos_in, tos_out);
+  }
+#endif // ASSERT
+
+  // Conversion: each case picks AArch64 instructions, 32-bit FP instructions, or a runtime call depending on the build macros
+  switch (bytecode()) {
+    case Bytecodes::_i2l:
+#ifdef AARCH64
+      __ sign_extend(R0_tos, R0_tos, 32);
+#else
+      __ mov(R1_tos_hi, AsmOperand(R0_tos, asr, BitsPerWord-1));  // high word = sign of the int replicated into every bit
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_i2f:
+#ifdef AARCH64
+      __ scvtf_sw(S0_tos, R0_tos);
+#else
+#ifdef __SOFTFP__
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_i2f), R0_tos);
+#else
+      __ fmsr(S0_tmp, R0_tos);  // move the int bits into an FP register, then convert there
+      __ fsitos(S0_tos, S0_tmp);
+#endif // __SOFTFP__
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_i2d:
+#ifdef AARCH64
+      __ scvtf_dw(D0_tos, R0_tos);
+#else
+#ifdef __SOFTFP__
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_i2d), R0_tos);
+#else
+      __ fmsr(S0_tmp, R0_tos);  // move the int bits into an FP register, then convert there
+      __ fsitod(D0_tos, S0_tmp);
+#endif // __SOFTFP__
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_i2b:
+      __ sign_extend(R0_tos, R0_tos, 8);  // keep the low 8 bits, sign-extended
+      break;
+
+    case Bytecodes::_i2c:
+      __ zero_extend(R0_tos, R0_tos, 16);  // keep the low 16 bits, zero-extended
+      break;
+
+    case Bytecodes::_i2s:
+      __ sign_extend(R0_tos, R0_tos, 16);  // keep the low 16 bits, sign-extended
+      break;
+
+    case Bytecodes::_l2i:
+      /* nothing to do */
+      break;
+
+    case Bytecodes::_l2f:
+#ifdef AARCH64
+      __ scvtf_sx(S0_tos, R0_tos);
+#else
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::l2f), R0_tos_lo, R1_tos_hi);
+#if !defined(__SOFTFP__) && !defined(__ABI_HARD__)
+      __ fmsr(S0_tos, R0);  // in this configuration the call's result in R0 is copied into the FP tos register
+#endif // !__SOFTFP__ && !__ABI_HARD__
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_l2d:
+#ifdef AARCH64
+      __ scvtf_dx(D0_tos, R0_tos);
+#else
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::l2d), R0_tos_lo, R1_tos_hi);
+#if !defined(__SOFTFP__) && !defined(__ABI_HARD__)
+      __ fmdrr(D0_tos, R0, R1);  // in this configuration the call's result in R0:R1 is copied into the FP tos register
+#endif // !__SOFTFP__ && !__ABI_HARD__
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_f2i:
+#ifdef AARCH64
+      __ fcvtzs_ws(R0_tos, S0_tos);
+#else
+#ifndef __SOFTFP__
+      __ ftosizs(S0_tos, S0_tos);  // convert in the FP register, then move the int bits to R0_tos
+      __ fmrs(R0_tos, S0_tos);
+#else
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2i), R0_tos);
+#endif // !__SOFTFP__
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_f2l:
+#ifdef AARCH64
+      __ fcvtzs_xs(R0_tos, S0_tos);
+#else
+#ifndef __SOFTFP__
+      __ fmrs(R0_tos, S0_tos);  // the runtime call below takes the float bits in a core register
+#endif // !__SOFTFP__
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::f2l), R0_tos);
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_f2d:
+#ifdef __SOFTFP__
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_f2d), R0_tos);
+#else
+      __ convert_f2d(D0_tos, S0_tos);
+#endif // __SOFTFP__
+      break;
+
+    case Bytecodes::_d2i:
+#ifdef AARCH64
+      __ fcvtzs_wd(R0_tos, D0_tos);
+#else
+#ifndef __SOFTFP__
+      __ ftosizd(Stemp, D0);  // NOTE(review): uses D0 / R0 rather than the D0_tos / R0_tos aliases used elsewhere -- presumably the same registers
+      __ fmrs(R0, Stemp);
+#else
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2i), R0_tos_lo, R1_tos_hi);
+#endif // !__SOFTFP__
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_d2l:
+#ifdef AARCH64
+      __ fcvtzs_xd(R0_tos, D0_tos);
+#else
+#ifndef __SOFTFP__
+      __ fmrrd(R0_tos_lo, R1_tos_hi, D0_tos);  // the runtime call below takes the double bits in a core register pair
+#endif // !__SOFTFP__
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::d2l), R0_tos_lo, R1_tos_hi);
+#endif // AARCH64
+      break;
+
+    case Bytecodes::_d2f:
+#ifdef __SOFTFP__
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, __aeabi_d2f), R0_tos_lo, R1_tos_hi);
+#else
+      __ convert_d2f(S0_tos, D0_tos);
+#endif // __SOFTFP__
+      break;
+
+    default:
+      ShouldNotReachHere();
+  }
+}
+
+
+void TemplateTable::lcmp() {  // lcmp template: compare two longs, leaving -1 / 0 / +1 in R0_tos
+  transition(ltos, itos);
+#ifdef AARCH64
+  const Register arg1 = R1_tmp;  // first operand, popped from the stack
+  const Register arg2 = R0_tos;  // second operand, already in the tos register
+
+  __ pop_l(arg1);
+
+  __ cmp(arg1, arg2);
+  __ cset(R0_tos, gt);               // 1 if '>', else 0
+  __ csinv(R0_tos, R0_tos, ZR, ge);  // previous value if '>=', else -1
+#else
+  const Register arg1_lo = R2_tmp;
+  const Register arg1_hi = R3_tmp;
+  const Register arg2_lo = R0_tos_lo;
+  const Register arg2_hi = R1_tos_hi;
+  const Register res = R4_tmp;
+
+  __ pop_l(arg1_lo, arg1_hi);
+
+  // long compare arg1 with arg2
+  // result is -1/0/+1 if '<'/'='/'>'
+  Label done;
+
+  __ mov (res, 0);
+  __ cmp (arg1_hi, arg2_hi);  // high words first, using the signed conditions lt / gt
+  __ mvn (res, 0, lt);        // mvn of 0 yields -1
+  __ mov (res, 1, gt);
+  __ b(done, ne);             // high words differ: the result is already decided
+  __ cmp (arg1_lo, arg2_lo);  // high words equal: low words decide, using the unsigned conditions lo / hi
+  __ mvn (res, 0, lo);
+  __ mov (res, 1, hi);
+  __ bind(done);
+  __ mov (R0_tos, res);
+#endif // AARCH64
+}
+
+
+void TemplateTable::float_cmp(bool is_float, int unordered_result) {
+  assert((unordered_result == 1) || (unordered_result == -1), "invalid unordered result");
+  // Compares the popped value with tos (float if is_float, else double); unordered_result is the result for unordered operands.
+#ifdef AARCH64
+  if (is_float) {
+    transition(ftos, itos);
+    __ pop_f(S1_tmp);
+    __ fcmp_s(S1_tmp, S0_tos);
+  } else {
+    transition(dtos, itos);
+    __ pop_d(D1_tmp);
+    __ fcmp_d(D1_tmp, D0_tos);
+  }
+  // turn the condition flags into -1/0/+1 in R0_tos
+  if (unordered_result < 0) {
+    __ cset(R0_tos, gt);               // 1 if '>', else 0
+    __ csinv(R0_tos, R0_tos, ZR, ge);  // previous value if '>=', else -1
+  } else {
+    __ cset(R0_tos, hi);               // 1 if '>' or unordered, else 0
+    __ csinv(R0_tos, R0_tos, ZR, pl);  // previous value if '>=' or unordered, else -1
+  }
+
+#else
+
+#ifdef __SOFTFP__
+  // soft-float: operands travel in core registers and the comparison is done by a SharedRuntime leaf call
+  if (is_float) {
+    transition(ftos, itos);
+    const Register Rx = R0;
+    const Register Ry = R1;
+
+    __ mov(Ry, R0_tos);                // second operand: current tos
+    __ pop_i(Rx);                      // first operand: popped as a raw 32-bit value
+
+    if (unordered_result == 1) {
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::fcmpg), Rx, Ry);
+    } else {
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::fcmpl), Rx, Ry);
+    }
+
+  } else {
+
+    transition(dtos, itos);
+    const Register Rx_lo = R0;
+    const Register Rx_hi = R1;
+    const Register Ry_lo = R2;
+    const Register Ry_hi = R3;
+
+    __ mov(Ry_lo, R0_tos_lo);          // second operand: current tos pair
+    __ mov(Ry_hi, R1_tos_hi);
+    __ pop_l(Rx_lo, Rx_hi);            // first operand: popped as a raw 64-bit value
+
+    if (unordered_result == 1) {
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dcmpg), Rx_lo, Rx_hi, Ry_lo, Ry_hi);
+    } else {
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dcmpl), Rx_lo, Rx_hi, Ry_lo, Ry_hi);
+    }
+  }
+
+#else
+  // hardware floating point: compare in FP registers, then derive the result from the flags
+  if (is_float) {
+    transition(ftos, itos);
+    __ pop_f(S1_tmp);
+    __ fcmps(S1_tmp, S0_tos);
+  } else {
+    transition(dtos, itos);
+    __ pop_d(D1_tmp);
+    __ fcmpd(D1_tmp, D0_tos);
+  }
+
+  __ fmstat();                   // makes the comparison flags available to the conditional moves below
+
+  // comparison result | flag N | flag Z | flag C | flag V
+  // "<"               |   1    |   0    |   0    |   0
+  // "=="              |   0    |   1    |   1    |   0
+  // ">"               |   0    |   0    |   1    |   0
+  // unordered         |   0    |   0    |   1    |   1
+
+  if (unordered_result < 0) {
+    __ mov(R0_tos, 1);           // result ==  1 if greater
+    __ mvn(R0_tos, 0, lt);       // result == -1 if less or unordered (N!=V)
+  } else {
+    __ mov(R0_tos, 1);           // result ==  1 if greater or unordered
+    __ mvn(R0_tos, 0, mi);       // result == -1 if less (N=1)
+  }
+  __ mov(R0_tos, 0, eq);         // result ==  0 if equal (Z=1)
+#endif // __SOFTFP__
+#endif // AARCH64
+}
+
+
+void TemplateTable::branch(bool is_jsr, bool is_wide) {
+  // Emits the taken-branch path: is_jsr selects the jsr flavour, is_wide a 4-byte (instead of 2-byte) displacement.
+  const Register Rdisp = R0_tmp;
+  const Register Rbumped_taken_count = R5_tmp;
+
+  __ profile_taken_branch(R0_tmp, Rbumped_taken_count); // R0 holds updated MDP, Rbumped_taken_count holds bumped taken count
+
+  const ByteSize be_offset = MethodCounters::backedge_counter_offset() +
+                             InvocationCounter::counter_offset();
+  const ByteSize inv_offset = MethodCounters::invocation_counter_offset() +
+                              InvocationCounter::counter_offset();
+  const int method_offset = frame::interpreter_frame_method_offset * wordSize; // NOTE(review): not referenced again in this function
+
+  // Load up R0 with the branch displacement
+  if (is_wide) {
+    __ ldrsb(R0_tmp, at_bcp(1));      // first byte is loaded sign-extended
+    __ ldrb(R1_tmp, at_bcp(2));
+    __ ldrb(R2_tmp, at_bcp(3));
+    __ ldrb(R3_tmp, at_bcp(4));
+    __ orr(R0_tmp, R1_tmp, AsmOperand(R0_tmp, lsl, BitsPerByte));   // shift in one more byte per step
+    __ orr(R0_tmp, R2_tmp, AsmOperand(R0_tmp, lsl, BitsPerByte));
+    __ orr(Rdisp, R3_tmp, AsmOperand(R0_tmp, lsl, BitsPerByte));
+  } else {
+    __ ldrsb(R0_tmp, at_bcp(1));      // first byte is loaded sign-extended
+    __ ldrb(R1_tmp, at_bcp(2));
+    __ orr(Rdisp, R1_tmp, AsmOperand(R0_tmp, lsl, BitsPerByte));
+  }
+
+  // Handle all the JSR stuff here, then exit.
+  // It's much shorter and cleaner than intermingling with the
+  // non-JSR normal-branch stuff occurring below.
+  if (is_jsr) {
+    // compute return address as bci in R1
+    const Register Rret_addr = R1_tmp;
+    assert_different_registers(Rdisp, Rret_addr, Rtemp);
+
+    __ ldr(Rtemp, Address(Rmethod, Method::const_offset()));
+    __ sub(Rret_addr, Rbcp, - (is_wide ? 5 : 3) + in_bytes(ConstMethod::codes_offset()));  // Rbcp + (5 or 3) - codes_offset
+    __ sub(Rret_addr, Rret_addr, Rtemp);                                                    // ... minus the value loaded from Method::const_offset()
+
+    // Load the next target bytecode into R3_bytecode and advance Rbcp
+#ifdef AARCH64
+    __ add(Rbcp, Rbcp, Rdisp);
+    __ ldrb(R3_bytecode, Address(Rbcp));
+#else
+    __ ldrb(R3_bytecode, Address(Rbcp, Rdisp, lsl, 0, pre_indexed));
+#endif // AARCH64
+
+    // Push return address
+    __ push_i(Rret_addr);
+    // jsr returns vtos
+    __ dispatch_only_noverify(vtos);
+    return;
+  }
+
+  // Normal (non-jsr) branch handling
+
+  // Adjust the bcp by the displacement in Rdisp and load next bytecode.
+#ifdef AARCH64
+  __ add(Rbcp, Rbcp, Rdisp);
+  __ ldrb(R3_bytecode, Address(Rbcp));
+#else
+  __ ldrb(R3_bytecode, Address(Rbcp, Rdisp, lsl, 0, pre_indexed));
+#endif // AARCH64
+
+  assert(UseLoopCounter || !UseOnStackReplacement, "on-stack-replacement requires loop counters");
+  Label backedge_counter_overflow;
+  Label profile_method;
+  Label dispatch;
+
+  if (UseLoopCounter) {
+    // increment backedge counter for backward branches
+    // Rdisp (R0): target offset
+
+    const Register Rcnt = R2_tmp;
+    const Register Rcounters = R1_tmp;
+
+    // count only if backward branch
+#ifdef AARCH64
+    __ tbz(Rdisp, (BitsPerWord - 1), dispatch); // TODO-AARCH64: check performance of this variant on 32-bit ARM
+#else
+    __ tst(Rdisp, Rdisp);
+    __ b(dispatch, pl);
+#endif // AARCH64
+
+    if (TieredCompilation) {
+      Label no_mdo;
+      int increment = InvocationCounter::count_increment;
+      if (ProfileInterpreter) {
+        // Are we profiling?
+        __ ldr(Rtemp, Address(Rmethod, Method::method_data_offset()));
+        __ cbz(Rtemp, no_mdo);
+        // Increment the MDO backedge counter
+        const Address mdo_backedge_counter(Rtemp, in_bytes(MethodData::backedge_counter_offset()) +
+                                                  in_bytes(InvocationCounter::counter_offset()));
+        const Address mask(Rtemp, in_bytes(MethodData::backedge_mask_offset()));
+        __ increment_mask_and_jump(mdo_backedge_counter, increment, mask,
+                                   Rcnt, R4_tmp, eq, &backedge_counter_overflow);
+        __ b(dispatch);
+      }
+      __ bind(no_mdo);
+      // Increment backedge counter in MethodCounters*
+      __ get_method_counters(Rmethod, Rcounters, dispatch);
+      const Address mask(Rcounters, in_bytes(MethodCounters::backedge_mask_offset()));
+      __ increment_mask_and_jump(Address(Rcounters, be_offset), increment, mask,
+                                 Rcnt, R4_tmp, eq, &backedge_counter_overflow);
+    } else {
+      // increment counter
+      __ get_method_counters(Rmethod, Rcounters, dispatch);
+      __ ldr_u32(Rtemp, Address(Rcounters, be_offset));           // load backedge counter
+      __ add(Rtemp, Rtemp, InvocationCounter::count_increment);   // increment counter
+      __ str_32(Rtemp, Address(Rcounters, be_offset));            // store counter
+
+      __ ldr_u32(Rcnt, Address(Rcounters, inv_offset));           // load invocation counter
+#ifdef AARCH64
+      __ andr(Rcnt, Rcnt, (unsigned int)InvocationCounter::count_mask_value);  // and the status bits
+#else
+      __ bic(Rcnt, Rcnt, ~InvocationCounter::count_mask_value);  // and the status bits
+#endif // AARCH64
+      __ add(Rcnt, Rcnt, Rtemp);                                 // add both counters
+
+      if (ProfileInterpreter) {
+        // Test to see if we should create a method data oop
+        const Address profile_limit(Rcounters, in_bytes(MethodCounters::interpreter_profile_limit_offset()));
+        __ ldr_s32(Rtemp, profile_limit);
+        __ cmp_32(Rcnt, Rtemp);
+        __ b(dispatch, lt);
+
+        // if no method data exists, go to profile method
+        __ test_method_data_pointer(R4_tmp, profile_method);
+
+        if (UseOnStackReplacement) {
+          // check for overflow against Rbumped_taken_count, which is the MDO taken count
+          const Address backward_branch_limit(Rcounters, in_bytes(MethodCounters::interpreter_backward_branch_limit_offset()));
+          __ ldr_s32(Rtemp, backward_branch_limit);
+          __ cmp(Rbumped_taken_count, Rtemp);
+          __ b(dispatch, lo);
+
+          // When ProfileInterpreter is on, the backedge_count comes from the
+          // MethodData*, which value does not get reset on the call to
+          // frequency_counter_overflow().  To avoid excessive calls to the overflow
+          // routine while the method is being compiled, add a second test to make
+          // sure the overflow function is called only once every overflow_frequency.
+          const int overflow_frequency = 1024;
+
+#ifdef AARCH64
+          __ tst(Rbumped_taken_count, (unsigned)(overflow_frequency-1));
+#else
+          // was '__ andrs(...,overflow_frequency-1)', testing if lowest 10 bits are 0
+          assert(overflow_frequency == (1 << 10),"shift by 22 not correct for expected frequency");
+          __ movs(Rbumped_taken_count, AsmOperand(Rbumped_taken_count, lsl, 22));
+#endif // AARCH64
+
+          __ b(backedge_counter_overflow, eq);
+        }
+      } else {
+        if (UseOnStackReplacement) {
+          // check for overflow against Rcnt, which is the sum of the counters
+          const Address backward_branch_limit(Rcounters, in_bytes(MethodCounters::interpreter_backward_branch_limit_offset()));
+          __ ldr_s32(Rtemp, backward_branch_limit);
+          __ cmp_32(Rcnt, Rtemp);
+          __ b(backedge_counter_overflow, hs);
+
+        }
+      }
+    }
+    __ bind(dispatch);
+  }
+
+  if (!UseOnStackReplacement) {
+    __ bind(backedge_counter_overflow);   // without OSR the overflow label is bound here, right before the dispatch
+  }
+
+  // continue with the bytecode @ target
+  __ dispatch_only(vtos);
+
+  if (UseLoopCounter) {
+    if (ProfileInterpreter) {
+      // Out-of-line code to allocate method data oop.
+      __ bind(profile_method);
+
+      __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method));
+      __ set_method_data_pointer_for_bcp();
+      // reload next bytecode
+      __ ldrb(R3_bytecode, Address(Rbcp));
+      __ b(dispatch);
+    }
+
+    if (UseOnStackReplacement) {
+      // invocation counter overflow
+      __ bind(backedge_counter_overflow);
+
+      __ sub(R1, Rbcp, Rdisp);                   // branch bcp
+      call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::frequency_counter_overflow), R1);
+
+      // R0: osr nmethod (osr ok) or NULL (osr not possible)
+      const Register Rnmethod = R0;
+
+      __ ldrb(R3_bytecode, Address(Rbcp));       // reload next bytecode
+
+      __ cbz(Rnmethod, dispatch);                // test result, no osr if null
+
+      // nmethod may have been invalidated (VM may block upon call_VM return)
+      __ ldrb(R1_tmp, Address(Rnmethod, nmethod::state_offset()));
+      __ cmp(R1_tmp, nmethod::in_use);
+      __ b(dispatch, ne);
+
+      // We have the address of an on stack replacement routine in Rnmethod,
+      // We need to prepare to execute the OSR method. First we must
+      // migrate the locals and monitors off of the stack.
+
+      __ mov(Rtmp_save0, Rnmethod);                      // save the nmethod
+
+      call_VM(noreg, CAST_FROM_FN_PTR(address, SharedRuntime::OSR_migration_begin));
+
+      // R0 is OSR buffer
+
+      __ ldr(R1_tmp, Address(Rtmp_save0, nmethod::osr_entry_point_offset()));                // OSR entry point, jumped to below
+      __ ldr(Rtemp, Address(FP, frame::interpreter_frame_sender_sp_offset * wordSize));      // sender SP saved in the interpreter frame
+
+#ifdef AARCH64
+      __ ldp(FP, LR, Address(FP));
+      __ mov(SP, Rtemp);
+#else
+      __ ldmia(FP, RegisterSet(FP) | RegisterSet(LR));
+      __ bic(SP, Rtemp, StackAlignmentInBytes - 1);     // Remove frame and align stack
+#endif // AARCH64
+
+      __ jump(R1_tmp);
+    }
+  }
+}
+
+
+void TemplateTable::if_0cmp(Condition cc) {
+  transition(itos, vtos);
+  // The taken path is the fall-through (loops use backward branches, so taken is the common case).
+  Label Lnot_taken;
+#ifdef AARCH64
+  switch (cc) {
+    case equal:     __ cbnz_w(R0_tos, Lnot_taken); break;   // skip when the value is non-zero
+    case not_equal: __ cbz_w(R0_tos, Lnot_taken);  break;   // skip when the value is zero
+    default:
+      __ cmp_32(R0_tos, 0);
+      __ b(Lnot_taken, convNegCond(cc));
+      break;
+  }
+#else
+  __ cmp_32(R0_tos, 0);
+  __ b(Lnot_taken, convNegCond(cc));                        // skip when the negated condition holds
+#endif // AARCH64
+  branch(false, false);
+  __ bind(Lnot_taken);
+  __ profile_not_taken_branch(R0_tmp);
+}
+
+
+void TemplateTable::if_icmp(Condition cc) {
+  transition(itos, vtos);
+  const Register Rvalue1 = R1_tmp;      // first int operand, popped below; the second one is in R0_tos
+  Label Lnot_taken;                     // taken path is the fall-through (assumed to be the common case)
+  __ pop_i(Rvalue1);
+  __ cmp_32(Rvalue1, R0_tos);
+  __ b(Lnot_taken, convNegCond(cc));    // skip the branch code when the negated condition holds
+  branch(false, false);
+  __ bind(Lnot_taken);
+  __ profile_not_taken_branch(R0_tmp);
+}
+
+
+void TemplateTable::if_nullcmp(Condition cc) {
+  transition(atos, vtos);
+  assert(cc == equal || cc == not_equal, "invalid condition");
+  const bool taken_on_null = (cc == equal);   // otherwise the branch is taken on a non-null reference
+
+  Label Lnot_taken;   // taken path is the fall-through (loops use backward branches)
+  if (!taken_on_null) {
+    __ cbz(R0_tos, Lnot_taken);
+  } else {
+    __ cbnz(R0_tos, Lnot_taken);
+  }
+  branch(false, false);
+  __ bind(Lnot_taken);
+  __ profile_not_taken_branch(R0_tmp);
+}
+
+
+void TemplateTable::if_acmp(Condition cc) {
+  transition(atos, vtos);
+  const Register Rfirst_ref = R1_tmp;   // first reference, popped below; the second one is in R0_tos
+  Label Lnot_taken;                     // taken path is the fall-through (assumed to be the common case)
+  __ pop_ptr(Rfirst_ref);
+  __ cmp(Rfirst_ref, R0_tos);
+  __ b(Lnot_taken, convNegCond(cc));    // skip the branch code when the negated condition holds
+  branch(false, false);
+  __ bind(Lnot_taken);
+  __ profile_not_taken_branch(R0_tmp);
+}
+
+
+void TemplateTable::ret() {
+  transition(vtos, vtos);
+  const Register Rslot = R1_tmp;
+  const Register Rbci  = Rtmp_save0;  // R4/R19
+  const int codes_base = in_bytes(ConstMethod::codes_offset());
+
+  locals_index(Rslot);
+  __ ldr_s32(Rbci, load_iaddress(Rslot, Rtemp));   // return bci read from the indexed local
+  __ profile_ret(Rtmp_save1, Rbci);
+  __ ldr(Rtemp, Address(Rmethod, Method::const_offset()));
+  __ add(Rtemp, Rtemp, codes_base);
+  __ add(Rbcp, Rtemp, Rbci);                       // return bcp = code base + return bci
+  __ dispatch_next(vtos);
+}
+
+
+void TemplateTable::wide_ret() {
+  transition(vtos, vtos);
+  const Register Rslot = R1_tmp;
+  const Register Rbci  = Rtmp_save0;  // R4/R19
+  const int codes_base = in_bytes(ConstMethod::codes_offset());
+
+  locals_index_wide(Rslot);
+  __ ldr_s32(Rbci, load_iaddress(Rslot, Rtemp));   // return bci read from the indexed local
+  __ profile_ret(Rtmp_save1, Rbci);
+  __ ldr(Rtemp, Address(Rmethod, Method::const_offset()));
+  __ add(Rtemp, Rtemp, codes_base);
+  __ add(Rbcp, Rtemp, Rbci);                       // return bcp = code base + return bci
+  __ dispatch_next(vtos);
+}
+
+
+void TemplateTable::tableswitch() {
+  transition(itos, vtos);
+  // Selects a branch offset by the int index in tos: a table entry if low <= index <= high, else the default offset.
+  const Register Rindex  = R0_tos;
+#ifndef AARCH64
+  const Register Rtemp2  = R1_tmp;  // NOTE(review): Rtemp2 is also used below under AARCH64 - presumably defined elsewhere there; confirm
+#endif // !AARCH64
+  const Register Rabcp   = R2_tmp;  // aligned bcp
+  const Register Rlow    = R3_tmp;
+  const Register Rhigh   = R4_tmp;
+  const Register Roffset = R5_tmp;
+
+  // align bcp
+  __ add(Rtemp, Rbcp, 1 + (2*BytesPerInt-1));
+  __ align_reg(Rabcp, Rtemp, BytesPerInt);
+
+  // load lo & hi
+#ifdef AARCH64
+  __ ldp_w(Rlow, Rhigh, Address(Rabcp, 2*BytesPerInt, post_indexed));
+#else
+  __ ldmia(Rabcp, RegisterSet(Rlow) | RegisterSet(Rhigh), writeback);
+#endif // AARCH64
+  __ byteswap_u32(Rlow, Rtemp, Rtemp2);    // bounds are byte-swapped before being compared
+  __ byteswap_u32(Rhigh, Rtemp, Rtemp2);
+
+  // compare index with high bound
+  __ cmp_32(Rhigh, Rindex);
+
+#ifdef AARCH64
+  Label default_case, do_dispatch;
+  __ ccmp_w(Rindex, Rlow, Assembler::flags_for_condition(lt), ge);
+  __ b(default_case, lt);
+
+  __ sub_w(Rindex, Rindex, Rlow);          // Rindex = index into the offset table
+  __ ldr_s32(Roffset, Address(Rabcp, Rindex, ex_sxtw, LogBytesPerInt));
+  if(ProfileInterpreter) {
+    __ sxtw(Rindex, Rindex);
+    __ profile_switch_case(Rabcp, Rindex, Rtemp2, R0_tmp);
+  }
+  __ b(do_dispatch);
+
+  __ bind(default_case);
+  __ ldr_s32(Roffset, Address(Rabcp, -3 * BytesPerInt));   // default offset sits three ints before the table
+  if(ProfileInterpreter) {
+    __ profile_switch_default(R0_tmp);
+  }
+
+  __ bind(do_dispatch);
+#else
+
+  // if Rindex <= Rhigh then calculate index in table (Rindex - Rlow)
+  __ subs(Rindex, Rindex, Rlow, ge);
+
+  // if Rindex <= Rhigh and (Rindex - Rlow) >= 0
+  // ("ge" status accumulated from cmp and subs instructions) then load
+  // offset from table, otherwise load offset for default case
+
+  if(ProfileInterpreter) {
+    Label default_case, continue_execution;
+
+    __ b(default_case, lt);
+    __ ldr(Roffset, Address(Rabcp, Rindex, lsl, LogBytesPerInt));
+    __ profile_switch_case(Rabcp, Rindex, Rtemp2, R0_tmp);
+    __ b(continue_execution);
+
+    __ bind(default_case);
+    __ profile_switch_default(R0_tmp);
+    __ ldr(Roffset, Address(Rabcp, -3 * BytesPerInt));     // default offset sits three ints before the table
+
+    __ bind(continue_execution);
+  } else {
+    __ ldr(Roffset, Address(Rabcp, -3 * BytesPerInt), lt); // default offset (out of range)
+    __ ldr(Roffset, Address(Rabcp, Rindex, lsl, LogBytesPerInt), ge);  // table entry (in range)
+  }
+#endif // AARCH64
+
+  __ byteswap_u32(Roffset, Rtemp, Rtemp2);
+
+  // load the next bytecode to R3_bytecode and advance Rbcp
+#ifdef AARCH64
+  __ add(Rbcp, Rbcp, Roffset, ex_sxtw);
+  __ ldrb(R3_bytecode, Address(Rbcp));
+#else
+  __ ldrb(R3_bytecode, Address(Rbcp, Roffset, lsl, 0, pre_indexed));
+#endif // AARCH64
+  __ dispatch_only(vtos);
+
+}
+
+
+void TemplateTable::lookupswitch() {
+  transition(itos, itos);
+  __ stop("lookupswitch bytecode should have been rewritten");  // the template only emits a stop with this message
+}
+
+
+void TemplateTable::fast_linearswitch() {
+  transition(itos, vtos);
+  Label loop, found, default_case, continue_execution;
+  // Linear scan over the (match, offset) pairs that follow the default offset and the pair count.
+  const Register Rkey     = R0_tos;
+  const Register Rabcp    = R2_tmp;  // aligned bcp
+  const Register Rdefault = R3_tmp;
+  const Register Rcount   = R4_tmp;
+  const Register Roffset  = R5_tmp;
+
+  // bswap Rkey, so we can avoid bswapping the table entries
+  __ byteswap_u32(Rkey, R1_tmp, Rtemp);
+
+  // align bcp
+  __ add(Rtemp, Rbcp, 1 + (BytesPerInt-1));
+  __ align_reg(Rabcp, Rtemp, BytesPerInt);
+
+  // load default & counter
+#ifdef AARCH64
+  __ ldp_w(Rdefault, Rcount, Address(Rabcp, 2*BytesPerInt, post_indexed));
+#else
+  __ ldmia(Rabcp, RegisterSet(Rdefault) | RegisterSet(Rcount), writeback);
+#endif // AARCH64
+  __ byteswap_u32(Rcount, R1_tmp, Rtemp);
+
+#ifdef AARCH64
+  __ cbz_w(Rcount, default_case);
+#else
+  __ cmp_32(Rcount, 0);
+  __ ldr(Rtemp, Address(Rabcp, 2*BytesPerInt, post_indexed), ne);  // preload the first match value and step to the next pair
+  __ b(default_case, eq);
+#endif // AARCH64
+
+  // table search
+  __ bind(loop);
+#ifdef AARCH64
+  __ ldr_s32(Rtemp, Address(Rabcp, 2*BytesPerInt, post_indexed));
+#endif // AARCH64
+  __ cmp_32(Rtemp, Rkey);
+  __ b(found, eq);
+  __ subs(Rcount, Rcount, 1);
+#ifndef AARCH64
+  __ ldr(Rtemp, Address(Rabcp, 2*BytesPerInt, post_indexed), ne);  // load the next match value only while pairs remain
+#endif // !AARCH64
+  __ b(loop, ne);
+
+  // default case
+  __ bind(default_case);
+  __ profile_switch_default(R0_tmp);
+  __ mov(Roffset, Rdefault);
+  __ b(continue_execution);
+
+  // entry found -> get offset
+  __ bind(found);
+  // Rabcp is already incremented and points to the next entry
+  __ ldr_s32(Roffset, Address(Rabcp, -BytesPerInt));
+  if (ProfileInterpreter) {
+    // Calculate index of the selected case.
+    assert_different_registers(Roffset, Rcount, Rtemp, R0_tmp, R1_tmp, R2_tmp);
+
+    // align bcp
+    __ add(Rtemp, Rbcp, 1 + (BytesPerInt-1));
+    __ align_reg(R2_tmp, Rtemp, BytesPerInt);
+
+    // load number of cases
+    __ ldr_u32(R2_tmp, Address(R2_tmp, BytesPerInt));
+    __ byteswap_u32(R2_tmp, R1_tmp, Rtemp);
+
+    // Selected index = <number of cases> - <current loop count>
+    __ sub(R1_tmp, R2_tmp, Rcount);
+    __ profile_switch_case(R0_tmp, R1_tmp, Rtemp, R1_tmp);
+  }
+
+  // continue execution
+  __ bind(continue_execution);
+  __ byteswap_u32(Roffset, R1_tmp, Rtemp);   // Roffset (default or matched entry) is byte-swapped here, on the common path
+
+  // load the next bytecode to R3_bytecode and advance Rbcp
+#ifdef AARCH64
+  __ add(Rbcp, Rbcp, Roffset, ex_sxtw);
+  __ ldrb(R3_bytecode, Address(Rbcp));
+#else
+  __ ldrb(R3_bytecode, Address(Rbcp, Roffset, lsl, 0, pre_indexed));
+#endif // AARCH64
+  __ dispatch_only(vtos);
+}
+
+
+void TemplateTable::fast_binaryswitch() {
+  transition(itos, vtos);
+  // Implementation using the following core algorithm:
+  //
+  // int binary_search(int key, LookupswitchPair* array, int n) {
+  //   // Binary search according to "Methodik des Programmierens" by
+  //   // Edsger W. Dijkstra and W.H.J. Feijen, Addison Wesley Germany 1985.
+  //   int i = 0;
+  //   int j = n;
+  //   while (i+1 < j) {
+  //     // invariant P: 0 <= i < j <= n and (a[i] <= key < a[j] or Q)
+  //     // with      Q: for all i: 0 <= i < n: key < a[i]
+  //     // where a stands for the array and assuming that the (nonexistent)
+  //     // element a[n] is infinitely big.
+  //     int h = (i + j) >> 1;
+  //     // i < h < j
+  //     if (key < array[h].fast_match()) {
+  //       j = h;
+  //     } else {
+  //       i = h;
+  //     }
+  //   }
+  //   // R: a[i] <= key < a[i+1] or Q
+  //   // (i.e., if key is within array, i is the correct index)
+  //   return i;
+  // }
+
+  // register allocation
+  const Register key    = R0_tos;                // already set (tosca)
+  const Register array  = R1_tmp;
+  const Register i      = R2_tmp;
+  const Register j      = R3_tmp;
+  const Register h      = R4_tmp;
+  const Register val    = R5_tmp;
+  const Register temp1  = Rtemp;
+  const Register temp2  = LR_tmp;
+  const Register offset = R3_tmp;                // same register as j; only used after the search loop
+
+  // set 'array' = aligned bcp + 2 ints
+  __ add(temp1, Rbcp, 1 + (BytesPerInt-1) + 2*BytesPerInt);
+  __ align_reg(array, temp1, BytesPerInt);
+
+  // initialize i & j
+  __ mov(i, 0);                                  // i = 0;
+  __ ldr_s32(j, Address(array, -BytesPerInt));   // j = length(array);
+  // Convert j into native byteordering
+  __ byteswap_u32(j, temp1, temp2);
+
+  // and start
+  Label entry;
+  __ b(entry);
+
+  // binary search loop
+  { Label loop;
+    __ bind(loop);
+    // int h = (i + j) >> 1;
+    __ add(h, i, j);                             // h = i + j;
+    __ logical_shift_right(h, h, 1);             // h = (i + j) >> 1;
+    // if (key < array[h].fast_match()) {
+    //   j = h;
+    // } else {
+    //   i = h;
+    // }
+#ifdef AARCH64
+    __ add(temp1, array, AsmOperand(h, lsl, 1+LogBytesPerInt));
+    __ ldr_s32(val, Address(temp1));
+#else
+    __ ldr_s32(val, Address(array, h, lsl, 1+LogBytesPerInt));
+#endif // AARCH64
+    // Convert array[h].match to native byte-ordering before compare
+    __ byteswap_u32(val, temp1, temp2);
+    __ cmp_32(key, val);
+    __ mov(j, h, lt);   // j = h if (key <  array[h].fast_match())
+    __ mov(i, h, ge);   // i = h if (key >= array[h].fast_match())
+    // while (i+1 < j)
+    __ bind(entry);
+    __ add(temp1, i, 1);                             // i+1
+    __ cmp(temp1, j);                                // i+1 < j
+    __ b(loop, lt);
+  }
+
+  // end of binary search, result index is i (must check again!)
+  Label default_case;
+  // Convert array[i].match to native byte-ordering before compare
+#ifdef AARCH64
+  __ add(temp1, array, AsmOperand(i, lsl, 1+LogBytesPerInt));
+  __ ldr_s32(val, Address(temp1));
+#else
+  __ ldr_s32(val, Address(array, i, lsl, 1+LogBytesPerInt));
+#endif // AARCH64
+  __ byteswap_u32(val, temp1, temp2);
+  __ cmp_32(key, val);
+  __ b(default_case, ne);
+
+  // entry found
+  __ add(temp1, array, AsmOperand(i, lsl, 1+LogBytesPerInt));
+  __ ldr_s32(offset, Address(temp1, 1*BytesPerInt));   // offset is the second int of the selected pair
+  __ profile_switch_case(R0, i, R1, i);
+  __ byteswap_u32(offset, temp1, temp2);
+#ifdef AARCH64
+  __ add(Rbcp, Rbcp, offset, ex_sxtw);
+  __ ldrb(R3_bytecode, Address(Rbcp));
+#else
+  __ ldrb(R3_bytecode, Address(Rbcp, offset, lsl, 0, pre_indexed));
+#endif // AARCH64
+  __ dispatch_only(vtos);
+
+  // default case
+  __ bind(default_case);
+  __ profile_switch_default(R0);
+  __ ldr_s32(offset, Address(array, -2*BytesPerInt));  // default offset sits two ints before 'array'
+  __ byteswap_u32(offset, temp1, temp2);
+#ifdef AARCH64
+  __ add(Rbcp, Rbcp, offset, ex_sxtw);
+  __ ldrb(R3_bytecode, Address(Rbcp));
+#else
+  __ ldrb(R3_bytecode, Address(Rbcp, offset, lsl, 0, pre_indexed));
+#endif // AARCH64
+  __ dispatch_only(vtos);
+}
+
+
+void TemplateTable::_return(TosState state) {
+  transition(state, state);
+  assert(_desc->calls_vm(), "inconsistent calls_vm information"); // call in remove_activation
+
+  if (_desc->bytecode() == Bytecodes::_return_register_finalizer) {
+    Label Lno_finalizer;
+    assert(state == vtos, "only valid state");
+    const Register Rreceiver = R1;   // object loaded from local slot 0
+    __ ldr(Rreceiver, aaddress(0));
+    __ load_klass(Rtemp, Rreceiver);
+    __ ldr_u32(Rtemp, Address(Rtemp, Klass::access_flags_offset()));
+    __ tbz(Rtemp, exact_log2(JVM_ACC_HAS_FINALIZER), Lno_finalizer);   // flag clear: nothing to register
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::register_finalizer), Rreceiver);
+
+    __ bind(Lno_finalizer);
+  }
+
+  // An itos result may have a type narrower than int. The narrowing has to
+  // happen here, in the return bytecode, and not in generate_return_entry,
+  // because compiled code callers expect an already narrowed result.
+  if (state == itos) {
+    __ narrow(R0_tos);
+  }
+  __ remove_activation(state, LR);
+
+  __ interp_verify_oop(R0_tos, state, __FILE__, __LINE__);
+
+#ifndef AARCH64
+  // The interpreter calling convention returns the result in R0/R1, so
+  // ftos (S0) and dtos (D0) values are moved to R0/R1 here.
+  // The move must come after remove_activation, because that code uses
+  // push(state) & pop(state) to preserve the return value.
+  __ convert_tos_to_retval(state);
+#endif // !AARCH64
+
+  __ ret();
+
+  __ nop(); // to avoid filling CPU pipeline with invalid instructions
+  __ nop();
+}
+
+
+// ----------------------------------------------------------------------------
+// Volatile accesses must become visible to all CPUs in program order.  Most
+// chips have store buffers that let reads and writes reorder; the JMM's
+// ReadAfterWrite.java test fails in -Xint mode without some kind of memory
+// barrier (the interpreter not reordering volatile references is not enough,
+// the hardware must not reorder them either).
+//
+// According to the new Java Memory Model (JMM):
+// (1) All volatiles are serialized with respect to each other.
+// ALSO reads & writes act as acquire & release, so:
+// (2) A volatile read must not let unrelated NON-volatile memory refs that
+// come after the read float up to before it.  Non-volatile memory refs that
+// come before the volatile read may float down below it.
+// (3) Similarly, a volatile write must not let unrelated NON-volatile memory
+// refs that come BEFORE the write float down to after it.  Non-volatile
+// memory refs that come after the volatile write may float up before it.
+//
+// Barriers are only emitted around volatile refs (they are expensive), not
+// _between_ memory refs (that would require tracking the flavor of the
+// previous memory refs).  Requirements (2) and (3) need some barriers before
+// volatile stores and after volatile loads.  These nearly cover requirement
+// (1) but miss the volatile-store-volatile-load case.  That final case is
+// handled after volatile stores, although it could just as well go before
+// volatile loads.
+//
+// TODO-AARCH64: consider removing extra unused parameters
+void TemplateTable::volatile_barrier(MacroAssembler::Membar_mask_bits order_constraint,
+                                     Register tmp,
+                                     bool preserve_flags,
+                                     Register load_tgt) {
+#ifndef AARCH64
+  __ membar(order_constraint, tmp, preserve_flags, load_tgt);
+#else
+  __ membar(order_constraint);   // only the ordering constraint is passed on in this configuration
+#endif
+}
+
+// Clobbers all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64) as well as Rtemp and LR.
+void TemplateTable::resolve_cache_and_index(int byte_no,
+                                            Register Rcache,
+                                            Register Rindex,
+                                            size_t index_size) {
+  assert_different_registers(Rcache, Rindex, Rtemp);
+
+  // The _nofast_ field bytecodes are checked and resolved as their plain counterparts.
+  Bytecodes::Code expected = bytecode();
+  if (expected == Bytecodes::_nofast_getfield) {
+    expected = Bytecodes::_getfield;
+  } else if (expected == Bytecodes::_nofast_putfield) {
+    expected = Bytecodes::_putfield;
+  }
+  assert(byte_no == f1_byte || byte_no == f2_byte, "byte_no out of range");
+  Label Lresolved;
+  __ get_cache_and_index_and_bytecode_at_bcp(Rcache, Rindex, Rtemp, byte_no, 1, index_size);
+  __ cmp(Rtemp, expected);  // does the cache entry already carry this bytecode?
+  __ b(Lresolved, eq);
+
+  // Not resolved yet: let the runtime resolve the entry.
+  __ mov(R1, expected);
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_from_cache), R1);
+  // Reload cache and index after the call
+  __ get_cache_and_index_at_bcp(Rcache, Rindex, 1, index_size);
+  __ bind(Lresolved);
+}
+
+
+// Rcache and Rindex have to be loaded by the caller before this is invoked.
+void TemplateTable::load_field_cp_cache_entry(Register Rcache,
+                                              Register Rindex,
+                                              Register Roffset,
+                                              Register Rflags,
+                                              Register Robj,
+                                              bool is_static = false) {
+
+  assert_different_registers(Rcache, Rindex, Rtemp);
+  assert_different_registers(Roffset, Rflags, Robj, Rtemp);
+
+  const ByteSize entries_base = ConstantPoolCache::base_offset();
+  const ByteSize f2_off       = entries_base + ConstantPoolCacheEntry::f2_offset();
+  const ByteSize flags_off    = entries_base + ConstantPoolCacheEntry::flags_offset();
+
+  // Rtemp = Rcache + (Rindex << LogBytesPerWord)
+  __ add(Rtemp, Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord));
+
+  __ ldr(Roffset, Address(Rtemp, f2_off));         // field offset
+  __ ldr_u32(Rflags, Address(Rtemp, flags_off));   // flags
+
+  if (!is_static) {
+    return;
+  }
+
+  // Static access: load the entry's f1 word, then the word at Klass::java_mirror_offset() from it.
+  const ByteSize f1_off = entries_base + ConstantPoolCacheEntry::f1_offset();
+  __ ldr(Robj, Address(Rtemp, f1_off));
+  __ ldr(Robj, Address(Robj, in_bytes(Klass::java_mirror_offset())));
+}
+
+
+// Clobbers all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64) as well as Rtemp and LR.
+void TemplateTable::load_invoke_cp_cache_entry(int byte_no,
+                                               Register method,
+                                               Register itable_index,
+                                               Register flags,
+                                               bool is_invokevirtual,
+                                               bool is_invokevfinal/*unused*/,
+                                               bool is_invokedynamic) {
+  // Scratch registers for the cache lookup.
+  const Register Rcp_cache = R2_tmp;
+  const Register Rcp_index = R3_tmp;
+  const Register Rentry    = Rtemp;
+  assert_different_registers(Rcp_cache, Rcp_index, Rentry);
+  assert_different_registers(method, itable_index, Rentry);
+
+  // Offsets of the entry fields read below; the method comes from f2 for f2_byte, otherwise from f1.
+  assert(is_invokevirtual == (byte_no == f2_byte), "is_invokevirtual flag redundant");
+  const ByteSize entries_base = ConstantPoolCache::base_offset();
+  const ByteSize method_field = (byte_no == f2_byte) ? ConstantPoolCacheEntry::f2_offset() : ConstantPoolCacheEntry::f1_offset();
+  const int method_offset = in_bytes(entries_base + method_field);
+  const int flags_offset  = in_bytes(entries_base + ConstantPoolCacheEntry::flags_offset());
+  const int index_offset  = in_bytes(entries_base + ConstantPoolCacheEntry::f2_offset());
+
+  // invokedynamic uses a u4 index operand, everything else a u2 one.
+  size_t index_size;
+  if (is_invokedynamic) {
+    index_size = sizeof(u4);
+  } else {
+    index_size = sizeof(u2);
+  }
+  resolve_cache_and_index(byte_no, Rcp_cache, Rcp_index, index_size);
+
+  // Rentry = Rcp_cache + (Rcp_index << LogBytesPerWord)
+  __ add(Rentry, Rcp_cache, AsmOperand(Rcp_index, lsl, LogBytesPerWord));
+  __ ldr(method, Address(Rentry, method_offset));
+  if (itable_index != noreg) {
+    __ ldr(itable_index, Address(Rentry, index_offset));
+  }
+  __ ldr_u32(flags, Address(Rentry, flags_offset));
+}
+
+
+// The registers cache and index are expected to be set before the call, and should not be Rtemp.
+// Blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR,
+// except cache and index registers which are preserved (they are reloaded after the VM call).
+void TemplateTable::jvmti_post_field_access(Register Rcache,
+                                            Register Rindex,
+                                            bool is_static,
+                                            bool has_tos) {  // has_tos is not read anywhere in this body
+  assert_different_registers(Rcache, Rindex, Rtemp);
+
+  if (__ can_post_field_access()) {
+    // Check to see if a field access watch has been set before we take
+    // the time to call into the VM.
+
+    Label Lcontinue;
+
+    __ ldr_global_s32(Rtemp, (address)JvmtiExport::get_field_access_count_addr());
+    __ cbz(Rtemp, Lcontinue);  // count is zero: skip the VM call
+
+    // cache entry pointer
+    __ add(R2, Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord));
+    __ add(R2, R2, in_bytes(ConstantPoolCache::base_offset()));
+    if (is_static) {
+      __ mov(R1, 0);        // NULL object reference
+    } else {
+      __ pop(atos);         // Get the object
+      __ mov(R1, R0_tos);
+      __ verify_oop(R1);
+      __ push(atos);        // Restore stack state
+    }
+    // R1: object pointer or NULL
+    // R2: cache entry pointer
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_field_access),
+               R1, R2);
+    __ get_cache_and_index_at_bcp(Rcache, Rindex, 1);  // re-establish cache and index for the caller
+
+    __ bind(Lcontinue);
+  }
+}
+
+
+void TemplateTable::pop_and_check_object(Register r) {  // pops the receiver into r, then null-checks and verifies it
+  __ pop_ptr(r);
+  __ null_check(r, Rtemp);  // for field access must check obj. Rtemp is passed as the scratch register.
+  __ verify_oop(r);
+}
+
+
+void TemplateTable::getfield_or_static(int byte_no, bool is_static, RewriteControl rc) {
+  transition(vtos, vtos);
+  // Shared generator for getfield (is_static == false) and getstatic (is_static == true).
+  const Register Roffset  = R2_tmp;
+  const Register Robj     = R3_tmp;
+  const Register Rcache   = R4_tmp;
+  const Register Rflagsav = Rtmp_save0;  // R4/R19
+  const Register Rindex   = R5_tmp;
+  const Register Rflags   = R5_tmp;      // shares R5_tmp with Rindex
+
+  const bool gen_volatile_check = os::is_MP();
+
+  resolve_cache_and_index(byte_no, Rcache, Rindex, sizeof(u2));
+  jvmti_post_field_access(Rcache, Rindex, is_static, false);
+  load_field_cp_cache_entry(Rcache, Rindex, Roffset, Rflags, Robj, is_static);
+
+  if (gen_volatile_check) {
+    __ mov(Rflagsav, Rflags);  // keep the unshifted flags for the volatile test after Done
+  }
+
+  if (!is_static) pop_and_check_object(Robj);
+
+  Label Done, Lint, Ltable, shouldNotReachHere;
+  Label Lbtos, Lztos, Lctos, Lstos, Litos, Lltos, Lftos, Ldtos, Latos;
+
+  // compute type
+  __ logical_shift_right(Rflags, Rflags, ConstantPoolCacheEntry::tos_state_shift);
+  // Make sure we don't need to mask flags after the above shift
+  ConstantPoolCacheEntry::verify_tos_state_shift();
+
+  // There are actually two versions of implementation of getfield/getstatic:
+  //
+  // 32-bit ARM:
+  // 1) Table switch using add(PC,...) instruction (fast_version)
+  // 2) Table switch using ldr(PC,...) instruction
+  //
+  // AArch64:
+  // 1) Table switch using adr/add/br instructions (fast_version)
+  // 2) Table switch using adr/ldr/br instructions
+  //
+  // First version requires fixed size of code block for each case and
+  // can not be used in VerifyOops and VerifyInterpreterStackTop modes, nor for
+  // non-static accesses when RewriteBytecodes is enabled.
+
+  // Size of fixed size code block for fast_version (in instructions)
+  const int log_max_block_size = 2;
+  const int max_block_size = 1 << log_max_block_size;
+
+  // Decide if fast version is enabled
+  bool fast_version = (is_static || !RewriteBytecodes) && !VerifyOops && !VerifyInterpreterStackTop;
+
+  // On 32-bit ARM atos and itos cases can be merged only for fast version, because
+  // atos requires additional processing in slow version.
+  // On AArch64 atos and itos cannot be merged.
+  bool atos_merged_with_itos = AARCH64_ONLY(false) NOT_AARCH64(fast_version);
+
+  assert(number_of_states == 10, "number of tos states should be equal to 10");
+
+  __ cmp(Rflags, itos);
+#ifdef AARCH64
+  __ b(Lint, eq);
+
+  if (fast_version) {
+    __ adr(Rtemp, Lbtos);
+    __ add(Rtemp, Rtemp, AsmOperand(Rflags, lsl, log_max_block_size + Assembler::LogInstructionSize));
+    __ br(Rtemp);
+  } else {
+    __ adr(Rtemp, Ltable);
+    __ ldr(Rtemp, Address::indexed_ptr(Rtemp, Rflags));
+    __ br(Rtemp);
+  }
+#else
+  if (atos_merged_with_itos) {
+    __ cmp(Rflags, atos, ne);  // conditional compare: eq now means itos or atos
+  }
+
+  // table switch by type (taken only when the compare above left 'ne')
+  if (fast_version) {
+    __ add(PC, PC, AsmOperand(Rflags, lsl, log_max_block_size + Assembler::LogInstructionSize), ne);
+  } else {
+    __ ldr(PC, Address(PC, Rflags, lsl, LogBytesPerWord), ne);
+  }
+
+  // jump to itos/atos case
+  __ b(Lint);
+#endif // AARCH64
+
+  // table with addresses for slow version
+  if (fast_version) {
+    // nothing to do
+  } else  {
+    AARCH64_ONLY(__ align(wordSize));
+    __ bind(Ltable);
+    __ emit_address(Lbtos);
+    __ emit_address(Lztos);
+    __ emit_address(Lctos);
+    __ emit_address(Lstos);
+    __ emit_address(Litos);
+    __ emit_address(Lltos);
+    __ emit_address(Lftos);
+    __ emit_address(Ldtos);
+    __ emit_address(Latos);
+  }
+
+#ifdef ASSERT
+  int seq = 0;  // checks that the case blocks below are emitted in tos state order
+#endif
+  // btos
+  {
+    assert(btos == seq++, "btos has unexpected value");
+    FixedSizeCodeBlock btos_block(_masm, max_block_size, fast_version);
+    __ bind(Lbtos);
+    __ ldrsb(R0_tos, Address(Robj, Roffset));
+    __ push(btos);
+    // Rewrite bytecode to be faster
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_bgetfield, R0_tmp, Rtemp);
+    }
+    __ b(Done);
+  }
+
+  // ztos (same as btos for getfield)
+  {
+    assert(ztos == seq++, "ztos has unexpected value");
+    FixedSizeCodeBlock ztos_block(_masm, max_block_size, fast_version);
+    __ bind(Lztos);
+    __ ldrsb(R0_tos, Address(Robj, Roffset));
+    __ push(ztos);
+    // Rewrite bytecode to be faster (use btos fast getfield)
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_bgetfield, R0_tmp, Rtemp);
+    }
+    __ b(Done);
+  }
+
+  // ctos
+  {
+    assert(ctos == seq++, "ctos has unexpected value");
+    FixedSizeCodeBlock ctos_block(_masm, max_block_size, fast_version);
+    __ bind(Lctos);
+    __ ldrh(R0_tos, Address(Robj, Roffset));
+    __ push(ctos);
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_cgetfield, R0_tmp, Rtemp);
+    }
+    __ b(Done);
+  }
+
+  // stos
+  {
+    assert(stos == seq++, "stos has unexpected value");
+    FixedSizeCodeBlock stos_block(_masm, max_block_size, fast_version);
+    __ bind(Lstos);
+    __ ldrsh(R0_tos, Address(Robj, Roffset));
+    __ push(stos);
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_sgetfield, R0_tmp, Rtemp);
+    }
+    __ b(Done);
+  }
+
+  // itos (placeholder slot only: itos is dispatched to Lint before the table switch)
+  {
+    assert(itos == seq++, "itos has unexpected value");
+    FixedSizeCodeBlock itos_block(_masm, max_block_size, fast_version);
+    __ bind(Litos);
+    __ b(shouldNotReachHere);
+  }
+
+  // ltos
+  {
+    assert(ltos == seq++, "ltos has unexpected value");
+    FixedSizeCodeBlock ltos_block(_masm, max_block_size, fast_version);
+    __ bind(Lltos);
+#ifdef AARCH64
+    __ ldr(R0_tos, Address(Robj, Roffset));
+#else
+    __ add(Roffset, Robj, Roffset);
+    __ ldmia(Roffset, RegisterSet(R0_tos_lo, R1_tos_hi));
+#endif // AARCH64
+    __ push(ltos);
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_lgetfield, R0_tmp, Rtemp);
+    }
+    __ b(Done);
+  }
+
+  // ftos
+  {
+    assert(ftos == seq++, "ftos has unexpected value");
+    FixedSizeCodeBlock ftos_block(_masm, max_block_size, fast_version);
+    __ bind(Lftos);
+    // floats and ints are placed on stack in same way, so
+    // we can use push(itos) to transfer value without using VFP
+    __ ldr_u32(R0_tos, Address(Robj, Roffset));
+    __ push(itos);
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_fgetfield, R0_tmp, Rtemp);
+    }
+    __ b(Done);
+  }
+
+  // dtos
+  {
+    assert(dtos == seq++, "dtos has unexpected value");
+    FixedSizeCodeBlock dtos_block(_masm, max_block_size, fast_version);
+    __ bind(Ldtos);
+    // doubles and longs are placed on stack in the same way, so
+    // we can use push(ltos) to transfer value without using VFP
+#ifdef AARCH64
+    __ ldr(R0_tos, Address(Robj, Roffset));
+#else
+    __ add(Rtemp, Robj, Roffset);
+    __ ldmia(Rtemp, RegisterSet(R0_tos_lo, R1_tos_hi));
+#endif // AARCH64
+    __ push(ltos);
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_dgetfield, R0_tmp, Rtemp);
+    }
+    __ b(Done);
+  }
+
+  // atos
+  {
+    assert(atos == seq++, "atos has unexpected value");
+
+    // atos case for AArch64 and slow version on 32-bit ARM
+    if (!atos_merged_with_itos) {
+      __ bind(Latos);
+      __ load_heap_oop(R0_tos, Address(Robj, Roffset));
+      __ push(atos);
+      // Rewrite bytecode to be faster
+      if (!is_static && rc == may_rewrite) {
+        patch_bytecode(Bytecodes::_fast_agetfield, R0_tmp, Rtemp);
+      }
+      __ b(Done);
+    }
+  }
+
+  assert(vtos == seq++, "vtos has unexpected value");
+
+  __ bind(shouldNotReachHere);
+  __ should_not_reach_here();
+
+  // itos and atos cases are frequent so it makes sense to move them out of table switch
+  // atos case can be merged with itos case (and thus moved out of table switch) on 32-bit ARM, fast version only
+
+  __ bind(Lint);
+  __ ldr_s32(R0_tos, Address(Robj, Roffset));
+  __ push(itos);
+  // Rewrite bytecode to be faster
+  if (!is_static && rc == may_rewrite) {
+    patch_bytecode(Bytecodes::_fast_igetfield, R0_tmp, Rtemp);
+  }
+
+  __ bind(Done);
+
+  if (gen_volatile_check) {
+    // Check for volatile field
+    Label notVolatile;
+    __ tbz(Rflagsav, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+
+    volatile_barrier(MacroAssembler::Membar_mask_bits(MacroAssembler::LoadLoad | MacroAssembler::LoadStore), Rtemp);
+
+    __ bind(notVolatile);
+  }
+
+}
+
+void TemplateTable::getfield(int byte_no) {
+  getfield_or_static(byte_no, false);  // is_static = false; rc is left at its default
+}
+
+void TemplateTable::nofast_getfield(int byte_no) {
+  getfield_or_static(byte_no, false, may_not_rewrite);  // instance field; never patched to a _fast_ bytecode
+}
+
+void TemplateTable::getstatic(int byte_no) {
+  getfield_or_static(byte_no, true);  // is_static = true; rc is left at its default
+}
+
+
+// The registers cache and index are expected to be set before the call, and should not be R1 or Rtemp.
+// Blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR,
+// except cache and index registers which are preserved (they are reloaded after the VM call).
+void TemplateTable::jvmti_post_field_mod(Register Rcache, Register Rindex, bool is_static) {
+  ByteSize cp_base_offset = ConstantPoolCache::base_offset();
+  assert_different_registers(Rcache, Rindex, R1, Rtemp);
+
+  if (__ can_post_field_modification()) {
+    // Check to see if a field modification watch has been set before we take
+    // the time to call into the VM.
+    Label Lcontinue;
+
+    __ ldr_global_s32(Rtemp, (address)JvmtiExport::get_field_modification_count_addr());
+    __ cbz(Rtemp, Lcontinue);  // count is zero: skip the VM call
+
+    if (is_static) {
+      // Life is simple.  Null out the object pointer.
+      __ mov(R1, 0);
+    } else {
+      // Life is harder. The stack holds the value on top, followed by the object.
+      // We don't know the size of the value, though; it could be one or two words
+      // depending on its type. As a result, we must find the type to determine where
+      // the object is.
+
+      __ add(Rtemp, Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord));
+      __ ldr_u32(Rtemp, Address(Rtemp, cp_base_offset + ConstantPoolCacheEntry::flags_offset()));
+
+      __ logical_shift_right(Rtemp, Rtemp, ConstantPoolCacheEntry::tos_state_shift);
+      // Make sure we don't need to mask Rtemp after the above shift
+      ConstantPoolCacheEntry::verify_tos_state_shift();
+
+      __ cmp(Rtemp, ltos);
+      __ cond_cmp(Rtemp, dtos, ne);  // eq afterwards means ltos or dtos
+#ifdef AARCH64
+      __ mov(Rtemp, Interpreter::expr_offset_in_bytes(2));
+      __ mov(R1, Interpreter::expr_offset_in_bytes(1));
+      __ mov(R1, Rtemp, eq);  // two word value: object is at expression slot 2
+      __ ldr(R1, Address(Rstack_top, R1));
+#else
+      // two word value (ltos/dtos)
+      __ ldr(R1, Address(SP, Interpreter::expr_offset_in_bytes(2)), eq);
+
+      // one word value (not ltos, dtos)
+      __ ldr(R1, Address(SP, Interpreter::expr_offset_in_bytes(1)), ne);
+#endif // AARCH64
+    }
+
+    // cache entry pointer
+    __ add(R2, Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord));
+    __ add(R2, R2, in_bytes(cp_base_offset));
+
+    // address of the value on top of the expression stack
+    __ mov(R3, Rstack_top);
+
+    // R1: object pointer set up above (NULL if static)
+    // R2: cache entry pointer
+    // R3: address of the value on the stack
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_field_modification),
+               R1, R2, R3);
+    __ get_cache_and_index_at_bcp(Rcache, Rindex, 1);  // re-establish cache and index for the caller
+
+    __ bind(Lcontinue);
+  }
+}
+
+
+void TemplateTable::putfield_or_static(int byte_no, bool is_static, RewriteControl rc) {
+  transition(vtos, vtos);
+  // Shared generator for putfield (is_static == false) and putstatic (is_static == true).
+  const Register Roffset  = R2_tmp;
+  const Register Robj     = R3_tmp;
+  const Register Rcache   = R4_tmp;
+  const Register Rflagsav = Rtmp_save0;  // R4/R19
+  const Register Rindex   = R5_tmp;
+  const Register Rflags   = R5_tmp;      // shares R5_tmp with Rindex
+
+  const bool gen_volatile_check = os::is_MP();
+
+  resolve_cache_and_index(byte_no, Rcache, Rindex, sizeof(u2));
+  jvmti_post_field_mod(Rcache, Rindex, is_static);
+  load_field_cp_cache_entry(Rcache, Rindex, Roffset, Rflags, Robj, is_static);
+
+  if (gen_volatile_check) {
+    // Check for volatile field
+    Label notVolatile;
+    __ mov(Rflagsav, Rflags);  // keep the unshifted flags for the barrier checks after Done
+    __ tbz(Rflagsav, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+
+    volatile_barrier(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore | MacroAssembler::LoadStore), Rtemp);
+
+    __ bind(notVolatile);
+  }
+
+  Label Done, Lint, shouldNotReachHere;
+  Label Ltable, Lbtos, Lztos, Lctos, Lstos, Litos, Lltos, Lftos, Ldtos, Latos;
+
+  // compute type
+  __ logical_shift_right(Rflags, Rflags, ConstantPoolCacheEntry::tos_state_shift);
+  // Make sure we don't need to mask flags after the above shift
+  ConstantPoolCacheEntry::verify_tos_state_shift();
+
+  // There are actually two versions of implementation of putfield/putstatic:
+  //
+  // 32-bit ARM:
+  // 1) Table switch using add(PC,...) instruction (fast_version)
+  // 2) Table switch using ldr(PC,...) instruction
+  //
+  // AArch64:
+  // 1) Table switch using adr/add/br instructions (fast_version)
+  // 2) Table switch using adr/ldr/br instructions
+  //
+  // First version requires fixed size of code block for each case and
+  // can not be used in VerifyOops and ZapHighNonSignificantBits modes, nor for
+  // non-static accesses when RewriteBytecodes is enabled.
+
+  // Size of fixed size code block for fast_version (in instructions)
+  const int log_max_block_size = AARCH64_ONLY(is_static ? 2 : 3) NOT_AARCH64(3);
+  const int max_block_size = 1 << log_max_block_size;
+
+  // Decide if fast version is enabled
+  bool fast_version = (is_static || !RewriteBytecodes) && !VerifyOops && !ZapHighNonSignificantBits;
+
+  assert(number_of_states == 10, "number of tos states should be equal to 10");
+
+  // itos case is frequent and is moved outside table switch
+  __ cmp(Rflags, itos);
+
+#ifdef AARCH64
+  __ b(Lint, eq);
+
+  if (fast_version) {
+    __ adr(Rtemp, Lbtos);
+    __ add(Rtemp, Rtemp, AsmOperand(Rflags, lsl, log_max_block_size + Assembler::LogInstructionSize));
+    __ br(Rtemp);
+  } else {
+    __ adr(Rtemp, Ltable);
+    __ ldr(Rtemp, Address::indexed_ptr(Rtemp, Rflags));
+    __ br(Rtemp);
+  }
+#else
+  // table switch by type (taken only when the compare above left 'ne')
+  if (fast_version) {
+    __ add(PC, PC, AsmOperand(Rflags, lsl, log_max_block_size + Assembler::LogInstructionSize), ne);
+  } else  {
+    __ ldr(PC, Address(PC, Rflags, lsl, LogBytesPerWord), ne);
+  }
+
+  // jump to itos case
+  __ b(Lint);
+#endif // AARCH64
+
+  // table with addresses for slow version
+  if (fast_version) {
+    // nothing to do
+  } else  {
+    AARCH64_ONLY(__ align(wordSize));
+    __ bind(Ltable);
+    __ emit_address(Lbtos);
+    __ emit_address(Lztos);
+    __ emit_address(Lctos);
+    __ emit_address(Lstos);
+    __ emit_address(Litos);
+    __ emit_address(Lltos);
+    __ emit_address(Lftos);
+    __ emit_address(Ldtos);
+    __ emit_address(Latos);
+  }
+
+#ifdef ASSERT
+  int seq = 0;  // checks that the case blocks below are emitted in tos state order
+#endif
+  // btos
+  {
+    assert(btos == seq++, "btos has unexpected value");
+    FixedSizeCodeBlock btos_block(_masm, max_block_size, fast_version);
+    __ bind(Lbtos);
+    __ pop(btos);
+    if (!is_static) pop_and_check_object(Robj);
+    __ strb(R0_tos, Address(Robj, Roffset));
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_bputfield, R0_tmp, Rtemp, true, byte_no);
+    }
+    __ b(Done);
+  }
+
+  // ztos
+  {
+    assert(ztos == seq++, "ztos has unexpected value");
+    FixedSizeCodeBlock ztos_block(_masm, max_block_size, fast_version);
+    __ bind(Lztos);
+    __ pop(ztos);
+    if (!is_static) pop_and_check_object(Robj);
+    __ and_32(R0_tos, R0_tos, 1);  // only the low bit of the value is stored
+    __ strb(R0_tos, Address(Robj, Roffset));
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_zputfield, R0_tmp, Rtemp, true, byte_no);
+    }
+    __ b(Done);
+  }
+
+  // ctos
+  {
+    assert(ctos == seq++, "ctos has unexpected value");
+    FixedSizeCodeBlock ctos_block(_masm, max_block_size, fast_version);
+    __ bind(Lctos);
+    __ pop(ctos);
+    if (!is_static) pop_and_check_object(Robj);
+    __ strh(R0_tos, Address(Robj, Roffset));
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_cputfield, R0_tmp, Rtemp, true, byte_no);
+    }
+    __ b(Done);
+  }
+
+  // stos
+  {
+    assert(stos == seq++, "stos has unexpected value");
+    FixedSizeCodeBlock stos_block(_masm, max_block_size, fast_version);
+    __ bind(Lstos);
+    __ pop(stos);
+    if (!is_static) pop_and_check_object(Robj);
+    __ strh(R0_tos, Address(Robj, Roffset));
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_sputfield, R0_tmp, Rtemp, true, byte_no);
+    }
+    __ b(Done);
+  }
+
+  // itos (placeholder slot only: itos is dispatched to Lint before the table switch)
+  {
+    assert(itos == seq++, "itos has unexpected value");
+    FixedSizeCodeBlock itos_block(_masm, max_block_size, fast_version);
+    __ bind(Litos);
+    __ b(shouldNotReachHere);
+  }
+
+  // ltos
+  {
+    assert(ltos == seq++, "ltos has unexpected value");
+    FixedSizeCodeBlock ltos_block(_masm, max_block_size, fast_version);
+    __ bind(Lltos);
+    __ pop(ltos);
+    if (!is_static) pop_and_check_object(Robj);
+#ifdef AARCH64
+    __ str(R0_tos, Address(Robj, Roffset));
+#else
+    __ add(Roffset, Robj, Roffset);
+    __ stmia(Roffset, RegisterSet(R0_tos_lo, R1_tos_hi));
+#endif // AARCH64
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_lputfield, R0_tmp, Rtemp, true, byte_no);
+    }
+    __ b(Done);
+  }
+
+  // ftos
+  {
+    assert(ftos == seq++, "ftos has unexpected value");
+    FixedSizeCodeBlock ftos_block(_masm, max_block_size, fast_version);
+    __ bind(Lftos);
+    // floats and ints are placed on stack in the same way, so
+    // we can use pop(itos) to transfer value without using VFP
+    __ pop(itos);
+    if (!is_static) pop_and_check_object(Robj);
+    __ str_32(R0_tos, Address(Robj, Roffset));
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_fputfield, R0_tmp, Rtemp, true, byte_no);
+    }
+    __ b(Done);
+  }
+
+  // dtos
+  {
+    assert(dtos == seq++, "dtos has unexpected value");
+    FixedSizeCodeBlock dtos_block(_masm, max_block_size, fast_version);
+    __ bind(Ldtos);
+    // doubles and longs are placed on stack in the same way, so
+    // we can use pop(ltos) to transfer value without using VFP
+    __ pop(ltos);
+    if (!is_static) pop_and_check_object(Robj);
+#ifdef AARCH64
+    __ str(R0_tos, Address(Robj, Roffset));
+#else
+    __ add(Rtemp, Robj, Roffset);
+    __ stmia(Rtemp, RegisterSet(R0_tos_lo, R1_tos_hi));
+#endif // AARCH64
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_dputfield, R0_tmp, Rtemp, true, byte_no);
+    }
+    __ b(Done);
+  }
+
+  // atos (last case, so it is not wrapped in a FixedSizeCodeBlock)
+  {
+    assert(atos == seq++, "atos has unexpected value");
+    __ bind(Latos);
+    __ pop(atos);
+    if (!is_static) pop_and_check_object(Robj);
+    // Store into the field
+    do_oop_store(_masm, Address(Robj, Roffset), R0_tos, Rtemp, R1_tmp, R5_tmp, _bs->kind(), false, false);
+    if (!is_static && rc == may_rewrite) {
+      patch_bytecode(Bytecodes::_fast_aputfield, R0_tmp, Rtemp, true, byte_no);
+    }
+    __ b(Done);
+  }
+
+  __ bind(shouldNotReachHere);
+  __ should_not_reach_here();
+
+  // itos case is frequent and is moved outside table switch
+  __ bind(Lint);
+  __ pop(itos);
+  if (!is_static) pop_and_check_object(Robj);
+  __ str_32(R0_tos, Address(Robj, Roffset));
+  if (!is_static && rc == may_rewrite) {
+    patch_bytecode(Bytecodes::_fast_iputfield, R0_tmp, Rtemp, true, byte_no);
+  }
+
+  __ bind(Done);
+
+  if (gen_volatile_check) {
+    Label notVolatile;
+    if (is_static) {
+      // Just check for volatile. Memory barrier for static final field
+      // is handled by class initialization.
+      __ tbz(Rflagsav, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+      volatile_barrier(MacroAssembler::StoreLoad, Rtemp);
+      __ bind(notVolatile);
+    } else {
+      // Check for volatile field and final field
+      Label skipMembar;
+
+      __ tst(Rflagsav, 1 << ConstantPoolCacheEntry::is_volatile_shift |
+                       1 << ConstantPoolCacheEntry::is_final_shift);
+      __ b(skipMembar, eq);  // neither volatile nor final: no barrier needed
+
+      __ tbz(Rflagsav, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+
+      // StoreLoad barrier after volatile field write
+      volatile_barrier(MacroAssembler::StoreLoad, Rtemp);
+      __ b(skipMembar);
+
+      // StoreStore barrier after final field write
+      __ bind(notVolatile);
+      volatile_barrier(MacroAssembler::StoreStore, Rtemp);
+
+      __ bind(skipMembar);
+    }
+  }
+
+}
+
+void TemplateTable::putfield(int byte_no) {
+  putfield_or_static(byte_no, false);  // is_static = false; rc is left at its default
+}
+
+void TemplateTable::nofast_putfield(int byte_no) {
+  putfield_or_static(byte_no, false, may_not_rewrite);  // instance field; never patched to a _fast_ bytecode
+}
+
+void TemplateTable::putstatic(int byte_no) {
+  putfield_or_static(byte_no, true);  // is_static = true; rc is left at its default
+}
+
+
+void TemplateTable::jvmti_post_fast_field_mod() {
+  // This no-argument version of jvmti_post_fast_field_mod() is not used on ARM; fast_storefield calls the TosState overload.
+  Unimplemented();
+}
+
+// Blows volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64), Rtemp, LR,
+// but preserves tosca with the given state (it is pushed before the VM call and popped after).
+void TemplateTable::jvmti_post_fast_field_mod(TosState state) {
+  if (__ can_post_field_modification()) {
+    // Check to see if a field modification watch has been set before we take
+    // the time to call into the VM.
+    Label done;
+
+    __ ldr_global_s32(R2, (address)JvmtiExport::get_field_modification_count_addr());
+    __ cbz(R2, done);  // count is zero: skip the VM call
+
+    __ pop_ptr(R3);               // copy the object pointer from tos
+    __ verify_oop(R3);
+    __ push_ptr(R3);              // put the object pointer back on tos
+
+    __ push(state);               // save value on the stack
+
+    // access constant pool cache entry
+    __ get_cache_entry_pointer_at_bcp(R2, R1, 1);
+
+    __ mov(R1, R3);
+    assert(Interpreter::expr_offset_in_bytes(0) == 0, "adjust this code");
+    __ mov(R3, Rstack_top); // put tos addr into R3
+
+    // R1: object pointer copied above
+    // R2: cache entry pointer
+    // R3: jvalue object on the stack
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_field_modification), R1, R2, R3);
+
+    __ pop(state);                // restore value
+
+    __ bind(done);
+  }
+}
+
+
+void TemplateTable::fast_storefield(TosState state) {
+  transition(state, vtos);
+  // Generates the _fast_*putfield bytecodes; the store emitted is selected by bytecode() below.
+  ByteSize base = ConstantPoolCache::base_offset();
+
+  jvmti_post_fast_field_mod(state);
+
+  const Register Rcache  = R2_tmp;
+  const Register Rindex  = R3_tmp;
+  const Register Roffset = R3_tmp;     // shares R3_tmp with Rindex
+  const Register Rflags  = Rtmp_save0; // R4/R19
+  const Register Robj    = R5_tmp;
+
+  const bool gen_volatile_check = os::is_MP();
+
+  // access constant pool cache
+  __ get_cache_and_index_at_bcp(Rcache, Rindex, 1);
+
+  __ add(Rcache, Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord));  // Rcache is now the base for this entry's field loads
+
+  if (gen_volatile_check) {
+    // load flags to test volatile
+    __ ldr_u32(Rflags, Address(Rcache, base + ConstantPoolCacheEntry::flags_offset()));
+  }
+
+  // replace index with field offset from cache entry
+  __ ldr(Roffset, Address(Rcache, base + ConstantPoolCacheEntry::f2_offset()));
+
+  if (gen_volatile_check) {
+    // Check for volatile store
+    Label notVolatile;
+    __ tbz(Rflags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+
+    // TODO-AARCH64 on AArch64, store-release instructions can be used to get rid of this explicit barrier
+    volatile_barrier(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore | MacroAssembler::LoadStore), Rtemp);
+
+    __ bind(notVolatile);
+  }
+
+  // Get object from stack
+  pop_and_check_object(Robj);
+
+  // access field
+  switch (bytecode()) {
+    case Bytecodes::_fast_zputfield: __ and_32(R0_tos, R0_tos, 1);
+                                     // fall through
+    case Bytecodes::_fast_bputfield: __ strb(R0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_sputfield: // fall through
+    case Bytecodes::_fast_cputfield: __ strh(R0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_iputfield: __ str_32(R0_tos, Address(Robj, Roffset)); break;
+#ifdef AARCH64
+    case Bytecodes::_fast_lputfield: __ str  (R0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_fputfield: __ str_s(S0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_dputfield: __ str_d(D0_tos, Address(Robj, Roffset)); break;
+#else
+    case Bytecodes::_fast_lputfield: __ add(Robj, Robj, Roffset);
+                                     __ stmia(Robj, RegisterSet(R0_tos_lo, R1_tos_hi)); break;
+
+#ifdef __SOFTFP__
+    case Bytecodes::_fast_fputfield: __ str(R0_tos, Address(Robj, Roffset));  break;
+    case Bytecodes::_fast_dputfield: __ add(Robj, Robj, Roffset);
+                                     __ stmia(Robj, RegisterSet(R0_tos_lo, R1_tos_hi)); break;
+#else
+    case Bytecodes::_fast_fputfield: __ add(Robj, Robj, Roffset);
+                                     __ fsts(S0_tos, Address(Robj));          break;
+    case Bytecodes::_fast_dputfield: __ add(Robj, Robj, Roffset);
+                                     __ fstd(D0_tos, Address(Robj));          break;
+#endif // __SOFTFP__
+#endif // AARCH64
+
+    case Bytecodes::_fast_aputfield:  // R2_tmp (Rcache) is no longer read after this point and is passed as a temp
+      do_oop_store(_masm, Address(Robj, Roffset), R0_tos, Rtemp, R1_tmp, R2_tmp, _bs->kind(), false, false);
+      break;
+
+    default:
+      ShouldNotReachHere();
+  }
+
+  if (gen_volatile_check) {
+    Label notVolatile;
+    Label skipMembar;
+    __ tst(Rflags, 1 << ConstantPoolCacheEntry::is_volatile_shift |
+                   1 << ConstantPoolCacheEntry::is_final_shift);
+    __ b(skipMembar, eq);  // neither volatile nor final: no barrier needed
+
+    __ tbz(Rflags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+
+    // StoreLoad barrier after volatile field write
+    volatile_barrier(MacroAssembler::StoreLoad, Rtemp);
+    __ b(skipMembar);
+
+    // StoreStore barrier after final field write
+    __ bind(notVolatile);
+    volatile_barrier(MacroAssembler::StoreStore, Rtemp);
+
+    __ bind(skipMembar);
+  }
+}
+
+
+void TemplateTable::fast_accessfield(TosState state) {
+  transition(atos, state);
+  // Generates the _fast_*getfield bytecodes; the load emitted is selected by bytecode() below.
+  // do the JVMTI work here to avoid disturbing the register state below
+  if (__ can_post_field_access()) {
+    // Check to see if a field access watch has been set before we take
+    // the time to call into the VM.
+    Label done;
+    __ ldr_global_s32(R2, (address) JvmtiExport::get_field_access_count_addr());
+    __ cbz(R2, done);  // count is zero: skip the VM call
+    // access constant pool cache entry
+    __ get_cache_entry_pointer_at_bcp(R2, R1, 1);
+    __ push_ptr(R0_tos);  // save object pointer before call_VM() clobbers it
+    __ verify_oop(R0_tos);
+    __ mov(R1, R0_tos);
+    // R1: object pointer copied above
+    // R2: cache entry pointer
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::post_field_access), R1, R2);
+    __ pop_ptr(R0_tos);   // restore object pointer
+
+    __ bind(done);
+  }
+
+  const Register Robj    = R0_tos;
+  const Register Rcache  = R2_tmp;
+  const Register Rflags  = R2_tmp;  // shares R2_tmp with Rcache
+  const Register Rindex  = R3_tmp;
+  const Register Roffset = R3_tmp;  // shares R3_tmp with Rindex
+
+  const bool gen_volatile_check = os::is_MP();
+
+  // access constant pool cache
+  __ get_cache_and_index_at_bcp(Rcache, Rindex, 1);
+  // replace index with field offset from cache entry
+  __ add(Rtemp, Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord));
+  __ ldr(Roffset, Address(Rtemp, ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::f2_offset()));
+
+  if (gen_volatile_check) {
+    // load flags to test volatile
+    __ ldr_u32(Rflags, Address(Rtemp, ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::flags_offset()));
+  }
+
+  __ verify_oop(Robj);
+  __ null_check(Robj, Rtemp);
+
+  // access field
+  switch (bytecode()) {
+    case Bytecodes::_fast_bgetfield: __ ldrsb(R0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_sgetfield: __ ldrsh(R0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_cgetfield: __ ldrh (R0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_igetfield: __ ldr_s32(R0_tos, Address(Robj, Roffset)); break;
+#ifdef AARCH64
+    case Bytecodes::_fast_lgetfield: __ ldr  (R0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_fgetfield: __ ldr_s(S0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_dgetfield: __ ldr_d(D0_tos, Address(Robj, Roffset)); break;
+#else
+    case Bytecodes::_fast_lgetfield: __ add(Roffset, Robj, Roffset);
+                                     __ ldmia(Roffset, RegisterSet(R0_tos_lo, R1_tos_hi)); break;
+#ifdef __SOFTFP__
+    case Bytecodes::_fast_fgetfield: __ ldr  (R0_tos, Address(Robj, Roffset)); break;
+    case Bytecodes::_fast_dgetfield: __ add(Roffset, Robj, Roffset);
+                                     __ ldmia(Roffset, RegisterSet(R0_tos_lo, R1_tos_hi)); break;
+#else
+    case Bytecodes::_fast_fgetfield: __ add(Roffset, Robj, Roffset); __ flds(S0_tos, Address(Roffset)); break;
+    case Bytecodes::_fast_dgetfield: __ add(Roffset, Robj, Roffset); __ fldd(D0_tos, Address(Roffset)); break;
+#endif // __SOFTFP__
+#endif // AARCH64
+    case Bytecodes::_fast_agetfield: __ load_heap_oop(R0_tos, Address(Robj, Roffset)); __ verify_oop(R0_tos); break;
+    default:
+      ShouldNotReachHere();
+  }
+
+  if (gen_volatile_check) {
+    // Check for volatile load
+    Label notVolatile;
+    __ tbz(Rflags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+
+    // TODO-AARCH64 on AArch64, load-acquire instructions can be used to get rid of this explicit barrier
+    volatile_barrier(MacroAssembler::Membar_mask_bits(MacroAssembler::LoadLoad | MacroAssembler::LoadStore), Rtemp);
+
+    __ bind(notVolatile);
+  }
+}
+
+
+void TemplateTable::fast_xaccess(TosState state) {
+  transition(vtos, state);
+  // Loads a field of the object held in local 0 into the tos register for 'state' (itos, atos or ftos only).
+  const Register Robj = R1_tmp;
+  const Register Rcache = R2_tmp;
+  const Register Rindex = R3_tmp;
+  const Register Roffset = R3_tmp;  // shares R3_tmp with Rindex; Rindex is not read after Rtemp is computed
+  const Register Rflags = R4_tmp;
+  Label done;
+
+  // get receiver
+  __ ldr(Robj, aaddress(0));
+
+  // access constant pool cache (the cache index is read at bcp + 2); Roffset <- f2 of the entry
+  __ get_cache_and_index_at_bcp(Rcache, Rindex, 2);
+  __ add(Rtemp, Rcache, AsmOperand(Rindex, lsl, LogBytesPerWord));
+  __ ldr(Roffset, Address(Rtemp, ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::f2_offset()));
+
+  const bool gen_volatile_check = os::is_MP();  // volatile handling is only emitted when os::is_MP() is true
+
+  if (gen_volatile_check) {
+    // load flags to test volatile
+    __ ldr_u32(Rflags, Address(Rtemp, ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::flags_offset()));
+  }
+
+  // make sure exception is reported in correct bcp range (getfield is next instruction)
+  __ add(Rbcp, Rbcp, 1);
+  __ null_check(Robj, Rtemp);
+  __ sub(Rbcp, Rbcp, 1);
+
+#ifdef AARCH64
+  if (gen_volatile_check) {
+    Label notVolatile;
+    __ tbz(Rflags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+    // volatile field: use a load-acquire on the computed field address instead of a separate barrier
+    __ add(Rtemp, Robj, Roffset);
+    // NOTE(review): itos uses ldar_w here but ldr_s32 on the plain path - confirm both leave R0_tos in the same form
+    if (state == itos) {
+      __ ldar_w(R0_tos, Rtemp);
+    } else if (state == atos) {
+      if (UseCompressedOops) {
+        __ ldar_w(R0_tos, Rtemp);
+        __ decode_heap_oop(R0_tos);
+      } else {
+        __ ldar(R0_tos, Rtemp);
+      }
+      __ verify_oop(R0_tos);
+    } else if (state == ftos) {
+      __ ldar_w(R0_tos, Rtemp);
+      __ fmov_sw(S0_tos, R0_tos);
+    } else {
+      ShouldNotReachHere();
+    }
+    __ b(done);  // volatile load complete; skip the plain load below
+
+    __ bind(notVolatile);
+  }
+#endif // AARCH64
+
+  if (state == itos) {
+    __ ldr_s32(R0_tos, Address(Robj, Roffset));
+  } else if (state == atos) {
+    __ load_heap_oop(R0_tos, Address(Robj, Roffset));
+    __ verify_oop(R0_tos);
+  } else if (state == ftos) {
+#ifdef AARCH64
+    __ ldr_s(S0_tos, Address(Robj, Roffset));
+#else
+#ifdef __SOFTFP__
+    __ ldr(R0_tos, Address(Robj, Roffset));  // soft-float build: the float is loaded into the integer tos register
+#else
+    __ add(Roffset, Robj, Roffset);          // form the full address first, then load through it
+    __ flds(S0_tos, Address(Roffset));
+#endif // __SOFTFP__
+#endif // AARCH64
+  } else {
+    ShouldNotReachHere();
+  }
+
+#ifndef AARCH64
+  if (gen_volatile_check) {
+    // Check for volatile load
+    Label notVolatile;
+    __ tbz(Rflags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+    // volatile field: order the load just performed before later loads and stores
+    volatile_barrier(MacroAssembler::Membar_mask_bits(MacroAssembler::LoadLoad | MacroAssembler::LoadStore), Rtemp);
+
+    __ bind(notVolatile);
+  }
+#endif // !AARCH64
+
+  __ bind(done);
+}
+
+
+
+//----------------------------------------------------------------------------------------------------
+// Calls
+
+void TemplateTable::count_calls(Register method, Register temp) {
+  // implemented elsewhere; this generator emits nothing and must never be called
+  ShouldNotReachHere();
+}
+
+
+void TemplateTable::prepare_invoke(int byte_no,
+                                   Register method,  // linked method (or i-klass)
+                                   Register index,   // itable index, MethodType, etc.
+                                   Register recv,    // if caller wants to see it
+                                   Register flags    // if caller wants to test it
+                                   ) {
+  // Shared invoke prologue: loads the cp cache entry, pushes the appendix, loads the receiver, sets LR. First: determine flags
+  const Bytecodes::Code code = bytecode();
+  const bool is_invokeinterface  = code == Bytecodes::_invokeinterface;  // not read below
+  const bool is_invokedynamic    = code == Bytecodes::_invokedynamic;
+  const bool is_invokehandle     = code == Bytecodes::_invokehandle;
+  const bool is_invokevirtual    = code == Bytecodes::_invokevirtual;
+  const bool is_invokespecial    = code == Bytecodes::_invokespecial;    // not read below
+  const bool load_receiver       = (recv != noreg);
+  assert(load_receiver == (code != Bytecodes::_invokestatic && code != Bytecodes::_invokedynamic), "");
+  assert(recv  == noreg || recv  == R2, "");
+  assert(flags == noreg || flags == R3, "");
+
+  // setup registers & access constant pool cache
+  if (recv  == noreg)  recv  = R2;
+  if (flags == noreg)  flags = R3;
+  const Register temp = Rtemp;
+  const Register ret_type = R1_tmp;
+  assert_different_registers(method, index, flags, recv, LR, ret_type, temp);
+
+  // save 'interpreter return address'
+  __ save_bcp();
+
+  load_invoke_cp_cache_entry(byte_no, method, index, flags, is_invokevirtual, false, is_invokedynamic);
+
+  // maybe push extra argument
+  if (is_invokedynamic || is_invokehandle) {
+    Label L_no_push;
+    __ tbz(flags, ConstantPoolCacheEntry::has_appendix_shift, L_no_push);
+    __ mov(temp, index);
+    assert(ConstantPoolCacheEntry::_indy_resolved_references_appendix_offset == 0, "appendix expected at index+0");
+    __ load_resolved_reference_at_index(index, temp);
+    __ verify_oop(index);
+    __ push_ptr(index);  // push appendix (MethodType, CallSite, etc.)
+    __ bind(L_no_push);
+  }
+
+  // load receiver if needed (after extra argument is pushed so parameter size is correct)
+  if (load_receiver) {
+    __ andr(temp, flags, (uintx)ConstantPoolCacheEntry::parameter_size_mask);  // get parameter size
+    Address recv_addr = __ receiver_argument_address(Rstack_top, temp, recv);
+    __ ldr(recv, recv_addr);
+    __ verify_oop(recv);
+  }
+
+  // compute return type
+  __ logical_shift_right(ret_type, flags, ConstantPoolCacheEntry::tos_state_shift);
+  // Make sure we don't need to mask flags after the above shift
+  ConstantPoolCacheEntry::verify_tos_state_shift();
+  // load return address: LR <- invoke return entry table[ret_type] for this bytecode
+  { const address table = (address) Interpreter::invoke_return_entry_table_for(code);
+    __ mov_slow(temp, table);
+    __ ldr(LR, Address::indexed_ptr(temp, ret_type));
+  }
+}
+
+
+void TemplateTable::invokevirtual_helper(Register index,
+                                         Register recv,
+                                         Register flags) {
+  // Emits the dispatch for a virtual call: direct jump if the vfinal flag is set, vtable lookup otherwise.
+  const Register recv_klass = R2_tmp;
+
+  assert_different_registers(index, recv, flags, Rtemp);
+  assert_different_registers(index, recv_klass, R0_tmp, Rtemp);
+
+  // Test for an invoke of a final method
+  Label notFinal;
+  __ tbz(flags, ConstantPoolCacheEntry::is_vfinal_shift, notFinal);
+
+  assert(index == Rmethod, "Method* must be Rmethod, for interpreter calling convention");
+
+  // do the call - the index is actually the method to call
+
+  // It's final, need a null check here!
+  __ null_check(recv, Rtemp);
+
+  // profile this call
+  __ profile_final_call(R0_tmp);
+
+  __ jump_from_interpreted(Rmethod);
+
+  __ bind(notFinal);
+
+  // get receiver klass
+  __ null_check(recv, Rtemp, oopDesc::klass_offset_in_bytes());
+  __ load_klass(recv_klass, recv);
+
+  // profile this call
+  __ profile_virtual_call(R0_tmp, recv_klass);
+
+  // get target Method* & entry point: Rmethod <- vtable entry number 'index' of recv_klass
+  const int base = in_bytes(Klass::vtable_start_offset());
+  assert(vtableEntry::size() == 1, "adjust the scaling in the code below");
+  __ add(Rtemp, recv_klass, AsmOperand(index, lsl, LogHeapWordSize));
+  __ ldr(Rmethod, Address(Rtemp, base + vtableEntry::method_offset_in_bytes()));
+  __ jump_from_interpreted(Rmethod);
+}
+
+void TemplateTable::invokevirtual(int byte_no) {
+  transition(vtos, vtos);
+  assert(byte_no == f2_byte, "use this argument");
+  // Template for invokevirtual: set up through prepare_invoke, then dispatch through invokevirtual_helper.
+  const Register receiver_reg = R2_tmp;
+  const Register flags_reg    = R3_tmp;
+
+  prepare_invoke(byte_no, Rmethod, noreg, receiver_reg, flags_reg);
+
+  // Register contents at this point:
+  //   Rmethod      - index
+  //   receiver_reg - receiver
+  //   flags_reg    - flags;  LR - return address
+
+  invokevirtual_helper(Rmethod, receiver_reg, flags_reg);
+}
+
+
+void TemplateTable::invokespecial(int byte_no) {
+  transition(vtos, vtos);
+  assert(byte_no == f1_byte, "use this argument");
+  const Register receiver_reg = R2_tmp;                    // receiver loaded by prepare_invoke
+  prepare_invoke(byte_no, Rmethod, noreg, receiver_reg);
+  __ verify_oop(receiver_reg);
+  __ null_check(receiver_reg, Rtemp);
+  // profile the call, then jump to the target held in Rmethod
+  __ profile_call(receiver_reg);
+  __ jump_from_interpreted(Rmethod);
+}
+
+
+void TemplateTable::invokestatic(int byte_no) {
+  transition(vtos, vtos);
+  assert(byte_no == f1_byte, "use this argument");
+  prepare_invoke(byte_no, Rmethod);  // no recv register is passed for a static call
+  // do the call (R2_tmp is the register handed to the profiling helper)
+  __ profile_call(R2_tmp);
+  __ jump_from_interpreted(Rmethod);
+}
+
+
+void TemplateTable::fast_invokevfinal(int byte_no) {
+  transition(vtos, vtos);
+  assert(byte_no == f2_byte, "use this argument");
+  __ stop("fast_invokevfinal is not used on ARM");  // the template only emits a stop with this message
+}
+
+
+void TemplateTable::invokeinterface(int byte_no) {
+  transition(vtos, vtos);
+  assert(byte_no == f1_byte, "use this argument");
+  // Template for invokeinterface: scans the receiver klass's itable for the interface, then calls the method found.
+  const Register Ritable = R1_tmp;
+  const Register Rrecv   = R2_tmp;
+  const Register Rinterf = R5_tmp;
+  const Register Rindex  = R4_tmp;
+  const Register Rflags  = R3_tmp;
+  const Register Rklass  = R3_tmp;  // shares R3_tmp with Rflags; Rflags is not read once Rklass is loaded
+
+  prepare_invoke(byte_no, Rinterf, Rindex, Rrecv, Rflags);
+
+  // Special case of invokeinterface called for virtual method of
+  // java.lang.Object.  See cpCacheOop.cpp for details.
+  // This code isn't produced by javac, but could be produced by
+  // another compliant java compiler.
+  Label notMethod;
+  __ tbz(Rflags, ConstantPoolCacheEntry::is_forced_virtual_shift, notMethod);
+
+  __ mov(Rmethod, Rindex);
+  invokevirtual_helper(Rmethod, Rrecv, Rflags);
+  __ bind(notMethod);
+
+  // Get receiver klass into Rklass - also a null check
+  __ load_klass(Rklass, Rrecv);
+
+  // profile this call
+  __ profile_virtual_call(R0_tmp, Rklass);
+
+  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
+  const int base = in_bytes(Klass::vtable_start_offset());
+  assert(vtableEntry::size() == 1, "adjust the scaling in the code below");
+  __ ldr_s32(Rtemp, Address(Rklass, Klass::vtable_length_offset())); // Get length of vtable
+  __ add(Ritable, Rklass, base);
+  __ add(Ritable, Ritable, AsmOperand(Rtemp, lsl, LogBytesPerWord));
+
+  Label entry, search, interface_ok;
+
+  __ b(entry);
+
+  __ bind(search);
+  __ add(Ritable, Ritable, itableOffsetEntry::size() * HeapWordSize);  // advance to the next itableOffsetEntry
+
+  __ bind(entry);
+
+  // Check that the entry is non-null.  A null entry means that the receiver
+  // class doesn't implement the interface, and wasn't the same as the
+  // receiver class checked when the interface was resolved.
+
+  __ ldr(Rtemp, Address(Ritable, itableOffsetEntry::interface_offset_in_bytes()));
+  __ cbnz(Rtemp, interface_ok);
+
+  // throw exception
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address,
+                   InterpreterRuntime::throw_IncompatibleClassChangeError));
+
+  // the call_VM checks for exception, so we should never return here.
+  __ should_not_reach_here();
+
+  __ bind(interface_ok);
+
+  __ cmp(Rinterf, Rtemp);
+  __ b(search, ne);  // not the interface we are looking for: try the next entry
+
+  __ ldr_s32(Rtemp, Address(Ritable, itableOffsetEntry::offset_offset_in_bytes()));
+  __ add(Rtemp, Rtemp, Rklass); // Add offset to Klass*
+  assert(itableMethodEntry::size() == 1, "adjust the scaling in the code below");
+
+  __ ldr(Rmethod, Address::indexed_ptr(Rtemp, Rindex));
+
+  // Rmethod: Method* to call
+
+  // Check for abstract method error
+  // Note: This should be done more efficiently via a throw_abstract_method_error
+  //       interpreter entry point and a conditional jump to it in case of a null
+  //       method.
+  { Label L;
+    __ cbnz(Rmethod, L);
+    // throw exception
+    // note: must restore interpreter registers to canonical
+    //       state for exception handling to work correctly!
+    __ restore_method();
+    __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_AbstractMethodError));
+    // the call_VM checks for exception, so we should never return here.
+    __ should_not_reach_here();
+    __ bind(L);
+  }
+
+  // do the call
+  __ jump_from_interpreted(Rmethod);
+}
+
+void TemplateTable::invokehandle(int byte_no) {
+  transition(vtos, vtos);
+  // Template for invokehandle: prepare_invoke fills R5_method, which is copied to Rmethod just before the jump.
+  // TODO-AARCH64 review register usage
+  const Register Rrecv  = R2_tmp;
+  const Register Rmtype = R4_tmp;
+  const Register R5_method = R5_tmp;  // can't reuse Rmethod!
+
+  prepare_invoke(byte_no, R5_method, Rmtype, Rrecv);
+  __ null_check(Rrecv, Rtemp);
+
+  // Rmtype:  MethodType object (from cpool->resolved_references[f1], if necessary)
+  // R5_method: MH.invokeExact_MT method (from f2); moved to Rmethod below
+
+  // Note:  Rmtype is already pushed (if necessary) by prepare_invoke
+
+  // do the call
+  __ profile_final_call(R3_tmp);  // FIXME: profile the LambdaForm also
+  __ mov(Rmethod, R5_method);
+  __ jump_from_interpreted(Rmethod);
+}
+
+void TemplateTable::invokedynamic(int byte_no) {
+  transition(vtos, vtos);
+  // Template for invokedynamic: prepare_invoke fills R5_method, which is copied to Rmethod just before the jump.
+  // TODO-AARCH64 review register usage
+  const Register Rcallsite = R4_tmp;
+  const Register R5_method = R5_tmp;  // can't reuse Rmethod!
+
+  prepare_invoke(byte_no, R5_method, Rcallsite);
+
+  // Rcallsite: CallSite object (from cpool->resolved_references[f1])
+  // R5_method: MH.linkToCallSite method (from f2); moved to Rmethod below
+
+  // Note:  Rcallsite is already pushed by prepare_invoke
+
+  if (ProfileInterpreter) {
+    __ profile_call(R2_tmp);
+  }
+
+  // do the call
+  __ mov(Rmethod, R5_method);
+  __ jump_from_interpreted(Rmethod);
+}
+
+//----------------------------------------------------------------------------------------------------
+// Allocation
+
+void TemplateTable::_new() {
+  transition(vtos, atos);
+  // Template for 'new': inline allocation (TLAB, then shared eden if allowed) with a call_VM slow path.
+  const Register Robj   = R0_tos;
+  const Register Rcpool = R1_tmp;
+  const Register Rindex = R2_tmp;
+  const Register Rtags  = R3_tmp;
+  const Register Rsize  = R3_tmp;  // shares R3_tmp with Rtags
+
+  Register Rklass = R4_tmp;
+  assert_different_registers(Rcpool, Rindex, Rtags, Rklass, Rtemp);
+  assert_different_registers(Rcpool, Rindex, Rklass, Rsize);
+
+  Label slow_case;
+  Label done;
+  Label initialize_header;
+  Label initialize_object;  // including clearing the fields
+  Label allocate_shared;
+
+  const bool allow_shared_alloc =
+    Universe::heap()->supports_inline_contig_alloc();
+
+  // Literals
+  InlinedAddress Lheap_top_addr(allow_shared_alloc ? (address)Universe::heap()->top_addr() : NULL);
+
+  __ get_unsigned_2_byte_index_at_bcp(Rindex, 1);
+  __ get_cpool_and_tags(Rcpool, Rtags);
+
+  // Make sure the class we're about to instantiate has been resolved.
+  // This is done before loading InstanceKlass to be consistent with the order
+  // how Constant Pool is updated (see ConstantPool::klass_at_put)
+  const int tags_offset = Array<u1>::base_offset_in_bytes();
+  __ add(Rtemp, Rtags, Rindex);
+
+#ifdef AARCH64
+  __ add(Rtemp, Rtemp, tags_offset);
+  __ ldarb(Rtemp, Rtemp);
+#else
+  __ ldrb(Rtemp, Address(Rtemp, tags_offset));
+
+  // use Rklass as a scratch
+  volatile_barrier(MacroAssembler::LoadLoad, Rklass);
+#endif // AARCH64
+
+  // get InstanceKlass
+  __ add(Rklass, Rcpool, AsmOperand(Rindex, lsl, LogBytesPerWord));
+  __ ldr(Rklass, Address(Rklass, sizeof(ConstantPool)));
+  __ cmp(Rtemp, JVM_CONSTANT_Class);
+  __ b(slow_case, ne);
+
+  // make sure klass is fully initialized; the finalizer / slow-path bit
+  // is tested further down on the layout helper
+  __ ldrb(Rtemp, Address(Rklass, InstanceKlass::init_state_offset()));
+  __ cmp(Rtemp, InstanceKlass::fully_initialized);
+  __ b(slow_case, ne);
+
+  // get instance_size in InstanceKlass (scaled to a count of bytes)
+  __ ldr_u32(Rsize, Address(Rklass, Klass::layout_helper_offset()));
+
+  // test to see if it has a finalizer or is malformed in some way
+  // Klass::_lh_instance_slow_path_bit is really a bit mask, not bit number
+  __ tbnz(Rsize, exact_log2(Klass::_lh_instance_slow_path_bit), slow_case);
+
+  //
+  // Allocate the instance
+  // 1) Try to allocate in the TLAB
+  // 2) if fail and the object is large allocate in the shared Eden
+  // 3) if the above fails (or is not applicable), go to a slow case
+  // (creates a new TLAB, etc.)
+
+  if (UseTLAB) {
+    const Register Rtlab_top = R1_tmp;
+    const Register Rtlab_end = R2_tmp;
+    assert_different_registers(Robj, Rsize, Rklass, Rtlab_top, Rtlab_end);
+
+    __ ldr(Robj, Address(Rthread, JavaThread::tlab_top_offset()));
+    __ ldr(Rtlab_end, Address(Rthread, in_bytes(JavaThread::tlab_end_offset())));
+    __ add(Rtlab_top, Robj, Rsize);
+    __ cmp(Rtlab_top, Rtlab_end);
+    __ b(allow_shared_alloc ? allocate_shared : slow_case, hi);
+    __ str(Rtlab_top, Address(Rthread, JavaThread::tlab_top_offset()));
+    if (ZeroTLAB) {
+      // the fields have been already cleared
+      __ b(initialize_header);
+    } else {
+      // initialize both the header and fields
+      __ b(initialize_object);
+    }
+  }
+
+  // Allocation in the shared Eden, if allowed.
+  if (allow_shared_alloc) {
+    __ bind(allocate_shared);
+
+    const Register Rheap_top_addr = R2_tmp;
+    const Register Rheap_top = R5_tmp;
+    const Register Rheap_end = Rtemp;
+    assert_different_registers(Robj, Rklass, Rsize, Rheap_top_addr, Rheap_top, Rheap_end, LR);
+
+    // heap_end now (re)loaded in the loop since also used as a scratch register in the CAS
+    __ ldr_literal(Rheap_top_addr, Lheap_top_addr);
+
+    Label retry;
+    __ bind(retry);
+
+#ifdef AARCH64
+    __ ldxr(Robj, Rheap_top_addr);
+#else
+    __ ldr(Robj, Address(Rheap_top_addr));
+#endif // AARCH64
+
+    __ ldr(Rheap_end, Address(Rheap_top_addr, (intptr_t)Universe::heap()->end_addr()-(intptr_t)Universe::heap()->top_addr()));
+    __ add(Rheap_top, Robj, Rsize);
+    __ cmp(Rheap_top, Rheap_end);
+    __ b(slow_case, hi);
+
+    // Update heap top atomically.
+    // If someone beats us on the allocation, try again, otherwise continue.
+#ifdef AARCH64
+    __ stxr(Rtemp2, Rheap_top, Rheap_top_addr);
+    __ cbnz_w(Rtemp2, retry);
+#else
+    __ atomic_cas_bool(Robj, Rheap_top, Rheap_top_addr, 0, Rheap_end/*scratched*/);
+    __ b(retry, ne);
+#endif // AARCH64
+
+    __ incr_allocated_bytes(Rsize, Rtemp);
+  }
+
+  if (UseTLAB || allow_shared_alloc) {
+    const Register Rzero0 = R1_tmp;
+    const Register Rzero1 = R2_tmp;
+    const Register Rzero_end = R5_tmp;
+    const Register Rzero_cur = Rtemp;
+    assert_different_registers(Robj, Rsize, Rklass, Rzero0, Rzero1, Rzero_cur, Rzero_end);
+
+    // The object is initialized before the header.  If the object size is
+    // zero, go directly to the header initialization.
+    __ bind(initialize_object);
+    __ subs(Rsize, Rsize, sizeof(oopDesc));      // Rsize <- number of bytes after the header
+    __ add(Rzero_cur, Robj, sizeof(oopDesc));    // Rzero_cur <- first byte after the header
+    __ b(initialize_header, eq);
+
+#ifdef ASSERT
+    // make sure Rsize is a multiple of 8
+    Label L;
+    __ tst(Rsize, 0x07);
+    __ b(L, eq);
+    __ stop("object size is not multiple of 8 - adjust this code");
+    __ bind(L);
+#endif
+
+#ifdef AARCH64
+    {
+      Label loop;
+      // Step back by 1 word if object size is not a multiple of 2*wordSize.
+      assert(wordSize <= sizeof(oopDesc), "oop header should contain at least one word");
+      __ andr(Rtemp2, Rsize, (uintx)wordSize);
+      __ sub(Rzero_cur, Rzero_cur, Rtemp2);
+
+      // Zero by 2 words per iteration.
+      __ bind(loop);
+      __ subs(Rsize, Rsize, 2*wordSize);
+      __ stp(ZR, ZR, Address(Rzero_cur, 2*wordSize, post_indexed));
+      __ b(loop, gt);
+    }
+#else
+    __ mov(Rzero0, 0);
+    __ mov(Rzero1, 0);
+    __ add(Rzero_end, Rzero_cur, Rsize);
+
+    // initialize remaining object fields: Rsize was a multiple of 8
+    { Label loop;
+      // loop is unrolled 2 times
+      __ bind(loop);
+      // #1
+      __ stmia(Rzero_cur, RegisterSet(Rzero0) | RegisterSet(Rzero1), writeback);
+      __ cmp(Rzero_cur, Rzero_end);
+      // #2 (conditional on 'ne': skipped once Rzero_cur has reached Rzero_end)
+      __ stmia(Rzero_cur, RegisterSet(Rzero0) | RegisterSet(Rzero1), writeback, ne);
+      __ cmp(Rzero_cur, Rzero_end, ne);
+      __ b(loop, ne);
+    }
+#endif // AARCH64
+
+    // initialize object header only.
+    __ bind(initialize_header);
+    if (UseBiasedLocking) {
+      __ ldr(Rtemp, Address(Rklass, Klass::prototype_header_offset()));
+    } else {
+      __ mov_slow(Rtemp, (intptr_t)markOopDesc::prototype());
+    }
+    // mark
+    __ str(Rtemp, Address(Robj, oopDesc::mark_offset_in_bytes()));
+
+    // klass
+#ifdef AARCH64
+    __ store_klass_gap(Robj);
+#endif // AARCH64
+    __ store_klass(Rklass, Robj); // blows Rklass:
+    Rklass = noreg;
+
+    // Note: Disable DTrace runtime check for now to eliminate overhead on each allocation
+    if (DTraceAllocProbes) {
+      // Trigger dtrace event for fastpath
+      Label Lcontinue;
+
+      __ ldrb_global(Rtemp, (address)&DTraceAllocProbes);
+      __ cbz(Rtemp, Lcontinue);
+
+      __ push(atos);
+      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_object_alloc), Robj);
+      __ pop(atos);
+
+      __ bind(Lcontinue);
+    }
+
+    __ b(done);
+  } else {
+    // jump over literals
+    __ b(slow_case);
+  }
+
+  if (allow_shared_alloc) {
+    __ bind_literal(Lheap_top_addr);
+  }
+
+  // slow case: let the runtime allocate; the cpool and index are re-read because their registers were reused above
+  __ bind(slow_case);
+  __ get_constant_pool(Rcpool);
+  __ get_unsigned_2_byte_index_at_bcp(Rindex, 1);
+  __ call_VM(Robj, CAST_FROM_FN_PTR(address, InterpreterRuntime::_new), Rcpool, Rindex);
+
+  // continue
+  __ bind(done);
+
+  // StoreStore barrier required after complete initialization
+  // (headers + content zeroing), before the object may escape.
+  __ membar(MacroAssembler::StoreStore, R1_tmp);
+}
+
+
+void TemplateTable::newarray() {
+  transition(itos, atos);
+  __ ldrb(R1, at_bcp(1));   // R1 <- byte operand at bcp + 1 (presumably the element type code)
+  __ mov(R2, R0_tos);       // R2 <- int on top of stack (presumably the requested length)
+  call_VM(R0_tos, CAST_FROM_FN_PTR(address, InterpreterRuntime::newarray), R1, R2);
+  // MacroAssembler::StoreStore useless (included in the runtime exit path)
+}
+
+
+void TemplateTable::anewarray() {
+  transition(itos, atos);
+  __ get_unsigned_2_byte_index_at_bcp(R2, 1);  // R2 <- 2-byte constant pool index at bcp + 1
+  __ get_constant_pool(R1);                    // R1 <- constant pool
+  __ mov(R3, R0_tos);                          // R3 <- int on top of stack (presumably the requested length)
+  call_VM(R0_tos, CAST_FROM_FN_PTR(address, InterpreterRuntime::anewarray), R1, R2, R3);
+  // MacroAssembler::StoreStore useless (included in the runtime exit path)
+}
+
+
+void TemplateTable::arraylength() {
+  transition(atos, itos);
+  __ null_check(R0_tos, Rtemp, arrayOopDesc::length_offset_in_bytes());         // array reference must not be null
+  __ ldr_s32(R0_tos, Address(R0_tos, arrayOopDesc::length_offset_in_bytes()));  // replace the reference by its length field
+}
+
+
+void TemplateTable::checkcast() {
+  transition(atos, atos);
+  Label done, is_null, quicked, resolved, throw_exception;
+  // Template for checkcast: a null reference passes; otherwise a failed subtype check branches to the ClassCastException entry.
+  const Register Robj = R0_tos;
+  const Register Rcpool = R2_tmp;
+  const Register Rtags = R3_tmp;
+  const Register Rindex = R4_tmp;
+  const Register Rsuper = R3_tmp;  // shares R3_tmp with Rtags
+  const Register Rsub   = R4_tmp;  // shares R4_tmp with Rindex
+  const Register Rsubtype_check_tmp1 = R1_tmp;
+  const Register Rsubtype_check_tmp2 = LR_tmp;
+
+  __ cbz(Robj, is_null);
+
+  // Get cpool & tags index
+  __ get_cpool_and_tags(Rcpool, Rtags);
+  __ get_unsigned_2_byte_index_at_bcp(Rindex, 1);
+
+  // See if bytecode has already been quicked
+  __ add(Rtemp, Rtags, Rindex);
+#ifdef AARCH64
+  // TODO-AARCH64: investigate if LoadLoad barrier is needed here or control dependency is enough
+  __ add(Rtemp, Rtemp, Array<u1>::base_offset_in_bytes());
+  __ ldarb(Rtemp, Rtemp); // acts as LoadLoad memory barrier
+#else
+  __ ldrb(Rtemp, Address(Rtemp, Array<u1>::base_offset_in_bytes()));
+#endif // AARCH64
+
+  __ cmp(Rtemp, JVM_CONSTANT_Class);
+
+#ifndef AARCH64
+  volatile_barrier(MacroAssembler::LoadLoad, Rtemp, true);
+#endif // !AARCH64
+
+  __ b(quicked, eq);
+  // not yet a resolved class entry: call the runtime, keeping the object on the expression stack across the call
+  __ push(atos);
+  call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc));
+  // vm_result_2 has metadata result
+  __ get_vm_result_2(Rsuper, Robj);
+  __ pop_ptr(Robj);
+  __ b(resolved);
+
+  __ bind(throw_exception);
+  // Come here on failure of subtype check
+  __ profile_typecheck_failed(R1_tmp);
+  __ mov(R2_ClassCastException_obj, Robj);             // convention with generate_ClassCastException_handler()
+  __ b(Interpreter::_throw_ClassCastException_entry);
+
+  // Get superklass in Rsuper and subklass in Rsub
+  __ bind(quicked);
+  __ add(Rtemp, Rcpool, AsmOperand(Rindex, lsl, LogBytesPerWord));
+  __ ldr(Rsuper, Address(Rtemp, sizeof(ConstantPool)));
+
+  __ bind(resolved);
+  __ load_klass(Rsub, Robj);
+
+  // Generate subtype check. Blows both tmps and Rtemp.
+  assert_different_registers(Robj, Rsub, Rsuper, Rsubtype_check_tmp1, Rsubtype_check_tmp2, Rtemp);
+  __ gen_subtype_check(Rsub, Rsuper, throw_exception, Rsubtype_check_tmp1, Rsubtype_check_tmp2);
+
+  // Come here on success
+
+  // Collect counts on whether this check-cast sees NULLs a lot or not.
+  if (ProfileInterpreter) {
+    __ b(done);
+    __ bind(is_null);
+    __ profile_null_seen(R1_tmp);
+  } else {
+    __ bind(is_null);   // same as 'done'
+  }
+  __ bind(done);
+}
+
+
+void TemplateTable::instanceof() {
+  // result = 0: obj == NULL or  obj is not an instanceof the specified klass
+  // result = 1: obj != NULL and obj is     an instanceof the specified klass
+  // The result is left in R0_tos; a null reference skips the type check entirely.
+  transition(atos, itos);
+  Label done, is_null, not_subtype, quicked, resolved;
+
+  const Register Robj = R0_tos;
+  const Register Rcpool = R2_tmp;
+  const Register Rtags = R3_tmp;
+  const Register Rindex = R4_tmp;
+  const Register Rsuper = R3_tmp;  // shares R3_tmp with Rtags
+  const Register Rsub   = R4_tmp;  // shares R4_tmp with Rindex
+  const Register Rsubtype_check_tmp1 = R0_tmp;
+  const Register Rsubtype_check_tmp2 = R1_tmp;
+
+  __ cbz(Robj, is_null);
+
+  // Rsub is loaded at 'resolved' only: it shares R4_tmp with Rindex, so a load here would be overwritten just below.
+
+  // Get cpool & tags index
+  __ get_cpool_and_tags(Rcpool, Rtags);
+  __ get_unsigned_2_byte_index_at_bcp(Rindex, 1);
+
+  // See if bytecode has already been quicked
+  __ add(Rtemp, Rtags, Rindex);
+#ifdef AARCH64
+  // TODO-AARCH64: investigate if LoadLoad barrier is needed here or control dependency is enough
+  __ add(Rtemp, Rtemp, Array<u1>::base_offset_in_bytes());
+  __ ldarb(Rtemp, Rtemp); // acts as LoadLoad memory barrier
+#else
+  __ ldrb(Rtemp, Address(Rtemp, Array<u1>::base_offset_in_bytes()));
+#endif // AARCH64
+  __ cmp(Rtemp, JVM_CONSTANT_Class);
+
+#ifndef AARCH64
+  volatile_barrier(MacroAssembler::LoadLoad, Rtemp, true);
+#endif // !AARCH64
+
+  __ b(quicked, eq);
+  // not yet a resolved class entry: call the runtime, keeping the object on the expression stack across the call
+  __ push(atos);
+  call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc));
+  // vm_result_2 has metadata result
+  __ get_vm_result_2(Rsuper, Robj);
+  __ pop_ptr(Robj);
+  __ b(resolved);
+
+  // Get superklass in Rsuper and subklass in Rsub
+  __ bind(quicked);
+  __ add(Rtemp, Rcpool, AsmOperand(Rindex, lsl, LogBytesPerWord));
+  __ ldr(Rsuper, Address(Rtemp, sizeof(ConstantPool)));
+
+  __ bind(resolved);
+  __ load_klass(Rsub, Robj);
+
+  // Generate subtype check. Blows both tmps and Rtemp.
+  __ gen_subtype_check(Rsub, Rsuper, not_subtype, Rsubtype_check_tmp1, Rsubtype_check_tmp2);
+
+  // Come here on success
+  __ mov(R0_tos, 1);
+  __ b(done);
+
+  __ bind(not_subtype);
+  // Come here on failure
+  __ profile_typecheck_failed(R1_tmp);
+  __ mov(R0_tos, 0);
+
+  // Collect counts on whether this test sees NULLs a lot or not.
+  if (ProfileInterpreter) {
+    __ b(done);
+    __ bind(is_null);
+    __ profile_null_seen(R1_tmp);
+  } else {
+    __ bind(is_null);   // same as 'done'
+  }
+  __ bind(done);
+}
+
+
+//----------------------------------------------------------------------------------------------------
+// Breakpoints
+void TemplateTable::_breakpoint() {
+  // Template for a breakpoint: fetch the original bytecode, post the breakpoint event, then dispatch the original bytecode.
+  // Note: We get here even if we are single stepping..
+  // jbug insists on setting breakpoints at every bytecode
+  // even if we are in single step mode.
+
+  transition(vtos, vtos);
+
+  // get the unpatched byte code
+  __ mov(R1, Rmethod);
+  __ mov(R2, Rbcp);
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::get_original_bytecode_at), R1, R2);
+#ifdef AARCH64
+  __ sxtw(Rtmp_save0, R0);
+#else
+  __ mov(Rtmp_save0, R0);
+#endif // AARCH64
+
+  // post the breakpoint event (the original bytecode is kept in Rtmp_save0 across this call)
+  __ mov(R1, Rmethod);
+  __ mov(R2, Rbcp);
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::_breakpoint), R1, R2);
+
+  // complete the execution of original bytecode
+  __ mov(R3_bytecode, Rtmp_save0);
+  __ dispatch_only_normal(vtos);
+}
+
+
+//----------------------------------------------------------------------------------------------------
+// Exceptions
+
+void TemplateTable::athrow() {
+  transition(atos, vtos);
+  __ mov(Rexception_obj, R0_tos);               // exception oop: top of stack -> Rexception_obj
+  __ null_check(Rexception_obj, Rtemp);         // the reference being thrown must not be null
+  __ b(Interpreter::throw_exception_entry());   // continue at the interpreter's throw-exception entry
+}
+
+
+//----------------------------------------------------------------------------------------------------
+// Synchronization
+//
+// Note: monitorenter & exit are symmetric routines; which is reflected
+//       in the assembly code structure as well
+//
+// Stack layout:
+//
+// [expressions  ] <--- Rstack_top        = expression stack top
+// ..
+// [expressions  ]
+// [monitor entry] <--- monitor block top = expression stack bot
+// ..
+// [monitor entry]
+// [frame data   ] <--- monitor block bot
+// ...
+// [saved FP     ] <--- FP
+
+
+void TemplateTable::monitorenter() {
+  transition(atos, vtos);
+
+  const Register Robj = R0_tos;
+  const Register Rentry = R1_tmp;
+
+  // check for NULL object
+  __ null_check(Robj, Rtemp);
+
+  const int entry_size = (frame::interpreter_frame_monitor_size() * wordSize);
+  assert (entry_size % StackAlignmentInBytes == 0, "keep stack alignment");
+  Label allocate_monitor, allocated;
+
+  // initialize entry pointer
+  __ mov(Rentry, 0);                             // points to free slot or NULL
+
+  // find a free slot in the monitor block (result in Rentry)
+  { Label loop, exit;
+    const Register Rcur = R2_tmp;
+    const Register Rcur_obj = Rtemp;
+    const Register Rbottom = R3_tmp;
+    assert_different_registers(Robj, Rentry, Rcur, Rbottom, Rcur_obj);
+
+    __ ldr(Rcur, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+                                 // points to current entry, starting with top-most entry
+    __ sub(Rbottom, FP, -frame::interpreter_frame_monitor_block_bottom_offset * wordSize);
+                                 // points to word before bottom of monitor block
+
+    __ cmp(Rcur, Rbottom);                       // check if there are no monitors
+#ifndef AARCH64
+    __ ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()), ne);
+                                                 // prefetch monitor's object for the first iteration
+#endif // !AARCH64
+    __ b(allocate_monitor, eq);                  // there are no monitors, skip searching
+
+    __ bind(loop);
+#ifdef AARCH64
+    __ ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()));
+#endif // AARCH64
+    __ cmp(Rcur_obj, 0);                         // check if current entry is used
+    __ mov(Rentry, Rcur, eq);                    // if not used then remember entry
+
+    __ cmp(Rcur_obj, Robj);                      // check if current entry is for same object
+    __ b(exit, eq);                              // if same object then stop searching
+
+    __ add(Rcur, Rcur, entry_size);              // otherwise advance to next entry
+
+    __ cmp(Rcur, Rbottom);                       // check if bottom reached
+#ifndef AARCH64
+    __ ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()), ne);
+                                                 // prefetch monitor's object for the next iteration
+#endif // !AARCH64
+    __ b(loop, ne);                              // if not at bottom then check this entry
+    __ bind(exit);
+  }
+
+  __ cbnz(Rentry, allocated);                    // check if a slot has been found; if found, continue with that one
+
+  __ bind(allocate_monitor);
+
+  // allocate one if there's no free slot
+  { Label loop;
+    assert_different_registers(Robj, Rentry, R2_tmp, Rtemp);
+
+    // 1. compute new pointers
+
+#ifdef AARCH64
+    __ check_extended_sp(Rtemp);
+    __ sub(SP, SP, entry_size);                  // adjust extended SP
+    __ mov(Rtemp, SP);
+    __ str(Rtemp, Address(FP, frame::interpreter_frame_extended_sp_offset * wordSize));
+#endif // AARCH64
+
+    __ ldr(Rentry, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+                                                 // old monitor block top / expression stack bottom
+
+    __ sub(Rstack_top, Rstack_top, entry_size);  // move expression stack top
+    __ check_stack_top_on_expansion();
+
+    __ sub(Rentry, Rentry, entry_size);          // move expression stack bottom
+
+    __ mov(R2_tmp, Rstack_top);                  // set start value for copy loop
+
+    __ str(Rentry, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+                                                 // set new monitor block top
+
+    // 2. move expression stack contents
+
+    __ cmp(R2_tmp, Rentry);                                 // check if expression stack is empty
+#ifndef AARCH64
+    __ ldr(Rtemp, Address(R2_tmp, entry_size), ne);         // load expression stack word from old location
+#endif // !AARCH64
+    __ b(allocated, eq);
+
+    __ bind(loop);
+#ifdef AARCH64
+    __ ldr(Rtemp, Address(R2_tmp, entry_size));             // load expression stack word from old location
+#endif // AARCH64
+    __ str(Rtemp, Address(R2_tmp, wordSize, post_indexed)); // store expression stack word at new location
+                                                            // and advance to next word
+    __ cmp(R2_tmp, Rentry);                                 // check if bottom reached
+#ifndef AARCH64
+    __ ldr(Rtemp, Address(R2, entry_size), ne);             // load expression stack word from old location
+#endif // !AARCH64
+    __ b(loop, ne);                                         // if not at bottom then copy next word
+  }
+
+  // call run-time routine
+
+  // Rentry: points to monitor entry
+  __ bind(allocated);
+
+  // Increment bcp to point to the next bytecode, so exception handling for async. exceptions works correctly.
+  // The object has already been popped from the stack, so the expression stack looks correct.
+  __ add(Rbcp, Rbcp, 1);
+
+  __ str(Robj, Address(Rentry, BasicObjectLock::obj_offset_in_bytes()));     // store object
+  __ lock_object(Rentry);
+
+  // check to make sure this monitor doesn't cause stack overflow after locking
+  __ save_bcp();  // in case of exception
+  __ arm_stack_overflow_check(0, Rtemp);
+
+  // The bcp has already been incremented. Just need to dispatch to next instruction.
+  __ dispatch_next(vtos);
+}
+
+
+void TemplateTable::monitorexit() {
+  transition(atos, vtos);                        // object reference arrives in tos (R0_tos); tos is empty afterwards
+
+  const Register Robj = R0_tos;                  // object whose monitor entry is searched for
+  const Register Rcur = R1_tmp;                  // monitor entry currently being examined
+  const Register Rbottom = R2_tmp;               // end of the monitor block (loop limit)
+  const Register Rcur_obj = Rtemp;               // object recorded in the entry at Rcur
+
+  // check for NULL object
+  __ null_check(Robj, Rtemp);
+
+  const int entry_size = (frame::interpreter_frame_monitor_size() * wordSize);
+  Label found, throw_exception;
+
+  // find matching slot: walk the monitor block from its top towards Rbottom
+  { Label loop;
+    assert_different_registers(Robj, Rcur, Rbottom, Rcur_obj);
+
+    __ ldr(Rcur, Address(FP, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+                                 // points to current entry, starting with top-most entry
+    __ sub(Rbottom, FP, -frame::interpreter_frame_monitor_block_bottom_offset * wordSize);
+                                 // points to word before bottom of monitor block
+
+    __ cmp(Rcur, Rbottom);                       // check if bottom reached
+#ifndef AARCH64
+    __ ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()), ne);
+                                                 // prefetch monitor's object for the first iteration
+#endif // !AARCH64
+    __ b(throw_exception, eq);                   // throw exception if there are no monitors
+
+    __ bind(loop);
+#ifdef AARCH64
+    __ ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()));
+#endif // AARCH64
+    // check if current entry is for same object
+    __ cmp(Rcur_obj, Robj);
+    __ b(found, eq);                             // if same object then stop searching
+    __ add(Rcur, Rcur, entry_size);              // otherwise advance to next entry
+    __ cmp(Rcur, Rbottom);                       // check if bottom reached
+#ifndef AARCH64
+    __ ldr(Rcur_obj, Address(Rcur, BasicObjectLock::obj_offset_in_bytes()), ne); // prefetch for the next iteration
+#endif // !AARCH64
+    __ b (loop, ne);                             // if not at bottom then check this entry
+  }
+
+  // error handling (reached by fall-through when no entry matches). Unlocking was not block-structured
+  __ bind(throw_exception);
+  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_illegal_monitor_state_exception));
+  __ should_not_reach_here();
+
+  // unlock the matching entry
+  // Rcur: points to monitor entry
+  __ bind(found);
+  __ push_ptr(Robj);                             // make sure object is on stack (contract with oopMaps)
+  __ unlock_object(Rcur);
+  __ pop_ptr(Robj);                              // discard object
+}
+
+
+//----------------------------------------------------------------------------------------------------
+// Wide instructions
+
+void TemplateTable::wide() {
+  transition(vtos, vtos);
+  __ ldrb(R3_bytecode, at_bcp(1));               // load the byte at bcp + 1 (presumably the bytecode being widened)
+
+  InlinedAddress Ltable((address)Interpreter::_wentry_point);
+  __ ldr_literal(Rtemp, Ltable);                 // Rtemp = Interpreter::_wentry_point, read from the literal bound below
+  __ indirect_jump(Address::indexed_ptr(Rtemp, R3_bytecode), Rtemp); // jump via the table slot selected by R3_bytecode
+
+  __ nop(); // to avoid filling CPU pipeline with invalid instructions
+  __ nop();
+  __ bind_literal(Ltable);                       // the inlined literal is emitted here, after the jump
+}
+
+
+//----------------------------------------------------------------------------------------------------
+// Multi arrays
+
+void TemplateTable::multianewarray() {
+  transition(vtos, atos);
+  __ ldrb(Rtmp_save0, at_bcp(3));   // get number of dimensions (reused after call_VM below)
+
+  // last dim is on top of stack; we want address of first one:
+  // first_addr = last_addr + ndims * stackElementSize - 1*wordsize
+  // the latter wordSize to point to the beginning of the array.
+  __ add(Rtemp, Rstack_top, AsmOperand(Rtmp_save0, lsl, Interpreter::logStackElementSize));
+  __ sub(R1, Rtemp, wordSize);      // R1 = first_addr, passed to the runtime call
+
+  call_VM(R0, CAST_FROM_FN_PTR(address, InterpreterRuntime::multianewarray), R1);
+  __ add(Rstack_top, Rstack_top, AsmOperand(Rtmp_save0, lsl, Interpreter::logStackElementSize)); // drop the ndims dimension slots
+  // No explicit MacroAssembler::StoreStore is emitted here: useless (included in the runtime exit path)
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/templateTable_arm.hpp	2016-12-02 11:24:07.037315761 -0500
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_TEMPLATETABLE_ARM_HPP
+#define CPU_ARM_VM_TEMPLATETABLE_ARM_HPP
+
+  static void prepare_invoke(int byte_no,
+                             Register method,         // linked method (or i-klass)
+                             Register index = noreg,  // itable index, MethodType, etc.
+                             Register recv  = noreg,  // if caller wants to see it
+                             Register flags = noreg   // if caller wants to test it
+                             );
+
+  static void invokevirtual_helper(Register index, Register recv,
+                                   Register flags);
+
+  static void volatile_barrier(MacroAssembler::Membar_mask_bits order_constraint,
+                               Register tmp,
+                               bool preserve_flags = false,
+                               Register load_tgt = noreg);
+
+  // Helpers. NOTE(review): these are bare static member declarations with no class header in this file, so it is presumably included inside the TemplateTable class body -- confirm
+  static void index_check(Register array, Register index);
+  static void index_check_without_pop(Register array, Register index);
+
+  static void get_local_base_addr(Register r, Register index);
+
+  static Address load_iaddress(Register index, Register scratch);
+  static Address load_aaddress(Register index, Register scratch);
+  static Address load_faddress(Register index, Register scratch);
+  static Address load_daddress(Register index, Register scratch);
+
+  static void load_category2_local(Register Rlocal_index, Register tmp);
+  static void store_category2_local(Register Rlocal_index, Register tmp);
+
+  static Address get_array_elem_addr(BasicType elemType, Register array, Register index, Register temp);
+
+  static void jvmti_post_fast_field_mod(TosState state);
+
+#endif // CPU_ARM_VM_TEMPLATETABLE_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/vmStructs_arm.hpp	2016-12-02 11:24:13.149662380 -0500
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_VMSTRUCTS_ARM_HPP
+#define CPU_ARM_VM_VMSTRUCTS_ARM_HPP
+
+// These are the CPU-specific fields, types and integer
+// constants required by the Serviceability Agent. This file is
+// referenced by vmStructs.cpp.
+
+#define VM_STRUCTS_CPU(nonstatic_field, static_field, unchecked_nonstatic_field, volatile_nonstatic_field, nonproduct_nonstatic_field, c2_nonstatic_field, unchecked_c1_static_field, unchecked_c2_static_field) \
+                                                                                                                                     \
+  /******************************/                                                                                                   \
+  /* JavaCallWrapper (none)     */                                                                                                   \
+  /******************************/                                                                                                   \
+  /******************************/                                                                                                   \
+  /* JavaFrameAnchor            */                                                                                                   \
+  /******************************/                                                                                                   \
+  volatile_nonstatic_field(JavaFrameAnchor,     _last_Java_fp,                                    intptr_t*)
+
+#define VM_TYPES_CPU(declare_type, declare_toplevel_type, declare_oop_type, declare_integer_type, declare_unsigned_integer_type, declare_c1_toplevel_type, declare_c2_type, declare_c2_toplevel_type)
+
+#define VM_INT_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant)
+
+#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant)
+
+#endif // CPU_ARM_VM_VMSTRUCTS_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/vm_version_arm.hpp	2016-12-02 11:24:18.641973838 -0500
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/* ARM sources Merged up to hotspot/src/closed  changeset 2389:b6273c37efea */
+/*                          hotspot/make/closed changeset 542:fff0e7e51b36 */
+
+#ifndef CPU_ARM_VM_VM_VERSION_ARM_HPP
+#define CPU_ARM_VM_VM_VERSION_ARM_HPP
+
+#include "runtime/globals_extension.hpp"
+#include "runtime/vm_version.hpp"
+
+class VM_Version: public Abstract_VM_Version {
+  friend class JVMCIVMStructs;
+
+  static bool _has_simd;   // returned by has_simd() in the AARCH64 variant; the 32-bit variant tests _features instead
+
+ protected:
+  // Are we done with vm version initialization (set to true as the last step of initialize())
+  static bool _is_initialized;
+
+ public:
+  static void initialize();
+  static bool is_initialized()      { return _is_initialized; }
+
+#ifdef AARCH64 // 64-bit variant: every query below is a constant except has_simd()/has_vfp()
+
+ public:
+  static bool supports_ldrex()         { return true; }
+  static bool supports_ldrexd()        { return true; }
+  static bool supports_movw()          { return true; }
+
+  // Override Abstract_VM_Version implementation
+  static bool use_biased_locking();
+
+  static bool has_simd()               { return _has_simd; }
+  static bool has_vfp()                { return has_simd(); }
+  static bool simd_math_is_compliant() { return true; }
+
+  static bool prefer_moves_over_load_literal() { return true; }
+
+#else // 32-bit ARM variant: queries depend on _arm_arch, _features and _kuser_helper_version
+
+ protected:
+  enum Feature_Flag {        // bit positions used to build the Feature_Flag_Set masks
+    vfp = 0,
+    vfp3_32 = 1,
+    simd = 2,
+  };
+
+  enum Feature_Flag_Set {    // masks tested against _features by has_vfp()/has_vfp3_32()/has_simd()
+    unknown_m           = 0,
+    all_features_m      = -1,
+
+    vfp_m     = 1 << vfp,
+    vfp3_32_m = 1 << vfp3_32,
+    simd_m    = 1 << simd,
+  };
+
+  // The value stored by "STR PC, [addr]" instruction can be either
+  // (address of this instruction + 8) or (address of this instruction + 12)
+  // depending on hardware implementation.
+  // This adjustment is calculated in runtime.
+  static int _stored_pc_adjustment;
+
+  // ARM architecture version: 5 = ARMv5, 6 = ARMv6, 7 = ARMv7 etc.
+  static int _arm_arch;
+
+  // linux kernel atomic helper function version info
+  // __kuser_cmpxchg() if version >= 2
+  // __kuser_cmpxchg64() if version >= 5
+  static int _kuser_helper_version;
+
+#define KUSER_HELPER_VERSION_ADDR 0xffff0ffc
+#define KUSER_VERSION_CMPXCHG32 2
+#define KUSER_VERSION_CMPXCHG64 5
+
+  // Read additional info using OS-specific interfaces
+  static void get_os_cpu_info();
+
+ public:
+  static void early_initialize();
+
+  static int arm_arch()             { return _arm_arch; }
+  static int stored_pc_adjustment() { return _stored_pc_adjustment; }
+  static bool supports_rev()        { return _arm_arch >= 6; }
+  static bool supports_ldrex()      { return _arm_arch >= 6; }
+  static bool supports_movw()       { return _arm_arch >= 7; }
+  static bool supports_ldrexd()     { return _arm_arch >= 7; }
+  static bool supports_compare_and_exchange() { return true; }
+  static bool supports_kuser_cmpxchg32() { return _kuser_helper_version >= KUSER_VERSION_CMPXCHG32; }
+  static bool supports_kuser_cmpxchg64() { return _kuser_helper_version >= KUSER_VERSION_CMPXCHG64; }
+  // Override Abstract_VM_Version implementation
+  static bool use_biased_locking();
+  static const char* vm_info_string();
+
+  static bool has_vfp()             { return (_features & vfp_m) != 0; }
+  static bool has_vfp3_32()         { return (_features & vfp3_32_m) != 0; }
+  static bool has_simd()            { return (_features & simd_m) != 0; }
+
+  static bool simd_math_is_compliant() { return false; }
+
+  static bool prefer_moves_over_load_literal() { return supports_movw(); }
+
+  friend class VM_Version_StubGenerator;
+
+#endif // AARCH64
+};
+
+#endif // CPU_ARM_VM_VM_VERSION_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/vm_version_arm_32.cpp	2016-12-02 11:24:23.802266468 -0500
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/java.hpp"
+#include "runtime/os.inline.hpp"
+#include "runtime/stubCodeGenerator.hpp"
+#include "vm_version_arm.hpp"
+
+int  VM_Version::_stored_pc_adjustment = 4;      // initial value; overwritten in initialize() with the get_cpu_info stub result
+int  VM_Version::_arm_arch             = 5;      // initial value; NOTE(review): presumably updated by get_os_cpu_info() -- confirm
+bool VM_Version::_is_initialized       = false;  // set to true at the end of initialize()
+int VM_Version::_kuser_helper_version  = 0;      // read from KUSER_HELPER_VERSION_ADDR in initialize()
+
+extern "C" {
+  typedef int (*get_cpu_info_t)();               // signature used to call the get_cpu_info stub
+  typedef bool (*check_vfp_t)(double *d);        // signature used to call the check_vfp and check_vfp3_32 stubs
+  typedef bool (*check_simd_t)();                // signature used to call the check_simd stub
+}
+
+#define __ _masm->
+
+class VM_Version_StubGenerator: public StubCodeGenerator {
+ public:
+
+  VM_Version_StubGenerator(CodeBuffer *c) : StubCodeGenerator(c) {}
+
+  address generate_get_cpu_info() {              // stub returns (PC value as pushed) - (PC value as read by mov)
+    StubCodeMark mark(this, "VM_Version", "get_cpu_info");
+    address start = __ pc();
+
+    __ mov(R0, PC);                              // R0 = PC as read by a register move
+    __ push(PC);                                 // store PC to the stack
+    __ pop(R1);                                  // R1 = the PC value that was stored
+    __ sub(R0, R1, R0);                          // R0 = stored value - read value
+    // return the result in R0
+    __ bx(LR);
+
+    return start;
+  };
+
+  address generate_check_vfp() {                 // stub stores D0 through R0 and returns 1
+    StubCodeMark mark(this, "VM_Version", "check_vfp");
+    address start = __ pc();
+
+    __ fstd(D0, Address(R0));                    // probe instruction; NOTE(review): presumably faults without VFP -- confirm handler
+    __ mov(R0, 1);
+    __ bx(LR);
+
+    return start;
+  };
+
+  address generate_check_vfp3_32() {             // same shape as check_vfp, but stores D16
+    StubCodeMark mark(this, "VM_Version", "check_vfp3_32");
+    address start = __ pc();
+
+    __ fstd(D16, Address(R0));                   // probe instruction using D16
+    __ mov(R0, 1);
+    __ bx(LR);
+
+    return start;
+  };
+
+  address generate_check_simd() {                // stub executes vcnt and returns 1
+    StubCodeMark mark(this, "VM_Version", "check_simd");
+    address start = __ pc();
+
+    __ vcnt(Stemp, Stemp);                       // probe instruction
+    __ mov(R0, 1);
+    __ bx(LR);
+
+    return start;
+  };
+};
+
+#undef __
+
+
+extern "C" address check_vfp3_32_fault_instr;
+extern "C" address check_vfp_fault_instr;
+extern "C" address check_simd_fault_instr;
+
+void VM_Version::initialize() {
+  ResourceMark rm;
+
+  // Making this stub must be FIRST use of assembler
+  const int stub_size = 128;
+  BufferBlob* stub_blob = BufferBlob::create("get_cpu_info", stub_size);
+  if (stub_blob == NULL) {
+    vm_exit_during_initialization("Unable to allocate get_cpu_info stub");
+  }
+
+  CodeBuffer c(stub_blob);
+  VM_Version_StubGenerator g(&c);
+  address get_cpu_info_pc = g.generate_get_cpu_info();
+  get_cpu_info_t get_cpu_info = CAST_TO_FN_PTR(get_cpu_info_t, get_cpu_info_pc);
+
+  int pc_adjustment = get_cpu_info();            // run the stub
+
+  VM_Version::_stored_pc_adjustment = pc_adjustment;
+
+#ifndef __SOFTFP__
+  address check_vfp_pc = g.generate_check_vfp();
+  check_vfp_t check_vfp = CAST_TO_FN_PTR(check_vfp_t, check_vfp_pc);
+
+  check_vfp_fault_instr = (address)check_vfp;    // publish the stub address; NOTE(review): presumably read by a fault handler -- confirm
+  double dummy;                                  // only serves as the store target for the probe
+  if (check_vfp(&dummy)) {
+    _features |= vfp_m;
+  }
+
+#ifdef COMPILER2
+  if (has_vfp()) {
+    address check_vfp3_32_pc = g.generate_check_vfp3_32();
+    check_vfp_t check_vfp3_32 = CAST_TO_FN_PTR(check_vfp_t, check_vfp3_32_pc);
+    check_vfp3_32_fault_instr = (address)check_vfp3_32;
+    double dummy;                                // shadows the outer 'dummy'; again only a store target
+    if (check_vfp3_32(&dummy)) {
+      _features |= vfp3_32_m;
+    }
+
+    address check_simd_pc =g.generate_check_simd();
+    check_simd_t check_simd = CAST_TO_FN_PTR(check_simd_t, check_simd_pc);
+    check_simd_fault_instr = (address)check_simd;
+    if (check_simd()) {
+      _features |= simd_m;
+    }
+  }
+#endif
+#endif
+
+  // The flags below are all forced to false in this 32-bit variant, with a warning.
+  if (UseAESIntrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+    warning("AES intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+  }
+
+  if (UseAES && !FLAG_IS_DEFAULT(UseAES)) {
+    warning("AES instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAES, false);
+  }
+
+  if (UseAESCTRIntrinsics) {
+    warning("AES/CTR intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
+  if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
+  if (UseSHA) {
+    warning("SHA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseSHA, false);
+  }
+
+  if (UseSHA1Intrinsics) {
+    warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
+  }
+
+  if (UseSHA256Intrinsics) {
+    warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+  }
+
+  if (UseSHA512Intrinsics) {
+    warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
+  }
+
+  if (UseCRC32Intrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32Intrinsics))
+      warning("CRC32 intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
+  }
+
+  if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
+      warning("CRC32C intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
+  if (UseAdler32Intrinsics) {
+    warning("Adler32 intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
+  if (UseVectorizedMismatchIntrinsic) {
+    warning("vectorizedMismatch intrinsic is not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
+  get_os_cpu_info();
+
+  _kuser_helper_version = *(int*)KUSER_HELPER_VERSION_ADDR;   // read the helper version word from its fixed address
+
+#ifdef COMPILER2
+  // C2 is only supported on v7+ VFP at this time
+  if (_arm_arch < 7 || !has_vfp()) {
+    vm_exit_during_initialization("Server VM is only supported on ARMv7+ VFP");
+  }
+#endif
+
+  // armv7 has the ldrexd instruction that can be used to implement cx8
+  // armv5 with linux >= 3.1 can use kernel helper routine
+  _supports_cx8 = (supports_ldrexd() || supports_kuser_cmpxchg64());
+  // ARM doesn't have special instructions for these but ldrex/ldrexd
+  // enable shorter instruction sequences than the ones based on cas.
+  _supports_atomic_getset4 = supports_ldrex();
+  _supports_atomic_getadd4 = supports_ldrex();
+  _supports_atomic_getset8 = supports_ldrexd();
+  _supports_atomic_getadd8 = supports_ldrexd();
+
+#ifdef COMPILER2
+  assert(_supports_cx8 && _supports_atomic_getset4 && _supports_atomic_getadd4
+         && _supports_atomic_getset8 && _supports_atomic_getadd8, "C2: atomic operations must be supported");
+#endif
+  char buf[512];
+  jio_snprintf(buf, sizeof(buf), "(ARMv%d)%s%s%s",
+               _arm_arch,
+               (has_vfp() ? ", vfp" : ""),
+               (has_vfp3_32() ? ", vfp3-32" : ""),
+               (has_simd() ? ", simd" : ""));
+
+  // buf always starts with "(ARMv<n>)", followed by one ", <feature>" item per detected feature
+  _features_string = os::strdup(buf);
+
+  if (has_simd()) {
+    if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
+      FLAG_SET_DEFAULT(UsePopCountInstruction, true);
+    }
+  }
+
+  AllocatePrefetchDistance = 128;
+
+#ifdef COMPILER2
+  FLAG_SET_DEFAULT(UseFPUForSpilling, true);
+
+  if (FLAG_IS_DEFAULT(MaxVectorSize)) {
+    // FLAG_SET_DEFAULT(MaxVectorSize, has_simd() ? 16 : 8);
+    // SIMD/NEON can use 16, but default is 8 because currently
+    // larger than 8 will disable instruction scheduling
+    FLAG_SET_DEFAULT(MaxVectorSize, 8);
+  }
+
+  if (MaxVectorSize > 16) {                      // values above 16 are reset to 8
+    FLAG_SET_DEFAULT(MaxVectorSize, 8);
+  }
+#endif
+
+  if (FLAG_IS_DEFAULT(Tier4CompileThreshold)) {
+    Tier4CompileThreshold = 10000;
+  }
+  if (FLAG_IS_DEFAULT(Tier3InvocationThreshold)) {
+    Tier3InvocationThreshold = 1000;
+  }
+  if (FLAG_IS_DEFAULT(Tier3CompileThreshold)) {
+    Tier3CompileThreshold = 5000;
+  }
+  if (FLAG_IS_DEFAULT(Tier3MinInvocationThreshold)) {
+    Tier3MinInvocationThreshold = 500;
+  }
+
+  FLAG_SET_DEFAULT(TypeProfileLevel, 0); // unsupported
+
+  // This machine does not allow unaligned memory accesses
+  if (UseUnalignedAccesses) {
+    if (!FLAG_IS_DEFAULT(UseUnalignedAccesses))
+      warning("Unaligned memory access is not available on this CPU");
+    FLAG_SET_DEFAULT(UseUnalignedAccesses, false);
+  }
+
+  _is_initialized = true;
+}
+
+bool VM_Version::use_biased_locking() {
+  // Read the OS-specific CPU information before deciding.
+  get_os_cpu_info();
+  // Biased Locking only pays off where CAS is comparatively expensive. On a
+  // uniprocessor ARMv6 or newer, CAS is cheap and testing showed no gain from
+  // the slightly longer Biased Locking execution path, so it is switched off
+  // there. ARMv5 and all MP systems behave differently and keep it enabled.
+  const bool uniprocessor = !os::is_MP();
+  const bool cheap_cas    = uniprocessor && (arm_arch() > 5);
+  // Biased Locking is enabled on ARMv5 and ARM MP only.
+  return !cheap_cas;
+}
+
+#define EXP
+
+// Temporary override for experimental features
+// Copied from Abstract_VM_Version
+const char* VM_Version::vm_info_string() {
+  // EXP is appended to every literal below by string-literal concatenation.
+  const bool sharing = UseSharedSpaces;
+  switch (Arguments::mode()) {
+    case Arguments::_int:   return sharing ? "interpreted mode, sharing" EXP : "interpreted mode" EXP;
+    case Arguments::_mixed: return sharing ? "mixed mode, sharing" EXP : "mixed mode" EXP;
+    case Arguments::_comp:  return sharing ? "compiled mode, sharing" EXP : "compiled mode" EXP;
+  }
+  // Only reached for a mode value that is not handled above.
+  ShouldNotReachHere();
+  return "";
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/vm_version_arm_64.cpp	2016-12-02 11:24:29.030562954 -0500
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/java.hpp"
+#include "runtime/os.inline.hpp"
+#include "runtime/stubCodeGenerator.hpp"
+#include "vm_version_arm.hpp"
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+#ifndef HWCAP_AES  // fallback for <asm/hwcap.h> versions that do not provide it
+#define HWCAP_AES (1 << 3)  // parenthesized so the macro expands as one operand in any expression
+#endif
+
+bool VM_Version::_is_initialized = false;   // set to true at the end of initialize()
+bool VM_Version::_has_simd = false;         // assigned from the check_simd stub result in initialize() (COMPILER2 builds only)
+
+extern "C" {
+  typedef bool (*check_simd_t)();           // signature used to call the check_simd stub
+}
+
+
+#ifdef COMPILER2
+
+#define __ _masm->
+
+class VM_Version_StubGenerator: public StubCodeGenerator {
+ public:
+
+  VM_Version_StubGenerator(CodeBuffer *c) : StubCodeGenerator(c) {}
+
+  address generate_check_simd() {           // stub executes vcnt and returns 1; returns the stub's start address
+    StubCodeMark mark(this, "VM_Version", "check_simd");
+    address start = __ pc();
+
+    __ vcnt(Stemp, Stemp);                  // probe instruction
+    __ mov(R0, 1);
+    __ ret(LR);
+
+    return start;
+  };
+};
+
+#undef __
+
+#endif
+
+
+
+extern "C" address check_simd_fault_instr;
+
+
+void VM_Version::initialize() {
+  ResourceMark rm;
+
+  // Making this stub must be FIRST use of assembler (in this variant the blob is only filled by check_simd below, under COMPILER2)
+  const int stub_size = 128;
+  BufferBlob* stub_blob = BufferBlob::create("get_cpu_info", stub_size);
+  if (stub_blob == NULL) {
+    vm_exit_during_initialization("Unable to allocate get_cpu_info stub");
+  }
+
+  if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
+  if (UseSHA) {
+    warning("SHA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseSHA, false);
+  }
+
+  if (UseSHA1Intrinsics) {
+    warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
+  }
+
+  if (UseSHA256Intrinsics) {
+    warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+  }
+
+  if (UseSHA512Intrinsics) {
+    warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU.");
+    FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
+  }
+
+  if (UseCRC32Intrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32Intrinsics))
+      warning("CRC32 intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
+  }
+
+  if (UseCRC32CIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseCRC32CIntrinsics))
+      warning("CRC32C intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
+  }
+
+  if (UseAdler32Intrinsics) {
+    warning("Adler32 intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
+  }
+
+  if (UseVectorizedMismatchIntrinsic) {
+    warning("vectorizedMismatch intrinsic is not available on this CPU.");
+    FLAG_SET_DEFAULT(UseVectorizedMismatchIntrinsic, false);
+  }
+
+  CodeBuffer c(stub_blob);
+
+#ifdef COMPILER2
+  VM_Version_StubGenerator g(&c);
+
+  address check_simd_pc = g.generate_check_simd();
+  if (check_simd_pc != NULL) {
+    check_simd_t check_simd = CAST_TO_FN_PTR(check_simd_t, check_simd_pc);
+    check_simd_fault_instr = (address)check_simd;   // publish the stub address; NOTE(review): presumably read by a fault handler -- confirm
+    _has_simd = check_simd();
+  } else {
+    assert(! _has_simd, "default _has_simd value must be 'false'");
+  }
+#endif
+
+  unsigned long auxv = getauxval(AT_HWCAP);         // AT_HWCAP entry of the auxiliary vector
+
+  char buf[512];
+  jio_snprintf(buf, sizeof(buf), "AArch64%s",
+               ((auxv & HWCAP_AES) ? ", aes" : ""));
+
+  _features_string = os::strdup(buf);
+
+#ifdef COMPILER2
+  if (auxv & HWCAP_AES) {
+    if (FLAG_IS_DEFAULT(UseAES)) {
+      FLAG_SET_DEFAULT(UseAES, true);
+    }
+    if (!UseAES) {
+      if (UseAESIntrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+        warning("AES intrinsics require UseAES flag to be enabled. Intrinsics will be disabled.");
+      }
+      FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+    } else {
+      if (FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+        FLAG_SET_DEFAULT(UseAESIntrinsics, true);
+      }
+    }
+  } else   // with COMPILER2 this 'else' takes the 'if' after #endif; without COMPILER2 that 'if' stands alone
+#endif
+  if (UseAES || UseAESIntrinsics) {
+    if (UseAES && !FLAG_IS_DEFAULT(UseAES)) {
+      warning("AES instructions are not available on this CPU");
+      FLAG_SET_DEFAULT(UseAES, false);
+    }
+    if (UseAESIntrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) {
+      warning("AES intrinsics are not available on this CPU");
+      FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+    }
+  }
+
+  if (UseAESCTRIntrinsics) {
+    warning("AES/CTR intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
+  _supports_cx8 = true;
+  _supports_atomic_getset4 = true;
+  _supports_atomic_getadd4 = true;
+  _supports_atomic_getset8 = true;
+  _supports_atomic_getadd8 = true;
+
+  // TODO-AARCH64 revise C2 flags
+
+  if (has_simd()) {
+    if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
+      FLAG_SET_DEFAULT(UsePopCountInstruction, true);
+    }
+  }
+
+  AllocatePrefetchDistance = 128;
+
+#ifdef COMPILER2
+  FLAG_SET_DEFAULT(UseFPUForSpilling, true);
+
+  if (FLAG_IS_DEFAULT(MaxVectorSize)) {
+    // FLAG_SET_DEFAULT(MaxVectorSize, has_simd() ? 16 : 8);
+    // SIMD/NEON can use 16, but default is 8 because currently
+    // larger than 8 will disable instruction scheduling
+    FLAG_SET_DEFAULT(MaxVectorSize, 8);
+  }
+
+  if (MaxVectorSize > 16) {                         // values above 16 are reset to 8
+    FLAG_SET_DEFAULT(MaxVectorSize, 8);
+  }
+#endif
+
+  if (FLAG_IS_DEFAULT(Tier4CompileThreshold)) {
+    Tier4CompileThreshold = 10000;
+  }
+  if (FLAG_IS_DEFAULT(Tier3InvocationThreshold)) {
+    Tier3InvocationThreshold = 1000;
+  }
+  if (FLAG_IS_DEFAULT(Tier3CompileThreshold)) {
+    Tier3CompileThreshold = 5000;
+  }
+  if (FLAG_IS_DEFAULT(Tier3MinInvocationThreshold)) {
+    Tier3MinInvocationThreshold = 500;
+  }
+
+  FLAG_SET_DEFAULT(TypeProfileLevel, 0); // unsupported
+
+  // This machine does not allow unaligned memory accesses
+  if (UseUnalignedAccesses) {
+    if (!FLAG_IS_DEFAULT(UseUnalignedAccesses))
+      warning("Unaligned memory access is not available on this CPU");
+    FLAG_SET_DEFAULT(UseUnalignedAccesses, false);
+  }
+
+  _is_initialized = true;
+}
+
+bool VM_Version::use_biased_locking() {
+  // TODO-AARCH64 measure performance and revise
+
+  // Rationale as originally written for 32-bit ARM: on a uniprocessor
+  // ARMv6 or later the cost of CAS is low compared to the overhead of the
+  // slightly longer Biased Locking execution path, and testing showed no
+  // improvement with Biased Locking enabled on such systems. This AArch64
+  // variant does not consult the architecture version at all.
+  //
+  // Therefore Biased Locking is enabled here on MP systems only.
+  //
+  return os::is_MP();
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/vmreg_arm.cpp	2016-12-02 11:24:34.594878493 -0500
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "code/vmreg.hpp"
+
+void VMRegImpl::set_regName() {
+  int slot = 0;  // next regName[] index to fill: GPR names first, then FPR names
+  Register gpr = ::as_Register(0);
+  while (slot < ConcreteRegisterImpl::max_gpr) {
+    const int gpr_slots = 1 << ConcreteRegisterImpl::log_vmregs_per_gpr;
+    for (int k = 0; k < gpr_slots; k++) { regName[slot++] = gpr->name(); }
+    gpr = gpr->successor();
+  }
+#ifndef __SOFTFP__
+  FloatRegister fpr = ::as_FloatRegister(0);
+  while (slot < ConcreteRegisterImpl::max_fpr) {
+    const int fpr_slots = 1 << ConcreteRegisterImpl::log_vmregs_per_fpr;
+    for (int k = 0; k < fpr_slots; k++) { regName[slot++] = fpr->name(); }
+    fpr = fpr->successor();
+  }
+#endif
+
+  // Every slot that is left gets the same placeholder name.
+  while (slot < ConcreteRegisterImpl::number_of_registers) {
+    regName[slot++] = "NON-GPR-FPR";
+  }
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/vmreg_arm.hpp	2016-12-02 11:24:40.855233505 -0500
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_VMREG_ARM_HPP
+#define CPU_ARM_VM_VMREG_ARM_HPP
+
+  inline bool is_Register() {  // GPR slots come first; the unsigned compare is evidently meant to reject negative values too
+    return (unsigned int) value() < (unsigned int) ConcreteRegisterImpl::max_gpr;
+  }
+
+  inline bool is_FloatRegister() {  // FPR slots occupy the range [max_gpr, max_fpr)
+    return value() >= ConcreteRegisterImpl::max_gpr && value() < ConcreteRegisterImpl::max_fpr;
+  }
+
+  inline Register as_Register() {  // converts a concrete GPR slot index back to its Register
+    assert(is_Register(), "must be");
+    assert(is_concrete(), "concrete register expected");
+    return ::as_Register(value() >> ConcreteRegisterImpl::log_vmregs_per_gpr);  // slot index -> register number
+  }
+
+  inline FloatRegister as_FloatRegister() {  // converts a concrete FPR slot index back to its FloatRegister
+    assert(is_FloatRegister(), "must be");
+    assert(is_concrete(), "concrete register expected");
+    return ::as_FloatRegister((value() - ConcreteRegisterImpl::max_gpr) >> ConcreteRegisterImpl::log_vmregs_per_fpr);  // rebase past the GPR slots, then slot index -> register number
+  }
+
+  inline bool is_concrete() {
+    // Concrete means: the lowest slot of a GPR or of an FPR; anything else is not.
+    if (is_Register()) {
+      return (value() & right_n_bits(ConcreteRegisterImpl::log_vmregs_per_gpr)) == 0;
+    }
+    if (!is_FloatRegister()) return false;
+    const intptr_t fpr_slot = value() - ConcreteRegisterImpl::max_gpr;
+    return (fpr_slot & right_n_bits(ConcreteRegisterImpl::log_vmregs_per_fpr)) == 0;
+  }
+
+#endif // CPU_ARM_VM_VMREG_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/vmreg_arm.inline.hpp	2016-12-02 11:24:46.659562659 -0500
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2008, 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_ARM_VM_VMREG_ARM_INLINE_HPP
+#define CPU_ARM_VM_VMREG_ARM_INLINE_HPP
+
+inline VMReg RegisterImpl::as_VMReg() {  // slot index = encoding scaled by (1 << log_vmregs_per_gpr)
+  return VMRegImpl::as_VMReg(encoding() << ConcreteRegisterImpl::log_vmregs_per_gpr);
+}
+
+inline VMReg FloatRegisterImpl::as_VMReg() {  // scaled encoding, offset by max_gpr so FPR slots follow the GPR slots
+  return VMRegImpl::as_VMReg((encoding() << ConcreteRegisterImpl::log_vmregs_per_fpr) + ConcreteRegisterImpl::max_gpr);
+}
+#endif // CPU_ARM_VM_VMREG_ARM_INLINE_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/cpu/arm/vm/vtableStubs_arm.cpp	2016-12-02 11:24:51.887859141 -0500
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "assembler_arm.inline.hpp"
+#include "code/vtableStubs.hpp"
+#include "interp_masm_arm.hpp"
+#include "memory/resourceArea.hpp"
+#include "oops/instanceKlass.hpp"
+#include "oops/klassVtable.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "vmreg_arm.inline.hpp"
+#ifdef COMPILER2
+#include "opto/runtime.hpp"
+#endif
+
+// machine-dependent part of VtableStubs: create VtableStub of correct size and
+// initialize its code
+
+#define __ masm->
+
+#ifndef PRODUCT
+extern "C" void bad_compiled_vtable_index(JavaThread* thread, oop receiver, int index);
+#endif
+
+VtableStub* VtableStubs::create_vtable_stub(int vtable_index) {
+  const int code_length = VtableStub::pd_code_size_limit(true);
+  VtableStub* s = new(code_length) VtableStub(true, vtable_index);
+  // Can be NULL if there is no free space in the code cache.
+  if (s == NULL) {
+    return NULL;
+  }
+  // Assemble the stub in place, starting at the entry point of s.
+  ResourceMark rm;
+  CodeBuffer cb(s->entry_point(), code_length);
+  MacroAssembler* masm = new MacroAssembler(&cb);
+
+  assert(VtableStub::receiver_location() == R0->as_VMReg(), "receiver expected in R0");
+
+  const Register tmp = Rtemp; // Rtemp OK, should be free at call sites
+  // npe_addr: pc of the klass load from the receiver in R0 (first exception point below).
+  address npe_addr = __ pc();
+  __ load_klass(tmp, R0);
+  // Load the method stored in vtable entry vtable_index into Rmethod.
+  {
+  int entry_offset = in_bytes(Klass::vtable_start_offset()) + vtable_index * vtableEntry::size_in_bytes();
+  int method_offset = vtableEntry::method_offset_in_bytes() + entry_offset;
+  // Offset bits outside offset_mask are added to tmp; the rest form the ldr displacement.
+  assert ((method_offset & (wordSize - 1)) == 0, "offset should be aligned");
+  int offset_mask = AARCH64_ONLY(0xfff << LogBytesPerWord) NOT_AARCH64(0xfff);
+  if (method_offset & ~offset_mask) {
+    __ add(tmp, tmp, method_offset & ~offset_mask);
+  }
+  __ ldr(Rmethod, Address(tmp, method_offset & offset_mask));
+  }
+  // ame_addr: pc of the load of the method's from_compiled entry (second exception point below).
+  address ame_addr = __ pc();
+#ifdef AARCH64
+  __ ldr(tmp, Address(Rmethod, Method::from_compiled_offset()));
+  __ br(tmp);
+#else
+  __ ldr(PC, Address(Rmethod, Method::from_compiled_offset()));  // loading into PC performs the jump
+#endif // AARCH64
+
+  masm->flush();
+
+  if (PrintMiscellaneous && (WizardMode || Verbose)) {
+    tty->print_cr("vtable #%d at " PTR_FORMAT "[%d] left over: %d",
+                  vtable_index, p2i(s->entry_point()),
+                  (int)(s->code_end() - s->entry_point()),
+                  (int)(s->code_end() - __ pc()));
+  }
+  guarantee(__ pc() <= s->code_end(), "overflowed buffer");  // emitted code must fit within pd_code_size_limit
+  // FIXME ARM: need correct 'slop' - below is x86 code
+  // shut the door on sizing bugs
+  //int slop = 8;  // 32-bit offset is this much larger than a 13-bit one
+  //assert(vtable_index > 10 || __ pc() + slop <= s->code_end(), "room for 32-bit offset");
+
+  s->set_exception_points(npe_addr, ame_addr);
+  return s;
+}
+
+VtableStub* VtableStubs::create_itable_stub(int itable_index) {
+  const int code_length = VtableStub::pd_code_size_limit(false);
+  VtableStub* s = new(code_length) VtableStub(false, itable_index);
+  // Can be NULL if there is no free space in the code cache.
+  if (s == NULL) {
+    return NULL;
+  }
+  // Assemble the stub in place, starting at the entry point of s.
+  ResourceMark rm;
+  CodeBuffer cb(s->entry_point(), code_length);
+  MacroAssembler* masm = new MacroAssembler(&cb);
+
+  assert(VtableStub::receiver_location() == R0->as_VMReg(), "receiver expected in R0");
+
+  // R0-R3 / R0-R7 registers hold the arguments and must not be clobbered
+  const Register Rclass  = AARCH64_ONLY(R9)  NOT_AARCH64(R4);
+  const Register Rlength = AARCH64_ONLY(R10)  NOT_AARCH64(R5);
+  const Register Rscan   = AARCH64_ONLY(R11) NOT_AARCH64(R6);
+  const Register tmp     = Rtemp;
+
+  assert_different_registers(Ricklass, Rclass, Rlength, Rscan, tmp);
+
+  // Calculate the start of itable (itable goes after vtable)
+  const int scale = exact_log2(vtableEntry::size_in_bytes());
+  address npe_addr = __ pc();  // pc of the klass load from the receiver in R0 (first exception point below)
+  __ load_klass(Rclass, R0);
+  __ ldr_s32(Rlength, Address(Rclass, Klass::vtable_length_offset()));
+
+  __ add(Rscan, Rclass, in_bytes(Klass::vtable_start_offset()));  // Rscan = start of the vtable
+  __ add(Rscan, Rscan, AsmOperand(Rlength, lsl, scale));          // skip Rlength vtable entries
+
+  // Search through the itable for an interface equal to incoming Ricklass
+  // itable looks like [interface][offset][interface][offset][interface][offset]
+  const int entry_size = itableOffsetEntry::size() * HeapWordSize;
+  assert(itableOffsetEntry::interface_offset_in_bytes() == 0, "not added for convenience");
+
+  Label loop;
+  __ bind(loop);
+  __ ldr(tmp, Address(Rscan, entry_size, post_indexed));  // fetch the interface field, then advance Rscan by one entry
+#ifdef AARCH64
+  Label found;
+  __ cmp(tmp, Ricklass);
+  __ b(found, eq);
+  __ cbnz(tmp, loop);     // a zero interface field ends the scan; otherwise try the next entry
+#else
+  __ cmp(tmp, Ricklass);  // set ZF and CF if interface is found
+  __ cmn(tmp, 0, ne);     // check if tmp == 0 and clear CF if it is
+  __ b(loop, ne);
+#endif // AARCH64
+
+  assert(StubRoutines::throw_IncompatibleClassChangeError_entry() != NULL, "Check initialization order");
+#ifdef AARCH64
+  __ jump(StubRoutines::throw_IncompatibleClassChangeError_entry(), relocInfo::runtime_call_type, tmp);
+  __ bind(found);
+#else
+  // CF == 0 means we reached the end of itable without finding icklass
+  __ jump(StubRoutines::throw_IncompatibleClassChangeError_entry(), relocInfo::runtime_call_type, noreg, cc);
+#endif // !AARCH64
+
+  // Interface found at previous position of Rscan: load its offset field, then the method
+  __ ldr_s32(tmp, Address(Rscan, itableOffsetEntry::offset_offset_in_bytes() - entry_size));
+  {
+    const int method_offset = itableMethodEntry::size() * HeapWordSize * itable_index +
+      itableMethodEntry::method_offset_in_bytes();
+    __ add_slow(Rmethod, Rclass, method_offset);
+  }
+  __ ldr(Rmethod, Address(Rmethod, tmp));  // method is at Rclass + method_offset + loaded offset
+
+  address ame_addr = __ pc();  // pc of the load of the method's from_compiled entry (second exception point below)
+
+#ifdef AARCH64
+  __ ldr(tmp, Address(Rmethod, Method::from_compiled_offset()));
+  __ br(tmp);
+#else
+  __ ldr(PC, Address(Rmethod, Method::from_compiled_offset()));  // loading into PC performs the jump
+#endif // AARCH64
+
+  masm->flush();
+
+  if (PrintMiscellaneous && (WizardMode || Verbose)) {
+    tty->print_cr("itable #%d at " PTR_FORMAT "[%d] left over: %d",
+                  itable_index, p2i(s->entry_point()),
+                  (int)(s->code_end() - s->entry_point()),
+                  (int)(s->code_end() - __ pc()));
+  }
+  guarantee(__ pc() <= s->code_end(), "overflowed buffer");  // emitted code must fit within pd_code_size_limit
+  // FIXME ARM: need correct 'slop' - below is x86 code
+  // shut the door on sizing bugs
+  //int slop = 8;  // 32-bit offset is this much larger than a 13-bit one
+  //assert(itable_index > 10 || __ pc() + slop <= s->code_end(), "room for 32-bit offset");
+
+  s->set_exception_points(npe_addr, ame_addr);
+  return s;
+}
+
+int VtableStub::pd_code_size_limit(bool is_vtable_stub) {
+  // Size limit for a stub, computed from a per-platform instruction budget
+  // for each of the two stub kinds.
+  const int vtable_instrs = NOT_AARCH64(4) AARCH64_ONLY(5);
+  const int itable_instrs = NOT_AARCH64(20) AARCH64_ONLY(20);
+
+  int budget = is_vtable_stub ? vtable_instrs : itable_instrs;
+
+#ifdef AARCH64
+  // Compressed class pointers add the klass-decode instructions to the budget.
+  if (UseCompressedClassPointers) {
+    budget += MacroAssembler::instr_count_for_decode_klass_not_null();
+  }
+#endif // AARCH64
+
+  const int limit = budget * Assembler::InstructionSize;
+
+  return limit;
+}
+
+int VtableStub::pd_code_alignment() {
+  return 8;  // NOTE(review): presumably the stub alignment in bytes - confirm against callers
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/atomic_linux_arm.hpp	2016-12-02 11:24:57.044151545 -0500
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_ATOMIC_LINUX_ARM_HPP
+#define OS_CPU_LINUX_ARM_VM_ATOMIC_LINUX_ARM_HPP
+
+#include "runtime/os.hpp"
+#include "vm_version_arm.hpp"
+
+// Implementation of class Atomic
+
+/*
+ * Atomic long operations on 32-bit ARM
+ * ARM v7 supports LDREXD/STREXD synchronization instructions so no problem.
+ * ARM < v7 does not have explicit 64-bit atomic load/store capability.
+ * However, gcc emits LDRD/STRD instructions on v5te and LDM/STM on v5t
+ * when loading/storing 64 bits.
+ * For non-MP machines (which is all we support for ARM < v7)
+ * under current Linux distros these instructions appear atomic.
+ * See section A3.5.3 of ARM Architecture Reference Manual for ARM v7.
+ * Also, for cmpxchg64, if ARM < v7 we check for cmpxchg64 support in the
+ * Linux kernel using _kuser_helper_version. See entry-armv.S in the Linux
+ * kernel source or kernel_user_helpers.txt in Linux Doc.
+ */
+
+inline void Atomic::store    (jbyte    store_value, jbyte*    dest) { *dest = store_value; }
+inline void Atomic::store    (jshort   store_value, jshort*   dest) { *dest = store_value; }
+inline void Atomic::store    (jint     store_value, jint*     dest) { *dest = store_value; }
+inline void Atomic::store_ptr(intptr_t store_value, intptr_t* dest) { *dest = store_value; }
+inline void Atomic::store_ptr(void*    store_value, void*     dest) { *(void**)dest = store_value; }
+// The same plain assignments, overloaded for volatile destinations.
+inline void Atomic::store    (jbyte    store_value, volatile jbyte*    dest) { *dest = store_value; }
+inline void Atomic::store    (jshort   store_value, volatile jshort*   dest) { *dest = store_value; }
+inline void Atomic::store    (jint     store_value, volatile jint*     dest) { *dest = store_value; }
+inline void Atomic::store_ptr(intptr_t store_value, volatile intptr_t* dest) { *dest = store_value; }
+inline void Atomic::store_ptr(void*    store_value, volatile void*     dest) { *(void* volatile *)dest = store_value; }
+
+inline jlong Atomic::load (volatile jlong* src) {
+  assert(((intx)src & (sizeof(jlong)-1)) == 0, "Atomic load jlong mis-aligned");
+#ifndef AARCH64
+  return (*os::atomic_load_long_func)(src);  // 32-bit ARM: go through the routine installed in os
+#else
+  return *src;                               // AArch64: a plain load of the aligned jlong
+#endif
+}
+
+inline void Atomic::store (jlong value, volatile jlong* dest) {
+  assert(((intx)dest & (sizeof(jlong)-1)) == 0, "Atomic store jlong mis-aligned");
+#ifndef AARCH64
+  (*os::atomic_store_long_func)(value, dest);  // 32-bit ARM: go through the routine installed in os
+#else
+  *dest = value;                               // AArch64: a plain store of the aligned jlong
+#endif
+}
+
+inline void Atomic::store (jlong value, jlong* dest) {
+  store(value, (volatile jlong*)dest);  // reuse the volatile jlong overload
+}
+
+// As per atomic.hpp all read-modify-write operations have to provide two-way
+// barriers semantics. For AARCH64 we are using load-acquire-with-reservation and
+// store-release-with-reservation. While load-acquire combined with store-release
+// do not generally form two-way barriers, their use with reservations does - the
+// ARMv8 architecture manual Section F "Barrier Litmus Tests" indicates they
+// provide sequentially consistent semantics. All we need to add is an explicit
+// barrier in the failure path of the cmpxchg operations (as these don't execute
+// the store) - arguably this may be overly cautious as there is a very low
+// likelihood that the hardware would pull loads/stores into the region guarded
+// by the reservation.
+//
+// For ARMv7 we add explicit barriers in the stubs.
+
+inline jint Atomic::add(jint add_value, volatile jint* dest) {
+#ifdef AARCH64
+  jint val;   // holds the updated value, which is also the return value
+  int tmp;    // status result of stlxr; non-zero restarts the loop
+  __asm__ volatile(
+    "1:\n\t"
+    " ldaxr %w[val], [%[dest]]\n\t"             // load-acquire with reservation
+    " add %w[val], %w[val], %w[add_val]\n\t"
+    " stlxr %w[tmp], %w[val], [%[dest]]\n\t"    // store-release with reservation
+    " cbnz %w[tmp], 1b\n\t"                     // retry until the store succeeds
+    : [val] "=&r" (val), [tmp] "=&r" (tmp)
+    : [add_val] "r" (add_value), [dest] "r" (dest)
+    : "memory");
+  return val;
+#else
+  return (*os::atomic_add_func)(add_value, dest);  // 32-bit ARM: go through the routine installed in os
+#endif
+}
+
+inline void Atomic::inc(volatile jint* dest) {
+  Atomic::add(1, (volatile jint *)dest);   // result of add is discarded
+}
+
+inline void Atomic::dec(volatile jint* dest) {
+  Atomic::add(-1, (volatile jint *)dest);  // result of add is discarded
+}
+
+inline intptr_t Atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest) {
+#ifdef AARCH64
+  intptr_t val;  // holds the updated value, which is also the return value
+  int tmp;       // status result of stlxr; non-zero restarts the loop
+  __asm__ volatile(
+    "1:\n\t"
+    " ldaxr %[val], [%[dest]]\n\t"           // load-acquire with reservation
+    " add %[val], %[val], %[add_val]\n\t"
+    " stlxr %w[tmp], %[val], [%[dest]]\n\t"  // store-release with reservation
+    " cbnz %w[tmp], 1b\n\t"                  // retry until the store succeeds
+    : [val] "=&r" (val), [tmp] "=&r" (tmp)
+    : [add_val] "r" (add_value), [dest] "r" (dest)
+    : "memory");
+  return val;
+#else
+  return (intptr_t)Atomic::add((jint)add_value, (volatile jint*)dest);  // non-AARCH64: forwarded to the jint version via casts
+#endif
+}
+
+inline void* Atomic::add_ptr(intptr_t add_value, volatile void* dest) {
+  return (void*)add_ptr(add_value, (volatile intptr_t*)dest);  // forwards to the intptr_t overload
+}
+
+inline void Atomic::inc_ptr(volatile intptr_t* dest) {
+  Atomic::add_ptr(1, dest);   // result of add_ptr is discarded
+}
+
+inline void Atomic::dec_ptr(volatile intptr_t* dest) {
+  Atomic::add_ptr(-1, dest);  // result of add_ptr is discarded
+}
+
+inline void Atomic::inc_ptr(volatile void* dest) {
+  inc_ptr((volatile intptr_t*)dest);  // forwards to the intptr_t overload
+}
+
+inline void Atomic::dec_ptr(volatile void* dest) {
+  dec_ptr((volatile intptr_t*)dest);  // forwards to the intptr_t overload
+}
+
+
+inline jint Atomic::xchg(jint exchange_value, volatile jint* dest) {
+#ifdef AARCH64
+  jint old_val;  // value read from *dest before the exchange; returned to the caller
+  int tmp;       // status result of stlxr; non-zero restarts the loop
+  __asm__ volatile(
+    "1:\n\t"
+    " ldaxr %w[old_val], [%[dest]]\n\t"            // load-acquire with reservation
+    " stlxr %w[tmp], %w[new_val], [%[dest]]\n\t"   // store-release with reservation
+    " cbnz %w[tmp], 1b\n\t"                        // retry until the store succeeds
+    : [old_val] "=&r" (old_val), [tmp] "=&r" (tmp)
+    : [new_val] "r" (exchange_value), [dest] "r" (dest)
+    : "memory");
+  return old_val;
+#else
+  return (*os::atomic_xchg_func)(exchange_value, dest);  // 32-bit ARM: go through the routine installed in os
+#endif
+}
+
+inline intptr_t Atomic::xchg_ptr(intptr_t exchange_value, volatile intptr_t* dest) {
+#ifdef AARCH64
+  intptr_t old_val;  // value read from *dest before the exchange; returned to the caller
+  int tmp;           // status result of stlxr; non-zero restarts the loop
+  __asm__ volatile(
+    "1:\n\t"
+    " ldaxr %[old_val], [%[dest]]\n\t"           // load-acquire with reservation
+    " stlxr %w[tmp], %[new_val], [%[dest]]\n\t"  // store-release with reservation
+    " cbnz %w[tmp], 1b\n\t"                      // retry until the store succeeds
+    : [old_val] "=&r" (old_val), [tmp] "=&r" (tmp)
+    : [new_val] "r" (exchange_value), [dest] "r" (dest)
+    : "memory");
+  return old_val;
+#else
+  return (intptr_t)xchg((jint)exchange_value, (volatile jint*)dest);  // non-AARCH64: forwarded to the jint version via casts
+#endif
+}
+
+inline void* Atomic::xchg_ptr(void* exchange_value, volatile void* dest) {
+  return (void*)xchg_ptr((intptr_t)exchange_value, (volatile intptr_t*)dest);  // forwards to the intptr_t overload
+}
+
+// The memory_order parameter is ignored - we always provide the strongest/most-conservative ordering
+
+inline jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value, cmpxchg_memory_order order) {
+#ifdef AARCH64
+  jint rv;   // value read from *dest; returned whether or not the exchange happened
+  int tmp;   // status result of stlxr; non-zero restarts the loop
+  __asm__ volatile(
+    "1:\n\t"
+    " ldaxr %w[rv], [%[dest]]\n\t"             // load-acquire with reservation
+    " cmp %w[rv], %w[cv]\n\t"
+    " b.ne 2f\n\t"                             // mismatch: skip the store
+    " stlxr %w[tmp], %w[ev], [%[dest]]\n\t"    // store-release with reservation
+    " cbnz %w[tmp], 1b\n\t"                    // retry until the store succeeds
+    " b 3f\n\t"
+    "2:\n\t"
+    " dmb sy\n\t"                              // explicit barrier on the failure path, where no store is executed
+    "3:\n\t"
+    : [rv] "=&r" (rv), [tmp] "=&r" (tmp)
+    : [ev] "r" (exchange_value), [dest] "r" (dest), [cv] "r" (compare_value)
+    : "memory");
+  return rv;
+#else
+  // Warning:  Arguments are swapped to avoid moving them for kernel call
+  return (*os::atomic_cmpxchg_func)(compare_value, exchange_value, dest);
+#endif
+}
+
+inline jlong Atomic::cmpxchg (jlong exchange_value, volatile jlong* dest, jlong compare_value, cmpxchg_memory_order order) {
+#ifdef AARCH64
+  jlong rv;  // value read from *dest; returned whether or not the exchange happened
+  int tmp;   // status result of stlxr; non-zero restarts the loop
+  __asm__ volatile(
+    "1:\n\t"
+    " ldaxr %[rv], [%[dest]]\n\t"              // load-acquire with reservation
+    " cmp %[rv], %[cv]\n\t"
+    " b.ne 2f\n\t"                             // mismatch: skip the store
+    " stlxr %w[tmp], %[ev], [%[dest]]\n\t"     // store-release with reservation
+    " cbnz %w[tmp], 1b\n\t"                    // retry until the store succeeds
+    " b 3f\n\t"
+    "2:\n\t"
+    " dmb sy\n\t"                              // explicit barrier on the failure path, where no store is executed
+    "3:\n\t"
+    : [rv] "=&r" (rv), [tmp] "=&r" (tmp)
+    : [ev] "r" (exchange_value), [dest] "r" (dest), [cv] "r" (compare_value)
+    : "memory");
+  return rv;
+#else
+  assert(VM_Version::supports_cx8(), "Atomic compare and exchange jlong not supported on this architecture!");
+  return (*os::atomic_cmpxchg_long_func)(compare_value, exchange_value, dest);  // note the argument order: compare value first
+#endif
+}
+
+inline intptr_t Atomic::cmpxchg_ptr(intptr_t exchange_value, volatile intptr_t* dest, intptr_t compare_value, cmpxchg_memory_order order) {
+#ifndef AARCH64
+  return (intptr_t)cmpxchg((jint)exchange_value, (volatile jint*)dest, (jint)compare_value, order);     // pointer-sized value handled as jint
+#else
+  return (intptr_t)cmpxchg((jlong)exchange_value, (volatile jlong*)dest, (jlong)compare_value, order);  // pointer-sized value handled as jlong
+#endif
+}
+
+inline void* Atomic::cmpxchg_ptr(void* exchange_value, volatile void* dest, void* compare_value, cmpxchg_memory_order order) {
+  return (void*)cmpxchg_ptr((intptr_t)exchange_value, (volatile intptr_t*)dest, (intptr_t)compare_value, order);  // forwards to the intptr_t overload
+}
+
+#endif // OS_CPU_LINUX_ARM_VM_ATOMIC_LINUX_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/bytes_linux_arm.inline.hpp	2016-12-02 11:25:02.568464815 -0500
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_BYTES_LINUX_ARM_INLINE_HPP
+#define OS_CPU_LINUX_ARM_VM_BYTES_LINUX_ARM_INLINE_HPP
+
+#include <byteswap.h>
+
+// Efficient swapping of data bytes from Java byte
+// ordering to native byte ordering and vice versa.
+inline u2 Bytes::swap_u2(u2 x) {
+  // Byte-swaps x with bswap_16 from <byteswap.h>.  TODO: ARM - optimize
+  return bswap_16(x);
+}
+
+inline u4 Bytes::swap_u4(u4 x) {
+  // Byte-swaps x with bswap_32 from <byteswap.h>.  TODO: ARM - optimize
+  return bswap_32(x);
+}
+
+inline u8 Bytes::swap_u8(u8 x) {
+  // Byte-swaps x with bswap_64 from <byteswap.h>.  TODO: ARM - optimize
+  return bswap_64(x);
+}
+
+#endif // OS_CPU_LINUX_ARM_VM_BYTES_LINUX_ARM_INLINE_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/copy_linux_arm.inline.hpp	2016-12-02 11:25:08.096778315 -0500
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_COPY_LINUX_ARM_INLINE_HPP
+#define OS_CPU_LINUX_ARM_VM_COPY_LINUX_ARM_INLINE_HPP
+
+static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
+#ifndef AARCH64
+  // NOTE: the 32-bit ARM _Copy_* functions take their "to" and "from" arguments in reversed order
+  _Copy_conjoint_words(to, from, count * HeapWordSize);
+#else
+  _Copy_conjoint_words(from, to, count * HeapWordSize);
+#endif
+}
+
+static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
+#ifndef AARCH64
+  _Copy_disjoint_words(to, from, count * HeapWordSize);  // non-AARCH64 build: (to, from) argument order
+#else
+  _Copy_disjoint_words(from, to, count * HeapWordSize);  // AARCH64 build: (from, to) argument order
+#endif // !AARCH64
+}
+
+static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
+  pd_disjoint_words(from, to, count);  // same routine as the non-atomic variant
+}
+
+static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
+  pd_conjoint_words(from, to, count);  // same routine as the unaligned variant
+}
+
+static void pd_aligned_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
+  pd_disjoint_words(from, to, count);  // same routine as the unaligned variant
+}
+
+static void pd_conjoint_bytes(void* from, void* to, size_t count) {
+  memmove(to, from, count);  // memmove copes with overlapping ranges
+}
+
+static void pd_conjoint_bytes_atomic(void* from, void* to, size_t count) {
+  pd_conjoint_bytes(from, to, count);  // same routine as the non-atomic variant
+}
+
+static void pd_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
+#ifndef AARCH64
+  _Copy_conjoint_jshorts_atomic(to, from, count * BytesPerShort);  // non-AARCH64 build: (to, from) argument order
+#else
+  _Copy_conjoint_jshorts_atomic(from, to, count * BytesPerShort);  // AARCH64 build: (from, to) argument order
+#endif
+}
+
+static void pd_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
+#ifndef AARCH64
+  assert(HeapWordSize == BytesPerInt, "heapwords and jints must be the same size");
+  // The word copy is word-atomic in this implementation, so it serves jints too.
+  pd_conjoint_words((HeapWord*)from, (HeapWord*)to, count);
+#else
+  _Copy_conjoint_jints_atomic(from, to, count * BytesPerInt);
+#endif
+}
+
+static void pd_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {
+#ifndef AARCH64
+  _Copy_conjoint_jlongs_atomic(to, from, count * BytesPerLong);  // non-AARCH64 build: (to, from) argument order
+#else
+  assert(HeapWordSize == BytesPerLong, "64-bit architecture");
+  pd_conjoint_words((HeapWord*)from, (HeapWord*)to, count);      // a jlong is exactly one heap word here
+#endif
+}
+
+static void pd_conjoint_oops_atomic(oop* from, oop* to, size_t count) {
+#ifndef AARCH64
+  assert(BytesPerHeapOop == BytesPerInt, "32-bit architecture");
+  pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
+#else  // AARCH64: the element width depends on UseCompressedOops
+  if (!UseCompressedOops) {
+    assert(BytesPerHeapOop == BytesPerLong, "64-bit architecture");
+    pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);
+  } else {
+    assert(BytesPerHeapOop == BytesPerInt, "compressed oops");
+    pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
+  }
+#endif
+}
+
+static void pd_arrayof_conjoint_bytes(HeapWord* from, HeapWord* to, size_t count) {
+  pd_conjoint_bytes_atomic((void*)from, (void*)to, count);  // forwards to the byte copy
+}
+
+static void pd_arrayof_conjoint_jshorts(HeapWord* from, HeapWord* to, size_t count) {
+  pd_conjoint_jshorts_atomic((jshort*)from, (jshort*)to, count);  // forwards to the jshort copy
+}
+
+static void pd_arrayof_conjoint_jints(HeapWord* from, HeapWord* to, size_t count) {
+  pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);  // forwards to the jint copy
+}
+
+static void pd_arrayof_conjoint_jlongs(HeapWord* from, HeapWord* to, size_t count) {
+  pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);  // forwards to the jlong copy
+}
+
+static void pd_arrayof_conjoint_oops(HeapWord* from, HeapWord* to, size_t count) {
+  pd_conjoint_oops_atomic((oop*)from, (oop*)to, count);  // forwards to the oop copy
+}
+
+#endif // OS_CPU_LINUX_ARM_VM_COPY_LINUX_ARM_INLINE_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/globals_linux_arm.hpp	2016-12-02 11:25:14.013113817 -0500
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_GLOBALS_LINUX_ARM_HPP
+#define OS_CPU_LINUX_ARM_VM_GLOBALS_LINUX_ARM_HPP
+
+//
+// Sets the default values for platform dependent flags used by the runtime system.
+// (see globals.hpp)
+//
+define_pd_global(bool, DontYieldALot,            false);
+#ifdef AARCH64
+define_pd_global(intx, CompilerThreadStackSize,  1024);  // NOTE(review): the unit of the stack-size flags is not stated in this file (presumably KB) -- confirm in globals.hpp
+define_pd_global(intx, ThreadStackSize,          1024);
+define_pd_global(intx, VMThreadStackSize,        1024);
+#else  // !AARCH64
+define_pd_global(intx, CompilerThreadStackSize,  512);
+// The system default ThreadStackSize appears to be 512, which is too big, so a smaller default is used.
+define_pd_global(intx, ThreadStackSize,          320);
+define_pd_global(intx, VMThreadStackSize,        512);
+#endif // AARCH64
+
+define_pd_global(size_t, JVMInvokeMethodSlack,   8192);
+
+// Used on 64-bit platforms for the UseCompressedOops base address or CDS
+define_pd_global(size_t, HeapBaseMinAddress,     2*G);
+
+#endif // OS_CPU_LINUX_ARM_VM_GLOBALS_LINUX_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/linux_arm_32.s	2016-12-02 11:25:20.189464063 -0500
@@ -0,0 +1,513 @@
+# 
+# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+# 
+
+	
+        # NOTE WELL!  The _Copy functions are called directly
+	# from server-compiler-generated code via CallLeafNoFP,
+	# which means that they *must* either not use floating
+	# point or use it in the same manner as does the server
+	# compiler.
+	
+        .globl _Copy_conjoint_bytes
+	.type _Copy_conjoint_bytes, %function
+        .globl _Copy_arrayof_conjoint_bytes
+	.type _Copy_arrayof_conjoint_bytes, %function
+	.globl _Copy_disjoint_words
+	.type _Copy_disjoint_words, %function
+	.globl _Copy_conjoint_words
+	.type _Copy_conjoint_words, %function
+        .globl _Copy_conjoint_jshorts_atomic
+	.type _Copy_conjoint_jshorts_atomic, %function
+	.globl _Copy_arrayof_conjoint_jshorts
+	.type _Copy_arrayof_conjoint_jshorts, %function
+        .globl _Copy_conjoint_jints_atomic
+	.type _Copy_conjoint_jints_atomic, %function
+        .globl _Copy_arrayof_conjoint_jints
+	.type _Copy_arrayof_conjoint_jints, %function
+	.globl _Copy_conjoint_jlongs_atomic
+	.type _Copy_conjoint_jlongs_atomic, %function
+	.globl _Copy_arrayof_conjoint_jlongs
+	.type _Copy_arrayof_conjoint_jlongs, %function
+
+	.text
+        .globl  SpinPause
+        .type SpinPause, %function
+SpinPause:
+        bx      LR                      @ returns to the caller at once: this file executes nothing else for a spin pause
+
+        # Support for void Copy::conjoint_bytes(void* from, void* to, size_t count).
+        # Stub only: there is no copy code and no return instruction here, just a
+        # software interrupt (NOTE(review): presumably a deliberate trap -- confirm).
+_Copy_conjoint_bytes:
+        swi     0x9f0001
+
+        # Support for void Copy::arrayof_conjoint_bytes(void* from, void* to, size_t count).
+        # Stub only: there is no copy code and no return instruction here, just a
+        # software interrupt (NOTE(review): presumably a deliberate trap -- confirm).
+_Copy_arrayof_conjoint_bytes:
+        swi     0x9f0001
+
+
+        # Support for void Copy::disjoint_words(void* from, void* to, size_t count).
+        # Registers as used below: r0 = store (destination) pointer, r1 = load
+        # (source) pointer, r2 = remaining length in bytes; r3-r9 and ip are scratch.
+_Copy_disjoint_words:
+        stmdb    sp!, {r3 - r9, ip}     @ save the scratch registers; restored at disjoint_words_finish
+ 
+        cmp     r2, #0
+        beq     disjoint_words_finish
+
+        pld     [r1, #0]
+        cmp     r2, #12
+        ble disjoint_words_small        @ 12 bytes or fewer: only the 1-3 word tail code is needed
+
+        .align 3
+dw_f2b_loop_32:
+        subs    r2, #32                 @ the flags set here also drive the bgt at the end of the loop
+	blt	dw_f2b_loop_32_finish
+        ldmia r1!, {r3 - r9, ip}        @ 8 registers = 32 bytes per iteration
+        nop
+	pld     [r1]
+        stmia r0!, {r3 - r9, ip}
+        bgt     dw_f2b_loop_32
+dw_f2b_loop_32_finish:
+        addlts  r2, #32                 @ undo the last subtraction only if it went negative
+        beq     disjoint_words_finish
+        cmp     r2, #16
+	blt	disjoint_words_small
+        ldmia r1!, {r3 - r6}
+        subge   r2, r2, #16             @ no flag update: the beq below still tests cmp r2, #16
+        stmia r0!, {r3 - r6}
+        beq     disjoint_words_finish
+disjoint_words_small:
+        cmp     r2, #8                  @ lt: one word, eq: two words, gt: three words
+        ldr     r7, [r1], #4
+        ldrge   r8, [r1], #4
+        ldrgt   r9, [r1], #4
+        str     r7, [r0], #4
+        strge   r8, [r0], #4
+        strgt   r9, [r0], #4
+
+disjoint_words_finish:
+        ldmia   sp!, {r3 - r9, ip}
+        bx      lr
+
+
+        # Support for void Copy::conjoint_words(void* from, void* to, size_t count).
+        # Registers as used below: r0 = store (destination) pointer, r1 = load
+        # (source) pointer, r2 = remaining length in bytes; r3-r9 and ip are scratch.
+_Copy_conjoint_words:
+        stmdb    sp!, {r3 - r9, ip}     @ save the scratch registers; restored at conjoint_words_finish
+
+	cmp	r2, #0
+	beq	conjoint_words_finish
+
+        pld     [r1, #0]
+        cmp     r2, #12
+        ble conjoint_words_small        @ up to 3 words: the tail code loads every word before storing any
+
+        subs    r3, r0, r1              @ r3 = destination - source
+        cmphi   r2, r3                  @ only when the destination is above the source: compare length with that distance
+        bhi     cw_b2f_copy             @ destination above source and closer than the length: copy descending
+        .align 3
+cw_f2b_loop_32:
+        subs    r2, #32                 @ the flags set here also drive the bgt at the end of the loop
+	blt	cw_f2b_loop_32_finish
+        ldmia r1!, {r3 - r9, ip}        @ 8 registers = 32 bytes per iteration
+        nop
+	pld     [r1]
+        stmia r0!, {r3 - r9, ip}
+        bgt     cw_f2b_loop_32
+cw_f2b_loop_32_finish:
+        addlts  r2, #32                 @ undo the last subtraction only if it went negative
+        beq     conjoint_words_finish
+        cmp     r2, #16
+	blt	conjoint_words_small
+        ldmia r1!, {r3 - r6}
+        subge   r2, r2, #16             @ no flag update: the beq below still tests cmp r2, #16
+        stmia r0!, {r3 - r6}
+        beq     conjoint_words_finish
+conjoint_words_small:
+        cmp     r2, #8                  @ lt: one word, eq: two words, gt: three words
+        ldr     r7, [r1], #4
+        ldrge   r8, [r1], #4
+        ldrgt   r9, [r1], #4
+        str     r7, [r0], #4
+        strge   r8, [r0], #4
+        strgt   r9, [r0], #4
+        b       conjoint_words_finish
+
+	# Source and destination overlap: copy in descending address order
+cw_b2f_copy:
+        add     r1, r2                  @ both pointers now address one past the end of their ranges
+        pld     [r1, #-32]
+        add     r0, r2
+        .align 3
+cw_b2f_loop_32:
+        subs    r2, #32
+	blt	cw_b2f_loop_32_finish
+        ldmdb r1!, {r3-r9,ip}
+        nop
+	pld     [r1, #-32]
+        stmdb r0!, {r3-r9,ip}
+        bgt     cw_b2f_loop_32
+cw_b2f_loop_32_finish:
+        addlts  r2, #32
+        beq     conjoint_words_finish
+        cmp     r2, #16
+	blt	cw_b2f_copy_small
+        ldmdb r1!, {r3 - r6}
+        subge   r2, r2, #16             @ no flag update: the beq below still tests cmp r2, #16
+        stmdb r0!, {r3 - r6}
+        beq     conjoint_words_finish
+cw_b2f_copy_small:
+        cmp     r2, #8                  @ lt: one word, eq: two words, gt: three words
+        ldr     r7, [r1, #-4]!
+        ldrge   r8, [r1, #-4]!
+        ldrgt   r9, [r1, #-4]!
+        str     r7, [r0, #-4]!
+        strge   r8, [r0, #-4]!
+        strgt   r9, [r0, #-4]!
+
+conjoint_words_finish:
+        ldmia   sp!, {r3 - r9, ip}
+        bx      lr
+
+        # Support for void Copy::conjoint_jshorts_atomic(void* from, void* to, size_t count).
+        # Registers as used below: r0 = store (destination) pointer, r1 = load
+        # (source) pointer, r2 = remaining length in bytes; r3-r9 and ip are scratch.
+_Copy_conjoint_jshorts_atomic:
+        stmdb   sp!, {r3 - r9, ip}
+
+	cmp	r2, #0
+	beq	conjoint_shorts_finish	
+
+        subs    r3, r0, r1              @ r3 = destination - source
+        cmphi   r2, r3                  @ only when the destination is above the source: compare length with that distance
+        bhi     cs_b2f_copy             @ destination above source and closer than the length: copy descending
+
+        pld     [r1]
+
+        ands    r3, r0, #3
+        bne     cs_f2b_dest_u
+        ands    r3, r1, #3
+        bne     cs_f2b_src_u
+
+	# Both pointers are word aligned here
+        .align 3
+cs_f2b_loop_32:
+        subs    r2, #32                 @ the flags set here also drive the bgt at the end of the loop
+	blt	cs_f2b_loop_32_finish
+        ldmia r1!, {r3 - r9, ip}
+        nop
+        pld     [r1]
+        stmia r0!, {r3 - r9, ip}
+        bgt     cs_f2b_loop_32
+cs_f2b_loop_32_finish:
+        addlts  r2, #32                 @ undo the last subtraction only if it went negative
+        beq     conjoint_shorts_finish
+        movs    r6, r2, lsr #3          @ r6 = number of whole 8-byte chunks left; Z is set when there are none
+        .align 3
+cs_f2b_8_loop:
+        beq     cs_f2b_4                @ tests the movs above on the first pass
+        ldmia   r1!, {r4-r5}
+        subs    r6, #1
+        stmia   r0!, {r4-r5}
+        bgt     cs_f2b_8_loop
+
+cs_f2b_4:
+        ands    r2, #7                  @ what is left is less than 8 bytes
+        beq     conjoint_shorts_finish
+        cmp     r2, #4                  @ lt: one halfword, eq: two halfwords, gt: three halfwords
+        ldrh    r3, [r1], #2
+        ldrgeh  r4, [r1], #2
+        ldrgth  r5, [r1], #2
+        strh    r3, [r0], #2
+        strgeh  r4, [r0], #2
+        strgth  r5, [r0], #2
+        b       conjoint_shorts_finish
+
+	# Destination not word aligned: copy one halfword so that it becomes aligned
+cs_f2b_dest_u:
+        ldrh    r3, [r1], #2
+        subs    r2, #2
+        strh    r3, [r0], #2
+        beq     conjoint_shorts_finish
+
+	# Check whether the source is not aligned either
+        ands    r3, r1, #3
+        beq     cs_f2b_loop_32
+
+cs_f2b_src_u:
+        cmp     r2, #16
+        blt     cs_f2b_8_u
+
+	# Load the word holding the first 2 source bytes into r7 and make the source pointer word aligned
+        bic     r1, #3
+        ldr     r7, [r1], #4
+
+	# Destination aligned, source not: merge halfwords of neighbouring words with shifts
+        mov     r8, r2, lsr #4          @ r8 = number of 16-byte iterations
+        .align 3
+cs_f2b_16_u_loop:
+        mov     r3, r7, lsr #16         @ upper halfword of the previously loaded word becomes the low half of the next output word
+        ldmia   r1!, {r4 - r7}
+        orr     r3, r3, r4, lsl #16
+        mov     r4, r4, lsr #16
+        pld     [r1]
+        orr     r4, r4, r5, lsl #16
+        mov     r5, r5, lsr #16
+        orr     r5, r5, r6, lsl #16
+        mov     r6, r6, lsr #16
+        orr     r6, r6, r7, lsl #16
+        stmia   r0!, {r3 - r6}
+        subs    r8, #1
+        bgt     cs_f2b_16_u_loop
+        ands    r2, #0xf
+        beq     conjoint_shorts_finish
+        sub     r1, #2                  @ step back to the first halfword not yet stored (the upper half of r7)
+
+cs_f2b_8_u:
+        cmp     r2, #8
+        blt     cs_f2b_4_u
+        ldrh    r4, [r1], #2            @ halfword, word, halfword: the middle load is word aligned
+        ldr     r5, [r1], #4
+        ldrh    r6, [r1], #2
+        orr     r4, r4, r5, lsl #16
+        mov     r5, r5, lsr #16
+        orr     r5, r5, r6, lsl #16
+        subs    r2, #8
+        stmia	r0!, {r4 - r5}
+cs_f2b_4_u:
+        beq     conjoint_shorts_finish  @ tests the subs above, or the cmp r2, #8 when entered via blt
+        cmp     r2, #4                  @ lt: one halfword, eq: two halfwords, gt: three halfwords
+        ldrh    r3, [r1], #2
+        ldrgeh  r4, [r1], #2
+        ldrgth  r5, [r1], #2
+        strh    r3, [r0], #2
+        strgeh  r4, [r0], #2
+        strgth  r5, [r0], #2
+        b       conjoint_shorts_finish
+
+	# Source and destination overlap: copy in descending address order
+cs_b2f_copy:
+        add     r1, r2                  @ both pointers now address one past the end of their ranges
+        pld     [r1, #-32]
+        add     r0, r2
+
+        ands    r3, r0, #3
+        bne     cs_b2f_dest_u
+        ands    r3, r1, #3
+        bne     cs_b2f_src_u
+        .align 3
+cs_b2f_loop_32:
+        subs    r2, #32
+	blt	cs_b2f_loop_32_finish
+        ldmdb r1!, {r3-r9,ip}
+        nop
+        pld     [r1, #-32]
+        stmdb r0!, {r3-r9,ip}
+        bgt     cs_b2f_loop_32
+cs_b2f_loop_32_finish:
+        addlts  r2, #32
+        beq     conjoint_shorts_finish
+        cmp     r2, #24
+        blt     cs_b2f_16
+        ldmdb   r1!, {r3-r8}
+        sub     r2, #24                 @ no flag update: the beq below still tests cmp r2, #24
+        stmdb   r0!, {r3-r8}
+        beq     conjoint_shorts_finish
+cs_b2f_16:
+        cmp     r2, #16
+        blt     cs_b2f_8
+        ldmdb   r1!, {r3-r6}
+        sub     r2, #16                 @ no flag update: the beq below still tests cmp r2, #16
+        stmdb   r0!, {r3-r6}
+        beq     conjoint_shorts_finish
+cs_b2f_8:
+        cmp     r2, #8
+        blt     cs_b2f_all_copy
+        ldmdb   r1!, {r3-r4}
+        sub     r2, #8                  @ no flag update: the beq below still tests cmp r2, #8
+        stmdb   r0!, {r3-r4}
+        beq     conjoint_shorts_finish
+
+cs_b2f_all_copy:
+        cmp     r2, #4                  @ lt: one halfword, eq: two halfwords, gt: three halfwords
+        ldrh    r3, [r1, #-2]!
+        ldrgeh  r4, [r1, #-2]!
+        ldrgth  r5, [r1, #-2]!
+        strh    r3, [r0, #-2]!
+        strgeh  r4, [r0, #-2]!
+        strgth  r5, [r0, #-2]!
+        b       conjoint_shorts_finish
+
+	# Destination end not word aligned: copy one halfword so that it becomes aligned
+cs_b2f_dest_u:
+        ldrh    r3, [r1, #-2]!
+        strh    r3, [r0, #-2]!
+        sub     r2, #2
+	# Check the source alignment as well
+        ands    r3, r1, #3
+        beq     cs_b2f_loop_32
+
+	# Source end not word aligned: round it down and merge halfwords with shifts
+cs_b2f_src_u:
+        bic     r1, #3
+        .align 3
+cs_b2f_16_loop_u:
+        subs    r2, #16                 @ the flags set here also drive the bgt/beq after the stmdb
+        blt     cs_b2f_16_loop_u_finished
+        ldr     r7, [r1]
+        mov     r3, r7                  @ r3 keeps the word at r1; its low halfword is the highest halfword still to copy
+        ldmdb   r1!, {r4 - r7}
+        mov     r4, r4, lsr #16
+        orr     r4, r4, r5, lsl #16
+        pld     [r1, #-32]
+        mov     r5, r5, lsr #16
+        orr     r5, r5, r6, lsl #16
+        mov     r6, r6, lsr #16
+        orr     r6, r6, r7, lsl #16
+        mov     r7, r7, lsr #16
+        orr     r7, r7, r3, lsl #16
+        stmdb   r0!, {r4 - r7}
+        bgt     cs_b2f_16_loop_u
+        beq     conjoint_shorts_finish
+cs_b2f_16_loop_u_finished:
+        addlts  r2, #16                 @ undo the last subtraction only if it went negative
+        ldr     r3, [r1]
+	cmp     r2, #10
+        blt     cs_b2f_2_u_loop
+        ldmdb   r1!, {r4 - r5}
+        mov     r6, r4, lsr #16
+        orr     r6, r6, r5, lsl #16
+        mov     r7, r5, lsr #16
+        orr     r7, r7, r3, lsl #16
+        stmdb   r0!, {r6-r7}
+        sub     r2, #8
+	.align 3
+cs_b2f_2_u_loop:
+        subs    r2, #2
+        ldrh    r3, [r1], #-2           @ post-indexed: after the rounding down above r1 already addresses the next halfword to copy
+        strh    r3, [r0, #-2]!
+        bgt     cs_b2f_2_u_loop
+
+conjoint_shorts_finish:
+        ldmia   sp!, {r3 - r9, ip}
+        bx      lr
+
+
+        # Support for void Copy::arrayof_conjoint_jshorts(void* from, void* to, size_t count).
+        # Stub only: there is no copy code and no return instruction here, just a
+        # software interrupt (NOTE(review): presumably a deliberate trap -- confirm).
+_Copy_arrayof_conjoint_jshorts:
+        swi     0x9f0001
+
+        # Support for void Copy::conjoint_jints_atomic(void* from, void* to, size_t count).
+        # Stub shared by both labels below: there is no copy code and no return instruction,
+        # just a software interrupt (NOTE(review): presumably a deliberate trap -- confirm).
+_Copy_conjoint_jints_atomic:
+_Copy_arrayof_conjoint_jints:
+        swi     0x9f0001
+	
+        # Support for void Copy::conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count).
+        # Registers as used below: r0 = store (destination) pointer, r1 = load (source)
+        # pointer, r2 = remaining length in bytes. Both labels below share this code.
+_Copy_conjoint_jlongs_atomic:
+_Copy_arrayof_conjoint_jlongs:
+        stmdb    sp!, {r3 - r9, ip}     @ save the scratch registers; restored at conjoint_longs_finish
+
+	cmp	r2, #0
+	beq	conjoint_longs_finish
+
+        pld     [r1, #0]
+        cmp     r2, #24
+        ble conjoint_longs_small        @ up to 24 bytes: the tail code loads everything before storing anything
+
+        subs    r3, r0, r1              @ r3 = destination - source
+        cmphi   r2, r3                  @ only when the destination is above the source: compare length with that distance
+        bhi     cl_b2f_copy             @ destination above source and closer than the length: copy descending
+        .align 3
+cl_f2b_loop_32:
+        subs    r2, #32                 @ the flags set here also drive the bgt at the end of the loop
+	blt	cl_f2b_loop_32_finish
+        ldmia r1!, {r3 - r9, ip}        @ 8 registers = 32 bytes per iteration
+        nop
+	pld     [r1]
+        stmia r0!, {r3 - r9, ip}
+        bgt     cl_f2b_loop_32
+cl_f2b_loop_32_finish:
+        addlts  r2, #32                 @ undo the last subtraction only if it went negative
+        beq     conjoint_longs_finish
+conjoint_longs_small:
+        cmp     r2, #16                 @ lt: 8 bytes, eq: 16 bytes, gt: 24 bytes
+	blt	cl_f2b_copy_8
+	bgt	cl_f2b_copy_24
+        ldmia 	r1!, {r3 - r6}
+        stmia 	r0!, {r3 - r6}
+	b	conjoint_longs_finish
+cl_f2b_copy_8:
+        ldmia   r1!, {r3 - r4}
+        stmia   r0!, {r3 - r4}
+        b       conjoint_longs_finish
+cl_f2b_copy_24:
+	ldmia   r1!, {r3 - r8}
+        stmia   r0!, {r3 - r8}
+        b       conjoint_longs_finish
+
+	# Source and destination overlap: copy in descending address order
+cl_b2f_copy:
+        add     r1, r2                  @ both pointers now address one past the end of their ranges
+        pld     [r1, #-32]
+        add     r0, r2
+        .align 3
+cl_b2f_loop_32:
+        subs    r2, #32
+	blt	cl_b2f_loop_32_finish
+        ldmdb 	r1!, {r3 - r9, ip}
+        nop
+	pld     [r1]                    @ NOTE(review): the other descending loops in this file prefetch [r1, #-32] -- confirm which is intended
+        stmdb 	r0!, {r3 - r9, ip}
+        bgt     cl_b2f_loop_32
+cl_b2f_loop_32_finish:
+        addlts  r2, #32
+        beq     conjoint_longs_finish
+        cmp     r2, #16                 @ lt: 8 bytes, eq: 16 bytes, gt: 24 bytes
+	blt	cl_b2f_copy_8
+	bgt	cl_b2f_copy_24
+        ldmdb   r1!, {r3 - r6}
+        stmdb   r0!, {r3 - r6}
+        b       conjoint_longs_finish
+cl_b2f_copy_8:
+	ldmdb   r1!, {r3 - r4}
+        stmdb   r0!, {r3 - r4}
+        b       conjoint_longs_finish
+cl_b2f_copy_24:
+	ldmdb   r1!, {r3 - r8}
+        stmdb   r0!, {r3 - r8}
+
+conjoint_longs_finish:
+        ldmia   sp!, {r3 - r9, ip}
+        bx      lr
+
+
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/linux_arm_64.s	2016-12-02 11:25:25.721777787 -0500
@@ -0,0 +1,542 @@
+# 
+# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+# 
+
+        # TODO-AARCH64
+        
+        # NOTE WELL!  The _Copy functions are called directly
+        # from server-compiler-generated code via CallLeafNoFP,
+        # which means that they *must* either not use floating
+        # point or use it in the same manner as does the server
+        # compiler.
+        
+        .globl _Copy_conjoint_bytes
+        .type _Copy_conjoint_bytes, %function
+        .globl _Copy_arrayof_conjoint_bytes
+        .type _Copy_arrayof_conjoint_bytes, %function
+        .globl _Copy_disjoint_words
+        .type _Copy_disjoint_words, %function
+        .globl _Copy_conjoint_words
+        .type _Copy_conjoint_words, %function
+        .globl _Copy_conjoint_jshorts_atomic
+        .type _Copy_conjoint_jshorts_atomic, %function
+        .globl _Copy_arrayof_conjoint_jshorts
+        .type _Copy_arrayof_conjoint_jshorts, %function
+        .globl _Copy_conjoint_jints_atomic
+        .type _Copy_conjoint_jints_atomic, %function
+        .globl _Copy_arrayof_conjoint_jints
+        .type _Copy_arrayof_conjoint_jints, %function
+        .globl _Copy_conjoint_jlongs_atomic
+        .type _Copy_conjoint_jlongs_atomic, %function
+        .globl _Copy_arrayof_conjoint_jlongs
+        .type _Copy_arrayof_conjoint_jlongs, %function
+
+        .text
+        .globl  SpinPause
+        .type SpinPause, %function
+SpinPause:
+        yield                           // a single yield instruction, then straight back to the caller
+        ret
+
+        # Support for void Copy::conjoint_bytes(void* from, void* to, size_t count).
+        # Stub only: there is no copy code and no return instruction here, just an
+        # hlt with an immediate specific to this entry point (1002).
+_Copy_conjoint_bytes:
+        hlt 1002
+
+        # Support for void Copy::arrayof_conjoint_bytes(void* from, void* to, size_t count).
+        # Stub only: there is no copy code and no return instruction here, just an
+        # hlt with an immediate specific to this entry point (1003).
+_Copy_arrayof_conjoint_bytes:
+        hlt 1003
+
+
+        # Support for void Copy::disjoint_words(void* from, void* to, size_t count).
+        # Registers as used below: x0 = load (source) pointer, x1 = store
+        # (destination) pointer, x2 = length in bytes; x3-x18 are used as scratch.
+_Copy_disjoint_words:
+        # These and further memory prefetches may hit out of array ranges.
+        # Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
+        prfm    pldl1keep,  [x0, #0]
+        prfm    pstl1keep,  [x1, #0]
+        prfm    pldl1keep,  [x0, #64]
+        prfm    pstl1keep,  [x1, #64]
+
+        subs    x18, x2,  #128
+        b.ge    dw_large
+
+dw_lt_128:
+        # Copy [x0, x0 + x2) to [x1, x1 + x2)
+        
+        adr     x15,  dw_tail_table_base
+        and     x16,  x2,  #~8
+
+        # Calculate address to jump and store it to x15:
+        #   Each pair of instructions before dw_tail_table_base copies 16 bytes.
+        #   x16 is count of bytes to copy aligned down by 16.
+        #   So x16/16 pairs of instructions should be executed. 
+        #   Each pair takes 8 bytes, so x15 = dw_tail_table_base - (x16/16)*8 = x15 - x16/2
+        sub     x15,  x15, x16, lsr #1
+        prfm    plil1keep, [x15]
+    
+        add     x17,  x0,  x2           // x17 = end of the source range
+        add     x18,  x1,  x2           // x18 = end of the destination range
+
+        # If x2 = x16 + 8, then copy 8 bytes and x16 bytes after that.
+        # Otherwise x2 = x16, so proceed to copy x16 bytes.
+        tbz     x2, #3, dw_lt_128_even
+        ldr     x3, [x0]
+        str     x3, [x1]
+dw_lt_128_even:
+        # Copy [x17 - x16, x17) to [x18 - x16, x18)
+        # x16 is aligned by 16 and less than 128
+
+        # Execute (x16/16) ldp-stp pairs; each pair copies 16 bytes
+        br      x15                     // enters the table below so that exactly x16/16 pairs run before the ret
+
+        ldp     x3,  x4,  [x17, #-112]
+        stp     x3,  x4,  [x18, #-112]
+        ldp     x5,  x6,  [x17, #-96]
+        stp     x5,  x6,  [x18, #-96]
+        ldp     x7,  x8,  [x17, #-80]
+        stp     x7,  x8,  [x18, #-80]
+        ldp     x9,  x10, [x17, #-64]
+        stp     x9,  x10, [x18, #-64]
+        ldp     x11, x12, [x17, #-48]
+        stp     x11, x12, [x18, #-48]
+        ldp     x13, x14, [x17, #-32]
+        stp     x13, x14, [x18, #-32]
+        ldp     x15, x16, [x17, #-16]
+        stp     x15, x16, [x18, #-16]
+dw_tail_table_base:
+        ret
+
+.p2align  6
+.rept   12
+        nop
+.endr
+dw_large:
+        # x18 >= 0;
+        # Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128)
+
+        ldp     x3,  x4,  [x0], #64
+        ldp     x5,  x6,  [x0, #-48]
+        ldp     x7,  x8,  [x0, #-32]
+        ldp     x9,  x10, [x0, #-16]
+
+        # Before and after each iteration of loop registers x3-x10 contain [x0 - 64, x0),
+        # and x1 is a place to copy this data;
+        # x18 contains number of bytes to be stored minus 128
+
+        # Exactly 16 instructions from p2align, so dw_loop starts from cache line boundary
+        # Checking it explicitly by aligning with "hlt 1000" instructions 
+.p2alignl  6, 0xd4407d00
+dw_loop:
+        prfm    pldl1keep,  [x0, #64]
+        # The next line actually hurt memory copy performance (for interpreter) - JDK-8078120
+        # prfm    pstl1keep,  [x1, #64]
+
+        subs    x18, x18, #64           // the flags set here drive the b.ge at the end of the loop
+
+        stp     x3,  x4,  [x1, #0]
+        ldp     x3,  x4,  [x0, #0]
+        stp     x5,  x6,  [x1, #16]
+        ldp     x5,  x6,  [x0, #16]
+        stp     x7,  x8,  [x1, #32]
+        ldp     x7,  x8,  [x0, #32]
+        stp     x9,  x10, [x1, #48]
+        ldp     x9,  x10, [x0, #48]
+        
+        add     x1,  x1,  #64
+        add     x0,  x0,  #64
+
+        b.ge    dw_loop
+
+        # 13 instructions from dw_loop, so the loop body fits into one cache line
+
+dw_loop_end:
+        adds    x2,  x18, #64           // the flags set here drive the b.ne after the four stores
+
+        stp     x3,  x4,  [x1], #64
+        stp     x5,  x6,  [x1, #-48]
+        stp     x7,  x8,  [x1, #-32]
+        stp     x9,  x10, [x1, #-16]
+
+        # Increased x18 by 64, but stored 64 bytes, so x2 contains exact number of bytes to be stored
+
+        # If this number is not zero, also copy remaining bytes
+        b.ne    dw_lt_128
+        ret
+
+
+        # Support for void Copy::conjoint_words(void* from, void* to, size_t count).
+        # Registers as used below: x0 = load (source) pointer, x1 = store
+        # (destination) pointer, x2 = length in bytes; x3-x5 are used as scratch.
+_Copy_conjoint_words:
+        subs    x3, x1, x0              // x3 = to - from
+        # hi condition is met <=> from < to
+        ccmp    x2, x3, #0, hi
+        # hi condition is met <=> (from < to) and (to - from < count)
+        # otherwise _Copy_disjoint_words may be used, because it performs forward copying,
+        # so it also works when ranges overlap but to <= from
+        b.ls    _Copy_disjoint_words
+
+        # The overlapping case should be the rare one, so it is not worth optimizing
+
+        ands    x3,  x2,  #~8
+        # x3 is count aligned down by 2*wordSize
+        add     x0,  x0,  x2            // both pointers now address one past the end of their ranges
+        add     x1,  x1,  x2
+        sub     x3,  x3,  #16           // no flag update: the b.eq below still tests the ands above
+        # Skip loop if 0 or 1 words
+        b.eq    cw_backward_loop_end
+
+        # x3 >= 0
+        # Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1) backward
+cw_backward_loop:
+        subs    x3,  x3,  #16
+        ldp     x4,  x5,  [x0, #-16]!
+        stp     x4,  x5,  [x1, #-16]!
+        b.ge    cw_backward_loop
+
+cw_backward_loop_end:
+        # Copy remaining 0 or 1 words
+        tbz     x2,  #3,  cw_finish
+        ldr     x3, [x0, #-8]
+        str     x3, [x1, #-8]
+
+cw_finish:
+        ret
+
+
+        # Support for void Copy::conjoint_jshorts_atomic(void* from, void* to, size_t count).
+        # Registers as used below: x0 = load (source) pointer, x1 = store (destination)
+        # pointer, x2 = length in bytes. Every access is a halfword ldrh/strh.
+_Copy_conjoint_jshorts_atomic:
+        add     x17, x0, x2             // x17 = end of the source range
+        add     x18, x1, x2             // x18 = end of the destination range
+
+        subs    x3, x1, x0
+        # hi is met <=> (from < to) and (to - from < count)
+        ccmp    x2, x3, #0, hi
+        b.hi    cs_backward
+        
+        subs    x3, x2, #14
+        b.ge    cs_forward_loop
+
+        # Copy x2 < 14 bytes from x0 to x1
+cs_forward_lt14:
+        ands    x7, x2, #7              // the flags set here drive the b.eq at cs_forward_lt8
+        tbz     x2, #3, cs_forward_lt8
+        ldrh    w3, [x0, #0]
+        ldrh    w4, [x0, #2]
+        ldrh    w5, [x0, #4]
+        ldrh    w6, [x0, #6]
+
+        strh    w3, [x1, #0]
+        strh    w4, [x1, #2]
+        strh    w5, [x1, #4]
+        strh    w6, [x1, #6]
+
+        # Copy x7 < 8 bytes from x17 - x7 to x18 - x7
+cs_forward_lt8:
+        b.eq    cs_forward_0
+        cmp     x7, #4
+        b.lt    cs_forward_2
+        b.eq    cs_forward_4
+
+cs_forward_6:
+        ldrh    w3, [x17, #-6]
+        strh    w3, [x18, #-6]
+cs_forward_4:
+        ldrh    w4, [x17, #-4]
+        strh    w4, [x18, #-4]
+cs_forward_2:
+        ldrh    w5, [x17, #-2]
+        strh    w5, [x18, #-2]
+cs_forward_0:
+        ret
+
+
+        # Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14)
+        # x3 >= 0
+.p2align 6
+cs_forward_loop:
+        subs    x3, x3, #14             // the flags set here drive the b.ge at the end of the loop
+        
+        ldrh    w4, [x0], #14
+        ldrh    w5, [x0, #-12]
+        ldrh    w6, [x0, #-10]
+        ldrh    w7, [x0, #-8]
+        ldrh    w8, [x0, #-6]
+        ldrh    w9, [x0, #-4]
+        ldrh    w10, [x0, #-2]
+
+        strh    w4, [x1], #14
+        strh    w5, [x1, #-12]
+        strh    w6, [x1, #-10]
+        strh    w7, [x1, #-8]
+        strh    w8, [x1, #-6]
+        strh    w9, [x1, #-4]
+        strh    w10, [x1, #-2]
+
+        b.ge    cs_forward_loop
+        # Exactly 16 instructions from cs_forward_loop, so the loop fits into one cache line
+
+        adds    x2, x3, #14
+        # x2 bytes should be copied from x0 to x1
+        b.ne    cs_forward_lt14
+        ret
+        
+        # Very similar to forward copying, but working down from the end pointers x17 and x18
+cs_backward:
+        subs    x3, x2, #14
+        b.ge    cs_backward_loop
+
+cs_backward_lt14:
+        ands    x7, x2, #7              // the flags set here drive the b.eq at cs_backward_lt8
+        tbz     x2, #3, cs_backward_lt8
+
+        ldrh    w3, [x17, #-8]
+        ldrh    w4, [x17, #-6]
+        ldrh    w5, [x17, #-4]
+        ldrh    w6, [x17, #-2]
+        
+        strh    w3, [x18, #-8]
+        strh    w4, [x18, #-6]
+        strh    w5, [x18, #-4]
+        strh    w6, [x18, #-2]
+
+cs_backward_lt8:
+        b.eq    cs_backward_0
+        cmp     x7, #4
+        b.lt    cs_backward_2
+        b.eq    cs_backward_4
+
+cs_backward_6:
+        ldrh    w3, [x0, #4]
+        strh    w3, [x1, #4]
+
+cs_backward_4:
+        ldrh    w4, [x0, #2]
+        strh    w4, [x1, #2]
+
+cs_backward_2:
+        ldrh    w5, [x0, #0]
+        strh    w5, [x1, #0]
+
+cs_backward_0:
+        ret
+
+
+.p2align 6
+cs_backward_loop:
+        subs    x3, x3, #14             // the flags set here drive the b.ge at the end of the loop
+
+        ldrh    w4, [x17, #-14]!
+        ldrh    w5, [x17, #2]
+        ldrh    w6, [x17, #4]
+        ldrh    w7, [x17, #6]
+        ldrh    w8, [x17, #8]
+        ldrh    w9, [x17, #10]
+        ldrh    w10, [x17, #12]
+
+        strh    w4, [x18, #-14]!
+        strh    w5, [x18, #2]
+        strh    w6, [x18, #4]
+        strh    w7, [x18, #6]
+        strh    w8, [x18, #8]
+        strh    w9, [x18, #10]
+        strh    w10, [x18, #12]
+
+        b.ge    cs_backward_loop
+        adds    x2, x3, #14
+        b.ne    cs_backward_lt14
+        ret
+
+
+        # Support for void Copy::arrayof_conjoint_jshorts(void* from, void* to, size_t count).
+        # Stub only: there is no copy code and no return instruction here, just an
+        # hlt with an immediate specific to this entry point (1007).
+_Copy_arrayof_conjoint_jshorts:
+        hlt 1007
+
+
+        # Support for void Copy::conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count).
+        # Stub shared by both labels below: there is no copy code and no return instruction,
+        # just an hlt with an immediate specific to this entry point (1009).
+_Copy_conjoint_jlongs_atomic:
+_Copy_arrayof_conjoint_jlongs:
+        hlt 1009
+
+
+        # Support for void Copy::conjoint_jints_atomic(void* from, void* to, size_t count).
+        # Registers as used below: x0 = load (source) pointer, x1 = store (destination)
+        # pointer, x2 = length in bytes. Both labels below share this code.
+_Copy_conjoint_jints_atomic:
+_Copy_arrayof_conjoint_jints:
+        # These and further memory prefetches may hit out of array ranges.
+        # Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
+        prfm    pldl1keep,  [x0, #0]
+        prfm    pstl1keep,  [x1, #0]
+        prfm    pldl1keep,  [x0, #32]
+        prfm    pstl1keep,  [x1, #32]
+
+        subs    x3, x1, x0              // x3 = to - from
+        # hi condition is met <=> from < to
+        ccmp    x2, x3, #0, hi
+        # hi condition is met <=> (from < to) and (to - from < count)
+        b.hi    ci_backward
+
+        subs    x18, x2,  #64
+        b.ge    ci_forward_large
+
+ci_forward_lt_64:
+        # Copy [x0, x0 + x2) to [x1, x1 + x2)
+        
+        adr     x15,  ci_forward_tail_table_base
+        and     x16,  x2,  #~4
+
+        # Calculate address to jump and store it to x15:
+        #   Each pair of instructions before ci_forward_tail_table_base copies 8 bytes.
+        #   x16 is count of bytes to copy aligned down by 8.
+        #   So x16/8 pairs of instructions should be executed. 
+        #   Each pair takes 8 bytes, so x15 = ci_forward_tail_table_base - (x16/8)*8 = x15 - x16
+        sub     x15,  x15, x16
+        prfm    plil1keep, [x15]
+    
+        add     x17,  x0,  x2           // x17 = end of the source range
+        add     x18,  x1,  x2           // x18 = end of the destination range
+
+        # If x2 = x16 + 4, then copy 4 bytes and x16 bytes after that.
+        # Otherwise x2 = x16, so proceed to copy x16 bytes.
+        tbz     x2, #2, ci_forward_lt_64_even
+        ldr     w3, [x0]
+        str     w3, [x1]
+ci_forward_lt_64_even:
+        # Copy [x17 - x16, x17) to [x18 - x16, x18)
+        # x16 is aligned by 8 and less than 64
+
+        # Execute (x16/8) ldp-stp pairs; each pair copies 8 bytes
+        br      x15                     // enters the table below so that exactly x16/8 pairs run before the ret
+
+        ldp     w3,  w4,  [x17, #-56]
+        stp     w3,  w4,  [x18, #-56]
+        ldp     w5,  w6,  [x17, #-48]
+        stp     w5,  w6,  [x18, #-48]
+        ldp     w7,  w8,  [x17, #-40]
+        stp     w7,  w8,  [x18, #-40]
+        ldp     w9,  w10, [x17, #-32]
+        stp     w9,  w10, [x18, #-32]
+        ldp     w11, w12, [x17, #-24]
+        stp     w11, w12, [x18, #-24]
+        ldp     w13, w14, [x17, #-16]
+        stp     w13, w14, [x18, #-16]
+        ldp     w15, w16, [x17, #-8]
+        stp     w15, w16, [x18, #-8]
+ci_forward_tail_table_base:
+        ret
+
+.p2align  6
+.rept   12
+        nop
+.endr
+ci_forward_large:
+        # x18 >= 0;
+        # Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64)
+
+        ldp     w3,  w4,  [x0], #32
+        ldp     w5,  w6,  [x0, #-24]
+        ldp     w7,  w8,  [x0, #-16]
+        ldp     w9,  w10, [x0, #-8]
+
+        # Before and after each iteration of loop registers w3-w10 contain [x0 - 32, x0),
+        # and x1 is a place to copy this data;
+        # x18 contains number of bytes to be stored minus 64
+
+        # Exactly 16 instructions from p2align, so ci_forward_loop starts from cache line boundary
+        # Checking it explicitly by aligning with "hlt 1000" instructions 
+.p2alignl  6, 0xd4407d00
+ci_forward_loop:
+        prfm    pldl1keep,  [x0, #32]
+        prfm    pstl1keep,  [x1, #32]
+
+        subs    x18, x18, #32           // the flags set here drive the b.ge at the end of the loop
+
+        stp     w3,  w4,  [x1, #0]
+        ldp     w3,  w4,  [x0, #0]
+        stp     w5,  w6,  [x1, #8]
+        ldp     w5,  w6,  [x0, #8]
+        stp     w7,  w8,  [x1, #16]
+        ldp     w7,  w8,  [x0, #16]
+        stp     w9,  w10, [x1, #24]
+        ldp     w9,  w10, [x0, #24]
+        
+        add     x1,  x1,  #32
+        add     x0,  x0,  #32
+
+        b.ge    ci_forward_loop
+
+        # 14 instructions from ci_forward_loop, so the loop body fits into one cache line
+
+ci_forward_loop_end:
+        adds    x2,  x18, #32           // the flags set here drive the b.ne after the four stores
+
+        stp     w3,  w4,  [x1], #32
+        stp     w5,  w6,  [x1, #-24]
+        stp     w7,  w8,  [x1, #-16]
+        stp     w9,  w10, [x1, #-8]
+
+        # Increased x18 by 32, but stored 32 bytes, so x2 contains exact number of bytes to be stored
+
+        # If this number is not zero, also copy remaining bytes
+        b.ne    ci_forward_lt_64
+        ret
+
+ci_backward:
+
+        # The overlapping case should be the rare one, so it is not worth optimizing
+
+        ands    x3,  x2,  #~4
+        # x3 is count aligned down by 2*jintSize
+        add     x0,  x0,  x2            // both pointers now address one past the end of their ranges
+        add     x1,  x1,  x2
+        sub     x3,  x3,  #8            // no flag update: the b.eq below still tests the ands above
+        # Skip loop if 0 or 1 jints
+        b.eq    ci_backward_loop_end
+
+        # x3 >= 0
+        # Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1) backward
+ci_backward_loop:
+        subs    x3,  x3,  #8
+        ldp     w4,  w5,  [x0, #-8]!
+        stp     w4,  w5,  [x1, #-8]!
+        b.ge    ci_backward_loop
+
+ci_backward_loop_end:
+        # Copy remaining 0 or 1 jints
+        tbz     x2,  #2,  ci_backward_finish
+        ldr     w3, [x0, #-4]
+        str     w3, [x1, #-4]
+
+ci_backward_finish:
+        ret
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/macroAssembler_linux_arm_32.cpp	2016-12-02 11:25:32.090138922 -0500
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "runtime/os.hpp"
+
+void MacroAssembler::breakpoint(AsmCondition cond) {  // emit a breakpoint, optionally predicated on cond
+  if (cond != al) {  // conditional form: emit a conditional runtime call to os::breakpoint
+    call(CAST_FROM_FN_PTR(address, os::breakpoint), relocInfo::runtime_call_type, cond);
+  } else {  // unconditional form: emit the fixed instruction word directly
+    emit_int32(0xe7f001f0);
+  }
+}
+
+// atomic_cas_bool
+//
+// Perform an atomic compare and exchange and return bool result in the Z flag
+// inputs:
+//         oldval value to compare to
+//         newval value to store if *(base+offset) == oldval
+//         base   base address of storage location
+//         offset offset added to base to form dest address
+//         tmpreg scratch for the ldrex path (noreg: LR is saved and used); ignored by the kernel helper path
+// output:
+//         Z flag is set on success
+
+void MacroAssembler::atomic_cas_bool(Register oldval, Register newval, Register base, int offset, Register tmpreg) {
+  if (VM_Version::supports_ldrex()) {
+    Register tmp_reg;
+    if (tmpreg == noreg) {
+      push(LR);                                         // no scratch supplied: save LR and borrow it
+      tmp_reg = LR;
+    } else {
+      tmp_reg = tmpreg;
+    }
+    assert_different_registers(tmp_reg, oldval, newval, base);
+    Label loop;
+    bind(loop);
+    ldrex(tmp_reg, Address(base, offset));
+    subs(tmp_reg, tmp_reg, oldval);                     // tmp_reg = current - oldval; eq iff they match
+    strex(tmp_reg, newval, Address(base, offset), eq);  // on match: try the store, status goes to tmp_reg
+    cmp(tmp_reg, 1, eq);                                // status 1 means the exclusive store failed
+    b(loop, eq);                                        // ... in which case retry
+    cmp(tmp_reg, 0);                                    // Z set iff tmp_reg == 0 (matched and stored)
+    if (tmpreg == noreg) {
+      pop(tmp_reg);                                     // restore the borrowed LR
+    }
+  } else if (VM_Version::supports_kuser_cmpxchg32()) {
+    // On armv5 platforms we must use the Linux kernel helper
+    // function for atomic cas operations since ldrex/strex is
+    // not supported.
+    //
+    // This is a special routine at a fixed address 0xffff0fc0
+    // with these arguments and results
+    //
+    // input:
+    //  r0 = oldval, r1 = newval, r2 = ptr, lr = return address
+    // output:
+    //  r0 = 0 carry set on success
+    //  r0 != 0 carry clear on failure
+    //
+    // r3, ip and flags are clobbered
+    //
+
+    // The helper is called exactly once here, so no retry label is needed.
+
+    push(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR));
+
+    Register tmp_reg = LR; // ignore the argument
+
+    assert_different_registers(tmp_reg, oldval, newval, base);
+
+    // Shuffle registers for kernel call
+    if (oldval != R0) {
+      if (newval == R0) {
+        mov(tmp_reg, newval);   // R0 is about to be overwritten: park newval in LR
+        newval = tmp_reg;
+      }
+      if (base == R0) {
+        mov(tmp_reg, base);     // likewise park base in LR
+        base = tmp_reg;
+      }
+      mov(R0, oldval);
+    }
+    if(newval != R1) {
+      if(base == R1) {
+        if(newval == R2) {
+          mov(tmp_reg, base);   // R2 still holds newval: move base out of R1 via LR
+          base = tmp_reg;
+        }
+        else {
+          mov(R2, base);        // R2 is free: base can go straight to its final register
+          base = R2;
+        }
+      }
+      mov(R1, newval);
+    }
+    if (base != R2)
+      mov(R2, base);
+
+    if (offset != 0)
+      add(R2, R2, offset);
+
+    mvn(R3, 0xf000);            // R3 = ~0xf000 = 0xffff0fff
+    mov(LR, PC);                // return address for the helper
+    sub(PC, R3, 0x3f);          // jump to 0xffff0fff - 0x3f = 0xffff0fc0
+    cmp (R0, 0);                // Z set iff the helper returned 0 (success)
+
+    pop(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR));
+  } else {
+    // Should never run on a platform so old that it does not have kernel helper
+    stop("Atomic cmpxchg32 unsupported on this platform");
+  }
+}
+
+// atomic_cas
+//
+// Perform an atomic compare and exchange and return previous value
+//
+// inputs:
+//         temp1, temp2 temporary registers (may be destroyed)
+//         oldval value to compare to
+//         newval value to store if *(base+offset) == oldval
+//         base   base address of storage location
+//         offset offset added to base to form dest address
+// output:
+//         returns previous value from *(base+offset) in R0
+
+void MacroAssembler::atomic_cas(Register temp1, Register temp2, Register oldval, Register newval, Register base, int offset) {
+  if (temp1 != R0) {
+    // try to read the previous value directly in R0
+    if (temp2 == R0) {
+      // R0 declared free
+      temp2 = temp1;
+      temp1 = R0;
+    } else if ((oldval != R0) && (newval != R0) && (base != R0)) {
+      // free, and scratched on return
+      temp1 = R0;
+    }
+  }
+  if (VM_Version::supports_ldrex()) {
+    Label loop;
+    assert_different_registers(temp1, temp2, oldval, newval, base);
+
+    bind(loop);
+    ldrex(temp1, Address(base, offset));              // temp1 = current value
+    cmp(temp1, oldval);
+    strex(temp2, newval, Address(base, offset), eq);  // on match: try the store, status goes to temp2
+    cmp(temp2, 1, eq);                                // status 1 means the exclusive store failed
+    b(loop, eq);                                      // ... in which case retry
+    if (temp1 != R0) {
+      mov(R0, temp1);                                 // deliver the previous value in R0
+    }
+  } else if (VM_Version::supports_kuser_cmpxchg32()) {
+    // On armv5 platforms we must use the Linux kernel helper
+    // function for atomic cas operations since ldrex/strex is
+    // not supported.
+    //
+    // This is a special routine at a fixed address 0xffff0fc0
+    //
+    // input:
+    //  r0 = oldval, r1 = newval, r2 = ptr, lr = return address
+    // output:
+    //  r0 = 0 carry set on success
+    //  r0 != 0 carry clear on failure
+    //
+    // r3, ip and flags are clobbered
+    //
+    Label done;
+    Label loop;
+
+    push(RegisterSet(R1, R4) | RegisterSet(R12) | RegisterSet(LR));
+
+    if ( oldval != R0 || newval != R1 || base != R2 ) {
+      push(oldval);   // route (oldval, newval, base) into (R0, R1, R2) through the stack,
+      push(newval);   // which works whatever registers the three arguments occupy
+      push(base);
+      pop(R2);
+      pop(R1);
+      pop(R0);
+    }
+
+    if (offset != 0) {
+      add(R2, R2, offset);
+    }
+
+    mov(R4, R0);            // keep oldval in R4; R0 is reused below
+    bind(loop);
+    ldr(R0, Address(R2));   // R0 = current value
+    cmp(R0, R4);
+    b(done, ne);            // mismatch: leave with the current value in R0
+    mvn(R12, 0xf000);       // R12 = ~0xf000 = 0xffff0fff
+    mov(LR, PC);            // return address for the helper
+    sub(PC, R12, 0x3f);     // jump to 0xffff0fff - 0x3f = 0xffff0fc0
+    b(loop, cc);            // carry clear: the helper reported failure, retry
+    mov(R0, R4);            // success: the previous value was oldval
+    bind(done);
+
+    pop(RegisterSet(R1, R4) | RegisterSet(R12) | RegisterSet(LR));
+  } else {
+    // Should never run on a platform so old that it does not have kernel helper
+    stop("Atomic cmpxchg32 unsupported on this platform");
+  }
+}
+
+// atomic_cas64
+//
+// Perform a 64 bit atomic compare and exchange and return previous value
+// as well as returning status in 'result' register
+//
+// inputs:
+//         oldval_lo, oldval_hi value to compare to
+//         newval_lo, newval_hi value to store if *(base+offset) == oldval
+//         base   base address of storage location
+//         offset offset added to base to form dest address
+// output:
+//         memval_lo, memval_hi, result
+//         returns previous value from *(base+offset) in memval_lo/hi
+//         returns status in result, 1==success, 0==failure
+//         C1 just uses status result
+//         VM code uses previous value returned in memval_lo/hi
+
+void MacroAssembler::atomic_cas64(Register memval_lo, Register memval_hi, Register result, Register oldval_lo, Register oldval_hi, Register newval_lo, Register newval_hi, Register base, int offset) {
+  if (VM_Version::supports_ldrexd()) {
+    Label loop;
+    assert_different_registers(memval_lo, memval_hi, result, oldval_lo,
+                               oldval_hi, newval_lo, newval_hi, base);
+    assert(memval_hi == memval_lo + 1 && memval_lo < R9, "cmpxchg_long: illegal registers");
+    assert(oldval_hi == oldval_lo + 1 && oldval_lo < R9, "cmpxchg_long: illegal registers");
+    assert(newval_hi == newval_lo + 1 && newval_lo < R9, "cmpxchg_long: illegal registers");
+    assert(result != R10, "cmpxchg_long: illegal registers");
+    assert(base != R10, "cmpxchg_long: illegal registers");
+
+    mov(result, 0);                                        // default status: failure
+    bind(loop);
+    ldrexd(memval_lo, Address(base, offset));
+    cmp(memval_lo, oldval_lo);
+    cmp(memval_hi, oldval_hi, eq);                         // high words compared only if low words match
+    strexd(result, newval_lo, Address(base, offset), eq);  // on match: try the store, status goes to result
+    rsbs(result, result, 1, eq);                           // result = 1 - status; Z set iff the store failed
+    b(loop, eq);                                           // store failed: retry
+  } else if (VM_Version::supports_kuser_cmpxchg64()) {
+    // On armv5 platforms we must use the Linux kernel helper
+    // function for atomic cas64 operations since ldrexd/strexd is
+    // not supported.
+    //
+    // This is a special routine at a fixed address 0xffff0f60
+    //
+    // input:
+    //  r0 = (long long *)oldval, r1 = (long long *)newval,
+    //  r2 = ptr, lr = return address
+    // output:
+    //  r0 = 0 carry set on success
+    //  r0 != 0 carry clear on failure
+    //
+    // r3, and flags are clobbered
+    //
+    Label done;
+    Label loop;
+
+    if (result != R12) {
+      push(R12);          // R12 holds the target address below; preserve it unless it is the result
+    }
+    push(RegisterSet(R10) | RegisterSet(LR));
+    mov(R10, SP);         // Save SP
+
+    bic(SP, SP, StackAlignmentInBytes - 1);  // align stack
+    push(RegisterSet(oldval_lo, oldval_hi)); // spill oldval: the helper takes its address
+    push(RegisterSet(newval_lo, newval_hi)); // spill newval: the helper takes its address
+
+    if ((offset != 0) || (base != R12)) {
+      add(R12, base, offset);                // R12 = address of the 64-bit location
+    }
+    push(RegisterSet(R0, R3));               // now: newval at SP+16, oldval at SP+24
+    bind(loop);
+    ldrd(memval_lo, Address(R12)); //current
+    ldrd(oldval_lo, Address(SP, 24));        // reload oldval from its stack slot
+    cmp(memval_lo, oldval_lo);
+    cmp(memval_hi, oldval_hi, eq);
+    pop(RegisterSet(R0, R3), ne);            // mismatch: restore R0-R3 ...
+    mov(result, 0, ne);                      // ... report failure ...
+    b(done, ne);                             // ... and leave with the current value in memval_lo/hi
+    // Setup for kernel call
+    mov(R2, R12);
+    add(R0, SP, 24);            // R0 == &oldval_lo
+    add(R1, SP, 16);            // R1 == &newval_lo
+    mvn(R3, 0xf000);            // call kernel helper at 0xffff0f60
+    mov(LR, PC);
+    sub(PC, R3, 0x9f);
+    b(loop, cc);                 // if Carry clear then oldval != current
+                                 // try again. Otherwise, return oldval
+    // Here on success
+    pop(RegisterSet(R0, R3));
+    mov(result, 1);
+    ldrd(memval_lo, Address(SP, 8));         // previous value == oldval, whose slot is now at SP+8
+    bind(done);
+    pop(RegisterSet(newval_lo, newval_hi));
+    pop(RegisterSet(oldval_lo, oldval_hi));
+    mov(SP, R10);                 // restore SP
+    pop(RegisterSet(R10) | RegisterSet(LR));
+    if (result != R12) {
+      pop(R12);
+    }
+  } else {
+    stop("Atomic cmpxchg64 unsupported on this platform");
+  }
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/orderAccess_linux_arm.inline.hpp	2016-12-02 11:25:37.386439262 -0500
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_ORDERACCESS_LINUX_ARM_INLINE_HPP
+#define OS_CPU_LINUX_ARM_VM_ORDERACCESS_LINUX_ARM_INLINE_HPP
+
+#include "runtime/orderAccess.hpp"
+#include "runtime/os.hpp"
+#include "vm_version_arm.hpp"
+
+// Implementation of class OrderAccess.
+// - we define the high level barriers below and use the general
+//   implementation in orderAccess.inline.hpp, with customizations
+//   on AARCH64 via the specialized_* template functions
+#define VM_HAS_GENERALIZED_ORDER_ACCESS 1
+
+// Memory Ordering on ARM is weak.
+//
+// Implement all 4 memory ordering barriers by DMB, since it is a
+// lighter version of DSB.
+// dmb_sy implies full system shareability domain. RD/WR access type.
+// dmb_st implies full system shareability domain. WR only access type.
+//
+// NOP on < ARMv6 (MP not supported)
+//
+// Non mcr instructions can be used if we build for armv7 or higher arch
+//    __asm__ __volatile__ ("dmb" : : : "memory");
+//    __asm__ __volatile__ ("dsb" : : : "memory");
+//
+// inline void _OrderAccess_dsb() {
+//    volatile intptr_t dummy = 0;
+//    if (os::is_MP()) {
+//      __asm__ volatile (
+//        "mcr p15, 0, %0, c7, c10, 4"
+//        : : "r" (dummy) : "memory");
+//   }
+// }
+
+inline static void dmb_sy() {   // full data memory barrier ("dmb sy" or its pre-ARMv7 CP15 form)
+   if (!os::is_MP()) {          // no barrier is issued unless os::is_MP()
+     return;
+   }
+#ifdef AARCH64
+   __asm__ __volatile__ ("dmb sy" : : : "memory");
+#else
+   if (VM_Version::arm_arch() >= 7) {   // run-time check of the architecture level
+#ifdef __thumb__
+     __asm__ volatile (
+     "dmb sy": : : "memory");
+#else
+     __asm__ volatile (
+     ".word 0xF57FF050 | 0xf" : : : "memory");   // ARM-mode encoding of "dmb sy", emitted as a raw word
+#endif
+   } else {
+     intptr_t zero = 0;
+     __asm__ volatile (
+       "mcr p15, 0, %0, c7, c10, 5"               // CP15 c7, c10, 5: data memory barrier operation
+       : : "r" (zero) : "memory");
+   }
+#endif
+}
+
+inline static void dmb_st() {   // store barrier ("dmb st"); pre-ARMv7 falls back to the CP15 barrier
+   if (!os::is_MP()) {          // no barrier is issued unless os::is_MP()
+     return;
+   }
+#ifdef AARCH64
+   __asm__ __volatile__ ("dmb st" : : : "memory");
+#else
+   if (VM_Version::arm_arch() >= 7) {   // run-time check of the architecture level
+#ifdef __thumb__
+     __asm__ volatile (
+     "dmb st": : : "memory");
+#else
+     __asm__ volatile (
+     ".word 0xF57FF050 | 0xe" : : : "memory");   // ARM-mode encoding of "dmb st", emitted as a raw word
+#endif
+   } else {
+     intptr_t zero = 0;
+     __asm__ volatile (
+       "mcr p15, 0, %0, c7, c10, 5"               // same CP15 operation as dmb_sy uses before ARMv7
+       : : "r" (zero) : "memory");
+   }
+#endif
+}
+
+// Load-Load/Store barrier: "dmb ld" on AARCH64, the full dmb_sy() barrier on 32-bit ARM
+inline static void dmb_ld() {
+#ifdef AARCH64
+   if (!os::is_MP()) {   // no barrier is issued unless os::is_MP()
+     return;
+   }
+   __asm__ __volatile__ ("dmb ld" : : : "memory");
+#else
+   dmb_sy();             // dmb_sy() performs its own os::is_MP() check
+#endif
+}
+
+
+inline void OrderAccess::loadload()   { dmb_ld(); }  // dmb_ld: "dmb ld" on AARCH64, dmb_sy() otherwise
+inline void OrderAccess::loadstore()  { dmb_ld(); }  // same barrier as loadload
+inline void OrderAccess::acquire()    { dmb_ld(); }  // same barrier as loadload
+inline void OrderAccess::storestore() { dmb_st(); }  // store barrier
+inline void OrderAccess::storeload()  { dmb_sy(); }  // full barrier
+inline void OrderAccess::release()    { dmb_sy(); }  // full barrier
+inline void OrderAccess::fence()      { dmb_sy(); }  // full barrier
+
+// specializations for Aarch64
+// TODO-AARCH64: evaluate effectiveness of ldar*/stlr* implementations compared to 32-bit ARM approach
+
+#ifdef AARCH64
+
+template<> inline jbyte    OrderAccess::specialized_load_acquire<jbyte>(volatile jbyte*   p) {
+  volatile jbyte result;
+  __asm__ volatile(
+    "ldarb %w[res], [%[ptr]]"   // AArch64 load-acquire of one byte from *p into the 32-bit view of res
+    : [res] "=&r" (result)      // early-clobber output register
+    : [ptr] "r" (p)
+    : "memory");                // compiler-level memory clobber
+  return result;
+}
+
+template<> inline jshort   OrderAccess::specialized_load_acquire<jshort>(volatile jshort*  p) {
+  volatile jshort result;
+  __asm__ volatile(
+    "ldarh %w[res], [%[ptr]]"   // AArch64 load-acquire of one halfword from *p into the 32-bit view of res
+    : [res] "=&r" (result)      // early-clobber output register
+    : [ptr] "r" (p)
+    : "memory");                // compiler-level memory clobber
+  return result;
+}
+
+template<> inline jint     OrderAccess::specialized_load_acquire<jint>(volatile jint*    p) {
+  volatile jint result;
+  __asm__ volatile(
+    "ldar %w[res], [%[ptr]]"    // AArch64 load-acquire of a 32-bit word from *p (%w selects the w register)
+    : [res] "=&r" (result)      // early-clobber output register
+    : [ptr] "r" (p)
+    : "memory");                // compiler-level memory clobber
+  return result;
+}
+
+template<> inline jfloat   OrderAccess::specialized_load_acquire<jfloat>(volatile jfloat*  p) {
+  return jfloat_cast(specialized_load_acquire((volatile jint*)p));  // load through the jint path, then jfloat_cast
+}
+
+// This is implicit as jlong and intptr_t are both "long int"
+//template<> inline jlong    OrderAccess::specialized_load_acquire(volatile jlong*   p) {
+//  return (volatile jlong)specialized_load_acquire((volatile intptr_t*)p);
+//}
+
+template<> inline intptr_t OrderAccess::specialized_load_acquire<intptr_t>(volatile intptr_t*   p) {
+  volatile intptr_t result;
+  __asm__ volatile(
+    "ldar %[res], [%[ptr]]"     // AArch64 load-acquire from *p into the full-width register for res
+    : [res] "=&r" (result)      // early-clobber output register
+    : [ptr] "r" (p)
+    : "memory");                // compiler-level memory clobber
+  return result;
+}
+
+template<> inline jdouble  OrderAccess::specialized_load_acquire<jdouble>(volatile jdouble* p) {
+  return jdouble_cast(specialized_load_acquire((volatile intptr_t*)p));  // load through the intptr_t path, then jdouble_cast
+}
+
+
+template<> inline void     OrderAccess::specialized_release_store<jbyte>(volatile jbyte*   p, jbyte   v) {
+  __asm__ volatile(
+    "stlrb %w[val], [%[ptr]]"   // AArch64 store-release of the low byte of v to *p
+    :                           // no outputs
+    : [ptr] "r" (p), [val] "r" (v)
+    : "memory");                // compiler-level memory clobber
+}
+
+template<> inline void     OrderAccess::specialized_release_store<jshort>(volatile jshort*  p, jshort  v) {
+  __asm__ volatile(
+    "stlrh %w[val], [%[ptr]]"   // AArch64 store-release of the low halfword of v to *p
+    :                           // no outputs
+    : [ptr] "r" (p), [val] "r" (v)
+    : "memory");                // compiler-level memory clobber
+}
+
+template<> inline void     OrderAccess::specialized_release_store<jint>(volatile jint*    p, jint    v) {
+  __asm__ volatile(
+    "stlr %w[val], [%[ptr]]"    // AArch64 store-release of the 32-bit value v to *p
+    :                           // no outputs
+    : [ptr] "r" (p), [val] "r" (v)
+    : "memory");                // compiler-level memory clobber
+}
+
+template<> inline void     OrderAccess::specialized_release_store<jlong>(volatile jlong*   p, jlong   v) {
+  __asm__ volatile(
+    "stlr %[val], [%[ptr]]"     // AArch64 store-release of v to *p using the full-width register
+    :                           // no outputs
+    : [ptr] "r" (p), [val] "r" (v)
+    : "memory");                // compiler-level memory clobber
+}
+#endif // AARCH64
+
+#endif // OS_CPU_LINUX_ARM_VM_ORDERACCESS_LINUX_ARM_INLINE_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/os_linux_arm.cpp	2016-12-02 11:25:44.118821040 -0500
@@ -0,0 +1,804 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+// no precompiled headers
+#include "assembler_arm.inline.hpp"
+#include "classfile/classLoader.hpp"
+#include "classfile/systemDictionary.hpp"
+#include "classfile/vmSymbols.hpp"
+#include "code/icBuffer.hpp"
+#include "code/vtableStubs.hpp"
+#include "interpreter/interpreter.hpp"
+#include "jvm_linux.h"
+#include "memory/allocation.inline.hpp"
+#include "nativeInst_arm.hpp"
+#include "os_share_linux.hpp"
+#include "prims/jniFastGetField.hpp"
+#include "prims/jvm.h"
+#include "prims/jvm_misc.hpp"
+#include "runtime/arguments.hpp"
+#include "runtime/extendedPC.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/java.hpp"
+#include "runtime/javaCalls.hpp"
+#include "runtime/mutexLocker.hpp"
+#include "runtime/osThread.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/timer.hpp"
+#include "utilities/events.hpp"
+#include "utilities/vmError.hpp"
+
+// put OS-includes here
+# include <sys/types.h>
+# include <sys/mman.h>
+# include <pthread.h>
+# include <signal.h>
+# include <errno.h>
+# include <dlfcn.h>
+# include <stdlib.h>
+# include <stdio.h>
+# include <unistd.h>
+# include <sys/resource.h>
+# include <pthread.h>
+# include <sys/stat.h>
+# include <sys/time.h>
+# include <sys/utsname.h>
+# include <sys/socket.h>
+# include <sys/wait.h>
+# include <pwd.h>
+# include <poll.h>
+# include <ucontext.h>
+# include <fpu_control.h>
+# include <asm/ptrace.h>
+
+#define SPELL_REG_SP  "sp"
+
+// SPELL_REG_FP is deliberately not defined for thumb: fp is not safe to use there, and leaving it undefined makes sure we never fetch it.
+#ifndef __thumb__
+#define SPELL_REG_FP  AARCH64_ONLY("x29") NOT_AARCH64("fp")
+#endif
+
+address os::current_stack_pointer() {
+  register address sp __asm__ (SPELL_REG_SP);  // local bound to the register named by SPELL_REG_SP ("sp")
+  return sp;                                   // i.e. the stack pointer value observed inside this function
+}
+
+char* os::non_memory_address_word() {
+  char* const never_reserved = (char*) -1;  // must never look like an address returned by reserve_memory
+  return never_reserved;
+}
+
+void os::initialize_thread(Thread* thr) {  // thr is unused
+  // Nothing to do
+}
+
+#ifdef AARCH64
+
+#define arm_pc pc
+#define arm_sp sp
+#define arm_fp regs[29]
+#define arm_r0 regs[0]
+#define ARM_REGS_IN_CONTEXT  31
+
+#else
+
+#if NGREG == 16
+// These definitions are based on the observation that, up to a
+// certain version of GCC, mcontext_t was defined as a structure
+// containing a gregs[NGREG] array with 16 elements.
+// In later GCC versions mcontext_t was redefined as struct sigcontext,
+// and the NGREG constant was changed to 18.
+#define arm_pc gregs[15]
+#define arm_sp gregs[13]
+#define arm_fp gregs[11]
+#define arm_r0 gregs[0]
+#endif
+
+#define ARM_REGS_IN_CONTEXT  16
+
+#endif // AARCH64
+
+address os::Linux::ucontext_get_pc(const ucontext_t* uc) {  // uc is dereferenced without a NULL check
+  return (address)uc->uc_mcontext.arm_pc;                   // arm_pc may be a macro for the pc slot (see the arm_* defines)
+}
+
+void os::Linux::ucontext_set_pc(ucontext_t* uc, address pc) {  // uc is dereferenced without a NULL check
+  uc->uc_mcontext.arm_pc = (uintx)pc;                          // overwrite the saved pc in the context
+}
+
+intptr_t* os::Linux::ucontext_get_sp(const ucontext_t* uc) {  // uc is dereferenced without a NULL check
+  return (intptr_t*)uc->uc_mcontext.arm_sp;                   // saved stack pointer from the context
+}
+
+intptr_t* os::Linux::ucontext_get_fp(const ucontext_t* uc) {  // uc is dereferenced without a NULL check
+  return (intptr_t*)uc->uc_mcontext.arm_fp;                   // saved frame pointer slot, returned unadjusted
+}
+
+bool is_safe_for_fp(address pc) {
+#ifdef __thumb__
+  // In a Thumb build the frame pointer is trusted only when pc lies inside a
+  // code cache blob. For Thumb C frames, given an fp, there is no known way
+  // to access the frame contents, so those are reported as unsafe.
+  const bool pc_in_code_cache = (CodeCache::find_blob(pc) != NULL);
+  return pc_in_code_cache;
+#else
+  // os::address_is_in_vm() is deliberately not used here: it ends up in a
+  // dladdr call, and calling any libc function during os::get_native_stack()
+  // can result in a deadlock if JFR is enabled. For now all pc's are
+  // accepted; the other frame sanity checks in shared code have, to date,
+  // been sufficient for other platforms.
+  //return os::address_is_in_vm(pc);
+  return true;
+#endif
+}
+
+// For Forte Analyzer AsyncGetCallTrace profiling support - thread
+// is currently interrupted by SIGPROF.
+// os::Solaris::fetch_frame_from_ucontext() tries to skip nested signal
+// frames. Currently we don't do that on Linux, so it's the same as
+// os::fetch_frame_from_context().
+ExtendedPC os::Linux::fetch_frame_from_ucontext(Thread* thread,
+  const ucontext_t* uc, intptr_t** ret_sp, intptr_t** ret_fp) {
+
+  assert(thread != NULL, "just checking");  // thread is only checked here, not otherwise used
+  assert(ret_sp != NULL, "just checking");
+  assert(ret_fp != NULL, "just checking");
+
+  return os::fetch_frame_from_context(uc, ret_sp, ret_fp);  // plain delegation
+}
+
+ExtendedPC os::fetch_frame_from_context(const void* ucVoid,
+                    intptr_t** ret_sp, intptr_t** ret_fp) {
+
+  const ucontext_t* uc = (const ucontext_t*)ucVoid;
+
+  // Without a context there is nothing to report: hand back an empty
+  // ExtendedPC (for return value checking) together with NULL sp/fp.
+  if (uc == NULL) {
+    if (ret_sp != NULL) *ret_sp = (intptr_t *)NULL;
+    if (ret_fp != NULL) *ret_fp = (intptr_t *)NULL;
+    return ExtendedPC(NULL);
+  }
+
+  const address pc = os::Linux::ucontext_get_pc(uc);
+  if (ret_sp != NULL) {
+    *ret_sp = os::Linux::ucontext_get_sp(uc);
+  }
+
+  if (ret_fp != NULL) {
+    intptr_t* frame_ptr = os::Linux::ucontext_get_fp(uc);
+#ifndef __thumb__
+    // A pc outside the code cache means a C frame, whose fp needs adjusting.
+    if (CodeCache::find_blob(pc) == NULL) {
+      frame_ptr += os::C_frame_offset;
+    }
+#endif
+    // When stack walking from this pc is dangerous, report a NULL FP so that
+    // the frame created from it will not be walked; otherwise pass on the
+    // (possibly adjusted) FP, which may be needed by the caller.
+    if (!is_safe_for_fp(pc)) {
+      frame_ptr = (intptr_t *)NULL;
+    }
+    *ret_fp = frame_ptr;
+  }
+  return ExtendedPC(pc);
+}
+
+frame os::fetch_frame_from_context(const void* ucVoid) {
+  // Both out-parameters are always written by the three-argument overload.
+  intptr_t *ctx_sp, *ctx_fp;
+  const ExtendedPC ctx_pc = fetch_frame_from_context(ucVoid, &ctx_sp, &ctx_fp);
+  return frame(ctx_sp, ctx_fp, ctx_pc.pc());
+}
+
+frame os::get_sender_for_C_frame(frame* fr) {
+#ifdef __thumb__
+  // Nothing can be reliably recovered from a thumb C frame: return an empty frame.
+  return frame();
+#else
+  // Build the sender frame; its fp is NULL unless the sender pc is safe for fp use.
+  const address sender_pc = fr->sender_pc();
+  intptr_t* const sender_fp = is_safe_for_fp(sender_pc)
+                                ? fr->link() + os::C_frame_offset
+                                : (intptr_t *)NULL;
+  return frame(fr->sender_sp(), sender_fp, sender_pc);
+#endif
+}
+
+//
+// This actually returns two frames up. It does not return os::current_frame(),
+// which is the actual current frame. Nor does it return os::get_native_stack(),
+// which is the caller. It returns whoever called os::get_native_stack(). Not
+// very intuitive, but consistent with how this API is implemented on other
+// platforms.
+//
+frame os::current_frame() {
+#ifdef __thumb__
+  // We can't reliably get anything from a thumb C frame.
+  return frame();
+#else
+  register intptr_t* fp __asm__ (SPELL_REG_FP);  // local bound to the register named by SPELL_REG_FP
+  // fp is for os::current_frame. We want the fp for our caller.
+  frame myframe((intptr_t*)os::current_stack_pointer(), fp + os::C_frame_offset,
+                 CAST_FROM_FN_PTR(address, os::current_frame));  // pc is this function's entry address
+  frame caller_frame = os::get_sender_for_C_frame(&myframe);     // first step up
+
+  if (os::is_first_C_frame(&caller_frame)) {
+    // stack is not walkable
+    // Assert below was added because it does not seem like this can ever happen.
+    // How can this frame ever be the first C frame since it is called from C code?
+    // If it does ever happen, undo the assert and comment here on when/why it happens.
+    assert(false, "this should never happen");
+    return frame();  // reached only when asserts are compiled out
+  }
+
+  // return frame for our caller's caller
+  return os::get_sender_for_C_frame(&caller_frame);  // second step up
+#endif
+}
+
+#ifndef AARCH64  // the two VFP probe addresses exist only in 32-bit ARM builds
+extern "C" address check_vfp_fault_instr;
+extern "C" address check_vfp3_32_fault_instr;
+
+address check_vfp_fault_instr = NULL;      // starts out NULL; not assigned anywhere in this part of the file
+address check_vfp3_32_fault_instr = NULL;  // likewise
+#endif // !AARCH64
+extern "C" address check_simd_fault_instr;  // like the VFP ones, compared with si_addr on SIGILL in JVM_handle_linux_signal
+address check_simd_fault_instr = NULL;
+
+// Utility functions
+
+extern "C" int JVM_handle_linux_signal(int sig, siginfo_t* info,
+                                       void* ucVoid, int abort_if_unrecognized) {
+  ucontext_t* uc = (ucontext_t*) ucVoid;
+
+  Thread* t = Thread::current_or_null_safe();
+
+  // Must do this before SignalHandlerMark, if crash protection installed we will longjmp away
+  // (no destructors can be run)
+  os::WatcherThreadCrashProtection::check_crash_protection(sig, t);
+
+  SignalHandlerMark shm(t);
+
+  if (sig == SIGILL &&
+      ((info->si_addr == (caddr_t)check_simd_fault_instr)
+       NOT_AARCH64(|| info->si_addr == (caddr_t)check_vfp_fault_instr)
+       NOT_AARCH64(|| info->si_addr == (caddr_t)check_vfp3_32_fault_instr))) {
+    // skip faulty instruction + instruction that sets return value to
+    // success and set return value to failure.
+    os::Linux::ucontext_set_pc(uc, (address)info->si_addr + 8);
+    uc->uc_mcontext.arm_r0 = 0;
+    return true;
+  }
+
+  // Note: it's not uncommon that JNI code uses signal/sigset to install
+  // then restore certain signal handler (e.g. to temporarily block SIGPIPE,
+  // or have a SIGILL handler when detecting CPU type). When that happens,
+  // JVM_handle_linux_signal() might be invoked with junk info/ucVoid. To
+  // avoid unnecessary crash when libjsig is not preloaded, try handle signals
+  // that do not require siginfo/ucontext first.
+
+  if (sig == SIGPIPE || sig == SIGXFSZ) {
+    // allow chained handler to go first
+    if (os::Linux::chained_handler(sig, info, ucVoid)) {
+      return true;
+    } else {
+      // Ignoring SIGPIPE/SIGXFSZ - see bugs 4229104 or 6499219
+      return true;
+    }
+  }
+
+  JavaThread* thread = NULL;
+  VMThread* vmthread = NULL;
+  if (os::Linux::signal_handlers_are_installed) {
+    if (t != NULL ){
+      if(t->is_Java_thread()) {
+        thread = (JavaThread*)t;
+      }
+      else if(t->is_VM_thread()){
+        vmthread = (VMThread *)t;
+      }
+    }
+  }
+
+  address stub = NULL;
+  address pc = NULL;
+  bool unsafe_access = false;
+
+  if (info != NULL && uc != NULL && thread != NULL) {
+    pc = (address) os::Linux::ucontext_get_pc(uc);
+
+    // Handle ALL stack overflow variations here
+    if (sig == SIGSEGV) {
+      address addr = (address) info->si_addr;
+
+      if (StubRoutines::is_safefetch_fault(pc)) {
+        os::Linux::ucontext_set_pc(uc, StubRoutines::continuation_for_safefetch_fault(pc));
+        return 1;
+      }
+      // check if fault address is within thread stack
+      if (addr < thread->stack_base() &&
+          addr >= thread->stack_base() - thread->stack_size()) {
+        // stack overflow
+        if (thread->in_stack_yellow_reserved_zone(addr)) {
+          thread->disable_stack_yellow_reserved_zone();
+          if (thread->thread_state() == _thread_in_Java) {
+            // Throw a stack overflow exception.  Guard pages will be reenabled
+            // while unwinding the stack.
+            stub = SharedRuntime::continuation_for_implicit_exception(thread, pc, SharedRuntime::STACK_OVERFLOW);
+          } else {
+            // Thread was in the vm or native code.  Return and try to finish.
+            return 1;
+          }
+        } else if (thread->in_stack_red_zone(addr)) {
+          // Fatal red zone violation.  Disable the guard pages and fall through
+          // to handle_unexpected_exception way down below.
+          thread->disable_stack_red_zone();
+          tty->print_raw_cr("An irrecoverable stack overflow has occurred.");
+        } else {
+          // Accessing stack address below sp may cause SEGV if current
+          // thread has MAP_GROWSDOWN stack. This should only happen when
+          // current thread was created by user code with MAP_GROWSDOWN flag
+          // and then attached to VM. See notes in os_linux.cpp.
+          if (thread->osthread()->expanding_stack() == 0) {
+             thread->osthread()->set_expanding_stack();
+             if (os::Linux::manually_expand_stack(thread, addr)) {
+               thread->osthread()->clear_expanding_stack();
+               return 1;
+             }
+             thread->osthread()->clear_expanding_stack();
+          } else {
+             fatal("recursive segv. expanding stack.");
+          }
+        }
+      }
+    }
+
+    if (thread->thread_state() == _thread_in_Java) {
+      // Java thread running in Java code => find exception handler if any
+      // a fault inside compiled code, the interpreter, or a stub
+
+      if (sig == SIGSEGV && os::is_poll_address((address)info->si_addr)) {
+        stub = SharedRuntime::get_poll_stub(pc);
+      } else if (sig == SIGBUS) {
+        // BugId 4454115: A read from a MappedByteBuffer can fault
+        // here if the underlying file has been truncated.
+        // Do not crash the VM in such a case.
+        CodeBlob* cb = CodeCache::find_blob_unsafe(pc);
+        CompiledMethod* nm = (cb != NULL) ? cb->as_compiled_method_or_null() : NULL;
+        if (nm != NULL && nm->has_unsafe_access()) {
+          unsafe_access = true;
+        }
+      } else if (sig == SIGSEGV && !MacroAssembler::needs_explicit_null_check((intptr_t)info->si_addr)) {
+          // Determination of interpreter/vtable stub/compiled code null exception
+          CodeBlob* cb = CodeCache::find_blob_unsafe(pc);
+          if (cb != NULL) {
+            stub = SharedRuntime::continuation_for_implicit_exception(thread, pc, SharedRuntime::IMPLICIT_NULL);
+          }
+      } else if (sig == SIGILL && *(int *)pc == NativeInstruction::zombie_illegal_instruction) {
+        // Zombie
+        stub = SharedRuntime::get_handle_wrong_method_stub();
+      }
+    } else if (thread->thread_state() == _thread_in_vm &&
+               sig == SIGBUS && thread->doing_unsafe_access()) {
+        unsafe_access = true;
+    }
+
+    // jni_fast_Get<Primitive>Field can trap at certain pc's if a GC kicks in
+    // and the heap gets shrunk before the field access.
+    if (sig == SIGSEGV || sig == SIGBUS) {
+      address addr = JNI_FastGetField::find_slowcase_pc(pc);
+      if (addr != (address)-1) {
+        stub = addr;
+      }
+    }
+
+    // Check to see if we caught the safepoint code in the
+    // process of write protecting the memory serialization page.
+    // It write enables the page immediately after protecting it
+    // so we can just return to retry the write.
+    if (sig == SIGSEGV && os::is_memory_serialize_page(thread, (address) info->si_addr)) {
+      // Block current thread until the memory serialize page permission restored.
+      os::block_on_serialize_page_trap();
+      return true;
+    }
+  }
+
+  if (unsafe_access && stub == NULL) {
+    // it can be an unsafe access and we haven't found
+    // any other suitable exception reason,
+    // so assume it is an unsafe access.
+    address next_pc = pc + Assembler::InstructionSize;
+#ifdef __thumb__
+    if (uc->uc_mcontext.arm_cpsr & PSR_T_BIT) {
+      next_pc = (address)((intptr_t)next_pc | 0x1);
+    }
+#endif
+
+    stub = SharedRuntime::handle_unsafe_access(thread, next_pc);
+  }
+
+  if (stub != NULL) {
+#ifdef __thumb__
+    if (uc->uc_mcontext.arm_cpsr & PSR_T_BIT) {
+      intptr_t p = (intptr_t)pc | 0x1;
+      pc = (address)p;
+
+      // Clear Thumb mode bit if we're redirected into the ARM ISA based code
+      if (((intptr_t)stub & 0x1) == 0) {
+        uc->uc_mcontext.arm_cpsr &= ~PSR_T_BIT;
+      }
+    } else {
+      // No Thumb2 compiled stubs are triggered from ARM ISA compiled JIT'd code today.
+      // The support needs to be added if that changes
+      assert((((intptr_t)stub & 0x1) == 0), "can't return to Thumb code");
+    }
+#endif
+
+    // save all thread context in case we need to restore it
+    if (thread != NULL) thread->set_saved_exception_pc(pc);
+
+    os::Linux::ucontext_set_pc(uc, stub);
+    return true;
+  }
+
+  // signal-chaining
+  if (os::Linux::chained_handler(sig, info, ucVoid)) {
+     return true;
+  }
+
+  if (!abort_if_unrecognized) {
+    // caller wants another chance, so give it to him
+    return false;
+  }
+
+  if (pc == NULL && uc != NULL) {
+    pc = os::Linux::ucontext_get_pc(uc);
+  }
+
+  // unmask current signal
+  sigset_t newset;
+  sigemptyset(&newset);
+  sigaddset(&newset, sig);
+  sigprocmask(SIG_UNBLOCK, &newset, NULL);
+
+  VMError::report_and_die(t, sig, pc, info, ucVoid);
+
+  ShouldNotReachHere();
+  return false;
+}
+
+void os::Linux::init_thread_fpu_state(void) {
+  os::setup_fpu();  // per-thread FPU initialization is just the shared os::setup_fpu()
+}
+
+int os::Linux::get_fpu_control_word(void) {
+  return 0;  // always reports 0; set_fpu_control_word() ignores its argument as well
+}
+
+void os::Linux::set_fpu_control_word(int fpu_control) {
+  // Nothing to do: fpu_control is ignored (get_fpu_control_word() always returns 0)
+}
+
+void os::setup_fpu() {
+#ifdef AARCH64
+  __asm__ volatile ("msr fpcr, xzr");  // write the zero register (xzr) to FPCR
+#else
+#if !defined(__SOFTFP__) && defined(__VFP_FP__)
+  // Turn on IEEE-754 compliant VFP mode: load 0 into r0 and write it to FPSCR
+  __asm__ volatile (
+    "mov %%r0, #0;"
+    "fmxr fpscr, %%r0"
+    : /* no output */ : /* no input */ : "r0" /* r0 is declared clobbered */
+  );
+#endif // !__SOFTFP__ && __VFP_FP__ (otherwise nothing is emitted)
+#endif // AARCH64
+}
+
+bool os::is_allocatable(size_t bytes) {
+  return true;  // bytes is not examined: every request size is reported as allocatable
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// thread stack
+// Minimum allowed stack sizes: 48K per thread kind, plus 4K where DEBUG_ONLY() is active.
+size_t os::Posix::_compiler_thread_min_stack_allowed = (48 DEBUG_ONLY(+ 4)) * K;
+size_t os::Posix::_java_thread_min_stack_allowed = (48 DEBUG_ONLY(+ 4)) * K;
+size_t os::Posix::_vm_internal_thread_min_stack_allowed = (48 DEBUG_ONLY(+ 4)) * K;
+
+// return default stack size for thr_type
+size_t os::Posix::default_stack_size(os::ThreadType thr_type) {
+  // Compiler threads need the larger 2M stack; all other thread types get 512K.
+  if (thr_type == os::compiler_thread) return 2 * M;
+  return 512 * K;
+}
+
+size_t os::Linux::default_guard_size(os::ThreadType thr_type) {
+  // Java threads have HotSpot guard pages, so skip the costly glibc guard page.
+  if (thr_type == java_thread) return 0;
+  return page_size();
+}
+
+// Java thread:
+//
+//   Low memory addresses
+//    +------------------------+
+//    |                        |\  JavaThread created by VM does not have glibc
+//    |    glibc guard page    | - guard, attached Java thread usually has
+//    |                        |/  1 page glibc guard.
+// P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
+//    |                        |\
+//    |  HotSpot Guard Pages   | - red and yellow pages
+//    |                        |/
+//    +------------------------+ JavaThread::stack_yellow_zone_base()
+//    |                        |\
+//    |      Normal Stack      | -
+//    |                        |/
+// P2 +------------------------+ Thread::stack_base()
+//
+// Non-Java thread:
+//
+//   Low memory addresses
+//    +------------------------+
+//    |                        |\
+//    |  glibc guard page      | - usually 1 page
+//    |                        |/
+// P1 +------------------------+ Thread::stack_base() - Thread::stack_size()
+//    |                        |\
+//    |      Normal Stack      | -
+//    |                        |/
+// P2 +------------------------+ Thread::stack_base()
+//
+// ** P1 (aka bottom) and size (P2 = P1 + size) are the address and stack size returned from
+//    pthread_attr_getstack()
+
+static void current_stack_region(address * bottom, size_t * size) {
+  // Reports the lowest address (*bottom) and the byte size (*size) of the
+  // calling thread's stack.
+  if (!os::Linux::is_initial_thread()) {
+    pthread_attr_t thread_attr;
+    const int err = pthread_getattr_np(pthread_self(), &thread_attr);
+
+    // The JVM must know the exact stack location, so any failure is fatal.
+    if (err == ENOMEM) {
+      vm_exit_out_of_memory(0, OOM_MMAP_ERROR, "pthread_getattr_np");
+    } else if (err != 0) {
+      fatal("pthread_getattr_np failed");
+    }
+
+    const int stack_err = pthread_attr_getstack(&thread_attr, (void **)bottom, size);
+    if (stack_err != 0) {
+      fatal("Can not locate current stack attributes!");
+    }
+
+    pthread_attr_destroy(&thread_attr);
+  } else {
+    // pthread_getattr_np() may return a bogus value for the initial thread,
+    // so ask os::Linux for its stack bounds instead.
+    *bottom = os::Linux::initial_thread_stack_bottom();
+    *size   = os::Linux::initial_thread_stack_size();
+  }
+
+  assert(*bottom <= os::current_stack_pointer() &&
+         os::current_stack_pointer() < *bottom + *size, "just checking");
+}
+
+address os::current_stack_base() {
+  address region_low;
+  size_t region_bytes;
+  current_stack_region(&region_low, &region_bytes);
+  return region_low + region_bytes;  // high end of the stack region
+}
+
+size_t os::current_stack_size() {
+  // The reported size covers the normal stack plus the HotSpot guard pages.
+  address region_low;
+  size_t region_bytes;
+  current_stack_region(&region_low, &region_bytes);
+  return region_bytes;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+// helper functions for fatal error handler
+
+void os::print_context(outputStream *st, const void *context) {
+  // Dumps the saved registers, the top of the stack and the code around pc.
+  if (context == NULL) return;
+  const ucontext_t *uc = (const ucontext_t*)context;
+  st->print_cr("Registers:");
+  intx* saved_regs = (intx*)&uc->uc_mcontext.arm_r0;
+  for (int idx = 0; idx < ARM_REGS_IN_CONTEXT; ++idx) {
+    st->print_cr("  %-3s = " INTPTR_FORMAT, as_Register(idx)->name(), saved_regs[idx]);
+  }
+#define U64_FORMAT "0x%016llx"
+#ifdef AARCH64
+  st->print_cr("  %-3s = " U64_FORMAT, "sp", uc->uc_mcontext.sp);
+  st->print_cr("  %-3s = " U64_FORMAT, "pc", uc->uc_mcontext.pc);
+  st->print_cr("  %-3s = " U64_FORMAT, "pstate", uc->uc_mcontext.pstate);
+#else
+  // then the flag/status register (cpsr)
+  st->print_cr("  %-4s = 0x%08lx", "cpsr",uc->uc_mcontext.arm_cpsr);
+#endif
+  st->cr();
+
+  intptr_t *stack_top = (intptr_t *)os::Linux::ucontext_get_sp(uc);
+  st->print_cr("Top of Stack: (sp=" INTPTR_FORMAT ")", p2i(stack_top));
+  print_hex_dump(st, (address)stack_top, (address)(stack_top + 8*sizeof(intptr_t)), sizeof(intptr_t));
+  st->cr();
+
+  // Reading memory near pc may be unsafe (pc can point to garbage, e.g. when
+  // an nmethod entry point is corrupted), so this dump is deliberately last:
+  // everything above has already been printed if it goes wrong.
+  address code_pc = os::Linux::ucontext_get_pc(uc);
+  st->print_cr("Instructions: (pc=" INTPTR_FORMAT ")", p2i(code_pc));
+  print_hex_dump(st, code_pc - 32, code_pc + 32, Assembler::InstructionSize);
+}
+
+void os::print_register_info(outputStream *st, const void *context) {
+  // For each saved register: print its value, then print_location() of that value.
+  if (context == NULL) return;
+  const ucontext_t *uc = (const ucontext_t*)context;
+  intx* saved_regs = (intx*)&uc->uc_mcontext.arm_r0;
+  st->print_cr("Register to memory mapping:");
+  st->cr();
+  for (int idx = 0; idx < ARM_REGS_IN_CONTEXT; ++idx) {
+    const intx value = saved_regs[idx];
+    st->print_cr("  %-3s = " INTPTR_FORMAT, as_Register(idx)->name(), value);
+    print_location(st, value);
+    st->cr();
+  }
+#ifdef AARCH64
+  st->print_cr("  %-3s = " U64_FORMAT, "pc", uc->uc_mcontext.pc);
+  print_location(st, uc->uc_mcontext.pc);
+  st->cr();
+#endif
+  st->cr();
+}
+
+
+#ifndef AARCH64
+
+typedef jlong cmpxchg_long_func_t(jlong, jlong, volatile jlong*);
+// Starts out as the bootstrap routine below, which swaps in the generated stub.
+cmpxchg_long_func_t* os::atomic_cmpxchg_long_func = os::atomic_cmpxchg_long_bootstrap;
+
+jlong os::atomic_cmpxchg_long_bootstrap(jlong compare_value, jlong exchange_value, volatile jlong* dest) {
+  // Prefer the generated stub: once it exists, install it and delegate to it.
+  cmpxchg_long_func_t* stub = CAST_TO_FN_PTR(cmpxchg_long_func_t*, StubRoutines::atomic_cmpxchg_long_entry());
+  if (stub != NULL) {
+    os::atomic_cmpxchg_long_func = stub;
+    return (*stub)(compare_value, exchange_value, dest);
+  }
+  // No stub yet: plain compare-and-store, legal only while no threads exist.
+  assert(Threads::number_of_threads() == 0, "for bootstrap only");
+  const jlong observed = *dest;
+  if (observed == compare_value) {
+    *dest = exchange_value;
+  }
+  return observed;
+}
+typedef jlong load_long_func_t(volatile jlong*);
+// Starts out as the bootstrap routine below, which swaps in the generated stub.
+load_long_func_t* os::atomic_load_long_func = os::atomic_load_long_bootstrap;
+
+jlong os::atomic_load_long_bootstrap(volatile jlong* src) {
+  // Prefer the generated stub: once it exists, install it and delegate to it.
+  load_long_func_t* stub = CAST_TO_FN_PTR(load_long_func_t*, StubRoutines::atomic_load_long_entry());
+  if (stub != NULL) {
+    os::atomic_load_long_func = stub;
+    return (*stub)(src);
+  }
+
+  // No stub yet: plain load, legal only while no threads exist.
+  assert(Threads::number_of_threads() == 0, "for bootstrap only");
+  const jlong value = *src;
+  return value;
+}
+
+typedef void store_long_func_t(jlong, volatile jlong*);
+// Starts out as the bootstrap routine below, which swaps in the generated stub.
+store_long_func_t* os::atomic_store_long_func = os::atomic_store_long_bootstrap;
+
+void os::atomic_store_long_bootstrap(jlong val, volatile jlong* dest) {
+  // Prefer the generated stub: once it exists, install it and delegate to it.
+  store_long_func_t* stub = CAST_TO_FN_PTR(store_long_func_t*, StubRoutines::atomic_store_long_entry());
+  if (stub != NULL) {
+    os::atomic_store_long_func = stub;
+    return (*stub)(val, dest);
+  }
+
+  // No stub yet: plain store, legal only while no threads exist.
+  assert(Threads::number_of_threads() == 0, "for bootstrap only");
+  *dest = val;
+}
+
+typedef jint  atomic_add_func_t(jint add_value, volatile jint *dest);
+// Starts out as the bootstrap routine below, which swaps in the generated stub.
+atomic_add_func_t * os::atomic_add_func = os::atomic_add_bootstrap;
+
+jint  os::atomic_add_bootstrap(jint add_value, volatile jint *dest) {
+  // Prefer the generated stub: once it exists, install it and delegate to it.
+  atomic_add_func_t * stub = CAST_TO_FN_PTR(atomic_add_func_t*, StubRoutines::atomic_add_entry());
+  if (stub != NULL) {
+    os::atomic_add_func = stub;
+    return (*stub)(add_value, dest);
+  }
+  // No stub yet: plain, non-atomic read-modify-write that returns the new value.
+  const jint sum = *dest + add_value;
+  *dest = sum;
+  return sum;
+}
+
+typedef jint  atomic_xchg_func_t(jint exchange_value, volatile jint *dest);
+// Starts out as the bootstrap routine below, which swaps in the generated stub.
+atomic_xchg_func_t * os::atomic_xchg_func = os::atomic_xchg_bootstrap;
+
+jint  os::atomic_xchg_bootstrap(jint exchange_value, volatile jint *dest) {
+  // Prefer the generated stub: once it exists, install it and delegate to it.
+  atomic_xchg_func_t * stub = CAST_TO_FN_PTR(atomic_xchg_func_t*, StubRoutines::atomic_xchg_entry());
+  if (stub != NULL) {
+    os::atomic_xchg_func = stub;
+    return (*stub)(exchange_value, dest);
+  }
+  // No stub yet: plain, non-atomic swap that returns the previous value.
+  const jint previous = *dest;
+  *dest = exchange_value;
+  return previous;
+}
+
+typedef jint cmpxchg_func_t(jint, jint, volatile jint*);
+// Starts out as the bootstrap routine below, which swaps in the generated stub.
+cmpxchg_func_t* os::atomic_cmpxchg_func = os::atomic_cmpxchg_bootstrap;
+
+jint os::atomic_cmpxchg_bootstrap(jint compare_value, jint exchange_value, volatile jint* dest) {
+  // Prefer the generated stub: once it exists, install it and delegate to it.
+  cmpxchg_func_t* stub = CAST_TO_FN_PTR(cmpxchg_func_t*, StubRoutines::atomic_cmpxchg_entry());
+  if (stub != NULL) {
+    os::atomic_cmpxchg_func = stub;
+    return (*stub)(compare_value, exchange_value, dest);
+  }
+  // No stub yet: plain compare-and-store, legal only while no threads exist.
+  assert(Threads::number_of_threads() == 0, "for bootstrap only");
+  const jint observed = *dest;
+  if (observed == compare_value) {
+    *dest = exchange_value;
+  }
+  return observed;
+}
+
+#endif // !AARCH64
+
+#ifndef PRODUCT
+void os::verify_stack_alignment() {  // intentionally empty: no alignment check is performed
+}
+#endif // !PRODUCT
+
+int os::extra_bang_size_in_bytes() {
+  // ARM does not require an additional stack bang, so no extra bytes are requested.
+  return 0;
+}
+
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/os_linux_arm.hpp	2016-12-02 11:25:49.847145877 -0500
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2008, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_OS_LINUX_ARM_HPP
+#define OS_CPU_LINUX_ARM_VM_OS_LINUX_ARM_HPP
+
+#ifndef __thumb__
+  enum {
+    // Offset to add to frame::_fp when dealing with non-thumb C frames (0 on AArch64, -1 otherwise)
+#ifdef AARCH64
+    C_frame_offset =  0,
+#else
+    C_frame_offset =  -1,
+#endif // AARCH64
+  };
+#endif // !__thumb__
+
+  static void setup_fpu();  // zeroes the FP control register (FPCR on AArch64, FPSCR on VFP builds)
+
+  static bool is_allocatable(size_t bytes);  // this port's implementation always returns true
+
+  // Used to register dynamic code cache area with the OS
+  // Note: Currently only used in 64 bit Windows implementations; here it just returns true
+  static bool register_code_area(char *low, char *high) { return true; }
+
+#ifndef AARCH64  // non-AArch64 builds only: function pointers used to dispatch atomic operations
+  static jlong (*atomic_cmpxchg_long_func)(jlong compare_value,
+                                           jlong exchange_value,
+                                           volatile jlong *dest);
+
+  static jlong (*atomic_load_long_func)(volatile jlong*);
+
+  static void (*atomic_store_long_func)(jlong, volatile jlong*);
+
+  static jint  (*atomic_add_func)(jint add_value, volatile jint *dest);
+
+  static jint  (*atomic_xchg_func)(jint exchange_value, volatile jint *dest);
+
+  static jint  (*atomic_cmpxchg_func)(jint compare_value,
+                                      jint exchange_value,
+                                      volatile jint *dest);
+  // Bootstrap routines: initial targets of the pointers above; each installs the generated stub once it exists.
+  static jlong atomic_cmpxchg_long_bootstrap(jlong, jlong, volatile jlong*);
+
+  static jlong atomic_load_long_bootstrap(volatile jlong*);
+
+  static void atomic_store_long_bootstrap(jlong, volatile jlong*);
+
+  static jint  atomic_add_bootstrap(jint add_value, volatile jint *dest);
+
+  static jint  atomic_xchg_bootstrap(jint exchange_value, volatile jint *dest);
+
+  static jint  atomic_cmpxchg_bootstrap(jint compare_value,
+                                        jint exchange_value,
+                                        volatile jint *dest);
+#endif // !AARCH64
+
+#endif // OS_CPU_LINUX_ARM_VM_OS_LINUX_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/prefetch_linux_arm.inline.hpp	2016-12-02 11:25:55.439463005 -0500
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_PREFETCH_LINUX_ARM_INLINE_HPP
+#define OS_CPU_LINUX_ARM_VM_PREFETCH_LINUX_ARM_INLINE_HPP
+
+#include "runtime/prefetch.hpp"
+
+inline void Prefetch::read (void *loc, intx interval) {  // interval is not used here
+#ifdef AARCH64
+  __asm__ volatile ("prfm PLDL1KEEP, [%0]" : : "r" (loc));  // PLDL1KEEP prefetch hint for loc
+#else
+#if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_5TE__)
+  __asm__ volatile ("pld [%0]" : : "r" (loc));  // pld prefetch hint for loc
+#endif // on any other 32-bit target no instruction is emitted
+#endif // AARCH64
+}
+
+inline void Prefetch::write(void *loc, intx interval) {  // interval is not used here
+#ifdef AARCH64
+  __asm__ volatile ("prfm PSTL1KEEP, [%0]" : : "r" (loc));  // PSTL1KEEP prefetch hint for loc
+#else
+  // Not available on 32-bit ARM (prior to ARMv7 with MP extensions): deliberately a no-op
+#endif // AARCH64
+}
+
+#endif // OS_CPU_LINUX_ARM_VM_PREFETCH_LINUX_ARM_INLINE_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/thread_linux_arm.cpp	2016-12-02 11:26:00.783766065 -0500
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "gc/shared/barrierSet.inline.hpp"
+#include "gc/shared/cardTableModRefBS.inline.hpp"
+#include "gc/shared/collectedHeap.hpp"
+#include "memory/metaspaceShared.hpp"
+#include "runtime/frame.inline.hpp"
+
+void JavaThread::cache_global_variables() {
+  // Copies selected global addresses into this thread's fields. A field is
+  // set to NULL when the heap or barrier set does not provide the value.
+  BarrierSet* barrier = Universe::heap()->barrier_set();
+
+  // Heap top address: only taken when the heap supports inline contiguous
+  // allocation.
+  const bool inline_alloc = Universe::heap()->supports_inline_contig_alloc();
+  _heap_top_addr = inline_alloc
+      ? (address) Universe::heap()->top_addr()
+      : (address) NULL;
+
+  // Card table base: only taken from a CardTableModRef barrier set.
+  const bool has_card_table = barrier->is_a(BarrierSet::CardTableModRef);
+  _card_table_base = has_card_table
+      ? (address) (barrier_set_cast<CardTableModRefBS>(barrier)->byte_map_base)
+      : (address) NULL;
+
+}
+
+// For Forte Analyzer AsyncGetCallTrace profiling support - thread is
+// currently interrupted by SIGPROF
+bool JavaThread::pd_get_top_frame_for_signal_handler(frame* fr_addr, void* ucontext, bool isInJava) {
+  // Must run on the thread being inspected; the lookup itself is pd_get_top_frame().
+  assert(Thread::current() == this, "caller must be current thread");
+  return pd_get_top_frame(fr_addr, ucontext, isInJava);
+}
+
+bool JavaThread::pd_get_top_frame_for_profiling(frame* fr_addr, void* ucontext, bool isInJava) {
+  return pd_get_top_frame(fr_addr, ucontext, isInJava);  // plain forward to the shared lookup
+}
+
+bool JavaThread::pd_get_top_frame(frame* fr_addr, void* ucontext, bool isInJava) {
+  assert(this->is_Java_thread(), "must be JavaThread");
+  // Fills *fr_addr with this thread's top frame; returns false if no usable frame is found.
+  JavaThread* jt = (JavaThread *)this;
+  // NOTE: on AArch64 the anchor below is only used when it also records a pc.
+  // If we have a last_Java_frame, then we should use it even if
+  // isInJava == true.  It should be more reliable than ucontext info.
+  if (jt->has_last_Java_frame() AARCH64_ONLY(&& jt->last_Java_pc() != NULL)) {
+    *fr_addr = jt->pd_last_frame();
+    return true;
+  }
+
+  // Could be in a code section that plays with the stack, like
+  // MacroAssembler::verify_heapbase()
+  if (jt->in_top_frame_unsafe_section()) {
+    return false;
+  }
+
+  // At this point, we don't have a last_Java_frame, so
+  // we try to glean some information out of the ucontext
+  // if we were running Java code when SIGPROF came in.
+  if (isInJava) {
+    ucontext_t* uc = (ucontext_t*) ucontext;
+
+    intptr_t* ret_fp;
+    intptr_t* ret_sp;
+    ExtendedPC addr = os::Linux::fetch_frame_from_ucontext(this, uc,
+      &ret_sp, &ret_fp);
+    if (addr.pc() == NULL || ret_sp == NULL ) {
+      // ucontext wasn't useful
+      return false;
+    }
+
+#if INCLUDE_CDS
+    if (UseSharedSpaces && MetaspaceShared::is_in_shared_region(addr.pc(), MetaspaceShared::md)) {
+      // In the middle of a trampoline call. Bail out for safety.
+      // This happens rarely so shouldn't affect profiling.
+      return false;
+    }
+#endif
+
+    frame ret_frame(ret_sp, ret_fp, addr.pc());
+    if (!ret_frame.safe_for_sender(jt)) {
+#ifdef COMPILER2
+      // C2 uses the frame pointer as a general register (the original said "ebp", an x86 name); see if NULL fp helps
+      frame ret_frame2(ret_sp, NULL, addr.pc());
+      if (!ret_frame2.safe_for_sender(jt)) {
+        // nothing else to try if the frame isn't good
+        return false;
+      }
+      ret_frame = ret_frame2;
+#else
+      // nothing else to try if the frame isn't good
+      return false;
+#endif /* COMPILER2 */
+    }
+    *fr_addr = ret_frame;
+    return true;
+  }
+
+  // nothing else to try
+  return false;
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/thread_linux_arm.hpp	2016-12-02 11:26:06.304079107 -0500
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_THREAD_LINUX_ARM_HPP
+#define OS_CPU_LINUX_ARM_VM_THREAD_LINUX_ARM_HPP
+
+ private:
+  // The following thread-local variables replicate corresponding global variables.
+  // They are used for a quick access from compiled code via Rthread register.
+  address _heap_top_addr;    // heap top address or NULL; set by cache_global_variables()
+  address _heap_lock_addr;   // NOTE(review): not assigned by cache_global_variables() - confirm it is still needed
+  address _card_table_base;  // card table byte_map_base or NULL; set by cache_global_variables()
+
+  void pd_initialize() {
+    _anchor.clear();
+    _in_top_frame_unsafe_section = NULL;  // NULL != this, so in_top_frame_unsafe_section() starts out false
+  }
+
+  frame pd_last_frame() {  // builds the last Java frame from the values saved in _anchor
+    assert(has_last_Java_frame(), "must have last_Java_sp() when suspended");
+#ifdef AARCH64
+    assert (_anchor.last_Java_pc() != NULL, "pc should be stored");  // AArch64 always uses the anchor's pc
+    return frame(_anchor.last_Java_sp(), _anchor.last_Java_fp(), _anchor.last_Java_pc());
+#else
+    if (_anchor.last_Java_pc() != NULL) {
+      return frame(_anchor.last_Java_sp(), _anchor.last_Java_fp(), _anchor.last_Java_pc());
+    } else {
+      // No pc was recorded in the anchor: this will pick up pc from sp
+      return frame(_anchor.last_Java_sp(), _anchor.last_Java_fp());
+    }
+#endif // AARCH64
+  }
+
+ public:
+  intptr_t* last_Java_fp()                       { return _anchor.last_Java_fp(); }
+  void  set_last_Java_fp(intptr_t* fp)           { _anchor.set_last_Java_fp(fp);  }
+  void  set_last_Java_pc(address pc)             { _anchor.set_last_Java_pc(pc);  }
+  // Byte offset, within JavaThread, of the anchor's last_Java_fp field.
+  static ByteSize last_Java_fp_offset()          {
+    return byte_offset_of(JavaThread, _anchor) + JavaFrameAnchor::last_Java_fp_offset();
+  }
+  // No base-of-stack pointer is tracked here: the setters are no-ops and the getter returns NULL.
+  void set_base_of_stack_pointer(intptr_t* base_sp) {
+    // Nothing to do
+  }
+
+  intptr_t* base_of_stack_pointer() {
+    return NULL;
+  }
+
+  void record_base_of_stack_pointer() {
+    // Nothing to do
+  }
+  // Byte offsets, within JavaThread, of the cached _heap_top_addr and _card_table_base fields.
+  static ByteSize heap_top_addr_offset()         { return byte_offset_of(JavaThread, _heap_top_addr); }
+  static ByteSize card_table_base_offset()       { return byte_offset_of(JavaThread, _card_table_base); }
+
+private:
+  // Set to "this" if pd_get_top_frame should ignore this thread for now.
+  JavaThread *_in_top_frame_unsafe_section;
+
+public:
+  static ByteSize in_top_frame_unsafe_section_offset() { return byte_offset_of(JavaThread, _in_top_frame_unsafe_section); }
+  bool in_top_frame_unsafe_section() { return _in_top_frame_unsafe_section == this; }  // true only while the marker equals this
+
+  bool pd_get_top_frame_for_signal_handler(frame* fr_addr, void* ucontext, bool isInJava);
+  // Same lookup as above, without the signal-handler variant's current-thread assert.
+  bool pd_get_top_frame_for_profiling(frame* fr_addr, void* ucontext, bool isInJava);
+private:
+  bool pd_get_top_frame(frame* fr_addr, void* ucontext, bool isInJava);
+public:
+
+  // These routines are only used on cpu architectures that
+  // have separate register stacks (Itanium); here they do nothing and never report overflow.
+  static bool register_stack_overflow() { return false; }
+  static void enable_register_stack_guard() {}
+  static void disable_register_stack_guard() {}
+
+#endif // OS_CPU_LINUX_ARM_VM_THREAD_LINUX_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/vmStructs_linux_arm.hpp	2016-12-02 11:26:11.240359033 -0500
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_ARM_VM_VMSTRUCTS_LINUX_ARM_HPP
+#define OS_CPU_LINUX_ARM_VM_VMSTRUCTS_LINUX_ARM_HPP
+
+// These are the OS and CPU-specific fields, types and integer
+// constants required by the Serviceability Agent. This file is
+// referenced by vmStructs.cpp.
+
+#define VM_STRUCTS_OS_CPU(nonstatic_field, static_field, unchecked_nonstatic_field, volatile_nonstatic_field, nonproduct_nonstatic_field, c2_nonstatic_field, unchecked_c1_static_field, unchecked_c2_static_field) \
+                                                                                                                                     \
+  /******************************/                                                                                                   \
+  /* Threads (NOTE: incomplete) */                                                                                                   \
+  /******************************/                                                                                                   \
+  nonstatic_field(OSThread,                      _thread_id,                                      OSThread::thread_id_t)             \
+  nonstatic_field(OSThread,                      _pthread_id,                                     pthread_t)
+// Declares the types of the two OSThread fields listed above.
+#define VM_TYPES_OS_CPU(declare_type, declare_toplevel_type, declare_oop_type, declare_integer_type, declare_unsigned_integer_type, declare_c1_toplevel_type, declare_c2_type, declare_c2_toplevel_type) \
+                                                                          \
+  /**********************/                                                \
+  /* Posix Thread IDs   */                                                \
+  /**********************/                                                \
+                                                                          \
+  declare_integer_type(OSThread::thread_id_t)                             \
+  declare_unsigned_integer_type(pthread_t)
+// No OS/CPU-specific integer or long constants are exported: both macros below expand to nothing.
+#define VM_INT_CONSTANTS_OS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant)
+
+#define VM_LONG_CONSTANTS_OS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant)
+
+#endif // OS_CPU_LINUX_ARM_VM_VMSTRUCTS_LINUX_ARM_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/os_cpu/linux_arm/vm/vm_version_linux_arm_32.cpp	2016-12-02 11:26:16.088633966 -0500
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2008, 2011, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "runtime/os.hpp"
+#include "vm_version_arm.hpp"
+
+# include <sys/utsname.h>
+
+// Use uname() to find the architecture version
+void VM_Version::get_os_cpu_info() {
+  struct utsname name;
+  static bool done = false;
+
+  // Support for multiple calls in the init phase
+  if (done) return;
+  done = true;
+  // Derive _arm_arch from uname()'s machine string; on failure name is unset, so leave _arm_arch untouched.
+  if (uname(&name) != 0) return;
+  if (strncmp(name.machine, "aarch64", 7) == 0) {
+    _arm_arch = 8;
+  } else if (strncmp(name.machine, "armv", 4) == 0 &&
+      name.machine[4] >= '5' && name.machine[4] <= '9') {
+    _arm_arch = (int)(name.machine[4] - '0');
+  }
+}
+
+// Make sure that _arm_arch is initialized so that any calls to OrderAccess will
+// use proper dmb instruction
+void VM_Version::early_initialize() {
+  get_os_cpu_info();  // safe to call repeatedly: it is guarded by its own 'done' flag
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/share/vm/code/relocInfo_ext.cpp	2016-12-02 11:26:20.964910486 -0500
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "code/codeCache.hpp"
+#include "code/relocInfo.hpp"
+#include "code/relocInfo_ext.hpp"
+#include "gc/shared/cardTableModRefBS.hpp"
+#include "gc/shared/collectedHeap.hpp"
+#include "memory/universe.hpp"
+#include "runtime/os.hpp"
+#include "utilities/debug.hpp"
+#ifdef COMPILER1
+#include "c1/c1_globals.hpp"
+#endif
+
+address symbolic_Relocation::symbolic_value(symbolic_Relocation::symbolic_reference t) {
+  // Maps a symbolic reference kind to the address it currently stands for.
+  // NULL is returned while Universe::heap() is still NULL, and for the eden
+  // top/end kinds when the heap does not support inline contiguous allocation.
+  // An unknown kind trips ShouldNotReachHere().
+  CollectedHeap* const heap = Universe::heap();
+  if (heap == NULL) {
+    // the symbolic values are not needed so early
+    // (and most of them lead to errors if asked too early)
+    return NULL;
+  }
+
+  switch (t) {
+  case symbolic_Relocation::polling_page_reference:
+    return os::get_polling_page();
+
+  case symbolic_Relocation::eden_top_reference:
+    return heap->supports_inline_contig_alloc() ? (address)heap->top_addr() : NULL;
+
+  case symbolic_Relocation::heap_end_reference:
+    return heap->supports_inline_contig_alloc() ? (address)heap->end_addr() : NULL;
+
+  case symbolic_Relocation::card_table_reference: {
+    // unchecked downcast: the barrier set is treated as a CardTableModRefBS
+    CardTableModRefBS* const card_table = (CardTableModRefBS*)heap->barrier_set();
+    return (address)card_table->byte_map_base;
+  }
+
+  case symbolic_Relocation::mark_bits_reference:
+    return (address)Universe::verify_mark_bits();
+
+  case symbolic_Relocation::mark_mask_reference:
+    return (address)Universe::verify_mark_mask();
+
+  case symbolic_Relocation::oop_bits_reference:
+    return (address)Universe::verify_oop_bits();
+
+  case symbolic_Relocation::oop_mask_reference:
+    return (address)Universe::verify_oop_mask();
+
+  case symbolic_Relocation::debug_string_reference:
+    return (address)"<Lost debug string>";
+
+  default:
+    // missing declaration
+    ShouldNotReachHere();
+    return NULL;
+  }
+}
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/hotspot/src/share/vm/code/relocInfo_ext.hpp	2016-12-02 11:26:25.929191997 -0500
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_CODE_RELOCINFO_EXT_HPP
+#define SHARE_VM_CODE_RELOCINFO_EXT_HPP
+
+// symbolic_Relocation allows annotating some addresses in the generated code.
+// This is currently needed only on ARM, for the pregenerated interpreter.
+//
+// This class was initially defined using the last unused relocType. The
+// new version tries to limit the impact on open source code changes.
+//
+// Without compiled code support, symbolic_Relocation need not be a real
+// relocation. To avoid using the last unused relocType, the
+// symbolic_Relocation::spec(<any symbolic type>) has been replaced
+// by additional methods using directly the symbolic type.
+//
+// Note: the order of the arguments in some methods had to be reversed
+// to avoid confusion between the relocType enum and the
+// symbolic_reference enum.
+class symbolic_Relocation : AllStatic {
+
+ public:
+  enum symbolic_reference {     // kinds that symbolic_value() can resolve
+    card_table_reference,       // byte_map_base of the heap's card-table barrier set
+    eden_top_reference,         // heap top_addr(); NULL without inline contiguous allocation
+    heap_end_reference,         // heap end_addr(); NULL without inline contiguous allocation
+    polling_page_reference,     // os::get_polling_page()
+    mark_bits_reference,        // Universe::verify_mark_bits()
+    mark_mask_reference,        // Universe::verify_mark_mask()
+    oop_bits_reference,         // Universe::verify_oop_bits()
+    oop_mask_reference,         // Universe::verify_oop_mask()
+    debug_string_reference,     // the literal "<Lost debug string>"
+    last_symbolic_reference     // not handled by symbolic_value(); presumably an end marker
+  };
+
+  // get the new value for a given symbolic type (NULL while the heap is not set up yet)
+  static address symbolic_value(symbolic_reference t);
+};
+
+#endif // SHARE_VM_CODE_RELOCINFO_EXT_HPP
--- /dev/null	2016-08-24 15:41:39.598575000 -0400
+++ new/jdk/src/java.base/unix/conf/arm/jvm.cfg	2016-12-02 11:26:31.209491429 -0500
@@ -0,0 +1,36 @@
+# Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.  Oracle designates this
+# particular file as subject to the "Classpath" exception as provided
+# by Oracle in the LICENSE file that accompanied this code.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+#
+# 
+# List of JVMs that can be used as an option to java, javac, etc.
+# Order is important -- first in this list is the default JVM.
+# NOTE that both this file and its format are UNSUPPORTED and
+# WILL GO AWAY in a future release.
+#
+# You may also select a JVM in an arbitrary location with the
+# "-XXaltjvm=<jvm_dir>" option, but that too is unsupported
+# and may not be available in a future release.
+#
+-client IF_SERVER_CLASS -server
+-server KNOWN
+-minimal KNOWN
--- old/hotspot/src/share/vm/code/codeCacheExtensions.hpp	2016-12-02 11:26:38.801921975 -0500
+++ /dev/null	2016-08-24 15:41:39.598575000 -0400
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_CODE_CODE_CACHE_EXTENSIONS_HPP
-#define SHARE_VM_CODE_CODE_CACHE_EXTENSIONS_HPP
-
-#include "memory/allocation.hpp"
-
-class CodeCacheExtensionsSteps: AllStatic {
-public:
-  enum Step {
-    // Support for optional fine grain initialization hooks
-    // Note: these hooks must support refining the granularity
-    // (e.g. adding intermediate steps in the ordered enum
-    // if needed for future features)
-    Start,
-    VMVersion,
-    StubRoutines1,
-    Universe,
-    TemplateInterpreter,
-    Interpreter,
-    StubRoutines2,
-    InitGlobals,
-    CreateVM,
-    LastStep
-  };
-};
-
-#include "code/codeCacheExtensions_ext.hpp"
-
-#endif // SHARE_VM_CODE_CODE_CACHE_EXTENSIONS_HPP
--- old/hotspot/src/share/vm/code/codeCacheExtensions_ext.hpp	2016-12-02 11:26:42.906154714 -0500
+++ /dev/null	2016-08-24 15:41:39.598575000 -0400
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This code is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 only, as
- * published by the Free Software Foundation.
- *
- * This code is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
- * version 2 for more details (a copy is included in the LICENSE file that
- * accompanied this code).
- *
- * You should have received a copy of the GNU General Public License version
- * 2 along with this work; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
- * or visit www.oracle.com if you need additional information or have any
- * questions.
- *
- */
-
-#ifndef SHARE_VM_CODE_CODE_CACHE_EXTENSIONS_EXT_HPP
-#define SHARE_VM_CODE_CODE_CACHE_EXTENSIONS_EXT_HPP
-
-#include "utilities/macros.hpp"
-#include "memory/allocation.hpp"
-#include "utilities/globalDefinitions.hpp"
-#include "interpreter/bytecodes.hpp"
-
-class AdapterHandlerEntry;
-class CodeBlob;
-class CodeBuffer;
-class InterpreterMacroAssembler;
-class Template;
-
-// All the methods defined here are placeholders for possible extensions.
-
-class CodeCacheExtensions: AllStatic {
-  friend class CodeCacheDumper;
-
-public:
-  // init both code saving and loading
-  // Must be called very early, before any code is generated.
-  static void initialize() {}
-
-  // Check whether the generated interpreter will be saved.
-  static bool saving_generated_interpreter() { return false; }
-
-  // Check whether a pregenerated interpreter is used.
-  static bool use_pregenerated_interpreter() { return false; }
-
-  // Placeholder for additional VM initialization code
-  static void complete_step(CodeCacheExtensionsSteps::Step phase) {}
-
-  // Return false for newly generated code, on systems where it is not
-  // executable.
-  static bool is_executable(void *pc) { return true; }
-
-  // Return whether dynamically generated code can be executable
-  static bool support_dynamic_code() { return true; }
-
-  // Skip new code generation when known to be useless.
-  static bool skip_code_generation() { return false; }
-
-  // Skip stubs used only for compiled code support.
-  static bool skip_compiler_support() { return false; }
-
-  // Ignore UseFastSignatureHandlers when returning false
-  static bool support_fast_signature_handlers() { return true; }
-
-  /////////////////////////
-  // Handle generated code:
-  // - allow newly generated code to be shared
-  // - allow pregenerated code to be used in place of the newly generated one
-  //   (modifying pc).
-  // - support remapping when doing both save and load
-  // 'remap' can be set to false if the addresses handled are not referenced
-  // from code generated later.
-
-  // Associate a name to a generated codelet and possibly modify the pc
-  // Note: use instead the specialized versions when they exist:
-  // - handle_generated_blob for CodeBlob
-  // - handle_generated_handler for SignatureHandlers
-  // See also the optimized calls below that handle several PCs at once.
-  static void handle_generated_pc(address &pc, const char *name) {}
-
-  // Adds a safe definition of the codelet, for codelets used right after
-  // generation (else we would need to immediately stop the JVM and convert
-  // the generated code to executable format before being able to go further).
-  static void handle_generated_pc(address &pc, const char *name, address default_entry) {}
-
-  // Special cases
-
-  // Special case for CodeBlobs, which may require blob specific actions.
-  static CodeBlob* handle_generated_blob(CodeBlob* blob, const char *name = NULL) { return blob; }
-
-  // Special case for Signature Handlers.
-  static void handle_generated_handler(address &handler_start, const char *name, address handler_end) {}
-
-  // Support for generating different variants of the interpreter
-  // that can be dynamically selected after reload.
-  //
-  // - init_interpreter_assembler allows to configure the assembler for
-  //   the current variant
-  //
-  // - needs_other_interpreter_variant returns true as long as other
-  //   variants are needed.
-  //
-  // - skip_template_interpreter_entries returns true if new entries
-  //   need not be generated for this masm setup and this bytecode
-  //
-  // - completed_template_interpreter_entries is called after new
-  //   entries have been generated and installed, for any non skipped
-  //   bytecode.
-  static void init_interpreter_assembler(InterpreterMacroAssembler* masm, CodeBuffer* code) {}
-  static bool needs_other_interpreter_variant() { return false; }
-  static bool skip_template_interpreter_entries(Bytecodes::Code code) { return false; }
-  static void completed_template_interpreter_entries(InterpreterMacroAssembler* masm, Bytecodes::Code code) {}
-
-  // Code size optimization. May optimize the requested size.
-  static void size_blob(const char* name, int *updatable_size) {}
-
-  // ergonomics
-  static void set_ergonomics_flags() {}
-};
-
-#endif // SHARE_VM_CODE_CODE_CACHE_EXTENSIONS_EXT_HPP