--- old/src/cpu/sparc/vm/sparc.ad	Sat Jun  2 20:03:55 2012
+++ new/src/cpu/sparc/vm/sparc.ad	Sat Jun  2 20:03:55 2012
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -678,18 +678,26 @@
 
 static inline jdouble replicate_immI(int con, int count, int width) {
   // Load a constant replicated "count" times with width "width"
+  assert(count*width == 8 && width <= 4, "sanity");  // result must fill exactly 8 bytes; width <= 4 keeps bit_width at most 32
   int bit_width = width * 8;
-  jlong elt_val = con;
-  elt_val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
-  jlong val = elt_val;
+  jlong val = con;  // int -> jlong widening sign-extends a negative con
+  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
   for (int i = 0; i < count - 1; i++) {
-    val <<= bit_width;
-    val |= elt_val;
+    val |= (val << bit_width);  // OR in a shifted copy: each pass grows the replicated pattern by one element
   }
   jdouble dval = *((jdouble*) &val);  // coerce to double type
   return dval;
 }
 
+static inline jdouble replicate_immF(float con) {
+  // Pack two copies of the raw bits of float "con" into one 64-bit value, returned as a jdouble.
+  int val = *((int*)&con);  // reinterpret the float's bits as an int (no numeric conversion)
+  jlong lval = val;  // widening sign-extends, so the upper 32 bits may be set here
+  lval = (lval << 32) | (lval & 0xFFFFFFFFl);  // high half = bits; low half = same bits with the sign extension masked off
+  jdouble dval = *((jdouble*) &lval);  // coerce to double type
+  return dval;
+}
+
 // Standard Sparc opcode form2 field breakdown
 static inline void emit2_19(CodeBuffer &cbuf, int f30, int f29, int f25, int f22, int f20, int f19, int f0 ) {
   f0 &= (1<<19)-1;     // Mask displacement to 19 bits
@@ -841,10 +849,7 @@
           !(n->ideal_Opcode()==Op_PrefetchRead  && ld_op==Op_LoadI) &&
           !(n->ideal_Opcode()==Op_PrefetchWrite && ld_op==Op_LoadI) &&
           !(n->ideal_Opcode()==Op_PrefetchAllocation && ld_op==Op_LoadI) &&
-          !(n->ideal_Opcode()==Op_Load2I    && ld_op==Op_LoadD) &&
-          !(n->ideal_Opcode()==Op_Load4C    && ld_op==Op_LoadD) &&
-          !(n->ideal_Opcode()==Op_Load4S    && ld_op==Op_LoadD) &&
-          !(n->ideal_Opcode()==Op_Load8B    && ld_op==Op_LoadD) &&
+          !(n->ideal_Opcode()==Op_LoadVector && ld_op==Op_LoadD) &&
           !(n->rule() == loadUB_rule)) {
         verify_oops_warning(n, n->ideal_Opcode(), ld_op);
       }
@@ -856,9 +861,7 @@
           !(n->ideal_Opcode()==Op_StoreI && st_op==Op_StoreF) &&
           !(n->ideal_Opcode()==Op_StoreF && st_op==Op_StoreI) &&
           !(n->ideal_Opcode()==Op_StoreL && st_op==Op_StoreI) &&
-          !(n->ideal_Opcode()==Op_Store2I && st_op==Op_StoreD) &&
-          !(n->ideal_Opcode()==Op_Store4C && st_op==Op_StoreD) &&
-          !(n->ideal_Opcode()==Op_Store8B && st_op==Op_StoreD) &&
+          !(n->ideal_Opcode()==Op_StoreVector && st_op==Op_StoreD) &&
           !(n->ideal_Opcode()==Op_StoreD && st_op==Op_StoreI && n->rule() == storeD0_rule)) {
         verify_oops_warning(n, n->ideal_Opcode(), st_op);
       }
@@ -1850,16 +1853,45 @@
 address last_rethrow = NULL;  // debugging aid for Rethrow encoding
 #endif
 
+// Map Types to machine register types
+const int Matcher::base2reg[Type::lastype] = {
+  Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
+  Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
+  0, Op_RegD, 0, 0, /* Vectors */
+  Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
+  0, 0/*abio*/,
+  Op_RegP /* Return address */, 0, /* the memories */
+  Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
+  0  /*bottom*/
+};
+
 // Vector width in bytes
-const uint Matcher::vector_width_in_bytes(void) {
+const int Matcher::vector_width_in_bytes(BasicType bt) {  // bt is not consulted: same width for every element type
+  assert(MaxVectorSize == 8, "");  // the fixed 8 returned below must agree with MaxVectorSize
   return 8;
 }
 
 // Vector ideal reg
-const uint Matcher::vector_ideal_reg(void) {
+const int Matcher::vector_ideal_reg(int size) {  // size is not consulted: Op_RegD is returned unconditionally
+  assert(MaxVectorSize == 8, "");  // only the 8-byte vector configuration is expected here
   return Op_RegD;
 }
 
+// Limit on vector size: maximum number of elements of type bt loaded into one vector.
+const int Matcher::max_vector_size(const BasicType bt) {
+  assert(is_java_primitive(bt), "only primitive type vectors");
+  return vector_width_in_bytes(bt)/type2aelembytes(bt);  // vector width in bytes / type2aelembytes(bt) (presumably the per-element size in bytes)
+}
+
+const int Matcher::min_vector_size(const BasicType bt) {
+  return max_vector_size(bt); // Same as max, i.e. a single vector size per element type.
+}
+
+// SPARC doesn't support misaligned vector stores/loads.
+const bool Matcher::misaligned_vectors_ok() {
+  return false;
+}
+
 // USII supports fxtof through the whole range of number, USIII doesn't
 const bool Matcher::convL2FSupported(void) {
   return VM_Version::has_fast_fxtof();
@@ -5933,50 +5965,6 @@
   ins_pipe(iload_mem);
 %}
 
-// Load Aligned Packed Byte into a Double Register
-instruct loadA8B(regD dst, memory mem) %{
-  match(Set dst (Load8B mem));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "LDDF   $mem,$dst\t! packed8B" %}
-  opcode(Assembler::lddf_op3);
-  ins_encode(simple_form3_mem_reg( mem, dst ) );
-  ins_pipe(floadD_mem);
-%}
-
-// Load Aligned Packed Char into a Double Register
-instruct loadA4C(regD dst, memory mem) %{
-  match(Set dst (Load4C mem));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "LDDF   $mem,$dst\t! packed4C" %}
-  opcode(Assembler::lddf_op3);
-  ins_encode(simple_form3_mem_reg( mem, dst ) );
-  ins_pipe(floadD_mem);
-%}
-
-// Load Aligned Packed Short into a Double Register
-instruct loadA4S(regD dst, memory mem) %{
-  match(Set dst (Load4S mem));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "LDDF   $mem,$dst\t! packed4S" %}
-  opcode(Assembler::lddf_op3);
-  ins_encode(simple_form3_mem_reg( mem, dst ) );
-  ins_pipe(floadD_mem);
-%}
-
-// Load Aligned Packed Int into a Double Register
-instruct loadA2I(regD dst, memory mem) %{
-  match(Set dst (Load2I mem));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "LDDF   $mem,$dst\t! packed2I" %}
-  opcode(Assembler::lddf_op3);
-  ins_encode(simple_form3_mem_reg( mem, dst ) );
-  ins_pipe(floadD_mem);
-%}
-
 // Load Range
 instruct loadRange(iRegI dst, memory mem) %{
   match(Set dst (LoadRange mem));
@@ -6600,17 +6588,6 @@
   ins_pipe(fstoreF_mem_zero);
 %}
 
-// Store Aligned Packed Bytes in Double register to memory
-instruct storeA8B(memory mem, regD src) %{
-  match(Set mem (Store8B mem src));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STDF   $src,$mem\t! packed8B" %}
-  opcode(Assembler::stdf_op3);
-  ins_encode(simple_form3_mem_reg( mem, src ) );
-  ins_pipe(fstoreD_mem_reg);
-%}
-
 // Convert oop pointer into compressed form
 instruct encodeHeapOop(iRegN dst, iRegP src) %{
   predicate(n->bottom_type()->make_ptr()->ptr() != TypePtr::NotNull);
@@ -6655,62 +6632,6 @@
 %}
 
 
-// Store Zero into Aligned Packed Bytes
-instruct storeA8B0(memory mem, immI0 zero) %{
-  match(Set mem (Store8B mem zero));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STX    $zero,$mem\t! packed8B" %}
-  opcode(Assembler::stx_op3);
-  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
-  ins_pipe(fstoreD_mem_zero);
-%}
-
-// Store Aligned Packed Chars/Shorts in Double register to memory
-instruct storeA4C(memory mem, regD src) %{
-  match(Set mem (Store4C mem src));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STDF   $src,$mem\t! packed4C" %}
-  opcode(Assembler::stdf_op3);
-  ins_encode(simple_form3_mem_reg( mem, src ) );
-  ins_pipe(fstoreD_mem_reg);
-%}
-
-// Store Zero into Aligned Packed Chars/Shorts
-instruct storeA4C0(memory mem, immI0 zero) %{
-  match(Set mem (Store4C mem (Replicate4C zero)));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STX    $zero,$mem\t! packed4C" %}
-  opcode(Assembler::stx_op3);
-  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
-  ins_pipe(fstoreD_mem_zero);
-%}
-
-// Store Aligned Packed Ints in Double register to memory
-instruct storeA2I(memory mem, regD src) %{
-  match(Set mem (Store2I mem src));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STDF   $src,$mem\t! packed2I" %}
-  opcode(Assembler::stdf_op3);
-  ins_encode(simple_form3_mem_reg( mem, src ) );
-  ins_pipe(fstoreD_mem_reg);
-%}
-
-// Store Zero into Aligned Packed Ints
-instruct storeA2I0(memory mem, immI0 zero) %{
-  match(Set mem (Store2I mem zero));
-  ins_cost(MEMORY_REF_COST);
-  size(4);
-  format %{ "STX    $zero,$mem\t! packed2I" %}
-  opcode(Assembler::stx_op3);
-  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
-  ins_pipe(fstoreD_mem_zero);
-%}
-
-
 //----------MemBar Instructions-----------------------------------------------
 // Memory barrier flavors
 
@@ -8892,150 +8813,6 @@
   ins_pipe(ialu_reg_imm);
 %}
 
-// Replicate scalar to packed byte values in Double register
-instruct Repl8B_reg_helper(iRegL dst, iRegI src) %{
-  effect(DEF dst, USE src);
-  format %{ "SLLX  $src,56,$dst\n\t"
-            "SRLX  $dst, 8,O7\n\t"
-            "OR    $dst,O7,$dst\n\t"
-            "SRLX  $dst,16,O7\n\t"
-            "OR    $dst,O7,$dst\n\t"
-            "SRLX  $dst,32,O7\n\t"
-            "OR    $dst,O7,$dst\t! replicate8B" %}
-  ins_encode( enc_repl8b(src, dst));
-  ins_pipe(ialu_reg);
-%}
-
-// Replicate scalar to packed byte values in Double register
-instruct Repl8B_reg(stackSlotD dst, iRegI src) %{
-  match(Set dst (Replicate8B src));
-  expand %{
-    iRegL tmp;
-    Repl8B_reg_helper(tmp, src);
-    regL_to_stkD(dst, tmp);
-  %}
-%}
-
-// Replicate scalar constant to packed byte values in Double register
-instruct Repl8B_immI(regD dst, immI13 con, o7RegI tmp) %{
-  match(Set dst (Replicate8B con));
-  effect(KILL tmp);
-  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl8B($con)" %}
-  ins_encode %{
-    // XXX This is a quick fix for 6833573.
-    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 8, 1)), $dst$$FloatRegister);
-    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 8, 1)), $tmp$$Register);
-    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
-  %}
-  ins_pipe(loadConFD);
-%}
-
-// Replicate scalar to packed char values into stack slot
-instruct Repl4C_reg_helper(iRegL dst, iRegI src) %{
-  effect(DEF dst, USE src);
-  format %{ "SLLX  $src,48,$dst\n\t"
-            "SRLX  $dst,16,O7\n\t"
-            "OR    $dst,O7,$dst\n\t"
-            "SRLX  $dst,32,O7\n\t"
-            "OR    $dst,O7,$dst\t! replicate4C" %}
-  ins_encode( enc_repl4s(src, dst) );
-  ins_pipe(ialu_reg);
-%}
-
-// Replicate scalar to packed char values into stack slot
-instruct Repl4C_reg(stackSlotD dst, iRegI src) %{
-  match(Set dst (Replicate4C src));
-  expand %{
-    iRegL tmp;
-    Repl4C_reg_helper(tmp, src);
-    regL_to_stkD(dst, tmp);
-  %}
-%}
-
-// Replicate scalar constant to packed char values in Double register
-instruct Repl4C_immI(regD dst, immI con, o7RegI tmp) %{
-  match(Set dst (Replicate4C con));
-  effect(KILL tmp);
-  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl4C($con)" %}
-  ins_encode %{
-    // XXX This is a quick fix for 6833573.
-    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 4, 2)), $dst$$FloatRegister);
-    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 4, 2)), $tmp$$Register);
-    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
-  %}
-  ins_pipe(loadConFD);
-%}
-
-// Replicate scalar to packed short values into stack slot
-instruct Repl4S_reg_helper(iRegL dst, iRegI src) %{
-  effect(DEF dst, USE src);
-  format %{ "SLLX  $src,48,$dst\n\t"
-            "SRLX  $dst,16,O7\n\t"
-            "OR    $dst,O7,$dst\n\t"
-            "SRLX  $dst,32,O7\n\t"
-            "OR    $dst,O7,$dst\t! replicate4S" %}
-  ins_encode( enc_repl4s(src, dst) );
-  ins_pipe(ialu_reg);
-%}
-
-// Replicate scalar to packed short values into stack slot
-instruct Repl4S_reg(stackSlotD dst, iRegI src) %{
-  match(Set dst (Replicate4S src));
-  expand %{
-    iRegL tmp;
-    Repl4S_reg_helper(tmp, src);
-    regL_to_stkD(dst, tmp);
-  %}
-%}
-
-// Replicate scalar constant to packed short values in Double register
-instruct Repl4S_immI(regD dst, immI con, o7RegI tmp) %{
-  match(Set dst (Replicate4S con));
-  effect(KILL tmp);
-  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl4S($con)" %}
-  ins_encode %{
-    // XXX This is a quick fix for 6833573.
-    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 4, 2)), $dst$$FloatRegister);
-    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 4, 2)), $tmp$$Register);
-    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
-  %}
-  ins_pipe(loadConFD);
-%}
-
-// Replicate scalar to packed int values in Double register
-instruct Repl2I_reg_helper(iRegL dst, iRegI src) %{
-  effect(DEF dst, USE src);
-  format %{ "SLLX  $src,32,$dst\n\t"
-            "SRLX  $dst,32,O7\n\t"
-            "OR    $dst,O7,$dst\t! replicate2I" %}
-  ins_encode( enc_repl2i(src, dst));
-  ins_pipe(ialu_reg);
-%}
-
-// Replicate scalar to packed int values in Double register
-instruct Repl2I_reg(stackSlotD dst, iRegI src) %{
-  match(Set dst (Replicate2I src));
-  expand %{
-    iRegL tmp;
-    Repl2I_reg_helper(tmp, src);
-    regL_to_stkD(dst, tmp);
-  %}
-%}
-
-// Replicate scalar zero constant to packed int values in Double register
-instruct Repl2I_immI(regD dst, immI con, o7RegI tmp) %{
-  match(Set dst (Replicate2I con));
-  effect(KILL tmp);
-  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl2I($con)" %}
-  ins_encode %{
-    // XXX This is a quick fix for 6833573.
-    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 2, 4)), $dst$$FloatRegister);
-    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 2, 4)), $tmp$$Register);
-    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
-  %}
-  ins_pipe(loadConFD);
-%}
-
 //----------Control Flow Instructions------------------------------------------
 // Compare Instructions
 // Compare Integers
@@ -10754,6 +10531,267 @@
   ins_pipe(istore_mem_reg);
 %}
 
+// ====================VECTOR INSTRUCTIONS=====================================
+
+// Load Aligned Packed values into a Double Register
+instruct loadV8(regD dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 8);
+  match(Set dst (LoadVector mem));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "LDDF   $mem,$dst\t! load vector (8 bytes)" %}
+  opcode(Assembler::lddf_op3);
+  ins_encode(simple_form3_mem_reg( mem, dst ) );
+  ins_pipe(floadD_mem);
+%}
+
+// Store Vector in Double register to memory
+instruct storeV8(memory mem, regD src) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem src));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STDF   $src,$mem\t! store vector (8 bytes)" %}
+  opcode(Assembler::stdf_op3);
+  ins_encode(simple_form3_mem_reg( mem, src ) );
+  ins_pipe(fstoreD_mem_reg);
+%}
+
+// Store Zero into vector in memory
+instruct storeV8B_zero(memory mem, immI0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateB zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (8 bytes)" %}
+  opcode(Assembler::stx_op3);
+  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
+  ins_pipe(fstoreD_mem_zero);
+%}
+
+instruct storeV4C_zero(memory mem, immI0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateC zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (4 chars)" %}
+  opcode(Assembler::stx_op3);
+  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
+  ins_pipe(fstoreD_mem_zero);
+%}
+
+instruct storeV4S_zero(memory mem, immI0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateS zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (4 shorts)" %}
+  opcode(Assembler::stx_op3);
+  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
+  ins_pipe(fstoreD_mem_zero);
+%}
+
+instruct storeV2I_zero(memory mem, immI0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateI zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (2 ints)" %}
+  opcode(Assembler::stx_op3);
+  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
+  ins_pipe(fstoreD_mem_zero);
+%}
+
+instruct storeV2F_zero(memory mem, immF0 zero) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);
+  match(Set mem (StoreVector mem (ReplicateF zero)));
+  ins_cost(MEMORY_REF_COST);
+  size(4);
+  format %{ "STX    $zero,$mem\t! store zero vector (2 floats)" %}
+  opcode(Assembler::stx_op3);
+  ins_encode(simple_form3_mem_reg( mem, R_G0 ) );
+  ins_pipe(fstoreD_mem_zero);
+%}
+
+// Replicate scalar to packed byte values in Double register
+instruct Repl8B_reg_helper(iRegL dst, iRegI src) %{
+  effect(DEF dst, USE src);
+  format %{ "SLLX  $src,56,$dst\n\t"
+            "SRLX  $dst, 8,O7\n\t"
+            "OR    $dst,O7,$dst\n\t"
+            "SRLX  $dst,16,O7\n\t"
+            "OR    $dst,O7,$dst\n\t"
+            "SRLX  $dst,32,O7\n\t"
+            "OR    $dst,O7,$dst\t! replicate8B" %}
+  ins_encode( enc_repl8b(src, dst));
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar to packed byte values in Double register
+instruct Repl8B_reg(stackSlotD dst, iRegI src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB src));
+  expand %{
+    iRegL tmp;
+    Repl8B_reg_helper(tmp, src);
+    regL_to_stkD(dst, tmp);
+  %}
+%}
+
+// Replicate scalar constant to packed byte values in Double register
+instruct Repl8B_immI(regD dst, immI13 con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateB con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl8B($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 8, 1)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 8, 1)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
+// Replicate scalar to packed char values into stack slot
+instruct Repl4C_reg_helper(iRegL dst, iRegI src) %{
+  effect(DEF dst, USE src);
+  format %{ "SLLX  $src,48,$dst\n\t"
+            "SRLX  $dst,16,O7\n\t"
+            "OR    $dst,O7,$dst\n\t"
+            "SRLX  $dst,32,O7\n\t"
+            "OR    $dst,O7,$dst\t! replicate4C" %}
+  ins_encode( enc_repl4s(src, dst) );
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar to packed char values into stack slot
+instruct Repl4C_reg(stackSlotD dst, iRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateC src));
+  expand %{
+    iRegL tmp;
+    Repl4C_reg_helper(tmp, src);
+    regL_to_stkD(dst, tmp);
+  %}
+%}
+
+// Replicate scalar constant to packed char values in Double register
+instruct Repl4C_immI(regD dst, immI con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateC con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl4C($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 4, 2)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 4, 2)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
+// Replicate scalar to packed short values into stack slot
+instruct Repl4S_reg_helper(iRegL dst, iRegI src) %{
+  effect(DEF dst, USE src);
+  format %{ "SLLX  $src,48,$dst\n\t"
+            "SRLX  $dst,16,O7\n\t"
+            "OR    $dst,O7,$dst\n\t"
+            "SRLX  $dst,32,O7\n\t"
+            "OR    $dst,O7,$dst\t! replicate4S" %}
+  ins_encode( enc_repl4s(src, dst) );
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar to packed short values into stack slot
+instruct Repl4S_reg(stackSlotD dst, iRegI src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS src));
+  expand %{
+    iRegL tmp;
+    Repl4S_reg_helper(tmp, src);
+    regL_to_stkD(dst, tmp);
+  %}
+%}
+
+// Replicate scalar constant to packed short values in Double register
+instruct Repl4S_immI(regD dst, immI con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateS con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl4S($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 4, 2)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 4, 2)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
+// Replicate scalar to packed int values in Double register
+instruct Repl2I_reg_helper(iRegL dst, iRegI src) %{
+  effect(DEF dst, USE src);
+  format %{ "SLLX  $src,32,$dst\n\t"
+            "SRLX  $dst,32,O7\n\t"
+            "OR    $dst,O7,$dst\t! replicate2I" %}
+  ins_encode( enc_repl2i(src, dst));
+  ins_pipe(ialu_reg);
+%}
+
+// Replicate scalar to packed int values in Double register
+instruct Repl2I_reg(stackSlotD dst, iRegI src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI src));
+  expand %{
+    iRegL tmp;
+    Repl2I_reg_helper(tmp, src);
+    regL_to_stkD(dst, tmp);
+  %}
+%}
+
+// Replicate scalar constant to packed int values in Double register
+instruct Repl2I_immI(regD dst, immI con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl2I($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immI($con$$constant, 2, 4)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immI($con$$constant, 2, 4)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
+// Replicate scalar to packed float values into stack slot
+instruct Repl2F_reg(stackSlotD dst, regF src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  ins_cost(MEMORY_REF_COST*2);
+  format %{ "STF    $src,$dst.hi\t! packed2F\n\t"
+            "STF    $src,$dst.lo" %}
+  opcode(Assembler::stf_op3);
+  ins_encode(simple_form3_mem_reg(dst, src), form3_mem_plus_4_reg(dst, src));
+  ins_pipe(fstoreF_stk_reg);
+%}
+
+// Replicate scalar constant to packed float values in Double register
+instruct Repl2F_immF(regD dst, immF con, o7RegI tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF con));
+  effect(KILL tmp);
+  format %{ "LDDF   [$constanttablebase + $constantoffset],$dst\t! load from constant table: Repl2F($con)" %}
+  ins_encode %{
+    // XXX This is a quick fix for 6833573.
+    //__ ldf(FloatRegisterImpl::D, $constanttablebase, $constantoffset(replicate_immF($con$$constant)), $dst$$FloatRegister);
+    RegisterOrConstant con_offset = __ ensure_simm13_or_reg($constantoffset(replicate_immF($con$$constant)), $tmp$$Register);
+    __ ldf(FloatRegisterImpl::D, $constanttablebase, con_offset, as_DoubleFloatRegister($dst$$reg));
+  %}
+  ins_pipe(loadConFD);
+%}
+
 //----------PEEPHOLE RULES-----------------------------------------------------
 // These must follow all instruction definitions as they use the names
 // defined in the instructions definitions.
--- old/src/cpu/sparc/vm/vm_version_sparc.cpp	Sat Jun  2 20:03:56 2012
+++ new/src/cpu/sparc/vm/vm_version_sparc.cpp	Sat Jun  2 20:03:56 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -217,6 +217,8 @@
   // Currently not supported anywhere.
   FLAG_SET_DEFAULT(UseFPUForSpilling, false);
 
+  MaxVectorSize = 8;
+
   assert((InteriorEntryAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");
 #endif
 
--- old/src/cpu/x86/vm/assembler_x86.cpp	Sat Jun  2 20:03:57 2012
+++ new/src/cpu/x86/vm/assembler_x86.cpp	Sat Jun  2 20:03:57 2012
@@ -1637,6 +1637,13 @@
   emit_byte(0xC0 | encode);
 }
 
+void Assembler::movlhps(XMMRegister dst, XMMRegister src) {  // emits MOVLHPS, register-to-register form
+  NOT_LP64(assert(VM_Version::supports_sse(), ""));  // SSE support check, wrapped in NOT_LP64
+  int encode = simd_prefix_and_encode(dst, src, src, VEX_SIMD_NONE);  // src is passed for both the second and third operand
+  emit_byte(0x16);  // opcode byte
+  emit_byte(0xC0 | encode);  // operand byte built from the register encoding returned above
+}
+
 void Assembler::movb(Register dst, Address src) {
   NOT_LP64(assert(dst->has_byte_register(), "must have byte register"));
   InstructionMark im(this);
@@ -1686,6 +1693,14 @@
   emit_operand(dst, src);
 }
 
+void Assembler::movdl(Address dst, XMMRegister src) {  // store form: XMM register source, memory destination
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));  // SSE2 support check, wrapped in NOT_LP64
+  InstructionMark im(this);
+  simd_prefix(dst, src, VEX_SIMD_66);  // prefix bytes for the memory/register operand pair
+  emit_byte(0x7E);  // opcode byte
+  emit_operand(src, dst);  // register + memory operand bytes
+}
+
 void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   int encode = simd_prefix_and_encode(dst, src, VEX_SIMD_66);
@@ -1716,6 +1731,35 @@
   emit_operand(src, dst);
 }
 
+// Move Unaligned 256bit Vector (register to register)
+void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
+  assert(UseAVX, "");
+  bool vector256 = true;  // this emitter always requests the 256-bit form
+  int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_F3, vector256);  // xnoreg: no nds operand
+  emit_byte(0x6F);  // opcode byte
+  emit_byte(0xC0 | encode);  // operand byte built from the register encoding returned above
+}
+
+void Assembler::vmovdqu(XMMRegister dst, Address src) {  // load form: memory source, register destination
+  assert(UseAVX, "");
+  InstructionMark im(this);
+  bool vector256 = true;  // this emitter always requests the 256-bit form
+  vex_prefix(dst, xnoreg, src, VEX_SIMD_F3, vector256);  // xnoreg: no nds operand
+  emit_byte(0x6F);  // opcode byte
+  emit_operand(dst, src);  // register + memory operand bytes
+}
+
+void Assembler::vmovdqu(Address dst, XMMRegister src) {  // store form: register source, memory destination
+  assert(UseAVX, "");
+  InstructionMark im(this);
+  bool vector256 = true;  // this emitter always requests the 256-bit form
+  // swap src<->dst for encoding: the register is passed first and the memory operand last
+  assert(src != xnoreg, "sanity");
+  vex_prefix(src, xnoreg, dst, VEX_SIMD_F3, vector256);
+  emit_byte(0x7F);  // opcode byte of the store form (the register-destination forms emit 0x6F)
+  emit_operand(src, dst);  // register + memory operand bytes
+}
+
 // Uses zero extension on 64bit
 
 void Assembler::movl(Register dst, int32_t imm32) {
@@ -3112,6 +3156,13 @@
   emit_operand(dst, src);
 }
 
+void Assembler::vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {  // register-register form; vector256 is forwarded to the prefix encoder
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256);  // VEX_SIMD_66 prefix (the vxorps emitter uses VEX_SIMD_NONE)
+  emit_byte(0x57);  // opcode byte
+  emit_byte(0xC0 | encode);  // operand byte built from the register encoding returned above
+}
+
 void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
@@ -3120,6 +3171,30 @@
   emit_operand(dst, src);
 }
 
+void Assembler::vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {  // register-register form; vector256 is forwarded to the prefix encoder
+  assert(VM_Version::supports_avx(), "");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_NONE, vector256);  // VEX_SIMD_NONE prefix (the vxorpd emitter uses VEX_SIMD_66)
+  emit_byte(0x57);  // opcode byte
+  emit_byte(0xC0 | encode);  // operand byte built from the register encoding returned above
+}
+
+void Assembler::vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {  // always inserts into the upper 128 bits (immediate 0x01 below)
+  assert(VM_Version::supports_avx(), "");
+  bool vector256 = true;  // this emitter always requests the 256-bit form
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);  // explicitly selects VEX_OPCODE_0F_3A
+  emit_byte(0x18);  // opcode byte
+  emit_byte(0xC0 | encode);  // operand byte built from the register encoding returned above
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
+  emit_byte(0x01);  // trailing immediate byte: upper half
+}
+
+void Assembler::vzeroupper() {  // takes no operands
+  assert(VM_Version::supports_avx(), "");
+  (void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);  // called only to emit the prefix; the returned encoding is discarded
+  emit_byte(0x77);  // opcode byte; no operand byte follows
+}
+
 
 #ifndef _LP64
 // 32bit only pieces of the assembler
--- old/src/cpu/x86/vm/assembler_x86.hpp	Sat Jun  2 20:03:58 2012
+++ new/src/cpu/x86/vm/assembler_x86.hpp	Sat Jun  2 20:03:58 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -591,8 +591,9 @@
 
   void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
                   VexSimdPrefix pre, bool vector256 = false) {
-     vex_prefix(src, nds->encoding(), dst->encoding(),
-                pre, VEX_OPCODE_0F, false, vector256);
+    int dst_enc = dst->encoding();
+    int nds_enc = nds->is_valid() ? nds->encoding() : 0;  // an invalid nds (presumably xnoreg, as the vmovdqu emitters pass) is encoded as 0
+    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);  // forwards with VEX_OPCODE_0F and 'false' for the remaining flag
   }
 
   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
@@ -600,9 +601,12 @@
                              bool vex_w, bool vector256);
 
   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
-                             VexSimdPrefix pre, bool vector256 = false) {
-     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
-                                  pre, VEX_OPCODE_0F, false, vector256);
+                             VexSimdPrefix pre, bool vector256 = false,
+                             VexOpcode opc = VEX_OPCODE_0F) {  // opc defaults to the previously hard-coded VEX_OPCODE_0F
+    int src_enc = src->encoding();
+    int dst_enc = dst->encoding();
+    int nds_enc = nds->is_valid() ? nds->encoding() : 0;  // an invalid nds (presumably xnoreg, as vmovdqu passes) is encoded as 0
+    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);  // delegates to the int-encoding overload
   }
 
   void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
@@ -1261,6 +1265,7 @@
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
   void movdl(XMMRegister dst, Address src);
+  void movdl(Address dst, XMMRegister src);
 
   // Move Double Quadword
   void movdq(XMMRegister dst, Register src);
@@ -1274,6 +1279,14 @@
   void movdqu(XMMRegister dst, Address src);
   void movdqu(XMMRegister dst, XMMRegister src);
 
+  // Move Unaligned 256bit Vector
+  void vmovdqu(Address dst, XMMRegister src);
+  void vmovdqu(XMMRegister dst, Address src);
+  void vmovdqu(XMMRegister dst, XMMRegister src);
+
+  // Move lower 64bit to high 64bit in 128bit register
+  void movlhps(XMMRegister dst, XMMRegister src);
+
   void movl(Register dst, int32_t imm32);
   void movl(Address dst, int32_t imm32);
   void movl(Register dst, Register src);
@@ -1615,7 +1628,18 @@
   void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
   void vxorps(XMMRegister dst, XMMRegister nds, Address src);
 
+  // AVX Vector instructions.
+  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
 
+  // AVX instruction which is used to clear upper 128 bits of YMM registers and
+  // to avoid transition penalty between AVX and SSE states. There is no
+  // penalty if legacy SSE instructions are encoded using VEX prefix because
+  // they always clear upper 128 bits. It should be used before calling
+  // runtime code and native libraries.
+  void vzeroupper();
+
  protected:
   // Next instructions require address alignment 16 bytes SSE mode.
   // They should be called only from corresponding MacroAssembler instructions.
@@ -2527,9 +2551,13 @@
   void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
   void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
+  // AVX Vector instructions
+
+  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorpd(dst, nds, src, vector256); }
   void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
   void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
+  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) { Assembler::vxorps(dst, nds, src, vector256); }
   void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
   void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
 
--- old/src/cpu/x86/vm/register_x86.cpp	Sat Jun  2 20:03:58 2012
+++ new/src/cpu/x86/vm/register_x86.cpp	Sat Jun  2 20:03:58 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -35,7 +35,7 @@
 const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr +
                                                                  2 * FloatRegisterImpl::number_of_registers;
 const int ConcreteRegisterImpl::max_xmm = ConcreteRegisterImpl::max_fpr +
-                                                                 2 * XMMRegisterImpl::number_of_registers;
+                                                                 8 * XMMRegisterImpl::number_of_registers;
 const char* RegisterImpl::name() const {
   const char* names[number_of_registers] = {
 #ifndef AMD64
--- old/src/cpu/x86/vm/register_x86.hpp	Sat Jun  2 20:03:59 2012
+++ new/src/cpu/x86/vm/register_x86.hpp	Sat Jun  2 20:03:59 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -158,7 +158,7 @@
   XMMRegister successor() const                          { return as_XMMRegister(encoding() + 1); }
 
   // accessors
-  int   encoding() const                          { assert(is_valid(), "invalid register"); return (intptr_t)this; }
+  int   encoding() const                          { assert(is_valid(), err_msg("invalid register (%d)", (int)(intptr_t)this )); return (intptr_t)this; }
   bool  is_valid() const                          { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; }
   const char* name() const;
 };
@@ -216,7 +216,7 @@
                                RegisterImpl::number_of_registers +  // "H" half of a 64bit register
 #endif // AMD64
                            2 * FloatRegisterImpl::number_of_registers +
-                           2 * XMMRegisterImpl::number_of_registers +
+                           8 * XMMRegisterImpl::number_of_registers +
                            1 // eflags
   };
 
--- old/src/cpu/x86/vm/vm_version_x86.cpp	Sat Jun  2 20:03:59 2012
+++ new/src/cpu/x86/vm/vm_version_x86.cpp	Sat Jun  2 20:03:59 2012
@@ -467,6 +467,32 @@
   if (!supports_avx ()) // Drop to 0 if no AVX  support
     UseAVX = 0;
 
+#ifdef COMPILER2
+  if (UseFPUForSpilling) {
+    if (UseSSE < 2) {
+      // Only supported with SSE2+
+      FLAG_SET_DEFAULT(UseFPUForSpilling, false);
+    }
+  }
+  if (MaxVectorSize > 0) {
+    if (!is_power_of_2(MaxVectorSize)) {
+      warning("MaxVectorSize must be a power of 2");
+      FLAG_SET_DEFAULT(MaxVectorSize, 32);
+    }
+    if (MaxVectorSize > 32) {
+      FLAG_SET_DEFAULT(MaxVectorSize, 32);
+    }
+    if (MaxVectorSize > 16 && UseAVX == 0) {
+      // Only supported with AVX+
+      FLAG_SET_DEFAULT(MaxVectorSize, 16);
+    }
+    if (UseSSE < 2) {
+      // Only supported with SSE2+
+      FLAG_SET_DEFAULT(MaxVectorSize, 0);
+    }
+  }
+#endif
+
   // On new cpus instructions which update whole XMM register should be used
   // to prevent partial register stall due to dependencies on high half.
   //
@@ -544,6 +570,12 @@
       }
     }
 
+#ifdef COMPILER2
+    if (MaxVectorSize > 16) {
+      // Limit vectors size to 16 bytes on current AMD cpus.
+      FLAG_SET_DEFAULT(MaxVectorSize, 16);
+    }
+#endif // COMPILER2
   }
 
   if( is_intel() ) { // Intel cpus specific settings
@@ -606,15 +638,6 @@
     FLAG_SET_DEFAULT(UsePopCountInstruction, false);
   }
 
-#ifdef COMPILER2
-  if (UseFPUForSpilling) {
-    if (UseSSE < 2) {
-      // Only supported with SSE2+
-      FLAG_SET_DEFAULT(UseFPUForSpilling, false);
-    }
-  }
-#endif
-
   assert(0 <= ReadPrefetchInstr && ReadPrefetchInstr <= 3, "invalid value");
   assert(0 <= AllocatePrefetchInstr && AllocatePrefetchInstr <= 3, "invalid value");
 
--- old/src/cpu/x86/vm/vmreg_x86.cpp	Sat Jun  2 20:04:00 2012
+++ new/src/cpu/x86/vm/vmreg_x86.cpp	Sat Jun  2 20:04:00 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -48,8 +48,9 @@
 
   XMMRegister xreg = ::as_XMMRegister(0);
   for ( ; i < ConcreteRegisterImpl::max_xmm ; ) {
-    regName[i++] = xreg->name();
-    regName[i++] = xreg->name();
+    for (int j = 0 ; j < 8 ; j++) {
+      regName[i++] = xreg->name();
+    }
     xreg = xreg->successor();
   }
   for ( ; i < ConcreteRegisterImpl::number_of_registers ; i ++ ) {
--- old/src/cpu/x86/vm/vmreg_x86.inline.hpp	Sat Jun  2 20:04:00 2012
+++ new/src/cpu/x86/vm/vmreg_x86.inline.hpp	Sat Jun  2 20:04:00 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -39,7 +39,7 @@
 }
 
 inline VMReg XMMRegisterImpl::as_VMReg() {
-  return VMRegImpl::as_VMReg((encoding() << 1) + ConcreteRegisterImpl::max_fpr);
+  return VMRegImpl::as_VMReg((encoding() << 3) + ConcreteRegisterImpl::max_fpr);
 }
 
 
@@ -75,7 +75,7 @@
 inline XMMRegister VMRegImpl::as_XMMRegister() {
   assert( is_XMMRegister() && is_even(value()), "must be" );
   // Yuk
-  return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 1);
+  return ::as_XMMRegister((value() - ConcreteRegisterImpl::max_fpr) >> 3);
 }
 
 inline   bool VMRegImpl::is_concrete() {
--- old/src/cpu/x86/vm/x86.ad	Sat Jun  2 20:04:01 2012
+++ new/src/cpu/x86/vm/x86.ad	Sat Jun  2 20:04:01 2012
@@ -24,6 +24,456 @@
 
 // X86 Common Architecture Description File
 
+//----------REGISTER DEFINITION BLOCK------------------------------------------
+// This information is used by the matcher and the register allocator to
+// describe individual registers and classes of registers within the target
+// architecture.
+
+register %{
+//----------Architecture Description Register Definitions----------------------
+// General Registers
+// "reg_def"  name ( register save type, C convention save type,
+//                   ideal register type, encoding );
+// Register Save Types:
+//
+// NS  = No-Save:       The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method, &
+//                      that they do not need to be saved at call sites.
+//
+// SOC = Save-On-Call:  The register allocator assumes that these registers
+//                      can be used without saving upon entry to the method,
+//                      but that they must be saved at call sites.
+//
+// SOE = Save-On-Entry: The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, but they do not need to be saved at call
+//                      sites.
+//
+// AS  = Always-Save:   The register allocator assumes that these registers
+//                      must be saved before using them upon entry to the
+//                      method, & that they must be saved at call sites.
+//
+// Ideal Register Type is used to determine how to save & restore a
+// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
+// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
+//
+// The encoding number is the actual bit-pattern placed into the opcodes.
+
+// XMM registers.  256-bit registers or 8 words each, labeled (a)-h.
+// Word a in each register holds a Float, words ab hold a Double.
+// The whole registers are used in SSE4.2 version intrinsics,
+// array copy stubs and superword operations (see UseSSE42Intrinsics,
+// UseXMMForArrayCopy and UseSuperword flags).
+// XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
+// Linux ABI:   No register preserved across function calls
+//              XMM0-XMM7 might hold parameters
+// Windows ABI: XMM6-XMM15 preserved across function calls
+//              XMM0-XMM3 might hold parameters
+
+reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
+reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next());
+reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next());
+reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next());
+reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next());
+reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
+reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next());
+reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next());
+reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next());
+reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next());
+reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
+reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next());
+reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next());
+reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next());
+reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next());
+reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
+reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next());
+reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next());
+reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next());
+reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next());
+reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
+reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next());
+reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next());
+reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next());
+reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next());
+reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
+reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next());
+reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next());
+reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next());
+reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next());
+reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#ifdef _WIN64
+
+reg_def XMM6 ( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg());
+reg_def XMM6b( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next());
+reg_def XMM6c( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next());
+reg_def XMM6d( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next());
+reg_def XMM6e( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next());
+reg_def XMM6f( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM6g( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM6h( SOC, SOE, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM7 ( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg());
+reg_def XMM7b( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next());
+reg_def XMM7c( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next());
+reg_def XMM7d( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next());
+reg_def XMM7e( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next());
+reg_def XMM7f( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM7g( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM7h( SOC, SOE, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM8 ( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg());
+reg_def XMM8b( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next());
+reg_def XMM8c( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next());
+reg_def XMM8d( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next());
+reg_def XMM8e( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next());
+reg_def XMM8f( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM8g( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM8h( SOC, SOE, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM9 ( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg());
+reg_def XMM9b( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next());
+reg_def XMM9c( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next());
+reg_def XMM9d( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next());
+reg_def XMM9e( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next());
+reg_def XMM9f( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM9g( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM9h( SOC, SOE, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM10 ( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
+reg_def XMM10b( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next());
+reg_def XMM10c( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next());
+reg_def XMM10d( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next());
+reg_def XMM10e( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next());
+reg_def XMM10f( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM10g( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM10h( SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM11 ( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
+reg_def XMM11b( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next());
+reg_def XMM11c( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next());
+reg_def XMM11d( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next());
+reg_def XMM11e( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next());
+reg_def XMM11f( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM11g( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM11h( SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM12 ( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
+reg_def XMM12b( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next());
+reg_def XMM12c( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next());
+reg_def XMM12d( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next());
+reg_def XMM12e( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next());
+reg_def XMM12f( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM12g( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM12h( SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM13 ( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
+reg_def XMM13b( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next());
+reg_def XMM13c( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next());
+reg_def XMM13d( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next());
+reg_def XMM13e( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next());
+reg_def XMM13f( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM13g( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM13h( SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM14 ( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
+reg_def XMM14b( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next());
+reg_def XMM14c( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next());
+reg_def XMM14d( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next());
+reg_def XMM14e( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next());
+reg_def XMM14f( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM14g( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM14h( SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM15 ( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
+reg_def XMM15b( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next());
+reg_def XMM15c( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next());
+reg_def XMM15d( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next());
+reg_def XMM15e( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next());
+reg_def XMM15f( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM15g( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM15h( SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#else // _WIN64
+
+reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
+reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next());
+reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next());
+reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next());
+reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next());
+reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
+reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next());
+reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next());
+reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next());
+reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next());
+reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#ifdef _LP64
+
+reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
+reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next());
+reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next());
+reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next());
+reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next());
+reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
+reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next());
+reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next());
+reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next());
+reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next());
+reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
+reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next());
+reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next());
+reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next());
+reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next());
+reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
+reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next());
+reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next());
+reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next());
+reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next());
+reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
+reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next());
+reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next());
+reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next());
+reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next());
+reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
+reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next());
+reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next());
+reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next());
+reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next());
+reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
+reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next());
+reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next());
+reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next());
+reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next());
+reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
+reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next());
+reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next());
+reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next());
+reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next());
+reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next());
+reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next());
+reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next()->next()->next()->next()->next()->next()->next());
+
+#endif // _LP64
+
+#endif // _WIN64
+
+#ifdef _LP64
+reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
+#else
+reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
+#endif // _LP64
+
+alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
+                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
+                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
+                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
+                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
+                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
+                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
+                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
+#ifdef _LP64
+                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
+                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
+                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
+                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
+                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
+                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
+                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
+                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
+#endif
+                   );
+
+// flags allocation class should be last.
+alloc_class chunk2(RFLAGS);
+
+// Singleton class for condition codes
+reg_class int_flags(RFLAGS);
+
+// Class for all float registers
+reg_class float_reg(XMM0,
+                    XMM1,
+                    XMM2,
+                    XMM3,
+                    XMM4,
+                    XMM5,
+                    XMM6,
+                    XMM7
+#ifdef _LP64
+                   ,XMM8,
+                    XMM9,
+                    XMM10,
+                    XMM11,
+                    XMM12,
+                    XMM13,
+                    XMM14,
+                    XMM15
+#endif
+                    );
+
+// Class for all double registers
+reg_class double_reg(XMM0,  XMM0b,
+                     XMM1,  XMM1b,
+                     XMM2,  XMM2b,
+                     XMM3,  XMM3b,
+                     XMM4,  XMM4b,
+                     XMM5,  XMM5b,
+                     XMM6,  XMM6b,
+                     XMM7,  XMM7b
+#ifdef _LP64
+                    ,XMM8,  XMM8b,
+                     XMM9,  XMM9b,
+                     XMM10, XMM10b,
+                     XMM11, XMM11b,
+                     XMM12, XMM12b,
+                     XMM13, XMM13b,
+                     XMM14, XMM14b,
+                     XMM15, XMM15b
+#endif
+                     );
+
+// Class for all 32bit vector registers
+reg_class vectors_reg(XMM0,
+                      XMM1,
+                      XMM2,
+                      XMM3,
+                      XMM4,
+                      XMM5,
+                      XMM6,
+                      XMM7
+#ifdef _LP64
+                     ,XMM8,
+                      XMM9,
+                      XMM10,
+                      XMM11,
+                      XMM12,
+                      XMM13,
+                      XMM14,
+                      XMM15
+#endif
+                      );
+
+// Class for all 64bit vector registers
+reg_class vectord_reg(XMM0,  XMM0b,
+                      XMM1,  XMM1b,
+                      XMM2,  XMM2b,
+                      XMM3,  XMM3b,
+                      XMM4,  XMM4b,
+                      XMM5,  XMM5b,
+                      XMM6,  XMM6b,
+                      XMM7,  XMM7b
+#ifdef _LP64
+                     ,XMM8,  XMM8b,
+                      XMM9,  XMM9b,
+                      XMM10, XMM10b,
+                      XMM11, XMM11b,
+                      XMM12, XMM12b,
+                      XMM13, XMM13b,
+                      XMM14, XMM14b,
+                      XMM15, XMM15b
+#endif
+                      );
+
+// Class for all 128bit vector registers
+reg_class vectorx_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,
+                      XMM10, XMM10b, XMM10c, XMM10d,
+                      XMM11, XMM11b, XMM11c, XMM11d,
+                      XMM12, XMM12b, XMM12c, XMM12d,
+                      XMM13, XMM13b, XMM13c, XMM13d,
+                      XMM14, XMM14b, XMM14c, XMM14d,
+                      XMM15, XMM15b, XMM15c, XMM15d
+#endif
+                      );
+
+// Class for all 256bit vector registers
+reg_class vectory_reg(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
+                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
+                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
+                      XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
+                      XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
+                      XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
+                      XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
+                      XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
+#ifdef _LP64
+                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
+                      XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
+                      XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
+                      XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
+                      XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
+                      XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
+                      XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
+                      XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
+#endif
+                      );
+
+%}
+
 source %{
   // Float masks come from different places depending on platform.
 #ifdef _LP64
@@ -38,7 +488,253 @@
   static address double_signflip() { return (address)double_signflip_pool; }
 #endif
 
+// Map Types to machine register types (Type::lastype entries; presumably indexed by the Type base kind - confirm against Type)
+const int Matcher::base2reg[Type::lastype] = {
+  Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
+  Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
+  Op_VecS, Op_VecD, Op_VecX, Op_VecY, /* Vectors: 4, 8, 16 and 32 bytes */
+  Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
+  0, 0/*abio*/,
+  Op_RegP /* Return address */, 0, /* the memories */
+  Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
+  0  /*bottom*/
+};
+
+// Widest vector, in bytes, usable for elements of type bt; 0 if vectors of bt are not supported.
+const int Matcher::vector_width_in_bytes(BasicType bt) {
+  assert(is_java_primitive(bt), "only primitive type vectors");
+  if (UseSSE < 2) return 0;  // vectors need at least SSE2
+  // SSE2: 128bit vectors for all types. AVX2: 256bit vectors for all types.
+  // AVX1: 256bit vectors only for FLOAT and DOUBLE.
+  const bool is_fp    = (bt == T_FLOAT || bt == T_DOUBLE);
+  const bool wide_256 = (UseAVX > 1) || (UseAVX > 0 && is_fp);
+  const int  width    = MIN2(wide_256 ? 32 : 16, (int)MaxVectorSize);  // flag caps the width
+  // Narrowest acceptable width: 2 values per vector (or 4 for bytes).
+  int min_width = 0;
+  switch (bt) {
+  case T_DOUBLE:
+  case T_LONG:
+    min_width = 16;
+    break;
+  case T_FLOAT:
+  case T_INT:
+    min_width = 8;
+    break;
+  case T_BOOLEAN:
+  case T_BYTE:
+  case T_CHAR:
+  case T_SHORT:
+    min_width = 4;
+    break;
+  default:
+    ShouldNotReachHere();
+  }
+  return (width < min_width) ? 0 : width;
+}
+
+// Limits on vector size (number of elements) loaded into vector: max here, min in min_vector_size().
+const int Matcher::max_vector_size(const BasicType bt) {
+  return vector_width_in_bytes(bt)/type2aelembytes(bt);  // width in bytes / element size; 0 when the width is 0
+}
+const int Matcher::min_vector_size(const BasicType bt) {
+  // The smallest vector load is 4 bytes: 4 elements for 1-byte types,
+  // 2 elements otherwise; never more than the maximum for this type.
+  const int wanted = (type2aelembytes(bt) == 1) ? 4 : 2;
+  return MIN2(wanted, max_vector_size(bt));
+}
+
+// Vector ideal reg corresponding to the specified size in bytes.
+const int Matcher::vector_ideal_reg(int size) {
+  assert(MaxVectorSize >= size, "");
+  // Only 4, 8, 16 and 32 byte vectors have an ideal register type.
+  int ideal_reg = 0;
+  if      (size ==  4) ideal_reg = Op_VecS;
+  else if (size ==  8) ideal_reg = Op_VecD;
+  else if (size == 16) ideal_reg = Op_VecX;
+  else if (size == 32) ideal_reg = Op_VecY;
+  else ShouldNotReachHere();
+  return ideal_reg;
+}
+
+// x86 supports misaligned vector stores/loads, so they are allowed unless AlignVector is set.
+const bool Matcher::misaligned_vectors_ok() {
+  return !AlignVector; // can be changed by flag
+}
+
+// Helper methods for MachSpillCopyNode::implementation(). vec_mov_helper: vector register-to-register copy.
+static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
+                          int src_hi, int dst_hi, uint ireg, outputStream* st) {
+  // Emits the move when cbuf != NULL and returns the emitted size; otherwise returns 4.
+  // The 64-bit VM gets sizes by emitting into a scratch buffer, so do_size is 32-bit only.
+  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
+  assert(ireg == Op_VecS || // 32bit vector
+         (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
+         (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
+         "no non-adjacent vector moves" );
+  if (cbuf) {  // emit the instruction and measure it
+    MacroAssembler _masm(cbuf);
+    int offset = __ offset();  // start position, used to compute the emitted size
+    switch (ireg) {
+    case Op_VecS: // copy whole register
+    case Op_VecD:
+    case Op_VecX:
+      __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      break;
+    case Op_VecY:  // 32-byte vector uses the AVX form
+      __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    int size = __ offset() - offset;
+#ifdef ASSERT
+    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+    assert(!do_size || size == 4, "incorrect size calculattion");
+#endif
+    return size;
 #ifndef PRODUCT
+  } else if (!do_size) {  // no code buffer and no sizing request: print the instruction to st
+    switch (ireg) {
+    case Op_VecS:
+    case Op_VecD:
+    case Op_VecX:
+      st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
+      break;
+    case Op_VecY:
+      st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+#endif
+  }
+  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
+  return 4;
+}
+
+static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,  // vector move between reg and [rsp + stack_offset]
+                            int stack_offset, int reg, uint ireg, outputStream* st) {
+  // Emits the load/store when cbuf != NULL and returns the emitted size; otherwise returns 5 + displacement bytes.
+  // The 64-bit VM gets sizes by emitting into a scratch buffer, so do_size is 32-bit only.
+  LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
+  if (cbuf) {  // emit the instruction and measure it
+    MacroAssembler _masm(cbuf);
+    int offset = __ offset();  // start position, used to compute the emitted size
+    if (is_load) {  // stack slot -> register; instruction chosen by vector size
+      switch (ireg) {
+      case Op_VecS:
+        __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecD:
+        __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecX:
+        __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      case Op_VecY:
+        __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    } else { // store
+      switch (ireg) {
+      case Op_VecS:
+        __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecD:
+        __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecX:
+        __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      case Op_VecY:
+        __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    }
+    int size = __ offset() - offset;
+#ifdef ASSERT
+    int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);  // displacement: none, 1 byte or 4 bytes
+    // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+    assert(!do_size || size == (5+offset_size), "incorrect size calculattion");
+#endif
+    return size;
+#ifndef PRODUCT
+  } else if (!do_size) {  // no code buffer and no sizing request: print the instruction to st
+    if (is_load) {
+      switch (ireg) {
+      case Op_VecS:
+        st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      case Op_VecD:
+        st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+       case Op_VecX:
+        st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      case Op_VecY:
+        st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    } else { // store
+      switch (ireg) {
+      case Op_VecS:
+        st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      case Op_VecD:
+        st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+       case Op_VecX:
+        st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      case Op_VecY:
+        st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
+        break;
+      default:
+        ShouldNotReachHere();
+      }
+    }
+#endif
+  }
+  int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : 4);  // displacement: none, 1 byte or 4 bytes
+  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
+  return 5+offset_size;
+}
+
+static inline jfloat replicate4_imm(int con, int width) {
+  // Take the low "width" bytes of con, repeat them until 32 bits are filled,
+  // and return that bit pattern as a jfloat.
+  assert(width == 1 || width == 2, "only byte or short types here");
+  const int elem_bits = width * 8;
+  jint bits = con & ((1 << elem_bits) - 1);  // keep one element, drop the sign bits
+  // Every pass doubles the filled part of the word.
+  for (int filled = elem_bits; filled < 32; filled <<= 1) {
+    bits |= (bits << filled);
+  }
+  // Same 32 bits, viewed as a float.
+  return *((jfloat*) &bits);
+}
+
+static inline jdouble replicate8_imm(int con, int width) {
+  // Take the low "width" bytes of con, repeat them until 64 bits are filled,
+  // and return that bit pattern as a jdouble.
+  assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
+  const int elem_bits = width * 8;
+  jlong bits = ((jlong) con) & ((((jlong) 1) << elem_bits) - 1);  // keep one element, drop the sign bits
+  // Every pass doubles the filled part of the word.
+  for (int filled = elem_bits; filled < 64; filled <<= 1) {
+    bits |= (bits << filled);
+  }
+  // Same 64 bits, viewed as a double.
+  return *((jdouble*) &bits);
+}
+
+#ifndef PRODUCT
   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
     st->print("nop \t# %d bytes pad for loops and calls", _count);
   }
@@ -103,6 +799,46 @@
 
 %}
 
+
+//----------OPERANDS-----------------------------------------------------------
+// Operand definitions must precede instruction definitions for correct parsing
+// in the ADLC because operands constitute user defined types which are used in
+// instruction definitions.
+
+// Vectors: one register operand per vector ideal type (VecS, VecD, VecX, VecY)
+operand vecS() %{
+  constraint(ALLOC_IN_RC(vectors_reg));  // allocate from the vectors_reg class
+  match(VecS);  // ideal type used for 4-byte vectors
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecD() %{
+  constraint(ALLOC_IN_RC(vectord_reg));  // allocate from the vectord_reg class
+  match(VecD);  // ideal type used for 8-byte vectors
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecX() %{
+  constraint(ALLOC_IN_RC(vectorx_reg));  // allocate from the vectorx_reg class
+  match(VecX);  // ideal type used for 16-byte vectors
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+operand vecY() %{
+  constraint(ALLOC_IN_RC(vectory_reg));  // allocate from the 256bit vectory_reg class
+  match(VecY);  // ideal type used for 32-byte vectors
+
+  format %{ %}
+  interface(REG_INTER);
+%}
+
+
 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 
 // ============================================================================
@@ -852,3 +1588,942 @@
   ins_pipe(pipe_slow);
 %}
 
+
+// ====================VECTOR INSTRUCTIONS=====================================
+
+// Load vectors (4 bytes long)
+instruct loadV4(vecS dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 4);  // only 4-byte vector loads
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);  // 32-bit load into the XMM register
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (8 bytes long)
+instruct loadV8(vecD dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 8);  // only 8-byte vector loads
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);  // 64-bit load into the XMM register
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (16 bytes long)
+instruct loadV16(vecX dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 16);  // only 16-byte vector loads
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
+  ins_encode %{
+    __ movdqu($dst$$XMMRegister, $mem$$Address);  // unaligned 128-bit load
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Load vectors (32 bytes long)
+instruct loadV32(vecY dst, memory mem) %{
+  predicate(n->as_LoadVector()->memory_size() == 32);  // only 32-byte vector loads
+  match(Set dst (LoadVector mem));
+  ins_cost(125);
+  format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
+  ins_encode %{
+    __ vmovdqu($dst$$XMMRegister, $mem$$Address);  // unaligned load of the 32-byte vector (AVX form)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Store vectors (one instruct per vector size in bytes: 4, 8, 16, 32)
+instruct storeV4(memory mem, vecS src) %{
+  predicate(n->as_StoreVector()->memory_size() == 4);  // only 4-byte vector stores
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
+  ins_encode %{
+    __ movdl($mem$$Address, $src$$XMMRegister);  // 32-bit store from the XMM register
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV8(memory mem, vecD src) %{
+  predicate(n->as_StoreVector()->memory_size() == 8);  // only 8-byte vector stores
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
+  ins_encode %{
+    __ movq($mem$$Address, $src$$XMMRegister);  // 64-bit store from the XMM register
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV16(memory mem, vecX src) %{
+  predicate(n->as_StoreVector()->memory_size() == 16);  // only 16-byte vector stores
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
+  ins_encode %{
+    __ movdqu($mem$$Address, $src$$XMMRegister);  // unaligned 128-bit store
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct storeV32(memory mem, vecY src) %{
+  predicate(n->as_StoreVector()->memory_size() == 32);  // only 32-byte vector stores
+  match(Set mem (StoreVector mem src));
+  ins_cost(145);
+  format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
+  ins_encode %{
+    __ vmovdqu($mem$$Address, $src$$XMMRegister);  // unaligned store of the 32-byte vector (AVX form)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate byte scalar (in a general register) to be vector
+instruct Repl4B(vecS dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 bytes
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);      // interleave with itself: low byte doubled into a word
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 bytes
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate8B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);      // interleave with itself: low byte doubled into a word
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 bytes
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\t! replicate16B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);      // interleave with itself: low byte doubled into a word
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);        // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 32);  // vector of 32 bytes
+  match(Set dst (ReplicateB src));
+  format %{ "movd    $dst,$src\n\t"
+            "punpcklbw $dst,$dst\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate32B" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);      // interleave with itself: low byte doubled into a word
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);        // copy the low 64 bits into the high 64 bits
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate byte scalar immediate to be vector by loading from const table.
+instruct Repl4B_imm(vecS dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 bytes
+  match(Set dst (ReplicateB con));
+  format %{ "movss   $dst,[$constantaddress]\t! replicate4B($con)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));  // load the 32-bit pattern of repeated bytes
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 bytes
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8B($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));  // load the 64-bit pattern of repeated bytes
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 bytes
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate16B($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));  // load the 64-bit pattern of repeated bytes
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 32);  // vector of 32 bytes
+  match(Set dst (ReplicateB con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate32B($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));  // load the 64-bit pattern of repeated bytes
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate byte scalar zero to be vector (no constant load needed: the register is simply cleared)
+instruct Repl4B_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 bytes
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8B_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 bytes
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16B_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 bytes
+  match(Set dst (ReplicateB zero));
+  format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl32B_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 32);  // vector of 32 bytes
+  match(Set dst (ReplicateB zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate32B zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;  // request the 256-bit form
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate short (2 byte) scalar (in a general register) to be vector
+instruct Repl2S(vecS dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 shorts
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate2S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 shorts
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 shorts
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\t! replicate8S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);        // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 shorts
+  match(Set dst (ReplicateS src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate16S" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);        // copy the low 64 bits into the high 64 bits
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate short (2 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2S_imm(vecS dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 shorts
+  match(Set dst (ReplicateS con));
+  format %{ "movss   $dst,[$constantaddress]\t! replicate2S($con)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));  // load the 32-bit pattern of repeated shorts
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 shorts
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4S($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));  // load the 64-bit pattern of repeated shorts
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 shorts
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8S($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));  // load the 64-bit pattern of repeated shorts
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 shorts
+  match(Set dst (ReplicateS con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate16S($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));  // load the 64-bit pattern of repeated shorts
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate short (2 byte) scalar zero to be vector (the register is simply cleared)
+instruct Repl2S_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 shorts
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 shorts
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8S_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 shorts
+  match(Set dst (ReplicateS zero));
+  format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16S_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 shorts
+  match(Set dst (ReplicateS zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate16S zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;  // request the 256-bit form
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate char (2 byte) scalar (in a general register) to be vector
+instruct Repl2C(vecS dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 chars
+  match(Set dst (ReplicateC src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate2C" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4C(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 chars
+  match(Set dst (ReplicateC src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\t! replicate4C" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8C(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 chars
+  match(Set dst (ReplicateC src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\t! replicate8C" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);        // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16C(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 chars
+  match(Set dst (ReplicateC src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshuflw $dst,$dst,0x00\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate16C" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);             // move the 32-bit register into the XMM register
+    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy word 0 into all four low words
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);        // copy the low 64 bits into the high 64 bits
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate char (2 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2C_imm(vecS dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 chars
+  match(Set dst (ReplicateC con));
+  format %{ "movss   $dst,[$constantaddress]\t! replicate2C($con)" %}
+  ins_encode %{
+    __ movflt($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));  // load the 32-bit pattern of repeated chars
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4C_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 chars
+  match(Set dst (ReplicateC con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4C($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));  // load the 64-bit pattern of repeated chars
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8C_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 chars
+  match(Set dst (ReplicateC con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8C($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));  // load the 64-bit pattern of repeated chars
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl16C_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 chars
+  match(Set dst (ReplicateC con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate16C($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));  // load the 64-bit pattern of repeated chars
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate char (2 byte) scalar zero to be vector (the register is simply cleared)
+instruct Repl2C_zero(vecS dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 chars
+  match(Set dst (ReplicateC zero));
+  format %{ "pxor    $dst,$dst\t! replicate2C zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4C_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 chars
+  match(Set dst (ReplicateC zero));
+  format %{ "pxor    $dst,$dst\t! replicate4C zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8C_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 chars
+  match(Set dst (ReplicateC zero));
+  format %{ "pxor    $dst,$dst\t! replicate8C zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl16C_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 16);  // vector of 16 chars
+  match(Set dst (ReplicateC zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate16C zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;  // request the 256-bit form
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);  // xor with itself clears the register
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate integer (4 byte) scalar (in a general register) to be vector
+instruct Repl2I(vecD dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 ints
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);            // move the 32-bit register into the XMM register
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy dword 0 into all four dwords
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I(vecX dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 ints
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);            // move the 32-bit register into the XMM register
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy dword 0 into all four dwords
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I(vecY dst, rRegI src) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 ints
+  match(Set dst (ReplicateI src));
+  format %{ "movd    $dst,$src\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);            // move the 32-bit register into the XMM register
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy dword 0 into all four dwords
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2I_imm(vecD dst, immI con) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 ints
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate2I($con)" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));  // load the 64-bit pattern of repeated ints
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_imm(vecX dst, immI con) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 ints
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4I($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));  // load the 64-bit pattern of repeated ints
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm(vecY dst, immI con) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 ints
+  match(Set dst (ReplicateI con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));  // load the 64-bit pattern of repeated ints
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Integer can be loaded into an xmm register directly from memory, skipping the general register.
+instruct Repl2I_mem(vecD dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);  // vector of 2 ints
+  match(Set dst (ReplicateI mem));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate2I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);             // 32-bit load into the XMM register
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy dword 0 into all four dwords
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4);  // vector of 4 ints
+  match(Set dst (ReplicateI mem));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\t! replicate4I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);             // 32-bit load into the XMM register
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy dword 0 into all four dwords
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 8);  // vector of 8 ints
+  match(Set dst (ReplicateI mem));
+  format %{ "movd    $dst,$mem\n\t"
+            "pshufd  $dst,$dst,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8I" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $mem$$Address);             // 32-bit load into the XMM register
+    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);  // copy dword 0 into all four dwords
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // low 128 bits into the high 128 bits (going by the helper name)
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate integer (4 byte) scalar zero to be vector
+instruct Repl2I_zero(vecD dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateI zero));
+  format %{ "pxor    $dst,$dst\t! replicate2I zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // XOR the register with itself: all lanes become zero
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_zero(vecX dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateI zero));
+  format %{ "pxor    $dst,$dst\t! replicate4I zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // XOR the register with itself: all lanes become zero
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8I_zero(vecY dst, immI0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateI zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate8I zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;  // request the 256-bit form so the whole vecY register is cleared
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);  // dst = dst XOR dst = 0
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl2L(vecX dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "movlhps $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);  // move the 64-bit general register into the low 64 bits of dst
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L(vecY dst, rRegL src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL src));
+  format %{ "movdq   $dst,$src\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdq($dst$$XMMRegister, $src$$Register);  // move the 64-bit general register into the low 64 bits of dst
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into bits 64..127
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 128 bits into the upper 128 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "movlhps $dst,$dst\t! replicate2L"%}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);  // low 32 bits of the long (src.lo in the format)
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));  // high 32 bits of the long (src.hi in the format)
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);  // interleave the low dwords: low 64 bits of dst = hi:lo
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL src));
+  effect(TEMP dst, USE src, TEMP tmp);
+  format %{ "movdl   $dst,$src.lo\n\t"
+            "movdl   $tmp,$src.hi\n\t"
+            "punpckldq $dst,$tmp\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movdl($dst$$XMMRegister, $src$$Register);  // low 32 bits of the long (src.lo in the format)
+    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));  // high 32 bits of the long (src.hi in the format)
+    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);  // interleave the low dwords: low 64 bits of dst = hi:lo
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into bits 64..127
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 128 bits into the upper 128 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2L_imm(vecX dst, immL con) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate2L($con)\n\t"
+            "movlhps $dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress($con));  // load the 8-byte long constant from the constant table
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_imm(vecY dst, immL con) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL con));
+  format %{ "movsd   $dst,[$constantaddress]\t! replicate4L($con)\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst" %}
+  ins_encode %{
+    __ movdbl($dst$$XMMRegister, $constantaddress($con));  // load the 8-byte long constant from the constant table
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into bits 64..127
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 128 bits into the upper 128 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Long could be loaded into xmm register directly from memory.
+instruct Repl2L_mem(vecX dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "movlhps $dst,$dst\t! replicate2L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);  // the matched LoadL is folded into this 64-bit load from memory
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into the high 64 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem(vecY dst, memory mem) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL (LoadL mem)));
+  format %{ "movq    $dst,$mem\n\t"
+            "movlhps $dst,$dst\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4L" %}
+  ins_encode %{
+    __ movq($dst$$XMMRegister, $mem$$Address);  // the matched LoadL is folded into this 64-bit load from memory
+    __ movlhps($dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 64 bits into bits 64..127
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 128 bits into the upper 128 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate long (8 byte) scalar zero to be vector
+instruct Repl2L_zero(vecX dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateL zero));
+  format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
+  ins_encode %{
+    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);  // XOR the register with itself: all lanes become zero
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4L_zero(vecY dst, immL0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateL zero));
+  format %{ "vxorpd  $dst,$dst,$dst\t! replicate4L zero" %}
+  ins_encode %{
+    // Use vxorpd since AVX does not have vpxor for 256-bit (AVX2 will have it).
+    bool vector256 = true;  // request the 256-bit form so the whole vecY register is cleared
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);  // dst = dst XOR dst = 0
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate float (4 byte) scalar to be vector
+instruct Repl2F(vecD dst, regF src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\t! replicate2F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);  // 0x00 selects dword 0 of src (the float) for every lane of dst
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F(vecX dst, regF src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\t! replicate4F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);  // 0x00 selects dword 0 of src (the float) for all four lanes of dst
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F(vecY dst, regF src) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateF src));
+  format %{ "pshufd  $dst,$src,0x00\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);  // 0x00 selects dword 0 of src (the float) for the four low lanes of dst
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 128 bits into the upper 128 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate float (4 byte) scalar zero to be vector
+instruct Repl2F_zero(vecD dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);  // XOR the register with itself: all lanes become zero
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4F_zero(vecX dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateF zero));
+  format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);  // XOR the register with itself: all lanes become zero
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_zero(vecY dst, immF0 zero) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (ReplicateF zero));
+  format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
+  ins_encode %{
+    bool vector256 = true;  // request the 256-bit form so the whole vecY register is cleared
+    __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);  // dst = dst XOR dst = 0
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate double (8 bytes) scalar to be vector
+instruct Repl2D(vecX dst, regD src) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);  // 0x44 selects dwords 0,1,0,1: the low 64 bits of src land in both halves of dst
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D(vecY dst, regD src) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateD src));
+  format %{ "pshufd  $dst,$src,0x44\n\t"
+            "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+  ins_encode %{
+    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);  // 0x44 selects dwords 0,1,0,1: the low 64 bits of src land in both halves of the low 128 bits
+    __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);  // copy the low 128 bits into the upper 128 bits
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Replicate double (8 byte) scalar zero to be vector
+instruct Repl2D_zero(vecX dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (ReplicateD zero));
+  format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
+  ins_encode %{
+    __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);  // XOR the register with itself: all lanes become zero
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_zero(vecY dst, immD0 zero) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (ReplicateD zero));
+  format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
+  ins_encode %{
+    bool vector256 = true;  // request the 256-bit form so the whole vecY register is cleared
+    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);  // dst = dst XOR dst = 0
+  %}
+  ins_pipe( fpu_reg_reg );
+%}
+
--- old/src/cpu/x86/vm/x86_32.ad	Sat Jun  2 20:04:02 2012
+++ new/src/cpu/x86/vm/x86_32.ad	Sat Jun  2 20:04:01 2012
@@ -74,9 +74,6 @@
 reg_def EAX(SOC, SOC, Op_RegI, 0, rax->as_VMReg());
 reg_def ESP( NS,  NS, Op_RegI, 4, rsp->as_VMReg());
 
-// Special Registers
-reg_def EFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
-
 // Float registers.  We treat TOS/FPR0 special.  It is invisible to the
 // allocator, and only shows up in the encodings.
 reg_def FPR0L( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad());
@@ -105,27 +102,6 @@
 reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg());
 reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());
 
-// XMM registers.  128-bit registers or 4 words each, labeled a-d.
-// Word a in each register holds a Float, words ab hold a Double.
-// We currently do not use the SIMD capabilities, so registers cd
-// are unused at the moment.
-reg_def XMM0a( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
-reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next());
-reg_def XMM1a( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
-reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next());
-reg_def XMM2a( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
-reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next());
-reg_def XMM3a( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
-reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next());
-reg_def XMM4a( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
-reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next());
-reg_def XMM5a( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
-reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next());
-reg_def XMM6a( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
-reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next());
-reg_def XMM7a( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
-reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next());
-
 // Specify priority of register selection within phases of register
 // allocation.  Highest priority is first.  A useful heuristic is to
 // give registers a low priority when they are required by machine
@@ -138,16 +114,7 @@
                     FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H,
                     FPR6L, FPR6H, FPR7L, FPR7H );
 
-alloc_class chunk1( XMM0a, XMM0b,
-                    XMM1a, XMM1b,
-                    XMM2a, XMM2b,
-                    XMM3a, XMM3b,
-                    XMM4a, XMM4b,
-                    XMM5a, XMM5b,
-                    XMM6a, XMM6b,
-                    XMM7a, XMM7b, EFLAGS);
 
-
 //----------Architecture Description Register Classes--------------------------
 // Several register classes are automatically defined based upon information in
 // this architecture description.
@@ -159,12 +126,12 @@
 // Class for all registers
 reg_class any_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX, ESP);
 // Class for general registers
-reg_class e_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX);
+reg_class int_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX);
 // Class for general registers which may be used for implicit null checks on win95
 // Also safe for use by tailjump. We don't want to allocate in rbp,
-reg_class e_reg_no_rbp(EAX, EDX, EDI, ESI, ECX, EBX);
+reg_class int_reg_no_rbp(EAX, EDX, EDI, ESI, ECX, EBX);
 // Class of "X" registers
-reg_class x_reg(EBX, ECX, EDX, EAX);
+reg_class int_x_reg(EBX, ECX, EDX, EAX);
 // Class of registers that can appear in an address with no offset.
 // EBP and ESP require an extra instruction byte for zero offset.
 // Used in fast-unlock
@@ -193,8 +160,6 @@
 reg_class sp_reg(ESP);
 // Singleton class for instruction pointer
 // reg_class ip_reg(EIP);
-// Singleton class for condition codes
-reg_class int_flags(EFLAGS);
 // Class of integer register pairs
 reg_class long_reg( EAX,EDX, ECX,EBX, EBP,EDI );
 // Class of integer register pairs that aligns with calling convention
@@ -206,29 +171,18 @@
 // Floating point registers.  Notice FPR0 is not a choice.
 // FPR0 is not ever allocated; we use clever encodings to fake
 // a 2-address instructions out of Intels FP stack.
-reg_class flt_reg( FPR1L,FPR2L,FPR3L,FPR4L,FPR5L,FPR6L,FPR7L );
+reg_class fp_flt_reg( FPR1L,FPR2L,FPR3L,FPR4L,FPR5L,FPR6L,FPR7L );
 
-// make a register class for SSE registers
-reg_class xmm_reg(XMM0a, XMM1a, XMM2a, XMM3a, XMM4a, XMM5a, XMM6a, XMM7a);
+reg_class fp_dbl_reg( FPR1L,FPR1H, FPR2L,FPR2H, FPR3L,FPR3H,
+                      FPR4L,FPR4H, FPR5L,FPR5H, FPR6L,FPR6H,
+                      FPR7L,FPR7H );
 
-// make a double register class for SSE2 registers
-reg_class xdb_reg(XMM0a,XMM0b, XMM1a,XMM1b, XMM2a,XMM2b, XMM3a,XMM3b,
-                  XMM4a,XMM4b, XMM5a,XMM5b, XMM6a,XMM6b, XMM7a,XMM7b );
+reg_class fp_flt_reg0( FPR1L );
+reg_class fp_dbl_reg0( FPR1L,FPR1H );
+reg_class fp_dbl_reg1( FPR2L,FPR2H );
+reg_class fp_dbl_notreg0( FPR2L,FPR2H, FPR3L,FPR3H, FPR4L,FPR4H,
+                          FPR5L,FPR5H, FPR6L,FPR6H, FPR7L,FPR7H );
 
-reg_class dbl_reg( FPR1L,FPR1H, FPR2L,FPR2H, FPR3L,FPR3H,
-                   FPR4L,FPR4H, FPR5L,FPR5H, FPR6L,FPR6H,
-                   FPR7L,FPR7H );
-
-reg_class flt_reg0( FPR1L );
-reg_class dbl_reg0( FPR1L,FPR1H );
-reg_class dbl_reg1( FPR2L,FPR2H );
-reg_class dbl_notreg0( FPR2L,FPR2H, FPR3L,FPR3H, FPR4L,FPR4H,
-                       FPR5L,FPR5H, FPR6L,FPR6H, FPR7L,FPR7H );
-
-// XMM6 and XMM7 could be used as temporary registers for long, float and
-// double values for SSE2.
-reg_class xdb_reg6( XMM6a,XMM6b );
-reg_class xdb_reg7( XMM7a,XMM7b );
 %}
 
 
@@ -412,7 +366,7 @@
   }
 }
 
-   // eRegI ereg, memory mem) %{    // emit_reg_mem
+   // rRegI ereg, memory mem) %{    // emit_reg_mem
 void encode_RegMem( CodeBuffer &cbuf, int reg_encoding, int base, int index, int scale, int displace, bool displace_is_oop ) {
   // There is no index & no scale, use form without SIB byte
   if ((index == 0x4) &&
@@ -787,7 +741,7 @@
 #endif
   }
   int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
-  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes.
+  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
   return size+5+offset_size;
 }
 
@@ -821,7 +775,7 @@
     }
 #endif
   }
-  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes.
+  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
   // Only MOVAPS SSE prefix uses 1 byte.
   int sz = 4;
   if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) &&
@@ -903,6 +857,108 @@
   return impl_helper(cbuf,do_size,false,offset,st_op,op,op_str,size, st);
 }
 
+// Next two methods are shared by 32- and 64-bit VM. They are defined in x86.ad.
+static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
+                          int src_hi, int dst_hi, uint ireg, outputStream* st);
+
+static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
+                            int stack_offset, int reg, uint ireg, outputStream* st);
+
+static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_offset,
+                                     int dst_offset, uint ireg, outputStream* st) {
+  int calc_size = 0;  // predicted encoding size in bytes of the stack-to-stack vector copy; also the return value when nothing is emitted
+  int src_offset_size = (src_offset == 0) ? 0 : ((src_offset < 0x80) ? 1 : 4);  // displacement bytes: none, disp8 or disp32
+  int dst_offset_size = (dst_offset == 0) ? 0 : ((dst_offset < 0x80) ? 1 : 4);
+  switch (ireg) {
+  case Op_VecS:
+    calc_size = 3+src_offset_size + 3+dst_offset_size;
+    break;
+  case Op_VecD:
+    calc_size = 3+src_offset_size + 3+dst_offset_size;
+    // Size the second push/pop pair (offsets +4) WITHOUT modifying src_offset and
+    // dst_offset: both are reused below to emit and to print the instructions.
+    src_offset_size = (src_offset+4 == 0) ? 0 : ((src_offset+4 < 0x80) ? 1 : 4);
+    dst_offset_size = (dst_offset+4 == 0) ? 0 : ((dst_offset+4 < 0x80) ? 1 : 4);
+    calc_size += 3+src_offset_size + 3+dst_offset_size;
+    break;
+  case Op_VecX:
+    calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
+    break;
+  case Op_VecY:
+    calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
+    break;
+  default:
+    ShouldNotReachHere();
+  }
+  if (cbuf) {  // a code buffer was supplied: emit the copy and check the prediction
+    MacroAssembler _masm(cbuf);
+    int offset = __ offset();
+    switch (ireg) {
+    case Op_VecS:
+      __ pushl(Address(rsp, src_offset));
+      __ popl (Address(rsp, dst_offset));
+      break;
+    case Op_VecD:
+      __ pushl(Address(rsp, src_offset));  // first 32-bit half
+      __ popl (Address(rsp, dst_offset));
+      __ pushl(Address(rsp, src_offset+4));  // second 32-bit half
+      __ popl (Address(rsp, dst_offset+4));
+      break;
+    case Op_VecX:
+      __ movdqu(Address(rsp, -16), xmm0);  // save xmm0 below rsp; it serves as the copy temporary
+      __ movdqu(xmm0, Address(rsp, src_offset));
+      __ movdqu(Address(rsp, dst_offset), xmm0);
+      __ movdqu(xmm0, Address(rsp, -16));  // restore xmm0
+      break;
+    case Op_VecY:
+      __ vmovdqu(Address(rsp, -32), xmm0);  // save xmm0 below rsp; it serves as the copy temporary
+      __ vmovdqu(xmm0, Address(rsp, src_offset));
+      __ vmovdqu(Address(rsp, dst_offset), xmm0);
+      __ vmovdqu(xmm0, Address(rsp, -32));  // restore xmm0
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+    int size = __ offset() - offset;
+    assert(size == calc_size, "incorrect size calculation");
+    return size;
+#ifndef PRODUCT
+  } else if (!do_size) {  // no code buffer and not a pure size query: print the instructions instead
+    switch (ireg) {
+    case Op_VecS:
+      st->print("pushl   [rsp + #%d]\t# 32-bit mem-mem spill\n\t"
+                "popl    [rsp + #%d]",
+                src_offset, dst_offset);
+      break;
+    case Op_VecD:
+      st->print("pushl   [rsp + #%d]\t# 64-bit mem-mem spill\n\t"
+                "popl    [rsp + #%d]\n\t"
+                "pushl   [rsp + #%d]\n\t"
+                "popl    [rsp + #%d]",
+                src_offset, dst_offset, src_offset+4, dst_offset+4);
+      break;
+    case Op_VecX:
+      st->print("movdqu  [rsp - #16], xmm0\t# 128-bit mem-mem spill\n\t"
+                "movdqu  xmm0, [rsp + #%d]\n\t"
+                "movdqu  [rsp + #%d], xmm0\n\t"
+                "movdqu  xmm0, [rsp - #16]",
+                src_offset, dst_offset);
+      break;
+    case Op_VecY:
+      st->print("vmovdqu [rsp - #32], xmm0\t# 256-bit mem-mem spill\n\t"
+                "vmovdqu xmm0, [rsp + #%d]\n\t"
+                "vmovdqu [rsp + #%d], xmm0\n\t"
+                "vmovdqu xmm0, [rsp - #32]",
+                src_offset, dst_offset);
+      break;
+    default:
+      ShouldNotReachHere();
+    }
+#endif
+  }
+  return calc_size;
+}
+
 uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bool do_size, outputStream* st ) const {
   // Get registers to move
   OptoReg::Name src_second = ra_->get_reg_second(in(1));
@@ -923,6 +979,29 @@
   if( src_first == dst_first && src_second == dst_second )
     return size;            // Self copy, no move
 
+  if (bottom_type()->isa_vect() != NULL) {
+    uint ireg = ideal_reg();
+    assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
+    assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity");
+    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
+    if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
+      // mem -> mem
+      int src_offset = ra_->reg2offset(src_first);
+      int dst_offset = ra_->reg2offset(dst_first);
+      return vec_stack_to_stack_helper(cbuf, do_size, src_offset, dst_offset, ireg, st);
+    } else if (src_first_rc == rc_xmm && dst_first_rc == rc_xmm ) {
+      return vec_mov_helper(cbuf, do_size, src_first, dst_first, src_second, dst_second, ireg, st);
+    } else if (src_first_rc == rc_xmm && dst_first_rc == rc_stack ) {
+      int stack_offset = ra_->reg2offset(dst_first);
+      return vec_spill_helper(cbuf, do_size, false, stack_offset, src_first, ireg, st);
+    } else if (src_first_rc == rc_stack && dst_first_rc == rc_xmm ) {
+      int stack_offset = ra_->reg2offset(src_first);
+      return vec_spill_helper(cbuf, do_size, true,  stack_offset, dst_first, ireg, st);
+    } else {
+      ShouldNotReachHere();
+    }
+  }
+
   // --------------------------------------
   // Check for mem-mem move.  push/pop to move.
   if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
@@ -1313,16 +1392,6 @@
   return true;
 }
 
-// Vector width in bytes
-const uint Matcher::vector_width_in_bytes(void) {
-  return UseSSE >= 2 ? 8 : 0;
-}
-
-// Vector ideal reg
-const uint Matcher::vector_ideal_reg(void) {
-  return Op_RegD;
-}
-
 // Is this branch offset short enough that a short branch can be used?
 //
 // NOTE: If the platform does not provide any short branch variants, then
@@ -1452,7 +1521,7 @@
 // arguments in those registers not be available to the callee.
 bool Matcher::can_be_java_arg( int reg ) {
   if(  reg == ECX_num   || reg == EDX_num   ) return true;
-  if( (reg == XMM0a_num || reg == XMM1a_num) && UseSSE>=1 ) return true;
+  if( (reg == XMM0_num  || reg == XMM1_num ) && UseSSE>=1 ) return true;
   if( (reg == XMM0b_num || reg == XMM1b_num) && UseSSE>=2 ) return true;
   return false;
 }
@@ -1565,16 +1634,16 @@
     emit_opcode(cbuf,0x66);
   %}
 
-  enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
+  enc_class RegReg (rRegI dst, rRegI src) %{    // RegReg(Many)
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
-  enc_class OpcRegReg (immI opcode, eRegI dst, eRegI src) %{    // OpcRegReg(Many)
+  enc_class OpcRegReg (immI opcode, rRegI dst, rRegI src) %{    // OpcRegReg(Many)
     emit_opcode(cbuf,$opcode$$constant);
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
-  enc_class mov_r32_imm0( eRegI dst ) %{
+  enc_class mov_r32_imm0( rRegI dst ) %{
     emit_opcode( cbuf, 0xB8 + $dst$$reg ); // 0xB8+ rd   -- MOV r32  ,imm32
     emit_d32   ( cbuf, 0x0  );             //                         imm32==0x0
   %}
@@ -1621,7 +1690,7 @@
   %}
 
   // Dense encoding for older common ops
-  enc_class Opc_plus(immI opcode, eRegI reg) %{
+  enc_class Opc_plus(immI opcode, rRegI reg) %{
     emit_opcode(cbuf, $opcode$$constant + $reg$$reg);
   %}
 
@@ -1637,7 +1706,7 @@
     }
   %}
 
-  enc_class OpcSErm (eRegI dst, immI imm) %{    // OpcSEr/m
+  enc_class OpcSErm (rRegI dst, immI imm) %{    // OpcSEr/m
     // Emit primary opcode and set sign-extend bit
     // Check for 8-bit immediate, and set sign extend bit in opcode
     if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
@@ -1682,7 +1751,7 @@
     else                               emit_d32(cbuf,con);
   %}
 
-  enc_class OpcSReg (eRegI dst) %{    // BSWAP
+  enc_class OpcSReg (rRegI dst) %{    // BSWAP
     emit_cc(cbuf, $secondary, $dst$$reg );
   %}
 
@@ -1700,7 +1769,7 @@
     emit_rm(cbuf, 0x3, destlo, desthi);
   %}
 
-  enc_class RegOpc (eRegI div) %{    // IDIV, IMOD, JMP indirect, ...
+  enc_class RegOpc (rRegI div) %{    // IDIV, IMOD, JMP indirect, ...
     emit_rm(cbuf, 0x3, $secondary, $div$$reg );
   %}
 
@@ -1891,13 +1960,13 @@
 //                 runtime_call_Relocation::spec(), RELOC_IMM32 );
 //   %}
 
-  enc_class RegOpcImm (eRegI dst, immI8 shift) %{    // SHL, SAR, SHR
+  enc_class RegOpcImm (rRegI dst, immI8 shift) %{    // SHL, SAR, SHR
     $$$emit8$primary;
     emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
     $$$emit8$shift$$constant;
   %}
 
-  enc_class LdImmI (eRegI dst, immI src) %{    // Load Immediate
+  enc_class LdImmI (rRegI dst, immI src) %{    // Load Immediate
     // Load immediate does not have a zero or sign extended version
     // for 8-bit immediates
     emit_opcode(cbuf, 0xB8 + $dst$$reg);
@@ -1904,7 +1973,7 @@
     $$$emit32$src$$constant;
   %}
 
-  enc_class LdImmP (eRegI dst, immI src) %{    // Load Immediate
+  enc_class LdImmP (rRegI dst, immI src) %{    // Load Immediate
     // Load immediate does not have a zero or sign extended version
     // for 8-bit immediates
     emit_opcode(cbuf, $primary + $dst$$reg);
@@ -1943,15 +2012,15 @@
 
 
   // Encode a reg-reg copy.  If it is useless, then empty encoding.
-  enc_class enc_Copy( eRegI dst, eRegI src ) %{
+  enc_class enc_Copy( rRegI dst, rRegI src ) %{
     encode_Copy( cbuf, $dst$$reg, $src$$reg );
   %}
 
-  enc_class enc_CopyL_Lo( eRegI dst, eRegL src ) %{
+  enc_class enc_CopyL_Lo( rRegI dst, eRegL src ) %{
     encode_Copy( cbuf, $dst$$reg, $src$$reg );
   %}
 
-  enc_class RegReg (eRegI dst, eRegI src) %{    // RegReg(Many)
+  enc_class RegReg (rRegI dst, rRegI src) %{    // RegReg(Many)
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
@@ -1973,7 +2042,7 @@
     emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg));
   %}
 
-  enc_class RegReg_HiLo( eRegL src, eRegI dst ) %{
+  enc_class RegReg_HiLo( eRegL src, rRegI dst ) %{
     emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($src$$reg));
   %}
 
@@ -2068,7 +2137,7 @@
     cbuf.set_insts_mark();            // Mark start of opcode for reloc info in mem operand
   %}
 
-  enc_class RegMem (eRegI ereg, memory mem) %{    // emit_reg_mem
+  enc_class RegMem (rRegI ereg, memory mem) %{    // emit_reg_mem
     int reg_encoding = $ereg$$reg;
     int base  = $mem$$base;
     int index = $mem$$index;
@@ -2132,7 +2201,7 @@
 
   // Clone of RegMem but accepts an extra parameter to access each
   // half of a double in memory; it never needs relocation info.
-  enc_class Mov_MemD_half_to_Reg (immI opcode, memory mem, immI disp_for_half, eRegI rm_reg) %{
+  enc_class Mov_MemD_half_to_Reg (immI opcode, memory mem, immI disp_for_half, rRegI rm_reg) %{
     emit_opcode(cbuf,$opcode$$constant);
     int reg_encoding = $rm_reg$$reg;
     int base     = $mem$$base;
@@ -2168,7 +2237,7 @@
     encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_is_oop);
   %}
 
-  enc_class RegLea (eRegI dst, eRegI src0, immI src1 ) %{    // emit_reg_lea
+  enc_class RegLea (rRegI dst, rRegI src0, immI src1 ) %{    // emit_reg_lea
     int reg_encoding = $dst$$reg;
     int base         = $src0$$reg;      // 0xFFFFFFFF indicates no base
     int index        = 0x04;            // 0x04 indicates no index
@@ -2178,7 +2247,7 @@
     encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
   %}
 
-  enc_class min_enc (eRegI dst, eRegI src) %{    // MIN
+  enc_class min_enc (rRegI dst, rRegI src) %{    // MIN
     // Compare dst,src
     emit_opcode(cbuf,0x3B);
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
@@ -2190,7 +2259,7 @@
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
   %}
 
-  enc_class max_enc (eRegI dst, eRegI src) %{    // MAX
+  enc_class max_enc (rRegI dst, rRegI src) %{    // MAX
     // Compare dst,src
     emit_opcode(cbuf,0x3B);
     emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
@@ -2221,7 +2290,7 @@
     encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_is_oop);
   %}
 
-  enc_class neg_reg(eRegI dst) %{
+  enc_class neg_reg(rRegI dst) %{
     // NEG $dst
     emit_opcode(cbuf,0xF7);
     emit_rm(cbuf, 0x3, 0x03, $dst$$reg );
@@ -2251,7 +2320,7 @@
     emit_rm(cbuf, 0x3, $p$$reg, tmpReg);
   %}
 
-  enc_class enc_cmpLTP_mem(eRegI p, eRegI q, memory mem, eCXRegI tmp) %{    // cadd_cmpLT
+  enc_class enc_cmpLTP_mem(rRegI p, rRegI q, memory mem, eCXRegI tmp) %{    // cadd_cmpLT
     int tmpReg = $tmp$$reg;
 
     // SUB $p,$q
@@ -2390,12 +2459,12 @@
   %}
 
   // Special case for moving an integer register to a stack slot.
-  enc_class OpcPRegSS( stackSlotI dst, eRegI src ) %{ // RegSS
+  enc_class OpcPRegSS( stackSlotI dst, rRegI src ) %{ // RegSS
     store_to_stackslot( cbuf, $primary, $src$$reg, $dst$$disp );
   %}
 
   // Special case for moving a register to a stack slot.
-  enc_class RegSS( stackSlotI dst, eRegI src ) %{ // RegSS
+  enc_class RegSS( stackSlotI dst, rRegI src ) %{ // RegSS
     // Opcode already emitted
     emit_rm( cbuf, 0x02, $src$$reg, ESP_enc );   // R/M byte
     emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);          // SIB byte
@@ -2640,7 +2709,7 @@
 // equal_result    = 0;
 // nan_result      = -1;
 
-  enc_class CmpF_Result(eRegI dst) %{
+  enc_class CmpF_Result(rRegI dst) %{
     // fnstsw_ax();
     emit_opcode( cbuf, 0xDF);
     emit_opcode( cbuf, 0xE0);
@@ -2685,7 +2754,7 @@
 // done:
   %}
 
-  enc_class convert_int_long( regL dst, eRegI src ) %{
+  enc_class convert_int_long( regL dst, rRegI src ) %{
     // mov $dst.lo,$src
     int dst_encoding = $dst$$reg;
     int src_encoding = $src$$reg;
@@ -2754,7 +2823,7 @@
     emit_rm( cbuf, 0x3, 0x4, $src$$reg);
   %}
 
-  enc_class long_multiply( eADXRegL dst, eRegL src, eRegI tmp ) %{
+  enc_class long_multiply( eADXRegL dst, eRegL src, rRegI tmp ) %{
     // Basic idea: lo(result) = lo(x_lo * y_lo)
     //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
     // MOV    $tmp,$src.lo
@@ -2780,7 +2849,7 @@
     emit_rm( cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $tmp$$reg );
   %}
 
-  enc_class long_multiply_con( eADXRegL dst, immL_127 src, eRegI tmp ) %{
+  enc_class long_multiply_con( eADXRegL dst, immL_127 src, rRegI tmp ) %{
     // Basic idea: lo(result) = lo(src * y_lo)
     //             hi(result) = hi(src * y_lo) + lo(src * y_hi)
     // IMUL   $tmp,EDX,$src
@@ -2836,7 +2905,7 @@
     emit_d8(cbuf, 4*4);
   %}
 
-  enc_class long_cmp_flags0( eRegL src, eRegI tmp ) %{
+  enc_class long_cmp_flags0( eRegL src, rRegI tmp ) %{
     // MOV   $tmp,$src.lo
     emit_opcode(cbuf, 0x8B);
     emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
@@ -2857,7 +2926,7 @@
     emit_rm(cbuf, 0x3, HIGH_FROM_LOW($src1$$reg), HIGH_FROM_LOW($src2$$reg) );
   %}
 
-  enc_class long_cmp_flags2( eRegL src1, eRegL src2, eRegI tmp ) %{
+  enc_class long_cmp_flags2( eRegL src1, eRegL src2, rRegI tmp ) %{
     // CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits
     emit_opcode( cbuf, 0x3B );
     emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
@@ -2869,7 +2938,7 @@
     emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src2$$reg) );
   %}
 
-  enc_class long_cmp_flags3( eRegL src, eRegI tmp ) %{
+  enc_class long_cmp_flags3( eRegL src, rRegI tmp ) %{
     // XOR    $tmp,$tmp
     emit_opcode(cbuf,0x33);  // XOR
     emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg);
@@ -3762,9 +3831,9 @@
     // in SSE2+ mode we want to keep the FPU stack clean so pretend
     // that C functions return float and double results in XMM0.
     if( ideal_reg == Op_RegD && UseSSE>=2 )
-      return OptoRegPair(XMM0b_num,XMM0a_num);
+      return OptoRegPair(XMM0b_num,XMM0_num);
     if( ideal_reg == Op_RegF && UseSSE>=2 )
-      return OptoRegPair(OptoReg::Bad,XMM0a_num);
+      return OptoRegPair(OptoReg::Bad,XMM0_num);
 
     return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
   %}
@@ -3775,9 +3844,9 @@
     static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
     static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
     if( ideal_reg == Op_RegD && UseSSE>=2 )
-      return OptoRegPair(XMM0b_num,XMM0a_num);
+      return OptoRegPair(XMM0b_num,XMM0_num);
     if( ideal_reg == Op_RegF && UseSSE>=1 )
-      return OptoRegPair(OptoReg::Bad,XMM0a_num);
+      return OptoRegPair(OptoReg::Bad,XMM0_num);
     return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
   %}
 
@@ -4147,8 +4216,8 @@
 
 // Register Operands
 // Integer Register
-operand eRegI() %{
-  constraint(ALLOC_IN_RC(e_reg));
+operand rRegI() %{
+  constraint(ALLOC_IN_RC(int_reg));
   match(RegI);
   match(xRegI);
   match(eAXRegI);
@@ -4163,8 +4232,8 @@
 %}
 
 // Subset of Integer Register
-operand xRegI(eRegI reg) %{
-  constraint(ALLOC_IN_RC(x_reg));
+operand xRegI(rRegI reg) %{
+  constraint(ALLOC_IN_RC(int_x_reg));
   match(reg);
   match(eAXRegI);
   match(eBXRegI);
@@ -4179,7 +4248,7 @@
 operand eAXRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(eax_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "EAX" %}
   interface(REG_INTER);
@@ -4189,7 +4258,7 @@
 operand eBXRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(ebx_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "EBX" %}
   interface(REG_INTER);
@@ -4198,7 +4267,7 @@
 operand eCXRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(ecx_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "ECX" %}
   interface(REG_INTER);
@@ -4207,7 +4276,7 @@
 operand eDXRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(edx_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "EDX" %}
   interface(REG_INTER);
@@ -4216,7 +4285,7 @@
 operand eDIRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(edi_reg));
   match(reg);
-  match(eRegI);
+  match(rRegI);
 
   format %{ "EDI" %}
   interface(REG_INTER);
@@ -4263,7 +4332,7 @@
 operand eSIRegI(xRegI reg) %{
    constraint(ALLOC_IN_RC(esi_reg));
    match(reg);
-   match(eRegI);
+   match(rRegI);
 
    format %{ "ESI" %}
    interface(REG_INTER);
@@ -4284,7 +4353,7 @@
 %}
 
 operand eRegP() %{
-  constraint(ALLOC_IN_RC(e_reg));
+  constraint(ALLOC_IN_RC(int_reg));
   match(RegP);
   match(eAXRegP);
   match(eBXRegP);
@@ -4297,7 +4366,7 @@
 
 // On windows95, EBP is not safe to use for implicit null tests.
 operand eRegP_no_EBP() %{
-  constraint(ALLOC_IN_RC(e_reg_no_rbp));
+  constraint(ALLOC_IN_RC(int_reg_no_rbp));
   match(RegP);
   match(eAXRegP);
   match(eBXRegP);
@@ -4477,7 +4546,7 @@
 // Float register operands
 operand regDPR() %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(dbl_reg));
+  constraint(ALLOC_IN_RC(fp_dbl_reg));
   match(RegD);
   match(regDPR1);
   match(regDPR2);
@@ -4487,7 +4556,7 @@
 
 operand regDPR1(regDPR reg) %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(dbl_reg0));
+  constraint(ALLOC_IN_RC(fp_dbl_reg0));
   match(reg);
   format %{ "FPR1" %}
   interface(REG_INTER);
@@ -4495,7 +4564,7 @@
 
 operand regDPR2(regDPR reg) %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(dbl_reg1));
+  constraint(ALLOC_IN_RC(fp_dbl_reg1));
   match(reg);
   format %{ "FPR2" %}
   interface(REG_INTER);
@@ -4503,45 +4572,16 @@
 
 operand regnotDPR1(regDPR reg) %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(dbl_notreg0));
+  constraint(ALLOC_IN_RC(fp_dbl_notreg0));
   match(reg);
   format %{ %}
   interface(REG_INTER);
 %}
 
-// XMM Double register operands
-operand regD() %{
-  predicate( UseSSE>=2 );
-  constraint(ALLOC_IN_RC(xdb_reg));
-  match(RegD);
-  match(regD6);
-  match(regD7);
-  format %{ %}
-  interface(REG_INTER);
-%}
-
-// XMM6 double register operands
-operand regD6(regD reg) %{
-  predicate( UseSSE>=2 );
-  constraint(ALLOC_IN_RC(xdb_reg6));
-  match(reg);
-  format %{ "XMM6" %}
-  interface(REG_INTER);
-%}
-
-// XMM7 double register operands
-operand regD7(regD reg) %{
-  predicate( UseSSE>=2 );
-  constraint(ALLOC_IN_RC(xdb_reg7));
-  match(reg);
-  format %{ "XMM7" %}
-  interface(REG_INTER);
-%}
-
 // Float register operands
 operand regFPR() %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(flt_reg));
+  constraint(ALLOC_IN_RC(fp_flt_reg));
   match(RegF);
   match(regFPR1);
   format %{ %}
@@ -4551,22 +4591,31 @@
 // Float register operands
 operand regFPR1(regFPR reg) %{
   predicate( UseSSE < 2 );
-  constraint(ALLOC_IN_RC(flt_reg0));
+  constraint(ALLOC_IN_RC(fp_flt_reg0));
   match(reg);
   format %{ "FPR1" %}
   interface(REG_INTER);
 %}
 
-// XMM register operands
+// XMM Float register operand: RegF values allocated from float_reg; predicated on UseSSE >= 1
 operand regF() %{
   predicate( UseSSE>=1 );
-  constraint(ALLOC_IN_RC(xmm_reg));
+  constraint(ALLOC_IN_RC(float_reg));
   match(RegF);
   format %{ %}
   interface(REG_INTER);
 %}
 
+// XMM Double register operand: RegD values allocated from double_reg; predicated on UseSSE >= 2
+operand regD() %{
+  predicate( UseSSE>=2 );
+  constraint(ALLOC_IN_RC(double_reg));
+  match(RegD);
+  format %{ %}
+  interface(REG_INTER);
+%}
 
+
 //----------Memory Operands----------------------------------------------------
 // Direct Memory Operand
 operand direct(immP addr) %{
@@ -4583,7 +4632,7 @@
 
 // Indirect Memory Operand
 operand indirect(eRegP reg) %{
-  constraint(ALLOC_IN_RC(e_reg));
+  constraint(ALLOC_IN_RC(int_reg));
   match(reg);
 
   format %{ "[$reg]" %}
@@ -4622,7 +4671,7 @@
 %}
 
 // Indirect Memory Plus Long Offset Operand
-operand indOffset32X(eRegI reg, immP off) %{
+operand indOffset32X(rRegI reg, immP off) %{
   match(AddP off reg);
 
   format %{ "[$reg + $off]" %}
@@ -4635,7 +4684,7 @@
 %}
 
 // Indirect Memory Plus Index Register Plus Offset Operand
-operand indIndexOffset(eRegP reg, eRegI ireg, immI off) %{
+operand indIndexOffset(eRegP reg, rRegI ireg, immI off) %{
   match(AddP (AddP reg ireg) off);
 
   op_cost(10);
@@ -4649,7 +4698,7 @@
 %}
 
 // Indirect Memory Plus Index Register Plus Offset Operand
-operand indIndex(eRegP reg, eRegI ireg) %{
+operand indIndex(eRegP reg, rRegI ireg) %{
   match(AddP reg ireg);
 
   op_cost(10);
@@ -4667,7 +4716,7 @@
 // // -------------------------------------------------------------------------
 // // Scaled Memory Operands
 // // Indirect Memory Times Scale Plus Offset Operand
-// operand indScaleOffset(immP off, eRegI ireg, immI2 scale) %{
+// operand indScaleOffset(immP off, rRegI ireg, immI2 scale) %{
 //   match(AddP off (LShiftI ireg scale));
 //
 //   op_cost(10);
@@ -4681,7 +4730,7 @@
 // %}
 
 // Indirect Memory Times Scale Plus Index Register
-operand indIndexScale(eRegP reg, eRegI ireg, immI2 scale) %{
+operand indIndexScale(eRegP reg, rRegI ireg, immI2 scale) %{
   match(AddP reg (LShiftI ireg scale));
 
   op_cost(10);
@@ -4695,7 +4744,7 @@
 %}
 
 // Indirect Memory Times Scale Plus Index Register Plus Offset Operand
-operand indIndexScaleOffset(eRegP reg, immI off, eRegI ireg, immI2 scale) %{
+operand indIndexScaleOffset(eRegP reg, immI off, rRegI ireg, immI2 scale) %{
   match(AddP (AddP reg (LShiftI ireg scale)) off);
 
   op_cost(10);
@@ -4823,7 +4872,7 @@
 // Indirect Memory Operand
 operand indirect_win95_safe(eRegP_no_EBP reg)
 %{
-  constraint(ALLOC_IN_RC(e_reg));
+  constraint(ALLOC_IN_RC(int_reg));
   match(reg);
 
   op_cost(100);
@@ -4867,7 +4916,7 @@
 %}
 
 // Indirect Memory Plus Index Register Plus Offset Operand
-operand indIndexOffset_win95_safe(eRegP_no_EBP reg, eRegI ireg, immI off)
+operand indIndexOffset_win95_safe(eRegP_no_EBP reg, rRegI ireg, immI off)
 %{
   match(AddP (AddP reg ireg) off);
 
@@ -4882,7 +4931,7 @@
 %}
 
 // Indirect Memory Times Scale Plus Index Register
-operand indIndexScale_win95_safe(eRegP_no_EBP reg, eRegI ireg, immI2 scale)
+operand indIndexScale_win95_safe(eRegP_no_EBP reg, rRegI ireg, immI2 scale)
 %{
   match(AddP reg (LShiftI ireg scale));
 
@@ -4897,7 +4946,7 @@
 %}
 
 // Indirect Memory Times Scale Plus Index Register Plus Offset Operand
-operand indIndexScaleOffset_win95_safe(eRegP_no_EBP reg, immI off, eRegI ireg, immI2 scale)
+operand indIndexScaleOffset_win95_safe(eRegP_no_EBP reg, immI off, rRegI ireg, immI2 scale)
 %{
   match(AddP (AddP reg (LShiftI ireg scale)) off);
 
@@ -5086,7 +5135,7 @@
 //   Or: _mem if it requires the big decoder and a memory unit.
 
 // Integer ALU reg operation
-pipe_class ialu_reg(eRegI dst) %{
+pipe_class ialu_reg(rRegI dst) %{
     single_instruction;
     dst    : S4(write);
     dst    : S3(read);
@@ -5104,7 +5153,7 @@
 %}
 
 // Integer ALU reg operation using big decoder
-pipe_class ialu_reg_fat(eRegI dst) %{
+pipe_class ialu_reg_fat(rRegI dst) %{
     single_instruction;
     dst    : S4(write);
     dst    : S3(read);
@@ -5122,7 +5171,7 @@
 %}
 
 // Integer ALU reg-reg operation
-pipe_class ialu_reg_reg(eRegI dst, eRegI src) %{
+pipe_class ialu_reg_reg(rRegI dst, rRegI src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5140,7 +5189,7 @@
 %}
 
 // Integer ALU reg-reg operation
-pipe_class ialu_reg_reg_fat(eRegI dst, memory src) %{
+pipe_class ialu_reg_reg_fat(rRegI dst, memory src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5158,7 +5207,7 @@
 %}
 
 // Integer ALU reg-mem operation
-pipe_class ialu_reg_mem(eRegI dst, memory mem) %{
+pipe_class ialu_reg_mem(rRegI dst, memory mem) %{
     single_instruction;
     dst    : S5(write);
     mem    : S3(read);
@@ -5187,7 +5236,7 @@
 %}
 
 // Integer Store to Memory
-pipe_class ialu_mem_reg(memory mem, eRegI src) %{
+pipe_class ialu_mem_reg(memory mem, rRegI src) %{
     single_instruction;
     mem    : S3(read);
     src    : S5(read);
@@ -5216,7 +5265,7 @@
 %}
 
 // Integer ALU0 reg-reg operation
-pipe_class ialu_reg_reg_alu0(eRegI dst, eRegI src) %{
+pipe_class ialu_reg_reg_alu0(rRegI dst, rRegI src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5225,7 +5274,7 @@
 %}
 
 // Integer ALU0 reg-mem operation
-pipe_class ialu_reg_mem_alu0(eRegI dst, memory mem) %{
+pipe_class ialu_reg_mem_alu0(rRegI dst, memory mem) %{
     single_instruction;
     dst    : S5(write);
     mem    : S3(read);
@@ -5235,7 +5284,7 @@
 %}
 
 // Integer ALU reg-reg operation
-pipe_class ialu_cr_reg_reg(eFlagsReg cr, eRegI src1, eRegI src2) %{
+pipe_class ialu_cr_reg_reg(eFlagsReg cr, rRegI src1, rRegI src2) %{
     single_instruction;
     cr     : S4(write);
     src1   : S3(read);
@@ -5245,7 +5294,7 @@
 %}
 
 // Integer ALU reg-imm operation
-pipe_class ialu_cr_reg_imm(eFlagsReg cr, eRegI src1) %{
+pipe_class ialu_cr_reg_imm(eFlagsReg cr, rRegI src1) %{
     single_instruction;
     cr     : S4(write);
     src1   : S3(read);
@@ -5254,7 +5303,7 @@
 %}
 
 // Integer ALU reg-mem operation
-pipe_class ialu_cr_reg_mem(eFlagsReg cr, eRegI src1, memory src2) %{
+pipe_class ialu_cr_reg_mem(eFlagsReg cr, rRegI src1, memory src2) %{
     single_instruction;
     cr     : S4(write);
     src1   : S3(read);
@@ -5265,7 +5314,7 @@
 %}
 
 // Conditional move reg-reg
-pipe_class pipe_cmplt( eRegI p, eRegI q, eRegI y ) %{
+pipe_class pipe_cmplt( rRegI p, rRegI q, rRegI y ) %{
     instruction_count(4);
     y      : S4(read);
     q      : S3(read);
@@ -5274,7 +5323,7 @@
 %}
 
 // Conditional move reg-reg
-pipe_class pipe_cmov_reg( eRegI dst, eRegI src, eFlagsReg cr ) %{
+pipe_class pipe_cmov_reg( rRegI dst, rRegI src, eFlagsReg cr ) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5283,7 +5332,7 @@
 %}
 
 // Conditional move reg-mem
-pipe_class pipe_cmov_mem( eFlagsReg cr, eRegI dst, memory src) %{
+pipe_class pipe_cmov_mem( eFlagsReg cr, rRegI dst, memory src) %{
     single_instruction;
     dst    : S4(write);
     src    : S3(read);
@@ -5534,7 +5583,7 @@
 //               in the encode section of the architecture description.
 
 //----------BSWAP-Instruction--------------------------------------------------
-instruct bytes_reverse_int(eRegI dst) %{
+instruct bytes_reverse_int(rRegI dst) %{
   match(Set dst (ReverseBytesI dst));
 
   format %{ "BSWAP  $dst" %}
@@ -5555,7 +5604,7 @@
   ins_pipe( ialu_reg_reg);
 %}
 
-instruct bytes_reverse_unsigned_short(eRegI dst) %{
+instruct bytes_reverse_unsigned_short(rRegI dst) %{
   match(Set dst (ReverseBytesUS dst));
 
   format %{ "BSWAP  $dst\n\t" 
@@ -5567,7 +5616,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct bytes_reverse_short(eRegI dst) %{
+instruct bytes_reverse_short(rRegI dst) %{
   match(Set dst (ReverseBytesS dst));
 
   format %{ "BSWAP  $dst\n\t" 
@@ -5582,7 +5631,7 @@
 
 //---------- Zeros Count Instructions ------------------------------------------
 
-instruct countLeadingZerosI(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct countLeadingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
   predicate(UseCountLeadingZerosInstruction);
   match(Set dst (CountLeadingZerosI src));
   effect(KILL cr);
@@ -5594,7 +5643,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countLeadingZerosI_bsr(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct countLeadingZerosI_bsr(rRegI dst, rRegI src, eFlagsReg cr) %{
   predicate(!UseCountLeadingZerosInstruction);
   match(Set dst (CountLeadingZerosI src));
   effect(KILL cr);
@@ -5619,7 +5668,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countLeadingZerosL(eRegI dst, eRegL src, eFlagsReg cr) %{
+instruct countLeadingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
   predicate(UseCountLeadingZerosInstruction);
   match(Set dst (CountLeadingZerosL src));
   effect(TEMP dst, KILL cr);
@@ -5642,7 +5691,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countLeadingZerosL_bsr(eRegI dst, eRegL src, eFlagsReg cr) %{
+instruct countLeadingZerosL_bsr(rRegI dst, eRegL src, eFlagsReg cr) %{
   predicate(!UseCountLeadingZerosInstruction);
   match(Set dst (CountLeadingZerosL src));
   effect(TEMP dst, KILL cr);
@@ -5678,7 +5727,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countTrailingZerosI(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (CountTrailingZerosI src));
   effect(KILL cr);
 
@@ -5697,7 +5746,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct countTrailingZerosL(eRegI dst, eRegL src, eFlagsReg cr) %{
+instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
   match(Set dst (CountTrailingZerosL src));
   effect(TEMP dst, KILL cr);
 
@@ -5729,7 +5778,7 @@
 
 //---------- Population Count Instructions -------------------------------------
 
-instruct popCountI(eRegI dst, eRegI src) %{
+instruct popCountI(rRegI dst, rRegI src) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountI src));
 
@@ -5740,7 +5789,7 @@
   ins_pipe(ialu_reg);
 %}
 
-instruct popCountI_mem(eRegI dst, memory mem) %{
+instruct popCountI_mem(rRegI dst, memory mem) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountI (LoadI mem)));
 
@@ -5752,7 +5801,7 @@
 %}
 
 // Note: Long.bitCount(long) returns an int.
-instruct popCountL(eRegI dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
+instruct popCountL(rRegI dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountL src));
   effect(KILL cr, TEMP tmp, TEMP dst);
@@ -5769,7 +5818,7 @@
 %}
 
 // Note: Long.bitCount(long) returns an int.
-instruct popCountL_mem(eRegI dst, memory mem, eRegI tmp, eFlagsReg cr) %{
+instruct popCountL_mem(rRegI dst, memory mem, rRegI tmp, eFlagsReg cr) %{
   predicate(UsePopCountInstruction);
   match(Set dst (PopCountL (LoadL mem)));
   effect(KILL cr, TEMP tmp, TEMP dst);
@@ -5873,7 +5922,7 @@
 %}
 
 // Load Short (16bit signed)
-instruct loadS(eRegI dst, memory mem) %{
+instruct loadS(rRegI dst, memory mem) %{
   match(Set dst (LoadS mem));
 
   ins_cost(125);
@@ -5887,7 +5936,7 @@
 %}
 
 // Load Short (16 bit signed) to Byte (8 bit signed)
-instruct loadS2B(eRegI dst, memory mem, immI_24 twentyfour) %{
+instruct loadS2B(rRegI dst, memory mem, immI_24 twentyfour) %{
   match(Set dst (RShiftI (LShiftI (LoadS mem) twentyfour) twentyfour));
 
   ins_cost(125);
@@ -5918,7 +5967,7 @@
 %}
 
 // Load Unsigned Short/Char (16bit unsigned)
-instruct loadUS(eRegI dst, memory mem) %{
+instruct loadUS(rRegI dst, memory mem) %{
   match(Set dst (LoadUS mem));
 
   ins_cost(125);
@@ -5932,7 +5981,7 @@
 %}
 
 // Load Unsigned Short/Char (16 bit UNsigned) to Byte (8 bit signed)
-instruct loadUS2B(eRegI dst, memory mem, immI_24 twentyfour) %{
+instruct loadUS2B(rRegI dst, memory mem, immI_24 twentyfour) %{
   match(Set dst (RShiftI (LShiftI (LoadUS mem) twentyfour) twentyfour));
 
   ins_cost(125);
@@ -5993,7 +6042,7 @@
 %}
 
 // Load Integer
-instruct loadI(eRegI dst, memory mem) %{
+instruct loadI(rRegI dst, memory mem) %{
   match(Set dst (LoadI mem));
 
   ins_cost(125);
@@ -6007,7 +6056,7 @@
 %}
 
 // Load Integer (32 bit signed) to Byte (8 bit signed)
-instruct loadI2B(eRegI dst, memory mem, immI_24 twentyfour) %{
+instruct loadI2B(rRegI dst, memory mem, immI_24 twentyfour) %{
   match(Set dst (RShiftI (LShiftI (LoadI mem) twentyfour) twentyfour));
 
   ins_cost(125);
@@ -6019,7 +6068,7 @@
 %}
 
 // Load Integer (32 bit signed) to Unsigned Byte (8 bit UNsigned)
-instruct loadI2UB(eRegI dst, memory mem, immI_255 mask) %{
+instruct loadI2UB(rRegI dst, memory mem, immI_255 mask) %{
   match(Set dst (AndI (LoadI mem) mask));
 
   ins_cost(125);
@@ -6031,7 +6080,7 @@
 %}
 
 // Load Integer (32 bit signed) to Short (16 bit signed)
-instruct loadI2S(eRegI dst, memory mem, immI_16 sixteen) %{
+instruct loadI2S(rRegI dst, memory mem, immI_16 sixteen) %{
   match(Set dst (RShiftI (LShiftI (LoadI mem) sixteen) sixteen));
 
   ins_cost(125);
@@ -6043,7 +6092,7 @@
 %}
 
 // Load Integer (32 bit signed) to Unsigned Short/Char (16 bit UNsigned)
-instruct loadI2US(eRegI dst, memory mem, immI_65535 mask) %{
+instruct loadI2US(rRegI dst, memory mem, immI_65535 mask) %{
   match(Set dst (AndI (LoadI mem) mask));
 
   ins_cost(125);
@@ -6204,7 +6253,7 @@
 %}
 
 // Load Range
-instruct loadRange(eRegI dst, memory mem) %{
+instruct loadRange(rRegI dst, memory mem) %{
   match(Set dst (LoadRange mem));
 
   ins_cost(125);
@@ -6301,66 +6350,6 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-// Load Aligned Packed Byte to XMM register
-instruct loadA8B(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load8B mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed8B" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Short to XMM register
-instruct loadA4S(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load4S mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed4S" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Char to XMM register
-instruct loadA4C(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load4C mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed4C" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Integer to XMM register
-instruct load2IU(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load2I mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed2I" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Single to XMM
-instruct loadA2F(regD dst, memory mem) %{
-  predicate(UseSSE>=1);
-  match(Set dst (Load2F mem));
-  ins_cost(145);
-  format %{ "MOVQ  $dst,$mem\t! packed2F" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Load Effective Address
 instruct leaP8(eRegP dst, indOffset8 mem) %{
   match(Set dst mem);
@@ -6413,7 +6402,7 @@
 %}
 
 // Load Constant
-instruct loadConI(eRegI dst, immI src) %{
+instruct loadConI(rRegI dst, immI src) %{
   match(Set dst src);
 
   format %{ "MOV    $dst,$src" %}
@@ -6422,7 +6411,7 @@
 %}
 
 // Load Constant zero
-instruct loadConI0(eRegI dst, immI0 src, eFlagsReg cr) %{
+instruct loadConI0(rRegI dst, immI0 src, eFlagsReg cr) %{
   match(Set dst src);
   effect(KILL cr);
 
@@ -6590,7 +6579,7 @@
 %}
 
 // Load Stack Slot
-instruct loadSSI(eRegI dst, stackSlotI src) %{
+instruct loadSSI(rRegI dst, stackSlotI src) %{
   match(Set dst src);
   ins_cost(125);
 
@@ -6817,7 +6806,7 @@
 %}
 
 // Store Char/Short
-instruct storeC(memory mem, eRegI src) %{
+instruct storeC(memory mem, rRegI src) %{
   match(Set mem (StoreC mem src));
 
   ins_cost(125);
@@ -6828,7 +6817,7 @@
 %}
 
 // Store Integer
-instruct storeI(memory mem, eRegI src) %{
+instruct storeI(memory mem, rRegI src) %{
   match(Set mem (StoreI mem src));
 
   ins_cost(125);
@@ -6972,42 +6961,6 @@
   ins_pipe( ialu_mem_imm );
 %}
 
-// Store Aligned Packed Byte XMM register to memory
-instruct storeA8B(memory mem, regD src) %{
-  predicate(UseSSE>=1);
-  match(Set mem (Store8B mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed8B" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Store Aligned Packed Char/Short XMM register to memory
-instruct storeA4C(memory mem, regD src) %{
-  predicate(UseSSE>=1);
-  match(Set mem (Store4C mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed4C" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Store Aligned Packed Integer XMM register to memory
-instruct storeA2I(memory mem, regD src) %{
-  predicate(UseSSE>=1);
-  match(Set mem (Store2I mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed2I" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Store CMS card-mark Immediate
 instruct storeImmCM(memory mem, immI8 src) %{
   match(Set mem (StoreCM mem src));
@@ -7069,18 +7022,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Store Aligned Packed Single Float XMM register to memory
-instruct storeA2F(memory mem, regD src) %{
-  predicate(UseSSE>=1);
-  match(Set mem (Store2F mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed2F" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Store Float
 instruct storeFPR( memory mem, regFPR1 src) %{
   predicate(UseSSE==0);
@@ -7142,7 +7083,7 @@
 %}
 
 // Store Integer to stack slot
-instruct storeSSI(stackSlotI dst, eRegI src) %{
+instruct storeSSI(stackSlotI dst, rRegI src) %{
   match(Set dst src);
 
   ins_cost(100);
@@ -7267,7 +7208,7 @@
   ins_pipe(empty);
 %}
 
-instruct castP2X(eRegI dst, eRegP src ) %{
+instruct castP2X(rRegI dst, eRegP src ) %{
   match(Set dst (CastP2X src));
   ins_cost(50);
   format %{ "MOV    $dst, $src\t# CastP2X" %}
@@ -7277,7 +7218,7 @@
 
 //----------Conditional Move---------------------------------------------------
 // Conditional move
-instruct jmovI_reg(cmpOp cop, eFlagsReg cr, eRegI dst, eRegI src) %{
+instruct jmovI_reg(cmpOp cop, eFlagsReg cr, rRegI dst, rRegI src) %{
   predicate(!VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7294,7 +7235,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct jmovI_regU(cmpOpU cop, eFlagsRegU cr, eRegI dst, eRegI src) %{
+instruct jmovI_regU(cmpOpU cop, eFlagsRegU cr, rRegI dst, rRegI src) %{
   predicate(!VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7311,7 +7252,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovI_reg(eRegI dst, eRegI src, eFlagsReg cr, cmpOp cop ) %{
+instruct cmovI_reg(rRegI dst, rRegI src, eFlagsReg cr, cmpOp cop ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7321,7 +7262,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovI_regU( cmpOpU cop, eFlagsRegU cr, eRegI dst, eRegI src ) %{
+instruct cmovI_regU( cmpOpU cop, eFlagsRegU cr, rRegI dst, rRegI src ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7331,7 +7272,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovI_regUCF( cmpOpUCF cop, eFlagsRegUCF cr, eRegI dst, eRegI src ) %{
+instruct cmovI_regUCF( cmpOpUCF cop, eFlagsRegUCF cr, rRegI dst, rRegI src ) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
   ins_cost(200);
@@ -7341,7 +7282,7 @@
 %}
 
 // Conditional move
-instruct cmovI_mem(cmpOp cop, eFlagsReg cr, eRegI dst, memory src) %{
+instruct cmovI_mem(cmpOp cop, eFlagsReg cr, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -7352,7 +7293,7 @@
 %}
 
 // Conditional move
-instruct cmovI_memU(cmpOpU cop, eFlagsRegU cr, eRegI dst, memory src) %{
+instruct cmovI_memU(cmpOpU cop, eFlagsRegU cr, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -7362,7 +7303,7 @@
   ins_pipe( pipe_cmov_mem );
 %}
 
-instruct cmovI_memUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegI dst, memory src) %{
+instruct cmovI_memUCF(cmpOpUCF cop, eFlagsRegUCF cr, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() );
   match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -7616,7 +7557,7 @@
 //----------Arithmetic Instructions--------------------------------------------
 //----------Addition Instructions----------------------------------------------
 // Integer Addition Instructions
-instruct addI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct addI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (AddI dst src));
   effect(KILL cr);
 
@@ -7627,7 +7568,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct addI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct addI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (AddI dst src));
   effect(KILL cr);
 
@@ -7637,7 +7578,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
+instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{
   predicate(UseIncDec);
   match(Set dst (AddI dst src));
   effect(KILL cr);
@@ -7649,7 +7590,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct leaI_eReg_immI(eRegI dst, eRegI src0, immI src1) %{
+instruct leaI_eReg_immI(rRegI dst, rRegI src0, immI src1) %{
   match(Set dst (AddI src0 src1));
   ins_cost(110);
 
@@ -7669,7 +7610,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct decI_eReg(eRegI dst, immI_M1 src, eFlagsReg cr) %{
+instruct decI_eReg(rRegI dst, immI_M1 src, eFlagsReg cr) %{
   predicate(UseIncDec);
   match(Set dst (AddI dst src));
   effect(KILL cr);
@@ -7681,7 +7622,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct addP_eReg(eRegP dst, eRegI src, eFlagsReg cr) %{
+instruct addP_eReg(eRegP dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (AddP dst src));
   effect(KILL cr);
 
@@ -7703,7 +7644,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct addI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct addI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (AddI dst (LoadI src)));
   effect(KILL cr);
 
@@ -7714,7 +7655,7 @@
   ins_pipe( ialu_reg_mem );
 %}
 
-instruct addI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct addI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (AddI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -7776,7 +7717,7 @@
   ins_pipe( empty );
 %}
 
-instruct castII( eRegI dst ) %{
+instruct castII( rRegI dst ) %{
   match(Set dst (CastII dst));
   format %{ "#castII of $dst" %}
   ins_encode( /*empty encoding*/ );
@@ -7854,7 +7795,7 @@
 
 // Conditional-store of an int value.
 // ZF flag is set on success, reset otherwise.  Implemented with a CMPXCHG on Intel.
-instruct storeIConditional( memory mem, eAXRegI oldval, eRegI newval, eFlagsReg cr ) %{
+instruct storeIConditional( memory mem, eAXRegI oldval, rRegI newval, eFlagsReg cr ) %{
   match(Set cr (StoreIConditional mem (Binary oldval newval)));
   effect(KILL oldval);
   format %{ "CMPXCHG $mem,$newval\t# If EAX==$mem Then store $newval into $mem" %}
@@ -7887,7 +7828,7 @@
 
 // No flag versions for CompareAndSwap{P,I,L} because matcher can't match them
 
-instruct compareAndSwapL( eRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
+instruct compareAndSwapL( rRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
   match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
@@ -7900,7 +7841,7 @@
   ins_pipe( pipe_cmpxchg );
 %}
 
-instruct compareAndSwapP( eRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
+instruct compareAndSwapP( rRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
   match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
@@ -7912,7 +7853,7 @@
   ins_pipe( pipe_cmpxchg );
 %}
 
-instruct compareAndSwapI( eRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
+instruct compareAndSwapI( rRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
   match(Set res (CompareAndSwapI mem_ptr (Binary oldval newval)));
   effect(KILL cr, KILL oldval);
   format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
@@ -7926,7 +7867,7 @@
 
 //----------Subtraction Instructions-------------------------------------------
 // Integer Subtraction Instructions
-instruct subI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (SubI dst src));
   effect(KILL cr);
 
@@ -7937,7 +7878,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct subI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct subI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (SubI dst src));
   effect(KILL cr);
 
@@ -7948,7 +7889,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct subI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct subI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (SubI dst (LoadI src)));
   effect(KILL cr);
 
@@ -7959,7 +7900,7 @@
   ins_pipe( ialu_reg_mem );
 %}
 
-instruct subI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct subI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (SubI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -7971,7 +7912,7 @@
 %}
 
 // Subtract from a pointer
-instruct subP_eReg(eRegP dst, eRegI src, immI0 zero, eFlagsReg cr) %{
+instruct subP_eReg(eRegP dst, rRegI src, immI0 zero, eFlagsReg cr) %{
   match(Set dst (AddP dst (SubI zero src)));
   effect(KILL cr);
 
@@ -7982,7 +7923,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct negI_eReg(eRegI dst, immI0 zero, eFlagsReg cr) %{
+instruct negI_eReg(rRegI dst, immI0 zero, eFlagsReg cr) %{
   match(Set dst (SubI zero dst));
   effect(KILL cr);
 
@@ -7997,7 +7938,7 @@
 //----------Multiplication/Division Instructions-------------------------------
 // Integer Multiplication Instructions
 // Multiply Register
-instruct mulI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct mulI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (MulI dst src));
   effect(KILL cr);
 
@@ -8010,7 +7951,7 @@
 %}
 
 // Multiply 32-bit Immediate
-instruct mulI_eReg_imm(eRegI dst, eRegI src, immI imm, eFlagsReg cr) %{
+instruct mulI_eReg_imm(rRegI dst, rRegI src, immI imm, eFlagsReg cr) %{
   match(Set dst (MulI src imm));
   effect(KILL cr);
 
@@ -8066,7 +8007,7 @@
 %}
 
 // Multiply Memory 32-bit Immediate
-instruct mulI_mem_imm(eRegI dst, memory src, immI imm, eFlagsReg cr) %{
+instruct mulI_mem_imm(rRegI dst, memory src, immI imm, eFlagsReg cr) %{
   match(Set dst (MulI (LoadI src) imm));
   effect(KILL cr);
 
@@ -8078,7 +8019,7 @@
 %}
 
 // Multiply Memory
-instruct mulI(eRegI dst, memory src, eFlagsReg cr) %{
+instruct mulI(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (MulI dst (LoadI src)));
   effect(KILL cr);
 
@@ -8115,7 +8056,7 @@
 %}
 
 // Multiply Register Long
-instruct mulL_eReg(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
+instruct mulL_eReg(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
   match(Set dst (MulL dst src));
   effect(KILL cr, TEMP tmp);
   ins_cost(4*100+3*400);
@@ -8133,7 +8074,7 @@
 %}
 
 // Multiply Register Long where the left operand's high 32 bits are zero
-instruct mulL_eReg_lhi0(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
+instruct mulL_eReg_lhi0(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
   predicate(is_operand_hi32_zero(n->in(1)));
   match(Set dst (MulL dst src));
   effect(KILL cr, TEMP tmp);
@@ -8154,7 +8095,7 @@
 %}
 
 // Multiply Register Long where the right operand's high 32 bits are zero
-instruct mulL_eReg_rhi0(eADXRegL dst, eRegL src, eRegI tmp, eFlagsReg cr) %{
+instruct mulL_eReg_rhi0(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
   predicate(is_operand_hi32_zero(n->in(2)));
   match(Set dst (MulL dst src));
   effect(KILL cr, TEMP tmp);
@@ -8190,7 +8131,7 @@
 %}
 
 // Multiply Register Long by small constant
-instruct mulL_eReg_con(eADXRegL dst, immL_127 src, eRegI tmp, eFlagsReg cr) %{
+instruct mulL_eReg_con(eADXRegL dst, immL_127 src, rRegI tmp, eFlagsReg cr) %{
   match(Set dst (MulL dst src));
   effect(KILL cr, TEMP tmp);
   ins_cost(2*100+2*400);
@@ -8288,7 +8229,7 @@
 %}
 
 // Divide Register Long (no special case since divisor != -1)
-instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, rRegI tmp, rRegI tmp2, eFlagsReg cr ) %{
   match(Set dst (DivL dst imm));
   effect( TEMP tmp, TEMP tmp2, KILL cr );
   ins_cost(1000);
@@ -8359,7 +8300,7 @@
 %}
 
 // Remainder Register Long (remainder fit into 32 bits)
-instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, rRegI tmp, rRegI tmp2, eFlagsReg cr ) %{
   match(Set dst (ModL dst imm));
   effect( TEMP tmp, TEMP tmp2, KILL cr );
   ins_cost(1000);
@@ -8427,7 +8368,7 @@
 
 // Integer Shift Instructions
 // Shift Left by one
-instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct shlI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   match(Set dst (LShiftI dst shift));
   effect(KILL cr);
 
@@ -8439,7 +8380,7 @@
 %}
 
 // Shift Left by 8-bit immediate
-instruct salI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct salI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
   match(Set dst (LShiftI dst shift));
   effect(KILL cr);
 
@@ -8451,7 +8392,7 @@
 %}
 
 // Shift Left by variable
-instruct salI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
+instruct salI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
   match(Set dst (LShiftI dst shift));
   effect(KILL cr);
 
@@ -8463,7 +8404,7 @@
 %}
 
 // Arithmetic shift right by one
-instruct sarI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct sarI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   match(Set dst (RShiftI dst shift));
   effect(KILL cr);
 
@@ -8485,7 +8426,7 @@
 %}
 
 // Arithmetic Shift Right by 8-bit immediate
-instruct sarI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct sarI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
   match(Set dst (RShiftI dst shift));
   effect(KILL cr);
 
@@ -8508,7 +8449,7 @@
 %}
 
 // Arithmetic Shift Right by variable
-instruct sarI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
+instruct sarI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
   match(Set dst (RShiftI dst shift));
   effect(KILL cr);
 
@@ -8520,7 +8461,7 @@
 %}
 
 // Logical shift right by one
-instruct shrI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct shrI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   match(Set dst (URShiftI dst shift));
   effect(KILL cr);
 
@@ -8532,7 +8473,7 @@
 %}
 
 // Logical Shift Right by 8-bit immediate
-instruct shrI_eReg_imm(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct shrI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
   match(Set dst (URShiftI dst shift));
   effect(KILL cr);
 
@@ -8546,7 +8487,7 @@
 
 // Logical Shift Right by 24, followed by Arithmetic Shift Left by 24.
 // This idiom is used by the compiler for the i2b bytecode.
-instruct i2b(eRegI dst, xRegI src, immI_24 twentyfour) %{
+instruct i2b(rRegI dst, xRegI src, immI_24 twentyfour) %{
   match(Set dst (RShiftI (LShiftI src twentyfour) twentyfour));
 
   size(3);
@@ -8559,7 +8500,7 @@
 
 // Logical Shift Right by 16, followed by Arithmetic Shift Left by 16.
 // This idiom is used by the compiler the i2s bytecode.
-instruct i2s(eRegI dst, xRegI src, immI_16 sixteen) %{
+instruct i2s(rRegI dst, xRegI src, immI_16 sixteen) %{
   match(Set dst (RShiftI (LShiftI src sixteen) sixteen));
 
   size(3);
@@ -8572,7 +8513,7 @@
 
 
 // Logical Shift Right by variable
-instruct shrI_eReg_CL(eRegI dst, eCXRegI shift, eFlagsReg cr) %{
+instruct shrI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
   match(Set dst (URShiftI dst shift));
   effect(KILL cr);
 
@@ -8588,7 +8529,7 @@
 //----------Integer Logical Instructions---------------------------------------
 // And Instructions
 // And Register with Register
-instruct andI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct andI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (AndI dst src));
   effect(KILL cr);
 
@@ -8600,7 +8541,7 @@
 %}
 
 // And Register with Immediate
-instruct andI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct andI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (AndI dst src));
   effect(KILL cr);
 
@@ -8612,7 +8553,7 @@
 %}
 
 // And Register with Memory
-instruct andI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct andI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (AndI dst (LoadI src)));
   effect(KILL cr);
 
@@ -8624,7 +8565,7 @@
 %}
 
 // And Memory with Register
-instruct andI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct andI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (AndI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -8650,7 +8591,7 @@
 
 // Or Instructions
 // Or Register with Register
-instruct orI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (OrI dst src));
   effect(KILL cr);
 
@@ -8661,7 +8602,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct orI_eReg_castP2X(eRegI dst, eRegP src, eFlagsReg cr) %{
+instruct orI_eReg_castP2X(rRegI dst, eRegP src, eFlagsReg cr) %{
   match(Set dst (OrI dst (CastP2X src)));
   effect(KILL cr);
 
@@ -8674,7 +8615,7 @@
 
 
 // Or Register with Immediate
-instruct orI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct orI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (OrI dst src));
   effect(KILL cr);
 
@@ -8686,7 +8627,7 @@
 %}
 
 // Or Register with Memory
-instruct orI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct orI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (OrI dst (LoadI src)));
   effect(KILL cr);
 
@@ -8698,7 +8639,7 @@
 %}
 
 // Or Memory with Register
-instruct orI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct orI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (OrI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -8724,7 +8665,7 @@
 
 // ROL/ROR
 // ROL expand
-instruct rolI_eReg_imm1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct rolI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   effect(USE_DEF dst, USE shift, KILL cr);
 
   format %{ "ROL    $dst, $shift" %}
@@ -8733,7 +8674,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct rolI_eReg_imm8(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct rolI_eReg_imm8(rRegI dst, immI8 shift, eFlagsReg cr) %{
   effect(USE_DEF dst, USE shift, KILL cr);
 
   format %{ "ROL    $dst, $shift" %}
@@ -8753,7 +8694,7 @@
 // end of ROL expand
 
 // ROL 32bit by one once
-instruct rolI_eReg_i1(eRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{
+instruct rolI_eReg_i1(rRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{
   match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));
 
   expand %{
@@ -8762,7 +8703,7 @@
 %}
 
 // ROL 32bit var by imm8 once
-instruct rolI_eReg_i8(eRegI dst, immI8 lshift, immI8 rshift, eFlagsReg cr) %{
+instruct rolI_eReg_i8(rRegI dst, immI8 lshift, immI8 rshift, eFlagsReg cr) %{
   predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
   match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));
 
@@ -8790,7 +8731,7 @@
 %}
 
 // ROR expand
-instruct rorI_eReg_imm1(eRegI dst, immI1 shift, eFlagsReg cr) %{
+instruct rorI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{
   effect(USE_DEF dst, USE shift, KILL cr);
 
   format %{ "ROR    $dst, $shift" %}
@@ -8799,7 +8740,7 @@
   ins_pipe( ialu_reg );
 %}
 
-instruct rorI_eReg_imm8(eRegI dst, immI8 shift, eFlagsReg cr) %{
+instruct rorI_eReg_imm8(rRegI dst, immI8 shift, eFlagsReg cr) %{
   effect (USE_DEF dst, USE shift, KILL cr);
 
   format %{ "ROR    $dst, $shift" %}
@@ -8819,7 +8760,7 @@
 // end of ROR expand
 
 // ROR right once
-instruct rorI_eReg_i1(eRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{
+instruct rorI_eReg_i1(rRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{
   match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));
 
   expand %{
@@ -8828,7 +8769,7 @@
 %}
 
 // ROR 32bit by immI8 once
-instruct rorI_eReg_i8(eRegI dst, immI8 rshift, immI8 lshift, eFlagsReg cr) %{
+instruct rorI_eReg_i8(rRegI dst, immI8 rshift, immI8 lshift, eFlagsReg cr) %{
   predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
   match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));
 
@@ -8857,7 +8798,7 @@
 
 // Xor Instructions
 // Xor Register with Register
-instruct xorI_eReg(eRegI dst, eRegI src, eFlagsReg cr) %{
+instruct xorI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (XorI dst src));
   effect(KILL cr);
 
@@ -8869,7 +8810,7 @@
 %}
 
 // Xor Register with Immediate -1
-instruct xorI_eReg_im1(eRegI dst, immI_M1 imm) %{
+instruct xorI_eReg_im1(rRegI dst, immI_M1 imm) %{
   match(Set dst (XorI dst imm));  
 
   size(2);
@@ -8881,7 +8822,7 @@
 %}
 
 // Xor Register with Immediate
-instruct xorI_eReg_imm(eRegI dst, immI src, eFlagsReg cr) %{
+instruct xorI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
   match(Set dst (XorI dst src));
   effect(KILL cr);
 
@@ -8893,7 +8834,7 @@
 %}
 
 // Xor Register with Memory
-instruct xorI_eReg_mem(eRegI dst, memory src, eFlagsReg cr) %{
+instruct xorI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
   match(Set dst (XorI dst (LoadI src)));
   effect(KILL cr);
 
@@ -8905,7 +8846,7 @@
 %}
 
 // Xor Memory with Register
-instruct xorI_mem_eReg(memory dst, eRegI src, eFlagsReg cr) %{
+instruct xorI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (StoreI dst (XorI (LoadI dst) src)));
   effect(KILL cr);
 
@@ -8930,7 +8871,7 @@
 
 //----------Convert Int to Boolean---------------------------------------------
 
-instruct movI_nocopy(eRegI dst, eRegI src) %{
+instruct movI_nocopy(rRegI dst, rRegI src) %{
   effect( DEF dst, USE src );
   format %{ "MOV    $dst,$src" %}
   ins_encode( enc_Copy( dst, src) );
@@ -8937,7 +8878,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct ci2b( eRegI dst, eRegI src, eFlagsReg cr ) %{
+instruct ci2b( rRegI dst, rRegI src, eFlagsReg cr ) %{
   effect( USE_DEF dst, USE src, KILL cr );
 
   size(4);
@@ -8948,7 +8889,7 @@
   ins_pipe( ialu_reg_reg_long );
 %}
 
-instruct convI2B( eRegI dst, eRegI src, eFlagsReg cr ) %{
+instruct convI2B( rRegI dst, rRegI src, eFlagsReg cr ) %{
   match(Set dst (Conv2B src));
 
   expand %{
@@ -8957,7 +8898,7 @@
   %}
 %}
 
-instruct movP_nocopy(eRegI dst, eRegP src) %{
+instruct movP_nocopy(rRegI dst, eRegP src) %{
   effect( DEF dst, USE src );
   format %{ "MOV    $dst,$src" %}
   ins_encode( enc_Copy( dst, src) );
@@ -8964,7 +8905,7 @@
   ins_pipe( ialu_reg_reg );
 %}
 
-instruct cp2b( eRegI dst, eRegP src, eFlagsReg cr ) %{
+instruct cp2b( rRegI dst, eRegP src, eFlagsReg cr ) %{
   effect( USE_DEF dst, USE src, KILL cr );
   format %{ "NEG    $dst\n\t"
             "ADC    $dst,$src" %}
@@ -8973,7 +8914,7 @@
   ins_pipe( ialu_reg_reg_long );
 %}
 
-instruct convP2B( eRegI dst, eRegP src, eFlagsReg cr ) %{
+instruct convP2B( rRegI dst, eRegP src, eFlagsReg cr ) %{
   match(Set dst (Conv2B src));
 
   expand %{
@@ -8998,7 +8939,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct cmpLTMask0( eRegI dst, immI0 zero, eFlagsReg cr ) %{
+instruct cmpLTMask0( rRegI dst, immI0 zero, eFlagsReg cr ) %{
   match(Set dst (CmpLTMask dst zero));
   effect( DEF dst, KILL cr );
   ins_cost(100);
@@ -9470,7 +9411,7 @@
 %}
 
 // Compare vs zero into -1,0,1
-instruct cmpDPR_0(eRegI dst, regDPR src1, immDPR0 zero, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpDPR_0(rRegI dst, regDPR src1, immDPR0 zero, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (CmpD3 src1 zero));
   effect(KILL cr, KILL rax);
@@ -9484,7 +9425,7 @@
 %}
 
 // Compare into -1,0,1
-instruct cmpDPR_reg(eRegI dst, regDPR src1, regDPR src2, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpDPR_reg(rRegI dst, regDPR src1, regDPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE<=1);
   match(Set dst (CmpD3 src1 src2));
   effect(KILL cr, KILL rax);
@@ -10262,7 +10203,7 @@
 %}
 
 // Compare vs zero into -1,0,1
-instruct cmpFPR_0(eRegI dst, regFPR src1, immFPR0 zero, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpFPR_0(rRegI dst, regFPR src1, immFPR0 zero, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE == 0);
   match(Set dst (CmpF3 src1 zero));
   effect(KILL cr, KILL rax);
@@ -10276,7 +10217,7 @@
 %}
 
 // Compare into -1,0,1
-instruct cmpFPR_reg(eRegI dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
+instruct cmpFPR_reg(rRegI dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
   predicate(UseSSE == 0);
   match(Set dst (CmpF3 src1 src2));
   effect(KILL cr, KILL rax);
@@ -11196,7 +11137,7 @@
   ins_pipe( fpu_reg_mem );
 %}
 
-instruct convI2D_reg(regD dst, eRegI src) %{
+instruct convI2D_reg(regD dst, rRegI src) %{
   predicate( UseSSE>=2 && !UseXmmI2D );
   match(Set dst (ConvI2D src));
   format %{ "CVTSI2SD $dst,$src" %}
@@ -11216,7 +11157,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct convXI2D_reg(regD dst, eRegI src)
+instruct convXI2D_reg(regD dst, rRegI src)
 %{
   predicate( UseSSE>=2 && UseXmmI2D );
   match(Set dst (ConvI2D src));
@@ -11304,7 +11245,7 @@
 %}
 
 // Convert an int to a float in xmm; no rounding step needed.
-instruct convI2F_reg(regF dst, eRegI src) %{
+instruct convI2F_reg(regF dst, rRegI src) %{
   predicate( UseSSE==1 || UseSSE>=2 && !UseXmmI2F );
   match(Set dst (ConvI2F src));
   format %{ "CVTSI2SS $dst, $src" %}
@@ -11314,7 +11255,7 @@
   ins_pipe( pipe_slow );
 %}
 
- instruct convXI2F_reg(regF dst, eRegI src)
+ instruct convXI2F_reg(regF dst, rRegI src)
 %{
   predicate( UseSSE>=2 && UseXmmI2F );
   match(Set dst (ConvI2F src));
@@ -11328,7 +11269,7 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
-instruct convI2L_reg( eRegL dst, eRegI src, eFlagsReg cr) %{
+instruct convI2L_reg( eRegL dst, rRegI src, eFlagsReg cr) %{
   match(Set dst (ConvI2L src));
   effect(KILL cr);
   ins_cost(375);
@@ -11340,7 +11281,7 @@
 %}
 
 // Zero-extend convert int to long
-instruct convI2L_reg_zex(eRegL dst, eRegI src, immL_32bits mask, eFlagsReg flags ) %{
+instruct convI2L_reg_zex(eRegL dst, rRegI src, immL_32bits mask, eFlagsReg flags ) %{
   match(Set dst (AndL (ConvI2L src) mask) );
   effect( KILL flags );
   ins_cost(250);
@@ -11420,7 +11361,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct convL2I_reg( eRegI dst, eRegL src ) %{
+instruct convL2I_reg( rRegI dst, eRegL src ) %{
   match(Set dst (ConvL2I src));
   effect( DEF dst, USE src );
   format %{ "MOV    $dst,$src.lo" %}
@@ -11429,7 +11370,7 @@
 %}
 
 
-instruct MoveF2I_stack_reg(eRegI dst, stackSlotF src) %{
+instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
   ins_cost(100);
@@ -11464,7 +11405,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveF2I_reg_reg_sse(eRegI dst, regF src) %{
+instruct MoveF2I_reg_reg_sse(rRegI dst, regF src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveF2I src));
   effect( DEF dst, USE src );
@@ -11476,7 +11417,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveI2F_reg_stack(stackSlotF dst, eRegI src) %{
+instruct MoveI2F_reg_stack(stackSlotF dst, rRegI src) %{
   match(Set dst (MoveI2F src));
   effect( DEF dst, USE src );
 
@@ -11516,7 +11457,7 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct MoveI2F_reg_reg_sse(regF dst, eRegI src) %{
+instruct MoveI2F_reg_reg_sse(regF dst, rRegI src) %{
   predicate(UseSSE>=2);
   match(Set dst (MoveI2F src));
   effect( DEF dst, USE src );
@@ -11650,187 +11591,7 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate8B src));
-  format %{ "MOVDQA  $dst,$src\n\t"
-            "PUNPCKLBW $dst,$dst\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode %{
-    if ($dst$$reg != $src$$reg) {
-      __ movdqa($dst$$XMMRegister, $src$$XMMRegister);
-    }
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
 
-// Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_eRegI(regD dst, eRegI src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate8B src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PUNPCKLBW $dst,$dst\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate scalar zero to packed byte (1 byte) values in xmm
-instruct Repl8B_immI0(regD dst, immI0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate8B zero));
-  format %{ "PXOR  $dst,$dst\t! replicate8B" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4S src));
-  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_eRegI(regD dst, eRegI src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4S src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed short (2 byte) values in xmm
-instruct Repl4S_immI0(regD dst, immI0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4S zero));
-  format %{ "PXOR  $dst,$dst\t! replicate4S" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4C src));
-  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
-  ins_encode %{
-    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_eRegI(regD dst, eRegI src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4C src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed char (2 byte) values in xmm
-instruct Repl4C_immI0(regD dst, immI0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate4C zero));
-  format %{ "PXOR  $dst,$dst\t! replicate4C" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2I src));
-  format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_eRegI(regD dst, eRegI src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2I src));
-  format %{ "MOVD   $dst,$src\n\t"
-            "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed integer (2 byte) values in xmm
-instruct Repl2I_immI0(regD dst, immI0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2I zero));
-  format %{ "PXOR  $dst,$dst\t! replicate2I" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_reg(regD dst, regD src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2F src));
-  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_regF(regD dst, regF src) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2F src));
-  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_immF0(regD dst, immF0 zero) %{
-  predicate(UseSSE>=2);
-  match(Set dst (Replicate2F zero));
-  format %{ "PXOR  $dst,$dst\t! replicate2F" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 // =======================================================================
 // fast clearing of an array
 instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
@@ -11938,7 +11699,7 @@
 
 //----------Control Flow Instructions------------------------------------------
 // Signed compare Instructions
-instruct compI_eReg(eFlagsReg cr, eRegI op1, eRegI op2) %{
+instruct compI_eReg(eFlagsReg cr, rRegI op1, rRegI op2) %{
   match(Set cr (CmpI op1 op2));
   effect( DEF cr, USE op1, USE op2 );
   format %{ "CMP    $op1,$op2" %}
@@ -11947,7 +11708,7 @@
   ins_pipe( ialu_cr_reg_reg );
 %}
 
-instruct compI_eReg_imm(eFlagsReg cr, eRegI op1, immI op2) %{
+instruct compI_eReg_imm(eFlagsReg cr, rRegI op1, immI op2) %{
   match(Set cr (CmpI op1 op2));
   effect( DEF cr, USE op1 );
   format %{ "CMP    $op1,$op2" %}
@@ -11958,7 +11719,7 @@
 %}
 
 // Cisc-spilled version of cmpI_eReg
-instruct compI_eReg_mem(eFlagsReg cr, eRegI op1, memory op2) %{
+instruct compI_eReg_mem(eFlagsReg cr, rRegI op1, memory op2) %{
   match(Set cr (CmpI op1 (LoadI op2)));
 
   format %{ "CMP    $op1,$op2" %}
@@ -11968,7 +11729,7 @@
   ins_pipe( ialu_cr_reg_mem );
 %}
 
-instruct testI_reg( eFlagsReg cr, eRegI src, immI0 zero ) %{
+instruct testI_reg( eFlagsReg cr, rRegI src, immI0 zero ) %{
   match(Set cr (CmpI src zero));
   effect( DEF cr, USE src );
 
@@ -11978,7 +11739,7 @@
   ins_pipe( ialu_cr_reg_imm );
 %}
 
-instruct testI_reg_imm( eFlagsReg cr, eRegI src, immI con, immI0 zero ) %{
+instruct testI_reg_imm( eFlagsReg cr, rRegI src, immI con, immI0 zero ) %{
   match(Set cr (CmpI (AndI src con) zero));
 
   format %{ "TEST   $src,$con" %}
@@ -11987,7 +11748,7 @@
   ins_pipe( ialu_cr_reg_imm );
 %}
 
-instruct testI_reg_mem( eFlagsReg cr, eRegI src, memory mem, immI0 zero ) %{
+instruct testI_reg_mem( eFlagsReg cr, rRegI src, memory mem, immI0 zero ) %{
   match(Set cr (CmpI (AndI src mem) zero));
 
   format %{ "TEST   $src,$mem" %}
@@ -11998,7 +11759,7 @@
 
 // Unsigned compare Instructions; really, same as signed except they
 // produce an eFlagsRegU instead of eFlagsReg.
-instruct compU_eReg(eFlagsRegU cr, eRegI op1, eRegI op2) %{
+instruct compU_eReg(eFlagsRegU cr, rRegI op1, rRegI op2) %{
   match(Set cr (CmpU op1 op2));
 
   format %{ "CMPu   $op1,$op2" %}
@@ -12007,7 +11768,7 @@
   ins_pipe( ialu_cr_reg_reg );
 %}
 
-instruct compU_eReg_imm(eFlagsRegU cr, eRegI op1, immI op2) %{
+instruct compU_eReg_imm(eFlagsRegU cr, rRegI op1, immI op2) %{
   match(Set cr (CmpU op1 op2));
 
   format %{ "CMPu   $op1,$op2" %}
@@ -12017,7 +11778,7 @@
 %}
 
 // // Cisc-spilled version of cmpU_eReg
-instruct compU_eReg_mem(eFlagsRegU cr, eRegI op1, memory op2) %{
+instruct compU_eReg_mem(eFlagsRegU cr, rRegI op1, memory op2) %{
   match(Set cr (CmpU op1 (LoadI op2)));
 
   format %{ "CMPu   $op1,$op2" %}
@@ -12028,7 +11789,7 @@
 %}
 
 // // Cisc-spilled version of cmpU_eReg
-//instruct compU_mem_eReg(eFlagsRegU cr, memory op1, eRegI op2) %{
+//instruct compU_mem_eReg(eFlagsRegU cr, memory op1, rRegI op2) %{
 //  match(Set cr (CmpU (LoadI op1) op2));
 //
 //  format %{ "CMPu   $op1,$op2" %}
@@ -12037,7 +11798,7 @@
 //  ins_encode( OpcP, RegMem( op1, op2) );
 //%}
 
-instruct testU_reg( eFlagsRegU cr, eRegI src, immI0 zero ) %{
+instruct testU_reg( eFlagsRegU cr, rRegI src, immI0 zero ) %{
   match(Set cr (CmpU src zero));
 
   format %{ "TESTu  $src,$src" %}
@@ -12133,7 +11894,7 @@
 //   *** Min and Max using the conditional move are slower than the
 //   *** branch version on a Pentium III.
 // // Conditional move for min
-//instruct cmovI_reg_lt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
+//instruct cmovI_reg_lt( rRegI op2, rRegI op1, eFlagsReg cr ) %{
 //  effect( USE_DEF op2, USE op1, USE cr );
 //  format %{ "CMOVlt $op2,$op1\t! min" %}
 //  opcode(0x4C,0x0F);
@@ -12142,7 +11903,7 @@
 //%}
 //
 //// Min Register with Register (P6 version)
-//instruct minI_eReg_p6( eRegI op1, eRegI op2 ) %{
+//instruct minI_eReg_p6( rRegI op1, rRegI op2 ) %{
 //  predicate(VM_Version::supports_cmov() );
 //  match(Set op2 (MinI op1 op2));
 //  ins_cost(200);
@@ -12154,7 +11915,7 @@
 //%}
 
 // Min Register with Register (generic version)
-instruct minI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
+instruct minI_eReg(rRegI dst, rRegI src, eFlagsReg flags) %{
   match(Set dst (MinI dst src));
   effect(KILL flags);
   ins_cost(300);
@@ -12169,7 +11930,7 @@
 //   *** Min and Max using the conditional move are slower than the
 //   *** branch version on a Pentium III.
 // // Conditional move for max
-//instruct cmovI_reg_gt( eRegI op2, eRegI op1, eFlagsReg cr ) %{
+//instruct cmovI_reg_gt( rRegI op2, rRegI op1, eFlagsReg cr ) %{
 //  effect( USE_DEF op2, USE op1, USE cr );
 //  format %{ "CMOVgt $op2,$op1\t! max" %}
 //  opcode(0x4F,0x0F);
@@ -12178,7 +11939,7 @@
 //%}
 //
 // // Max Register with Register (P6 version)
-//instruct maxI_eReg_p6( eRegI op1, eRegI op2 ) %{
+//instruct maxI_eReg_p6( rRegI op1, rRegI op2 ) %{
 //  predicate(VM_Version::supports_cmov() );
 //  match(Set op2 (MaxI op1 op2));
 //  ins_cost(200);
@@ -12190,7 +11951,7 @@
 //%}
 
 // Max Register with Register (generic version)
-instruct maxI_eReg(eRegI dst, eRegI src, eFlagsReg flags) %{
+instruct maxI_eReg(rRegI dst, rRegI src, eFlagsReg flags) %{
   match(Set dst (MaxI dst src));
   effect(KILL flags);
   ins_cost(300);
@@ -12251,7 +12012,7 @@
 // ============================================================================
 // Branch Instructions
 // Jump Table
-instruct jumpXtnd(eRegI switch_val) %{
+instruct jumpXtnd(rRegI switch_val) %{
   match(Jump switch_val);
   ins_cost(350);
   format %{  "JMP    [$constantaddress](,$switch_val,1)\n\t" %}
@@ -12669,7 +12430,7 @@
 // Manifest a CmpL result in the normal flags.  Only good for LT or GE
 // compares.  Can be used for LE or GT compares by reversing arguments.
 // NOT GOOD FOR EQ/NE tests.
-instruct cmpL_reg_flags_LTGE( flagsReg_long_LTGE flags, eRegL src1, eRegL src2, eRegI tmp ) %{
+instruct cmpL_reg_flags_LTGE( flagsReg_long_LTGE flags, eRegL src1, eRegL src2, rRegI tmp ) %{
   match( Set flags (CmpL src1 src2 ));
   effect( TEMP tmp );
   ins_cost(300);
@@ -12715,7 +12476,7 @@
 %}
 
 // Compare 2 longs and CMOVE ints.
-instruct cmovII_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, eRegI src) %{
+instruct cmovII_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, rRegI dst, rRegI src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
@@ -12725,7 +12486,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovII_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegI dst, memory src) %{
+instruct cmovII_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -12786,7 +12547,7 @@
 
 //======
 // Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.
-instruct cmpL_zero_flags_EQNE( flagsReg_long_EQNE flags, eRegL src, immL0 zero, eRegI tmp ) %{
+instruct cmpL_zero_flags_EQNE( flagsReg_long_EQNE flags, eRegL src, immL0 zero, rRegI tmp ) %{
   match( Set flags (CmpL src zero ));
   effect(TEMP tmp);
   ins_cost(200);
@@ -12843,7 +12604,7 @@
 %}
 
 // Compare 2 longs and CMOVE ints.
-instruct cmovII_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, eRegI src) %{
+instruct cmovII_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, rRegI dst, rRegI src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
@@ -12853,7 +12614,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovII_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegI dst, memory src) %{
+instruct cmovII_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -12915,7 +12676,7 @@
 //======
 // Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
 // Same as cmpL_reg_flags_LEGT except must negate src
-instruct cmpL_zero_flags_LEGT( flagsReg_long_LEGT flags, eRegL src, immL0 zero, eRegI tmp ) %{
+instruct cmpL_zero_flags_LEGT( flagsReg_long_LEGT flags, eRegL src, immL0 zero, rRegI tmp ) %{
   match( Set flags (CmpL src zero ));
   effect( TEMP tmp );
   ins_cost(300);
@@ -12929,7 +12690,7 @@
 // Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
 // Same as cmpL_reg_flags_LTGE except operands swapped.  Swapping operands
 // requires a commuted test to get the same result.
-instruct cmpL_reg_flags_LEGT( flagsReg_long_LEGT flags, eRegL src1, eRegL src2, eRegI tmp ) %{
+instruct cmpL_reg_flags_LEGT( flagsReg_long_LEGT flags, eRegL src1, eRegL src2, rRegI tmp ) %{
   match( Set flags (CmpL src1 src2 ));
   effect( TEMP tmp );
   ins_cost(300);
@@ -12976,7 +12737,7 @@
 %}
 
 // Compare 2 longs and CMOVE ints.
-instruct cmovII_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, eRegI src) %{
+instruct cmovII_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, rRegI dst, rRegI src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
   ins_cost(200);
@@ -12986,7 +12747,7 @@
   ins_pipe( pipe_cmov_reg );
 %}
 
-instruct cmovII_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegI dst, memory src) %{
+instruct cmovII_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, rRegI dst, memory src) %{
   predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
   match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
   ins_cost(250);
@@ -13315,11 +13076,11 @@
 // ---------EXAMPLE----------------------------------------------------------
 //
 // // pertinent parts of existing instructions in architecture description
-// instruct movI(eRegI dst, eRegI src) %{
+// instruct movI(rRegI dst, rRegI src) %{
 //   match(Set dst (CopyI src));
 // %}
 //
-// instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{
+// instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{
 //   match(Set dst (AddI dst src));
 //   effect(KILL cr);
 // %}
@@ -13364,11 +13125,11 @@
 // %}
 
 // // Change load of spilled value to only a spill
-// instruct storeI(memory mem, eRegI src) %{
+// instruct storeI(memory mem, rRegI src) %{
 //   match(Set mem (StoreI mem src));
 // %}
 //
-// instruct loadI(eRegI dst, memory mem) %{
+// instruct loadI(rRegI dst, memory mem) %{
 //   match(Set dst (LoadI mem));
 // %}
 //
--- old/src/cpu/x86/vm/x86_64.ad	Sat Jun  2 20:04:04 2012
+++ new/src/cpu/x86/vm/x86_64.ad	Sat Jun  2 20:04:03 2012
@@ -131,102 +131,6 @@
 
 // Floating Point Registers
 
-// XMM registers.  128-bit registers or 4 words each, labeled (a)-d.
-// Word a in each register holds a Float, words ab hold a Double.  We
-// currently do not use the SIMD capabilities, so registers cd are
-// unused at the moment.
-// XMM8-XMM15 must be encoded with REX.
-// Linux ABI:   No register preserved across function calls
-//              XMM0-XMM7 might hold parameters
-// Windows ABI: XMM6-XMM15 preserved across function calls
-//              XMM0-XMM3 might hold parameters
-
-reg_def XMM0   (SOC, SOC, Op_RegF,  0, xmm0->as_VMReg());
-reg_def XMM0_H (SOC, SOC, Op_RegF,  0, xmm0->as_VMReg()->next());
-
-reg_def XMM1   (SOC, SOC, Op_RegF,  1, xmm1->as_VMReg());
-reg_def XMM1_H (SOC, SOC, Op_RegF,  1, xmm1->as_VMReg()->next());
-
-reg_def XMM2   (SOC, SOC, Op_RegF,  2, xmm2->as_VMReg());
-reg_def XMM2_H (SOC, SOC, Op_RegF,  2, xmm2->as_VMReg()->next());
-
-reg_def XMM3   (SOC, SOC, Op_RegF,  3, xmm3->as_VMReg());
-reg_def XMM3_H (SOC, SOC, Op_RegF,  3, xmm3->as_VMReg()->next());
-
-reg_def XMM4   (SOC, SOC, Op_RegF,  4, xmm4->as_VMReg());
-reg_def XMM4_H (SOC, SOC, Op_RegF,  4, xmm4->as_VMReg()->next());
-
-reg_def XMM5   (SOC, SOC, Op_RegF,  5, xmm5->as_VMReg());
-reg_def XMM5_H (SOC, SOC, Op_RegF,  5, xmm5->as_VMReg()->next());
-
-#ifdef _WIN64
-
-reg_def XMM6   (SOC, SOE, Op_RegF,  6, xmm6->as_VMReg());
-reg_def XMM6_H (SOC, SOE, Op_RegF,  6, xmm6->as_VMReg()->next());
-
-reg_def XMM7   (SOC, SOE, Op_RegF,  7, xmm7->as_VMReg());
-reg_def XMM7_H (SOC, SOE, Op_RegF,  7, xmm7->as_VMReg()->next());
-
-reg_def XMM8   (SOC, SOE, Op_RegF,  8, xmm8->as_VMReg());
-reg_def XMM8_H (SOC, SOE, Op_RegF,  8, xmm8->as_VMReg()->next());
-
-reg_def XMM9   (SOC, SOE, Op_RegF,  9, xmm9->as_VMReg());
-reg_def XMM9_H (SOC, SOE, Op_RegF,  9, xmm9->as_VMReg()->next());
-
-reg_def XMM10  (SOC, SOE, Op_RegF, 10, xmm10->as_VMReg());
-reg_def XMM10_H(SOC, SOE, Op_RegF, 10, xmm10->as_VMReg()->next());
-
-reg_def XMM11  (SOC, SOE, Op_RegF, 11, xmm11->as_VMReg());
-reg_def XMM11_H(SOC, SOE, Op_RegF, 11, xmm11->as_VMReg()->next());
-
-reg_def XMM12  (SOC, SOE, Op_RegF, 12, xmm12->as_VMReg());
-reg_def XMM12_H(SOC, SOE, Op_RegF, 12, xmm12->as_VMReg()->next());
-
-reg_def XMM13  (SOC, SOE, Op_RegF, 13, xmm13->as_VMReg());
-reg_def XMM13_H(SOC, SOE, Op_RegF, 13, xmm13->as_VMReg()->next());
-
-reg_def XMM14  (SOC, SOE, Op_RegF, 14, xmm14->as_VMReg());
-reg_def XMM14_H(SOC, SOE, Op_RegF, 14, xmm14->as_VMReg()->next());
-
-reg_def XMM15  (SOC, SOE, Op_RegF, 15, xmm15->as_VMReg());
-reg_def XMM15_H(SOC, SOE, Op_RegF, 15, xmm15->as_VMReg()->next());
-
-#else
-
-reg_def XMM6   (SOC, SOC, Op_RegF,  6, xmm6->as_VMReg());
-reg_def XMM6_H (SOC, SOC, Op_RegF,  6, xmm6->as_VMReg()->next());
-
-reg_def XMM7   (SOC, SOC, Op_RegF,  7, xmm7->as_VMReg());
-reg_def XMM7_H (SOC, SOC, Op_RegF,  7, xmm7->as_VMReg()->next());
-
-reg_def XMM8   (SOC, SOC, Op_RegF,  8, xmm8->as_VMReg());
-reg_def XMM8_H (SOC, SOC, Op_RegF,  8, xmm8->as_VMReg()->next());
-
-reg_def XMM9   (SOC, SOC, Op_RegF,  9, xmm9->as_VMReg());
-reg_def XMM9_H (SOC, SOC, Op_RegF,  9, xmm9->as_VMReg()->next());
-
-reg_def XMM10  (SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
-reg_def XMM10_H(SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next());
-
-reg_def XMM11  (SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
-reg_def XMM11_H(SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next());
-
-reg_def XMM12  (SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
-reg_def XMM12_H(SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next());
-
-reg_def XMM13  (SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
-reg_def XMM13_H(SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next());
-
-reg_def XMM14  (SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
-reg_def XMM14_H(SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next());
-
-reg_def XMM15  (SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
-reg_def XMM15_H(SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next());
-
-#endif // _WIN64
-
-reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
-
 // Specify priority of register selection within phases of register
 // allocation.  Highest priority is first.  A useful heuristic is to
 // give registers a low priority when they are required by machine
@@ -252,27 +156,7 @@
                    R15,         R15_H,
                    RSP,         RSP_H);
 
-// XXX probably use 8-15 first on Linux
-alloc_class chunk1(XMM0,  XMM0_H,
-                   XMM1,  XMM1_H,
-                   XMM2,  XMM2_H,
-                   XMM3,  XMM3_H,
-                   XMM4,  XMM4_H,
-                   XMM5,  XMM5_H,
-                   XMM6,  XMM6_H,
-                   XMM7,  XMM7_H,
-                   XMM8,  XMM8_H,
-                   XMM9,  XMM9_H,
-                   XMM10, XMM10_H,
-                   XMM11, XMM11_H,
-                   XMM12, XMM12_H,
-                   XMM13, XMM13_H,
-                   XMM14, XMM14_H,
-                   XMM15, XMM15_H);
 
-alloc_class chunk2(RFLAGS);
-
-
 //----------Architecture Description Register Classes--------------------------
 // Several register classes are automatically defined based upon information in
 // this architecture description.
@@ -501,47 +385,8 @@
 // Singleton class for instruction pointer
 // reg_class ip_reg(RIP);
 
-// Singleton class for condition codes
-reg_class int_flags(RFLAGS);
-
-// Class for all float registers
-reg_class float_reg(XMM0,
-                    XMM1,
-                    XMM2,
-                    XMM3,
-                    XMM4,
-                    XMM5,
-                    XMM6,
-                    XMM7,
-                    XMM8,
-                    XMM9,
-                    XMM10,
-                    XMM11,
-                    XMM12,
-                    XMM13,
-                    XMM14,
-                    XMM15);
-
-// Class for all double registers
-reg_class double_reg(XMM0,  XMM0_H,
-                     XMM1,  XMM1_H,
-                     XMM2,  XMM2_H,
-                     XMM3,  XMM3_H,
-                     XMM4,  XMM4_H,
-                     XMM5,  XMM5_H,
-                     XMM6,  XMM6_H,
-                     XMM7,  XMM7_H,
-                     XMM8,  XMM8_H,
-                     XMM9,  XMM9_H,
-                     XMM10, XMM10_H,
-                     XMM11, XMM11_H,
-                     XMM12, XMM12_H,
-                     XMM13, XMM13_H,
-                     XMM14, XMM14_H,
-                     XMM15, XMM15_H);
 %}
 
-
 //----------SOURCE BLOCK-------------------------------------------------------
 // This is a block of C++ code which provides values, functions, and
 // definitions necessary in the rest of the architecture description
@@ -1027,12 +872,84 @@
   return rc_float;
 }
 
+// Next two methods are shared by 32- and 64-bit VM. They are defined in x86.ad.
+static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
+                          int src_hi, int dst_hi, uint ireg, outputStream* st);
+
+static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
+                            int stack_offset, int reg, uint ireg, outputStream* st);
+
+static void vec_stack_to_stack_helper(CodeBuffer *cbuf, int src_offset,  // Copy vector data between two rsp-relative memory locations (src_offset -> dst_offset).
+                                      int dst_offset, uint ireg, outputStream* st) {  // ireg picks the width (Op_VecS/D/X/Y); emits code if cbuf != NULL, else (non-PRODUCT only) prints it to st.
+  if (cbuf) {  // a code buffer was supplied: emit real instructions into it
+    MacroAssembler _masm(cbuf);
+    switch (ireg) {
+    case Op_VecS:  // 32-bit copy routed through rax
+      __ movq(Address(rsp, -8), rax);  // park rax just below rsp; it serves as the scratch register
+      __ movl(rax, Address(rsp, src_offset));
+      __ movl(Address(rsp, dst_offset), rax);
+      __ movq(rax, Address(rsp, -8));  // restore the caller's rax
+      break;
+    case Op_VecD:  // 64-bit copy: push the source, pop into the destination (no scratch register)
+      __ pushq(Address(rsp, src_offset));  // x86 PUSH computes an rsp-based operand address before decrementing rsp
+      __ popq (Address(rsp, dst_offset));  // x86 POP computes an rsp-based operand address after incrementing rsp
+      break;
+    case Op_VecX:  // 128-bit copy done as two 64-bit push/pop pairs
+      __ pushq(Address(rsp, src_offset));
+      __ popq (Address(rsp, dst_offset));
+      __ pushq(Address(rsp, src_offset+8));  // second 8 bytes
+      __ popq (Address(rsp, dst_offset+8));
+      break;
+    case Op_VecY:  // copy routed through xmm0 (labelled 256-bit in the listing text further down)
+      __ vmovdqu(Address(rsp, -32), xmm0);  // park xmm0 in the 32 bytes below rsp
+      __ vmovdqu(xmm0, Address(rsp, src_offset));
+      __ vmovdqu(Address(rsp, dst_offset), xmm0);
+      __ vmovdqu(xmm0, Address(rsp, -32));  // restore xmm0
+      break;
+    default:  // only the four vector ideal registers above are handled
+      ShouldNotReachHere();
+    }
+#ifndef PRODUCT
+  } else {  // no code buffer: print the same sequences as text; st is dereferenced unchecked, so it must be non-NULL here
+    switch (ireg) {
+    case Op_VecS:
+      st->print("movq    [rsp - #8], rax\t# 32-bit mem-mem spill\n\t"
+                "movl    rax, [rsp + #%d]\n\t"
+                "movl    [rsp + #%d], rax\n\t"
+                "movq    rax, [rsp - #8]",
+                src_offset, dst_offset);
+      break;
+    case Op_VecD:
+      st->print("pushq   [rsp + #%d]\t# 64-bit mem-mem spill\n\t"
+                "popq    [rsp + #%d]",
+                src_offset, dst_offset);
+      break;
+     case Op_VecX:
+      st->print("pushq   [rsp + #%d]\t# 128-bit mem-mem spill\n\t"
+                "popq    [rsp + #%d]\n\t"
+                "pushq   [rsp + #%d]\n\t"
+                "popq    [rsp + #%d]",
+                src_offset, dst_offset, src_offset+8, dst_offset+8);
+      break;
+    case Op_VecY:
+      st->print("vmovdqu [rsp - #32], xmm0\t# 256-bit mem-mem spill\n\t"
+                "vmovdqu xmm0, [rsp + #%d]\n\t"
+                "vmovdqu [rsp + #%d], xmm0\n\t"
+                "vmovdqu xmm0, [rsp - #32]",
+                src_offset, dst_offset);
+      break;
+    default:  // keep in step with the emitting switch above
+      ShouldNotReachHere();
+    }
+#endif // !PRODUCT
+  }
+}
+
 uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
                                        PhaseRegAlloc* ra_,
                                        bool do_size,
-                                       outputStream* st) const
-{
-
+                                       outputStream* st) const {
+  assert(cbuf != NULL || st  != NULL, "sanity");
   // Get registers to move
   OptoReg::Name src_second = ra_->get_reg_second(in(1));
   OptoReg::Name src_first = ra_->get_reg_first(in(1));
@@ -1050,7 +967,30 @@
   if (src_first == dst_first && src_second == dst_second) {
     // Self copy, no move
     return 0;
-  } else if (src_first_rc == rc_stack) {
+  }
+  if (bottom_type()->isa_vect() != NULL) {
+    uint ireg = ideal_reg();
+    assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
+    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
+    if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
+      // mem -> mem
+      int src_offset = ra_->reg2offset(src_first);
+      int dst_offset = ra_->reg2offset(dst_first);
+      vec_stack_to_stack_helper(cbuf, src_offset, dst_offset, ireg, st);
+    } else if (src_first_rc == rc_float && dst_first_rc == rc_float ) {
+      vec_mov_helper(cbuf, false, src_first, dst_first, src_second, dst_second, ireg, st);
+    } else if (src_first_rc == rc_float && dst_first_rc == rc_stack ) {
+      int stack_offset = ra_->reg2offset(dst_first);
+      vec_spill_helper(cbuf, false, false, stack_offset, src_first, ireg, st);
+    } else if (src_first_rc == rc_stack && dst_first_rc == rc_float ) {
+      int stack_offset = ra_->reg2offset(src_first);
+      vec_spill_helper(cbuf, false, true,  stack_offset, dst_first, ireg, st);
+    } else {
+      ShouldNotReachHere();
+    }
+    return 0;
+  }
+  if (src_first_rc == rc_stack) {
     // mem ->
     if (dst_first_rc == rc_stack) {
       // mem -> mem
@@ -1061,23 +1001,16 @@
         int src_offset = ra_->reg2offset(src_first);
         int dst_offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          emit_opcode(*cbuf, 0xFF);
-          encode_RegMem(*cbuf, RSI_enc, RSP_enc, 0x4, 0, src_offset, false);
-
-          emit_opcode(*cbuf, 0x8F);
-          encode_RegMem(*cbuf, RAX_enc, RSP_enc, 0x4, 0, dst_offset, false);
-
+          MacroAssembler _masm(cbuf);
+          __ pushq(Address(rsp, src_offset));
+          __ popq (Address(rsp, dst_offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("pushq   [rsp + #%d]\t# 64-bit mem-mem spill\n\t"
-                     "popq    [rsp + #%d]",
-                     src_offset,
-                     dst_offset);
+                    "popq    [rsp + #%d]",
+                     src_offset, dst_offset);
 #endif
         }
-        return
-          3 + ((src_offset == 0) ? 0 : (src_offset < 0x80 ? 1 : 4)) +
-          3 + ((dst_offset == 0) ? 0 : (dst_offset < 0x80 ? 1 : 4));
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1086,46 +1019,22 @@
         int src_offset = ra_->reg2offset(src_first);
         int dst_offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          emit_opcode(*cbuf, Assembler::REX_W);
-          emit_opcode(*cbuf, 0x89);
-          emit_opcode(*cbuf, 0x44);
-          emit_opcode(*cbuf, 0x24);
-          emit_opcode(*cbuf, 0xF8);
-
-          emit_opcode(*cbuf, 0x8B);
-          encode_RegMem(*cbuf,
-                        RAX_enc,
-                        RSP_enc, 0x4, 0, src_offset,
-                        false);
-
-          emit_opcode(*cbuf, 0x89);
-          encode_RegMem(*cbuf,
-                        RAX_enc,
-                        RSP_enc, 0x4, 0, dst_offset,
-                        false);
-
-          emit_opcode(*cbuf, Assembler::REX_W);
-          emit_opcode(*cbuf, 0x8B);
-          emit_opcode(*cbuf, 0x44);
-          emit_opcode(*cbuf, 0x24);
-          emit_opcode(*cbuf, 0xF8);
-
+          MacroAssembler _masm(cbuf);
+          __ movq(Address(rsp, -8), rax);
+          __ movl(rax, Address(rsp, src_offset));
+          __ movl(Address(rsp, dst_offset), rax);
+          __ movq(rax, Address(rsp, -8));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movq    [rsp - #8], rax\t# 32-bit mem-mem spill\n\t"
-                     "movl    rax, [rsp + #%d]\n\t"
-                     "movl    [rsp + #%d], rax\n\t"
-                     "movq    rax, [rsp - #8]",
-                     src_offset,
-                     dst_offset);
+                    "movl    rax, [rsp + #%d]\n\t"
+                    "movl    [rsp + #%d], rax\n\t"
+                    "movq    rax, [rsp - #8]",
+                     src_offset, dst_offset);
 #endif
         }
-        return
-          5 + // movq
-          3 + ((src_offset == 0) ? 0 : (src_offset < 0x80 ? 1 : 4)) + // movl
-          3 + ((dst_offset == 0) ? 0 : (dst_offset < 0x80 ? 1 : 4)) + // movl
-          5; // movq
       }
+      return 0;
     } else if (dst_first_rc == rc_int) {
       // mem -> gpr
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1133,25 +1042,15 @@
         // 64-bit
         int offset = ra_->reg2offset(src_first);
         if (cbuf) {
-          if (Matcher::_regEncode[dst_first] < 8) {
-            emit_opcode(*cbuf, Assembler::REX_W);
-          } else {
-            emit_opcode(*cbuf, Assembler::REX_WR);
-          }
-          emit_opcode(*cbuf, 0x8B);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[dst_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movq(as_Register(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movq    %s, [rsp + #%d]\t# spill",
                      Matcher::regName[dst_first],
                      offset);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) + 4; // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1158,27 +1057,17 @@
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         int offset = ra_->reg2offset(src_first);
         if (cbuf) {
-          if (Matcher::_regEncode[dst_first] >= 8) {
-            emit_opcode(*cbuf, Assembler::REX_R);
-          }
-          emit_opcode(*cbuf, 0x8B);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[dst_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movl(as_Register(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movl    %s, [rsp + #%d]\t# spill",
                      Matcher::regName[dst_first],
                      offset);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[dst_first] < 8)
-           ? 3
-           : 4); // REX
       }
+      return 0;
     } else if (dst_first_rc == rc_float) {
       // mem-> xmm
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1189,7 +1078,7 @@
           MacroAssembler _masm(cbuf);
           __ movdbl( as_XMMRegister(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("%s  %s, [rsp + #%d]\t# spill",
                      UseXmmLoadAndClearUpper ? "movsd " : "movlpd",
                      Matcher::regName[dst_first],
@@ -1196,11 +1085,6 @@
                      offset);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[dst_first] >= 8)
-           ? 6
-           : (5 + ((UseAVX>0)?1:0))); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1210,18 +1094,14 @@
           MacroAssembler _masm(cbuf);
           __ movflt( as_XMMRegister(Matcher::_regEncode[dst_first]), Address(rsp, offset));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movss   %s, [rsp + #%d]\t# spill",
                      Matcher::regName[dst_first],
                      offset);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[dst_first] >= 8)
-           ? 6
-           : (5 + ((UseAVX>0)?1:0))); // REX
       }
+      return 0;
     }
   } else if (src_first_rc == rc_int) {
     // gpr ->
@@ -1232,24 +1112,15 @@
         // 64-bit
         int offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          if (Matcher::_regEncode[src_first] < 8) {
-            emit_opcode(*cbuf, Assembler::REX_W);
-          } else {
-            emit_opcode(*cbuf, Assembler::REX_WR);
-          }
-          emit_opcode(*cbuf, 0x89);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[src_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movq(Address(rsp, offset), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movq    [rsp + #%d], %s\t# spill",
                      offset,
                      Matcher::regName[src_first]);
 #endif
         }
-        return ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) + 4; // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1256,27 +1127,17 @@
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         int offset = ra_->reg2offset(dst_first);
         if (cbuf) {
-          if (Matcher::_regEncode[src_first] >= 8) {
-            emit_opcode(*cbuf, Assembler::REX_R);
-          }
-          emit_opcode(*cbuf, 0x89);
-          encode_RegMem(*cbuf,
-                        Matcher::_regEncode[src_first],
-                        RSP_enc, 0x4, 0, offset,
-                        false);
+          MacroAssembler _masm(cbuf);
+          __ movl(Address(rsp, offset), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movl    [rsp + #%d], %s\t# spill",
                      offset,
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[src_first] < 8)
-           ? 3
-           : 4); // REX
       }
+      return 0;
     } else if (dst_first_rc == rc_int) {
       // gpr -> gpr
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1283,62 +1144,33 @@
           (dst_first & 1) == 0 && dst_first + 1 == dst_second) {
         // 64-bit
         if (cbuf) {
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_W);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_WB);
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_WR);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_WRB);
-            }
-          }
-          emit_opcode(*cbuf, 0x8B);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[dst_first] & 7,
-                  Matcher::_regEncode[src_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movq(as_Register(Matcher::_regEncode[dst_first]),
+                  as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movq    %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return 3; // REX
+        return 0;
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
         assert(!((dst_first & 1) == 0 && dst_first + 1 == dst_second), "no transform");
         if (cbuf) {
-          if (Matcher::_regEncode[dst_first] < 8) {
-            if (Matcher::_regEncode[src_first] >= 8) {
-              emit_opcode(*cbuf, Assembler::REX_B);
-            }
-          } else {
-            if (Matcher::_regEncode[src_first] < 8) {
-              emit_opcode(*cbuf, Assembler::REX_R);
-            } else {
-              emit_opcode(*cbuf, Assembler::REX_RB);
-            }
-          }
-          emit_opcode(*cbuf, 0x8B);
-          emit_rm(*cbuf, 0x3,
-                  Matcher::_regEncode[dst_first] & 7,
-                  Matcher::_regEncode[src_first] & 7);
+          MacroAssembler _masm(cbuf);
+          __ movl(as_Register(Matcher::_regEncode[dst_first]),
+                  as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movl    %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] < 8 && Matcher::_regEncode[dst_first] < 8)
-          ? 2
-          : 3; // REX
+        return 0;
       }
     } else if (dst_first_rc == rc_float) {
       // gpr -> xmm
@@ -1349,13 +1181,12 @@
           MacroAssembler _masm(cbuf);
           __ movdq( as_XMMRegister(Matcher::_regEncode[dst_first]), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movdq   %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return 5; // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1364,17 +1195,14 @@
           MacroAssembler _masm(cbuf);
           __ movdl( as_XMMRegister(Matcher::_regEncode[dst_first]), as_Register(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movdl   %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
-          ? 5
-          : (4 + ((UseAVX>0)?1:0)); // REX
       }
+      return 0;
     }
   } else if (src_first_rc == rc_float) {
     // xmm ->
@@ -1388,17 +1216,12 @@
           MacroAssembler _masm(cbuf);
           __ movdbl( Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movsd   [rsp + #%d], %s\t# spill",
                      offset,
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[src_first] >= 8)
-           ? 6
-           : (5 + ((UseAVX>0)?1:0))); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1408,18 +1231,14 @@
           MacroAssembler _masm(cbuf);
           __ movflt(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movss   [rsp + #%d], %s\t# spill",
                      offset,
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          ((offset == 0) ? 0 : (offset < 0x80 ? 1 : 4)) +
-          ((Matcher::_regEncode[src_first] >=8)
-           ? 6
-           : (5 + ((UseAVX>0)?1:0))); // REX
       }
+      return 0;
     } else if (dst_first_rc == rc_int) {
       // xmm -> gpr
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1429,13 +1248,12 @@
           MacroAssembler _masm(cbuf);
           __ movdq( as_Register(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movdq   %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return 5; // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1444,17 +1262,14 @@
           MacroAssembler _masm(cbuf);
           __ movdl( as_Register(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("movdl   %s, %s\t# spill",
                      Matcher::regName[dst_first],
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
-          ? 5
-          : (4 + ((UseAVX>0)?1:0)); // REX
       }
+      return 0;
     } else if (dst_first_rc == rc_float) {
       // xmm -> xmm
       if ((src_first & 1) == 0 && src_first + 1 == src_second &&
@@ -1464,7 +1279,7 @@
           MacroAssembler _masm(cbuf);
           __ movdbl( as_XMMRegister(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("%s  %s, %s\t# spill",
                      UseXmmRegToRegMoveAll ? "movapd" : "movsd ",
                      Matcher::regName[dst_first],
@@ -1471,10 +1286,6 @@
                      Matcher::regName[src_first]);
 #endif
         }
-        return
-          (Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
-          ? 5
-          : (4 + ((UseAVX>0)?1:0)); // REX
       } else {
         // 32-bit
         assert(!((src_first & 1) == 0 && src_first + 1 == src_second), "no transform");
@@ -1483,7 +1294,7 @@
           MacroAssembler _masm(cbuf);
           __ movflt( as_XMMRegister(Matcher::_regEncode[dst_first]), as_XMMRegister(Matcher::_regEncode[src_first]));
 #ifndef PRODUCT
-        } else if (!do_size) {
+        } else {
           st->print("%s  %s, %s\t# spill",
                      UseXmmRegToRegMoveAll ? "movaps" : "movss ",
                      Matcher::regName[dst_first],
@@ -1490,35 +1301,28 @@
                      Matcher::regName[src_first]);
 #endif
         }
-        return ((UseAVX>0) ? 5:
-          ((Matcher::_regEncode[src_first] >= 8 || Matcher::_regEncode[dst_first] >= 8)
-           ? (UseXmmRegToRegMoveAll ? 4 : 5)
-           : (UseXmmRegToRegMoveAll ? 3 : 4))); // REX
       }
+      return 0;
     }
   }
 
   assert(0," foo ");
   Unimplemented();
-
   return 0;
 }
 
 #ifndef PRODUCT
-void MachSpillCopyNode::format(PhaseRegAlloc *ra_, outputStream* st) const
-{
+void MachSpillCopyNode::format(PhaseRegAlloc *ra_, outputStream* st) const {
   implementation(NULL, ra_, false, st);
 }
 #endif
 
-void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const
-{
+void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   implementation(&cbuf, ra_, false, NULL);
 }
 
-uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const
-{
-  return implementation(NULL, ra_, true, NULL);
+uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
+  return MachNode::size(ra_);
 }
 
 //=============================================================================
@@ -1735,16 +1539,6 @@
   return true;
 }
 
-// Vector width in bytes
-const uint Matcher::vector_width_in_bytes(void) {
-  return 8;
-}
-
-// Vector ideal reg
-const uint Matcher::vector_ideal_reg(void) {
-  return Op_RegD;
-}
-
 // Is this branch offset short enough that a short branch can be used?
 //
 // NOTE: If the platform does not provide any short branch variants, then
@@ -1831,21 +1625,21 @@
 bool Matcher::can_be_java_arg(int reg)
 {
   return
-    reg ==  RDI_num || reg ==  RDI_H_num ||
-    reg ==  RSI_num || reg ==  RSI_H_num ||
-    reg ==  RDX_num || reg ==  RDX_H_num ||
-    reg ==  RCX_num || reg ==  RCX_H_num ||
-    reg ==   R8_num || reg ==   R8_H_num ||
-    reg ==   R9_num || reg ==   R9_H_num ||
-    reg ==  R12_num || reg ==  R12_H_num ||
-    reg == XMM0_num || reg == XMM0_H_num ||
-    reg == XMM1_num || reg == XMM1_H_num ||
-    reg == XMM2_num || reg == XMM2_H_num ||
-    reg == XMM3_num || reg == XMM3_H_num ||
-    reg == XMM4_num || reg == XMM4_H_num ||
-    reg == XMM5_num || reg == XMM5_H_num ||
-    reg == XMM6_num || reg == XMM6_H_num ||
-    reg == XMM7_num || reg == XMM7_H_num;
+    reg ==  RDI_num || reg == RDI_H_num ||
+    reg ==  RSI_num || reg == RSI_H_num ||
+    reg ==  RDX_num || reg == RDX_H_num ||
+    reg ==  RCX_num || reg == RCX_H_num ||
+    reg ==   R8_num || reg ==  R8_H_num ||
+    reg ==   R9_num || reg ==  R9_H_num ||
+    reg ==  R12_num || reg == R12_H_num ||
+    reg == XMM0_num || reg == XMM0b_num ||
+    reg == XMM1_num || reg == XMM1b_num ||
+    reg == XMM2_num || reg == XMM2b_num ||
+    reg == XMM3_num || reg == XMM3b_num ||
+    reg == XMM4_num || reg == XMM4b_num ||
+    reg == XMM5_num || reg == XMM5b_num ||
+    reg == XMM6_num || reg == XMM6b_num ||
+    reg == XMM7_num || reg == XMM7b_num;
 }
 
 bool Matcher::is_spillable_arg(int reg)
@@ -3220,10 +3014,11 @@
       OptoReg::Bad, // Op_RegI
       RAX_H_num,    // Op_RegP
       OptoReg::Bad, // Op_RegF
-      XMM0_H_num,   // Op_RegD
+      XMM0b_num,    // Op_RegD
       RAX_H_num     // Op_RegL
     };
-    assert(ARRAY_SIZE(hi) == _last_machine_leaf - 1, "missing type");
+    // Excluded flags and vector registers.
+    assert(ARRAY_SIZE(hi) == _last_machine_leaf - 5, "missing type");
     return OptoRegPair(hi[ideal_reg], lo[ideal_reg]);
   %}
 %}
@@ -3985,7 +3780,6 @@
   interface(REG_INTER);
 %}
 
-
 //----------Memory Operands----------------------------------------------------
 // Direct Memory Operand
 // operand direct(immP addr)
@@ -5416,61 +5210,6 @@
   ins_pipe(pipe_slow); // XXX
 %}
 
-// Load Aligned Packed Byte to XMM register
-instruct loadA8B(regD dst, memory mem) %{
-  match(Set dst (Load8B mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed8B" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Short to XMM register
-instruct loadA4S(regD dst, memory mem) %{
-  match(Set dst (Load4S mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed4S" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Char to XMM register
-instruct loadA4C(regD dst, memory mem) %{
-  match(Set dst (Load4C mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed4C" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Integer to XMM register
-instruct load2IU(regD dst, memory mem) %{
-  match(Set dst (Load2I mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed2I" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Load Aligned Packed Single to XMM
-instruct loadA2F(regD dst, memory mem) %{
-  match(Set dst (Load2F mem));
-  ins_cost(125);
-  format %{ "MOVQ  $dst,$mem\t! packed2F" %}
-  ins_encode %{
-    __ movq($dst$$XMMRegister, $mem$$Address);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Load Effective Address
 instruct leaP8(rRegP dst, indOffset8 mem)
 %{
@@ -6200,39 +5939,6 @@
   ins_pipe(ialu_mem_imm);
 %}
 
-// Store Aligned Packed Byte XMM register to memory
-instruct storeA8B(memory mem, regD src) %{
-  match(Set mem (Store8B mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed8B" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Store Aligned Packed Char/Short XMM register to memory
-instruct storeA4C(memory mem, regD src) %{
-  match(Set mem (Store4C mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed4C" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Store Aligned Packed Integer XMM register to memory
-instruct storeA2I(memory mem, regD src) %{
-  match(Set mem (Store2I mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed2I" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Store CMS card-mark Immediate
 instruct storeImmCM0_reg(memory mem, immI0 zero)
 %{
@@ -6258,17 +5964,6 @@
   ins_pipe(ialu_mem_imm);
 %}
 
-// Store Aligned Packed Single Float XMM register to memory
-instruct storeA2F(memory mem, regD src) %{
-  match(Set mem (Store2F mem src));
-  ins_cost(145);
-  format %{ "MOVQ  $mem,$src\t! packed2F" %}
-  ins_encode %{
-    __ movq($mem$$Address, $src$$XMMRegister);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // Store Float
 instruct storeF(memory mem, regF src)
 %{
@@ -10384,172 +10079,6 @@
   ins_pipe( pipe_slow );
 %}
 
-// Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_reg(regD dst, regD src) %{
-  match(Set dst (Replicate8B src));
-  format %{ "MOVDQA  $dst,$src\n\t"
-            "PUNPCKLBW $dst,$dst\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode %{
-    if ($dst$$reg != $src$$reg) {
-      __ movdqa($dst$$XMMRegister, $src$$XMMRegister);
-    }
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate scalar to packed byte (1 byte) values in xmm
-instruct Repl8B_rRegI(regD dst, rRegI src) %{
-  match(Set dst (Replicate8B src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PUNPCKLBW $dst,$dst\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate8B" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-// Replicate scalar zero to packed byte (1 byte) values in xmm
-instruct Repl8B_immI0(regD dst, immI0 zero) %{
-  match(Set dst (Replicate8B zero));
-  format %{ "PXOR  $dst,$dst\t! replicate8B" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_reg(regD dst, regD src) %{
-  match(Set dst (Replicate4S src));
-  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed shore (2 byte) values in xmm
-instruct Repl4S_rRegI(regD dst, rRegI src) %{
-  match(Set dst (Replicate4S src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate4S" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed short (2 byte) values in xmm
-instruct Repl4S_immI0(regD dst, immI0 zero) %{
-  match(Set dst (Replicate4S zero));
-  format %{ "PXOR  $dst,$dst\t! replicate4S" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_reg(regD dst, regD src) %{
-  match(Set dst (Replicate4C src));
-  format %{ "PSHUFLW $dst,$src,0x00\t! replicate4C" %}
-  ins_encode %{
-    __ pshuflw($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed char (2 byte) values in xmm
-instruct Repl4C_rRegI(regD dst, rRegI src) %{
-  match(Set dst (Replicate4C src));
-  format %{ "MOVD    $dst,$src\n\t"
-            "PSHUFLW $dst,$dst,0x00\t! replicate4C" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed char (2 byte) values in xmm
-instruct Repl4C_immI0(regD dst, immI0 zero) %{
-  match(Set dst (Replicate4C zero));
-  format %{ "PXOR  $dst,$dst\t! replicate4C" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_reg(regD dst, regD src) %{
-  match(Set dst (Replicate2I src));
-  format %{ "PSHUFD $dst,$src,0x00\t! replicate2I" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed integer (4 byte) values in xmm
-instruct Repl2I_rRegI(regD dst, rRegI src) %{
-  match(Set dst (Replicate2I src));
-  format %{ "MOVD   $dst,$src\n\t"
-            "PSHUFD $dst,$dst,0x00\t! replicate2I" %}
-  ins_encode %{
-    __ movdl($dst$$XMMRegister, $src$$Register);
-    __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar zero to packed integer (2 byte) values in xmm
-instruct Repl2I_immI0(regD dst, immI0 zero) %{
-  match(Set dst (Replicate2I zero));
-  format %{ "PXOR  $dst,$dst\t! replicate2I" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_reg(regD dst, regD src) %{
-  match(Set dst (Replicate2F src));
-  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_regF(regD dst, regF src) %{
-  match(Set dst (Replicate2F src));
-  format %{ "PSHUFD $dst,$src,0xe0\t! replicate2F" %}
-  ins_encode %{
-    __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0xe0);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
-// Replicate scalar to packed single precision floating point values in xmm
-instruct Repl2F_immF0(regD dst, immF0 zero) %{
-  match(Set dst (Replicate2F zero));
-  format %{ "PXOR  $dst,$dst\t! replicate2F" %}
-  ins_encode %{
-    __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
-  %}
-  ins_pipe( fpu_reg_reg );
-%}
-
 
 // =======================================================================
 // fast clearing of an array
--- old/src/share/vm/adlc/adlparse.cpp	Sat Jun  2 20:04:05 2012
+++ new/src/share/vm/adlc/adlparse.cpp	Sat Jun  2 20:04:05 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -115,6 +115,12 @@
       parse_err(SYNERR, "expected one of - instruct, operand, ins_attrib, op_attrib, source, register, pipeline, encode\n     Found %s",ident);
     }
   }
+  // Add reg_class spill_regs after parsing.
+  // parse_err() reports the problem and returns to its caller, so regBlock
+  // is only dereferenced when a 'register' block was actually declared.
+  RegisterForm *regBlock = _AD.get_registers();
+  if (regBlock != NULL) regBlock->addSpillRegClass();
+  else parse_err(SEMERR, "Did not declare 'register' definitions");
 
   // Done with parsing, check consistency.
 
@@ -768,12 +774,13 @@
 
 //------------------------------reg_parse--------------------------------------
 void ADLParser::reg_parse(void) {
+  RegisterForm *regBlock = _AD.get_registers(); // Information about registers encoding
+  if (regBlock == NULL) {
+    // Create the RegisterForm for the architecture description.
+    regBlock = new RegisterForm();    // Build new Source object
+    _AD.addForm(regBlock);
+  }
 
-  // Create the RegisterForm for the architecture description.
-  RegisterForm *regBlock = new RegisterForm();    // Build new Source object
-  regBlock->_linenum = linenum();
-  _AD.addForm(regBlock);
-
   skipws();                       // Skip leading whitespace
   if (_curchar == '%' && *(_ptr+1) == '{') {
     next_char(); next_char();     // Skip "%{"
@@ -796,15 +803,11 @@
     parse_err(SYNERR, "Missing %c{ ... %c} block after register keyword.\n",'%','%');
     return;
   }
-
-  // Add reg_class spill_regs
-  regBlock->addSpillRegClass();
 }
 
 //------------------------------encode_parse-----------------------------------
 void ADLParser::encode_parse(void) {
   EncodeForm *encBlock;         // Information about instruction/operand encoding
-  char       *desc = NULL;      // String representation of encode rule
 
   _AD.getForm(&encBlock);
   if ( encBlock == NULL) {
--- old/src/share/vm/adlc/archDesc.cpp	Sat Jun  2 20:04:06 2012
+++ new/src/share/vm/adlc/archDesc.cpp	Sat Jun  2 20:04:05 2012
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
 // This code is free software; you can redistribute it and/or modify it
@@ -911,12 +911,24 @@
   // Find last character in idealOp, it specifies the type
   char  last_char = 0;
   const char *ptr = idealOp;
-  for( ; *ptr != '\0'; ++ptr) {
+  for (; *ptr != '\0'; ++ptr) {
     last_char = *ptr;
   }
 
+  // Match Vector types.
+  if (strncmp(idealOp, "Vec",3)==0) {
+    switch(last_char) {
+    case 'S':  return "TypeVect::VECTS";
+    case 'D':  return "TypeVect::VECTD";
+    case 'X':  return "TypeVect::VECTX";
+    case 'Y':  return "TypeVect::VECTY";
+    default:
+      internal_err("Vector type %s with unrecognized type\n",idealOp);
+    }
+  }
+
   // !!!!!
-  switch( last_char ) {
+  switch(last_char) {
   case 'I':    return "TypeInt::INT";
   case 'P':    return "TypePtr::BOTTOM";
   case 'N':    return "TypeNarrowOop::BOTTOM";
--- old/src/share/vm/adlc/forms.cpp	Sat Jun  2 20:04:06 2012
+++ new/src/share/vm/adlc/forms.cpp	Sat Jun  2 20:04:06 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -266,21 +266,7 @@
   if( strcmp(opType,"LoadN")==0 )  return Form::idealN;
   if( strcmp(opType,"LoadRange")==0 )  return Form::idealI;
   if( strcmp(opType,"LoadS")==0 )  return Form::idealS;
-  if( strcmp(opType,"Load16B")==0 )  return Form::idealB;
-  if( strcmp(opType,"Load8B")==0 )  return Form::idealB;
-  if( strcmp(opType,"Load4B")==0 )  return Form::idealB;
-  if( strcmp(opType,"Load8C")==0 )  return Form::idealC;
-  if( strcmp(opType,"Load4C")==0 )  return Form::idealC;
-  if( strcmp(opType,"Load2C")==0 )  return Form::idealC;
-  if( strcmp(opType,"Load8S")==0 )  return Form::idealS;
-  if( strcmp(opType,"Load4S")==0 )  return Form::idealS;
-  if( strcmp(opType,"Load2S")==0 )  return Form::idealS;
-  if( strcmp(opType,"Load2D")==0 )  return Form::idealD;
-  if( strcmp(opType,"Load4F")==0 )  return Form::idealF;
-  if( strcmp(opType,"Load2F")==0 )  return Form::idealF;
-  if( strcmp(opType,"Load4I")==0 )  return Form::idealI;
-  if( strcmp(opType,"Load2I")==0 )  return Form::idealI;
-  if( strcmp(opType,"Load2L")==0 )  return Form::idealL;
+  if( strcmp(opType,"LoadVector")==0 )  return Form::idealV;
   assert( strcmp(opType,"Load") != 0, "Must type Loads" );
   return Form::none;
 }
@@ -287,7 +273,7 @@
 
 Form::DataType Form::is_store_to_memory(const char *opType) const {
   if( strcmp(opType,"StoreB")==0)  return Form::idealB;
-  if( strcmp(opType,"StoreCM")==0)  return Form::idealB;
+  if( strcmp(opType,"StoreCM")==0) return Form::idealB;
   if( strcmp(opType,"StoreC")==0)  return Form::idealC;
   if( strcmp(opType,"StoreD")==0)  return Form::idealD;
   if( strcmp(opType,"StoreF")==0)  return Form::idealF;
@@ -294,19 +280,8 @@
   if( strcmp(opType,"StoreI")==0)  return Form::idealI;
   if( strcmp(opType,"StoreL")==0)  return Form::idealL;
   if( strcmp(opType,"StoreP")==0)  return Form::idealP;
-  if( strcmp(opType,"StoreN")==0) return Form::idealN;
-  if( strcmp(opType,"Store16B")==0)  return Form::idealB;
-  if( strcmp(opType,"Store8B")==0)  return Form::idealB;
-  if( strcmp(opType,"Store4B")==0)  return Form::idealB;
-  if( strcmp(opType,"Store8C")==0)  return Form::idealC;
-  if( strcmp(opType,"Store4C")==0)  return Form::idealC;
-  if( strcmp(opType,"Store2C")==0)  return Form::idealC;
-  if( strcmp(opType,"Store2D")==0)  return Form::idealD;
-  if( strcmp(opType,"Store4F")==0)  return Form::idealF;
-  if( strcmp(opType,"Store2F")==0)  return Form::idealF;
-  if( strcmp(opType,"Store4I")==0)  return Form::idealI;
-  if( strcmp(opType,"Store2I")==0)  return Form::idealI;
-  if( strcmp(opType,"Store2L")==0)  return Form::idealL;
+  if( strcmp(opType,"StoreN")==0)  return Form::idealN;
+  if( strcmp(opType,"StoreVector")==0 )  return Form::idealV;
   assert( strcmp(opType,"Store") != 0, "Must type Stores" );
   return Form::none;
 }
--- old/src/share/vm/adlc/forms.hpp	Sat Jun  2 20:04:06 2012
+++ new/src/share/vm/adlc/forms.hpp	Sat Jun  2 20:04:06 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -172,7 +172,8 @@
     idealB      =  6,  // Byte    type
     idealC      =  7,  // Char    type
     idealS      =  8,  // String  type
-    idealN      =  9   // Narrow oop types
+    idealN      =  9,  // Narrow oop types
+    idealV      = 10   // Vector  type
   };
   // Convert ideal name to a DataType, return DataType::none if not a 'ConX'
   Form::DataType  ideal_to_const_type(const char *ideal_type_name) const;
--- old/src/share/vm/adlc/formsopt.cpp	Sat Jun  2 20:04:07 2012
+++ new/src/share/vm/adlc/formsopt.cpp	Sat Jun  2 20:04:07 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -66,7 +66,7 @@
 // for spill-slots/regs.
 void RegisterForm::addSpillRegClass() {
-  // Stack slots start at the next available even register number.
+  // Stack slots start at the next register number that is a multiple of 8.
-  _reg_ctr = (_reg_ctr+1) & ~1;
+  _reg_ctr = (_reg_ctr+7) & ~7;
   const char *rc_name   = "stack_slots";
   RegClass   *reg_class = new RegClass(rc_name);
   reg_class->_stack_or_reg = true;
@@ -150,9 +150,14 @@
 int RegisterForm::RegMask_Size() {
   // Need at least this many words
   int words_for_regs = (_reg_ctr + 31)>>5;
-  // Add a few for incoming & outgoing arguments to calls.
+  // The array of Register Mask bits should be large enough to cover
+  // all the machine registers and all parameters that need to be passed
+  // on the stack (stack registers) up to some interesting limit.  Methods
+  // that need more parameters will NOT be compiled.  On Intel, the limit
+  // is something like 90+ parameters.
+  // Add a few (3 words == 96 bits) for incoming & outgoing arguments to calls.
   // Round up to the next doubleword size.
-  return (words_for_regs + 2 + 1) & ~1;
+  return (words_for_regs + 3 + 1) & ~1;
 }
 
 void RegisterForm::dump() {                  // Debug printer
--- old/src/share/vm/adlc/formssel.cpp	Sat Jun  2 20:04:07 2012
+++ new/src/share/vm/adlc/formssel.cpp	Sat Jun  2 20:04:07 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -432,6 +432,14 @@
   return  _matrule->is_ideal_store();
 }
 
+// An instruction is a vector instruction when it has a match rule and
+// that rule reports itself as a vector rule; no match rule means 'false'.
+bool InstructForm::is_vector() const {
+  const bool has_rule = (_matrule != NULL);
+  return has_rule && _matrule->is_vector();
+}
+
+
 // Return the input register that must match the output register
 // If this is not required, return 0
 uint InstructForm::two_address(FormDict &globals) {
@@ -751,6 +759,9 @@
 
   if (needs_base_oop_edge(globals)) return true;
 
+  if (is_vector()) return true;
+  if (is_mach_constant()) return true;
+
   return  false;
 }
 
@@ -3381,11 +3392,8 @@
     "StoreI","StoreL","StoreP","StoreN","StoreD","StoreF" ,
     "StoreB","StoreC","Store" ,"StoreFP",
     "LoadI", "LoadUI2L", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF"  ,
-    "LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load"   ,
-    "Store4I","Store2I","Store2L","Store2D","Store4F","Store2F","Store16B",
-    "Store8B","Store4B","Store8C","Store4C","Store2C",
-    "Load4I" ,"Load2I" ,"Load2L" ,"Load2D" ,"Load4F" ,"Load2F" ,"Load16B" ,
-    "Load8B" ,"Load4B" ,"Load8C" ,"Load4C" ,"Load2C" ,"Load8S", "Load4S","Load2S",
+    "LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" ,
+    "StoreVector", "LoadVector",
     "LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned",
     "LoadPLocked", "LoadLLocked",
     "StorePConditional", "StoreIConditional", "StoreLConditional",
@@ -3822,6 +3830,10 @@
          strcmp(opType,"RegL")==0 ||
          strcmp(opType,"RegF")==0 ||
          strcmp(opType,"RegD")==0 ||
+         strcmp(opType,"VecS")==0 ||
+         strcmp(opType,"VecD")==0 ||
+         strcmp(opType,"VecX")==0 ||
+         strcmp(opType,"VecY")==0 ||
          strcmp(opType,"Reg" )==0) ) {
       return 1;
     }
@@ -3938,19 +3950,13 @@
         strcmp(opType,"ReverseBytesL")==0 ||
         strcmp(opType,"ReverseBytesUS")==0 ||
         strcmp(opType,"ReverseBytesS")==0 ||
-        strcmp(opType,"Replicate16B")==0 ||
-        strcmp(opType,"Replicate8B")==0 ||
-        strcmp(opType,"Replicate4B")==0 ||
-        strcmp(opType,"Replicate8C")==0 ||
-        strcmp(opType,"Replicate4C")==0 ||
-        strcmp(opType,"Replicate8S")==0 ||
-        strcmp(opType,"Replicate4S")==0 ||
-        strcmp(opType,"Replicate4I")==0 ||
-        strcmp(opType,"Replicate2I")==0 ||
-        strcmp(opType,"Replicate2L")==0 ||
-        strcmp(opType,"Replicate4F")==0 ||
-        strcmp(opType,"Replicate2F")==0 ||
-        strcmp(opType,"Replicate2D")==0 ||
+        strcmp(opType,"ReplicateB")==0 ||
+        strcmp(opType,"ReplicateC")==0 ||
+        strcmp(opType,"ReplicateS")==0 ||
+        strcmp(opType,"ReplicateI")==0 ||
+        strcmp(opType,"ReplicateL")==0 ||
+        strcmp(opType,"ReplicateF")==0 ||
+        strcmp(opType,"ReplicateD")==0 ||
         0 /* 0 to line up columns nicely */ )
       return 1;
   }
@@ -4034,6 +4040,23 @@
   return ideal_load;
 }
 
+// Return 'true' if this rule's right child is one of the ideal vector
+// operations: a ReplicateX of any element type (B, C, S, I, L, F, D),
+// LoadVector or StoreVector.  A rule without a right child is not a vector.
+bool MatchRule::is_vector() const {
+  if (_rChild == NULL) return false;
+  const char *opType = _rChild->_opType;
+  return strcmp(opType,"ReplicateB")==0 ||
+         strcmp(opType,"ReplicateC")==0 ||
+         strcmp(opType,"ReplicateS")==0 ||
+         strcmp(opType,"ReplicateI")==0 ||
+         strcmp(opType,"ReplicateL")==0 ||
+         strcmp(opType,"ReplicateF")==0 ||
+         strcmp(opType,"ReplicateD")==0 ||
+         strcmp(opType,"LoadVector")==0 ||
+         strcmp(opType,"StoreVector")==0;
+}
+
 
 bool MatchRule::skip_antidep_check() const {
   // Some loads operate on what is effectively immutable memory so we
--- old/src/share/vm/adlc/formssel.hpp	Sat Jun  2 20:04:08 2012
+++ new/src/share/vm/adlc/formssel.hpp	Sat Jun  2 20:04:08 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -160,6 +160,7 @@
   virtual bool        is_ideal_safepoint() const; // node matches 'SafePoint'
   virtual bool        is_ideal_nop() const;     // node matches 'Nop'
   virtual bool        is_ideal_control() const; // control node
+  virtual bool        is_vector() const;        // vector instruction
 
   virtual Form::CallType is_ideal_call() const; // matches ideal 'Call'
   virtual Form::DataType is_ideal_load() const; // node matches ideal 'LoadXNode'
@@ -1011,6 +1012,7 @@
   bool       is_ideal_goto() const;    // node matches ideal 'Goto'
   bool       is_ideal_loopEnd() const; // node matches ideal 'LoopEnd'
   bool       is_ideal_bool() const;    // node matches ideal 'Bool'
+  bool       is_vector() const;        // vector instruction
   Form::DataType is_ideal_load() const;// node matches ideal 'LoadXNode'
   // Should antidep checks be disabled for this rule
   // See definition of MatchRule::skip_antidep_check
--- old/src/share/vm/adlc/main.cpp	Sat Jun  2 20:04:08 2012
+++ new/src/share/vm/adlc/main.cpp	Sat Jun  2 20:04:08 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -250,6 +250,7 @@
   AD.addInclude(AD._HPP_file, "opto/node.hpp");
   AD.addInclude(AD._HPP_file, "opto/regalloc.hpp");
   AD.addInclude(AD._HPP_file, "opto/subnode.hpp");
+  AD.addInclude(AD._HPP_file, "opto/vectornode.hpp");
   AD.addInclude(AD._CPP_CLONE_file, "precompiled.hpp");
   AD.addInclude(AD._CPP_CLONE_file, "adfiles", get_basename(AD._HPP_file._name));
   AD.addInclude(AD._CPP_EXPAND_file, "precompiled.hpp");
--- old/src/share/vm/code/vmreg.cpp	Sat Jun  2 20:04:09 2012
+++ new/src/share/vm/code/vmreg.cpp	Sat Jun  2 20:04:09 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -27,7 +27,7 @@
 #include "code/vmreg.hpp"
 
 // First VMReg value that could refer to a stack slot
-VMReg VMRegImpl::stack0 = (VMReg)(intptr_t)((ConcreteRegisterImpl::number_of_registers + 1) & ~1);
+VMReg VMRegImpl::stack0 = (VMReg)(intptr_t)((ConcreteRegisterImpl::number_of_registers + 7) & ~7);
 
 // VMRegs are 4 bytes wide on all platforms
 const int VMRegImpl::stack_slot_size = 4;
--- old/src/share/vm/opto/c2_globals.hpp	Sat Jun  2 20:04:09 2012
+++ new/src/share/vm/opto/c2_globals.hpp	Sat Jun  2 20:04:09 2012
@@ -81,6 +81,13 @@
   product(intx, MaxLoopPad, (OptoLoopAlignment-1),                          \
           "Align a loop if padding size in bytes is less or equal to this value") \
                                                                             \
+  product(intx, MaxVectorSize, 32,                                          \
+          "Max vector size in bytes, "                                      \
+          "actual size could be less depending on elements type")           \
+                                                                            \
+  product(bool, AlignVector, false,                                         \
+          "Perform vector store/load alignment in loop")                    \
+                                                                            \
   product(intx, NumberOfLoopInstrToAlign, 4,                                \
           "Number of first instructions in a loop to align")                \
                                                                             \
--- old/src/share/vm/opto/chaitin.cpp	Sat Jun  2 20:04:10 2012
+++ new/src/share/vm/opto/chaitin.cpp	Sat Jun  2 20:04:10 2012
@@ -75,6 +75,7 @@
   // Flags
   if( _is_oop ) tty->print("Oop ");
   if( _is_float ) tty->print("Float ");
+  if( _is_vector ) tty->print("Vector ");
   if( _was_spilled1 ) tty->print("Spilled ");
   if( _was_spilled2 ) tty->print("Spilled2 ");
   if( _direct_conflict ) tty->print("Direct_conflict ");
@@ -479,16 +480,18 @@
 
   // Move important info out of the live_arena to longer lasting storage.
   alloc_node_regs(_names.Size());
-  for( uint i=0; i < _names.Size(); i++ ) {
-    if( _names[i] ) {           // Live range associated with Node?
-      LRG &lrg = lrgs( _names[i] );
-      if( lrg.num_regs() == 1 ) {
-        _node_regs[i].set1( lrg.reg() );
+  for (uint i=0; i < _names.Size(); i++) {
+    if (_names[i]) {           // Live range associated with Node?
+      LRG &lrg = lrgs(_names[i]);
+      if (!lrg.alive()) {
+        _node_regs[i].set_bad();
+      } else if (lrg.num_regs() == 1) {
+        _node_regs[i].set1(lrg.reg());
       } else {                  // Must be a register-pair
-        if( !lrg._fat_proj ) {  // Must be aligned adjacent register pair
+        if (!lrg._fat_proj) {   // Must be aligned adjacent register pair
           // Live ranges record the highest register in their mask.
           // We want the low register for the AD file writer's convenience.
-          _node_regs[i].set2( OptoReg::add(lrg.reg(),-1) );
+          _node_regs[i].set2( OptoReg::add(lrg.reg(),(1-lrg.num_regs())) );
         } else {                // Misaligned; extract 2 bits
           OptoReg::Name hi = lrg.reg(); // Get hi register
           lrg.Remove(hi);       // Yank from mask
@@ -568,7 +571,7 @@
         // Check for float-vs-int live range (used in register-pressure
         // calculations)
         const Type *n_type = n->bottom_type();
-        if( n_type->is_floatingpoint() )
+        if (n_type->is_floatingpoint())
           lrg._is_float = 1;
 
         // Check for twice prior spilling.  Once prior spilling might have
@@ -599,18 +602,28 @@
         // Limit result register mask to acceptable registers
         const RegMask &rm = n->out_RegMask();
         lrg.AND( rm );
+
+        int ireg = n->ideal_reg();
+        assert( !n->bottom_type()->isa_oop_ptr() || ireg == Op_RegP,
+                "oops must be in Op_RegP's" );
+
+        // Check for vector live range (only if vector register is used).
+        // On SPARC a vector uses RegD, which could be misaligned, so it is
+        // not processed as a vector in RA.
+        if (RegMask::is_vector(ireg))
+          lrg._is_vector = 1;
+        assert(n_type->isa_vect() == NULL || lrg._is_vector || ireg == Op_RegD,
+               "vector must be in vector registers");
+
         // Check for bound register masks
         const RegMask &lrgmask = lrg.mask();
-        if( lrgmask.is_bound1() || lrgmask.is_bound2() )
+        if (lrgmask.is_bound(ireg))
           lrg._is_bound = 1;
 
         // Check for maximum frequency value
-        if( lrg._maxfreq < b->_freq )
+        if (lrg._maxfreq < b->_freq)
           lrg._maxfreq = b->_freq;
 
-        int ireg = n->ideal_reg();
-        assert( !n->bottom_type()->isa_oop_ptr() || ireg == Op_RegP,
-                "oops must be in Op_RegP's" );
         // Check for oop-iness, or long/double
         // Check for multi-kill projection
         switch( ireg ) {
@@ -689,7 +702,7 @@
           // AND changes how we count interferences.  A mis-aligned
           // double can interfere with TWO aligned pairs, or effectively
           // FOUR registers!
-          if( rm.is_misaligned_Pair() ) {
+          if (rm.is_misaligned_pair()) {
             lrg._fat_proj = 1;
             lrg._is_bound = 1;
           }
@@ -706,6 +719,33 @@
           lrg.set_reg_pressure(1);
 #endif
           break;
+        case Op_VecS:
+          assert(Matcher::vector_size_supported(T_BYTE,4), "sanity");
+          assert(RegMask::num_registers(Op_VecS) == RegMask::SlotsPerVecS, "sanity");
+          lrg.set_num_regs(RegMask::SlotsPerVecS);
+          lrg.set_reg_pressure(1);
+          break;
+        case Op_VecD:
+          assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecD), "sanity");
+          assert(RegMask::num_registers(Op_VecD) == RegMask::SlotsPerVecD, "sanity");
+          assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecD), "vector should be aligned");
+          lrg.set_num_regs(RegMask::SlotsPerVecD);
+          lrg.set_reg_pressure(1);
+          break;
+        case Op_VecX:
+          assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecX), "sanity");
+          assert(RegMask::num_registers(Op_VecX) == RegMask::SlotsPerVecX, "sanity");
+          assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecX), "vector should be aligned");
+          lrg.set_num_regs(RegMask::SlotsPerVecX);
+          lrg.set_reg_pressure(1);
+          break;
+        case Op_VecY:
+          assert(Matcher::vector_size_supported(T_FLOAT,RegMask::SlotsPerVecY), "sanity");
+          assert(RegMask::num_registers(Op_VecY) == RegMask::SlotsPerVecY, "sanity");
+          assert(lrgmask.is_aligned_sets(RegMask::SlotsPerVecY), "vector should be aligned");
+          lrg.set_num_regs(RegMask::SlotsPerVecY);
+          lrg.set_reg_pressure(1);
+          break;
         default:
           ShouldNotReachHere();
         }
@@ -763,24 +803,38 @@
         } else {
           lrg.AND( rm );
         }
+
         // Check for bound register masks
         const RegMask &lrgmask = lrg.mask();
-        if( lrgmask.is_bound1() || lrgmask.is_bound2() )
+        int kreg = n->in(k)->ideal_reg();
+        bool is_vect = RegMask::is_vector(kreg);
+        assert(n->in(k)->bottom_type()->isa_vect() == NULL ||
+               is_vect || kreg == Op_RegD,
+               "vector must be in vector registers");
+        if (lrgmask.is_bound(kreg))
           lrg._is_bound = 1;
+
         // If this use of a double forces a mis-aligned double,
         // flag as '_fat_proj' - really flag as allowing misalignment
         // AND changes how we count interferences.  A mis-aligned
         // double can interfere with TWO aligned pairs, or effectively
         // FOUR registers!
-        if( lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_Pair() ) {
+#ifdef ASSERT
+        if (is_vect) {
+          assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
+          assert(!lrg._fat_proj, "sanity");
+          assert(RegMask::num_registers(kreg) == lrg.num_regs(), "sanity");
+        }
+#endif
+        if (!is_vect && lrg.num_regs() == 2 && !lrg._fat_proj && rm.is_misaligned_pair()) {
           lrg._fat_proj = 1;
           lrg._is_bound = 1;
         }
         // if the LRG is an unaligned pair, we will have to spill
         // so clear the LRG's register mask if it is not already spilled
-        if ( !n->is_SpillCopy() &&
-               (lrg._def == NULL || lrg.is_multidef() || !lrg._def->is_SpillCopy()) &&
-               lrgmask.is_misaligned_Pair()) {
+        if (!is_vect && !n->is_SpillCopy() &&
+            (lrg._def == NULL || lrg.is_multidef() || !lrg._def->is_SpillCopy()) &&
+            lrgmask.is_misaligned_pair()) {
           lrg.Clear();
         }
 
@@ -793,12 +847,14 @@
   } // end for all blocks
 
   // Final per-liverange setup
-  for( uint i2=0; i2<_maxlrg; i2++ ) {
+  for (uint i2=0; i2<_maxlrg; i2++) {
     LRG &lrg = lrgs(i2);
-    if( lrg.num_regs() == 2 && !lrg._fat_proj )
-      lrg.ClearToPairs();
+    assert(!lrg._is_vector || !lrg._fat_proj, "sanity");
+    if (lrg.num_regs() > 1 && !lrg._fat_proj) {
+      lrg.clear_to_sets();
+    }
     lrg.compute_set_mask_size();
-    if( lrg.not_free() ) {      // Handle case where we lose from the start
+    if (lrg.not_free()) {      // Handle case where we lose from the start
       lrg.set_reg(OptoReg::Name(LRG::SPILL_REG));
       lrg._direct_conflict = 1;
     }
@@ -1104,22 +1160,17 @@
       // Choose a color which is legal for him
       RegMask tempmask = lrg.mask();
       tempmask.AND(lrgs(copy_lrg).mask());
-      OptoReg::Name reg;
-      if( lrg.num_regs() == 1 ) {
-        reg = tempmask.find_first_elem();
-      } else {
-        tempmask.ClearToPairs();
-        reg = tempmask.find_first_pair();
-      }
-      if( OptoReg::is_valid(reg) )
+      tempmask.clear_to_sets(lrg.num_regs());
+      OptoReg::Name reg = tempmask.find_first_set(lrg.num_regs());
+      if (OptoReg::is_valid(reg))
         return reg;
     }
   }
 
   // If no bias info exists, just go with the register selection ordering
-  if( lrg.num_regs() == 2 ) {
-    // Find an aligned pair
-    return OptoReg::add(lrg.mask().find_first_pair(),chunk);
+  if (lrg._is_vector || lrg.num_regs() == 2) {
+    // Find an aligned set
+    return OptoReg::add(lrg.mask().find_first_set(lrg.num_regs()),chunk);
   }
 
   // CNC - Fun hack.  Alternate 1st and 2nd selection.  Enables post-allocate
@@ -1149,6 +1200,7 @@
     // Use a heuristic to "bias" the color choice
     return bias_color(lrg, chunk);
 
+  assert(!lrg._is_vector, "should be not vector here" );
   assert( lrg.num_regs() >= 2, "dead live ranges do not color" );
 
   // Fat-proj case or misaligned double argument.
@@ -1238,14 +1290,16 @@
     }
     //assert(is_allstack == lrg->mask().is_AllStack(), "nbrs must not change AllStackedness");
     // Aligned pairs need aligned masks
-    if( lrg->num_regs() == 2 && !lrg->_fat_proj )
-      lrg->ClearToPairs();
+    assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
+    if (lrg->num_regs() > 1 && !lrg->_fat_proj) {
+      lrg->clear_to_sets();
+    }
 
     // Check if a color is available and if so pick the color
     OptoReg::Name reg = choose_color( *lrg, chunk );
 #ifdef SPARC
     debug_only(lrg->compute_set_mask_size());
-    assert(lrg->num_regs() != 2 || lrg->is_bound() || is_even(reg-1), "allocate all doubles aligned");
+    assert(lrg->num_regs() < 2 || lrg->is_bound() || is_even(reg-1), "allocate all doubles aligned");
 #endif
 
     //---------------
@@ -1277,17 +1331,16 @@
       // If the live range is not bound, then we actually had some choices
       // to make.  In this case, the mask has more bits in it than the colors
       // chosen.  Restrict the mask to just what was picked.
-      if( lrg->num_regs() == 1 ) { // Size 1 live range
+      int n_regs = lrg->num_regs();
+      assert(!lrg->_is_vector || !lrg->_fat_proj, "sanity");
+      if (n_regs == 1 || !lrg->_fat_proj) {
+        assert(!lrg->_is_vector || n_regs <= RegMask::SlotsPerVecY, "sanity");
         lrg->Clear();           // Clear the mask
         lrg->Insert(reg);       // Set regmask to match selected reg
-        lrg->set_mask_size(1);
-      } else if( !lrg->_fat_proj ) {
-        // For pairs, also insert the low bit of the pair
-        assert( lrg->num_regs() == 2, "unbound fatproj???" );
-        lrg->Clear();           // Clear the mask
-        lrg->Insert(reg);       // Set regmask to match selected reg
-        lrg->Insert(OptoReg::add(reg,-1));
-        lrg->set_mask_size(2);
+        // For vectors and pairs, also insert the lower registers of the set
+        for (int i = 1; i < n_regs; i++)
+          lrg->Insert(OptoReg::add(reg,-i));
+        lrg->set_mask_size(n_regs);
       } else {                  // Else fatproj
         // mask must be equal to fatproj bits, by definition
       }
@@ -1860,12 +1913,20 @@
       sprintf(buf,"L%d",lidx);  // No register binding yet
     } else if( !lidx ) {        // Special, not allocated value
       strcpy(buf,"Special");
-    } else if( (lrgs(lidx).num_regs() == 1)
-                ? !lrgs(lidx).mask().is_bound1()
-                : !lrgs(lidx).mask().is_bound2() ) {
-      sprintf(buf,"L%d",lidx); // No register binding yet
-    } else {                    // Hah!  We have a bound machine register
-      print_reg( lrgs(lidx).reg(), this, buf );
+    } else {
+      if (lrgs(lidx)._is_vector) {
+        if (lrgs(lidx).mask().is_bound_set(lrgs(lidx).num_regs()))
+          print_reg( lrgs(lidx).reg(), this, buf ); // a bound machine register
+        else
+          sprintf(buf,"L%d",lidx); // No register binding yet
+      } else if( (lrgs(lidx).num_regs() == 1)
+                 ? lrgs(lidx).mask().is_bound1()
+                 : lrgs(lidx).mask().is_bound_pair() ) {
+        // Hah!  We have a bound machine register
+        print_reg( lrgs(lidx).reg(), this, buf );
+      } else {
+        sprintf(buf,"L%d",lidx); // No register binding yet
+      }
     }
   }
   return buf+strlen(buf);
--- old/src/share/vm/opto/chaitin.hpp	Sat Jun  2 20:04:10 2012
+++ new/src/share/vm/opto/chaitin.hpp	Sat Jun  2 20:04:10 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -99,8 +99,15 @@
   void set_mask_size( int size ) {
     assert((size == 65535) || (size == (int)_mask.Size()), "");
     _mask_size = size;
-    debug_only(_msize_valid=1;)
-    debug_only( if( _num_regs == 2 && !_fat_proj ) _mask.VerifyPairs(); )
+#ifdef ASSERT
+    _msize_valid=1;
+    if (_is_vector) {
+      assert(!_fat_proj, "sanity");
+      _mask.verify_sets(_num_regs);
+    } else if (_num_regs == 2 && !_fat_proj) {
+      _mask.verify_pairs();
+    }
+#endif
   }
   void compute_set_mask_size() { set_mask_size(compute_mask_size()); }
   int mask_size() const { assert( _msize_valid, "mask size not valid" );
@@ -116,7 +123,8 @@
   void Set_All() { _mask.Set_All(); debug_only(_msize_valid=1); _mask_size = RegMask::CHUNK_SIZE; }
   void Insert( OptoReg::Name reg ) { _mask.Insert(reg);  debug_only(_msize_valid=0;) }
   void Remove( OptoReg::Name reg ) { _mask.Remove(reg);  debug_only(_msize_valid=0;) }
-  void ClearToPairs() { _mask.ClearToPairs(); debug_only(_msize_valid=0;) }
+  void clear_to_pairs() { _mask.clear_to_pairs(); debug_only(_msize_valid=0;) }
+  void clear_to_sets()  { _mask.clear_to_sets(_num_regs); debug_only(_msize_valid=0;) }
 
   // Number of registers this live range uses when it colors
 private:
@@ -150,6 +158,7 @@
 
   uint   _is_oop:1,             // Live-range holds an oop
          _is_float:1,           // True if in float registers
+         _is_vector:1,          // True if in vector registers
          _was_spilled1:1,       // True if prior spilling on def
          _was_spilled2:1,       // True if twice prior spilling on def
          _is_bound:1,           // live range starts life with no
--- old/src/share/vm/opto/classes.hpp	Sat Jun  2 20:04:11 2012
+++ new/src/share/vm/opto/classes.hpp	Sat Jun  2 20:04:11 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -267,42 +267,15 @@
 macro(LShiftVC)
 macro(LShiftVS)
 macro(LShiftVI)
-macro(URShiftVB)
-macro(URShiftVC)
-macro(URShiftVS)
-macro(URShiftVI)
+macro(RShiftVB)
+macro(RShiftVC)
+macro(RShiftVS)
+macro(RShiftVI)
 macro(AndV)
 macro(OrV)
 macro(XorV)
-macro(VectorLoad)
-macro(Load16B)
-macro(Load8B)
-macro(Load4B)
-macro(Load8C)
-macro(Load4C)
-macro(Load2C)
-macro(Load8S)
-macro(Load4S)
-macro(Load2S)
-macro(Load4I)
-macro(Load2I)
-macro(Load2L)
-macro(Load4F)
-macro(Load2F)
-macro(Load2D)
-macro(VectorStore)
-macro(Store16B)
-macro(Store8B)
-macro(Store4B)
-macro(Store8C)
-macro(Store4C)
-macro(Store2C)
-macro(Store4I)
-macro(Store2I)
-macro(Store2L)
-macro(Store4F)
-macro(Store2F)
-macro(Store2D)
+macro(LoadVector)
+macro(StoreVector)
 macro(Pack)
 macro(PackB)
 macro(PackS)
@@ -311,23 +284,15 @@
 macro(PackL)
 macro(PackF)
 macro(PackD)
-macro(Pack2x1B)
-macro(Pack2x2B)
-macro(Replicate16B)
-macro(Replicate8B)
-macro(Replicate4B)
-macro(Replicate8S)
-macro(Replicate4S)
-macro(Replicate2S)
-macro(Replicate8C)
-macro(Replicate4C)
-macro(Replicate2C)
-macro(Replicate4I)
-macro(Replicate2I)
-macro(Replicate2L)
-macro(Replicate4F)
-macro(Replicate2F)
-macro(Replicate2D)
+macro(Pack2L)
+macro(Pack2D)
+macro(ReplicateB)
+macro(ReplicateS)
+macro(ReplicateC)
+macro(ReplicateI)
+macro(ReplicateL)
+macro(ReplicateF)
+macro(ReplicateD)
 macro(Extract)
 macro(ExtractB)
 macro(ExtractS)
--- old/src/share/vm/opto/compile.cpp	Sat Jun  2 20:04:11 2012
+++ new/src/share/vm/opto/compile.cpp	Sat Jun  2 20:04:11 2012
@@ -2592,33 +2592,8 @@
     }
     break;
 
-  case Op_Load16B:
-  case Op_Load8B:
-  case Op_Load4B:
-  case Op_Load8S:
-  case Op_Load4S:
-  case Op_Load2S:
-  case Op_Load8C:
-  case Op_Load4C:
-  case Op_Load2C:
-  case Op_Load4I:
-  case Op_Load2I:
-  case Op_Load2L:
-  case Op_Load4F:
-  case Op_Load2F:
-  case Op_Load2D:
-  case Op_Store16B:
-  case Op_Store8B:
-  case Op_Store4B:
-  case Op_Store8C:
-  case Op_Store4C:
-  case Op_Store2C:
-  case Op_Store4I:
-  case Op_Store2I:
-  case Op_Store2L:
-  case Op_Store4F:
-  case Op_Store2F:
-  case Op_Store2D:
+  case Op_LoadVector:
+  case Op_StoreVector:
     break;
 
   case Op_PackB:
--- old/src/share/vm/opto/ifg.cpp	Sat Jun  2 20:04:12 2012
+++ new/src/share/vm/opto/ifg.cpp	Sat Jun  2 20:04:12 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -416,6 +416,7 @@
     if( lrgs(lidx).mask().is_UP() &&
         lrgs(lidx).mask_size() &&
         !lrgs(lidx)._is_float &&
+        !lrgs(lidx)._is_vector &&
         lrgs(lidx).mask().overlap(*Matcher::idealreg2regmask[Op_RegI]) )
       cnt += lrgs(lidx).reg_pressure();
   }
@@ -430,7 +431,7 @@
   while ((lidx = elements.next()) != 0) {
     if( lrgs(lidx).mask().is_UP() &&
         lrgs(lidx).mask_size() &&
-        lrgs(lidx)._is_float )
+        (lrgs(lidx)._is_float || lrgs(lidx)._is_vector))
       cnt += lrgs(lidx).reg_pressure();
   }
   return cnt;
@@ -439,8 +440,8 @@
 //------------------------------lower_pressure---------------------------------
 // Adjust register pressure down by 1.  Capture last hi-to-low transition,
 static void lower_pressure( LRG *lrg, uint where, Block *b, uint *pressure, uint *hrp_index ) {
-  if( lrg->mask().is_UP() && lrg->mask_size() ) {
-    if( lrg->_is_float ) {
+  if (lrg->mask().is_UP() && lrg->mask_size()) {
+    if (lrg->_is_float || lrg->_is_vector) {
       pressure[1] -= lrg->reg_pressure();
       if( pressure[1] == (uint)FLOATPRESSURE ) {
         hrp_index[1] = where;
@@ -522,8 +523,8 @@
       LRG &lrg = lrgs(lidx);
       lrg._area += cost;
       // Compute initial register pressure
-      if( lrg.mask().is_UP() && lrg.mask_size() ) {
-        if( lrg._is_float ) {   // Count float pressure
+      if (lrg.mask().is_UP() && lrg.mask_size()) {
+        if (lrg._is_float || lrg._is_vector) {   // Count float pressure
           pressure[1] += lrg.reg_pressure();
 #ifdef EXACT_PRESSURE
           if( pressure[1] > b->_freg_pressure )
@@ -681,13 +682,10 @@
         // according to its bindings.
         const RegMask &rmask = lrgs(r).mask();
         if( lrgs(r).is_bound() && !(n->rematerialize()) && rmask.is_NotEmpty() ) {
-          // Smear odd bits; leave only aligned pairs of bits.
-          RegMask r2mask = rmask;
-          r2mask.SmearToPairs();
           // Check for common case
           int r_size = lrgs(r).num_regs();
           OptoReg::Name r_reg = (r_size == 1) ? rmask.find_first_elem() : OptoReg::Physical;
-
+          // Smearing to aligned sets is done per live range in the loop below
           IndexSetIterator elements(&liveout);
           uint l;
           while ((l = elements.next()) != 0) {
@@ -701,10 +699,15 @@
             // Remove the bits from LRG 'r' from LRG 'l' so 'l' no
             // longer interferes with 'r'.  If 'l' requires aligned
             // adjacent pairs, subtract out bit pairs.
-            if( lrg.num_regs() == 2 && !lrg._fat_proj ) {
+            assert(!lrg._is_vector || !lrg._fat_proj, "sanity");
+            if (lrg.num_regs() > 1 && !lrg._fat_proj) {
+              RegMask r2mask = rmask;
+              // Leave only aligned set of bits.
+              r2mask.smear_to_sets(lrg.num_regs());
+              // It includes vector case.
               lrg.SUBTRACT( r2mask );
               lrg.compute_set_mask_size();
-            } else if( r_size != 1 ) {
+            } else if( r_size != 1 ) { // fat proj
               lrg.SUBTRACT( rmask );
               lrg.compute_set_mask_size();
             } else {            // Common case: size 1 bound removal
@@ -763,8 +766,8 @@
             // Newly live things assumed live from here to top of block
             lrg._area += cost;
             // Adjust register pressure
-            if( lrg.mask().is_UP() && lrg.mask_size() ) {
-              if( lrg._is_float ) {
+            if (lrg.mask().is_UP() && lrg.mask_size()) {
+              if (lrg._is_float || lrg._is_vector) {
                 pressure[1] += lrg.reg_pressure();
 #ifdef EXACT_PRESSURE
                 if( pressure[1] > b->_freg_pressure )
--- old/src/share/vm/opto/lcm.cpp	Sat Jun  2 20:04:12 2012
+++ new/src/share/vm/opto/lcm.cpp	Sat Jun  2 20:04:12 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -445,6 +445,11 @@
     if( e->is_MachNullCheck() && e->in(1) == n )
       continue;
 
+    // Schedule IV increment last.
+    if (e->is_Mach() && e->as_Mach()->ideal_Opcode() == Op_CountedLoopEnd &&
+        e->in(1)->in(1) == n && n->is_iteratively_computed())
+      continue;
+
     uint n_choice  = 2;
 
     // See if this instruction is consumed by a branch. If so, then (as the
--- old/src/share/vm/opto/loopnode.cpp	Sat Jun  2 20:04:13 2012
+++ new/src/share/vm/opto/loopnode.cpp	Sat Jun  2 20:04:13 2012
@@ -2756,7 +2756,8 @@
         // Do not count uncommon calls
         if( !n->is_CallStaticJava() || !n->as_CallStaticJava()->_name ) {
           Node *iff = n->in(0)->in(0);
-          if( !iff->is_If() ||
+          // No calls at all are allowed for vectorized loops.
+          if( UseSuperWord || !iff->is_If() ||
               (n->in(0)->Opcode() == Op_IfFalse &&
                (1.0 - iff->as_If()->_prob) >= 0.01) ||
               (iff->as_If()->_prob >= 0.01) )
--- old/src/share/vm/opto/machnode.cpp	Sat Jun  2 20:04:13 2012
+++ new/src/share/vm/opto/machnode.cpp	Sat Jun  2 20:04:13 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -439,9 +439,9 @@
   // Don't remateralize somebody with bound inputs - it stretches a
   // fixed register lifetime.
   uint idx = oper_input_base();
-  if( req() > idx ) {
+  if (req() > idx) {
     const RegMask &rm = in_RegMask(idx);
-    if( rm.is_bound1() || rm.is_bound2() )
+    if (rm.is_bound(ideal_reg()))
       return false;
   }
 
--- old/src/share/vm/opto/machnode.hpp	Sat Jun  2 20:04:14 2012
+++ new/src/share/vm/opto/machnode.hpp	Sat Jun  2 20:04:14 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -319,6 +319,7 @@
 class MachTypeNode : public MachNode {
   virtual uint size_of() const { return sizeof(*this); } // Size is bigger
 public:
+  MachTypeNode( ) {}
   const Type *_bottom_type;
 
   virtual const class Type *bottom_type() const { return _bottom_type; }
@@ -370,12 +371,12 @@
 
 //------------------------------MachConstantNode-------------------------------
 // Machine node that holds a constant which is stored in the constant table.
-class MachConstantNode : public MachNode {
+class MachConstantNode : public MachTypeNode {
 protected:
   Compile::Constant _constant;  // This node's constant.
 
 public:
-  MachConstantNode() : MachNode() {
+  MachConstantNode() : MachTypeNode() {
     init_class_id(Class_MachConstant);
   }
 
--- old/src/share/vm/opto/matcher.cpp	Sat Jun  2 20:04:14 2012
+++ new/src/share/vm/opto/matcher.cpp	Sat Jun  2 20:04:14 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -35,6 +35,7 @@
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/type.hpp"
+#include "opto/vectornode.hpp"
 #include "runtime/atomic.hpp"
 #include "runtime/os.hpp"
 #ifdef TARGET_ARCH_MODEL_x86_32
@@ -58,18 +59,6 @@
 
 OptoReg::Name OptoReg::c_frame_pointer;
 
-
-
-const int Matcher::base2reg[Type::lastype] = {
-  Node::NotAMachineReg,0,0, Op_RegI, Op_RegL, 0, Op_RegN,
-  Node::NotAMachineReg, Node::NotAMachineReg, /* tuple, array */
-  Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, Op_RegP, /* the pointers */
-  0, 0/*abio*/,
-  Op_RegP /* Return address */, 0, /* the memories */
-  Op_RegF, Op_RegF, Op_RegF, Op_RegD, Op_RegD, Op_RegD,
-  0  /*bottom*/
-};
-
 const RegMask *Matcher::idealreg2regmask[_last_machine_leaf];
 RegMask Matcher::mreg2regmask[_last_Mach_Reg];
 RegMask Matcher::STACK_ONLY_mask;
@@ -107,6 +96,10 @@
   idealreg2spillmask  [Op_RegF] = NULL;
   idealreg2spillmask  [Op_RegD] = NULL;
   idealreg2spillmask  [Op_RegP] = NULL;
+  idealreg2spillmask  [Op_VecS] = NULL;
+  idealreg2spillmask  [Op_VecD] = NULL;
+  idealreg2spillmask  [Op_VecX] = NULL;
+  idealreg2spillmask  [Op_VecY] = NULL;
 
   idealreg2debugmask  [Op_RegI] = NULL;
   idealreg2debugmask  [Op_RegN] = NULL;
@@ -114,6 +107,10 @@
   idealreg2debugmask  [Op_RegF] = NULL;
   idealreg2debugmask  [Op_RegD] = NULL;
   idealreg2debugmask  [Op_RegP] = NULL;
+  idealreg2debugmask  [Op_VecS] = NULL;
+  idealreg2debugmask  [Op_VecD] = NULL;
+  idealreg2debugmask  [Op_VecX] = NULL;
+  idealreg2debugmask  [Op_VecY] = NULL;
 
   idealreg2mhdebugmask[Op_RegI] = NULL;
   idealreg2mhdebugmask[Op_RegN] = NULL;
@@ -121,6 +118,10 @@
   idealreg2mhdebugmask[Op_RegF] = NULL;
   idealreg2mhdebugmask[Op_RegD] = NULL;
   idealreg2mhdebugmask[Op_RegP] = NULL;
+  idealreg2mhdebugmask[Op_VecS] = NULL;
+  idealreg2mhdebugmask[Op_VecD] = NULL;
+  idealreg2mhdebugmask[Op_VecX] = NULL;
+  idealreg2mhdebugmask[Op_VecY] = NULL;
 
   debug_only(_mem_node = NULL;)   // Ideal memory node consumed by mach node
 }
@@ -134,7 +135,7 @@
     warped = OptoReg::add(warped, C->out_preserve_stack_slots());
     if( warped >= _in_arg_limit )
       _in_arg_limit = OptoReg::add(warped, 1); // Bump max stack slot seen
-    if (!RegMask::can_represent(warped)) {
+    if (!RegMask::can_represent_arg(warped)) {
       // the compiler cannot represent this method's calling sequence
       C->record_method_not_compilable_all_tiers("unsupported incoming calling sequence");
       return OptoReg::Bad;
@@ -302,7 +303,7 @@
   _out_arg_limit = OptoReg::add(_new_SP, C->out_preserve_stack_slots());
   assert( is_even(_out_arg_limit), "out_preserve must be even" );
 
-  if (!RegMask::can_represent(OptoReg::add(_out_arg_limit,-1))) {
+  if (!RegMask::can_represent_arg(OptoReg::add(_out_arg_limit,-1))) {
     // the compiler cannot represent this method's calling sequence
     C->record_method_not_compilable("must be able to represent all call arguments in reg mask");
   }
@@ -428,7 +429,7 @@
 void Matcher::init_first_stack_mask() {
 
   // Allocate storage for spill masks as masks for the appropriate load type.
-  RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * 3*6);
+  RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*6+4));
 
   idealreg2spillmask  [Op_RegN] = &rms[0];
   idealreg2spillmask  [Op_RegI] = &rms[1];
@@ -451,6 +452,11 @@
   idealreg2mhdebugmask[Op_RegD] = &rms[16];
   idealreg2mhdebugmask[Op_RegP] = &rms[17];
 
+  idealreg2spillmask  [Op_VecS] = &rms[18];
+  idealreg2spillmask  [Op_VecD] = &rms[19];
+  idealreg2spillmask  [Op_VecX] = &rms[20];
+  idealreg2spillmask  [Op_VecY] = &rms[21];
+
   OptoReg::Name i;
 
   // At first, start with the empty mask
@@ -462,7 +468,7 @@
     C->FIRST_STACK_mask().Insert(i);
 
   // Add in all bits past the outgoing argument area
-  guarantee(RegMask::can_represent(OptoReg::add(_out_arg_limit,-1)),
+  guarantee(RegMask::can_represent_arg(OptoReg::add(_out_arg_limit,-1)),
             "must be able to represent all call arguments in reg mask");
   init = _out_arg_limit;
   for (i = init; RegMask::can_represent(i); i = OptoReg::add(i,1))
@@ -472,21 +478,48 @@
   C->FIRST_STACK_mask().set_AllStack();
 
   // Make spill masks.  Registers for their class, plus FIRST_STACK_mask.
+  RegMask aligned_stack_mask = C->FIRST_STACK_mask();
+  // Keep spill masks aligned.
+  aligned_stack_mask.clear_to_pairs();
+  assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
+
+  *idealreg2spillmask[Op_RegP] = *idealreg2regmask[Op_RegP];
 #ifdef _LP64
   *idealreg2spillmask[Op_RegN] = *idealreg2regmask[Op_RegN];
    idealreg2spillmask[Op_RegN]->OR(C->FIRST_STACK_mask());
+   idealreg2spillmask[Op_RegP]->OR(aligned_stack_mask);
+#else
+   idealreg2spillmask[Op_RegP]->OR(C->FIRST_STACK_mask());
 #endif
   *idealreg2spillmask[Op_RegI] = *idealreg2regmask[Op_RegI];
    idealreg2spillmask[Op_RegI]->OR(C->FIRST_STACK_mask());
   *idealreg2spillmask[Op_RegL] = *idealreg2regmask[Op_RegL];
-   idealreg2spillmask[Op_RegL]->OR(C->FIRST_STACK_mask());
+   idealreg2spillmask[Op_RegL]->OR(aligned_stack_mask);
   *idealreg2spillmask[Op_RegF] = *idealreg2regmask[Op_RegF];
    idealreg2spillmask[Op_RegF]->OR(C->FIRST_STACK_mask());
   *idealreg2spillmask[Op_RegD] = *idealreg2regmask[Op_RegD];
-   idealreg2spillmask[Op_RegD]->OR(C->FIRST_STACK_mask());
-  *idealreg2spillmask[Op_RegP] = *idealreg2regmask[Op_RegP];
-   idealreg2spillmask[Op_RegP]->OR(C->FIRST_STACK_mask());
+   idealreg2spillmask[Op_RegD]->OR(aligned_stack_mask);
 
+  if (Matcher::vector_size_supported(T_BYTE,4)) {
+    *idealreg2spillmask[Op_VecS] = *idealreg2regmask[Op_VecS];
+     idealreg2spillmask[Op_VecS]->OR(C->FIRST_STACK_mask());
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,2)) {
+    *idealreg2spillmask[Op_VecD] = *idealreg2regmask[Op_VecD];
+     idealreg2spillmask[Op_VecD]->OR(aligned_stack_mask);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,4)) {
+     aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecX);
+     assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
+    *idealreg2spillmask[Op_VecX] = *idealreg2regmask[Op_VecX];
+     idealreg2spillmask[Op_VecX]->OR(aligned_stack_mask);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,8)) {
+     aligned_stack_mask.clear_to_sets(RegMask::SlotsPerVecY);
+     assert(aligned_stack_mask.is_AllStack(), "should be infinite stack");
+    *idealreg2spillmask[Op_VecY] = *idealreg2regmask[Op_VecY];
+     idealreg2spillmask[Op_VecY]->OR(aligned_stack_mask);
+  }
    if (UseFPUForSpilling) {
      // This mask logic assumes that the spill operations are
      // symmetric and that the registers involved are the same size.
@@ -807,6 +840,25 @@
   idealreg2regmask[Op_RegF] = &spillF->out_RegMask();
   idealreg2regmask[Op_RegD] = &spillD->out_RegMask();
   idealreg2regmask[Op_RegP] = &spillP->out_RegMask();
+
+  // Vector regmasks.
+  if (Matcher::vector_size_supported(T_BYTE,4)) {
+    TypeVect::VECTS = TypeVect::make(T_BYTE, 4);
+    MachNode *spillVectS = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTS));
+    idealreg2regmask[Op_VecS] = &spillVectS->out_RegMask();
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,2)) {
+    MachNode *spillVectD = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTD));
+    idealreg2regmask[Op_VecD] = &spillVectD->out_RegMask();
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,4)) {
+    MachNode *spillVectX = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTX));
+    idealreg2regmask[Op_VecX] = &spillVectX->out_RegMask();
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,8)) {
+    MachNode *spillVectY = match_tree(new (C, 3) LoadVectorNode(NULL,mem,fp,atp,TypeVect::VECTY));
+    idealreg2regmask[Op_VecY] = &spillVectY->out_RegMask();
+  }
 }
 
 #ifdef ASSERT
@@ -1063,7 +1115,7 @@
     // that is killed by the call.
     if( warped >= out_arg_limit_per_call )
       out_arg_limit_per_call = OptoReg::add(warped,1);
-    if (!RegMask::can_represent(warped)) {
+    if (!RegMask::can_represent_arg(warped)) {
       C->record_method_not_compilable_all_tiers("unsupported calling sequence");
       return OptoReg::Bad;
     }
@@ -1251,7 +1303,7 @@
     // this killed area.
     uint r_cnt = mcall->tf()->range()->cnt();
     MachProjNode *proj = new (C, 1) MachProjNode( mcall, r_cnt+10000, RegMask::Empty, MachProjNode::fat_proj );
-    if (!RegMask::can_represent(OptoReg::Name(out_arg_limit_per_call-1))) {
+    if (!RegMask::can_represent_arg(OptoReg::Name(out_arg_limit_per_call-1))) {
       C->record_method_not_compilable_all_tiers("unsupported outgoing calling sequence");
     } else {
       for (int i = begin_out_arg_area; i < out_arg_limit_per_call; i++)
--- old/src/share/vm/opto/matcher.hpp	Sat Jun  2 20:04:15 2012
+++ new/src/share/vm/opto/matcher.hpp	Sat Jun  2 20:04:15 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -250,11 +250,22 @@
   static const bool convL2FSupported(void);
 
   // Vector width in bytes
-  static const uint vector_width_in_bytes(void);
+  static const int vector_width_in_bytes(BasicType bt);
 
+  // Limits on vector size (number of elements).
+  static const int max_vector_size(const BasicType bt);
+  static const int min_vector_size(const BasicType bt);
+  static const bool vector_size_supported(const BasicType bt, int size) {
+    // True only when min_vector_size(bt) <= size <= max_vector_size(bt).
+    return !(Matcher::max_vector_size(bt) < size || Matcher::min_vector_size(bt) > size);
+  }
+
   // Vector ideal reg
-  static const uint vector_ideal_reg(void);
+  static const int vector_ideal_reg(int len);
 
+  // CPU supports misaligned vectors store/load.
+  static const bool misaligned_vectors_ok();
+
   // Used to determine a "low complexity" 64-bit constant.  (Zero is simple.)
   // The standard of comparison is one (StoreL ConL) vs. two (StoreI ConI).
   // Depends on the details of 64-bit constant generation on the CPU.
--- old/src/share/vm/opto/memnode.cpp	Sat Jun  2 20:04:15 2012
+++ new/src/share/vm/opto/memnode.cpp	Sat Jun  2 20:04:15 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1526,7 +1526,8 @@
     // In fact, that could have been the original type of p1, and p1 could have
     // had an original form like p1:(AddP x x (LShiftL quux 3)), where the
     // expression (LShiftL quux 3) independently optimized to the constant 8.
-    if ((t->isa_int() == NULL) && (t->isa_long() == NULL)
+    if ((t->isa_int() == NULL) && (t->isa_long() == NULL) 
+        && (_type->isa_vect() == NULL)
         && Opcode() != Op_LoadKlass && Opcode() != Op_LoadNKlass) {
       // t might actually be lower than _type, if _type is a unique
       // concrete subclass of abstract class t.
--- old/src/share/vm/opto/mulnode.hpp	Sat Jun  2 20:04:16 2012
+++ new/src/share/vm/opto/mulnode.hpp	Sat Jun  2 20:04:16 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -41,7 +41,9 @@
 class MulNode : public Node {
   virtual uint hash() const;
 public:
-  MulNode( Node *in1, Node *in2 ): Node(0,in1,in2) {}
+  MulNode( Node *in1, Node *in2 ): Node(0,in1,in2) {
+    init_class_id(Class_Mul);   // Tag every MulNode with the Mul class id
+  }
 
   // Handle algebraic identities here.  If we have an identity, return the Node
   // we are equivalent to.  We look for "add of zero" as an identity.
--- old/src/share/vm/opto/node.cpp	Sat Jun  2 20:04:16 2012
+++ new/src/share/vm/opto/node.cpp	Sat Jun  2 20:04:16 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1576,6 +1576,9 @@
     } else {
       tty->print("no type");
     }
+  } else if (t->isa_vect() && this->is_MachSpillCopy()) {
+    // Dump MachSpillcopy vector type.
+    t->dump();
   }
   if (is_new) {
     debug_only(dump_orig(debug_orig()));
--- old/src/share/vm/opto/node.hpp	Sat Jun  2 20:04:17 2012
+++ new/src/share/vm/opto/node.hpp	Sat Jun  2 20:04:17 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -100,6 +100,7 @@
 class MemBarStoreStoreNode;
 class MemNode;
 class MergeMemNode;
+class MulNode;
 class MultiNode;
 class MultiBranchNode;
 class NeverBranchNode;
@@ -133,8 +134,8 @@
 class TypeNode;
 class UnlockNode;
 class VectorNode;
-class VectorLoadNode;
-class VectorStoreNode;
+class LoadVectorNode;
+class StoreVectorNode;
 class VectorSet;
 typedef void (*NFunc)(Node&,void*);
 extern "C" {
@@ -609,9 +610,9 @@
 
     DEFINE_CLASS_ID(Mem,   Node, 4)
       DEFINE_CLASS_ID(Load,  Mem, 0)
-        DEFINE_CLASS_ID(VectorLoad,  Load, 0)
+        DEFINE_CLASS_ID(LoadVector,  Load, 0)
       DEFINE_CLASS_ID(Store, Mem, 1)
-        DEFINE_CLASS_ID(VectorStore, Store, 0)
+        DEFINE_CLASS_ID(StoreVector, Store, 0)
       DEFINE_CLASS_ID(LoadStore, Mem, 2)
 
     DEFINE_CLASS_ID(Region, Node, 5)
@@ -629,8 +630,9 @@
     DEFINE_CLASS_ID(AddP,     Node, 9)
     DEFINE_CLASS_ID(BoxLock,  Node, 10)
     DEFINE_CLASS_ID(Add,      Node, 11)
-    DEFINE_CLASS_ID(Vector,   Node, 12)
-    DEFINE_CLASS_ID(ClearArray, Node, 13)
+    DEFINE_CLASS_ID(Mul,      Node, 12)
+    DEFINE_CLASS_ID(Vector,   Node, 13)
+    DEFINE_CLASS_ID(ClearArray, Node, 14)
 
     _max_classes  = ClassMask_ClearArray
   };
@@ -752,6 +754,7 @@
   DEFINE_CLASS_QUERY(MemBar)
   DEFINE_CLASS_QUERY(MemBarStoreStore)
   DEFINE_CLASS_QUERY(MergeMem)
+  DEFINE_CLASS_QUERY(Mul)
   DEFINE_CLASS_QUERY(Multi)
   DEFINE_CLASS_QUERY(MultiBranch)
   DEFINE_CLASS_QUERY(Parm)
@@ -767,8 +770,8 @@
   DEFINE_CLASS_QUERY(Sub)
   DEFINE_CLASS_QUERY(Type)
   DEFINE_CLASS_QUERY(Vector)
-  DEFINE_CLASS_QUERY(VectorLoad)
-  DEFINE_CLASS_QUERY(VectorStore)
+  DEFINE_CLASS_QUERY(LoadVector)
+  DEFINE_CLASS_QUERY(StoreVector)
   DEFINE_CLASS_QUERY(Unlock)
 
   #undef DEFINE_CLASS_QUERY
--- old/src/share/vm/opto/opcodes.cpp	Sat Jun  2 20:04:17 2012
+++ new/src/share/vm/opto/opcodes.cpp	Sat Jun  2 20:04:17 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -38,6 +38,10 @@
   "RegD",
   "RegL",
+  "VecS",                       // Vec* names go between RegL and RegFlags,
+  "VecD",                       // the same position the Vec* entries occupy
+  "VecX",                       // in the opcodes.hpp list, so the two lists
+  "VecY",                       // stay in the same order
   "RegFlags",
   "_last_machine_leaf",
 #include "classes.hpp"
   "_last_class_name",
--- old/src/share/vm/opto/opcodes.hpp	Sat Jun  2 20:04:18 2012
+++ new/src/share/vm/opto/opcodes.hpp	Sat Jun  2 20:04:18 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -36,6 +36,10 @@
   macro(RegF)                   // Machine float   register
   macro(RegD)                   // Machine double  register
   macro(RegL)                   // Machine long    register
+  macro(VecS)                   // Machine vectors register
+  macro(VecD)                   // Machine vectord register
+  macro(VecX)                   // Machine vectorx register
+  macro(VecY)                   // Machine vectory register
   macro(RegFlags)               // Machine flags   register
   _last_machine_leaf,           // Split between regular opcodes and machine
 #include "classes.hpp"
--- old/src/share/vm/opto/postaloc.cpp	Sat Jun  2 20:04:18 2012
+++ new/src/share/vm/opto/postaloc.cpp	Sat Jun  2 20:04:18 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1998, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -27,13 +27,15 @@
 #include "opto/chaitin.hpp"
 #include "opto/machnode.hpp"
 
-// see if this register kind does not requires two registers
-static bool is_single_register(uint x) {
-#ifdef _LP64
-  return (x != Op_RegD && x != Op_RegL && x != Op_RegP);
-#else
-  return (x != Op_RegD && x != Op_RegL);
-#endif
+// Return true only if each of the n_regs slots reg, reg-1, ..., reg-(n_regs-1)
+// (a single register, a pair, or a vector) is currently mapped to val.
+static bool register_contains_value(Node* val, OptoReg::Name reg, int n_regs,
+                                    Node_List& value) {
+  int slot = 0;
+  while (slot < n_regs && value[OptoReg::add(reg,-slot)] == val) {
+    slot++;                     // This slot matches; step down to the next one
+  }
+  return slot >= n_regs;        // Stopped early means some slot held another value
 }
 
 //---------------------------may_be_copy_of_callee-----------------------------
@@ -167,9 +169,11 @@
   const RegMask &use_mask = n->in_RegMask(idx);
   bool can_use = ( RegMask::can_represent(def_reg) ? (use_mask.Member(def_reg) != 0)
                                                    : (use_mask.is_AllStack() != 0));
-  // Check for a copy to or from a misaligned pair.
-  can_use = can_use && !use_mask.is_misaligned_Pair() && !def_lrg.mask().is_misaligned_Pair();
-
+  if (!RegMask::is_vector(def->ideal_reg())) {
+    // Check for a copy to or from a misaligned pair.
+    // This is a workaround for SPARC, which can have misaligned pairs.
+    can_use = can_use && !use_mask.is_misaligned_pair() && !def_lrg.mask().is_misaligned_pair();
+  }
   if (!can_use)
     return 0;
 
@@ -263,9 +267,9 @@
     val = skip_copies(n->in(k));
   }
 
-  if( val == x ) return blk_adjust; // No progress?
+  if (val == x) return blk_adjust; // No progress?
 
-  bool single = is_single_register(val->ideal_reg());
+  int n_regs = RegMask::num_registers(val->ideal_reg());
   uint val_idx = n2lidx(val);
   OptoReg::Name val_reg = lrgs(val_idx).reg();
 
@@ -272,9 +276,7 @@
   // See if it happens to already be in the correct register!
   // (either Phi's direct register, or the common case of the name
   // never-clobbered original-def register)
-  if( value[val_reg] == val &&
-      // Doubles check both halves
-      ( single || value[val_reg-1] == val ) ) {
+  if (register_contains_value(val, val_reg, n_regs, value)) {
     blk_adjust += use_prior_register(n,k,regnd[val_reg],current_block,value,regnd);
     if( n->in(k) == regnd[val_reg] ) // Success!  Quit trying
       return blk_adjust;
@@ -306,7 +308,7 @@
     }
 
     Node *vv = value[reg];
-    if( !single ) {             // Doubles check for aligned-adjacent pair
-      if( (reg&1)==0 ) continue;  // Wrong half of a pair
-      if( vv != value[reg-1] ) continue; // Not a complete pair
+    if (n_regs > 1) {           // Doubles and vectors need an aligned-adjacent set
+      if (((reg+1) & (n_regs-1)) != 0) continue; // reg is not the top slot of an aligned set
+      if (!register_contains_value(vv, OptoReg::Name(reg), n_regs, value)) continue; // Not a complete set
     }
@@ -526,8 +528,9 @@
       if( pidx ) {
         value.map(preg,phi);
         regnd.map(preg,phi);
-        OptoReg::Name preg_lo = OptoReg::add(preg,-1);
-        if( !is_single_register(phi->ideal_reg()) ) {
+        int n_regs = RegMask::num_registers(phi->ideal_reg());
+        for (int l = 1; l < n_regs; l++) {
+          OptoReg::Name preg_lo = OptoReg::add(preg,-l);
           value.map(preg_lo,phi);
           regnd.map(preg_lo,phi);
         }
@@ -568,13 +571,17 @@
             value.map(ureg,valdef); // record improved reaching-def info
             regnd.map(ureg,   def);
             // Record other half of doubles
-            OptoReg::Name ureg_lo = OptoReg::add(ureg,-1);
-            if( !is_single_register(def->ideal_reg()) &&
-                ( !RegMask::can_represent(ureg_lo) ||
-                  lrgs(useidx).mask().Member(ureg_lo) ) && // Nearly always adjacent
-                !value[ureg_lo] ) {
-              value.map(ureg_lo,valdef); // record improved reaching-def info
-              regnd.map(ureg_lo,   def);
+            uint def_ideal_reg = def->ideal_reg();
+            int n_regs = RegMask::num_registers(def_ideal_reg);
+            bool is_vec = RegMask::is_vector(def_ideal_reg);
+            for (int l = 1; l < n_regs; l++) {
+              OptoReg::Name ureg_lo = OptoReg::add(ureg,-l);
+              if (!value[ureg_lo] &&
+                  (!RegMask::can_represent(ureg_lo) ||
+                   lrgs(useidx).mask().Member(ureg_lo))) { // Nearly always adjacent
+                value.map(ureg_lo,valdef); // record improved reaching-def info
+                regnd.map(ureg_lo,   def);
+              }
             }
           }
         }
@@ -607,7 +614,8 @@
       }
 
       uint n_ideal_reg = n->ideal_reg();
-      if( is_single_register(n_ideal_reg) ) {
+      int n_regs = RegMask::num_registers(n_ideal_reg);
+      if (n_regs == 1) {
         // If Node 'n' does not change the value mapped by the register,
         // then 'n' is a useless copy.  Do not update the register->node
         // mapping so 'n' will go dead.
@@ -625,6 +633,25 @@
           assert( n->is_Copy(), "" );
           j -= replace_and_yank_if_dead(n, nreg, b, value, regnd);
         }
+      } else if (RegMask::is_vector(n_ideal_reg)) {
+        // If Node 'n' does not change the value mapped by the register,
+        // then 'n' is a useless copy.  Do not update the register->node
+        // mapping so 'n' will go dead.
+        if (!register_contains_value(val, nreg, n_regs, value)) {
+          // Update the mapping: record new Node defined by the register
+          regnd.map(nreg,n);
+          // Update mapping for defined *value*, which is the defined
+          // Node after skipping all copies.
+          value.map(nreg,val);
+          for (int l = 1; l < n_regs; l++) {
+            OptoReg::Name nreg_lo = OptoReg::add(nreg,-l);
+            regnd.map(nreg_lo, n );
+            value.map(nreg_lo,val);
+          }
+        } else if (n->is_Copy()) {
+          // Note: a vector can't be a constant and can't be a copy of a callee-saved value.
+          j -= replace_and_yank_if_dead(n, nreg, b, value, regnd);
+        }
       } else {
         // If the value occupies a register pair, record same info
         // in both registers.
--- old/src/share/vm/opto/reg_split.cpp	Sat Jun  2 20:04:19 2012
+++ new/src/share/vm/opto/reg_split.cpp	Sat Jun  2 20:04:19 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -74,12 +74,13 @@
   const RegMask *w_i_mask = w_mask->overlap( *i_mask ) ? w_mask : i_mask;
   const RegMask *w_o_mask;
 
+  int num_regs = RegMask::num_registers(ireg);
+  bool is_vect = RegMask::is_vector(ireg);
   if( w_mask->overlap( *o_mask ) && // Overlap AND
-      ((ireg != Op_RegL && ireg != Op_RegD // Single use or aligned
-#ifdef _LP64
-        && ireg != Op_RegP
-#endif
-         ) || o_mask->is_aligned_Pairs()) ) {
+      ((num_regs == 1) // Single use or aligned
+        ||  is_vect    // or vector
+        || !is_vect && o_mask->is_aligned_pairs()) ) {
+    assert(!is_vect || o_mask->is_aligned_sets(num_regs), "vectors are aligned");
     // Don't come here for mis-aligned doubles
     w_o_mask = w_mask;
   } else {                      // wide ideal mask does not overlap with o_mask
@@ -400,15 +401,17 @@
   // CNC - Turned off 7/8/99, causes too much spilling
   // if( lrg->_is_bound ) return false;
 
+  // Use float pressure numbers for vectors.
+  bool is_float_or_vector = lrg->_is_float || lrg->_is_vector;
   // Not yet reached the high-pressure cutoff point, so low pressure
-  uint hrp_idx = lrg->_is_float ? b->_fhrp_index : b->_ihrp_index;
+  uint hrp_idx = is_float_or_vector ? b->_fhrp_index : b->_ihrp_index;
   if( insidx < hrp_idx ) return false;
   // Register pressure for the block as a whole depends on reg class
-  int block_pres = lrg->_is_float ? b->_freg_pressure : b->_reg_pressure;
+  int block_pres = is_float_or_vector ? b->_freg_pressure : b->_reg_pressure;
   // Bound live ranges will split at the binding points first;
   // Intermediate splits should assume the live range's register set
   // got "freed up" and that num_regs will become INT_PRESSURE.
-  int bound_pres = lrg->_is_float ? FLOATPRESSURE : INTPRESSURE;
+  int bound_pres = is_float_or_vector ? FLOATPRESSURE : INTPRESSURE;
   // Effective register pressure limit.
   int lrg_pres = (lrg->get_invalid_mask_size() > lrg->num_regs())
     ? (lrg->get_invalid_mask_size() >> (lrg->num_regs()-1)) : bound_pres;
@@ -794,12 +797,15 @@
                   if( i < n->req() ) break;
                   insert_point--;
                 }
+                uint orig_eidx = b->end_idx();
                 maxlrg = split_DEF( n1, b, insert_point, maxlrg, Reachblock, debug_defs, splits, slidx);
                 // If it wasn't split bail
                 if (!maxlrg) {
                   return 0;
                 }
-                insidx++;
+                // Spill of NULL check mem op goes into the following block.
+                if (b->end_idx() > orig_eidx)
+                  insidx++;
               }
               // This is a new DEF, so update UP
               UPblock[slidx] = false;
@@ -960,7 +966,7 @@
             // Grab register mask info
             const RegMask &dmask = def->out_RegMask();
             const RegMask &umask = n->in_RegMask(inpidx);
-
+            bool is_vect = RegMask::is_vector(def->ideal_reg());
             assert(inpidx < oopoff, "cannot use-split oop map info");
 
             bool dup = UPblock[slidx];
@@ -972,7 +978,7 @@
             if( !umask.is_AllStack() &&
                 (int)umask.Size() <= lrgs(useidx).num_regs() &&
                 (!def->rematerialize() ||
-                 umask.is_misaligned_Pair())) {
+                 !is_vect && umask.is_misaligned_pair())) {
               // These need a Split regardless of overlap or pressure
               // SPLIT - NO DEF - NO CISC SPILL
               maxlrg = split_USE(def,b,n,inpidx,maxlrg,dup,false, splits,slidx);
@@ -1123,10 +1129,12 @@
         // Grab UP info for DEF
         const RegMask &dmask = n->out_RegMask();
         bool defup = dmask.is_UP();
+        int ireg = n->ideal_reg();
+        bool is_vect = RegMask::is_vector(ireg);
         // Only split at Def if this is a HRP block or bound (and spilled once)
         if( !n->rematerialize() &&
-            (((dmask.is_bound1() || dmask.is_bound2() || dmask.is_misaligned_Pair()) &&
-             (deflrg._direct_conflict || deflrg._must_spill)) ||
+            (((dmask.is_bound(ireg) || !is_vect && dmask.is_misaligned_pair()) &&
+              (deflrg._direct_conflict || deflrg._must_spill)) ||
              // Check for LRG being up in a register and we are inside a high
              // pressure area.  Spill it down immediately.
              (defup && is_high_pressure(b,&deflrg,insidx))) ) {
--- old/src/share/vm/opto/regmask.cpp	Sat Jun  2 20:04:19 2012
+++ new/src/share/vm/opto/regmask.cpp	Sat Jun  2 20:04:19 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -129,11 +129,34 @@
   0
 );
 
+//=============================================================================
+bool RegMask::is_vector(uint ireg) {  // True only for the Op_Vec* ideal registers
+  return !(ireg != Op_VecS && ireg != Op_VecD && ireg != Op_VecX && ireg != Op_VecY);
+}
+
+int RegMask::num_registers(uint ireg) {
+  // Number of registers occupied by a value of the given ideal register kind.
+  int slots = 1;                // Op_VecS and the rest ideal registers.
+  switch (ireg) {
+    case Op_VecY: slots = 8; break;
+    case Op_VecX: slots = 4; break;
+    case Op_VecD:               // All of the following kinds take two registers
+    case Op_RegD:
+    case Op_RegL:
+#ifdef _LP64
+    case Op_RegP:
+#endif
+      slots = 2;
+      break;
+  }
+  return slots;
+}
+
 //------------------------------find_first_pair--------------------------------
 // Find the lowest-numbered register pair in the mask.  Return the
 // HIGHEST register number in the pair, or BAD if no pairs.
 OptoReg::Name RegMask::find_first_pair() const {
-  VerifyPairs();
+  verify_pairs();
   for( int i = 0; i < RM_SIZE; i++ ) {
     if( _A[i] ) {               // Found some bits
       int bit = _A[i] & -_A[i]; // Extract low bit
@@ -146,7 +169,7 @@
 
 //------------------------------ClearToPairs-----------------------------------
 // Clear out partial bits; leave only bit pairs
-void RegMask::ClearToPairs() {
+void RegMask::clear_to_pairs() {
   for( int i = 0; i < RM_SIZE; i++ ) {
     int bits = _A[i];
     bits &= ((bits & 0x55555555)<<1); // 1 hi-bit set for each pair
@@ -153,12 +176,12 @@
     bits |= (bits>>1);          // Smear 1 hi-bit into a pair
     _A[i] = bits;
   }
-  VerifyPairs();
+  verify_pairs();
 }
 
 //------------------------------SmearToPairs-----------------------------------
 // Smear out partial bits; leave only bit pairs
-void RegMask::SmearToPairs() {
+void RegMask::smear_to_pairs() {
   for( int i = 0; i < RM_SIZE; i++ ) {
     int bits = _A[i];
     bits |= ((bits & 0x55555555)<<1); // Smear lo bit hi per pair
@@ -165,11 +188,11 @@
     bits |= ((bits & 0xAAAAAAAA)>>1); // Smear hi bit lo per pair
     _A[i] = bits;
   }
-  VerifyPairs();
+  verify_pairs();
 }
 
 //------------------------------is_aligned_pairs-------------------------------
-bool RegMask::is_aligned_Pairs() const {
+bool RegMask::is_aligned_pairs() const {
   // Assert that the register mask contains only bit pairs.
   for( int i = 0; i < RM_SIZE; i++ ) {
     int bits = _A[i];
@@ -204,7 +227,7 @@
 
 //------------------------------is_bound2--------------------------------------
 // Return TRUE if the mask contains an adjacent pair of bits and no other bits.
-int RegMask::is_bound2() const {
+int RegMask::is_bound_pair() const {
   if( is_AllStack() ) return false;
 
   int bit = -1;                 // Set to hold the one bit allowed
@@ -225,6 +248,132 @@
   // True for both the empty mask and for a bit pair
   return true;
 }
+
+static int low_bits[3] = { 0x55555555, 0x11111111, 0x01010101 };
+//------------------------------find_first_set---------------------------------
+// Locate the lowest-numbered register set present in the mask and return
+// the HIGHEST register number belonging to that set, or BAD if the mask
+// holds no bits.  Works also for size 1.
+OptoReg::Name RegMask::find_first_set(int size) const {
+  verify_sets(size);
+  int word = 0;
+  while (word < RM_SIZE && _A[word] == 0) {
+    word++;                     // Skip words that have no bits set
+  }
+  if (word >= RM_SIZE) return OptoReg::Bad;
+  int low = _A[word] & -_A[word]; // Isolate the lowest set bit
+  // Convert to bit number, then add (size-1) to reach the hi bit of the set
+  return OptoReg::Name((word<<_LogWordBits)+find_lowest_bit(low)+(size-1));
+}
+
+//------------------------------clear_to_sets----------------------------------
+// Clear out partial bits; leave only aligned adjacent bit sets of 'size' bits
+void RegMask::clear_to_sets(int size) {
+  if (size == 1) return;        // Nothing to clear for single-bit sets
+  assert(2 <= size && size <= 8, "update low bits table");
+  assert(is_power_of_2(size), "sanity");
+  int low_bits_mask = low_bits[size>>2]; // Lowest bit of every aligned set
+  for (int i = 0; i < RM_SIZE; i++) {
+    int bits = _A[i];
+    int sets = (bits & low_bits_mask);   // Sets whose lowest bit is present
+    for (int j = 1; j < size; j++) {
+      sets = (bits & (sets<<1)); // filter bits which produce whole sets
+    }                            // 'sets' now marks the hi-bit of each complete set
+    sets |= (sets>>1);           // Smear 1 hi-bit into a set
+    if (size > 2) {
+      sets |= (sets>>2);         // Smear 2 hi-bits into a set
+      if (size > 4) {
+        sets |= (sets>>4);       // Smear 4 hi-bits into a set
+      }
+    }
+    _A[i] = sets;                // Incomplete sets have been dropped
+  }
+  verify_sets(size);
+}
+
+//------------------------------smear_to_sets----------------------------------
+// Smear out partial bits to aligned adjacent bit sets of 'size' bits
+void RegMask::smear_to_sets(int size) {
+  if (size == 1) return;        // Nothing to smear for single-bit sets
+  assert(2 <= size && size <= 8, "update low bits table");
+  assert(is_power_of_2(size), "sanity");
+  int low_bits_mask = low_bits[size>>2]; // Lowest bit of every aligned set
+  for (int i = 0; i < RM_SIZE; i++) {
+    int bits = _A[i];
+    int sets = 0;
+    for (int j = 0; j < size; j++) {
+      sets |= (bits & low_bits_mask);  // collect partial bits
+      bits  = bits>>1;                 // bring the next bit of each set down to the lo-bit
+    }                            // lo-bit of a set is on if any bit of that set was on
+    sets |= (sets<<1);           // Smear 1 lo-bit  into a set
+    if (size > 2) {
+      sets |= (sets<<2);         // Smear 2 lo-bits into a set
+      if (size > 4) {
+        sets |= (sets<<4);       // Smear 4 lo-bits into a set
+      }
+    }
+    _A[i] = sets;                // Every touched set is now fully populated
+  }
+  verify_sets(size);
+}
+
+//------------------------------is_aligned_sets--------------------------------
+bool RegMask::is_aligned_sets(int size) const {
+  if (size == 1) return true;   // Single bits are always aligned
+  assert(2 <= size && size <= 8, "update low bits table");
+  assert(is_power_of_2(size), "sanity");
+  int low_bits_mask = low_bits[size>>2]; // Lowest bit of every aligned set
+  // Check that the register mask contains only complete, aligned bit sets.
+  for (int i = 0; i < RM_SIZE; i++) {
+    int bits = _A[i];
+    while (bits) {              // Check bits one set at a time
+      int bit = bits & -bits;   // Extract low bit
+      // The low bit must be the first bit of an aligned set, else it is misaligned.
+      if ((bit & low_bits_mask) == 0) return false;
+      // Do extra work since (bit << size) may overflow.
+      int hi_bit = bit << (size-1); // high bit
+      int set = hi_bit + ((hi_bit-1) & ~(bit-1)); // All bits from bit up to hi_bit
+      // Check for aligned adjacent bits in this set
+      if ((bits & set) != set) return false;
+      bits -= set;  // Remove this set
+    }
+  }
+  return true;
+}
+
+//------------------------------is_bound_set-----------------------------------
+// Return TRUE if the mask contains one adjacent set of 'size' bits and no
+// other bits; the set may straddle two mask words.  Works also for size 1.
+int RegMask::is_bound_set(int size) const {
+  if( is_AllStack() ) return false;
+  assert(1 <= size && size <= 8, "update low bits table");
+  int bit = -1;                 // Set to hold the one bit allowed
+  for (int i = 0; i < RM_SIZE; i++) {
+    if (_A[i] ) {               // Found some bits
+      if (bit != -1)
+       return false;            // Already had bits, so fail
+      bit = _A[i] & -_A[i];     // Extract 1 bit from mask
+      int hi_bit = bit << (size-1); // high bit
+      if (hi_bit != 0) {        // Bit set stays in same word?
+        int set = hi_bit + ((hi_bit-1) & ~(bit-1)); // All bits from bit up to hi_bit
+        if (set != _A[i])
+          return false;         // Require adjacent bit set and no more bits
+      } else {                  // Else its a split-set case
+        if (((-1) & ~(bit-1)) != _A[i])
+          return false;         // Require every bit from 'bit' to the top of this word
+        i++;                    // Skip iteration forward and check high part
+        assert(size <= 8, "update next code");
+        // The lower 24 bits should be 0 since it is split case and size <= 8.
+        int set = bit>>24;
+        set = set & -set; // Remove sign extension.
+        set = (((set << size) - 1) >> 8); // Low bits the set needs in the next word
+        if (i >= RM_SIZE || _A[i] != set) return false; // No next word, or wrong low bits in it
+      }
+    }
+  }
+  // True for both the empty mask and for a bit set
+  return true;
+}
 
 //------------------------------is_UP------------------------------------------
 // UP means register only, Register plus stack, or stack only is DOWN
--- old/src/share/vm/opto/regmask.hpp	Sat Jun  2 20:04:20 2012
+++ new/src/share/vm/opto/regmask.hpp	Sat Jun  2 20:04:20 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -113,7 +113,11 @@
   // the controlling alignment constraint.  Note that this alignment
   // requirement is internal to the allocator, and independent of any
   // particular platform.
-  enum { SlotsPerLong = 2 };
+  enum { SlotsPerLong = 2,
+         SlotsPerVecS = 1,
+         SlotsPerVecD = 2,
+         SlotsPerVecX = 4,
+         SlotsPerVecY = 8 };
 
   // A constructor only used by the ADLC output.  All mask fields are filled
   // in directly.  Calls to this look something like RM(1,2,3,4);
@@ -193,21 +197,54 @@
   OptoReg::Name find_first_pair() const;
 
   // Clear out partial bits; leave only aligned adjacent bit pairs.
-  void ClearToPairs();
+  void clear_to_pairs();
   // Smear out partial bits; leave only aligned adjacent bit pairs.
-  void SmearToPairs();
+  void smear_to_pairs();
   // Verify that the mask contains only aligned adjacent bit pairs
-  void VerifyPairs() const { assert( is_aligned_Pairs(), "mask is not aligned, adjacent pairs" ); }
+  void verify_pairs() const { assert( is_aligned_pairs(), "mask is not aligned, adjacent pairs" ); }
   // Test that the mask contains only aligned adjacent bit pairs
-  bool is_aligned_Pairs() const;
+  bool is_aligned_pairs() const;
 
   // mask is a pair of misaligned registers
-  bool is_misaligned_Pair() const { return Size()==2 && !is_aligned_Pairs();}
+  bool is_misaligned_pair() const { return Size()==2 && !is_aligned_pairs(); }
   // Test for single register
   int is_bound1() const;
   // Test for a single adjacent pair
-  int is_bound2() const;
+  int is_bound_pair() const;
+  // Test whether the mask is bound for the given ideal register: a vector
+  // must form one adjacent set of num_registers(ireg) bits; any other kind
+  // must be a single register or a single adjacent pair.
+  int is_bound(uint ireg) const {
+    if (is_vector(ireg)) {
+      return is_bound_set(num_registers(ireg)) != 0;
+    }
+    // Non-vector case: is_bound_pair() is consulted only if is_bound1() fails.
+    return (is_bound1() != 0) || (is_bound_pair() != 0);
+  }
 
+  // Find the lowest-numbered register set in the mask.  Return the
+  // HIGHEST register number in the set, or BAD if no sets.
+  // Assert that the mask contains only bit sets.
+  OptoReg::Name find_first_set(int size) const;
+
+  // Clear out partial bits; leave only aligned adjacent bit sets of size.
+  void clear_to_sets(int size);
+  // Smear out partial bits to aligned adjacent bit sets.
+  void smear_to_sets(int size);
+  // Verify that the mask contains only aligned adjacent bit sets
+  void verify_sets(int size) const { assert(is_aligned_sets(size), "mask is not aligned, adjacent sets"); }
+  // Test that the mask contains only aligned adjacent bit sets
+  bool is_aligned_sets(int size) const;
+
+  // mask is a set of misaligned registers
+  bool is_misaligned_set(int size) const { return (int)Size()==size && !is_aligned_sets(size);}
+
+  // Test for a single adjacent set
+  int is_bound_set(int size) const;
+
+  static bool is_vector(uint ireg);
+  static int num_registers(uint ireg);
+
   // Fast overlap test.  Non-zero if any registers in common.
   int overlap( const RegMask &rm ) const {
     return
@@ -280,9 +317,15 @@
 
   static bool can_represent(OptoReg::Name reg) {
     // NOTE: -1 in computation reflects the usage of the last
-    //       bit of the regmask as an infinite stack flag.
+    //       bit of the regmask as an infinite stack flag.  No extra slots
+    //       are held back here for VecY alignment; see can_represent_arg().
     return (int)reg < (int)(CHUNK_SIZE-1);
   }
+  static bool can_represent_arg(OptoReg::Name reg) {
+    // NOTE: -SlotsPerVecY in computation reflects the need
+    //       to keep mask aligned for largest value (VecY).
+    return (int)reg < (int)(CHUNK_SIZE-SlotsPerVecY);
+  }
 };
 
 // Do not use this constant directly in client code!
--- old/src/share/vm/opto/superword.cpp	Sat Jun  2 20:04:20 2012
+++ new/src/share/vm/opto/superword.cpp	Sat Jun  2 20:04:20 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -67,6 +67,10 @@
 
 //------------------------------transform_loop---------------------------
 void SuperWord::transform_loop(IdealLoopTree* lpt) {
+  assert(UseSuperWord, "should be");
+  // Do vectors exist on this architecture?
+  if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
+
   assert(lpt->_head->is_CountedLoop(), "must be");
   CountedLoopNode *cl = lpt->_head->as_CountedLoop();
 
@@ -89,15 +93,12 @@
   Node *pre_opaq1 = pre_end->limit();
   if (pre_opaq1->Opcode() != Op_Opaque1) return;
 
-  // Do vectors exist on this architecture?
-  if (vector_width_in_bytes() == 0) return;
-
   init(); // initialize data structures
 
   set_lpt(lpt);
   set_lp(cl);
 
- // For now, define one block which is the entire loop body
+  // For now, define one block which is the entire loop body
   set_bb(cl);
 
   assert(_packset.length() == 0, "packset must be empty");
@@ -177,7 +178,7 @@
   Node_List memops;
   for (int i = 0; i < _block.length(); i++) {
     Node* n = _block.at(i);
-    if (n->is_Mem() && in_bb(n) &&
+    if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
         is_java_primitive(n->as_Mem()->memory_type())) {
       int align = memory_alignment(n->as_Mem(), 0);
       if (align != bottom_align) {
@@ -185,55 +186,134 @@
       }
     }
   }
-  if (memops.size() == 0) return;
 
-  // Find a memory reference to align to.  The pre-loop trip count
-  // is modified to align this reference to a vector-aligned address
-  find_align_to_ref(memops);
-  if (align_to_ref() == NULL) return;
+  Node_List align_to_refs;
+  const Type* best_vt = NULL;
+  int best_iv_adjustment = 0;
+  MemNode* best_align_to_mem_ref = NULL;
 
-  SWPointer align_to_ref_p(align_to_ref(), this);
-  int offset = align_to_ref_p.offset_in_bytes();
-  int scale  = align_to_ref_p.scale_in_bytes();
-  int vw              = vector_width_in_bytes();
-  int stride_sign     = (scale * iv_stride()) > 0 ? 1 : -1;
-  int iv_adjustment   = (stride_sign * vw - (offset % vw)) % vw;
+  while (memops.size() != 0) {
+    // Find a memory reference to align to.
+    MemNode* mem_ref = find_align_to_ref(memops);
+    if (mem_ref == NULL) break;
+    align_to_refs.push(mem_ref);
+    const Type* vt = velt_type(mem_ref);
+    int iv_adjustment = get_iv_adjustment(mem_ref);
 
-#ifndef PRODUCT
-  if (TraceSuperWord)
-    tty->print_cr("\noffset = %d iv_adjustment = %d  elt_align = %d scale = %d iv_stride = %d",
-                  offset, iv_adjustment, align_to_ref_p.memory_size(), align_to_ref_p.scale_in_bytes(), iv_stride());
-#endif
+    if (best_align_to_mem_ref == NULL) {
+      // Set memory reference which is the best from all memory operations
+      // to be used for alignment. The pre-loop trip count is modified to align
+      // this reference to a vector-aligned address.
+      best_vt = vt;
+      best_align_to_mem_ref = mem_ref;
+      best_iv_adjustment = iv_adjustment;
+    }
 
-  // Set alignment relative to "align_to_ref"
-  for (int i = memops.size() - 1; i >= 0; i--) {
-    MemNode* s = memops.at(i)->as_Mem();
-    SWPointer p2(s, this);
-    if (p2.comparable(align_to_ref_p)) {
-      int align = memory_alignment(s, iv_adjustment);
-      set_alignment(s, align);
-    } else {
-      memops.remove(i);
+    SWPointer align_to_ref_p(mem_ref, this);
+    // Set alignment relative to "align_to_ref" for all related memory operations.
+    for (int i = memops.size() - 1; i >= 0; i--) {
+      MemNode* s = memops.at(i)->as_Mem();
+      if (isomorphic(s, mem_ref)) {
+        SWPointer p2(s, this);
+        if (p2.comparable(align_to_ref_p)) {
+          int align = memory_alignment(s, iv_adjustment);
+          set_alignment(s, align);
+        }
+      }
     }
-  }
 
-  // Create initial pack pairs of memory operations
-  for (uint i = 0; i < memops.size(); i++) {
-    Node* s1 = memops.at(i);
-    for (uint j = 0; j < memops.size(); j++) {
-      Node* s2 = memops.at(j);
-      if (s1 != s2 && are_adjacent_refs(s1, s2)) {
+    // Create initial pack pairs of memory operations for which
+    // alignment is set and vectors will be aligned.
+    bool create_pack = true;
+    if (memory_alignment(mem_ref, best_iv_adjustment) != 0) {
+      if (vt == best_vt) {
+        // Can't allow vectorization of unaligned memory accesses with the
+        // same type since it could be overlapped accesses to the same array.
+        create_pack = false;
+      } else {
+        // Allow independent (different type) unaligned memory operations
+        // if HW supports them. 
+        if (!Matcher::misaligned_vectors_ok()) {
+          create_pack = false;
+        } else {
+          // Check if packs of the same memory type but
+          // with a different alignment were created before.
+          for (uint i = 0; i < align_to_refs.size(); i++) {
+            MemNode* mr = align_to_refs.at(i)->as_Mem();
+            if (velt_type(mr) == vt && memory_alignment(mr, iv_adjustment) != 0)
+              create_pack = false;
+          }
+        }
+      }
+    }
+    if (create_pack) {
+      for (uint i = 0; i < memops.size(); i++) {
+        Node* s1 = memops.at(i);
         int align = alignment(s1);
-        if (stmts_can_pack(s1, s2, align)) {
-          Node_List* pair = new Node_List();
-          pair->push(s1);
-          pair->push(s2);
-          _packset.append(pair);
+        if (align == top_align) continue;
+        for (uint j = 0; j < memops.size(); j++) {
+          Node* s2 = memops.at(j);
+          if (alignment(s2) == top_align) continue;
+          if (s1 != s2 && are_adjacent_refs(s1, s2)) {
+            if (stmts_can_pack(s1, s2, align)) {
+              Node_List* pair = new Node_List();
+              pair->push(s1);
+              pair->push(s2);
+              _packset.append(pair);
+            }
+          }
         }
       }
+    } else { // Don't create unaligned pack
+      // First, remove remaining memory ops of the same type from the list.
+      for (int i = memops.size() - 1; i >= 0; i--) {
+        MemNode* s = memops.at(i)->as_Mem();
+        if (velt_type(s) == vt) {
+          memops.remove(i);
+        }
+      }
+
+      // Second, remove already constructed packs of the same type.
+      for (int i = _packset.length() - 1; i >= 0; i--) {
+        Node_List* p = _packset.at(i);
+        MemNode* s = p->at(0)->as_Mem();
+        if (velt_type(s) == vt) {
+          remove_pack_at(i);
+        }
+      }
+
+      // If needed find the best memory reference for loop alignment again.
+      if (best_vt == vt) {
+        // Put first memory op of each remaining pack back on memops for the search.
+        uint orig_msize = memops.size();
+        for (int i = 0; i < _packset.length(); i++) {
+          Node_List* p = _packset.at(i);
+          MemNode* s = p->at(0)->as_Mem();
+          assert(velt_type(s) != vt, "sanity");
+          memops.push(s);
+        }
+        MemNode* new_best_ref = find_align_to_ref(memops);
+        if (new_best_ref == NULL) break;
+        best_align_to_mem_ref = new_best_ref; // update outer variable, do not shadow it
+        best_vt = velt_type(best_align_to_mem_ref);
+        best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
+        // Restore list.
+        while (memops.size() > orig_msize)
+          (void)memops.pop();
+      }
+    } // unaligned memory accesses
+
+    // Remove used mem nodes 
+    for (int i = memops.size() - 1; i >= 0; i--) {
+      MemNode* m = memops.at(i)->as_Mem();
+      if (alignment(m) != top_align) {
+        memops.remove(i);
+      }
     }
-  }
 
+  } // while (memops.size() != 0)
+  set_align_to_ref(best_align_to_mem_ref);
+
 #ifndef PRODUCT
   if (TraceSuperWord) {
     tty->print_cr("\nAfter find_adjacent_refs");
@@ -246,7 +326,7 @@
 // Find a memory reference to align the loop induction variable to.
 // Looks first at stores then at loads, looking for a memory reference
 // with the largest number of references similar to it.
-void SuperWord::find_align_to_ref(Node_List &memops) {
+MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
   GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);
 
   // Count number of comparable memory ops
@@ -270,8 +350,10 @@
     }
   }
 
-  // Find Store (or Load) with the greatest number of "comparable" references
+  // Find Store (or Load) with the greatest number of "comparable" references,
+  // biggest vector size, smallest data size and smallest iv offset.
   int max_ct        = 0;
+  int max_vw        = 0;
   int max_idx       = -1;
   int min_size      = max_jint;
   int min_iv_offset = max_jint;
@@ -278,12 +360,18 @@
   for (uint j = 0; j < memops.size(); j++) {
     MemNode* s = memops.at(j)->as_Mem();
     if (s->is_Store()) {
+      int vw = vector_width_in_bytes(velt_basic_type(s));
+      assert(vw > 1, "sanity");
       SWPointer p(s, this);
-      if (cmp_ct.at(j) > max_ct ||
-          cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
-                                     data_size(s) == min_size &&
-                                        p.offset_in_bytes() < min_iv_offset)) {
+      if (cmp_ct.at(j) >  max_ct ||
+          cmp_ct.at(j) == max_ct &&
+            (vw >  max_vw ||
+             vw == max_vw &&
+              (data_size(s) <  min_size ||
+               data_size(s) == min_size &&
+                 (p.offset_in_bytes() < min_iv_offset)))) {
         max_ct = cmp_ct.at(j);
+        max_vw = vw;
         max_idx = j;
         min_size = data_size(s);
         min_iv_offset = p.offset_in_bytes();
@@ -295,12 +383,18 @@
     for (uint j = 0; j < memops.size(); j++) {
       MemNode* s = memops.at(j)->as_Mem();
       if (s->is_Load()) {
+        int vw = vector_width_in_bytes(velt_basic_type(s));
+        assert(vw > 1, "sanity");
         SWPointer p(s, this);
-        if (cmp_ct.at(j) > max_ct ||
-            cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
-                                       data_size(s) == min_size &&
-                                          p.offset_in_bytes() < min_iv_offset)) {
+        if (cmp_ct.at(j) >  max_ct ||
+            cmp_ct.at(j) == max_ct &&
+              (vw >  max_vw ||
+               vw == max_vw &&
+                (data_size(s) <  min_size ||
+                 data_size(s) == min_size &&
+                   (p.offset_in_bytes() < min_iv_offset)))) {
           max_ct = cmp_ct.at(j);
+          max_vw = vw;
           max_idx = j;
           min_size = data_size(s);
           min_iv_offset = p.offset_in_bytes();
@@ -309,10 +403,7 @@
     }
   }
 
-  if (max_ct > 0)
-    set_align_to_ref(memops.at(max_idx)->as_Mem());
-
-#ifndef PRODUCT
+#ifdef ASSERT
   if (TraceSuperWord && Verbose) {
     tty->print_cr("\nVector memops after find_align_to_refs");
     for (uint i = 0; i < memops.size(); i++) {
@@ -321,6 +412,17 @@
     }
   }
 #endif
+
+  if (max_ct > 0) {
+#ifdef ASSERT
+    if (TraceSuperWord) {
+      tty->print("\nVector align to node: ");
+      memops.at(max_idx)->as_Mem()->dump();
+    }
+#endif
+    return memops.at(max_idx)->as_Mem();
+  }
+  return NULL;
 }
 
 //------------------------------ref_is_alignable---------------------------
@@ -341,7 +443,9 @@
 
   // If initial offset from start of object is computable,
   // compute alignment within the vector.
-  int vw = vector_width_in_bytes();
+  BasicType bt = velt_basic_type(p.mem());
+  int vw = vector_width_in_bytes(bt);
+  assert(vw > 1, "sanity");
   if (vw % span == 0) {
     Node* init_nd = pre_end->init_trip();
     if (init_nd->is_Con() && p.invar() == NULL) {
@@ -361,6 +465,26 @@
   return false;
 }
 
+//---------------------------get_iv_adjustment---------------------------
+// Calculate the loop's iv adjustment for this memory op.
+int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
+  SWPointer align_to_ref_p(mem_ref, this);
+  int offset = align_to_ref_p.offset_in_bytes();
+  int scale  = align_to_ref_p.scale_in_bytes();
+  BasicType bt = velt_basic_type(mem_ref);
+  int vw       = vector_width_in_bytes(bt);
+  assert(vw > 1, "sanity");
+  int stride_sign   = (scale * iv_stride()) > 0 ? 1 : -1;
+  int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
+
+#ifndef PRODUCT
+  if (TraceSuperWord)
+    tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
+                  offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw);
+#endif
+  return iv_adjustment;
+}
+
 //---------------------------dependence_graph---------------------------
 // Construct dependency graph.
 // Add dependence edges to load/store nodes for memory dependence
@@ -488,9 +612,13 @@
 bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
 
   // Do not use superword for non-primitives
-  if((s1->is_Mem() && !is_java_primitive(s1->as_Mem()->memory_type())) ||
-     (s2->is_Mem() && !is_java_primitive(s2->as_Mem()->memory_type())))
+  BasicType bt1 = velt_basic_type(s1);
+  BasicType bt2 = velt_basic_type(s2);
+  if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
     return false;
+  if (Matcher::max_vector_size(bt1) < 2) {
+    return false; // No vectors for this type
+  }
 
   if (isomorphic(s1, s2)) {
     if (independent(s1, s2)) {
@@ -595,14 +723,16 @@
 //------------------------------set_alignment---------------------------
 void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
   set_alignment(s1, align);
-  set_alignment(s2, align + data_size(s1));
+  if (align == top_align || align == bottom_align) {
+    set_alignment(s2, align);
+  } else {
+    set_alignment(s2, align + data_size(s1));
+  }
 }
 
 //------------------------------data_size---------------------------
 int SuperWord::data_size(Node* s) {
-  const Type* t = velt_type(s);
-  BasicType  bt = t->array_element_basic_type();
-  int bsize = type2aelembytes(bt);
+  int bsize = type2aelembytes(velt_basic_type(s));
   assert(bsize != 0, "valid size");
   return bsize;
 }
@@ -631,9 +761,9 @@
 //------------------------------follow_use_defs---------------------------
 // Extend the packset by visiting operand definitions of nodes in pack p
 bool SuperWord::follow_use_defs(Node_List* p) {
+  assert(p->size() == 2, "just checking");
   Node* s1 = p->at(0);
   Node* s2 = p->at(1);
-  assert(p->size() == 2, "just checking");
   assert(s1->req() == s2->req(), "just checking");
   assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");
 
@@ -718,7 +848,12 @@
     for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
     for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
     if (i1 != i2) {
-      return false;
+      if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
+        // Further analysis relies on operands position matching.
+        u2->swap_edges(i1, i2);
+      } else {
+        return false;
+      }
     }
   } while (i1 < ct);
   return true;
@@ -727,7 +862,7 @@
 //------------------------------est_savings---------------------------
 // Estimate the savings from executing s1 and s2 as a pack
 int SuperWord::est_savings(Node* s1, Node* s2) {
-  int save = 2 - 1; // 2 operations per instruction in packed form
+  int save_in = 2 - 1; // 2 operations per instruction in packed form
 
   // inputs
   for (uint i = 1; i < s1->req(); i++) {
@@ -735,11 +870,11 @@
     Node* x2 = s2->in(i);
     if (x1 != x2) {
       if (are_adjacent_refs(x1, x2)) {
-        save += adjacent_profit(x1, x2);
+        save_in += adjacent_profit(x1, x2);
       } else if (!in_packset(x1, x2)) {
-        save -= pack_cost(2);
+        save_in -= pack_cost(2);
       } else {
-        save += unpack_cost(2);
+        save_in += unpack_cost(2);
       }
     }
   }
@@ -746,6 +881,7 @@
 
   // uses of result
   uint ct = 0;
+  int save_use = 0;
   for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
     Node* s1_use = s1->fast_out(i);
     for (int j = 0; j < _packset.length(); j++) {
@@ -756,7 +892,7 @@
           if (p->at(p->size()-1) == s2_use) {
             ct++;
             if (are_adjacent_refs(s1_use, s2_use)) {
-              save += adjacent_profit(s1_use, s2_use);
+              save_use += adjacent_profit(s1_use, s2_use);
             }
           }
         }
@@ -764,10 +900,10 @@
     }
   }
 
-  if (ct < s1->outcnt()) save += unpack_cost(1);
-  if (ct < s2->outcnt()) save += unpack_cost(1);
+  if (ct < s1->outcnt()) save_use += unpack_cost(1);
+  if (ct < s2->outcnt()) save_use += unpack_cost(1);
 
-  return save;
+  return MAX2(save_in, save_use);
 }
 
 //------------------------------costs---------------------------
@@ -778,8 +914,9 @@
 //------------------------------combine_packs---------------------------
 // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
 void SuperWord::combine_packs() {
-  bool changed;
-  do {
+  bool changed = true;
+  // Combine packs regardless of max vector size.
+  while (changed) {
     changed = false;
     for (int i = 0; i < _packset.length(); i++) {
       Node_List* p1 = _packset.at(i);
@@ -787,6 +924,7 @@
       for (int j = 0; j < _packset.length(); j++) {
         Node_List* p2 = _packset.at(j);
         if (p2 == NULL) continue;
+        if (i == j) continue;
         if (p1->at(p1->size()-1) == p2->at(0)) {
           for (uint k = 1; k < p2->size(); k++) {
             p1->push(p2->at(k));
@@ -796,8 +934,39 @@
         }
       }
     }
-  } while (changed);
+  }
 
+  // Split packs which have size greater than max vector size.
+  for (int i = 0; i < _packset.length(); i++) {
+    Node_List* p1 = _packset.at(i);
+    if (p1 != NULL) {
+      BasicType bt = velt_basic_type(p1->at(0));
+      uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
+      assert(is_power_of_2(max_vlen), "sanity");
+      uint psize = p1->size();
+      if (!is_power_of_2(psize)) {
+        // Skip pack which can't be vector.
+        // case1: for(...) { a[i] = i; }    elements values are different (i+x)
+        // case2: for(...) { a[i] = b[i+1]; }  can't align both, load and store
+        _packset.at_put(i, NULL);
+        continue;
+      }
+      if (psize > max_vlen) {
+        Node_List* pack = new Node_List();
+        for (uint j = 0; j < psize; j++) {
+          pack->push(p1->at(j));
+          if (pack->size() >= max_vlen) {
+            assert(is_power_of_2(pack->size()), "sanity");
+            _packset.append(pack);
+            pack = new Node_List();
+          }
+        }
+        _packset.at_put(i, NULL);
+      }
+    }
+  }
+
+  // Compress list.
   for (int i = _packset.length() - 1; i >= 0; i--) {
     Node_List* p1 = _packset.at(i);
     if (p1 == NULL) {
@@ -880,8 +1049,7 @@
 // Can code be generated for pack p?
 bool SuperWord::implemented(Node_List* p) {
   Node* p0 = p->at(0);
-  int vopc = VectorNode::opcode(p0->Opcode(), p->size(), velt_type(p0));
-  return vopc > 0 && Matcher::has_match_rule(vopc);
+  return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
 }
 
 //------------------------------profitable---------------------------
@@ -939,37 +1107,42 @@
 }
 
 //-------------------------------remove_and_insert-------------------
-//remove "current" from its current position in the memory graph and insert
-//it after the appropriate insertion point (lip or uip)
+// Remove "current" from its current position in the memory graph and insert
+// it after the appropriate insertion point (lip or uip).
 void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
                                   Node *uip, Unique_Node_List &sched_before) {
   Node* my_mem = current->in(MemNode::Memory);
-  _igvn.hash_delete(current);
-  _igvn.hash_delete(my_mem);
+  bool sched_up = sched_before.member(current);
 
-  //remove current_store from its current position in the memmory graph
+  // remove current_store from its current position in the memory graph
   for (DUIterator i = current->outs(); current->has_out(i); i++) {
     Node* use = current->out(i);
     if (use->is_Mem()) {
       assert(use->in(MemNode::Memory) == current, "must be");
-      _igvn.hash_delete(use);
       if (use == prev) { // connect prev to my_mem
-        use->set_req(MemNode::Memory, my_mem);
+          _igvn.hash_delete(use);
+          use->set_req(MemNode::Memory, my_mem);
+          _igvn._worklist.push(use);
+          --i; //deleted this edge; rescan position
       } else if (sched_before.member(use)) {
-        _igvn.hash_delete(uip);
-        use->set_req(MemNode::Memory, uip);
+        if (!sched_up) { // Will be moved together with current
+          _igvn.hash_delete(use);
+          use->set_req(MemNode::Memory, uip);
+          _igvn._worklist.push(use);
+          --i; //deleted this edge; rescan position
+        }
       } else {
-        _igvn.hash_delete(lip);
-        use->set_req(MemNode::Memory, lip);
+        if (sched_up) { // Will be moved together with current
+          _igvn.hash_delete(use);
+          use->set_req(MemNode::Memory, lip);
+          _igvn._worklist.push(use);
+          --i; //deleted this edge; rescan position
+        }
       }
-      _igvn._worklist.push(use);
-      --i; //deleted this edge; rescan position
     }
   }
 
-  bool sched_up = sched_before.member(current);
   Node *insert_pt =  sched_up ?  uip : lip;
-  _igvn.hash_delete(insert_pt);
 
   // all uses of insert_pt's memory state should use current's instead
   for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
@@ -982,10 +1155,10 @@
       --i; //deleted this edge; rescan position
     } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
       uint pos; //lip (lower insert point) must be the last one in the memory slice
-      _igvn.hash_delete(use);
       for (pos=1; pos < use->req(); pos++) {
         if (use->in(pos) == insert_pt) break;
       }
+      _igvn.hash_delete(use);
       use->set_req(pos, current);
       _igvn._worklist.push(use);
       --i;
@@ -993,6 +1166,7 @@
   }
 
   //connect current to insert_pt
+  _igvn.hash_delete(current);
   current->set_req(MemNode::Memory, insert_pt);
   _igvn._worklist.push(current);
 }
@@ -1031,7 +1205,7 @@
         if (use->is_Mem() && use != previous)
           memops.push(use);
       }
-      if(current == first) break;
+      if (current == first) break;
       previous = current;
       current  = current->in(MemNode::Memory)->as_Mem();
     }
@@ -1044,15 +1218,16 @@
           Node *s2 = memops.at(j);
           if (!independent(s1, s2)) {
             if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
-              schedule_before_pack.push(s1); //s1 must be scheduled before
+              schedule_before_pack.push(s1); // s1 must be scheduled before
               Node_List* mem_pk = my_pack(s1);
               if (mem_pk != NULL) {
                 for (uint ii = 0; ii < mem_pk->size(); ii++) {
-                  Node* s = mem_pk->at(ii); // follow partner
+                  Node* s = mem_pk->at(ii);  // follow partner
                   if (memops.member(s) && !schedule_before_pack.member(s))
                     schedule_before_pack.push(s);
                 }
               }
+              break;
             }
           }
         }
@@ -1059,12 +1234,21 @@
       }
     }
 
-    MemNode* lower_insert_pt = last;
     Node*    upper_insert_pt = first->in(MemNode::Memory);
+    // Following code moves loads connected to upper_insert_pt below aliased stores.
+    // Collect such loads here and reconnect them back to upper_insert_pt later.
+    memops.clear();
+    for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
+      Node* use = upper_insert_pt->out(i);
+      if (!use->is_Store())
+        memops.push(use);
+    }
+
+    MemNode* lower_insert_pt = last;
     previous                 = last; //previous store in pk
     current                  = last->in(MemNode::Memory)->as_Mem();
 
-    //start scheduling from "last" to "first"
+    // start scheduling from "last" to "first"
     while (true) {
       assert(in_bb(current), "stay in block");
       assert(in_pack(previous, pk), "previous stays in pack");
@@ -1072,7 +1256,6 @@
 
       if (in_pack(current, pk)) {
         // Forward users of my memory state (except "previous) to my input memory state
-        _igvn.hash_delete(current);
         for (DUIterator i = current->outs(); current->has_out(i); i++) {
           Node* use = current->out(i);
           if (use->is_Mem() && use != previous) {
@@ -1079,10 +1262,8 @@
             assert(use->in(MemNode::Memory) == current, "must be");
             _igvn.hash_delete(use);
             if (schedule_before_pack.member(use)) {
-              _igvn.hash_delete(upper_insert_pt);
               use->set_req(MemNode::Memory, upper_insert_pt);
             } else {
-              _igvn.hash_delete(lower_insert_pt);
               use->set_req(MemNode::Memory, lower_insert_pt);
             }
             _igvn._worklist.push(use);
@@ -1097,6 +1278,16 @@
       if (current == first) break;
       current = my_mem->as_Mem();
     } // end while
+
+    // Reconnect loads back to upper_insert_pt.
+    for (uint i = 0; i < memops.size(); i++) {
+      Node *ld = memops.at(i);
+      if (ld->in(MemNode::Memory) != upper_insert_pt) {
+        _igvn.hash_delete(ld);
+        ld->set_req(MemNode::Memory, upper_insert_pt);
+        _igvn._worklist.push(ld);
+      }
+    }
   } else if (pk->at(0)->is_Load()) { //load
     // all loads in the pack should have the same memory state. By default,
     // we use the memory state of the last load. However, if any load could
@@ -1159,35 +1350,30 @@
       Node* vn = NULL;
       Node* low_adr = p->at(0);
       Node* first   = executed_first(p);
+      int   opc = n->Opcode();
       if (n->is_Load()) {
-        int   opc = n->Opcode();
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
-        vn = VectorLoadNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen);
-
+        vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
       } else if (n->is_Store()) {
         // Promote value to be stored to vector
         Node* val = vector_opd(p, MemNode::ValueIn);
-
-        int   opc = n->Opcode();
         Node* ctl = n->in(MemNode::Control);
         Node* mem = first->in(MemNode::Memory);
         Node* adr = low_adr->in(MemNode::Address);
         const TypePtr* atyp = n->adr_type();
-        vn = VectorStoreNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
-
+        vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
       } else if (n->req() == 3) {
         // Promote operands to vector
         Node* in1 = vector_opd(p, 1);
         Node* in2 = vector_opd(p, 2);
-        vn = VectorNode::make(_phase->C, n->Opcode(), in1, in2, vlen, velt_type(n));
-
+        vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
       } else {
         ShouldNotReachHere();
       }
-
+      assert(vn != NULL, "sanity");
       _phase->_igvn.register_new_node_with_optimizer(vn);
       _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
       for (uint j = 0; j < p->size(); j++) {
@@ -1195,6 +1381,12 @@
         _igvn.replace_node(pm, vn);
       }
       _igvn._worklist.push(vn);
+#ifdef ASSERT
+      if (TraceSuperWord) {
+        tty->print("\nnew Vector node: ");
+        vn->dump();
+      }
+#endif
     }
   }
 }
@@ -1217,10 +1409,10 @@
   }
 
   if (same_opd) {
-    if (opd->is_Vector() || opd->is_VectorLoad()) {
+    if (opd->is_Vector() || opd->is_LoadVector()) {
       return opd; // input is matching vector
     }
-    assert(!opd->is_VectorStore(), "such vector is not expected here");
+    assert(!opd->is_StoreVector(), "such vector is not expected here");
     // Convert scalar input to vector with the same number of elements as
     // p0's vector. Use p0's type because size of operand's container in
     // vector should match p0's size regardless operand's size.
@@ -1233,8 +1425,8 @@
   }
 
   // Insert pack operation
-  const Type* p0_t = velt_type(p0);
-  PackNode* pk = PackNode::make(_phase->C, opd, p0_t);
+  BasicType bt = velt_basic_type(p0);
+  PackNode* pk = PackNode::make(_phase->C, opd, vlen, bt);
   DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )
 
   for (uint i = 1; i < vlen; i++) {
@@ -1242,7 +1434,7 @@
     Node* in = pi->in(opd_idx);
     assert(my_pack(in) == NULL, "Should already have been unpacked");
     assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
-    pk->add_opd(in);
+    pk->add_opd(i, in);
   }
   _phase->_igvn.register_new_node_with_optimizer(pk);
   _phase->set_ctrl(pk, _phase->get_ctrl(opd));
@@ -1284,9 +1476,8 @@
     _igvn.hash_delete(def);
     _igvn.hash_delete(use);
     int def_pos = alignment(def) / data_size(def);
-    const Type* def_t = velt_type(def);
 
-    Node* ex = ExtractNode::make(_phase->C, def, def_pos, def_t);
+    Node* ex = ExtractNode::make(_phase->C, def, def_pos, velt_basic_type(def));
     _phase->_igvn.register_new_node_with_optimizer(ex);
     _phase->set_ctrl(ex, _phase->get_ctrl(def));
     use->set_req(idx, ex);
@@ -1294,7 +1485,7 @@
     _igvn._worklist.push(use);
 
     bb_insert_after(ex, bb_idx(def));
-    set_velt_type(ex, def_t);
+    set_velt_type(ex, velt_type(def));
   }
 }
 
@@ -1587,10 +1778,14 @@
   if (!p.valid()) {
     return bottom_align;
   }
+  int vw = vector_width_in_bytes(velt_basic_type(s));
+  if (vw < 2) {
+    return bottom_align; // No vectors for this type
+  }
   int offset  = p.offset_in_bytes();
   offset     += iv_adjust_in_bytes;
-  int off_rem = offset % vector_width_in_bytes();
-  int off_mod = off_rem >= 0 ? off_rem : off_rem + vector_width_in_bytes();
+  int off_rem = offset % vw;
+  int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
   return off_mod;
 }
 
@@ -1615,7 +1810,8 @@
 // (Start, end] half-open range defining which operands are vector
 void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) {
   switch (n->Opcode()) {
-  case Op_LoadB:   case Op_LoadUS:
+  case Op_LoadB:   case Op_LoadUB:
+  case Op_LoadS:   case Op_LoadUS:
   case Op_LoadI:   case Op_LoadL:
   case Op_LoadF:   case Op_LoadD:
   case Op_LoadP:
@@ -1733,6 +1929,7 @@
   assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
 
   SWPointer align_to_ref_p(align_to_ref, this);
+  assert(align_to_ref_p.valid(), "sanity");
 
   // Given:
   //     lim0 == original pre loop limit
@@ -1785,10 +1982,12 @@
   //     N = (V - (e - lim0)) % V
   //     lim = lim0 - (V - (e - lim0)) % V
 
+  int vw = vector_width_in_bytes(velt_basic_type(align_to_ref));
+  assert(vw > 1, "sanity");
   int stride   = iv_stride();
   int scale    = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
-  int v_align  = vector_width_in_bytes() / elt_size;
+  int v_align  = vw / elt_size;
   int k        = align_to_ref_p.offset_in_bytes() / elt_size;
 
   Node *kn   = _igvn.intcon(k);
@@ -1807,6 +2006,25 @@
     }
     _phase->_igvn.register_new_node_with_optimizer(e);
     _phase->set_ctrl(e, pre_ctrl);
+  }
+  if (vw > ObjectAlignmentInBytes) {
+    // incorporate base e +/- base && Mask >>> log2(elt)
+    Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
+    Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
+    _phase->_igvn.register_new_node_with_optimizer(xbase);
+    Node* masked_xbase  = new (_phase->C, 3) AndXNode(xbase, mask);
+    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
+#ifdef _LP64
+    masked_xbase  = new (_phase->C, 2) ConvL2INode(masked_xbase);
+    _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
+#endif
+    Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
+    Node* bref     = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
+    _phase->_igvn.register_new_node_with_optimizer(bref);
+    _phase->set_ctrl(bref, pre_ctrl);
+    e = new (_phase->C, 3) AddINode(e, bref);
+    _phase->_igvn.register_new_node_with_optimizer(e);
+    _phase->set_ctrl(e, pre_ctrl);
   }
 
   // compute e +/- lim0
--- old/src/share/vm/opto/superword.hpp	Sat Jun  2 20:04:21 2012
+++ new/src/share/vm/opto/superword.hpp	Sat Jun  2 20:04:21 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -264,7 +264,10 @@
                                      _iv = lp->as_CountedLoop()->phi()->as_Phi(); }
   int      iv_stride()             { return lp()->as_CountedLoop()->stride_con(); }
 
-  int vector_width_in_bytes()      { return Matcher::vector_width_in_bytes(); }
+  int vector_width_in_bytes(BasicType bt) {  // capped at |iv stride| * element size
+    int stride_bytes = ABS(iv_stride()) * type2aelembytes(bt);
+    return MIN2(stride_bytes, Matcher::vector_width_in_bytes(bt));
+  }
 
   MemNode* align_to_ref()            { return _align_to_ref; }
   void  set_align_to_ref(MemNode* m) { _align_to_ref = m; }
@@ -298,6 +301,7 @@
 
   // vector element type
   const Type* velt_type(Node* n)             { return _node_info.adr_at(bb_idx(n))->_velt_type; }
+  BasicType velt_basic_type(Node* n)         { return velt_type(n)->array_element_basic_type(); } // basic type of n's recorded vector element type
   void set_velt_type(Node* n, const Type* t) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_velt_type = t; }
 
   // my_pack
@@ -311,7 +315,9 @@
   // Find the adjacent memory references and create pack pairs for them.
   void find_adjacent_refs();
   // Find a memory reference to align the loop induction variable to.
-  void find_align_to_ref(Node_List &memops);
+  MemNode* find_align_to_ref(Node_List &memops);
+  // Calculate the loop's iv adjustment for this memory op.
+  int get_iv_adjustment(MemNode* mem);
   // Can the preloop align the reference to position zero in the vector?
   bool ref_is_alignable(SWPointer& p);
   // Construct dependency graph.
@@ -462,6 +468,7 @@
 
   Node* base()            { return _base; }
   Node* adr()             { return _adr; }
+  MemNode* mem()          { return _mem; }
   int   scale_in_bytes()  { return _scale; }
   Node* invar()           { return _invar; }
   bool  negate_invar()    { return _negate_invar; }
--- old/src/share/vm/opto/type.cpp	Sat Jun  2 20:04:21 2012
+++ new/src/share/vm/opto/type.cpp	Sat Jun  2 20:04:21 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -60,6 +60,10 @@
 
   T_ILLEGAL,    // Tuple
   T_ARRAY,      // Array
+  T_ILLEGAL,    // VectorS
+  T_ILLEGAL,    // VectorD
+  T_ILLEGAL,    // VectorX
+  T_ILLEGAL,    // VectorY
 
   T_ADDRESS,    // AnyPtr   // shows up in factory methods for NULL_PTR
   T_ADDRESS,    // RawPtr
@@ -414,6 +418,24 @@
   // get_zero_type() should not happen for T_CONFLICT
   _zero_type[T_CONFLICT]= NULL;
 
+  // Predefined vector types (left NULL if unsupported); make() needs _const_basic_type[] initialized.
+  if (Matcher::vector_size_supported(T_BYTE,4)) {
+    TypeVect::VECTS = TypeVect::make(T_BYTE,4);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,2)) {
+    TypeVect::VECTD = TypeVect::make(T_FLOAT,2);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,4)) {
+    TypeVect::VECTX = TypeVect::make(T_FLOAT,4);
+  }
+  if (Matcher::vector_size_supported(T_FLOAT,8)) {
+    TypeVect::VECTY = TypeVect::make(T_FLOAT,8);
+  }
+  mreg2type[Op_VecS] = TypeVect::VECTS;
+  mreg2type[Op_VecD] = TypeVect::VECTD;
+  mreg2type[Op_VecX] = TypeVect::VECTX;
+  mreg2type[Op_VecY] = TypeVect::VECTY;
+
   // Restore working type arena.
   current->set_type_arena(save);
   current->set_type_dict(NULL);
@@ -668,6 +690,10 @@
 
   Bad,          // Tuple - handled in v-call
   Bad,          // Array - handled in v-call
+  Bad,          // VectorS - handled in v-call
+  Bad,          // VectorD - handled in v-call
+  Bad,          // VectorX - handled in v-call
+  Bad,          // VectorY - handled in v-call
 
   Bad,          // AnyPtr - handled in v-call
   Bad,          // RawPtr - handled in v-call
@@ -728,8 +754,8 @@
 //------------------------------data-------------------------------------------
 const char * const Type::msg[Type::lastype] = {
   "bad","control","top","int:","long:","half", "narrowoop:",
-  "tuple:", "aryptr",
-  "anyptr:", "rawptr:", "java:", "inst:", "ary:", "klass:",
+  "tuple:", "array:", "vectors:", "vectord:", "vectorx:", "vectory:",
+  "anyptr:", "rawptr:", "java:", "inst:", "aryptr:", "klass:",
   "func", "abIO", "return_address", "memory",
   "float_top", "ftcon:", "float",
   "double_top", "dblcon:", "double",
@@ -790,7 +816,7 @@
 //------------------------------isa_oop_ptr------------------------------------
 // Return true if type is an oop pointer type.  False for raw pointers.
 static char isa_oop_ptr_tbl[Type::lastype] = {
-  0,0,0,0,0,0,0/*narrowoop*/,0/*tuple*/, 0/*ary*/,
+  0,0,0,0,0,0,0/*narrowoop*/,0/*tuple*/, 0/*array*/, 0, 0, 0, 0/*vector*/,
   0/*anyptr*/,0/*rawptr*/,1/*OopPtr*/,1/*InstPtr*/,1/*AryPtr*/,1/*KlassPtr*/,
   0/*func*/,0,0/*return_address*/,0,
   /*floats*/0,0,0, /*doubles*/0,0,0,
@@ -1926,6 +1952,121 @@
   return false;
 }
 
+//==============================TypeVect=======================================
+// Convenience common pre-built types.  NULL until set for vector sizes the Matcher supports.
+const TypeVect *TypeVect::VECTS = NULL; //  32-bit vectors
+const TypeVect *TypeVect::VECTD = NULL; //  64-bit vectors
+const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors
+const TypeVect *TypeVect::VECTY = NULL; // 256-bit vectors
+
+//------------------------------make-------------------------------------------
+const TypeVect* TypeVect::make(const Type *elem, uint length) {
+  // Pick the concrete vector class from the ideal register for the vector's size.
+  BasicType elem_bt = elem->array_element_basic_type();
+  assert(is_java_primitive(elem_bt), "only primitive types in vector");
+  assert(length > 1 && is_power_of_2(length), "vector length is power of 2");
+  assert(Matcher::vector_size_supported(elem_bt, length), "length in range");
+  const int size_in_bytes = length * type2aelembytes(elem_bt);
+  TypeVect* vt = NULL;
+  switch (Matcher::vector_ideal_reg(size_in_bytes)) {
+  case Op_VecS: vt = new TypeVectS(elem, length); break;
+  case Op_VecD:  // Op_RegD selects the 64-bit vector type as well
+  case Op_RegD: vt = new TypeVectD(elem, length); break;
+  case Op_VecX: vt = new TypeVectX(elem, length); break;
+  case Op_VecY: vt = new TypeVectY(elem, length); break;
+  default:
+    ShouldNotReachHere();
+    return NULL;
+  }
+  return (TypeVect*)vt->hashcons();
+}
+
+//------------------------------meet-------------------------------------------
+// Compute the MEET of this vector type with t.  Only Bottom, Top and vector types are expected.
+const Type *TypeVect::xmeet( const Type *t ) const {
+  // Perform a fast test for common case; meeting the same types together.
+  if( this == t ) return this;  // Meeting same type-rep?
+
+  // Current "this->_base" is Vector
+  switch (t->base()) {          // switch on original type
+
+  case Bottom:                  // Ye Olde Default
+    return t;
+
+  default:                      // All else is a mistake
+    typerr(t);                  // NOTE(review): no break; falls into the vector cases if typerr() returns
+
+  case VectorS:
+  case VectorD:
+  case VectorX:
+  case VectorY: {                // Meeting 2 vectors?
+    const TypeVect* v = t->is_vect();
+    assert(  base() == v->base(), "");
+    assert(length() == v->length(), "");
+    assert(element_basic_type() == v->element_basic_type(), "");
+    return TypeVect::make(_elem->xmeet(v->_elem), _length);  // same length, met element types
+  }
+  case Top:                     // Top leaves 'this' unchanged
+    break;
+  }
+  return this;
+}
+
+//------------------------------xdual------------------------------------------
+// Dual: same base and length, with the element type replaced by its dual.
+const Type *TypeVect::xdual() const {
+  return new TypeVect(base(), _elem->dual(), _length);
+}
+
+//------------------------------eq---------------------------------------------
+// Structural equality: same element type and same length.
+bool TypeVect::eq(const Type *t) const {
+  const TypeVect* other = t->is_vect();
+  return (other->_length == _length) && (other->_elem == _elem);
+}
+
+//------------------------------hash-------------------------------------------
+// Type-specific hash: element type pointer plus length, narrowed to int.
+int TypeVect::hash(void) const {
+  return (intptr_t)_length + (intptr_t)_elem;
+}
+
+//------------------------------singleton--------------------------------------
+// TRUE if Type is a singleton type, FALSE otherwise.   Singletons are simple
+// constants (Ldi nodes).  A vector whose elements are all the same constant
+// (e.g. one built by Replicate) could qualify, but it is never reported here:
+bool TypeVect::singleton(void) const {
+// There is no Con node for vectors yet.
+//  return _elem->singleton();
+  return false;
+}
+
+bool TypeVect::empty(void) const {  // vacuous exactly when the element type is
+  return _elem->empty();
+}
+
+//------------------------------dump2------------------------------------------
+#ifndef PRODUCT
+// Prints "<kind>[<length>]:{<element type>}" where <kind> names the vector
+// size class (vectors, vectord, vectorx or vectory).
+void TypeVect::dump2(Dict &d, uint depth, outputStream *st) const {
+  const char* kind = "";
+  switch (base()) {
+  case VectorS: kind = "vectors["; break;
+  case VectorD: kind = "vectord["; break;
+  case VectorX: kind = "vectorx["; break;
+  case VectorY: kind = "vectory["; break;
+  default:
+    ShouldNotReachHere();
+  }
+  st->print("%s", kind);
+  st->print("%d]:{", _length);
+  _elem->dump2(d, depth, st);
+  st->print("}");
+}
+#endif
+
+
 //=============================================================================
 // Convenience common pre-built types.
 const TypePtr *TypePtr::NULL_PTR;
@@ -4140,7 +4281,7 @@
 // Print a 'flattened' signature
 static const char * const flat_type_msg[Type::lastype] = {
   "bad","control","top","int","long","_", "narrowoop",
-  "tuple:", "array:",
+  "tuple:", "array:", "vectors:", "vectord:", "vectorx:", "vectory:",
   "ptr", "rawptr", "ptr", "ptr", "ptr", "ptr",
   "func", "abIO", "return_address", "mem",
   "float_top", "ftcon:", "flt",
--- old/src/share/vm/opto/type.hpp	Sat Jun  2 20:04:22 2012
+++ new/src/share/vm/opto/type.hpp	Sat Jun  2 20:04:22 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -51,6 +51,11 @@
 class   TypeNarrowOop;
 class   TypeAry;
 class   TypeTuple;
+class   TypeVect;
+class     TypeVectS;
+class     TypeVectD;
+class     TypeVectX;
+class     TypeVectY;
 class   TypePtr;
 class     TypeRawPtr;
 class     TypeOopPtr;
@@ -78,6 +83,10 @@
 
     Tuple,                      // Method signature or object layout
     Array,                      // Array types
+    VectorS,                    //  32bit Vector types
+    VectorD,                    //  64bit Vector types
+    VectorX,                    // 128bit Vector types
+    VectorY,                    // 256bit Vector types
 
     AnyPtr,                     // Any old raw, klass, inst, or array pointer
     RawPtr,                     // Raw (non-oop) pointers
@@ -222,6 +231,8 @@
   const TypeF      *isa_float_constant() const;  // Returns NULL if not a FloatCon
   const TypeTuple  *is_tuple() const;            // Collection of fields, NOT a pointer
   const TypeAry    *is_ary() const;              // Array, NOT array pointer
+  const TypeVect   *is_vect() const;             // Vector
+  const TypeVect   *isa_vect() const;            // Returns NULL if not a Vector
   const TypePtr    *is_ptr() const;              // Asserts it is a ptr type
   const TypePtr    *isa_ptr() const;             // Returns NULL if not ptr type
   const TypeRawPtr *isa_rawptr() const;          // NOT Java oop
@@ -574,6 +585,69 @@
 #endif
 };
 
+//------------------------------TypeVect---------------------------------------
+// Class of Vector Types: an element type plus an element count.  Instances are obtained through make().
+class TypeVect : public Type {
+  const Type*   _elem;  // Vector's element type
+  const uint  _length;  // Elements in vector (power of 2)
+
+protected:
+  TypeVect(TYPES t, const Type* elem, uint length) : Type(t),  // t: one of VectorS..VectorY
+    _elem(elem), _length(length) {}
+
+public:
+  const Type* element_type() const { return _elem; }
+  BasicType element_basic_type() const { return _elem->array_element_basic_type(); }
+  uint length() const { return _length; }
+  uint length_in_bytes() const {  // element count times the array element size of the basic type
+   return _length * type2aelembytes(element_basic_type());
+  }
+
+  virtual bool eq(const Type *t) const;
+  virtual int  hash() const;             // Type specific hashing
+  virtual bool singleton(void) const;    // TRUE if type is a singleton (currently never)
+  virtual bool empty(void) const;        // TRUE if type is vacuous
+
+  static const TypeVect *make(const BasicType elem_bt, uint length) {
+    // Use bottom primitive type.
+    return make(get_const_basic_type(elem_bt), length);
+  }
+  // Used directly by Replicate nodes to construct singleton vector.
+  static const TypeVect *make(const Type* elem, uint length);
+
+  virtual const Type *xmeet( const Type *t) const;
+  virtual const Type *xdual() const;     // Compute dual right now.
+
+  static const TypeVect *VECTS;  //  32-bit vectors
+  static const TypeVect *VECTD;  //  64-bit vectors
+  static const TypeVect *VECTX;  // 128-bit vectors
+  static const TypeVect *VECTY;  // 256-bit vectors
+
+#ifndef PRODUCT
+  virtual void dump2(Dict &d, uint, outputStream *st) const; // Specialized per-Type dumping
+#endif
+};
+
+class TypeVectS : public TypeVect {  // vector type with base VectorS (32bit)
+  friend class TypeVect;  // constructor is private: only TypeVect can instantiate
+  TypeVectS(const Type* elem, uint length) : TypeVect(VectorS, elem, length) {}
+};
+
+class TypeVectD : public TypeVect {  // vector type with base VectorD (64bit)
+  friend class TypeVect;  // constructor is private: only TypeVect can instantiate
+  TypeVectD(const Type* elem, uint length) : TypeVect(VectorD, elem, length) {}
+};
+
+class TypeVectX : public TypeVect {  // vector type with base VectorX (128bit)
+  friend class TypeVect;  // constructor is private: only TypeVect can instantiate
+  TypeVectX(const Type* elem, uint length) : TypeVect(VectorX, elem, length) {}
+};
+
+class TypeVectY : public TypeVect {  // vector type with base VectorY (256bit)
+  friend class TypeVect;  // constructor is private: only TypeVect can instantiate
+  TypeVectY(const Type* elem, uint length) : TypeVect(VectorY, elem, length) {}
+};
+
 //------------------------------TypePtr----------------------------------------
 // Class of machine Pointer Types: raw data, instances or arrays.
 // If the _base enum is AnyPtr, then this refers to all of the above.
@@ -1113,6 +1187,15 @@
   return (TypeAry*)this;
 }
 
+inline const TypeVect *Type::is_vect() const {
+  assert( _base >= VectorS && _base <= VectorY, "Not a Vector" );  // VectorS..VectorY are consecutive in TYPES
+  return (TypeVect*)this;
+}
+
+inline const TypeVect *Type::isa_vect() const {  // non-asserting variant: NULL when not a vector
+  return (_base >= VectorS && _base <= VectorY) ? (TypeVect*)this : NULL;
+}
+
 inline const TypePtr *Type::is_ptr() const {
   // AnyPtr is the first Ptr and KlassPtr the last, with no non-ptrs between.
   assert(_base >= AnyPtr && _base <= KlassPtr, "Not a pointer");
--- old/src/share/vm/opto/vectornode.cpp	Sat Jun  2 20:04:23 2012
+++ new/src/share/vm/opto/vectornode.cpp	Sat Jun  2 20:04:22 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -28,141 +28,10 @@
 
 //------------------------------VectorNode--------------------------------------
 
-// Return vector type for an element type and vector length.
-const Type* VectorNode::vect_type(BasicType elt_bt, uint len) {
-  assert(len <= VectorNode::max_vlen(elt_bt), "len in range");
-  switch(elt_bt) {
-  case T_BOOLEAN:
-  case T_BYTE:
-    switch(len) {
-    case 2:  return TypeInt::CHAR;
-    case 4:  return TypeInt::INT;
-    case 8:  return TypeLong::LONG;
-    }
-    break;
-  case T_CHAR:
-  case T_SHORT:
-    switch(len) {
-    case 2:  return TypeInt::INT;
-    case 4:  return TypeLong::LONG;
-    }
-    break;
-  case T_INT:
-    switch(len) {
-    case 2:  return TypeLong::LONG;
-    }
-    break;
-  case T_LONG:
-    break;
-  case T_FLOAT:
-    switch(len) {
-    case 2:  return Type::DOUBLE;
-    }
-    break;
-  case T_DOUBLE:
-    break;
-  }
-  ShouldNotReachHere();
-  return NULL;
-}
-
-// Scalar promotion
-VectorNode* VectorNode::scalar2vector(Compile* C, Node* s, uint vlen, const Type* opd_t) {
-  BasicType bt = opd_t->array_element_basic_type();
-  assert(vlen <= VectorNode::max_vlen(bt), "vlen in range");
-  switch (bt) {
-  case T_BOOLEAN:
-  case T_BYTE:
-    if (vlen == 16) return new (C, 2) Replicate16BNode(s);
-    if (vlen ==  8) return new (C, 2) Replicate8BNode(s);
-    if (vlen ==  4) return new (C, 2) Replicate4BNode(s);
-    break;
-  case T_CHAR:
-    if (vlen == 8) return new (C, 2) Replicate8CNode(s);
-    if (vlen == 4) return new (C, 2) Replicate4CNode(s);
-    if (vlen == 2) return new (C, 2) Replicate2CNode(s);
-    break;
-  case T_SHORT:
-    if (vlen == 8) return new (C, 2) Replicate8SNode(s);
-    if (vlen == 4) return new (C, 2) Replicate4SNode(s);
-    if (vlen == 2) return new (C, 2) Replicate2SNode(s);
-    break;
-  case T_INT:
-    if (vlen == 4) return new (C, 2) Replicate4INode(s);
-    if (vlen == 2) return new (C, 2) Replicate2INode(s);
-    break;
-  case T_LONG:
-    if (vlen == 2) return new (C, 2) Replicate2LNode(s);
-    break;
-  case T_FLOAT:
-    if (vlen == 4) return new (C, 2) Replicate4FNode(s);
-    if (vlen == 2) return new (C, 2) Replicate2FNode(s);
-    break;
-  case T_DOUBLE:
-    if (vlen == 2) return new (C, 2) Replicate2DNode(s);
-    break;
-  }
-  ShouldNotReachHere();
-  return NULL;
-}
-
-// Return initial Pack node. Additional operands added with add_opd() calls.
-PackNode* PackNode::make(Compile* C, Node* s, const Type* opd_t) {
-  BasicType bt = opd_t->array_element_basic_type();
-  switch (bt) {
-  case T_BOOLEAN:
-  case T_BYTE:
-    return new (C, 2) PackBNode(s);
-  case T_CHAR:
-    return new (C, 2) PackCNode(s);
-  case T_SHORT:
-    return new (C, 2) PackSNode(s);
-  case T_INT:
-    return new (C, 2) PackINode(s);
-  case T_LONG:
-    return new (C, 2) PackLNode(s);
-  case T_FLOAT:
-    return new (C, 2) PackFNode(s);
-  case T_DOUBLE:
-    return new (C, 2) PackDNode(s);
-  }
-  ShouldNotReachHere();
-  return NULL;
-}
-
-// Create a binary tree form for Packs. [lo, hi) (half-open) range
-Node* PackNode::binaryTreePack(Compile* C, int lo, int hi) {
-  int ct = hi - lo;
-  assert(is_power_of_2(ct), "power of 2");
-  int mid = lo + ct/2;
-  Node* n1 = ct == 2 ? in(lo)   : binaryTreePack(C, lo,  mid);
-  Node* n2 = ct == 2 ? in(lo+1) : binaryTreePack(C, mid, hi );
-  int rslt_bsize = ct * type2aelembytes(elt_basic_type());
-  if (bottom_type()->is_floatingpoint()) {
-    switch (rslt_bsize) {
-    case  8: return new (C, 3) PackFNode(n1, n2);
-    case 16: return new (C, 3) PackDNode(n1, n2);
-    }
-  } else {
-    assert(bottom_type()->isa_int() || bottom_type()->isa_long(), "int or long");
-    switch (rslt_bsize) {
-    case  2: return new (C, 3) Pack2x1BNode(n1, n2);
-    case  4: return new (C, 3) Pack2x2BNode(n1, n2);
-    case  8: return new (C, 3) PackINode(n1, n2);
-    case 16: return new (C, 3) PackLNode(n1, n2);
-    }
-  }
-  ShouldNotReachHere();
-  return NULL;
-}
-
 // Return the vector operator for the specified scalar operation
-// and vector length.  One use is to check if the code generator
+// and vector length.  Also used to check if the code generator
 // supports the vector operation.
-int VectorNode::opcode(int sopc, uint vlen, const Type* opd_t) {
-  BasicType bt = opd_t->array_element_basic_type();
-  if (!(is_power_of_2(vlen) && vlen <= max_vlen(bt)))
-    return 0; // unimplemented
+int VectorNode::opcode(int sopc, uint vlen, BasicType bt) {
   switch (sopc) {
   case Op_AddI:
     switch (bt) {
@@ -221,13 +90,13 @@
     case T_INT:    return Op_LShiftVI;
     }
     ShouldNotReachHere();
-  case Op_URShiftI:
+  case Op_RShiftI:
     switch (bt) {
     case T_BOOLEAN:
-    case T_BYTE:   return Op_URShiftVB;
-    case T_CHAR:   return Op_URShiftVC;
-    case T_SHORT:  return Op_URShiftVS;
-    case T_INT:    return Op_URShiftVI;
+    case T_BYTE:   return Op_RShiftVB;
+    case T_CHAR:   return Op_RShiftVC;
+    case T_SHORT:  return Op_RShiftVS;
+    case T_INT:    return Op_RShiftVI;
     }
     ShouldNotReachHere();
   case Op_AndI:
@@ -247,7 +116,7 @@
   case Op_LoadL:
   case Op_LoadF:
   case Op_LoadD:
-    return VectorLoadNode::opcode(sopc, vlen);
+    return Op_LoadVector;
 
   case Op_StoreB:
   case Op_StoreC:
@@ -255,208 +124,172 @@
   case Op_StoreL:
   case Op_StoreF:
   case Op_StoreD:
-    return VectorStoreNode::opcode(sopc, vlen);
+    return Op_StoreVector;
   }
   return 0; // Unimplemented
 }
 
-// Helper for above.
-int VectorLoadNode::opcode(int sopc, uint vlen) {
-  switch (sopc) {
-  case Op_LoadB:
-    switch (vlen) {
-    case  2:       return 0; // Unimplemented
-    case  4:       return Op_Load4B;
-    case  8:       return Op_Load8B;
-    case 16:       return Op_Load16B;
-    }
-    break;
-  case Op_LoadUS:
-    switch (vlen) {
-    case  2:       return Op_Load2C;
-    case  4:       return Op_Load4C;
-    case  8:       return Op_Load8C;
-    }
-    break;
-  case Op_LoadS:
-    switch (vlen) {
-    case  2:       return Op_Load2S;
-    case  4:       return Op_Load4S;
-    case  8:       return Op_Load8S;
-    }
-    break;
-  case Op_LoadI:
-    switch (vlen) {
-    case  2:       return Op_Load2I;
-    case  4:       return Op_Load4I;
-    }
-    break;
-  case Op_LoadL:
-    if (vlen == 2) return Op_Load2L;
-    break;
-  case Op_LoadF:
-    switch (vlen) {
-    case  2:       return Op_Load2F;
-    case  4:       return Op_Load4F;
-    }
-    break;
-  case Op_LoadD:
-    if (vlen == 2) return Op_Load2D;
-    break;
+bool VectorNode::implemented(int opc, uint vlen, BasicType bt) {
+  if (is_java_primitive(bt) &&
+      (vlen > 1) && is_power_of_2(vlen) &&
+      Matcher::vector_size_supported(bt, vlen)) {
+    int vopc = VectorNode::opcode(opc, vlen, bt);
+    return vopc > 0 && Matcher::has_match_rule(vopc);
   }
-  return 0; // Unimplemented
+  return false;
 }
 
-// Helper for above
-int VectorStoreNode::opcode(int sopc, uint vlen) {
-  switch (sopc) {
-  case Op_StoreB:
-    switch (vlen) {
-    case  2:       return 0; // Unimplemented
-    case  4:       return Op_Store4B;
-    case  8:       return Op_Store8B;
-    case 16:       return Op_Store16B;
-    }
-    break;
-  case Op_StoreC:
-    switch (vlen) {
-    case  2:       return Op_Store2C;
-    case  4:       return Op_Store4C;
-    case  8:       return Op_Store8C;
-    }
-    break;
-  case Op_StoreI:
-    switch (vlen) {
-    case  2:       return Op_Store2I;
-    case  4:       return Op_Store4I;
-    }
-    break;
-  case Op_StoreL:
-    if (vlen == 2) return Op_Store2L;
-    break;
-  case Op_StoreF:
-    switch (vlen) {
-    case  2:       return Op_Store2F;
-    case  4:       return Op_Store4F;
-    }
-    break;
-  case Op_StoreD:
-    if (vlen == 2) return Op_Store2D;
-    break;
-  }
-  return 0; // Unimplemented
-}
-
 // Return the vector version of a scalar operation node.
-VectorNode* VectorNode::make(Compile* C, int sopc, Node* n1, Node* n2, uint vlen, const Type* opd_t) {
-  int vopc = opcode(sopc, vlen, opd_t);
+VectorNode* VectorNode::make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, BasicType bt) {
+  const TypeVect* vt = TypeVect::make(bt, vlen);
+  int vopc = VectorNode::opcode(opc, vlen, bt);
 
   switch (vopc) {
-  case Op_AddVB: return new (C, 3) AddVBNode(n1, n2, vlen);
-  case Op_AddVC: return new (C, 3) AddVCNode(n1, n2, vlen);
-  case Op_AddVS: return new (C, 3) AddVSNode(n1, n2, vlen);
-  case Op_AddVI: return new (C, 3) AddVINode(n1, n2, vlen);
-  case Op_AddVL: return new (C, 3) AddVLNode(n1, n2, vlen);
-  case Op_AddVF: return new (C, 3) AddVFNode(n1, n2, vlen);
-  case Op_AddVD: return new (C, 3) AddVDNode(n1, n2, vlen);
+  case Op_AddVB: return new (C, 3) AddVBNode(n1, n2, vt);
+  case Op_AddVC: return new (C, 3) AddVCNode(n1, n2, vt);
+  case Op_AddVS: return new (C, 3) AddVSNode(n1, n2, vt);
+  case Op_AddVI: return new (C, 3) AddVINode(n1, n2, vt);
+  case Op_AddVL: return new (C, 3) AddVLNode(n1, n2, vt);
+  case Op_AddVF: return new (C, 3) AddVFNode(n1, n2, vt);
+  case Op_AddVD: return new (C, 3) AddVDNode(n1, n2, vt);
 
-  case Op_SubVB: return new (C, 3) SubVBNode(n1, n2, vlen);
-  case Op_SubVC: return new (C, 3) SubVCNode(n1, n2, vlen);
-  case Op_SubVS: return new (C, 3) SubVSNode(n1, n2, vlen);
-  case Op_SubVI: return new (C, 3) SubVINode(n1, n2, vlen);
-  case Op_SubVL: return new (C, 3) SubVLNode(n1, n2, vlen);
-  case Op_SubVF: return new (C, 3) SubVFNode(n1, n2, vlen);
-  case Op_SubVD: return new (C, 3) SubVDNode(n1, n2, vlen);
+  case Op_SubVB: return new (C, 3) SubVBNode(n1, n2, vt);
+  case Op_SubVC: return new (C, 3) SubVCNode(n1, n2, vt);
+  case Op_SubVS: return new (C, 3) SubVSNode(n1, n2, vt);
+  case Op_SubVI: return new (C, 3) SubVINode(n1, n2, vt);
+  case Op_SubVL: return new (C, 3) SubVLNode(n1, n2, vt);
+  case Op_SubVF: return new (C, 3) SubVFNode(n1, n2, vt);
+  case Op_SubVD: return new (C, 3) SubVDNode(n1, n2, vt);
 
-  case Op_MulVF: return new (C, 3) MulVFNode(n1, n2, vlen);
-  case Op_MulVD: return new (C, 3) MulVDNode(n1, n2, vlen);
+  case Op_MulVF: return new (C, 3) MulVFNode(n1, n2, vt);
+  case Op_MulVD: return new (C, 3) MulVDNode(n1, n2, vt);
 
-  case Op_DivVF: return new (C, 3) DivVFNode(n1, n2, vlen);
-  case Op_DivVD: return new (C, 3) DivVDNode(n1, n2, vlen);
+  case Op_DivVF: return new (C, 3) DivVFNode(n1, n2, vt);
+  case Op_DivVD: return new (C, 3) DivVDNode(n1, n2, vt);
 
-  case Op_LShiftVB: return new (C, 3) LShiftVBNode(n1, n2, vlen);
-  case Op_LShiftVC: return new (C, 3) LShiftVCNode(n1, n2, vlen);
-  case Op_LShiftVS: return new (C, 3) LShiftVSNode(n1, n2, vlen);
-  case Op_LShiftVI: return new (C, 3) LShiftVINode(n1, n2, vlen);
+  case Op_LShiftVB: return new (C, 3) LShiftVBNode(n1, n2, vt);
+  case Op_LShiftVC: return new (C, 3) LShiftVCNode(n1, n2, vt);
+  case Op_LShiftVS: return new (C, 3) LShiftVSNode(n1, n2, vt);
+  case Op_LShiftVI: return new (C, 3) LShiftVINode(n1, n2, vt);
 
-  case Op_URShiftVB: return new (C, 3) URShiftVBNode(n1, n2, vlen);
-  case Op_URShiftVC: return new (C, 3) URShiftVCNode(n1, n2, vlen);
-  case Op_URShiftVS: return new (C, 3) URShiftVSNode(n1, n2, vlen);
-  case Op_URShiftVI: return new (C, 3) URShiftVINode(n1, n2, vlen);
+  case Op_RShiftVB: return new (C, 3) RShiftVBNode(n1, n2, vt);
+  case Op_RShiftVC: return new (C, 3) RShiftVCNode(n1, n2, vt);
+  case Op_RShiftVS: return new (C, 3) RShiftVSNode(n1, n2, vt);
+  case Op_RShiftVI: return new (C, 3) RShiftVINode(n1, n2, vt);
 
-  case Op_AndV: return new (C, 3) AndVNode(n1, n2, vlen, opd_t->array_element_basic_type());
-  case Op_OrV:  return new (C, 3) OrVNode (n1, n2, vlen, opd_t->array_element_basic_type());
-  case Op_XorV: return new (C, 3) XorVNode(n1, n2, vlen, opd_t->array_element_basic_type());
+  case Op_AndV: return new (C, 3) AndVNode(n1, n2, vt);
+  case Op_OrV:  return new (C, 3) OrVNode (n1, n2, vt);
+  case Op_XorV: return new (C, 3) XorVNode(n1, n2, vt);
   }
   ShouldNotReachHere();
   return NULL;
+
 }
 
-// Return the vector version of a scalar load node.
-VectorLoadNode* VectorLoadNode::make(Compile* C, int opc, Node* ctl, Node* mem,
-                                     Node* adr, const TypePtr* atyp, uint vlen) {
-  int vopc = opcode(opc, vlen);
+// Scalar promotion: build the Replicate node that turns scalar s into a
+// vlen-element vector of opd_t's element type.
+VectorNode* VectorNode::scalar2vector(Compile* C, Node* s, uint vlen, const Type* opd_t) {
+  const BasicType elem_bt = opd_t->array_element_basic_type();
+  // A constant operand keeps its exact type as the vector's element type;
+  // otherwise the element type is derived from the basic type alone.
+  const TypeVect* vec_type;
+  if (opd_t->singleton()) {
+    vec_type = TypeVect::make(opd_t, vlen);
+  } else {
+    vec_type = TypeVect::make(elem_bt, vlen);
+  }
+  switch (elem_bt) {
+  case T_BOOLEAN:  // booleans are replicated as bytes
+  case T_BYTE:   return new (C, 2) ReplicateBNode(s, vec_type);
+  case T_CHAR:   return new (C, 2) ReplicateCNode(s, vec_type);
+  case T_SHORT:  return new (C, 2) ReplicateSNode(s, vec_type);
+  case T_INT:    return new (C, 2) ReplicateINode(s, vec_type);
+  case T_LONG:   return new (C, 2) ReplicateLNode(s, vec_type);
+  case T_FLOAT:  return new (C, 2) ReplicateFNode(s, vec_type);
+  case T_DOUBLE: return new (C, 2) ReplicateDNode(s, vec_type);
+  }
+  ShouldNotReachHere();
+  return NULL;
+}
 
-  switch(vopc) {
-  case Op_Load16B: return new (C, 3) Load16BNode(ctl, mem, adr, atyp);
-  case Op_Load8B:  return new (C, 3) Load8BNode(ctl, mem, adr, atyp);
-  case Op_Load4B:  return new (C, 3) Load4BNode(ctl, mem, adr, atyp);
+// Return initial Pack node. Additional operands added with add_opd() calls.
+// The node's vector type is TypeVect::make(bt, vlen).
+PackNode* PackNode::make(Compile* C, Node* s, uint vlen, BasicType bt) {
+  const TypeVect* vec_type = TypeVect::make(bt, vlen);
+  // NOTE(review): vlen+1 is presumably the input-slot count (input 0 plus
+  // one slot per element) -- confirm against Node::operator new.
+  const uint slots = vlen + 1;
+  switch (bt) {
+  case T_BOOLEAN:  // booleans are packed as bytes
+  case T_BYTE:   return new (C, slots) PackBNode(s, vec_type);
+  case T_CHAR:   return new (C, slots) PackCNode(s, vec_type);
+  case T_SHORT:  return new (C, slots) PackSNode(s, vec_type);
+  case T_INT:    return new (C, slots) PackINode(s, vec_type);
+  case T_LONG:   return new (C, slots) PackLNode(s, vec_type);
+  case T_FLOAT:  return new (C, slots) PackFNode(s, vec_type);
+  case T_DOUBLE: return new (C, slots) PackDNode(s, vec_type);
+  default:
+    // any other basic type is a caller error, reported below
+    break;
+  }
+  ShouldNotReachHere();
+  return NULL;
+}
 
-  case Op_Load8C:  return new (C, 3) Load8CNode(ctl, mem, adr, atyp);
-  case Op_Load4C:  return new (C, 3) Load4CNode(ctl, mem, adr, atyp);
-  case Op_Load2C:  return new (C, 3) Load2CNode(ctl, mem, adr, atyp);
+// Create a binary tree form for Packs. [lo, hi) (half-open) range
+Node* PackNode::binaryTreePack(Compile* C, int lo, int hi) {
+  int ct = hi - lo;
+  assert(is_power_of_2(ct), "power of 2");
+  if (ct == 2) {
+    PackNode* pk = PackNode::make(C, in(lo), 2, vect_type()->element_basic_type());
+    pk->add_opd(1, in(lo+1));
+    return pk;
 
-  case Op_Load8S:  return new (C, 3) Load8SNode(ctl, mem, adr, atyp);
-  case Op_Load4S:  return new (C, 3) Load4SNode(ctl, mem, adr, atyp);
-  case Op_Load2S:  return new (C, 3) Load2SNode(ctl, mem, adr, atyp);
+  } else {
+    int mid = lo + ct/2;
+    Node* n1 = binaryTreePack(C, lo,  mid);
+    Node* n2 = binaryTreePack(C, mid, hi );
 
-  case Op_Load4I:  return new (C, 3) Load4INode(ctl, mem, adr, atyp);
-  case Op_Load2I:  return new (C, 3) Load2INode(ctl, mem, adr, atyp);
-
-  case Op_Load2L:  return new (C, 3) Load2LNode(ctl, mem, adr, atyp);
-
-  case Op_Load4F:  return new (C, 3) Load4FNode(ctl, mem, adr, atyp);
-  case Op_Load2F:  return new (C, 3) Load2FNode(ctl, mem, adr, atyp);
-
-  case Op_Load2D:  return new (C, 3) Load2DNode(ctl, mem, adr, atyp);
+    BasicType bt = vect_type()->element_basic_type();
+    switch (bt) {
+    case T_BOOLEAN:
+    case T_BYTE:
+      return new (C, 3) PackSNode(n1, n2, TypeVect::make(T_SHORT, 2));
+    case T_CHAR:
+    case T_SHORT:
+      return new (C, 3) PackINode(n1, n2, TypeVect::make(T_INT, 2));
+    case T_INT:
+      return new (C, 3) PackLNode(n1, n2, TypeVect::make(T_LONG, 2));
+    case T_LONG:
+      return new (C, 3) Pack2LNode(n1, n2, TypeVect::make(T_LONG, 2));
+    case T_FLOAT:
+      return new (C, 3) PackDNode(n1, n2, TypeVect::make(T_DOUBLE, 2));
+    case T_DOUBLE:
+      return new (C, 3) Pack2DNode(n1, n2, TypeVect::make(T_DOUBLE, 2));
+    }
+    ShouldNotReachHere();
   }
-  ShouldNotReachHere();
   return NULL;
 }
 
+// Return the vector version of a scalar load node.
+// opc is not consulted: one LoadVectorNode class covers every element type and length via vt.
+LoadVectorNode* LoadVectorNode::make(Compile* C, int opc, Node* ctl, Node* mem,
+                                     Node* adr, const TypePtr* atyp, uint vlen, BasicType bt) {
+  const TypeVect* vt = TypeVect::make(bt, vlen);
+  return new (C, 3) LoadVectorNode(ctl, mem, adr, atyp, vt);
+}
+
 // Return the vector version of a scalar store node.
-VectorStoreNode* VectorStoreNode::make(Compile* C, int opc, Node* ctl, Node* mem,
+StoreVectorNode* StoreVectorNode::make(Compile* C, int opc, Node* ctl, Node* mem,
                                        Node* adr, const TypePtr* atyp, Node* val,
                                        uint vlen) {
-  int vopc = opcode(opc, vlen);
-
-  switch(vopc) {
-  case Op_Store16B: return new (C, 4) Store16BNode(ctl, mem, adr, atyp, val);
-  case Op_Store8B: return new (C, 4) Store8BNode(ctl, mem, adr, atyp, val);
-  case Op_Store4B: return new (C, 4) Store4BNode(ctl, mem, adr, atyp, val);
-
-  case Op_Store8C: return new (C, 4) Store8CNode(ctl, mem, adr, atyp, val);
-  case Op_Store4C: return new (C, 4) Store4CNode(ctl, mem, adr, atyp, val);
-  case Op_Store2C: return new (C, 4) Store2CNode(ctl, mem, adr, atyp, val);
-
-  case Op_Store4I: return new (C, 4) Store4INode(ctl, mem, adr, atyp, val);
-  case Op_Store2I: return new (C, 4) Store2INode(ctl, mem, adr, atyp, val);
-
-  case Op_Store2L: return new (C, 4) Store2LNode(ctl, mem, adr, atyp, val);
-
-  case Op_Store4F: return new (C, 4) Store4FNode(ctl, mem, adr, atyp, val);
-  case Op_Store2F: return new (C, 4) Store2FNode(ctl, mem, adr, atyp, val);
-
-  case Op_Store2D: return new (C, 4) Store2DNode(ctl, mem, adr, atyp, val);
-  }
-  ShouldNotReachHere();
-  return NULL;
+  return new (C, 4) StoreVectorNode(ctl, mem, adr, atyp, val);
 }
 
 // Extract a scalar element of vector.
-Node* ExtractNode::make(Compile* C, Node* v, uint position, const Type* opd_t) {
-  BasicType bt = opd_t->array_element_basic_type();
-  assert(position < VectorNode::max_vlen(bt), "pos in range");
+Node* ExtractNode::make(Compile* C, Node* v, uint position, BasicType bt) {
+  assert((int)position < Matcher::max_vector_size(bt), "pos in range");
   ConINode* pos = ConINode::make(C, (int)position);
   switch (bt) {
   case T_BOOLEAN:
@@ -478,3 +311,4 @@
   ShouldNotReachHere();
   return NULL;
 }
+
--- old/src/share/vm/opto/vectornode.hpp	Sat Jun  2 20:04:23 2012
+++ new/src/share/vm/opto/vectornode.hpp	Sat Jun  2 20:04:23 2012
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -31,49 +31,33 @@
 
 //------------------------------VectorNode--------------------------------------
 // Vector Operation
-class VectorNode : public Node {
-  virtual uint size_of() const { return sizeof(*this); }
- protected:
-  uint _length; // vector length
-  virtual BasicType elt_basic_type() const = 0; // Vector element basic type
-
-  static const Type* vect_type(BasicType elt_bt, uint len);
-  static const Type* vect_type(const Type* elt_type, uint len) {
-    return vect_type(elt_type->array_element_basic_type(), len);
-  }
-
+class VectorNode : public TypeNode {
  public:
-  friend class VectorLoadNode;  // For vect_type
-  friend class VectorStoreNode; // ditto.
 
-  VectorNode(Node* n1, uint vlen) : Node(NULL, n1), _length(vlen) {
+  VectorNode(Node* n1, const TypeVect* vt) : TypeNode(vt, 2) {
     init_class_id(Class_Vector);
+    init_req(1, n1);
   }
-  VectorNode(Node* n1, Node* n2, uint vlen) : Node(NULL, n1, n2), _length(vlen) {
+  VectorNode(Node* n1, Node* n2, const TypeVect* vt) : TypeNode(vt, 3) {
     init_class_id(Class_Vector);
+    init_req(1, n1);
+    init_req(2, n2);
   }
-  virtual int Opcode() const;
 
-  uint length() const { return _length; } // Vector length
+  const TypeVect* vect_type() const { return type()->is_vect(); }
+  uint length() const { return vect_type()->length(); } // Vector length
 
-  static uint max_vlen(BasicType bt) { // max vector length
-    return (uint)(Matcher::vector_width_in_bytes() / type2aelembytes(bt));
-  }
+  virtual int Opcode() const;
 
-  // Element and vector type
-  const Type* elt_type()  const { return Type::get_const_basic_type(elt_basic_type()); }
-  const Type* vect_type() const { return vect_type(elt_basic_type(), length()); }
+  virtual uint ideal_reg() const { return Matcher::vector_ideal_reg(vect_type()->length_in_bytes()); }
 
-  virtual const Type *bottom_type() const { return vect_type(); }
-  virtual uint        ideal_reg()   const { return Matcher::vector_ideal_reg(); }
-
-  // Vector opcode from scalar opcode
-  static int opcode(int sopc, uint vlen, const Type* opd_t);
-
   static VectorNode* scalar2vector(Compile* C, Node* s, uint vlen, const Type* opd_t);
 
-  static VectorNode* make(Compile* C, int sopc, Node* n1, Node* n2, uint vlen, const Type* elt_t);
+  static VectorNode* make(Compile* C, int opc, Node* n1, Node* n2, uint vlen, BasicType bt);
 
+  static int  opcode(int opc, uint vlen, BasicType bt);
+  static bool implemented(int opc, uint vlen, BasicType bt);
+
 };
 
 //===========================Vector=ALU=Operations====================================
@@ -81,10 +65,8 @@
 //------------------------------AddVBNode---------------------------------------
 // Vector add byte
 class AddVBNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
  public:
-  AddVBNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  AddVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -91,10 +73,8 @@
 //------------------------------AddVCNode---------------------------------------
 // Vector add char
 class AddVCNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
  public:
-  AddVCNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  AddVCNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -101,10 +81,8 @@
 //------------------------------AddVSNode---------------------------------------
 // Vector add short
 class AddVSNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
  public:
-  AddVSNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  AddVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -111,10 +89,8 @@
 //------------------------------AddVINode---------------------------------------
 // Vector add int
 class AddVINode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
  public:
-  AddVINode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  AddVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -121,10 +97,8 @@
 //------------------------------AddVLNode---------------------------------------
 // Vector add long
 class AddVLNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_LONG; }
  public:
-  AddVLNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  AddVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -131,10 +105,8 @@
 //------------------------------AddVFNode---------------------------------------
 // Vector add float
 class AddVFNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
  public:
-  AddVFNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  AddVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -141,10 +113,8 @@
 //------------------------------AddVDNode---------------------------------------
 // Vector add double
 class AddVDNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_DOUBLE; }
  public:
-  AddVDNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  AddVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -151,10 +121,8 @@
 //------------------------------SubVBNode---------------------------------------
 // Vector subtract byte
 class SubVBNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
  public:
-  SubVBNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  SubVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -161,10 +129,8 @@
 //------------------------------SubVCNode---------------------------------------
 // Vector subtract char
 class SubVCNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
  public:
-  SubVCNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  SubVCNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -171,10 +137,8 @@
 //------------------------------SubVSNode---------------------------------------
 // Vector subtract short
 class SubVSNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
  public:
-  SubVSNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  SubVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -181,10 +145,8 @@
 //------------------------------SubVINode---------------------------------------
 // Vector subtract int
 class SubVINode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
  public:
-  SubVINode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  SubVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -191,10 +153,8 @@
 //------------------------------SubVLNode---------------------------------------
 // Vector subtract long
 class SubVLNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_LONG; }
  public:
-  SubVLNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  SubVLNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -201,10 +161,8 @@
 //------------------------------SubVFNode---------------------------------------
 // Vector subtract float
 class SubVFNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
  public:
-  SubVFNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  SubVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -211,10 +169,8 @@
 //------------------------------SubVDNode---------------------------------------
 // Vector subtract double
 class SubVDNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_DOUBLE; }
  public:
-  SubVDNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  SubVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -221,10 +177,8 @@
 //------------------------------MulVFNode---------------------------------------
 // Vector multiply float
 class MulVFNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
  public:
-  MulVFNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  MulVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -231,10 +185,8 @@
 //------------------------------MulVDNode---------------------------------------
 // Vector multiply double
 class MulVDNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_DOUBLE; }
  public:
-  MulVDNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  MulVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -241,10 +193,8 @@
 //------------------------------DivVFNode---------------------------------------
 // Vector divide float
 class DivVFNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
  public:
-  DivVFNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  DivVFNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -251,10 +201,8 @@
 //------------------------------DivVDNode---------------------------------------
 // Vector Divide double
 class DivVDNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_DOUBLE; }
  public:
-  DivVDNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  DivVDNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -261,10 +209,8 @@
 //------------------------------LShiftVBNode---------------------------------------
 // Vector lshift byte
 class LShiftVBNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
  public:
-  LShiftVBNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  LShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -271,10 +217,8 @@
 //------------------------------LShiftVCNode---------------------------------------
 // Vector lshift chars
 class LShiftVCNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
  public:
-  LShiftVCNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  LShiftVCNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -281,10 +225,8 @@
 //------------------------------LShiftVSNode---------------------------------------
 // Vector lshift shorts
 class LShiftVSNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
  public:
-  LShiftVSNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  LShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -291,50 +233,40 @@
 //------------------------------LShiftVINode---------------------------------------
 // Vector lshift ints
 class LShiftVINode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
  public:
-  LShiftVINode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  LShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
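-//------------------------------URShiftVBNode---------------------------------------
-// Vector urshift bytes
+//------------------------------RShiftVBNode----------------------------------------
+// Vector rshift bytes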
-class URShiftVBNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
+class RShiftVBNode : public VectorNode {
  public:
-  URShiftVBNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  RShiftVBNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
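-//------------------------------URShiftVCNode---------------------------------------
-// Vector urshift char
+//------------------------------RShiftVCNode----------------------------------------
+// Vector rshift chars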
-class URShiftVCNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
+class RShiftVCNode : public VectorNode {
  public:
-  URShiftVCNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  RShiftVCNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
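-//------------------------------URShiftVSNode---------------------------------------
-// Vector urshift shorts
+//------------------------------RShiftVSNode----------------------------------------
+// Vector rshift shorts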
-class URShiftVSNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
+class RShiftVSNode : public VectorNode {
  public:
-  URShiftVSNode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  RShiftVSNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
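-//------------------------------URShiftVINode---------------------------------------
-// Vector urshift ints
+//------------------------------RShiftVINode----------------------------------------
+// Vector rshift ints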
-class URShiftVINode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
+class RShiftVINode : public VectorNode {
  public:
-  URShiftVINode(Node* in1, Node* in2, uint vlen) : VectorNode(in1,in2,vlen) {}
+  RShiftVINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -341,11 +273,8 @@
 //------------------------------AndVNode---------------------------------------
 // Vector and
 class AndVNode : public VectorNode {
- protected:
-  BasicType _bt;
-  virtual BasicType elt_basic_type() const { return _bt; }
  public:
-  AndVNode(Node* in1, Node* in2, uint vlen, BasicType bt) : VectorNode(in1,in2,vlen), _bt(bt) {}
+  AndVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -352,11 +281,8 @@
 //------------------------------OrVNode---------------------------------------
 // Vector or
 class OrVNode : public VectorNode {
- protected:
-  BasicType _bt;
-  virtual BasicType elt_basic_type() const { return _bt; }
  public:
-  OrVNode(Node* in1, Node* in2, uint vlen, BasicType bt) : VectorNode(in1,in2,vlen), _bt(bt) {}
+  OrVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
@@ -363,607 +289,145 @@
 //------------------------------XorVNode---------------------------------------
 // Vector xor
 class XorVNode : public VectorNode {
- protected:
-  BasicType _bt;
-  virtual BasicType elt_basic_type() const { return _bt; }
  public:
-  XorVNode(Node* in1, Node* in2, uint vlen, BasicType bt) : VectorNode(in1,in2,vlen), _bt(bt) {}
+  XorVNode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1,in2,vt) {}
   virtual int Opcode() const;
 };
 
-//================================= M E M O R Y ==================================
+//================================= M E M O R Y ===============================
 
-
-//------------------------------VectorLoadNode--------------------------------------
-// Vector Load from memory
-class VectorLoadNode : public LoadNode {
-  virtual uint size_of() const { return sizeof(*this); }
-
- protected:
-  virtual BasicType elt_basic_type()  const = 0; // Vector element basic type
-  // For use in constructor
-  static const Type* vect_type(const Type* elt_type, uint len) {
-    return VectorNode::vect_type(elt_type, len);
-  }
-
+//------------------------------LoadVectorNode---------------------------------
+// Load Vector from memory
+class LoadVectorNode : public LoadNode {
  public:
-  VectorLoadNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const Type *rt)
-    : LoadNode(c,mem,adr,at,rt) {
-    init_class_id(Class_VectorLoad);
+  LoadVectorNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt)
+    : LoadNode(c, mem, adr, at, vt) {
+    init_class_id(Class_LoadVector);
   }
-  virtual int Opcode() const;
 
-  virtual uint  length() const = 0; // Vector length
+  const TypeVect* vect_type() const { return type()->is_vect(); }
+  uint length() const { return vect_type()->length(); } // Vector length
 
-  // Element and vector type
-  const Type* elt_type()  const { return Type::get_const_basic_type(elt_basic_type()); }
-  const Type* vect_type() const { return VectorNode::vect_type(elt_basic_type(), length()); }
+  virtual int Opcode() const;
 
-  virtual uint ideal_reg() const  { return Matcher::vector_ideal_reg(); }
+  virtual uint ideal_reg() const  { return Matcher::vector_ideal_reg(memory_size()); }
   virtual BasicType memory_type() const { return T_VOID; }
-  virtual int memory_size() const { return length()*type2aelembytes(elt_basic_type()); }
+  virtual int memory_size() const { return vect_type()->length_in_bytes(); }
 
-  // Vector opcode from scalar opcode
-  static int opcode(int sopc, uint vlen);
+  virtual int store_Opcode() const { return Op_StoreVector; }
 
-  static VectorLoadNode* make(Compile* C, int opc, Node* ctl, Node* mem,
-                              Node* adr, const TypePtr* atyp, uint vlen);
+  static LoadVectorNode* make(Compile* C, int opc, Node* ctl, Node* mem,
+                              Node* adr, const TypePtr* atyp, uint vlen, BasicType bt);
 };
 
-//------------------------------Load16BNode--------------------------------------
-// Vector load of 16 bytes (8bits signed) from memory
-class Load16BNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
+//------------------------------StoreVectorNode--------------------------------
+// Store Vector to memory
+class StoreVectorNode : public StoreNode {
  public:
-  Load16BNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::BYTE)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,16)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store16B; }
-  virtual uint length() const { return 16; }
-};
+  StoreVectorNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
+    : StoreNode(c, mem, adr, at, val) {
+    assert(val->is_Vector() || val->is_LoadVector(), "sanity");
+    init_class_id(Class_StoreVector);
+  }
 
-//------------------------------Load8BNode--------------------------------------
-// Vector load of 8 bytes (8bits signed) from memory
-class Load8BNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
- public:
-  Load8BNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::BYTE)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,8)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store8B; }
-  virtual uint length() const { return 8; }
-};
+  const TypeVect* vect_type() const { return in(MemNode::ValueIn)->bottom_type()->is_vect(); }
+  uint length() const { return vect_type()->length(); } // Vector length
 
-//------------------------------Load4BNode--------------------------------------
-// Vector load of 4 bytes (8bits signed) from memory
-class Load4BNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
- public:
-  Load4BNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::BYTE)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,4)) {}
   virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store4B; }
-  virtual uint length() const { return 4; }
-};
 
-//------------------------------Load8CNode--------------------------------------
-// Vector load of 8 chars (16bits unsigned) from memory
-class Load8CNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
- public:
-  Load8CNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::CHAR)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,8)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store8C; }
-  virtual uint length() const { return 8; }
-};
-
-//------------------------------Load4CNode--------------------------------------
-// Vector load of 4 chars (16bits unsigned) from memory
-class Load4CNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
- public:
-  Load4CNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::CHAR)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,4)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store4C; }
-  virtual uint length() const { return 4; }
-};
-
-//------------------------------Load2CNode--------------------------------------
-// Vector load of 2 chars (16bits unsigned) from memory
-class Load2CNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
- public:
-  Load2CNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::CHAR)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,2)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store2C; }
-  virtual uint length() const { return 2; }
-};
-
-//------------------------------Load8SNode--------------------------------------
-// Vector load of 8 shorts (16bits signed) from memory
-class Load8SNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
- public:
-  Load8SNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::SHORT)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,8)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store8C; }
-  virtual uint length() const { return 8; }
-};
-
-//------------------------------Load4SNode--------------------------------------
-// Vector load of 4 shorts (16bits signed) from memory
-class Load4SNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
- public:
-  Load4SNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::SHORT)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,4)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store4C; }
-  virtual uint length() const { return 4; }
-};
-
-//------------------------------Load2SNode--------------------------------------
-// Vector load of 2 shorts (16bits signed) from memory
-class Load2SNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
- public:
-  Load2SNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::SHORT)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,2)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store2C; }
-  virtual uint length() const { return 2; }
-};
-
-//------------------------------Load4INode--------------------------------------
-// Vector load of 4 integers (32bits signed) from memory
-class Load4INode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
- public:
-  Load4INode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::INT)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,4)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store4I; }
-  virtual uint length() const { return 4; }
-};
-
-//------------------------------Load2INode--------------------------------------
-// Vector load of 2 integers (32bits signed) from memory
-class Load2INode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
- public:
-  Load2INode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeInt *ti = TypeInt::INT)
-    : VectorLoadNode(c,mem,adr,at,vect_type(ti,2)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store2I; }
-  virtual uint length() const { return 2; }
-};
-
-//------------------------------Load2LNode--------------------------------------
-// Vector load of 2 longs (64bits signed) from memory
-class Load2LNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_LONG; }
- public:
-  Load2LNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeLong *tl = TypeLong::LONG)
-    : VectorLoadNode(c,mem,adr,at,vect_type(tl,2)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store2L; }
-  virtual uint length() const { return 2; }
-};
-
-//------------------------------Load4FNode--------------------------------------
-// Vector load of 4 floats (32bits) from memory
-class Load4FNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
- public:
-  Load4FNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const Type *t = Type::FLOAT)
-    : VectorLoadNode(c,mem,adr,at,vect_type(t,4)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store4F; }
-  virtual uint length() const { return 4; }
-};
-
-//------------------------------Load2FNode--------------------------------------
-// Vector load of 2 floats (32bits) from memory
-class Load2FNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
- public:
-  Load2FNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const Type *t = Type::FLOAT)
-    : VectorLoadNode(c,mem,adr,at,vect_type(t,2)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store2F; }
-  virtual uint length() const { return 2; }
-};
-
-//------------------------------Load2DNode--------------------------------------
-// Vector load of 2 doubles (64bits) from memory
-class Load2DNode : public VectorLoadNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_DOUBLE; }
- public:
-  Load2DNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const Type *t = Type::DOUBLE)
-    : VectorLoadNode(c,mem,adr,at,vect_type(t,2)) {}
-  virtual int Opcode() const;
-  virtual int store_Opcode() const { return Op_Store2D; }
-  virtual uint length() const { return 2; }
-};
-
-
-//------------------------------VectorStoreNode--------------------------------------
-// Vector Store to memory
-class VectorStoreNode : public StoreNode {
-  virtual uint size_of() const { return sizeof(*this); }
-
- protected:
-  virtual BasicType elt_basic_type()  const = 0; // Vector element basic type
-
- public:
-  VectorStoreNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : StoreNode(c,mem,adr,at,val) {
-    init_class_id(Class_VectorStore);
-  }
-  virtual int Opcode() const;
-
-  virtual uint  length() const = 0; // Vector length
-
-  // Element and vector type
-  const Type* elt_type()  const { return Type::get_const_basic_type(elt_basic_type()); }
-  const Type* vect_type() const { return VectorNode::vect_type(elt_basic_type(), length()); }
-
-  virtual uint ideal_reg() const  { return Matcher::vector_ideal_reg(); }
+  virtual uint ideal_reg() const  { return Matcher::vector_ideal_reg(memory_size()); }
   virtual BasicType memory_type() const { return T_VOID; }
-  virtual int memory_size() const { return length()*type2aelembytes(elt_basic_type()); }
+  virtual int memory_size() const { return vect_type()->length_in_bytes(); }
 
-  // Vector opcode from scalar opcode
-  static int opcode(int sopc, uint vlen);
-
-  static VectorStoreNode* make(Compile* C, int opc, Node* ctl, Node* mem,
+  static StoreVectorNode* make(Compile* C, int opc, Node* ctl, Node* mem,
                                Node* adr, const TypePtr* atyp, Node* val,
                                uint vlen);
 };
 
-//------------------------------Store16BNode--------------------------------------
-// Vector store of 16 bytes (8bits signed) to memory
-class Store16BNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
- public:
-  Store16BNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
-  virtual int Opcode() const;
-  virtual uint length() const { return 16; }
-};
 
-//------------------------------Store8BNode--------------------------------------
-// Vector store of 8 bytes (8bits signed) to memory
-class Store8BNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
- public:
-  Store8BNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
-  virtual int Opcode() const;
-  virtual uint length() const { return 8; }
-};
+//=========================Promote_Scalar_to_Vector============================
 
-//------------------------------Store4BNode--------------------------------------
-// Vector store of 4 bytes (8bits signed) to memory
-class Store4BNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
+//------------------------------ReplicateBNode---------------------------------
+// Replicate byte scalar to be vector
+class ReplicateBNode : public VectorNode {
  public:
-  Store4BNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
+  ReplicateBNode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
   virtual int Opcode() const;
-  virtual uint length() const { return 4; }
 };
 
-//------------------------------Store8CNode--------------------------------------
-// Vector store of 8 chars (16bits signed/unsigned) to memory
-class Store8CNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
+//------------------------------ReplicateCNode---------------------------------
+// Replicate char scalar to be vector
+class ReplicateCNode : public VectorNode {
  public:
-  Store8CNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
+  ReplicateCNode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
   virtual int Opcode() const;
-  virtual uint length() const { return 8; }
 };
 
-//------------------------------Store4CNode--------------------------------------
-// Vector store of 4 chars (16bits signed/unsigned) to memory
-class Store4CNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
+//------------------------------ReplicateSNode---------------------------------
+// Replicate a short scalar (in1) into a vector described by the TypeVect vt
+class ReplicateSNode : public VectorNode {
  public:
-  Store4CNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
+  ReplicateSNode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
   virtual int Opcode() const;
-  virtual uint length() const { return 4; }
 };
 
-//------------------------------Store2CNode--------------------------------------
-// Vector store of 2 chars (16bits signed/unsigned) to memory
-class Store2CNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
+//------------------------------ReplicateINode---------------------------------
+// Replicate an int scalar (in1) into a vector described by the TypeVect vt
+class ReplicateINode : public VectorNode {
  public:
-  Store2CNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
+  ReplicateINode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
   virtual int Opcode() const;
-  virtual uint length() const { return 2; }
 };
 
-//------------------------------Store4INode--------------------------------------
-// Vector store of 4 integers (32bits signed) to memory
-class Store4INode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
+//------------------------------ReplicateLNode---------------------------------
+// Replicate a long scalar (in1) into a vector described by the TypeVect vt
+class ReplicateLNode : public VectorNode {
  public:
-  Store4INode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
+  ReplicateLNode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
   virtual int Opcode() const;
-  virtual uint length() const { return 4; }
 };
 
-//------------------------------Store2INode--------------------------------------
-// Vector store of 2 integers (32bits signed) to memory
-class Store2INode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
+//------------------------------ReplicateFNode---------------------------------
+// Replicate a float scalar (in1) into a vector described by the TypeVect vt
+class ReplicateFNode : public VectorNode {
  public:
-  Store2INode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
+  ReplicateFNode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
   virtual int Opcode() const;
-  virtual uint length() const { return 2; }
 };
 
-//------------------------------Store2LNode--------------------------------------
-// Vector store of 2 longs (64bits signed) to memory
-class Store2LNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_LONG; }
+//------------------------------ReplicateDNode---------------------------------
+// Replicate a double scalar (in1) into a vector described by the TypeVect vt
+class ReplicateDNode : public VectorNode {
  public:
-  Store2LNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
+  ReplicateDNode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
   virtual int Opcode() const;
-  virtual uint length() const { return 2; }
 };
 
-//------------------------------Store4FNode--------------------------------------
-// Vector store of 4 floats (32bits) to memory
-class Store4FNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
- public:
-  Store4FNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
-  virtual int Opcode() const;
-  virtual uint length() const { return 4; }
-};
+//========================Pack_Scalars_into_a_Vector===========================
 
-//------------------------------Store2FNode--------------------------------------
-// Vector store of 2 floats (32bits) to memory
-class Store2FNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
- public:
-  Store2FNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
-  virtual int Opcode() const;
-  virtual uint length() const { return 2; }
-};
-
-//------------------------------Store2DNode--------------------------------------
-// Vector store of 2 doubles (64bits) to memory
-class Store2DNode : public VectorStoreNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_DOUBLE; }
- public:
-  Store2DNode(Node* c, Node* mem, Node* adr, const TypePtr* at, Node* val)
-    : VectorStoreNode(c,mem,adr,at,val) {}
-  virtual int Opcode() const;
-  virtual uint length() const { return 2; }
-};
-
-//=========================Promote_Scalar_to_Vector====================================
-
-//------------------------------Replicate16BNode---------------------------------------
-// Replicate byte scalar to be vector of 16 bytes
-class Replicate16BNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
- public:
-  Replicate16BNode(Node* in1) : VectorNode(in1, 16) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate8BNode---------------------------------------
-// Replicate byte scalar to be vector of 8 bytes
-class Replicate8BNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
- public:
-  Replicate8BNode(Node* in1) : VectorNode(in1, 8) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate4BNode---------------------------------------
-// Replicate byte scalar to be vector of 4 bytes
-class Replicate4BNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
- public:
-  Replicate4BNode(Node* in1) : VectorNode(in1, 4) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate8CNode---------------------------------------
-// Replicate char scalar to be vector of 8 chars
-class Replicate8CNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
- public:
-  Replicate8CNode(Node* in1) : VectorNode(in1, 8) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate4CNode---------------------------------------
-// Replicate char scalar to be vector of 4 chars
-class Replicate4CNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
- public:
-  Replicate4CNode(Node* in1) : VectorNode(in1, 4) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate2CNode---------------------------------------
-// Replicate char scalar to be vector of 2 chars
-class Replicate2CNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
- public:
-  Replicate2CNode(Node* in1) : VectorNode(in1, 2) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate8SNode---------------------------------------
-// Replicate short scalar to be vector of 8 shorts
-class Replicate8SNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
- public:
-  Replicate8SNode(Node* in1) : VectorNode(in1, 8) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate4SNode---------------------------------------
-// Replicate short scalar to be vector of 4 shorts
-class Replicate4SNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
- public:
-  Replicate4SNode(Node* in1) : VectorNode(in1, 4) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate2SNode---------------------------------------
-// Replicate short scalar to be vector of 2 shorts
-class Replicate2SNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
- public:
-  Replicate2SNode(Node* in1) : VectorNode(in1, 2) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate4INode---------------------------------------
-// Replicate int scalar to be vector of 4 ints
-class Replicate4INode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
- public:
-  Replicate4INode(Node* in1) : VectorNode(in1, 4) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate2INode---------------------------------------
-// Replicate int scalar to be vector of 2 ints
-class Replicate2INode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
- public:
-  Replicate2INode(Node* in1) : VectorNode(in1, 2) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate2LNode---------------------------------------
-// Replicate long scalar to be vector of 2 longs
-class Replicate2LNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_LONG; }
- public:
-  Replicate2LNode(Node* in1) : VectorNode(in1, 2) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate4FNode---------------------------------------
-// Replicate float scalar to be vector of 4 floats
-class Replicate4FNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
- public:
-  Replicate4FNode(Node* in1) : VectorNode(in1, 4) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate2FNode---------------------------------------
-// Replicate float scalar to be vector of 2 floats
-class Replicate2FNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
- public:
-  Replicate2FNode(Node* in1) : VectorNode(in1, 2) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------Replicate2DNode---------------------------------------
-// Replicate double scalar to be vector of 2 doubles
-class Replicate2DNode : public VectorNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_DOUBLE; }
- public:
-  Replicate2DNode(Node* in1) : VectorNode(in1, 2) {}
-  virtual int Opcode() const;
-};
-
-//========================Pack_Scalars_into_a_Vector==============================
-
 //------------------------------PackNode---------------------------------------
 // Pack parent class (not for code generation).
 class PackNode : public VectorNode {
  public:
-  PackNode(Node* in1)  : VectorNode(in1, 1) {}
-  PackNode(Node* in1, Node* n2)  : VectorNode(in1, n2, 2) {}
+  PackNode(Node* in1, const TypeVect* vt) : VectorNode(in1, vt) {}
+  PackNode(Node* in1, Node* n2, const TypeVect* vt) : VectorNode(in1, n2, vt) {}
   virtual int Opcode() const;
 
-  void add_opd(Node* n) {
-    add_req(n);
-    _length++;
-    assert(_length == req() - 1, "vector length matches edge count");
+  void add_opd(uint i, Node* n) {
+    init_req(i+1, n);  // operand i is stored at input edge i+1
   }
 
   // Create a binary tree form for Packs. [lo, hi) (half-open) range
   Node* binaryTreePack(Compile* C, int lo, int hi);
 
-  static PackNode* make(Compile* C, Node* s, const Type* elt_t);
+  static PackNode* make(Compile* C, Node* s, uint vlen, BasicType bt);
 };
 
 //------------------------------PackBNode---------------------------------------
 // Pack byte scalars into vector
 class PackBNode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
  public:
-  PackBNode(Node* in1)  : PackNode(in1) {}
+  PackBNode(Node* in1, const TypeVect* vt)  : PackNode(in1, vt) {}
   virtual int Opcode() const;
 };
 
@@ -970,10 +434,8 @@
 //------------------------------PackCNode---------------------------------------
 // Pack char scalars into vector
 class PackCNode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
  public:
-  PackCNode(Node* in1)  : PackNode(in1) {}
+  PackCNode(Node* in1, const TypeVect* vt)  : PackNode(in1, vt) {}
   virtual int Opcode() const;
 };
 
@@ -980,10 +442,9 @@
 //------------------------------PackSNode---------------------------------------
 // Pack short scalars into a vector
 class PackSNode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_SHORT; }
  public:
-  PackSNode(Node* in1)  : PackNode(in1) {}
+  PackSNode(Node* in1, const TypeVect* vt)  : PackNode(in1, vt) {}
+  PackSNode(Node* in1, Node* in2, const TypeVect* vt) : PackNode(in1, in2, vt) {}
   virtual int Opcode() const;
 };
 
@@ -990,11 +451,9 @@
 //------------------------------PackINode---------------------------------------
 // Pack integer scalars into a vector
 class PackINode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_INT; }
  public:
-  PackINode(Node* in1)  : PackNode(in1) {}
-  PackINode(Node* in1, Node* in2) : PackNode(in1, in2) {}
+  PackINode(Node* in1, const TypeVect* vt)  : PackNode(in1, vt) {}
+  PackINode(Node* in1, Node* in2, const TypeVect* vt) : PackNode(in1, in2, vt) {}
   virtual int Opcode() const;
 };
 
@@ -1001,22 +460,26 @@
 //------------------------------PackLNode---------------------------------------
 // Pack long scalars into a vector
 class PackLNode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_LONG; }
  public:
-  PackLNode(Node* in1)  : PackNode(in1) {}
-  PackLNode(Node* in1, Node* in2) : PackNode(in1, in2) {}
+  PackLNode(Node* in1, const TypeVect* vt)  : PackNode(in1, vt) {}
+  PackLNode(Node* in1, Node* in2, const TypeVect* vt) : PackNode(in1, in2, vt) {}
   virtual int Opcode() const;
 };
 
+//------------------------------Pack2LNode--------------------------------------
+// Pack 2 long scalars (in1, in2) into a vector described by the TypeVect vt
+class Pack2LNode : public PackNode {
+ public:
+  Pack2LNode(Node* in1, Node* in2, const TypeVect* vt) : PackNode(in1, in2, vt) {}
+  virtual int Opcode() const;
+};
+
 //------------------------------PackFNode---------------------------------------
 // Pack float scalars into vector
 class PackFNode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_FLOAT; }
  public:
-  PackFNode(Node* in1)  : PackNode(in1) {}
-  PackFNode(Node* in1, Node* in2) : PackNode(in1, in2) {}
+  PackFNode(Node* in1, const TypeVect* vt)  : PackNode(in1, vt) {}
+  PackFNode(Node* in1, Node* in2, const TypeVect* vt) : PackNode(in1, in2, vt) {}
   virtual int Opcode() const;
 };
 
@@ -1023,39 +486,20 @@
 //------------------------------PackDNode---------------------------------------
 // Pack double scalars into a vector
 class PackDNode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_DOUBLE; }
  public:
-  PackDNode(Node* in1)  : PackNode(in1) {}
-  PackDNode(Node* in1, Node* in2) : PackNode(in1, in2) {}
+  PackDNode(Node* in1, const TypeVect* vt) : PackNode(in1, vt) {}
+  PackDNode(Node* in1, Node* in2, const TypeVect* vt) : PackNode(in1, in2, vt) {}
   virtual int Opcode() const;
 };
 
-// The Pack2xN nodes assist code generation.  They are created from
-// Pack4C, etc. nodes in final_graph_reshape in the form of a
-// balanced, binary tree.
-
-//------------------------------Pack2x1BNode-----------------------------------------
-// Pack 2 1-byte integers into vector of 2 bytes
-class Pack2x1BNode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_BYTE; }
+//------------------------------Pack2DNode--------------------------------------
+// Pack 2 double scalars (in1, in2) into a vector described by the TypeVect vt
+class Pack2DNode : public PackNode {
  public:
-  Pack2x1BNode(Node *in1, Node* in2) : PackNode(in1, in2) {}
+  Pack2DNode(Node* in1, Node* in2, const TypeVect* vt) : PackNode(in1, in2, vt) {}
   virtual int Opcode() const;
-  virtual uint ideal_reg() const { return Op_RegI; }
 };
 
-//------------------------------Pack2x2BNode---------------------------------------
-// Pack 2 2-byte integers into vector of 4 bytes
-class Pack2x2BNode : public PackNode {
- protected:
-  virtual BasicType elt_basic_type() const { return T_CHAR; }
- public:
-  Pack2x2BNode(Node *in1, Node* in2) : PackNode(in1, in2) {}
-  virtual int Opcode() const;
-  virtual uint ideal_reg() const { return Op_RegI; }
-};
 
 //========================Extract_Scalar_from_Vector===============================
 
@@ -1069,7 +513,7 @@
   virtual int Opcode() const;
   uint  pos() const { return in(2)->get_int(); }
 
-  static Node* make(Compile* C, Node* v, uint position, const Type* opd_t);
+  static Node* make(Compile* C, Node* v, uint position, BasicType bt);
 };
 
 //------------------------------ExtractBNode---------------------------------------
--- old/src/share/vm/runtime/vmStructs.cpp	Sat Jun  2 20:04:24 2012
+++ new/src/share/vm/runtime/vmStructs.cpp	Sat Jun  2 20:04:23 2012
@@ -1967,57 +1967,22 @@
   declare_c2_type(LShiftVCNode, VectorNode)                               \
   declare_c2_type(LShiftVSNode, VectorNode)                               \
   declare_c2_type(LShiftVINode, VectorNode)                               \
-  declare_c2_type(URShiftVBNode, VectorNode)                              \
-  declare_c2_type(URShiftVCNode, VectorNode)                              \
-  declare_c2_type(URShiftVSNode, VectorNode)                              \
-  declare_c2_type(URShiftVINode, VectorNode)                              \
+  declare_c2_type(RShiftVBNode, VectorNode)                               \
+  declare_c2_type(RShiftVCNode, VectorNode)                               \
+  declare_c2_type(RShiftVSNode, VectorNode)                               \
+  declare_c2_type(RShiftVINode, VectorNode)                               \
   declare_c2_type(AndVNode, VectorNode)                                   \
   declare_c2_type(OrVNode, VectorNode)                                    \
   declare_c2_type(XorVNode, VectorNode)                                   \
-  declare_c2_type(VectorLoadNode, LoadNode)                               \
-  declare_c2_type(Load16BNode, VectorLoadNode)                            \
-  declare_c2_type(Load8BNode, VectorLoadNode)                             \
-  declare_c2_type(Load4BNode, VectorLoadNode)                             \
-  declare_c2_type(Load8CNode, VectorLoadNode)                             \
-  declare_c2_type(Load4CNode, VectorLoadNode)                             \
-  declare_c2_type(Load2CNode, VectorLoadNode)                             \
-  declare_c2_type(Load8SNode, VectorLoadNode)                             \
-  declare_c2_type(Load4SNode, VectorLoadNode)                             \
-  declare_c2_type(Load2SNode, VectorLoadNode)                             \
-  declare_c2_type(Load4INode, VectorLoadNode)                             \
-  declare_c2_type(Load2INode, VectorLoadNode)                             \
-  declare_c2_type(Load2LNode, VectorLoadNode)                             \
-  declare_c2_type(Load4FNode, VectorLoadNode)                             \
-  declare_c2_type(Load2FNode, VectorLoadNode)                             \
-  declare_c2_type(Load2DNode, VectorLoadNode)                             \
-  declare_c2_type(VectorStoreNode, StoreNode)                             \
-  declare_c2_type(Store16BNode, VectorStoreNode)                          \
-  declare_c2_type(Store8BNode, VectorStoreNode)                           \
-  declare_c2_type(Store4BNode, VectorStoreNode)                           \
-  declare_c2_type(Store8CNode, VectorStoreNode)                           \
-  declare_c2_type(Store4CNode, VectorStoreNode)                           \
-  declare_c2_type(Store2CNode, VectorStoreNode)                           \
-  declare_c2_type(Store4INode, VectorStoreNode)                           \
-  declare_c2_type(Store2INode, VectorStoreNode)                           \
-  declare_c2_type(Store2LNode, VectorStoreNode)                           \
-  declare_c2_type(Store4FNode, VectorStoreNode)                           \
-  declare_c2_type(Store2FNode, VectorStoreNode)                           \
-  declare_c2_type(Store2DNode, VectorStoreNode)                           \
-  declare_c2_type(Replicate16BNode, VectorNode)                           \
-  declare_c2_type(Replicate8BNode, VectorNode)                            \
-  declare_c2_type(Replicate4BNode, VectorNode)                            \
-  declare_c2_type(Replicate8CNode, VectorNode)                            \
-  declare_c2_type(Replicate4CNode, VectorNode)                            \
-  declare_c2_type(Replicate2CNode, VectorNode)                            \
-  declare_c2_type(Replicate8SNode, VectorNode)                            \
-  declare_c2_type(Replicate4SNode, VectorNode)                            \
-  declare_c2_type(Replicate2SNode, VectorNode)                            \
-  declare_c2_type(Replicate4INode, VectorNode)                            \
-  declare_c2_type(Replicate2INode, VectorNode)                            \
-  declare_c2_type(Replicate2LNode, VectorNode)                            \
-  declare_c2_type(Replicate4FNode, VectorNode)                            \
-  declare_c2_type(Replicate2FNode, VectorNode)                            \
-  declare_c2_type(Replicate2DNode, VectorNode)                            \
+  declare_c2_type(LoadVectorNode, LoadNode)                               \
+  declare_c2_type(StoreVectorNode, StoreNode)                             \
+  declare_c2_type(ReplicateBNode, VectorNode)                             \
+  declare_c2_type(ReplicateCNode, VectorNode)                             \
+  declare_c2_type(ReplicateSNode, VectorNode)                             \
+  declare_c2_type(ReplicateINode, VectorNode)                             \
+  declare_c2_type(ReplicateLNode, VectorNode)                             \
+  declare_c2_type(ReplicateFNode, VectorNode)                             \
+  declare_c2_type(ReplicateDNode, VectorNode)                             \
   declare_c2_type(PackNode, VectorNode)                                   \
   declare_c2_type(PackBNode, PackNode)                                    \
   declare_c2_type(PackCNode, PackNode)                                    \
@@ -2026,8 +1991,8 @@
   declare_c2_type(PackLNode, PackNode)                                    \
   declare_c2_type(PackFNode, PackNode)                                    \
   declare_c2_type(PackDNode, PackNode)                                    \
-  declare_c2_type(Pack2x1BNode, PackNode)                                 \
-  declare_c2_type(Pack2x2BNode, PackNode)                                 \
+  declare_c2_type(Pack2LNode, PackNode)                                   \
+  declare_c2_type(Pack2DNode, PackNode)                                   \
   declare_c2_type(ExtractNode, Node)                                      \
   declare_c2_type(ExtractBNode, ExtractNode)                              \
   declare_c2_type(ExtractCNode, ExtractNode)                              \
--- /dev/null	Sat Jun  2 20:04:24 2012
+++ new/test/compiler/7119644/TestByteDoubleVect.java	Sat Jun  2 20:04:24 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestByteDoubleVect
+ */
+
+public class TestByteDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  public static void main(String args[]) {
+    System.out.println("Testing Byte + Double vectors");
+    int errn = test();  // error count returned by test()
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);  // any reported error ends the JVM with exit status 97
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    byte[] a1 = new byte[ARRLEN];
+    byte[] a2 = new byte[ARRLEN];
+    double[] b1 = new double[ARRLEN];
+    double[] b2 = new double[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+      test_vi(a2, b2, (byte)123, 103.);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (byte)123, 103.);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (byte)123, 103.);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (byte)123, 103.);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (byte)123, 103.);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.;
+      b2[i] = -1.;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci: b1", i, b1[i], -103.);
+      }
+      test_vi(a2, b2, (byte)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi: b2", i, b2[i], 103.);
+      }
+      test_cp(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp: b1", i, b1[i], 103.);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.);
+      }
+      test_vi_neg(a2, b2, (byte)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.);
+      }
+      test_vi_oppos(a2, b2, (byte)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_ci_aln(a1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_vi_aln(a1, b1, (byte)123, 103.);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (byte)123, 103.);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (byte)i;
+        b1[i] = (double)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_alndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (byte)i;
+        b1[i] = (double)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (byte)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (byte)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (byte)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (byte)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (byte)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(byte[] a, double[] b) { // fills both arrays with compile-time constants, ascending index
+    for (int i = 0; i < a.length; i+=1) { // bound comes from a only, so b must be at least as long as a
+      a[i] = -123;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi(byte[] a, double[] b, byte c, double d) { // fills a with c and b with d (values supplied by the caller), ascending index
+    for (int i = 0; i < a.length; i+=1) { // bound comes from a only, so b must be at least as long as a
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(byte[] a, byte[] b, double[] c, double[] d) { // element-wise copy b -> a and d -> c, ascending index
+    for (int i = 0; i < a.length; i+=1) { // bound comes from a only; b, c and d are indexed over the same range
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(byte[] a, double[] b) { // same constant fill as test_ci, but the index runs downwards
+    for (int i = a.length-1; i >= 0; i-=1) { // descending from the last element of a to index 0
+      a[i] = -123;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_neg(byte[] a, double[] b, byte c, double d) { // same fill as test_vi, but the index runs downwards
+    for (int i = a.length-1; i >= 0; i-=1) { // descending from the last element of a to index 0
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(byte[] a, byte[] b, double[] c, double[] d) { // same copy as test_cp, but the index runs downwards
+    for (int i = a.length-1; i >= 0; i-=1) { // descending from the last element of a to index 0
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(byte[] a, double[] b) { // constant fill where the two arrays are walked in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123; // a is written from its last element towards index 0
+      b[i] = -103.; // b is written from index 0 upwards
+    }
+  }
+  static void test_vi_oppos(byte[] a, double[] b, byte c, double d) { // caller-supplied fill where the two arrays are walked in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) { // i itself descends
+      a[i] = c; // a is written from its last element towards index 0
+      b[limit-i] = d; // limit-i grows as i shrinks, so b is written from index 0 upwards
+    }
+  }
+  static void test_cp_oppos(byte[] a, byte[] b, double[] c, double[] d) { // reversing copies: source and destination are walked in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i]; // a ascending, b read from its end backwards
+      c[limit-i] = d[i]; // c written from its end backwards, d read ascending
+    }
+  }
+  static void test_ci_aln(byte[] a, double[] b) { // constant fill with the byte store shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so i+ALIGN_OFF stays inside a
+      a[i+ALIGN_OFF] = -123; // the first ALIGN_OFF elements of a are never written
+      b[i] = -103.; // elements of b from a.length-ALIGN_OFF onwards are never written
+    }
+  }
+  static void test_vi_aln(byte[] a, double[] b, byte c, double d) { // caller-supplied fill with the double store shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so i+ALIGN_OFF stays below a.length
+      a[i] = c; // elements of a from a.length-ALIGN_OFF onwards are never written
+      b[i+ALIGN_OFF] = d; // the first ALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_alndst(byte[] a, byte[] b, double[] c, double[] d) { // copy whose destination index is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so i+ALIGN_OFF stays inside a
+      a[i+ALIGN_OFF] = b[i]; // when a and b are the same array this reads an element stored ALIGN_OFF iterations earlier
+      c[i+ALIGN_OFF] = d[i]; // same shifted pattern for the double arrays
+    }
+  }
+  static void test_cp_alnsrc(byte[] a, byte[] b, double[] c, double[] d) { // copy whose source index is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so i+ALIGN_OFF stays below a.length
+      a[i] = b[i+ALIGN_OFF]; // the last ALIGN_OFF elements of a are never written
+      c[i] = d[i+ALIGN_OFF]; // same shifted pattern for the double arrays
+    }
+  }
+  static void test_ci_unaln(byte[] a, double[] b) { // constant fill with the byte store shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so i+UNALIGN_OFF stays inside a
+      a[i+UNALIGN_OFF] = -123; // the first UNALIGN_OFF elements of a are never written
+      b[i] = -103.; // elements of b from a.length-UNALIGN_OFF onwards are never written
+    }
+  }
+  static void test_vi_unaln(byte[] a, double[] b, byte c, double d) { // caller-supplied fill with the double store shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so i+UNALIGN_OFF stays below a.length
+      a[i] = c; // elements of a from a.length-UNALIGN_OFF onwards are never written
+      b[i+UNALIGN_OFF] = d; // the first UNALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_unalndst(byte[] a, byte[] b, double[] c, double[] d) { // copy whose destination index is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so i+UNALIGN_OFF stays inside a
+      a[i+UNALIGN_OFF] = b[i]; // when a and b are the same array this reads an element stored UNALIGN_OFF iterations earlier
+      c[i+UNALIGN_OFF] = d[i]; // same shifted pattern for the double arrays
+    }
+  }
+  static void test_cp_unalnsrc(byte[] a, byte[] b, double[] c, double[] d) { // copy whose source index is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so i+UNALIGN_OFF stays below a.length
+      a[i] = b[i+UNALIGN_OFF]; // the last UNALIGN_OFF elements of a are never written
+      c[i] = d[i+UNALIGN_OFF]; // same shifted pattern for the double arrays
+    }
+  }
+
+  static int verify(String text, int i, byte elem, byte val) { // returns 1 (and reports on stderr) when elem differs from the expected val, else 0
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val); // text labels the kernel/array, i is the element index
+      return 1; // callers add the result to their error counter
+    }
+    return 0;
+  }
+  static int verify(String text, int i, double elem, double val) { // double overload: returns 1 (and reports on stderr) on mismatch, else 0
+    if (elem != val) { // exact comparison, no tolerance; a NaN element would always be reported
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val); // text labels the kernel/array, i is the element index
+      return 1; // callers add the result to their error counter
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:25 2012
+++ new/test/compiler/7119644/TestByteFloatVect.java	Sat Jun  2 20:04:25 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestByteFloatVect
+ */
+
+public class TestByteFloatVect { // kernels below touch a byte[] and a float[] inside the same loop; test() checks their results
+  private static final int ARRLEN = 997; // length of every array allocated in test()
+  private static final int ITERS  = 11000; // repetitions of the warmup loop and of each timing loop
+  private static final int OFFSET = 3; // not referenced anywhere in this class
+  private static final int SCALE = 2; // not referenced anywhere in this class
+  private static final int ALIGN_OFF = 8; // index shift used by the *_aln* kernels
+  private static final int UNALIGN_OFF = 5; // index shift used by the *_unaln* kernels
+
+  public static void main(String args[]) { // entry point: runs test() and exits with status 97 if it reports any error
+    System.out.println("Testing Byte + Float vectors");
+    int errn = test(); // number of mismatching elements found
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() { // warms up, verifies and then times every kernel; returns the number of verification mismatches
+    byte[] a1 = new byte[ARRLEN];
+    byte[] a2 = new byte[ARRLEN];
+    float[] b1 = new float[ARRLEN];
+    float[] b2 = new float[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) { // call every kernel ITERS times before any result is checked (presumably so the JIT compiles them)
+      test_ci(a1, b1);
+      test_vi(a2, b2, (byte)123, 103.f);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (byte)123, 103.f);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (byte)123, 103.f);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (byte)123, 103.f);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (byte)123, 103.f);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) { // -1 is the "untouched" marker the checks below look for
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.f;
+      b2[i] = -1.f;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0; // running count of mismatching elements
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci: b1", i, b1[i], -103.f);
+      }
+      test_vi(a2, b2, (byte)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi: b2", i, b2[i], 103.f);
+      }
+      test_cp(a1, a2, b1, b2); // a1/b1 must now hold what test_vi stored in a2/b2
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.f;
+        b2[i] = -1.f;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.f);
+      }
+      test_vi_neg(a2, b2, (byte)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.f);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.f;
+        b2[i] = -1.f;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.f);
+      }
+      test_vi_oppos(a2, b2, (byte)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.f);
+      }
+      test_cp_oppos(a1, a2, b1, b2); // sources are uniform, so the reversed copy yields the same value everywhere
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.f;
+        b2[i] = 123.f;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) { // head of the destination is never written by the shifted store
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) { // only the sources change; a1/b1 keep the result of test_cp_alndst
+        a2[i] = -123;
+        b2[i] = -123.f;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) { // tail is not written by test_cp_alnsrc, so it still holds 123 from test_cp_alndst
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.f);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_ci_aln(a1, b1); // writes a1 at i+ALIGN_OFF and b1 at i
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.f);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_vi_aln(a1, b1, (byte)123, 103.f); // writes a1 at i and b1 at i+ALIGN_OFF
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.f;
+        b2[i] = 123.f;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) { // head of the destination is never written by the shifted store
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) { // only the sources change; a1/b1 keep the result of test_cp_unalndst
+        a2[i] = -123;
+        b2[i] = -123.f;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) { // tail is not written by test_cp_unalnsrc, so it still holds 123 from test_cp_unalndst
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1; // int literal widened to float; same value as the -1.f used in the other resets
+      }
+      test_ci_unaln(a1, b1); // writes a1 at i+UNALIGN_OFF and b1 at i
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1; // int literal widened to float; same value as the -1.f used in the other resets
+      }
+      test_vi_unaln(a1, b1, (byte)123, 103.f); // writes a1 at i and b1 at i+UNALIGN_OFF
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) { // seed the first ALIGN_OFF elements with 0..ALIGN_OFF-1
+        a1[i] = (byte)i;
+        b1[i] = (float)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_cp_alndst(a1, a1, b1, b1); // source and destination are the same array: each store feeds a load ALIGN_OFF iterations later
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF; // the seed must repeat with period ALIGN_OFF across the whole array
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (float)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) { // overwrite the second period with -1 before copying downwards
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.f;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1); // same array again: every element takes the value ALIGN_OFF positions above it
+      for (int i=0; i<ALIGN_OFF; i++) { // the -1 block has moved down into the first period
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (float)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) { // seed the first UNALIGN_OFF elements with 0..UNALIGN_OFF-1
+        a1[i] = (byte)i;
+        b1[i] = (float)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_cp_unalndst(a1, a1, b1, b1); // source and destination are the same array: each store feeds a load UNALIGN_OFF iterations later
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF; // the seed must repeat with period UNALIGN_OFF across the whole array
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (float)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) { // overwrite the second period with -1 before copying downwards
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.f;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1); // same array again: every element takes the value UNALIGN_OFF positions above it
+      for (int i=0; i<UNALIGN_OFF; i++) { // the -1 block has moved down into the first period
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (float)v);
+      }
+
+    }
+
+    if (errn > 0) // skip the timing pass when verification already failed
+      return errn;
+
+    System.out.println("Time"); // timing pass: prints elapsed milliseconds for ITERS calls of each kernel
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (byte)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (byte)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (byte)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (byte)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (byte)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn; // always 0 here: a non-zero count returned before the timing pass
+  }
+
+  static void test_ci(byte[] a, float[] b) { // fills both arrays with compile-time constants, ascending index
+    for (int i = 0; i < a.length; i+=1) { // bound comes from a only, so b must be at least as long as a
+      a[i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi(byte[] a, float[] b, byte c, float d) { // fills a with c and b with d (values supplied by the caller), ascending index
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(byte[] a, byte[] b, float[] c, float[] d) { // element-wise copy b -> a and d -> c, ascending index
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(byte[] a, float[] b) { // same constant fill as test_ci, but the index runs downwards
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_neg(byte[] a, float[] b, byte c, float d) { // same fill as test_vi, but the index runs downwards
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(byte[] a, byte[] b, float[] c, float[] d) { // same copy as test_cp, but the index runs downwards
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(byte[] a, float[] b) { // constant fill where the two arrays are walked in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123; // a is written from its last element towards index 0
+      b[i] = -103.f; // b is written from index 0 upwards
+    }
+  }
+  static void test_vi_oppos(byte[] a, float[] b, byte c, float d) { // caller-supplied fill where the two arrays are walked in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c; // a is written from its last element towards index 0
+      b[limit-i] = d; // limit-i grows as i shrinks, so b is written from index 0 upwards
+    }
+  }
+  static void test_cp_oppos(byte[] a, byte[] b, float[] c, float[] d) { // reversing copies: source and destination are walked in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i]; // a ascending, b read from its end backwards
+      c[limit-i] = d[i]; // c written from its end backwards, d read ascending
+    }
+  }
+  static void test_ci_aln(byte[] a, float[] b) { // constant fill with the byte store shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so i+ALIGN_OFF stays inside a
+      a[i+ALIGN_OFF] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_aln(byte[] a, float[] b, byte c, float d) { // caller-supplied fill with the float store shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so i+ALIGN_OFF stays below a.length
+      a[i] = c;
+      b[i+ALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_alndst(byte[] a, byte[] b, float[] c, float[] d) { // copy whose destination index is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so i+ALIGN_OFF stays inside a
+      a[i+ALIGN_OFF] = b[i]; // test() also calls this with a == b, where the load reads an earlier store
+      c[i+ALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_alnsrc(byte[] a, byte[] b, float[] c, float[] d) { // copy whose source index is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so i+ALIGN_OFF stays below a.length
+      a[i] = b[i+ALIGN_OFF];
+      c[i] = d[i+ALIGN_OFF];
+    }
+  }
+  static void test_ci_unaln(byte[] a, float[] b) { // constant fill with the byte store shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so i+UNALIGN_OFF stays inside a
+      a[i+UNALIGN_OFF] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_unaln(byte[] a, float[] b, byte c, float d) { // caller-supplied fill with the float store shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so i+UNALIGN_OFF stays below a.length
+      a[i] = c;
+      b[i+UNALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_unalndst(byte[] a, byte[] b, float[] c, float[] d) { // copy whose destination index is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so i+UNALIGN_OFF stays inside a
+      a[i+UNALIGN_OFF] = b[i]; // test() also calls this with a == b, where the load reads an earlier store
+      c[i+UNALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_unalnsrc(byte[] a, byte[] b, float[] c, float[] d) { // copy whose source index is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so i+UNALIGN_OFF stays below a.length
+      a[i] = b[i+UNALIGN_OFF];
+      c[i] = d[i+UNALIGN_OFF];
+    }
+  }
+
+  static int verify(String text, int i, byte elem, byte val) { // returns 1 (and reports on stderr) when elem differs from the expected val, else 0
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val); // text labels the kernel/array, i is the element index
+      return 1;
+    }
+    return 0;
+  }
+  static int verify(String text, int i, float elem, float val) { // float overload: returns 1 (and reports on stderr) on mismatch, else 0
+    if (elem != val) { // exact comparison, no tolerance; a NaN element would always be reported
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:25 2012
+++ new/test/compiler/7119644/TestByteIntVect.java	Sat Jun  2 20:04:25 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestByteIntVect
+ */
+
+public class TestByteIntVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  public static void main(String args[]) {
+    // Entry point: announce the suite, run test(), and report the verdict.
+    System.out.println("Testing Byte + Integer vectors");
+    final int failures = test();  // mismatches counted during verification
+    if (failures <= 0) { System.out.println("PASSED"); return; }
+    // At least one element was wrong: report on stderr and exit with status 97.
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() {  // warm up, verify, then time every kernel; returns the number of mismatches found
+    byte[] a1 = new byte[ARRLEN];
+    byte[] a2 = new byte[ARRLEN];
+    int[] b1 = new int[ARRLEN];
+    int[] b2 = new int[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {  // call every kernel ITERS times before checking anything (presumably to get them JIT-compiled)
+      test_ci(a1, b1);
+      test_vi(a2, b2, (byte)123, (int)103);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (byte)123, (int)103);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (byte)123, (int)103);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (byte)123, (int)103);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (byte)123, (int)103);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize: overwrite the warmup results with -1 in all four arrays
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1;
+      b2[i] = -1;
+    }
+    // Test and verify results: run each kernel once, compare every element, count mismatches in errn
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci: b1", i, b1[i], (int)-103);
+      }
+      test_vi(a2, b2, (byte)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi: b2", i, b2[i], (int)103);
+      }
+      test_cp(a1, a2, b1, b2);  // copies the values test_vi just stored in a2/b2
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for negative stride (these kernels iterate from the last index down to 0)
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], (int)-103);
+      }
+      test_vi_neg(a2, b2, (byte)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], (int)103);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for opposite stride (one array is indexed by i, the other by limit-i)
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], (int)-103);
+      }
+      test_vi_oppos(a2, b2, (byte)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], (int)103);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for 2 arrays with relative aligned offset (ALIGN_OFF): destinations hold -1, sources hold 123
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {  // elements below the destination offset must keep their -1
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (int)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {  // change the source value so the next copy is distinguishable from the previous one
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (int)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {  // tail is not written by the kernel, so it still holds 123 from test_cp_alndst
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (int)123);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_aln(a1, b1);  // writes a1[ALIGN_OFF..] and b1[..ARRLEN-ALIGN_OFF-1]; the rest must stay -1
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (int)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (int)-1);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_aln(a1, b1, (byte)123, (int)103);  // writes a1[..ARRLEN-ALIGN_OFF-1] and b1[ALIGN_OFF..]; the rest must stay -1
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset (UNALIGN_OFF): destinations hold -1, sources hold 123
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {  // elements below the destination offset must keep their -1
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (int)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {  // change the source value so the next copy is distinguishable from the previous one
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (int)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {  // tail is not written by the kernel, so it still holds 123 from test_cp_unalndst
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (int)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);  // writes a1[UNALIGN_OFF..] and b1[..ARRLEN-UNALIGN_OFF-1]; the rest must stay -1
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (int)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (int)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (byte)123, (int)103);  // writes a1[..ARRLEN-UNALIGN_OFF-1] and b1[UNALIGN_OFF..]; the rest must stay -1
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for aligned overlap: a1/b1 serve as both source and destination; seed 0..ALIGN_OFF-1, rest -1
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (byte)i;
+        b1[i] = (int)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_alndst(a1, a1, b1, b1);  // forward copy onto itself repeats the seed, so element i must become i%ALIGN_OFF
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (int)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {  // mark elements ALIGN_OFF..2*ALIGN_OFF-1 with -1
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);  // shifts the contents down by ALIGN_OFF, so the -1 marker lands in elements 0..ALIGN_OFF-1
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (int)v);
+      }
+
+      // Reset for unaligned overlap: a1/b1 serve as both source and destination; seed 0..UNALIGN_OFF-1, rest -1
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (byte)i;
+        b1[i] = (int)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);  // forward copy onto itself repeats the seed, so element i must become i%UNALIGN_OFF
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (int)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {  // mark elements UNALIGN_OFF..2*UNALIGN_OFF-1 with -1
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);  // shifts the contents down by UNALIGN_OFF, so the -1 marker lands in elements 0..UNALIGN_OFF-1
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (int)v);
+      }
+
+    }
+
+    if (errn > 0)  // verification failed: skip the timing runs
+      return errn;
+
+    System.out.println("Time");  // below: elapsed milliseconds for ITERS calls of each kernel; printed only, never checked
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (byte)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (byte)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (byte)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (byte)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (byte)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(byte[] a, int[] b) {  // constant fill of both arrays, ascending index
+    for (int i = 0; i < a.length; i+=1) {  // bound is derived from a only; assumes b.length >= a.length
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi(byte[] a, int[] b, byte c, int d) {  // fill a with c and b with d, ascending index
+    for (int i = 0; i < a.length; i+=1) {  // bound is derived from a only; assumes b.length >= a.length
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(byte[] a, byte[] b, int[] c, int[] d) {  // element-wise copy b->a and d->c, ascending index
+    for (int i = 0; i < a.length; i+=1) {  // bound is derived from a only; assumes the other arrays are at least as long
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(byte[] a, int[] b) {  // constant fill of both arrays, descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // walks from the last element of a down to index 0
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi_neg(byte[] a, int[] b, byte c, int d) {  // fill a with c and b with d, descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // walks from the last element of a down to index 0
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(byte[] a, byte[] b, int[] c, int[] d) {  // element-wise copy b->a and d->c, descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // walks from the last element of a down to index 0
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(byte[] a, int[] b) {  // constant fill where the two arrays are traversed in opposite directions
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;  // a is written from its last element towards index 0
+      b[i] = -103;  // b is written from index 0 upwards
+    }
+  }
+  static void test_vi_oppos(byte[] a, int[] b, byte c, int d) {  // fill a with c and b with d, traversing the arrays in opposite directions
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) {  // descending loop, unlike test_ci_oppos
+      a[i] = c;  // a is written from its last element towards index 0
+      b[limit-i] = d;  // b is written from index 0 upwards
+    }
+  }
+  static void test_cp_oppos(byte[] a, byte[] b, int[] c, int[] d) {  // reversing copy: a[i] takes b[limit-i], c[limit-i] takes d[i]
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];  // store ascends while the load descends
+      c[limit-i] = d[i];  // store descends while the load ascends
+    }
+  }
+  static void test_ci_aln(byte[] a, int[] b) {  // constant fill; only the byte store is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // bound keeps i+ALIGN_OFF below a.length
+      a[i+ALIGN_OFF] = -123;  // the first ALIGN_OFF elements of a are not written
+      b[i] = -103;  // elements of b at index a.length-ALIGN_OFF and above are not written
+    }
+  }
+  static void test_vi_aln(byte[] a, int[] b, byte c, int d) {  // fill with caller-supplied values; only the int store is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // bound is derived from a only; assumes b.length >= a.length
+      a[i] = c;  // the last ALIGN_OFF elements of a are not written
+      b[i+ALIGN_OFF] = d;  // the first ALIGN_OFF elements of b are not written
+    }
+  }
+  static void test_cp_alndst(byte[] a, byte[] b, int[] c, int[] d) {  // copy b->a and d->c with the destination index shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // bound is derived from a only; assumes the other arrays are at least as long
+      a[i+ALIGN_OFF] = b[i];  // the first ALIGN_OFF elements of a are not written
+      c[i+ALIGN_OFF] = d[i];  // the first ALIGN_OFF elements of c are not written
+    }
+  }
+  static void test_cp_alnsrc(byte[] a, byte[] b, int[] c, int[] d) {  // copy b->a and d->c with the source index shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // bound is derived from a only; assumes the other arrays are at least as long
+      a[i] = b[i+ALIGN_OFF];  // the last ALIGN_OFF elements of a are not written
+      c[i] = d[i+ALIGN_OFF];  // elements of c at index a.length-ALIGN_OFF and above are not written
+    }
+  }
+  static void test_ci_unaln(byte[] a, int[] b) {  // constant fill; only the byte store is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // bound keeps i+UNALIGN_OFF below a.length
+      a[i+UNALIGN_OFF] = -123;  // the first UNALIGN_OFF elements of a are not written
+      b[i] = -103;  // elements of b at index a.length-UNALIGN_OFF and above are not written
+    }
+  }
+  static void test_vi_unaln(byte[] a, int[] b, byte c, int d) {  // fill with caller-supplied values; only the int store is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // bound is derived from a only; assumes b.length >= a.length
+      a[i] = c;  // the last UNALIGN_OFF elements of a are not written
+      b[i+UNALIGN_OFF] = d;  // the first UNALIGN_OFF elements of b are not written
+    }
+  }
+  static void test_cp_unalndst(byte[] a, byte[] b, int[] c, int[] d) {  // copy b->a and d->c with the destination index shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // bound is derived from a only; assumes the other arrays are at least as long
+      a[i+UNALIGN_OFF] = b[i];  // the first UNALIGN_OFF elements of a are not written
+      c[i+UNALIGN_OFF] = d[i];  // the first UNALIGN_OFF elements of c are not written
+    }
+  }
+  static void test_cp_unalnsrc(byte[] a, byte[] b, int[] c, int[] d) {  // copy b->a and d->c with the source index shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // bound is derived from a only; assumes the other arrays are at least as long
+      a[i] = b[i+UNALIGN_OFF];  // the last UNALIGN_OFF elements of a are not written
+      c[i] = d[i+UNALIGN_OFF];  // elements of c at index a.length-UNALIGN_OFF and above are not written
+    }
+  }
+
+  static int verify(String text, int i, byte elem, byte val) {
+    // Returns the error count for one element: 0 when elem matches val, 1 otherwise.
+    if (elem == val) return 0;
+    // Mismatch: print "<text>[<i>] = <elem> != <val>" on stderr for diagnosis.
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+  static int verify(String text, int i, int elem, int val) {
+    // Returns the error count for one element: 0 when elem matches val, 1 otherwise.
+    if (elem == val) return 0;
+    // Mismatch: print "<text>[<i>] = <elem> != <val>" on stderr for diagnosis.
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:25 2012
+++ new/test/compiler/7119644/TestByteLongVect.java	Sat Jun  2 20:04:25 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestByteLongVect
+ */
+
+public class TestByteLongVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  public static void main(String args[]) {
+    // Entry point: announce the suite, run test(), and report the verdict.
+    System.out.println("Testing Byte + Long vectors");
+    final int failures = test();  // mismatches counted during verification
+    if (failures <= 0) { System.out.println("PASSED"); return; }
+    // At least one element was wrong: report on stderr and exit with status 97.
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() {
+    byte[] a1 = new byte[ARRLEN];
+    byte[] a2 = new byte[ARRLEN];
+    long[] b1 = new long[ARRLEN];
+    long[] b2 = new long[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+      test_vi(a2, b2, (byte)123, (long)103);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (byte)123, (long)103);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (byte)123, (long)103);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (byte)123, (long)103);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (byte)123, (long)103);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1;
+      b2[i] = -1;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci: b1", i, b1[i], (long)-103);
+      }
+      test_vi(a2, b2, (byte)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi: b2", i, b2[i], (long)103);
+      }
+      test_cp(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], (long)-103);
+      }
+      test_vi_neg(a2, b2, (byte)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], (long)103);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], (long)-103);
+      }
+      test_vi_oppos(a2, b2, (byte)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], (long)103);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (long)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (long)123);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_aln(a1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (long)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (long)-1);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_aln(a1, b1, (byte)123, (long)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (long)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (long)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (byte)123, (long)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (byte)i;
+        b1[i] = (long)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_alndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (long)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (long)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (byte)i;
+        b1[i] = (long)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (long)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (long)v);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (byte)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (byte)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (byte)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (byte)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (byte)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(byte[] a, long[] b) {  // Constant fill: every a[i] = -123 and b[i] = -103.
+    for (int i = 0; i < a.length; i+=1) {  // ascending, step 1; b is indexed over a's length
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi(byte[] a, long[] b, byte c, long d) {  // Variable fill: every a[i] = c and b[i] = d.
+    for (int i = 0; i < a.length; i+=1) {  // ascending, step 1; b is indexed over a's length
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(byte[] a, byte[] b, long[] c, long[] d) {  // Element-wise copy: b into a and d into c.
+    for (int i = 0; i < a.length; i+=1) {  // ascending, step 1; all four arrays are indexed over a's length
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(byte[] a, long[] b) {  // Constant fill (-123 / -103) walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // from a.length-1 down to 0, step -1
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi_neg(byte[] a, long[] b, byte c, long d) {  // Variable fill (c / d) walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // from a.length-1 down to 0, step -1
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(byte[] a, byte[] b, long[] c, long[] d) {  // Element-wise copy walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // from a.length-1 down to 0, step -1
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(byte[] a, long[] b) {  // Constant fill with the two stores moving in opposite directions.
+    int limit = a.length-1;  // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;  // index decreases as i grows
+      b[i] = -103;  // index increases as i grows
+    }
+  }
+  static void test_vi_oppos(byte[] a, long[] b, byte c, long d) {  // Variable fill with the two stores moving in opposite directions.
+    int limit = a.length-1;  // last valid index of a
+    for (int i = a.length-1; i >= 0; i-=1) {  // i itself counts down here
+      a[i] = c;  // index decreases as the loop advances
+      b[limit-i] = d;  // index increases as the loop advances
+    }
+  }
+  static void test_cp_oppos(byte[] a, byte[] b, long[] c, long[] d) {  // Copy where each load runs opposite to its store.
+    int limit = a.length-1;  // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];  // ascending store, descending load
+      c[limit-i] = d[i];  // descending store, ascending load
+    }
+  }
+  static void test_ci_aln(byte[] a, long[] b) {  // Constant fill with the store into a shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length so i+ALIGN_OFF stays in range
+      a[i+ALIGN_OFF] = -123;  // a[0..ALIGN_OFF) is left untouched
+      b[i] = -103;  // b is written only for i < a.length-ALIGN_OFF
+    }
+  }
+  static void test_vi_aln(byte[] a, long[] b, byte c, long d) {  // Variable fill with the store into b shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length
+      a[i] = c;  // a is written only for i < a.length-ALIGN_OFF
+      b[i+ALIGN_OFF] = d;  // b[0..ALIGN_OFF) is left untouched
+    }
+  }
+  static void test_cp_alndst(byte[] a, byte[] b, long[] c, long[] d) {  // Copy with both destinations shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length so i+ALIGN_OFF stays in range
+      a[i+ALIGN_OFF] = b[i];  // a[0..ALIGN_OFF) is left untouched
+      c[i+ALIGN_OFF] = d[i];  // c[0..ALIGN_OFF) is left untouched
+    }
+  }
+  static void test_cp_alnsrc(byte[] a, byte[] b, long[] c, long[] d) {  // Copy with both sources shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length so i+ALIGN_OFF stays in range
+      a[i] = b[i+ALIGN_OFF];  // a is written only for i < a.length-ALIGN_OFF
+      c[i] = d[i+ALIGN_OFF];  // c is written only for i < a.length-ALIGN_OFF
+    }
+  }
+  static void test_ci_unaln(byte[] a, long[] b) {  // Constant fill with the store into a shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length so i+UNALIGN_OFF stays in range
+      a[i+UNALIGN_OFF] = -123;  // a[0..UNALIGN_OFF) is left untouched
+      b[i] = -103;  // b is written only for i < a.length-UNALIGN_OFF
+    }
+  }
+  static void test_vi_unaln(byte[] a, long[] b, byte c, long d) {  // Variable fill with the store into b shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length
+      a[i] = c;  // a is written only for i < a.length-UNALIGN_OFF
+      b[i+UNALIGN_OFF] = d;  // b[0..UNALIGN_OFF) is left untouched
+    }
+  }
+  static void test_cp_unalndst(byte[] a, byte[] b, long[] c, long[] d) {  // Copy with both destinations shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length so i+UNALIGN_OFF stays in range
+      a[i+UNALIGN_OFF] = b[i];  // a[0..UNALIGN_OFF) is left untouched
+      c[i+UNALIGN_OFF] = d[i];  // c[0..UNALIGN_OFF) is left untouched
+    }
+  }
+  static void test_cp_unalnsrc(byte[] a, byte[] b, long[] c, long[] d) {  // Copy with both sources shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length so i+UNALIGN_OFF stays in range
+      a[i] = b[i+UNALIGN_OFF];  // a is written only for i < a.length-UNALIGN_OFF
+      c[i] = d[i+UNALIGN_OFF];  // c is written only for i < a.length-UNALIGN_OFF
+    }
+  }
+
+  static int verify(String text, int i, byte elem, byte val) {  // 0 when elem equals val; otherwise logs to stderr and yields 1
+    if (elem == val) {
+      return 0;
+    }
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+  static int verify(String text, int i, long elem, long val) {  // 0 when elem equals val; otherwise logs to stderr and yields 1
+    if (elem == val) {
+      return 0;
+    }
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:26 2012
+++ new/test/compiler/7119644/TestByteShortVect.java	Sat Jun  2 20:04:26 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestByteShortVect
+ */
+
+public class TestByteShortVect {
+  private static final int ARRLEN = 997;  // length of every array allocated in test()
+  private static final int ITERS  = 11000;  // repetitions of each kernel in the warmup and timing loops
+  private static final int OFFSET = 3;  // not referenced anywhere in this class
+  private static final int SCALE = 2;  // not referenced anywhere in this class
+  private static final int ALIGN_OFF = 8;  // index shift applied by the *_aln / *_alndst / *_alnsrc kernels
+  private static final int UNALIGN_OFF = 5;  // index shift applied by the *_unaln / *_unalndst / *_unalnsrc kernels
+
+  public static void main(String args[]) {
+    System.out.println("Testing Byte + Short vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {  // Warms up, verifies and then times every kernel; returns the number of mismatches found.
+    byte[] a1 = new byte[ARRLEN];
+    byte[] a2 = new byte[ARRLEN];
+    short[] b1 = new short[ARRLEN];
+    short[] b2 = new short[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {  // call every kernel ITERS times before any result is checked
+      test_ci(a1, b1);
+      test_vi(a2, b2, (byte)123, (short)103);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (byte)123, (short)103);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (byte)123, (short)103);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (byte)123, (short)103);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (byte)123, (short)103);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize: overwrite all four arrays with -1 after the warmup calls
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1;
+      b2[i] = -1;
+    }
+    // Test and verify results: run each kernel once and compare element by element
+    System.out.println("Verification");
+    int errn = 0;  // running total of the 0/1 results returned by verify()
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci: b1", i, b1[i], (short)-103);
+      }
+      test_vi(a2, b2, (byte)123, (short)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi: b2", i, b2[i], (short)103);
+      }
+      test_cp(a1, a2, b1, b2);  // a2/b2 hold 123/103 from test_vi above
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp: b1", i, b1[i], (short)103);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], (short)-103);
+      }
+      test_vi_neg(a2, b2, (byte)123, (short)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], (short)103);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], (short)103);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (byte)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], (short)-103);
+      }
+      test_vi_oppos(a2, b2, (byte)123, (short)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (byte)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], (short)103);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], (short)103);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {  // head is never stored to by the shifted-destination copy
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (short)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (short)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {  // tail still holds the 123 written by test_cp_alndst
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (short)123);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_aln(a1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (short)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (short)-1);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_aln(a1, b1, (byte)123, (short)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (short)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {  // head is never stored to by the shifted-destination copy
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (short)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (short)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {  // tail still holds the 123 written by test_cp_unalndst
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (short)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (short)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (short)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (byte)123, (short)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (short)103);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {  // seed the head with 0..ALIGN_OFF-1
+        a1[i] = (byte)i;
+        b1[i] = (short)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_alndst(a1, a1, b1, b1);  // source and destination are the same array
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;  // the seed values are expected to repeat with period ALIGN_OFF
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (short)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {  // plant -1 in [ALIGN_OFF, 2*ALIGN_OFF) for the shifted-source copy
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);  // same array again, each load ALIGN_OFF ahead of its store
+      for (int i=0; i<ALIGN_OFF; i++) {  // the planted -1 values are expected at the front
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (short)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {  // seed the head with 0..UNALIGN_OFF-1
+        a1[i] = (byte)i;
+        b1[i] = (short)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);  // source and destination are the same array
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;  // the seed values are expected to repeat with period UNALIGN_OFF
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (short)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {  // plant -1 in [UNALIGN_OFF, 2*UNALIGN_OFF) for the shifted-source copy
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);  // same array again, each load UNALIGN_OFF ahead of its store
+      for (int i=0; i<UNALIGN_OFF; i++) {  // the planted -1 values are expected at the front
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (short)v);
+      }
+
+    }
+
+    if (errn > 0)  // skip the timing section once verification has failed
+      return errn;
+
+    System.out.println("Time");
+    long start, end;  // System.currentTimeMillis() readings taken around each timing loop
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (byte)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (byte)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (byte)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (byte)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (byte)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(byte[] a, short[] b) {  // Constant fill: every a[i] = -123 and b[i] = -103.
+    for (int i = 0; i < a.length; i+=1) {  // ascending, step 1; b is indexed over a's length
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi(byte[] a, short[] b, byte c, short d) {  // Variable fill: every a[i] = c and b[i] = d.
+    for (int i = 0; i < a.length; i+=1) {  // ascending, step 1; b is indexed over a's length
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(byte[] a, byte[] b, short[] c, short[] d) {  // Element-wise copy: b into a and d into c.
+    for (int i = 0; i < a.length; i+=1) {  // ascending, step 1; all four arrays are indexed over a's length
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(byte[] a, short[] b) {  // Constant fill (-123 / -103) walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // from a.length-1 down to 0, step -1
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi_neg(byte[] a, short[] b, byte c, short d) {  // Variable fill (c / d) walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // from a.length-1 down to 0, step -1
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(byte[] a, byte[] b, short[] c, short[] d) {  // Element-wise copy walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // from a.length-1 down to 0, step -1
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(byte[] a, short[] b) {  // Constant fill with the two stores moving in opposite directions.
+    int limit = a.length-1;  // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;  // index decreases as i grows
+      b[i] = -103;  // index increases as i grows
+    }
+  }
+  static void test_vi_oppos(byte[] a, short[] b, byte c, short d) {  // Variable fill with the two stores moving in opposite directions.
+    int limit = a.length-1;  // last valid index of a
+    for (int i = a.length-1; i >= 0; i-=1) {  // i itself counts down here
+      a[i] = c;  // index decreases as the loop advances
+      b[limit-i] = d;  // index increases as the loop advances
+    }
+  }
+  static void test_cp_oppos(byte[] a, byte[] b, short[] c, short[] d) {  // Copy where each load runs opposite to its store.
+    int limit = a.length-1;  // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];  // ascending store, descending load
+      c[limit-i] = d[i];  // descending store, ascending load
+    }
+  }
+  static void test_ci_aln(byte[] a, short[] b) {  // Constant fill with the store into a shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length so i+ALIGN_OFF stays in range
+      a[i+ALIGN_OFF] = -123;  // a[0..ALIGN_OFF) is left untouched
+      b[i] = -103;  // b is written only for i < a.length-ALIGN_OFF
+    }
+  }
+  static void test_vi_aln(byte[] a, short[] b, byte c, short d) {  // Variable fill with the store into b shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length
+      a[i] = c;  // a is written only for i < a.length-ALIGN_OFF
+      b[i+ALIGN_OFF] = d;  // b[0..ALIGN_OFF) is left untouched
+    }
+  }
+  static void test_cp_alndst(byte[] a, byte[] b, short[] c, short[] d) {  // Copy with both destinations shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length so i+ALIGN_OFF stays in range
+      a[i+ALIGN_OFF] = b[i];  // a[0..ALIGN_OFF) is left untouched
+      c[i+ALIGN_OFF] = d[i];  // c[0..ALIGN_OFF) is left untouched
+    }
+  }
+  static void test_cp_alnsrc(byte[] a, byte[] b, short[] c, short[] d) {  // Copy with both sources shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length so i+ALIGN_OFF stays in range
+      a[i] = b[i+ALIGN_OFF];  // a is written only for i < a.length-ALIGN_OFF
+      c[i] = d[i+ALIGN_OFF];  // c is written only for i < a.length-ALIGN_OFF
+    }
+  }
+  static void test_ci_unaln(byte[] a, short[] b) {  // Constant fill with the store into a shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length so i+UNALIGN_OFF stays in range
+      a[i+UNALIGN_OFF] = -123;  // a[0..UNALIGN_OFF) is left untouched
+      b[i] = -103;  // b is written only for i < a.length-UNALIGN_OFF
+    }
+  }
+  static void test_vi_unaln(byte[] a, short[] b, byte c, short d) {  // Variable fill with the store into b shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length
+      a[i] = c;  // a is written only for i < a.length-UNALIGN_OFF
+      b[i+UNALIGN_OFF] = d;  // b[0..UNALIGN_OFF) is left untouched
+    }
+  }
+  static void test_cp_unalndst(byte[] a, byte[] b, short[] c, short[] d) {  // Copy with both destinations shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length so i+UNALIGN_OFF stays in range
+      a[i+UNALIGN_OFF] = b[i];  // a[0..UNALIGN_OFF) is left untouched
+      c[i+UNALIGN_OFF] = d[i];  // c[0..UNALIGN_OFF) is left untouched
+    }
+  }
+  static void test_cp_unalnsrc(byte[] a, byte[] b, short[] c, short[] d) {  // Copy with both sources shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length so i+UNALIGN_OFF stays in range
+      a[i] = b[i+UNALIGN_OFF];  // a is written only for i < a.length-UNALIGN_OFF
+      c[i] = d[i+UNALIGN_OFF];  // c is written only for i < a.length-UNALIGN_OFF
+    }
+  }
+
+  static int verify(String text, int i, byte elem, byte val) {  // 0 when elem equals val; otherwise logs to stderr and yields 1
+    if (elem == val) {
+      return 0;
+    }
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+  static int verify(String text, int i, short elem, short val) {  // 0 when elem equals val; otherwise logs to stderr and yields 1
+    if (elem == val) {
+      return 0;
+    }
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:26 2012
+++ new/test/compiler/7119644/TestByteVect.java	Sat Jun  2 20:04:26 2012
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestByteVect
+ */
+
+public class TestByteVect {
+  private static final int ARRLEN = 997; // length of both test arrays
+  private static final int ITERS  = 11000; // calls of each kernel in the warmup and timing loops
+  private static final int OFFSET = 3; // index offset in the *_off kernels; also passed as k to the *_inv kernels
+  private static final int SCALE = 2; // index multiplier in the *_scl kernels
+  private static final int ALIGN_OFF = 8; // index shift in the *_aln* kernels
+  private static final int UNALIGN_OFF = 5; // index shift in the *_unaln* kernels
+
+  public static void main(String args[]) {
+    System.out.println("Testing Byte vectors");
+    final int failures = test();
+    final boolean clean = !(failures > 0);
+    if (clean) { System.out.println("PASSED"); return; }
+    // At least one mismatch: report the count and exit with status 97.
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() { // warm up, verify, then time every kernel; returns the number of mismatches found
+    byte[] a1 = new byte[ARRLEN];
+    byte[] a2 = new byte[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) { // ITERS calls of each kernel before any result is checked
+      test_ci(a1);
+      test_vi(a2, (byte)123);
+      test_cp(a1, a2);
+      test_2ci(a1, a2);
+      test_2vi(a1, a2, (byte)123, (byte)103);
+      test_ci_neg(a1);
+      test_vi_neg(a2, (byte)123);
+      test_cp_neg(a1, a2);
+      test_2ci_neg(a1, a2);
+      test_2vi_neg(a1, a2, (byte)123, (byte)103);
+      test_ci_oppos(a1);
+      test_vi_oppos(a2, (byte)123);
+      test_cp_oppos(a1, a2);
+      test_2ci_oppos(a1, a2);
+      test_2vi_oppos(a1, a2, (byte)123, (byte)103);
+      test_ci_off(a1);
+      test_vi_off(a2, (byte)123);
+      test_cp_off(a1, a2);
+      test_2ci_off(a1, a2);
+      test_2vi_off(a1, a2, (byte)123, (byte)103);
+      test_ci_inv(a1, OFFSET);
+      test_vi_inv(a2, (byte)123, OFFSET);
+      test_cp_inv(a1, a2, OFFSET);
+      test_2ci_inv(a1, a2, OFFSET);
+      test_2vi_inv(a1, a2, (byte)123, (byte)103, OFFSET);
+      test_ci_scl(a1);
+      test_vi_scl(a2, (byte)123);
+      test_cp_scl(a1, a2);
+      test_2ci_scl(a1, a2);
+      test_2vi_scl(a1, a2, (byte)123, (byte)103);
+      test_cp_alndst(a1, a2);
+      test_cp_alnsrc(a1, a2);
+      test_2ci_aln(a1, a2);
+      test_2vi_aln(a1, a2, (byte)123, (byte)103);
+      test_cp_unalndst(a1, a2);
+      test_cp_unalnsrc(a1, a2);
+      test_2ci_unaln(a1, a2);
+      test_2vi_unaln(a1, a2, (byte)123, (byte)103);
+    }
+    // Initialize: fill both arrays with the -1 sentinel
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (byte)-123);
+      }
+      test_vi(a2, (byte)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (byte)123);
+      }
+      test_cp(a1, a2); // a2 was just filled with 123
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (byte)123);
+      }
+      test_2ci(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci: a1", i, a1[i], (byte)-123);
+        errn += verify("test_2ci: a2", i, a2[i], (byte)-103);
+      }
+      test_2vi(a1, a2, (byte)123, (byte)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi: a1", i, a1[i], (byte)123);
+        errn += verify("test_2vi: a2", i, a2[i], (byte)103);
+      }
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_neg(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (byte)-123);
+      }
+      test_vi_neg(a2, (byte)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (byte)123);
+      }
+      test_cp_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (byte)123);
+      }
+      test_2ci_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_neg: a1", i, a1[i], (byte)-123);
+        errn += verify("test_2ci_neg: a2", i, a2[i], (byte)-103);
+      }
+      test_2vi_neg(a1, a2, (byte)123, (byte)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_neg: a1", i, a1[i], (byte)123);
+        errn += verify("test_2vi_neg: a2", i, a2[i], (byte)103);
+      }
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_oppos(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (byte)-123);
+      }
+      test_vi_oppos(a2, (byte)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (byte)123);
+      }
+      test_cp_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (byte)123);
+      }
+      test_2ci_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_oppos: a1", i, a1[i], (byte)-123);
+        errn += verify("test_2ci_oppos: a2", i, a2[i], (byte)-103);
+      }
+      test_2vi_oppos(a1, a2, (byte)123, (byte)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_oppos: a1", i, a1[i], (byte)123);
+        errn += verify("test_2vi_oppos: a2", i, a2[i], (byte)103);
+      }
+      // Reset for indexing with offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_off(a1);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_off: a1", i, a1[i], (byte)-123);
+      }
+      test_vi_off(a2, (byte)123);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_off: a2", i, a2[i], (byte)123);
+      }
+      test_cp_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_off: a1", i, a1[i], (byte)123);
+      }
+      test_2ci_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_off: a1", i, a1[i], (byte)-123);
+        errn += verify("test_2ci_off: a2", i, a2[i], (byte)-103);
+      }
+      test_2vi_off(a1, a2, (byte)123, (byte)103);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], (byte)123);
+        errn += verify("test_2vi_off: a2", i, a2[i], (byte)103);
+      }
+      for (int i=0; i<OFFSET; i++) { // elements below OFFSET must still hold the -1 sentinel
+        errn += verify("test_2vi_off: a1", i, a1[i], (byte)-1);
+        errn += verify("test_2vi_off: a2", i, a2[i], (byte)-1);
+      }
+      // Reset for indexing with invariant offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_inv(a1, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_inv: a1", i, a1[i], (byte)-123);
+      }
+      test_vi_inv(a2, (byte)123, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_inv: a2", i, a2[i], (byte)123);
+      }
+      test_cp_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_inv: a1", i, a1[i], (byte)123);
+      }
+      test_2ci_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_inv: a1", i, a1[i], (byte)-123);
+        errn += verify("test_2ci_inv: a2", i, a2[i], (byte)-103);
+      }
+      test_2vi_inv(a1, a2, (byte)123, (byte)103, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], (byte)123);
+        errn += verify("test_2vi_inv: a2", i, a2[i], (byte)103);
+      }
+      for (int i=0; i<OFFSET; i++) { // elements below OFFSET must still hold the -1 sentinel
+        errn += verify("test_2vi_inv: a1", i, a1[i], (byte)-1);
+        errn += verify("test_2vi_inv: a2", i, a2[i], (byte)-1);
+      }
+      // Reset for indexing with scale
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_scl(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : -123; // only indices that are multiples of SCALE are written
+        errn += verify("test_ci_scl: a1", i, a1[i], (byte)val);
+      }
+      test_vi_scl(a2, (byte)123);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_vi_scl: a2", i, a2[i], (byte)val);
+      }
+      test_cp_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_cp_scl: a1", i, a1[i], (byte)val);
+      }
+      test_2ci_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a1", i, a1[i], (byte)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a1", i*SCALE, a1[i*SCALE], (byte)-123);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a2", i, a2[i], (byte)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a2", i*SCALE, a2[i*SCALE], (byte)-103);
+        }
+      }
+      test_2vi_scl(a1, a2, (byte)123, (byte)103);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a1", i, a1[i], (byte)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a1", i*SCALE, a1[i*SCALE], (byte)123);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a2", i, a2[i], (byte)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a2", i*SCALE, a2[i*SCALE], (byte)103);
+        }
+      }
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, (byte)123); // give the copy source a known value
+      test_cp_alndst(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (byte)123);
+      }
+      test_vi(a2, (byte)-123);
+      test_cp_alnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) { // tail not written by alnsrc still holds 123 from the alndst copy
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (byte)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_aln(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], (byte)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], (byte)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_aln(a1, a2, (byte)123, (byte)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], (byte)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, (byte)123); // give the copy source a known value
+      test_cp_unalndst(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (byte)123);
+      }
+      test_vi(a2, (byte)-123);
+      test_cp_unalnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) { // tail not written by unalnsrc still holds 123 from the unalndst copy
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (byte)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_unaln(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], (byte)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], (byte)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_unaln(a1, a2, (byte)123, (byte)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], (byte)103);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (byte)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_alndst(a1, a1); // same array passed as both destination and source
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (byte)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1); // same array passed as both destination and source
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (byte)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_aln(a1, a1);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], (byte)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_aln(a1, a1, (byte)123, (byte)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], (byte)103);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (byte)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1); // same array passed as both destination and source
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (byte)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1); // same array passed as both destination and source
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (byte)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_unaln(a1, a1);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], (byte)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], (byte)-123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_unaln(a1, a1, (byte)123, (byte)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], (byte)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], (byte)103);
+      }
+
+    }
+
+    if (errn > 0) // skip the timing section when verification found mismatches
+      return errn;
+
+    System.out.println("Time"); // each line printed below is elapsed milliseconds for ITERS calls of one kernel
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, (byte)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi(a1, a2, (byte)123, (byte)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a2, (byte)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_neg(a1, a2, (byte)123, (byte)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_neg: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a2, (byte)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_oppos(a1, a2, (byte)123, (byte)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_oppos: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_off(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_off(a2, (byte)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_off(a1, a2, (byte)123, (byte)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_off: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_inv(a1, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_inv(a2, (byte)123, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_inv(a1, a2, (byte)123, (byte)103, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_inv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_scl(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_scl(a2, (byte)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_scl(a1, a2, (byte)123, (byte)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_scl: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_aln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_aln(a1, a2, (byte)123, (byte)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_aln: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_unaln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_unaln(a1, a2, (byte)123, (byte)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_unaln: " + (end - start));
+
+    return errn; // errn is 0 here; a non-zero count was already returned above
+  }
+
+  static void test_ci(byte[] a) { // store the constant -123 into every element of a, ascending index
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = -123;
+    }
+  }
+  static void test_vi(byte[] a, byte b) { // store the argument b into every element of a, ascending index
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b;
+    }
+  }
+  static void test_cp(byte[] a, byte[] b) { // element-wise copy from b into a, ascending index
+    for (int i = 0; i < a.length; i+=1) { // bound is derived from a.length only
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci(byte[] a, byte[] b) { // two constant stores per iteration: -123 into a, -103 into b
+    for (int i = 0; i < a.length; i+=1) { // bound is derived from a.length only
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi(byte[] a, byte[] b, byte c, byte d) { // two argument stores per iteration: c into a, d into b
+    for (int i = 0; i < a.length; i+=1) { // bound is derived from a.length only
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_neg(byte[] a) { // store the constant -123 into every element of a, index counting down
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = -123;
+    }
+  }
+  static void test_vi_neg(byte[] a, byte b) { // store the argument b into every element of a, index counting down
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = b;
+    }
+  }
+  static void test_cp_neg(byte[] a, byte[] b) { // element-wise copy from b into a, index counting down
+    for (int i = a.length-1; i >= 0; i-=1) { // start index is derived from a.length only
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci_neg(byte[] a, byte[] b) { // two constant stores per iteration (-123 into a, -103 into b), index counting down
+    for (int i = a.length-1; i >= 0; i-=1) { // start index is derived from a.length only
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_neg(byte[] a, byte[] b, byte c, byte d) { // two argument stores per iteration (c into a, d into b), index counting down
+    for (int i = a.length-1; i >= 0; i-=1) { // start index is derived from a.length only
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_oppos(byte[] a) { // constant fill where the loop counter ascends but the store index descends
+    int limit = a.length-1; // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;
+    }
+  }
+  static void test_vi_oppos(byte[] a, byte b) { // fill with argument b where the loop counter descends but the store index ascends
+    int limit = a.length-1; // last valid index of a
+    for (int i = limit; i >= 0; i-=1) {
+      a[limit-i] = b;
+    }
+  }
+  static void test_cp_oppos(byte[] a, byte[] b) { // copy into a ascending while reading b at the mirrored index limit-i
+    int limit = a.length-1; // last valid index of a; also used to index b
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];
+    }
+  }
+  static void test_2ci_oppos(byte[] a, byte[] b) { // constant stores in opposite directions: a at limit-i, b at i
+    int limit = a.length-1; // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_oppos(byte[] a, byte[] b, byte c, byte d) { // argument stores in opposite directions: a at i (descending), b at limit-i
+    int limit = a.length-1; // last valid index of a; also used to index b
+    for (int i = limit; i >= 0; i-=1) {
+      a[i] = c;
+      b[limit-i] = d;
+    }
+  }
+  static void test_ci_off(byte[] a) { // constant fill from index OFFSET to the end; the first OFFSET elements are not written
+    for (int i = 0; i < a.length-OFFSET; i+=1) {
+      a[i+OFFSET] = -123;
+    }
+  }
+  static void test_vi_off(byte[] a, byte b) { // fill with argument b from index OFFSET to the end; the first OFFSET elements are not written
+    for (int i = 0; i < a.length-OFFSET; i+=1) {
+      a[i+OFFSET] = b;
+    }
+  }
+  static void test_cp_off(byte[] a, byte[] b) { // copy b into a with both indices shifted up by the constant OFFSET
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound is derived from a.length only
+      a[i+OFFSET] = b[i+OFFSET];
+    }
+  }
+  static void test_2ci_off(byte[] a, byte[] b) { // two constant stores per iteration, both indices shifted up by the constant OFFSET
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound is derived from a.length only
+      a[i+OFFSET] = -123;
+      b[i+OFFSET] = -103;
+    }
+  }
+  static void test_2vi_off(byte[] a, byte[] b, byte c, byte d) { // two argument stores per iteration, both indices shifted up by the constant OFFSET
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound is derived from a.length only
+      a[i+OFFSET] = c;
+      b[i+OFFSET] = d;
+    }
+  }
+  static void test_ci_inv(byte[] a, int k) { // constant fill from index k to the end; k is an argument rather than a compile-time constant
+    for (int i = 0; i < a.length-k; i+=1) {
+      a[i+k] = -123;
+    }
+  }
+  static void test_vi_inv(byte[] a, byte b, int k) { // fill with argument b from index k to the end; k is an argument rather than a compile-time constant
+    for (int i = 0; i < a.length-k; i+=1) {
+      a[i+k] = b;
+    }
+  }
+  static void test_cp_inv(byte[] a, byte[] b, int k) { // copy b into a with both indices shifted up by the argument k
+    for (int i = 0; i < a.length-k; i+=1) { // bound is derived from a.length only
+      a[i+k] = b[i+k];
+    }
+  }
+  static void test_2ci_inv(byte[] a, byte[] b, int k) { // two constant stores per iteration, both indices shifted up by the argument k
+    for (int i = 0; i < a.length-k; i+=1) { // bound is derived from a.length only
+      a[i+k] = -123;
+      b[i+k] = -103;
+    }
+  }
+  static void test_2vi_inv(byte[] a, byte[] b, byte c, byte d, int k) { // two argument stores per iteration, both indices shifted up by the argument k
+    for (int i = 0; i < a.length-k; i+=1) { // bound is derived from a.length only
+      a[i+k] = c;
+      b[i+k] = d;
+    }
+  }
+  static void test_ci_scl(byte[] a) { // constant store into every index that is a multiple of SCALE; other elements are not written
+    for (int i = 0; i*SCALE < a.length; i+=1) {
+      a[i*SCALE] = -123;
+    }
+  }
+  static void test_vi_scl(byte[] a, byte b) { // store argument b into every index that is a multiple of SCALE; other elements are not written
+    for (int i = 0; i*SCALE < a.length; i+=1) {
+      a[i*SCALE] = b;
+    }
+  }
+  static void test_cp_scl(byte[] a, byte[] b) { // copy b into a only at indices that are multiples of SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) { // bound is derived from a.length only
+      a[i*SCALE] = b[i*SCALE];
+    }
+  }
+  static void test_2ci_scl(byte[] a, byte[] b) { // two constant stores per iteration, only at indices that are multiples of SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) { // bound is derived from a.length only
+      a[i*SCALE] = -123;
+      b[i*SCALE] = -103;
+    }
+  }
+  static void test_2vi_scl(byte[] a, byte[] b, byte c, byte d) { // two argument stores per iteration, only at indices that are multiples of SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) { // bound is derived from a.length only
+      a[i*SCALE] = c;
+      b[i*SCALE] = d;
+    }
+  }
+  static void test_cp_alndst(byte[] a, byte[] b) { // copy b into a with the destination index shifted up by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // when a and b are the same array, each store feeds a later load
+      a[i+ALIGN_OFF] = b[i];
+    }
+  }
+  static void test_cp_alnsrc(byte[] a, byte[] b) { // copy b into a with the source index shifted up by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // the last ALIGN_OFF elements of a are not written
+      a[i] = b[i+ALIGN_OFF];
+    }
+  }
+  static void test_2ci_aln(byte[] a, byte[] b) { // constant stores: a shifted up by ALIGN_OFF, b unshifted
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound is derived from a.length only
+      a[i+ALIGN_OFF] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_aln(byte[] a, byte[] b, byte c, byte d) { // argument stores: a unshifted gets c, b shifted up by ALIGN_OFF gets d
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound is derived from a.length only
+      a[i] = c;
+      b[i+ALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_unalndst(byte[] a, byte[] b) { // copy b into a with the destination index shifted up by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // when a and b are the same array, each store feeds a later load
+      a[i+UNALIGN_OFF] = b[i];
+    }
+  }
+  static void test_cp_unalnsrc(byte[] a, byte[] b) { // copy b into a with the source index shifted up by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // the last UNALIGN_OFF elements of a are not written
+      a[i] = b[i+UNALIGN_OFF];
+    }
+  }
+  static void test_2ci_unaln(byte[] a, byte[] b) { // constant stores: a shifted up by UNALIGN_OFF, b unshifted
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound is derived from a.length only
+      a[i+UNALIGN_OFF] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_unaln(byte[] a, byte[] b, byte c, byte d) { // argument stores: a unshifted gets c, b shifted up by UNALIGN_OFF gets d
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound is derived from a.length only
+      a[i] = c;
+      b[i+UNALIGN_OFF] = d;
+    }
+  }
+
+  static int verify(String text, int i, byte elem, byte val) { // 0 when elem matches val, otherwise logs and yields 1
+    if (elem == val) {
+      return 0;
+    }
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:27 2012
+++ new/test/compiler/7119644/TestDoubleVect.java	Sat Jun  2 20:04:27 2012
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestDoubleVect
+ */
+
+public class TestDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  public static void main(String args[]) {
+    // Runs the whole suite once; any mismatch ends the JVM with exit status 97.
+    System.out.println("Testing Double vectors");
+    final int failures = test();
+    final boolean failed = failures > 0;
+    if (failed) System.err.println("FAILED: " + failures + " errors");
+    if (failed) System.exit(97);
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    double[] a1 = new double[ARRLEN];
+    double[] a2 = new double[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+      test_vi(a2, 123.);
+      test_cp(a1, a2);
+      test_2ci(a1, a2);
+      test_2vi(a1, a2, 123., 103.);
+      test_ci_neg(a1);
+      test_vi_neg(a2, 123.);
+      test_cp_neg(a1, a2);
+      test_2ci_neg(a1, a2);
+      test_2vi_neg(a1, a2, 123., 103.);
+      test_ci_oppos(a1);
+      test_vi_oppos(a2, 123.);
+      test_cp_oppos(a1, a2);
+      test_2ci_oppos(a1, a2);
+      test_2vi_oppos(a1, a2, 123., 103.);
+      test_ci_off(a1);
+      test_vi_off(a2, 123.);
+      test_cp_off(a1, a2);
+      test_2ci_off(a1, a2);
+      test_2vi_off(a1, a2, 123., 103.);
+      test_ci_inv(a1, OFFSET);
+      test_vi_inv(a2, 123., OFFSET);
+      test_cp_inv(a1, a2, OFFSET);
+      test_2ci_inv(a1, a2, OFFSET);
+      test_2vi_inv(a1, a2, 123., 103., OFFSET);
+      test_ci_scl(a1);
+      test_vi_scl(a2, 123.);
+      test_cp_scl(a1, a2);
+      test_2ci_scl(a1, a2);
+      test_2vi_scl(a1, a2, 123., 103.);
+      test_cp_alndst(a1, a2);
+      test_cp_alnsrc(a1, a2);
+      test_2ci_aln(a1, a2);
+      test_2vi_aln(a1, a2, 123., 103.);
+      test_cp_unalndst(a1, a2);
+      test_cp_unalnsrc(a1, a2);
+      test_2ci_unaln(a1, a2);
+      test_2vi_unaln(a1, a2, 123., 103.);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], -123.);
+      }
+      test_vi(a2, 123.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], 123.);
+      }
+      test_cp(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], 123.);
+      }
+      test_2ci(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci: a1", i, a1[i], -123.);
+        errn += verify("test_2ci: a2", i, a2[i], -103.);
+      }
+      test_2vi(a1, a2, 123., 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi: a1", i, a1[i], 123.);
+        errn += verify("test_2vi: a2", i, a2[i], 103.);
+      }
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_neg(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], -123.);
+      }
+      test_vi_neg(a2, 123.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], 123.);
+      }
+      test_cp_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], 123.);
+      }
+      test_2ci_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_neg: a1", i, a1[i], -123.);
+        errn += verify("test_2ci_neg: a2", i, a2[i], -103.);
+      }
+      test_2vi_neg(a1, a2, 123., 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_neg: a1", i, a1[i], 123.);
+        errn += verify("test_2vi_neg: a2", i, a2[i], 103.);
+      }
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_oppos(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], -123.);
+      }
+      test_vi_oppos(a2, 123.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], 123.);
+      }
+      test_cp_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], 123.);
+      }
+      test_2ci_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_oppos: a1", i, a1[i], -123.);
+        errn += verify("test_2ci_oppos: a2", i, a2[i], -103.);
+      }
+      test_2vi_oppos(a1, a2, 123., 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_oppos: a1", i, a1[i], 123.);
+        errn += verify("test_2vi_oppos: a2", i, a2[i], 103.);
+      }
+      // Reset for indexing with offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_off(a1);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_off: a1", i, a1[i], -123.);
+      }
+      test_vi_off(a2, 123.);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_off: a2", i, a2[i], 123.);
+      }
+      test_cp_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_off: a1", i, a1[i], 123.);
+      }
+      test_2ci_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_off: a1", i, a1[i], -123.);
+        errn += verify("test_2ci_off: a2", i, a2[i], -103.);
+      }
+      test_2vi_off(a1, a2, 123., 103.);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], 123.);
+        errn += verify("test_2vi_off: a2", i, a2[i], 103.);
+      }
+      for (int i=0; i<OFFSET; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], -1.);
+        errn += verify("test_2vi_off: a2", i, a2[i], -1.);
+      }
+      // Reset for indexing with invariant offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_inv(a1, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_inv: a1", i, a1[i], -123.);
+      }
+      test_vi_inv(a2, 123., OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_inv: a2", i, a2[i], 123.);
+      }
+      test_cp_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_inv: a1", i, a1[i], 123.);
+      }
+      test_2ci_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_inv: a1", i, a1[i], -123.);
+        errn += verify("test_2ci_inv: a2", i, a2[i], -103.);
+      }
+      test_2vi_inv(a1, a2, 123., 103., OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], 123.);
+        errn += verify("test_2vi_inv: a2", i, a2[i], 103.);
+      }
+      for (int i=0; i<OFFSET; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], -1.);
+        errn += verify("test_2vi_inv: a2", i, a2[i], -1.);
+      }
+      // Reset for indexing with scale
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_scl(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : -123;
+        errn += verify("test_ci_scl: a1", i, a1[i], (double)val);
+      }
+      test_vi_scl(a2, 123.);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_vi_scl: a2", i, a2[i], (double)val);
+      }
+      test_cp_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_cp_scl: a1", i, a1[i], (double)val);
+      }
+      test_2ci_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a1", i, a1[i], -1.);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a1", i*SCALE, a1[i*SCALE], -123.);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a2", i, a2[i], -1.);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a2", i*SCALE, a2[i*SCALE], -103.);
+        }
+      }
+      test_2vi_scl(a1, a2, 123., 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a1", i, a1[i], -1.);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a1", i*SCALE, a1[i*SCALE], 123.);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a2", i, a2[i], -1.);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a2", i*SCALE, a2[i*SCALE], 103.);
+        }
+      }
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, 123.);
+      test_cp_alndst(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], 123.);
+      }
+      test_vi(a2, -123.);
+      test_cp_alnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], -123.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_aln(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], -123.);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], -103.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], -1.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_aln(a1, a2, 123., 103.);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], 123.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], -1.);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, 123.);
+      test_cp_unalndst(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], 123.);
+      }
+      test_vi(a2, -123.);
+      test_cp_unalnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], -123.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_unaln(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], -123.);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], -103.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], -1.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_unaln(a1, a2, 123., 103.);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], 123.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], -1.);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], 103.);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (double)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_alndst(a1, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (double)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (double)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_aln(a1, a1);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], -103.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], -123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_aln(a1, a1, 123., 103.);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], 123.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], 103.);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (double)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (double)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (double)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_unaln(a1, a1);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], -103.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], -123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_unaln(a1, a1, 123., 103.);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], 123.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], 103.);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, 123.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi(a1, a2, 123., 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a2, 123.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_neg(a1, a2, 123., 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_neg: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a2, 123.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_oppos(a1, a2, 123., 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_oppos: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_off(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_off(a2, 123.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_off(a1, a2, 123., 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_off: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_inv(a1, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_inv(a2, 123., OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_inv(a1, a2, 123., 103., OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_inv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_scl(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_scl(a2, 123.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_scl(a1, a2, 123., 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_scl: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_aln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_aln(a1, a2, 123., 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_aln: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_unaln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_unaln(a1, a2, 123., 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_unaln: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_ci(double[] a) { // fills every element of a with the constant -123.
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = -123.;
+    }
+  }
+  static void test_vi(double[] a, double b) { // fills every element of a with the argument b
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b;
+    }
+  }
+  static void test_cp(double[] a, double[] b) { // copies b[i] into a[i] for every index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci(double[] a, double[] b) { // two constant stores per iteration: -123. into a, -103. into b
+    for (int i = 0; i < a.length; i+=1) { // the bound is a.length for both arrays
+      a[i] = -123.;
+      b[i] = -103.;
+    }
+  }
+  static void test_2vi(double[] a, double[] b, double c, double d) { // two argument stores per iteration: c into a, d into b
+    for (int i = 0; i < a.length; i+=1) { // the bound is a.length for both arrays
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_neg(double[] a) { // constant fill with a loop that counts down
+    for (int i = a.length-1; i >= 0; i-=1) { // walks from the last element to the first
+      a[i] = -123.;
+    }
+  }
+  static void test_vi_neg(double[] a, double b) { // fill with the argument b using a loop that counts down
+    for (int i = a.length-1; i >= 0; i-=1) { // walks from the last element to the first
+      a[i] = b;
+    }
+  }
+  static void test_cp_neg(double[] a, double[] b) { // copies b[i] into a[i] with a loop that counts down
+    for (int i = a.length-1; i >= 0; i-=1) { // walks from the last element to the first
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci_neg(double[] a, double[] b) { // two constant stores per iteration, loop counts down
+    for (int i = a.length-1; i >= 0; i-=1) { // walks from the last element to the first
+      a[i] = -123.;
+      b[i] = -103.;
+    }
+  }
+  static void test_2vi_neg(double[] a, double[] b, double c, double d) { // two argument stores per iteration, loop counts down
+    for (int i = a.length-1; i >= 0; i-=1) { // walks from the last element to the first
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_oppos(double[] a) { // constant fill where the index runs opposite to the loop counter
+    int limit = a.length-1;
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123.; // i counts up, so the store moves from the last element down to the first
+    }
+  }
+  static void test_vi_oppos(double[] a, double b) { // fill with the argument b where the index runs opposite to the loop counter
+    int limit = a.length-1;
+    for (int i = limit; i >= 0; i-=1) {
+      a[limit-i] = b; // i counts down, so the store moves from the first element up to the last
+    }
+  }
+  static void test_cp_oppos(double[] a, double[] b) { // copy that writes a in ascending order while reading b in descending order
+    int limit = a.length-1; // derived from a.length and used to index b
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];
+    }
+  }
+  static void test_2ci_oppos(double[] a, double[] b) { // two constant stores per iteration that move in opposite directions
+    int limit = a.length-1;
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123.; // a is written from its last element down
+      b[i] = -103.;       // b is written from its first element up
+    }
+  }
+  static void test_2vi_oppos(double[] a, double[] b, double c, double d) { // two argument stores per iteration that move in opposite directions
+    int limit = a.length-1;
+    for (int i = limit; i >= 0; i-=1) {
+      a[i] = c;       // a is written from its last element down
+      b[limit-i] = d; // b is written from its first element up
+    }
+  }
+  static void test_ci_off(double[] a) { // constant fill starting at index OFFSET; the first OFFSET elements are not written
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // stops early so that i+OFFSET stays below a.length
+      a[i+OFFSET] = -123.;
+    }
+  }
+  static void test_vi_off(double[] a, double b) { // fill with the argument b starting at index OFFSET
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // stops early so that i+OFFSET stays below a.length
+      a[i+OFFSET] = b;
+    }
+  }
+  static void test_cp_off(double[] a, double[] b) { // copy with the same constant shift OFFSET on source and destination
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // stops early so that i+OFFSET stays below a.length
+      a[i+OFFSET] = b[i+OFFSET];
+    }
+  }
+  static void test_2ci_off(double[] a, double[] b) { // two constant stores per iteration, both shifted by OFFSET
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // stops early so that i+OFFSET stays below a.length
+      a[i+OFFSET] = -123.;
+      b[i+OFFSET] = -103.;
+    }
+  }
+  static void test_2vi_off(double[] a, double[] b, double c, double d) { // two argument stores per iteration, both shifted by OFFSET
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // stops early so that i+OFFSET stays below a.length
+      a[i+OFFSET] = c;
+      b[i+OFFSET] = d;
+    }
+  }
+  static void test_ci_inv(double[] a, int k) { // constant fill shifted by k, which is an argument rather than a constant
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop
+      a[i+k] = -123.;
+    }
+  }
+  static void test_vi_inv(double[] a, double b, int k) { // fill with the argument b, index shifted by the argument k
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop
+      a[i+k] = b;
+    }
+  }
+  static void test_cp_inv(double[] a, double[] b, int k) { // copy with the same shift k on source and destination
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop
+      a[i+k] = b[i+k];
+    }
+  }
+  static void test_2ci_inv(double[] a, double[] b, int k) { // two constant stores per iteration, both shifted by the argument k
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop
+      a[i+k] = -123.;
+      b[i+k] = -103.;
+    }
+  }
+  static void test_2vi_inv(double[] a, double[] b, double c, double d, int k) { // two argument stores per iteration, both shifted by the argument k
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop
+      a[i+k] = c;
+      b[i+k] = d;
+    }
+  }
+  static void test_ci_scl(double[] a) { // constant store at every index that is a multiple of SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) { // the scaled index, not i, is compared with a.length
+      a[i*SCALE] = -123.;
+    }
+  }
+  static void test_vi_scl(double[] a, double b) { // stores the argument b at every index that is a multiple of SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) { // the scaled index, not i, is compared with a.length
+      a[i*SCALE] = b;
+    }
+  }
+  static void test_cp_scl(double[] a, double[] b) { // copies b to a at every index that is a multiple of SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) { // the scaled index, not i, is compared with a.length
+      a[i*SCALE] = b[i*SCALE];
+    }
+  }
+  static void test_2ci_scl(double[] a, double[] b) { // two constant stores per iteration at indices scaled by SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) { // the scaled index, not i, is compared with a.length
+      a[i*SCALE] = -123.;
+      b[i*SCALE] = -103.;
+    }
+  }
+  static void test_2vi_scl(double[] a, double[] b, double c, double d) { // two argument stores per iteration at indices scaled by SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) { // the scaled index, not i, is compared with a.length
+      a[i*SCALE] = c;
+      b[i*SCALE] = d;
+    }
+  }
+  static void test_cp_alndst(double[] a, double[] b) { // copy b[i] -> a[i+ALIGN_OFF]: the destination index is shifted
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // stops early so that i+ALIGN_OFF stays below a.length
+      a[i+ALIGN_OFF] = b[i]; // test() also calls this with a == b, so order of execution matters
+    }
+  }
+  static void test_cp_alnsrc(double[] a, double[] b) { // copy b[i+ALIGN_OFF] -> a[i]: the source index is shifted
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // the last ALIGN_OFF elements of a are not written
+      a[i] = b[i+ALIGN_OFF]; // test() also calls this with a == b, so order of execution matters
+    }
+  }
+  static void test_2ci_aln(double[] a, double[] b) { // constant stores: a is written at i+ALIGN_OFF, b at i
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i+ALIGN_OFF] = -123.; // when a == b (as test() does), the later b[i] store overwrites this value
+      b[i] = -103.;
+    }
+  }
+  static void test_2vi_aln(double[] a, double[] b, double c, double d) { // argument stores: a is written at i, b at i+ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i] = c; // when a == b (as test() does), this overwrites the d stored ALIGN_OFF iterations earlier
+      b[i+ALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_unalndst(double[] a, double[] b) { // copy b[i] -> a[i+UNALIGN_OFF]: the destination index is shifted
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // stops early so that i+UNALIGN_OFF stays below a.length
+      a[i+UNALIGN_OFF] = b[i]; // test() also calls this with a == b, so order of execution matters
+    }
+  }
+  static void test_cp_unalnsrc(double[] a, double[] b) { // copy b[i+UNALIGN_OFF] -> a[i]: the source index is shifted
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // the last UNALIGN_OFF elements of a are not written
+      a[i] = b[i+UNALIGN_OFF]; // test() also calls this with a == b, so order of execution matters
+    }
+  }
+  static void test_2ci_unaln(double[] a, double[] b) { // constant stores: a is written at i+UNALIGN_OFF, b at i
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i+UNALIGN_OFF] = -123.; // when a == b (as test() does), the later b[i] store overwrites this value
+      b[i] = -103.;
+    }
+  }
+  static void test_2vi_unaln(double[] a, double[] b, double c, double d) { // argument stores: a is written at i, b at i+UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i] = c; // when a == b (as test() does), this overwrites the d stored UNALIGN_OFF iterations earlier
+      b[i+UNALIGN_OFF] = d;
+    }
+  }
+
+  static int verify(String text, int i, double elem, double val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:27 2012
+++ new/test/compiler/7119644/TestFloatDoubleVect.java	Sat Jun  2 20:04:27 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestFloatDoubleVect
+ */
+
+public class TestFloatDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  public static void main(String args[]) { // Entry point: runs test() and reports the outcome; args is not used.
+    System.out.println("Testing Float + Double vectors");
+    int errn = test(); // number of mismatching elements found during verification
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97); // non-zero exit status signals failure
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() { // Warms up every kernel, verifies their results element by element, then times them; returns the mismatch count.
+    float[] a1 = new float[ARRLEN];
+    float[] a2 = new float[ARRLEN];
+    double[] b1 = new double[ARRLEN];
+    double[] b2 = new double[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) { // presumably enough calls for each kernel to be JIT-compiled before results are checked
+      test_ci(a1, b1);
+      test_vi(a2, b2, 123.f, 103.);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, 123.f, 103.);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, 123.f, 103.);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, 123.f, 103.);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, 123.f, 103.);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) { // -1 marks an element that no kernel has written yet
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.;
+      b2[i] = -1.;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0; // incremented by one for every mismatching element
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], -123.f);
+        errn += verify("test_ci: b1", i, b1[i], -103.);
+      }
+      test_vi(a2, b2, 123.f, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], 123.f);
+        errn += verify("test_vi: b2", i, b2[i], 103.);
+      }
+      test_cp(a1, a2, b1, b2); // copies the values test_vi just stored in a2/b2
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], 123.f);
+        errn += verify("test_cp: b1", i, b1[i], 103.);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], -123.f);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.);
+      }
+      test_vi_neg(a2, b2, 123.f, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], 123.f);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], 123.f);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], -123.f);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.);
+      }
+      test_vi_oppos(a2, b2, 123.f, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], 123.f);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], 123.f);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_alndst(a1, a2, b1, b2); // destination shifted: the first ALIGN_OFF elements must stay at -1
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], -1.f);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], 123.f);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2); // source shifted: the last ALIGN_OFF elements keep the 123 from the previous step
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], -123.f);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], 123.f);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_ci_aln(a1, b1); // a1 is written from ALIGN_OFF upward, b1 from 0 up to ARRLEN-ALIGN_OFF
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], -123.f);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_vi_aln(a1, b1, 123.f, 103.); // mirror of the case above: here b1 carries the ALIGN_OFF shift
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], 123.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], -1.f);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_unalndst(a1, a2, b1, b2); // destination shifted: the first UNALIGN_OFF elements must stay at -1
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], -1.f);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], 123.f);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2); // source shifted: the last UNALIGN_OFF elements keep the 123 from the previous step
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], -123.f);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], 123.f);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1); // a1 is written from UNALIGN_OFF upward, b1 from 0 up to ARRLEN-UNALIGN_OFF
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], -123.f);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, 123.f, 103.); // mirror of the case above: here b1 carries the UNALIGN_OFF shift
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], 123.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], -1.f);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (float)i;
+        b1[i] = (double)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_alndst(a1, a1, b1, b1); // source and destination are the same array: the first ALIGN_OFF values repeat through the rest
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (float)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1); // same-array copy toward lower indices: element i takes the value from i+ALIGN_OFF
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], -1.f);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (float)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (float)i;
+        b1[i] = (double)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_unalndst(a1, a1, b1, b1); // source and destination are the same array: the first UNALIGN_OFF values repeat through the rest
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (float)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1); // same-array copy toward lower indices: element i takes the value from i+UNALIGN_OFF
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], -1.f);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (float)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+    }
+
+    if (errn > 0) // skip the timing runs when verification already failed
+      return errn;
+
+    System.out.println("Time"); // timing phase: elapsed milliseconds per ITERS calls are printed, not checked
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, 123.f, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, 123.f, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, 123.f, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, 123.f, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, 123.f, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn; // always 0 here: a non-zero count returned before the timing phase
+  }
+
+  static void test_ci(float[] a, double[] b) { // Constant init: stores -123 into the float array and -103 into the double array.
+    for (int i = 0; i < a.length; i+=1) { // forward unit-stride loop bounded by a.length only
+      a[i] = -123.f;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi(float[] a, double[] b, float c, double d) { // Variable init: stores argument c into a and argument d into b.
+    for (int i = 0; i < a.length; i+=1) { // forward unit-stride loop bounded by a.length only
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(float[] a, float[] b, double[] c, double[] d) { // Copy: b into a (float) and d into c (double) in one loop.
+    for (int i = 0; i < a.length; i+=1) { // forward unit-stride loop bounded by a.length only
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(float[] a, double[] b) { // Constant init with a negative stride.
+    for (int i = a.length-1; i >= 0; i-=1) { // walks from the last index of a down to 0
+      a[i] = -123.f;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_neg(float[] a, double[] b, float c, double d) { // Variable init with a negative stride: c goes to a, d goes to b.
+    for (int i = a.length-1; i >= 0; i-=1) { // walks from the last index of a down to 0
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(float[] a, float[] b, double[] c, double[] d) { // Copy with a negative stride: b into a and d into c.
+    for (int i = a.length-1; i >= 0; i-=1) { // walks from the last index of a down to 0
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(float[] a, double[] b) { // Constant init where the two stores move in opposite directions.
+    int limit = a.length-1; // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123.f; // a is filled from its last element towards index 0
+      b[i] = -103.; // b is filled from index 0 upward
+    }
+  }
+  static void test_vi_oppos(float[] a, double[] b, float c, double d) { // Variable init where the two stores move in opposite directions.
+    int limit = a.length-1; // last valid index of a
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c; // a is filled from its last element towards index 0
+      b[limit-i] = d; // b is filled from index 0 upward
+    }
+  }
+  static void test_cp_oppos(float[] a, float[] b, double[] c, double[] d) { // Copy where source and destination indices move in opposite directions.
+    int limit = a.length-1; // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i]; // a is written upward while b is read from its end backwards
+      c[limit-i] = d[i]; // c is written from its end backwards while d is read upward
+    }
+  }
+  static void test_ci_aln(float[] a, double[] b) { // Constant init where the float store index is ALIGN_OFF ahead of the double store index.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // stops ALIGN_OFF early so i+ALIGN_OFF stays below a.length
+      a[i+ALIGN_OFF] = -123.f; // elements of a below ALIGN_OFF are never written
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_aln(float[] a, double[] b, float c, double d) { // Variable init where the double store index is ALIGN_OFF ahead of the float store index.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound uses a.length; the caller in this file passes arrays of equal length
+      a[i] = c;
+      b[i+ALIGN_OFF] = d; // elements of b below ALIGN_OFF are never written
+    }
+  }
+  static void test_cp_alndst(float[] a, float[] b, double[] c, double[] d) { // Copy with both destinations shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // stops ALIGN_OFF early so i+ALIGN_OFF stays below a.length
+      a[i+ALIGN_OFF] = b[i]; // test() also calls this with a == b, making it an overlapping copy
+      c[i+ALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_alnsrc(float[] a, float[] b, double[] c, double[] d) { // Copy with both sources shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // the last ALIGN_OFF elements of a and c are not written
+      a[i] = b[i+ALIGN_OFF]; // test() also calls this with a == b, making it an overlapping copy
+      c[i] = d[i+ALIGN_OFF];
+    }
+  }
+  static void test_ci_unaln(float[] a, double[] b) { // Constant init where the float store index is UNALIGN_OFF ahead of the double store index.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // stops UNALIGN_OFF early so i+UNALIGN_OFF stays below a.length
+      a[i+UNALIGN_OFF] = -123.f; // elements of a below UNALIGN_OFF are never written
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_unaln(float[] a, double[] b, float c, double d) { // Variable init where the double store index is UNALIGN_OFF ahead of the float store index.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound uses a.length; the caller in this file passes arrays of equal length
+      a[i] = c;
+      b[i+UNALIGN_OFF] = d; // elements of b below UNALIGN_OFF are never written
+    }
+  }
+  static void test_cp_unalndst(float[] a, float[] b, double[] c, double[] d) { // Copy with both destinations shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // stops UNALIGN_OFF early so i+UNALIGN_OFF stays below a.length
+      a[i+UNALIGN_OFF] = b[i]; // test() also calls this with a == b, making it an overlapping copy
+      c[i+UNALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_unalnsrc(float[] a, float[] b, double[] c, double[] d) { // Copy with both sources shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // the last UNALIGN_OFF elements of a and c are not written
+      a[i] = b[i+UNALIGN_OFF]; // test() also calls this with a == b, making it an overlapping copy
+      c[i] = d[i+UNALIGN_OFF];
+    }
+  }
+
+  static int verify(String text, int i, float elem, float val) { // Float overload: compares one element with its expected value; returns 1 on mismatch, else 0.
+    if (elem != val) { // exact comparison, no tolerance
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val); // text labels the test and array, i is the element index
+      return 1;
+    }
+    return 0;
+  }
+  static int verify(String text, int i, double elem, double val) { // Double overload: compares one element with its expected value; returns 1 on mismatch, else 0.
+    if (elem != val) { // exact comparison, no tolerance
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val); // text labels the test and array, i is the element index
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:28 2012
+++ new/test/compiler/7119644/TestFloatVect.java	Sat Jun  2 20:04:27 2012
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestFloatVect
+ */
+
+public class TestFloatVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  public static void main(String args[]) { // Entry point: runs test() and reports the outcome; args is not used.
+    System.out.println("Testing Float vectors");
+    int errn = test(); // error count returned by test()
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97); // non-zero exit status signals failure
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    float[] a1 = new float[ARRLEN];
+    float[] a2 = new float[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+      test_vi(a2, 123.f);
+      test_cp(a1, a2);
+      test_2ci(a1, a2);
+      test_2vi(a1, a2, 123.f, 103.f);
+      test_ci_neg(a1);
+      test_vi_neg(a2, 123.f);
+      test_cp_neg(a1, a2);
+      test_2ci_neg(a1, a2);
+      test_2vi_neg(a1, a2, 123.f, 103.f);
+      test_ci_oppos(a1);
+      test_vi_oppos(a2, 123.f);
+      test_cp_oppos(a1, a2);
+      test_2ci_oppos(a1, a2);
+      test_2vi_oppos(a1, a2, 123.f, 103.f);
+      test_ci_off(a1);
+      test_vi_off(a2, 123.f);
+      test_cp_off(a1, a2);
+      test_2ci_off(a1, a2);
+      test_2vi_off(a1, a2, 123.f, 103.f);
+      test_ci_inv(a1, OFFSET);
+      test_vi_inv(a2, 123.f, OFFSET);
+      test_cp_inv(a1, a2, OFFSET);
+      test_2ci_inv(a1, a2, OFFSET);
+      test_2vi_inv(a1, a2, 123.f, 103.f, OFFSET);
+      test_ci_scl(a1);
+      test_vi_scl(a2, 123.f);
+      test_cp_scl(a1, a2);
+      test_2ci_scl(a1, a2);
+      test_2vi_scl(a1, a2, 123.f, 103.f);
+      test_cp_alndst(a1, a2);
+      test_cp_alnsrc(a1, a2);
+      test_2ci_aln(a1, a2);
+      test_2vi_aln(a1, a2, 123.f, 103.f);
+      test_cp_unalndst(a1, a2);
+      test_cp_unalnsrc(a1, a2);
+      test_2ci_unaln(a1, a2);
+      test_2vi_unaln(a1, a2, 123.f, 103.f);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], -123.f);
+      }
+      test_vi(a2, 123.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], 123.f);
+      }
+      test_cp(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], 123.f);
+      }
+      test_2ci(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci: a1", i, a1[i], -123.f);
+        errn += verify("test_2ci: a2", i, a2[i], -103.f);
+      }
+      test_2vi(a1, a2, 123.f, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi: a1", i, a1[i], 123.f);
+        errn += verify("test_2vi: a2", i, a2[i], 103.f);
+      }
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_neg(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], -123.f);
+      }
+      test_vi_neg(a2, 123.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], 123.f);
+      }
+      test_cp_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], 123.f);
+      }
+      test_2ci_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_neg: a1", i, a1[i], -123.f);
+        errn += verify("test_2ci_neg: a2", i, a2[i], -103.f);
+      }
+      test_2vi_neg(a1, a2, 123.f, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_neg: a1", i, a1[i], 123.f);
+        errn += verify("test_2vi_neg: a2", i, a2[i], 103.f);
+      }
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_oppos(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], -123.f);
+      }
+      test_vi_oppos(a2, 123.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], 123.f);
+      }
+      test_cp_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], 123.f);
+      }
+      test_2ci_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_oppos: a1", i, a1[i], -123.f);
+        errn += verify("test_2ci_oppos: a2", i, a2[i], -103.f);
+      }
+      test_2vi_oppos(a1, a2, 123.f, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_oppos: a1", i, a1[i], 123.f);
+        errn += verify("test_2vi_oppos: a2", i, a2[i], 103.f);
+      }
+      // Reset for indexing with offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_off(a1);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_off: a1", i, a1[i], -123.f);
+      }
+      test_vi_off(a2, 123.f);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_off: a2", i, a2[i], 123.f);
+      }
+      test_cp_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_off: a1", i, a1[i], 123.f);
+      }
+      test_2ci_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_off: a1", i, a1[i], -123.f);
+        errn += verify("test_2ci_off: a2", i, a2[i], -103.f);
+      }
+      test_2vi_off(a1, a2, 123.f, 103.f);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], 123.f);
+        errn += verify("test_2vi_off: a2", i, a2[i], 103.f);
+      }
+      for (int i=0; i<OFFSET; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], -1.f);
+        errn += verify("test_2vi_off: a2", i, a2[i], -1.f);
+      }
+      // Reset for indexing with invariant offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_inv(a1, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_inv: a1", i, a1[i], -123.f);
+      }
+      test_vi_inv(a2, 123.f, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_inv: a2", i, a2[i], 123.f);
+      }
+      test_cp_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_inv: a1", i, a1[i], 123.f);
+      }
+      test_2ci_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_inv: a1", i, a1[i], -123.f);
+        errn += verify("test_2ci_inv: a2", i, a2[i], -103.f);
+      }
+      test_2vi_inv(a1, a2, 123.f, 103.f, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], 123.f);
+        errn += verify("test_2vi_inv: a2", i, a2[i], 103.f);
+      }
+      for (int i=0; i<OFFSET; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], -1.f);
+        errn += verify("test_2vi_inv: a2", i, a2[i], -1.f);
+      }
+      // Reset for indexing with scale
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_scl(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : -123;
+        errn += verify("test_ci_scl: a1", i, a1[i], (float)val);
+      }
+      test_vi_scl(a2, 123.f);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_vi_scl: a2", i, a2[i], (float)val);
+      }
+      test_cp_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_cp_scl: a1", i, a1[i], (float)val);
+      }
+      test_2ci_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a1", i, a1[i], -1.f);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a1", i*SCALE, a1[i*SCALE], -123.f);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a2", i, a2[i], -1.f);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a2", i*SCALE, a2[i*SCALE], -103.f);
+        }
+      }
+      test_2vi_scl(a1, a2, 123.f, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a1", i, a1[i], -1.f);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a1", i*SCALE, a1[i*SCALE], 123.f);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a2", i, a2[i], -1.f);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a2", i*SCALE, a2[i*SCALE], 103.f);
+        }
+      }
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, 123.f);
+      test_cp_alndst(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], 123.f);
+      }
+      test_vi(a2, -123.f);
+      test_cp_alnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], -123.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_aln(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], -123.f);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], -103.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], -1.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_aln(a1, a2, 123.f, 103.f);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], 123.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], -1.f);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, 123.f);
+      test_cp_unalndst(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], 123.f);
+      }
+      test_vi(a2, -123.f);
+      test_cp_unalnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], -123.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_unaln(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], -123.f);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], -103.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], -1.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_unaln(a1, a2, 123.f, 103.f);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], 123.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], -1.f);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], 103.f);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (float)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_alndst(a1, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (float)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (float)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_aln(a1, a1);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], -103.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], -123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_aln(a1, a1, 123.f, 103.f);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], 123.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], 103.f);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (float)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (float)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (float)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_unaln(a1, a1);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], -103.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], -123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_unaln(a1, a1, 123.f, 103.f);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], 123.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], 103.f);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, 123.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi(a1, a2, 123.f, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a2, 123.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_neg(a1, a2, 123.f, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_neg: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a2, 123.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_oppos(a1, a2, 123.f, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_oppos: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_off(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_off(a2, 123.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_off(a1, a2, 123.f, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_off: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_inv(a1, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_inv(a2, 123.f, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_inv(a1, a2, 123.f, 103.f, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_inv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_scl(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_scl(a2, 123.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_scl(a1, a2, 123.f, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_scl: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_aln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_aln(a1, a2, 123.f, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_aln: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_unaln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_unaln(a1, a2, 123.f, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_unaln: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_ci(float[] a) { // Kernel: store the constant -123.f into every element of a.
+    for (int i = 0; i < a.length; i+=1) { // ascending index, step 1
+      a[i] = -123.f;
+    }
+  }
+  static void test_vi(float[] a, float b) { // Kernel: store the caller-supplied value b into every element of a.
+    for (int i = 0; i < a.length; i+=1) { // ascending index, step 1
+      a[i] = b;
+    }
+  }
+  static void test_cp(float[] a, float[] b) { // Kernel: copy b[i] into a[i] for every index of a.
+    for (int i = 0; i < a.length; i+=1) { // bound is a.length, so b is read at indices 0..a.length-1
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci(float[] a, float[] b) { // Kernel: fill two arrays with different constants inside one loop.
+    for (int i = 0; i < a.length; i+=1) { // bound is a.length; b is written at the same indices
+      a[i] = -123.f;
+      b[i] = -103.f;
+    }
+  }
+  static void test_2vi(float[] a, float[] b, float c, float d) { // Kernel: fill a with c and b with d inside one loop.
+    for (int i = 0; i < a.length; i+=1) { // bound is a.length; b is written at the same indices
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_neg(float[] a) { // Kernel: store -123.f into every element of a, walking backwards.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index, from the last element to index 0
+      a[i] = -123.f;
+    }
+  }
+  static void test_vi_neg(float[] a, float b) { // Kernel: store b into every element of a, walking backwards.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index, from the last element to index 0
+      a[i] = b;
+    }
+  }
+  static void test_cp_neg(float[] a, float[] b) { // Kernel: copy b[i] into a[i] for every index of a, walking backwards.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index; b is read at the same indices as a is written
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci_neg(float[] a, float[] b) { // Kernel: fill two arrays with different constants, walking backwards.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index derived from a.length; b uses the same indices
+      a[i] = -123.f;
+      b[i] = -103.f;
+    }
+  }
+  static void test_2vi_neg(float[] a, float[] b, float c, float d) { // Kernel: fill a with c and b with d, walking backwards.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index derived from a.length; b uses the same indices
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_oppos(float[] a) { // Kernel: constant fill where the store index runs opposite to the loop counter.
+    int limit = a.length-1; // index of the last element
+    for (int i = 0; i < a.length; i+=1) { // counter ascends ...
+      a[limit-i] = -123.f; // ... while the written index descends from limit to 0
+    }
+  }
+  static void test_vi_oppos(float[] a, float b) { // Kernel: fill with b where the store index runs opposite to the loop counter.
+    int limit = a.length-1; // index of the last element
+    for (int i = limit; i >= 0; i-=1) { // counter descends ...
+      a[limit-i] = b; // ... while the written index ascends from 0 to limit
+    }
+  }
+  static void test_cp_oppos(float[] a, float[] b) { // Kernel: copy where the load and the store walk in opposite directions.
+    int limit = a.length-1; // computed from a.length and used to index b
+    for (int i = 0; i < a.length; i+=1) { // store index ascends
+      a[i] = b[limit-i]; // load index descends: a[0] receives b[limit], a[limit] receives b[0]
+    }
+  }
+  static void test_2ci_oppos(float[] a, float[] b) { // Kernel: two constant fills in one loop, walking in opposite directions.
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) { // counter ascends
+      a[limit-i] = -123.f; // a is written from its last element towards index 0
+      b[i] = -103.f; // b is written from index 0 upwards
+    }
+  }
+  static void test_2vi_oppos(float[] a, float[] b, float c, float d) { // Kernel: two value fills in one loop, walking in opposite directions.
+    int limit = a.length-1; // index of the last element of a
+    for (int i = limit; i >= 0; i-=1) { // counter descends
+      a[i] = c; // a is written from its last element towards index 0
+      b[limit-i] = d; // b is written from index 0 upwards
+    }
+  }
+  static void test_ci_off(float[] a) { // Kernel: constant fill with the store index shifted by the class constant OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound shrinks by OFFSET so i+OFFSET stays below a.length
+      a[i+OFFSET] = -123.f; // writes indices OFFSET..a.length-1; lower indices are not touched
+    }
+  }
+  static void test_vi_off(float[] a, float b) { // Kernel: fill with b, store index shifted by the class constant OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound shrinks by OFFSET so i+OFFSET stays below a.length
+      a[i+OFFSET] = b; // writes indices OFFSET..a.length-1; lower indices are not touched
+    }
+  }
+  static void test_cp_off(float[] a, float[] b) { // Kernel: copy b into a with both indices shifted by the class constant OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound shrinks by OFFSET so i+OFFSET stays below a.length
+      a[i+OFFSET] = b[i+OFFSET]; // same shifted index on both sides; indices below OFFSET are not touched
+    }
+  }
+  static void test_2ci_off(float[] a, float[] b) { // Kernel: two constant fills in one loop, both shifted by the class constant OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound derived from a.length; b is written at the same shifted indices
+      a[i+OFFSET] = -123.f;
+      b[i+OFFSET] = -103.f;
+    }
+  }
+  static void test_2vi_off(float[] a, float[] b, float c, float d) { // Kernel: fill a with c and b with d, both shifted by the class constant OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound derived from a.length; b is written at the same shifted indices
+      a[i+OFFSET] = c;
+      b[i+OFFSET] = d;
+    }
+  }
+  static void test_ci_inv(float[] a, int k) { // Kernel: constant fill with the store index shifted by the argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // k is a method argument here, not a class constant, and is not modified in the loop
+      a[i+k] = -123.f; // writes indices k..a.length-1
+    }
+  }
+  static void test_vi_inv(float[] a, float b, int k) { // Kernel: fill with b, store index shifted by the argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // k is a method argument here, not a class constant, and is not modified in the loop
+      a[i+k] = b; // writes indices k..a.length-1
+    }
+  }
+  static void test_cp_inv(float[] a, float[] b, int k) { // Kernel: copy b into a with both indices shifted by the argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // k is a method argument here, not a class constant, and is not modified in the loop
+      a[i+k] = b[i+k]; // same shifted index on both sides
+    }
+  }
+  static void test_2ci_inv(float[] a, float[] b, int k) { // Kernel: two constant fills in one loop, both shifted by the argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // bound derived from a.length; k is not modified in the loop
+      a[i+k] = -123.f;
+      b[i+k] = -103.f;
+    }
+  }
+  static void test_2vi_inv(float[] a, float[] b, float c, float d, int k) { // Kernel: fill a with c and b with d, both shifted by the argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // bound derived from a.length; k is not modified in the loop
+      a[i+k] = c;
+      b[i+k] = d;
+    }
+  }
+  static void test_ci_scl(float[] a) { // Kernel: constant fill where the store index is the counter multiplied by SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // loop runs while the scaled index is inside the array
+      a[i*SCALE] = -123.f; // only indices that are multiples of SCALE are written
+    }
+  }
+  static void test_vi_scl(float[] a, float b) { // Kernel: fill with b where the store index is the counter multiplied by SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // loop runs while the scaled index is inside the array
+      a[i*SCALE] = b; // only indices that are multiples of SCALE are written
+    }
+  }
+  static void test_cp_scl(float[] a, float[] b) { // Kernel: copy b into a at indices equal to the counter multiplied by SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // loop runs while the scaled index is inside a
+      a[i*SCALE] = b[i*SCALE]; // same scaled index on both sides; other elements are not touched
+    }
+  }
+  static void test_2ci_scl(float[] a, float[] b) { // Kernel: two constant fills in one loop at indices scaled by SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // bound derived from a.length; b uses the same scaled indices
+      a[i*SCALE] = -123.f;
+      b[i*SCALE] = -103.f;
+    }
+  }
+  static void test_2vi_scl(float[] a, float[] b, float c, float d) { // Kernel: fill a with c and b with d at indices scaled by SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // bound derived from a.length; b uses the same scaled indices
+      a[i*SCALE] = c;
+      b[i*SCALE] = d;
+    }
+  }
+  static void test_cp_alndst(float[] a, float[] b) { // Kernel: copy b into a with the destination index shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // NOTE(review): ALIGN_OFF is presumably chosen to keep the two accesses relatively aligned - confirm
+      a[i+ALIGN_OFF] = b[i]; // if a and b are the same array, later iterations read elements stored by earlier ones
+    }
+  }
+  static void test_cp_alnsrc(float[] a, float[] b) { // Kernel: copy b into a with the source index shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // the last ALIGN_OFF elements of a are not written
+      a[i] = b[i+ALIGN_OFF]; // the load runs ALIGN_OFF elements ahead of the store
+    }
+  }
+  static void test_2ci_aln(float[] a, float[] b) { // Kernel: two constant fills in one loop whose store indices differ by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound derived from a.length
+      a[i+ALIGN_OFF] = -123.f; // a: indices ALIGN_OFF..a.length-1
+      b[i] = -103.f; // b: indices 0..a.length-ALIGN_OFF-1
+    }
+  }
+  static void test_2vi_aln(float[] a, float[] b, float c, float d) { // Kernel: fill a with c and b with d, store indices differing by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound derived from a.length
+      a[i] = c; // a: indices 0..a.length-ALIGN_OFF-1
+      b[i+ALIGN_OFF] = d; // b: indices ALIGN_OFF..a.length-1
+    }
+  }
+  static void test_cp_unalndst(float[] a, float[] b) { // Kernel: copy b into a with the destination index shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // NOTE(review): UNALIGN_OFF is presumably chosen so the two accesses are not relatively aligned - confirm
+      a[i+UNALIGN_OFF] = b[i]; // if a and b are the same array, later iterations read elements stored by earlier ones
+    }
+  }
+  static void test_cp_unalnsrc(float[] a, float[] b) { // Kernel: copy b into a with the source index shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // the last UNALIGN_OFF elements of a are not written
+      a[i] = b[i+UNALIGN_OFF]; // the load runs UNALIGN_OFF elements ahead of the store
+    }
+  }
+  static void test_2ci_unaln(float[] a, float[] b) { // Kernel: two constant fills in one loop whose store indices differ by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound derived from a.length
+      a[i+UNALIGN_OFF] = -123.f; // a: indices UNALIGN_OFF..a.length-1
+      b[i] = -103.f; // b: indices 0..a.length-UNALIGN_OFF-1
+    }
+  }
+  static void test_2vi_unaln(float[] a, float[] b, float c, float d) { // Kernel: fill a with c and b with d, store indices differing by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound derived from a.length
+      a[i] = c; // a: indices 0..a.length-UNALIGN_OFF-1
+      b[i+UNALIGN_OFF] = d; // b: indices UNALIGN_OFF..a.length-1
+    }
+  }
+
+  static int verify(String text, int i, float elem, float val) { // Returns 0 when elem == val, otherwise logs the mismatch and returns 1.
+    if (elem == val) {
+      return 0;
+    }
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val); // text names the kernel and array, i the element index
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:28 2012
+++ new/test/compiler/7119644/TestIntDoubleVect.java	Sat Jun  2 20:04:28 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestIntDoubleVect
+ */
+
+public class TestIntDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  public static void main(String args[]) { // Entry point: runs test() and reports the outcome.
+    System.out.println("Testing Integer + Double vectors");
+    final int failures = test();
+    if (0 < failures) { // at least one verification mismatch was counted
+      System.err.println("FAILED: " + failures + " errors");
+      System.exit(97); // terminates the VM with status 97, so "PASSED" below is never printed on failure
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    int[] a1 = new int[ARRLEN];
+    int[] a2 = new int[ARRLEN];
+    double[] b1 = new double[ARRLEN];
+    double[] b2 = new double[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+      test_vi(a2, b2, (int)123, 103.);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (int)123, 103.);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (int)123, 103.);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (int)123, 103.);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (int)123, 103.);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.;
+      b2[i] = -1.;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci: b1", i, b1[i], -103.);
+      }
+      test_vi(a2, b2, (int)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (int)123);
+        errn += verify("test_vi: b2", i, b2[i], 103.);
+      }
+      test_cp(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (int)123);
+        errn += verify("test_cp: b1", i, b1[i], 103.);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.);
+      }
+      test_vi_neg(a2, b2, (int)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (int)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.);
+      }
+      test_vi_oppos(a2, b2, (int)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (int)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (int)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_ci_aln(a1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_vi_aln(a1, b1, (int)123, 103.);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (int)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (int)123, 103.);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (int)i;
+        b1[i] = (double)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_alndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (int)i;
+        b1[i] = (double)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (int)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (int)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (int)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (int)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (int)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(int[] a, double[] b) { // kernel: fill both arrays with literal constants
+    for (int i = 0; i < a.length; i+=1) { // ascending index; bound taken from a only, so b must be at least as long
+      a[i] = -123; // int store
+      b[i] = -103.; // double store in the same iteration
+    }
+  }
+  static void test_vi(int[] a, double[] b, int c, double d) { // kernel: fill both arrays with caller-supplied values
+    for (int i = 0; i < a.length; i+=1) { // ascending index; bound taken from a only
+      a[i] = c; // same int value for every element
+      b[i] = d; // same double value for every element
+    }
+  }
+  static void test_cp(int[] a, int[] b, double[] c, double[] d) { // kernel: element-wise copy, b into a and d into c
+    for (int i = 0; i < a.length; i+=1) { // ascending index; b, c and d are also indexed up to a.length-1
+      a[i] = b[i]; // int copy
+      c[i] = d[i]; // double copy in the same iteration
+    }
+  }
+  static void test_ci_neg(int[] a, double[] b) { // kernel: same stores as test_ci, but walking the index downwards
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index, from the last element of a to element 0
+      a[i] = -123; // int store
+      b[i] = -103.; // double store in the same iteration
+    }
+  }
+  static void test_vi_neg(int[] a, double[] b, int c, double d) { // kernel: same stores as test_vi, but walking the index downwards
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index, from the last element of a to element 0
+      a[i] = c; // same int value for every element
+      b[i] = d; // same double value for every element
+    }
+  }
+  static void test_cp_neg(int[] a, int[] b, double[] c, double[] d) { // kernel: same copies as test_cp, but walking the index downwards
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index, from the last element of a to element 0
+      a[i] = b[i]; // int copy
+      c[i] = d[i]; // double copy in the same iteration
+    }
+  }
+  static void test_ci_oppos(int[] a, double[] b) { // kernel: constant fill where the two arrays are traversed in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) { // i ascends
+      a[limit-i] = -123; // a is written from its last element towards element 0
+      b[i] = -103.; // b is written from element 0 upwards
+    }
+  }
+  static void test_vi_oppos(int[] a, double[] b, int c, double d) { // kernel: value fill where the two arrays are traversed in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) { // i descends
+      a[i] = c; // a is written from its last element towards element 0
+      b[limit-i] = d; // b is written from element 0 upwards
+    }
+  }
+  static void test_cp_oppos(int[] a, int[] b, double[] c, double[] d) { // kernel: copies whose source and destination are traversed in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) { // i ascends
+      a[i] = b[limit-i]; // a filled upwards from b read downwards (a becomes b reversed when the arrays are distinct)
+      c[limit-i] = d[i]; // c filled downwards from d read upwards
+    }
+  }
+  static void test_ci_aln(int[] a, double[] b) { // kernel: constant fill with the int stores shifted by ALIGN_OFF relative to the double stores
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so that i+ALIGN_OFF stays inside a
+      a[i+ALIGN_OFF] = -123; // the first ALIGN_OFF elements of a are never written
+      b[i] = -103.; // elements of b from index a.length-ALIGN_OFF onwards are never written
+    }
+  }
+  static void test_vi_aln(int[] a, double[] b, int c, double d) { // kernel: value fill with the double stores shifted by ALIGN_OFF relative to the int stores
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count is a.length-ALIGN_OFF; b is indexed up to a.length-1
+      a[i] = c; // the last ALIGN_OFF elements of a are never written
+      b[i+ALIGN_OFF] = d; // the first ALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_alndst(int[] a, int[] b, double[] c, double[] d) { // kernel: copy with the destination index shifted up by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // ascending; trip count shortened so that i+ALIGN_OFF stays inside a
+      a[i+ALIGN_OFF] = b[i]; // the first ALIGN_OFF elements of a are never written
+      c[i+ALIGN_OFF] = d[i]; // if c and d are the same array, later iterations read values stored by earlier ones
+    }
+  }
+  static void test_cp_alnsrc(int[] a, int[] b, double[] c, double[] d) { // kernel: copy with the source index shifted up by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // ascending; b and d are read up to index a.length-1
+      a[i] = b[i+ALIGN_OFF]; // the last ALIGN_OFF elements of a are never written
+      c[i] = d[i+ALIGN_OFF]; // each source element is read before the loop reaches it as a destination
+    }
+  }
+  static void test_ci_unaln(int[] a, double[] b) { // kernel: constant fill with the int stores shifted by UNALIGN_OFF relative to the double stores
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so that i+UNALIGN_OFF stays inside a
+      a[i+UNALIGN_OFF] = -123; // the first UNALIGN_OFF elements of a are never written
+      b[i] = -103.; // elements of b from index a.length-UNALIGN_OFF onwards are never written
+    }
+  }
+  static void test_vi_unaln(int[] a, double[] b, int c, double d) { // kernel: value fill with the double stores shifted by UNALIGN_OFF relative to the int stores
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count is a.length-UNALIGN_OFF; b is indexed up to a.length-1
+      a[i] = c; // the last UNALIGN_OFF elements of a are never written
+      b[i+UNALIGN_OFF] = d; // the first UNALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_unalndst(int[] a, int[] b, double[] c, double[] d) { // kernel: copy with the destination index shifted up by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // ascending; trip count shortened so that i+UNALIGN_OFF stays inside a
+      a[i+UNALIGN_OFF] = b[i]; // with a == b (as the overlap check calls it) the first UNALIGN_OFF values repeat periodically
+      c[i+UNALIGN_OFF] = d[i]; // same pattern for the double arrays
+    }
+  }
+  static void test_cp_unalnsrc(int[] a, int[] b, double[] c, double[] d) { // kernel: copy with the source index shifted up by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // ascending; b and d are read up to index a.length-1
+      a[i] = b[i+UNALIGN_OFF]; // the last UNALIGN_OFF elements of a are never written
+      c[i] = d[i+UNALIGN_OFF]; // each source element is read before the loop reaches it as a destination
+    }
+  }
+
+  static int verify(String text, int i, int elem, int val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+  static int verify(String text, int i, double elem, double val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:28 2012
+++ new/test/compiler/7119644/TestIntFloatVect.java	Sat Jun  2 20:04:28 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestIntFloatVect
+ */
+
+public class TestIntFloatVect { // test driver: runs paired int[]/float[] loop kernels and checks the resulting array contents
+  private static final int ARRLEN = 997; // length of every array allocated in test()
+  private static final int ITERS  = 11000; // repetitions per kernel in the warm-up and timing loops
+  private static final int OFFSET = 3; // NOTE(review): not referenced anywhere in this class
+  private static final int SCALE = 2; // NOTE(review): not referenced anywhere in this class
+  private static final int ALIGN_OFF = 8; // index shift used by the *_aln, *_alndst and *_alnsrc kernels
+  private static final int UNALIGN_OFF = 5; // index shift used by the *_unaln, *_unalndst and *_unalnsrc kernels
+
+  public static void main(String args[]) { // entry point: runs test() and maps a non-zero error count to exit status 97
+    System.out.println("Testing Integer + Float vectors");
+    int errn = test(); // number of mismatches reported by verification
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() { // warm-up, verification, then timing; returns the number of verification mismatches
+    int[] a1 = new int[ARRLEN];
+    int[] a2 = new int[ARRLEN];
+    float[] b1 = new float[ARRLEN];
+    float[] b2 = new float[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) { // every kernel is called ITERS times before results are checked (presumably to get them JIT-compiled)
+      test_ci(a1, b1);
+      test_vi(a2, b2, (int)123, 103.f);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (int)123, 103.f);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (int)123, 103.f);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (int)123, 103.f);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (int)123, 103.f);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) { // overwrite whatever the warm-up left behind with a known marker value
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.f;
+      b2[i] = -1.f;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0; // running count of mismatching elements
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci: b1", i, b1[i], -103.f);
+      }
+      test_vi(a2, b2, (int)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (int)123);
+        errn += verify("test_vi: b2", i, b2[i], 103.f);
+      }
+      test_cp(a1, a2, b1, b2); // copies the values test_vi just stored in a2/b2 into a1/b1
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (int)123);
+        errn += verify("test_cp: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.f;
+        b2[i] = -1.f;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.f);
+      }
+      test_vi_neg(a2, b2, (int)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (int)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.f);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.f;
+        b2[i] = -1.f;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.f);
+      }
+      test_vi_oppos(a2, b2, (int)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (int)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.f);
+      }
+      test_cp_oppos(a1, a2, b1, b2); // sources are uniform (123 / 103.f), so the reversal does not change the expected values
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.f;
+        b2[i] = 123.f;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) { // head of the destination is below the shifted index range and must keep the marker
+        errn += verify("test_cp_alndst: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.f;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (int)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) { // tail is not written by test_cp_alnsrc; it still holds the 123 left by test_cp_alndst
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.f);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_ci_aln(a1, b1); // writes a1[ALIGN_OFF..] and b1[..ARRLEN-ALIGN_OFF-1]; the rest must keep the marker
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.f);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_vi_aln(a1, b1, (int)123, 103.f); // writes a1[..ARRLEN-ALIGN_OFF-1] and b1[ALIGN_OFF..]; the rest must keep the marker
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.f;
+        b2[i] = 123.f;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) { // head of the destination is below the shifted index range and must keep the marker
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.f;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (int)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) { // tail is not written by test_cp_unalnsrc; it still holds the 123 left by test_cp_unalndst
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1; // NOTE(review): int literal widened to float; same value as the -1.f used by the other resets
+      }
+      test_ci_unaln(a1, b1); // writes a1[UNALIGN_OFF..] and b1[..ARRLEN-UNALIGN_OFF-1]; the rest must keep the marker
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1; // NOTE(review): int literal widened to float; same value as the -1.f used by the other resets
+      }
+      test_vi_unaln(a1, b1, (int)123, 103.f); // writes a1[..ARRLEN-UNALIGN_OFF-1] and b1[UNALIGN_OFF..]; the rest must keep the marker
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) { // seed the first ALIGN_OFF elements with their own index
+        a1[i] = (int)i;
+        b1[i] = (float)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_cp_alndst(a1, a1, b1, b1); // source and destination are the same array
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF; // an in-order in-place copy repeats the seeded values with period ALIGN_OFF
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (float)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) { // put the marker into the second group of ALIGN_OFF elements
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.f;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1); // source and destination are the same array
+      for (int i=0; i<ALIGN_OFF; i++) { // the marker group has moved down into the first ALIGN_OFF elements
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF; // shifting a period-ALIGN_OFF pattern by ALIGN_OFF leaves it unchanged
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (float)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) { // seed the first UNALIGN_OFF elements with their own index
+        a1[i] = (int)i;
+        b1[i] = (float)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_cp_unalndst(a1, a1, b1, b1); // source and destination are the same array
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF; // an in-order in-place copy repeats the seeded values with period UNALIGN_OFF
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (float)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) { // put the marker into the second group of UNALIGN_OFF elements
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.f;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1); // source and destination are the same array
+      for (int i=0; i<UNALIGN_OFF; i++) { // the marker group has moved down into the first UNALIGN_OFF elements
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF; // shifting a period-UNALIGN_OFF pattern by UNALIGN_OFF leaves it unchanged
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (float)v);
+      }
+
+    }
+
+    if (errn > 0) // skip the timing runs when verification already failed
+      return errn;
+
+    System.out.println("Time"); // from here on each kernel is run ITERS times and the elapsed milliseconds are printed
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (int)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (int)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (int)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (int)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (int)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn; // always 0 here: a non-zero count returned before the timing section
+  }
+
+  static void test_ci(int[] a, float[] b) { // kernel: fill both arrays with literal constants
+    for (int i = 0; i < a.length; i+=1) { // ascending index; bound taken from a only, so b must be at least as long
+      a[i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi(int[] a, float[] b, int c, float d) { // kernel: fill both arrays with caller-supplied values
+    for (int i = 0; i < a.length; i+=1) { // ascending index
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(int[] a, int[] b, float[] c, float[] d) { // kernel: element-wise copy, b into a and d into c
+    for (int i = 0; i < a.length; i+=1) { // ascending index
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(int[] a, float[] b) { // kernel: test_ci with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) { // from the last element of a down to element 0
+      a[i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_neg(int[] a, float[] b, int c, float d) { // kernel: test_vi with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) { // from the last element of a down to element 0
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(int[] a, int[] b, float[] c, float[] d) { // kernel: test_cp with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) { // from the last element of a down to element 0
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(int[] a, float[] b) { // kernel: constant fill, the two arrays traversed in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123; // a is written from its last element towards element 0
+      b[i] = -103.f; // b is written from element 0 upwards
+    }
+  }
+  static void test_vi_oppos(int[] a, float[] b, int c, float d) { // kernel: value fill, the two arrays traversed in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c; // a is written from its last element towards element 0
+      b[limit-i] = d; // b is written from element 0 upwards
+    }
+  }
+  static void test_cp_oppos(int[] a, int[] b, float[] c, float[] d) { // kernel: copies whose source and destination run in opposite directions
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i]; // a filled upwards from b read downwards
+      c[limit-i] = d[i]; // c filled downwards from d read upwards
+    }
+  }
+  static void test_ci_aln(int[] a, float[] b) { // kernel: constant fill, int stores shifted by ALIGN_OFF relative to the float stores
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened so that i+ALIGN_OFF stays inside a
+      a[i+ALIGN_OFF] = -123; // the first ALIGN_OFF elements of a are never written
+      b[i] = -103.f; // elements of b from index a.length-ALIGN_OFF onwards are never written
+    }
+  }
+  static void test_vi_aln(int[] a, float[] b, int c, float d) { // kernel: value fill, float stores shifted by ALIGN_OFF relative to the int stores
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // b is indexed up to a.length-1
+      a[i] = c; // the last ALIGN_OFF elements of a are never written
+      b[i+ALIGN_OFF] = d; // the first ALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_alndst(int[] a, int[] b, float[] c, float[] d) { // kernel: copy with the destination index shifted up by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // ascending; trip count shortened so that i+ALIGN_OFF stays inside a
+      a[i+ALIGN_OFF] = b[i]; // when a == b, later iterations read values stored by earlier ones
+      c[i+ALIGN_OFF] = d[i]; // same for c == d
+    }
+  }
+  static void test_cp_alnsrc(int[] a, int[] b, float[] c, float[] d) { // kernel: copy with the source index shifted up by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // ascending; b and d are read up to index a.length-1
+      a[i] = b[i+ALIGN_OFF]; // the last ALIGN_OFF elements of a are never written
+      c[i] = d[i+ALIGN_OFF]; // the last ALIGN_OFF elements of c (counted from a.length) are never written
+    }
+  }
+  static void test_ci_unaln(int[] a, float[] b) { // kernel: constant fill, int stores shifted by UNALIGN_OFF relative to the float stores
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened so that i+UNALIGN_OFF stays inside a
+      a[i+UNALIGN_OFF] = -123; // the first UNALIGN_OFF elements of a are never written
+      b[i] = -103.f; // elements of b from index a.length-UNALIGN_OFF onwards are never written
+    }
+  }
+  static void test_vi_unaln(int[] a, float[] b, int c, float d) { // kernel: value fill, float stores shifted by UNALIGN_OFF relative to the int stores
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // b is indexed up to a.length-1
+      a[i] = c; // the last UNALIGN_OFF elements of a are never written
+      b[i+UNALIGN_OFF] = d; // the first UNALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_unalndst(int[] a, int[] b, float[] c, float[] d) { // kernel: copy with the destination index shifted up by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // ascending; trip count shortened so that i+UNALIGN_OFF stays inside a
+      a[i+UNALIGN_OFF] = b[i]; // when a == b, later iterations read values stored by earlier ones
+      c[i+UNALIGN_OFF] = d[i]; // same for c == d
+    }
+  }
+  static void test_cp_unalnsrc(int[] a, int[] b, float[] c, float[] d) { // kernel: copy with the source index shifted up by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // ascending; b and d are read up to index a.length-1
+      a[i] = b[i+UNALIGN_OFF]; // the last UNALIGN_OFF elements of a are never written
+      c[i] = d[i+UNALIGN_OFF]; // the last UNALIGN_OFF elements of c (counted from a.length) are never written
+    }
+  }
+
+  static int verify(String text, int i, int elem, int val) { // int check: returns 1 and logs to stderr on mismatch, 0 otherwise
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val); // text names the test and array, i is the element index
+      return 1;
+    }
+    return 0;
+  }
+  static int verify(String text, int i, float elem, float val) { // float check: returns 1 and logs to stderr on mismatch, 0 otherwise
+    if (elem != val) { // exact comparison; a NaN on either side always counts as a mismatch
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:29 2012
+++ new/test/compiler/7119644/TestIntLongVect.java	Sat Jun  2 20:04:29 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestIntLongVect
+ */
+
+public class TestIntLongVect {
+  private static final int ARRLEN = 997;       // length of every array allocated in test()
+  private static final int ITERS  = 11000;     // repetitions of each kernel in the warmup and timing loops
+  private static final int OFFSET = 3;         // not referenced anywhere in this class
+  private static final int SCALE = 2;          // not referenced anywhere in this class
+  private static final int ALIGN_OFF = 8;      // index shift applied by the *_aln* kernels
+  private static final int UNALIGN_OFF = 5;    // index shift applied by the *_unaln* kernels
+
+  public static void main(String args[]) {
+    // Runs the whole test; prints PASSED when no mismatch was counted,
+    // otherwise reports the count on stderr and exits with status 97.
+    System.out.println("Testing Integer + Long vectors");
+    final int mismatches = test();
+    if (mismatches <= 0) { System.out.println("PASSED"); return; }
+    System.err.println("FAILED: " + mismatches + " errors");
+    System.exit(97);
+  }
+
+  static int test() { // warmup, verification, timing; returns the verification mismatch count (timing is skipped if non-zero)
+    int[] a1 = new int[ARRLEN];
+    int[] a2 = new int[ARRLEN];
+    long[] b1 = new long[ARRLEN];
+    long[] b2 = new long[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) { // run every kernel ITERS times; results are not checked here (presumably to get them JIT-compiled)
+      test_ci(a1, b1);
+      test_vi(a2, b2, (int)123, (long)103);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (int)123, (long)103);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (int)123, (long)103);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (int)123, (long)103);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (int)123, (long)103);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize: fill all arrays with -1 so that elements a kernel leaves untouched stay recognizable
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1;
+      b2[i] = -1;
+    }
+    // Test and verify results; every mismatch reported by verify() adds 1 to errn
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci: b1", i, b1[i], (long)-103);
+      }
+      test_vi(a2, b2, (int)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (int)123);
+        errn += verify("test_vi: b2", i, b2[i], (long)103);
+      }
+      test_cp(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (int)123);
+        errn += verify("test_cp: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], (long)-103);
+      }
+      test_vi_neg(a2, b2, (int)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (int)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], (long)103);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (int)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], (long)-103);
+      }
+      test_vi_oppos(a2, b2, (int)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (int)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], (long)103);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for 2 arrays with relative aligned offset (a2/b2 hold 123 and serve as copy source)
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (int)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (long)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) { // the tail still holds 123 from test_cp_alndst above
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (long)123);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_aln(a1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (long)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (long)-1);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_aln(a1, b1, (int)123, (long)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset (a2/b2 hold 123 and serve as copy source)
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (int)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (long)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) { // the tail still holds 123 from test_cp_unalndst above
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (int)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (long)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (int)123, (long)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for aligned overlap: source and destination are the same array; seed the first ALIGN_OFF elements with their index
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (int)i;
+        b1[i] = (long)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_alndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF; // the ascending self-copy repeats the seed values with period ALIGN_OFF
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (long)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) { // mark the second period with -1; the self-copy below moves it to the front
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF; // shifting by a whole period leaves the periodic pattern unchanged
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (long)v);
+      }
+
+      // Reset for unaligned overlap: same scheme as above with UNALIGN_OFF as seed length and shift
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (int)i;
+        b1[i] = (long)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF; // the ascending self-copy repeats the seed values with period UNALIGN_OFF
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (long)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) { // mark the second period with -1; the self-copy below moves it to the front
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (int)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF; // shifting by a whole period leaves the periodic pattern unchanged
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (int)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (long)v);
+      }
+
+    }
+
+    if (errn > 0) // skip the timing runs when verification found mismatches
+      return errn;
+
+    System.out.println("Time"); // below: ITERS calls per kernel, elapsed wall-clock milliseconds are only printed
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (int)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (int)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (int)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (int)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (int)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(int[] a, long[] b) { // constant fill, ascending index; a.length bounds the loop for both arrays
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi(int[] a, long[] b, int c, long d) { // fills a with c and b with d, ascending index; a.length bounds the loop
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(int[] a, int[] b, long[] c, long[] d) { // element-wise copy b->a and d->c, ascending index; a.length bounds the loop
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(int[] a, long[] b) { // constant fill with the index running from a.length-1 down to 0
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi_neg(int[] a, long[] b, int c, long d) { // fills a with c and b with d, index running from a.length-1 down to 0
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(int[] a, int[] b, long[] c, long[] d) { // element-wise copy b->a and d->c, index running from a.length-1 down to 0
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(int[] a, long[] b) { // constant fill with the two arrays traversed in opposite directions
+    int limit = a.length-1;   // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;      // a is written back to front
+      b[i] = -103;            // b is written front to back
+    }
+  }
+  static void test_vi_oppos(int[] a, long[] b, int c, long d) { // variable fill with the two arrays traversed in opposite directions
+    int limit = a.length-1;   // last valid index of a
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c;               // a is written back to front
+      b[limit-i] = d;         // b is written front to back
+    }
+  }
+  static void test_cp_oppos(int[] a, int[] b, long[] c, long[] d) { // copies with source and destination traversed in opposite directions
+    int limit = a.length-1;   // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];      // a receives the elements of b in reversed order
+      c[limit-i] = d[i];      // c receives the elements of d in reversed order
+    }
+  }
+  static void test_ci_aln(int[] a, long[] b) { // constant fill; the int stores are shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i+ALIGN_OFF] = -123;  // a[0..ALIGN_OFF-1] are never written
+      b[i] = -103;            // only b[0..a.length-ALIGN_OFF-1] are written
+    }
+  }
+  static void test_vi_aln(int[] a, long[] b, int c, long d) { // variable fill; the long stores are shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i] = c;               // only a[0..a.length-ALIGN_OFF-1] are written
+      b[i+ALIGN_OFF] = d;     // b[0..ALIGN_OFF-1] are never written
+    }
+  }
+  static void test_cp_alndst(int[] a, int[] b, long[] c, long[] d) { // copy b->a and d->c, destination index shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i+ALIGN_OFF] = b[i];  // a[0..ALIGN_OFF-1] keep their previous contents
+      c[i+ALIGN_OFF] = d[i];  // c[0..ALIGN_OFF-1] keep their previous contents
+    }
+  }
+  static void test_cp_alnsrc(int[] a, int[] b, long[] c, long[] d) { // copy b->a and d->c, source index shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i] = b[i+ALIGN_OFF];  // only a[0..a.length-ALIGN_OFF-1] are overwritten
+      c[i] = d[i+ALIGN_OFF];  // c is written over the same index range as a
+    }
+  }
+  static void test_ci_unaln(int[] a, long[] b) { // constant fill; the int stores are shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i+UNALIGN_OFF] = -123;  // a[0..UNALIGN_OFF-1] are never written
+      b[i] = -103;              // only b[0..a.length-UNALIGN_OFF-1] are written
+    }
+  }
+  static void test_vi_unaln(int[] a, long[] b, int c, long d) { // variable fill; the long stores are shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i] = c;                 // only a[0..a.length-UNALIGN_OFF-1] are written
+      b[i+UNALIGN_OFF] = d;     // b[0..UNALIGN_OFF-1] are never written
+    }
+  }
+  static void test_cp_unalndst(int[] a, int[] b, long[] c, long[] d) { // copy b->a and d->c, destination index shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i+UNALIGN_OFF] = b[i];  // a[0..UNALIGN_OFF-1] keep their previous contents
+      c[i+UNALIGN_OFF] = d[i];  // c[0..UNALIGN_OFF-1] keep their previous contents
+    }
+  }
+  static void test_cp_unalnsrc(int[] a, int[] b, long[] c, long[] d) { // copy b->a and d->c, source index shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i] = b[i+UNALIGN_OFF];  // only a[0..a.length-UNALIGN_OFF-1] are overwritten
+      c[i] = d[i+UNALIGN_OFF];  // c is written over the same index range as a
+    }
+  }
+
+  static int verify(String text, int i, int elem, int val) {
+    // Yields 0 when elem equals val; otherwise reports the mismatch for
+    // index i on stderr and yields 1 so that callers can sum up the errors.
+    if (elem == val) return 0;
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+  static int verify(String text, int i, long elem, long val) {
+    // Yields 0 when elem equals val; otherwise reports the mismatch for
+    // index i on stderr and yields 1 so that callers can sum up the errors.
+    if (elem == val) return 0;
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:29 2012
+++ new/test/compiler/7119644/TestIntVect.java	Sat Jun  2 20:04:29 2012
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestIntVect
+ */
+
+public class TestIntVect {
+  private static final int ARRLEN = 997;       // length of the arrays allocated in test()
+  private static final int ITERS  = 11000;     // repetitions of each kernel in the warmup loop of test()
+  private static final int OFFSET = 3;         // passed to the *_inv kernels; first index verified for *_off and *_inv
+  private static final int SCALE = 2;          // index stride expected when verifying the *_scl kernels
+  private static final int ALIGN_OFF = 8;      // index shift expected when verifying the *_aln* kernels
+  private static final int UNALIGN_OFF = 5;    // index shift expected when verifying the *_unaln* kernels
+
+  public static void main(String args[]) {
+    // Runs the whole test; prints PASSED when no mismatch was counted,
+    // otherwise reports the count on stderr and exits with status 97.
+    System.out.println("Testing Integer vectors");
+    final int mismatches = test();
+    if (mismatches <= 0) { System.out.println("PASSED"); return; }
+    System.err.println("FAILED: " + mismatches + " errors");
+    System.exit(97);
+  }
+
+  static int test() {
+    int[] a1 = new int[ARRLEN];
+    int[] a2 = new int[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+      test_vi(a2, (int)123);
+      test_cp(a1, a2);
+      test_2ci(a1, a2);
+      test_2vi(a1, a2, (int)123, (int)103);
+      test_ci_neg(a1);
+      test_vi_neg(a2, (int)123);
+      test_cp_neg(a1, a2);
+      test_2ci_neg(a1, a2);
+      test_2vi_neg(a1, a2, (int)123, (int)103);
+      test_ci_oppos(a1);
+      test_vi_oppos(a2, (int)123);
+      test_cp_oppos(a1, a2);
+      test_2ci_oppos(a1, a2);
+      test_2vi_oppos(a1, a2, (int)123, (int)103);
+      test_ci_off(a1);
+      test_vi_off(a2, (int)123);
+      test_cp_off(a1, a2);
+      test_2ci_off(a1, a2);
+      test_2vi_off(a1, a2, (int)123, (int)103);
+      test_ci_inv(a1, OFFSET);
+      test_vi_inv(a2, (int)123, OFFSET);
+      test_cp_inv(a1, a2, OFFSET);
+      test_2ci_inv(a1, a2, OFFSET);
+      test_2vi_inv(a1, a2, (int)123, (int)103, OFFSET);
+      test_ci_scl(a1);
+      test_vi_scl(a2, (int)123);
+      test_cp_scl(a1, a2);
+      test_2ci_scl(a1, a2);
+      test_2vi_scl(a1, a2, (int)123, (int)103);
+      test_cp_alndst(a1, a2);
+      test_cp_alnsrc(a1, a2);
+      test_2ci_aln(a1, a2);
+      test_2vi_aln(a1, a2, (int)123, (int)103);
+      test_cp_unalndst(a1, a2);
+      test_cp_unalnsrc(a1, a2);
+      test_2ci_unaln(a1, a2);
+      test_2vi_unaln(a1, a2, (int)123, (int)103);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (int)-123);
+      }
+      test_vi(a2, (int)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (int)123);
+      }
+      test_cp(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (int)123);
+      }
+      test_2ci(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci: a1", i, a1[i], (int)-123);
+        errn += verify("test_2ci: a2", i, a2[i], (int)-103);
+      }
+      test_2vi(a1, a2, (int)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi: a1", i, a1[i], (int)123);
+        errn += verify("test_2vi: a2", i, a2[i], (int)103);
+      }
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_neg(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (int)-123);
+      }
+      test_vi_neg(a2, (int)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (int)123);
+      }
+      test_cp_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (int)123);
+      }
+      test_2ci_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_neg: a1", i, a1[i], (int)-123);
+        errn += verify("test_2ci_neg: a2", i, a2[i], (int)-103);
+      }
+      test_2vi_neg(a1, a2, (int)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_neg: a1", i, a1[i], (int)123);
+        errn += verify("test_2vi_neg: a2", i, a2[i], (int)103);
+      }
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_oppos(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (int)-123);
+      }
+      test_vi_oppos(a2, (int)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (int)123);
+      }
+      test_cp_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (int)123);
+      }
+      test_2ci_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_oppos: a1", i, a1[i], (int)-123);
+        errn += verify("test_2ci_oppos: a2", i, a2[i], (int)-103);
+      }
+      test_2vi_oppos(a1, a2, (int)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_oppos: a1", i, a1[i], (int)123);
+        errn += verify("test_2vi_oppos: a2", i, a2[i], (int)103);
+      }
+      // Reset for indexing with offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_off(a1);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_off: a1", i, a1[i], (int)-123);
+      }
+      test_vi_off(a2, (int)123);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_off: a2", i, a2[i], (int)123);
+      }
+      test_cp_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_off: a1", i, a1[i], (int)123);
+      }
+      test_2ci_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_off: a1", i, a1[i], (int)-123);
+        errn += verify("test_2ci_off: a2", i, a2[i], (int)-103);
+      }
+      test_2vi_off(a1, a2, (int)123, (int)103);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], (int)123);
+        errn += verify("test_2vi_off: a2", i, a2[i], (int)103);
+      }
+      for (int i=0; i<OFFSET; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], (int)-1);
+        errn += verify("test_2vi_off: a2", i, a2[i], (int)-1);
+      }
+      // Reset for indexing with invariant offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_inv(a1, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_inv: a1", i, a1[i], (int)-123);
+      }
+      test_vi_inv(a2, (int)123, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_inv: a2", i, a2[i], (int)123);
+      }
+      test_cp_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_inv: a1", i, a1[i], (int)123);
+      }
+      test_2ci_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_inv: a1", i, a1[i], (int)-123);
+        errn += verify("test_2ci_inv: a2", i, a2[i], (int)-103);
+      }
+      test_2vi_inv(a1, a2, (int)123, (int)103, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], (int)123);
+        errn += verify("test_2vi_inv: a2", i, a2[i], (int)103);
+      }
+      for (int i=0; i<OFFSET; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], (int)-1);
+        errn += verify("test_2vi_inv: a2", i, a2[i], (int)-1);
+      }
+      // Reset for indexing with scale
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_scl(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : -123;
+        errn += verify("test_ci_scl: a1", i, a1[i], (int)val);
+      }
+      test_vi_scl(a2, (int)123);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_vi_scl: a2", i, a2[i], (int)val);
+      }
+      test_cp_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_cp_scl: a1", i, a1[i], (int)val);
+      }
+      test_2ci_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a1", i, a1[i], (int)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a1", i*SCALE, a1[i*SCALE], (int)-123);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a2", i, a2[i], (int)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a2", i*SCALE, a2[i*SCALE], (int)-103);
+        }
+      }
+      test_2vi_scl(a1, a2, (int)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a1", i, a1[i], (int)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a1", i*SCALE, a1[i*SCALE], (int)123);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a2", i, a2[i], (int)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a2", i*SCALE, a2[i*SCALE], (int)103);
+        }
+      }
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, (int)123);
+      test_cp_alndst(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (int)123);
+      }
+      test_vi(a2, (int)-123);
+      test_cp_alnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (int)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (int)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_aln(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], (int)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], (int)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_aln(a1, a2, (int)123, (int)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], (int)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, (int)123);
+      test_cp_unalndst(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (int)123);
+      }
+      test_vi(a2, (int)-123);
+      test_cp_unalnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (int)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (int)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_unaln(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], (int)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], (int)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_unaln(a1, a2, (int)123, (int)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], (int)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], (int)103);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (int)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_alndst(a1, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (int)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (int)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_aln(a1, a1);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], (int)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_aln(a1, a1, (int)123, (int)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], (int)103);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (int)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (int)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (int)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_unaln(a1, a1);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], (int)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], (int)-123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_unaln(a1, a1, (int)123, (int)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], (int)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], (int)103);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, (int)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi(a1, a2, (int)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a2, (int)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_neg(a1, a2, (int)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_neg: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a2, (int)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_oppos(a1, a2, (int)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_oppos: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_off(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_off(a2, (int)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_off(a1, a2, (int)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_off: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_inv(a1, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_inv(a2, (int)123, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_inv(a1, a2, (int)123, (int)103, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_inv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_scl(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_scl(a2, (int)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_scl(a1, a2, (int)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_scl: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_aln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_aln(a1, a2, (int)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_aln: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_unaln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_unaln(a1, a2, (int)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_unaln: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_ci(int[] a) { // Stores the constant -123 into every element of a.
+    for (int i = 0; i < a.length; i+=1) { // index ascends by 1 from 0
+      a[i] = -123;
+    }
+  }
+  static void test_vi(int[] a, int b) { // Stores the caller-supplied value b into every element of a.
+    for (int i = 0; i < a.length; i+=1) { // index ascends by 1 from 0
+      a[i] = b;
+    }
+  }
+  static void test_cp(int[] a, int[] b) { // Copies b[i] into a[i] for every index of a.
+    for (int i = 0; i < a.length; i+=1) { // bound is a.length, so b needs at least that many elements
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci(int[] a, int[] b) { // Fills a with -123 and b with -103 inside one loop.
+    for (int i = 0; i < a.length; i+=1) { // ascending; both arrays are indexed up to a.length-1
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi(int[] a, int[] b, int c, int d) { // Fills a with c and b with d inside one loop.
+    for (int i = 0; i < a.length; i+=1) { // ascending; both arrays are indexed up to a.length-1
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_neg(int[] a) { // Stores the constant -123 into every element of a, last index first.
+    for (int i = a.length-1; i >= 0; i-=1) { // index descends by 1 from a.length-1 down to 0
+      a[i] = -123;
+    }
+  }
+  static void test_vi_neg(int[] a, int b) { // Stores the value b into every element of a, last index first.
+    for (int i = a.length-1; i >= 0; i-=1) { // index descends by 1 from a.length-1 down to 0
+      a[i] = b;
+    }
+  }
+  static void test_cp_neg(int[] a, int[] b) { // Copies b[i] into a[i] for every index of a, last index first.
+    for (int i = a.length-1; i >= 0; i-=1) { // starts at a.length-1, so b needs at least a.length elements
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci_neg(int[] a, int[] b) { // Fills a with -123 and b with -103 inside one descending loop.
+    for (int i = a.length-1; i >= 0; i-=1) { // index descends by 1 from a.length-1 down to 0
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_neg(int[] a, int[] b, int c, int d) { // Fills a with c and b with d inside one descending loop.
+    for (int i = a.length-1; i >= 0; i-=1) { // index descends by 1 from a.length-1 down to 0
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_oppos(int[] a) { // Stores -123 into every element of a, writing the last element first.
+    int limit = a.length-1; // highest valid index of a
+    for (int i = 0; i < a.length; i+=1) { // the counter ascends ...
+      a[limit-i] = -123; // ... while the written index limit-i descends
+    }
+  }
+  static void test_vi_oppos(int[] a, int b) { // Stores b into every element of a, writing the first element first.
+    int limit = a.length-1; // highest valid index of a
+    for (int i = limit; i >= 0; i-=1) { // the counter descends ...
+      a[limit-i] = b; // ... while the written index limit-i ascends from 0
+    }
+  }
+  static void test_cp_oppos(int[] a, int[] b) { // Copies b into a in reversed order: a[i] receives b[a.length-1-i].
+    int limit = a.length-1; // highest valid index of a; also used as the mirror point for b
+    for (int i = 0; i < a.length; i+=1) { // the store index ascends ...
+      a[i] = b[limit-i]; // ... while the load index limit-i descends
+    }
+  }
+  static void test_2ci_oppos(int[] a, int[] b) { // Fills a with -123 and b with -103, walking the two arrays in opposite directions.
+    int limit = a.length-1; // highest valid index of a
+    for (int i = 0; i < a.length; i+=1) { // counter ascends by 1 from 0
+      a[limit-i] = -123; // a is written from its last element toward its first
+      b[i] = -103; // b is written from its first element upward
+    }
+  }
+  static void test_2vi_oppos(int[] a, int[] b, int c, int d) { // Fills a with c and b with d, walking the two arrays in opposite directions.
+    int limit = a.length-1; // highest valid index of a
+    for (int i = limit; i >= 0; i-=1) { // counter descends by 1 from limit down to 0
+      a[i] = c; // a is written from its last element toward its first
+      b[limit-i] = d; // b is written from index 0 upward
+    }
+  }
+  static void test_ci_off(int[] a) { // Stores -123 into a[OFFSET] .. a[a.length-1]; OFFSET is declared outside this method.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound shortened by OFFSET so that i+OFFSET stays in range
+      a[i+OFFSET] = -123; // elements below index OFFSET are never written
+    }
+  }
+  static void test_vi_off(int[] a, int b) { // Stores b into a[OFFSET] .. a[a.length-1]; OFFSET is declared outside this method.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound shortened by OFFSET so that i+OFFSET stays in range
+      a[i+OFFSET] = b; // elements below index OFFSET are never written
+    }
+  }
+  static void test_cp_off(int[] a, int[] b) { // Copies b[j] into a[j] for j = OFFSET .. a.length-1; OFFSET is declared outside this method.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound shortened by OFFSET so that i+OFFSET stays in range of a
+      a[i+OFFSET] = b[i+OFFSET]; // load and store use the same shifted index
+    }
+  }
+  static void test_2ci_off(int[] a, int[] b) { // Stores -123 into a and -103 into b at indices OFFSET .. a.length-1.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound shortened by OFFSET (declared outside this method)
+      a[i+OFFSET] = -123; // both stores use the same shifted index;
+      b[i+OFFSET] = -103; // elements below index OFFSET are never written
+    }
+  }
+  static void test_2vi_off(int[] a, int[] b, int c, int d) { // Stores c into a and d into b at indices OFFSET .. a.length-1.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // bound shortened by OFFSET (declared outside this method)
+      a[i+OFFSET] = c; // both stores use the same shifted index;
+      b[i+OFFSET] = d; // elements below index OFFSET are never written
+    }
+  }
+  static void test_ci_inv(int[] a, int k) { // Stores -123 into a[k] .. a[a.length-1]; the shift k is supplied by the caller.
+    for (int i = 0; i < a.length-k; i+=1) { // bound shortened by k so that i+k stays in range
+      a[i+k] = -123; // elements below index k are never written
+    }
+  }
+  static void test_vi_inv(int[] a, int b, int k) { // Stores b into a[k] .. a[a.length-1]; the shift k is supplied by the caller.
+    for (int i = 0; i < a.length-k; i+=1) { // bound shortened by k so that i+k stays in range
+      a[i+k] = b; // elements below index k are never written
+    }
+  }
+  static void test_cp_inv(int[] a, int[] b, int k) { // Copies b[j] into a[j] for j = k .. a.length-1; the shift k is supplied by the caller.
+    for (int i = 0; i < a.length-k; i+=1) { // bound shortened by k so that i+k stays in range of a
+      a[i+k] = b[i+k]; // load and store use the same shifted index
+    }
+  }
+  static void test_2ci_inv(int[] a, int[] b, int k) { // Stores -123 into a and -103 into b at indices k .. a.length-1.
+    for (int i = 0; i < a.length-k; i+=1) { // bound shortened by the caller-supplied shift k
+      a[i+k] = -123; // both stores use the same shifted index;
+      b[i+k] = -103; // elements below index k are never written
+    }
+  }
+  static void test_2vi_inv(int[] a, int[] b, int c, int d, int k) { // Stores c into a and d into b at indices k .. a.length-1.
+    for (int i = 0; i < a.length-k; i+=1) { // bound shortened by the caller-supplied shift k
+      a[i+k] = c; // both stores use the same shifted index;
+      b[i+k] = d; // elements below index k are never written
+    }
+  }
+  static void test_ci_scl(int[] a) { // Stores -123 into a[0], a[SCALE], a[2*SCALE], ...; SCALE is declared outside this method.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // continues while the scaled index is still inside a
+      a[i*SCALE] = -123; // only indices of the form i*SCALE are written
+    }
+  }
+  static void test_vi_scl(int[] a, int b) { // Stores b into a[0], a[SCALE], a[2*SCALE], ...; SCALE is declared outside this method.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // continues while the scaled index is still inside a
+      a[i*SCALE] = b; // only indices of the form i*SCALE are written
+    }
+  }
+  static void test_cp_scl(int[] a, int[] b) { // Copies b[j] into a[j] for j = 0, SCALE, 2*SCALE, ...; SCALE is declared outside this method.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // continues while the scaled index is still inside a
+      a[i*SCALE] = b[i*SCALE]; // load and store use the same scaled index
+    }
+  }
+  static void test_2ci_scl(int[] a, int[] b) { // Stores -123 into a and -103 into b at indices 0, SCALE, 2*SCALE, ...
+    for (int i = 0; i*SCALE < a.length; i+=1) { // continues while the scaled index is still inside a
+      a[i*SCALE] = -123; // both stores use the same scaled index;
+      b[i*SCALE] = -103; // SCALE is declared outside this method
+    }
+  }
+  static void test_2vi_scl(int[] a, int[] b, int c, int d) { // Stores c into a and d into b at indices 0, SCALE, 2*SCALE, ...
+    for (int i = 0; i*SCALE < a.length; i+=1) { // continues while the scaled index is still inside a
+      a[i*SCALE] = c; // both stores use the same scaled index;
+      b[i*SCALE] = d; // SCALE is declared outside this method
+    }
+  }
+  static void test_cp_alndst(int[] a, int[] b) { // Copies b[i] into a[i+ALIGN_OFF]: only the destination index is shifted.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound shortened by ALIGN_OFF (declared outside this method)
+      a[i+ALIGN_OFF] = b[i]; // a[0 .. ALIGN_OFF-1] are never written
+    }
+  }
+  static void test_cp_alnsrc(int[] a, int[] b) { // Copies b[i+ALIGN_OFF] into a[i]: only the source index is shifted.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound shortened by ALIGN_OFF (declared outside this method)
+      a[i] = b[i+ALIGN_OFF]; // the last ALIGN_OFF elements of a are never written
+    }
+  }
+  static void test_2ci_aln(int[] a, int[] b) { // Stores -123 into a and -103 into b, with the index into a shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound shortened by ALIGN_OFF (declared outside this method)
+      a[i+ALIGN_OFF] = -123; // a[0 .. ALIGN_OFF-1] are never written
+      b[i] = -103; // b is written at indices 0 .. a.length-ALIGN_OFF-1 only
+    }
+  }
+  static void test_2vi_aln(int[] a, int[] b, int c, int d) { // Stores c into a and d into b, with the index into b shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // bound shortened by ALIGN_OFF (declared outside this method)
+      a[i] = c; // a is written at indices 0 .. a.length-ALIGN_OFF-1 only
+      b[i+ALIGN_OFF] = d; // b[0 .. ALIGN_OFF-1] are never written
+    }
+  }
+  static void test_cp_unalndst(int[] a, int[] b) { // Copies b[i] into a[i+UNALIGN_OFF]: only the destination index is shifted.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound shortened by UNALIGN_OFF (declared outside this method)
+      a[i+UNALIGN_OFF] = b[i]; // a[0 .. UNALIGN_OFF-1] are never written
+    }
+  }
+  static void test_cp_unalnsrc(int[] a, int[] b) { // Copies b[i+UNALIGN_OFF] into a[i]: only the source index is shifted.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound shortened by UNALIGN_OFF (declared outside this method)
+      a[i] = b[i+UNALIGN_OFF]; // the last UNALIGN_OFF elements of a are never written
+    }
+  }
+  static void test_2ci_unaln(int[] a, int[] b) { // Stores -123 into a and -103 into b, with the index into a shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound shortened by UNALIGN_OFF (declared outside this method)
+      a[i+UNALIGN_OFF] = -123; // a[0 .. UNALIGN_OFF-1] are never written
+      b[i] = -103; // b is written at indices 0 .. a.length-UNALIGN_OFF-1 only
+    }
+  }
+  static void test_2vi_unaln(int[] a, int[] b, int c, int d) { // Stores c into a and d into b, with the index into b shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // bound shortened by UNALIGN_OFF (declared outside this method)
+      a[i] = c; // a is written at indices 0 .. a.length-UNALIGN_OFF-1 only
+      b[i+UNALIGN_OFF] = d; // b[0 .. UNALIGN_OFF-1] are never written
+    }
+  }
+
+  // Checks one element against its expected value; the result (0 or 1) is added to the caller's error count.
+  static int verify(String text, int i, int elem, int val) {
+    if (elem == val) return 0;
+    // Mismatch: print the label (text), the index (i), the actual and the expected value on stderr.
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:30 2012
+++ new/test/compiler/7119644/TestLongDoubleVect.java	Sat Jun  2 20:04:29 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestLongDoubleVect
+ */
+
+public class TestLongDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  public static void main(String args[]) { // Runs test(); exits with status 97 when it reports any error.
+    System.out.println("Testing Long + Double vectors");
+    final int failures = test();
+    if (failures > 0) {
+      System.err.println("FAILED: " + failures + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() {
+    long[] a1 = new long[ARRLEN];
+    long[] a2 = new long[ARRLEN];
+    double[] b1 = new double[ARRLEN];
+    double[] b2 = new double[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+      test_vi(a2, b2, (long)123, 103.);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (long)123, 103.);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (long)123, 103.);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (long)123, 103.);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (long)123, 103.);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.;
+      b2[i] = -1.;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (long)-123);
+        errn += verify("test_ci: b1", i, b1[i], -103.);
+      }
+      test_vi(a2, b2, (long)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (long)123);
+        errn += verify("test_vi: b2", i, b2[i], 103.);
+      }
+      test_cp(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (long)123);
+        errn += verify("test_cp: b1", i, b1[i], 103.);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (long)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.);
+      }
+      test_vi_neg(a2, b2, (long)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (long)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (long)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.);
+      }
+      test_vi_oppos(a2, b2, (long)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (long)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (long)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (long)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_ci_aln(a1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (long)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_vi_aln(a1, b1, (long)123, 103.);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (long)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (long)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (long)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (long)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (long)123, 103.);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (long)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (long)i;
+        b1[i] = (double)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_alndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (long)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (long)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (long)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (long)i;
+        b1[i] = (double)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (long)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (long)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (long)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (long)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (long)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (long)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (long)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (long)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(long[] a, double[] b) { // Constant init: stores -123 into every a[i] and -103. into b[i] for i in [0, a.length).
+    for (int i = 0; i < a.length; i+=1) { // ascending, unit stride; b is indexed with the same i, so it needs at least a.length elements
+      a[i] = -123;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi(long[] a, double[] b, long c, double d) { // Variable init: stores the caller-supplied c into every a[i] and d into b[i].
+    for (int i = 0; i < a.length; i+=1) { // ascending, unit stride, bounded by a.length
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(long[] a, long[] b, double[] c, double[] d) { // Copy: a[i] = b[i] and c[i] = d[i] for i in [0, a.length); a and c are the destinations.
+    for (int i = 0; i < a.length; i+=1) { // ascending, unit stride; all four arrays are indexed with the same i
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(long[] a, double[] b) { // Constant init with negative stride: same stores as test_ci, walking from the last index down to 0.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending, unit stride
+      a[i] = -123;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_neg(long[] a, double[] b, long c, double d) { // Variable init with negative stride: stores c / d from index a.length-1 down to 0.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending, unit stride
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(long[] a, long[] b, double[] c, double[] d) { // Copy with negative stride: a[i] = b[i], c[i] = d[i], from the last index down to 0.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending, unit stride
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(long[] a, double[] b) { // Constant init in opposite directions: a is filled back-to-front while b is filled front-to-back.
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123; // mirrored index: walks a from the end towards 0
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_oppos(long[] a, double[] b, long c, double d) { // Variable init in opposite directions: the loop descends, so a is filled back-to-front and b front-to-back.
+    int limit = a.length-1; // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c;
+      b[limit-i] = d; // mirrored index: ascends from 0 as i descends
+    }
+  }
+  static void test_cp_oppos(long[] a, long[] b, double[] c, double[] d) { // Reversing copy: a receives b in reverse order (mirrored read) and c receives d in reverse order (mirrored write).
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i]; // destination ascends, source descends
+      c[limit-i] = d[i]; // destination descends, source ascends
+    }
+  }
+  static void test_ci_aln(long[] a, double[] b) { // Constant init with an ALIGN_OFF shift on a only: a[0..ALIGN_OFF) and the last ALIGN_OFF elements of b are not written.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened by ALIGN_OFF so a[i+ALIGN_OFF] stays in bounds
+      a[i+ALIGN_OFF] = -123;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_aln(long[] a, double[] b, long c, double d) { // Variable init with an ALIGN_OFF shift on b only: the last ALIGN_OFF elements of a and b[0..ALIGN_OFF) are not written.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened by ALIGN_OFF; b[i+ALIGN_OFF] relies on b being at least as long as a
+      a[i] = c;
+      b[i+ALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_alndst(long[] a, long[] b, double[] c, double[] d) { // Copy with the destination shifted by ALIGN_OFF: a[i+ALIGN_OFF] = b[i], c[i+ALIGN_OFF] = d[i]; the first ALIGN_OFF elements of a and c are not written.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // ascending; trip count shortened by ALIGN_OFF to keep the shifted stores in bounds
+      a[i+ALIGN_OFF] = b[i];
+      c[i+ALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_alnsrc(long[] a, long[] b, double[] c, double[] d) { // Copy with the source shifted by ALIGN_OFF: a[i] = b[i+ALIGN_OFF], c[i] = d[i+ALIGN_OFF]; the last ALIGN_OFF elements of a and c are not written. Also called by test() with a==b and c==d.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // ascending; trip count shortened by ALIGN_OFF to keep the shifted loads in bounds
+      a[i] = b[i+ALIGN_OFF];
+      c[i] = d[i+ALIGN_OFF];
+    }
+  }
+  static void test_ci_unaln(long[] a, double[] b) { // Constant init with an UNALIGN_OFF shift on a only: a[0..UNALIGN_OFF) and the last UNALIGN_OFF elements of b are not written.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened by UNALIGN_OFF so a[i+UNALIGN_OFF] stays in bounds
+      a[i+UNALIGN_OFF] = -123;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_unaln(long[] a, double[] b, long c, double d) { // Variable init with an UNALIGN_OFF shift on b only: the last UNALIGN_OFF elements of a and b[0..UNALIGN_OFF) are not written.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened by UNALIGN_OFF; b[i+UNALIGN_OFF] relies on b being at least as long as a
+      a[i] = c;
+      b[i+UNALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_unalndst(long[] a, long[] b, double[] c, double[] d) { // Copy with the destination shifted by UNALIGN_OFF; the first UNALIGN_OFF elements of a and c are not written. test() also calls it with a==b and c==d and then expects element i to equal i%UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // ascending; each store lands UNALIGN_OFF elements ahead of the load in the same iteration
+      a[i+UNALIGN_OFF] = b[i];
+      c[i+UNALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_unalnsrc(long[] a, long[] b, double[] c, double[] d) { // Copy with the source shifted by UNALIGN_OFF: a[i] = b[i+UNALIGN_OFF], c[i] = d[i+UNALIGN_OFF]; the last UNALIGN_OFF elements of a and c are not written. Also called by test() with a==b and c==d.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // ascending; each load reads UNALIGN_OFF elements ahead of the store in the same iteration
+      a[i] = b[i+UNALIGN_OFF];
+      c[i] = d[i+UNALIGN_OFF];
+    }
+  }
+
+  static int verify(String text, int i, long elem, long val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+  static int verify(String text, int i, double elem, double val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:30 2012
+++ new/test/compiler/7119644/TestLongFloatVect.java	Sat Jun  2 20:04:30 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestLongFloatVect
+ */
+
+public class TestLongFloatVect { // Fills and copies paired long[] / float[] arrays through several loop shapes, then checks every element and prints timings.
+  private static final int ARRLEN = 997; // length of every test array
+  private static final int ITERS  = 11000; // repetitions used by both the warmup loop and each timing loop
+  private static final int OFFSET = 3; // not referenced anywhere in this class
+  private static final int SCALE = 2; // not referenced anywhere in this class
+  private static final int ALIGN_OFF = 8; // index shift used by the *_aln* kernels
+  private static final int UNALIGN_OFF = 5; // index shift used by the *_unaln* kernels
+
+  public static void main(String args[]) { // Entry point: runs test() and exits with status 97 if any element check failed.
+    System.out.println("Testing Long + Float vectors");
+    int errn = test();
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97);
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() { // Warmup, verification, then timing; returns the number of mismatched elements (0 means success).
+    long[] a1 = new long[ARRLEN];
+    long[] a2 = new long[ARRLEN];
+    float[] b1 = new float[ARRLEN];
+    float[] b2 = new float[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) { // NOTE(review): presumably here so every kernel is JIT-compiled before verification (-Xbatch in @run) - confirm
+      test_ci(a1, b1);
+      test_vi(a2, b2, (long)123, 103.f);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (long)123, 103.f);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (long)123, 103.f);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (long)123, 103.f);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (long)123, 103.f);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.f;
+      b2[i] = -1.f;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0; // running count of mismatches reported by verify()
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (long)-123);
+        errn += verify("test_ci: b1", i, b1[i], -103.f);
+      }
+      test_vi(a2, b2, (long)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (long)123);
+        errn += verify("test_vi: b2", i, b2[i], 103.f);
+      }
+      test_cp(a1, a2, b1, b2); // copies the 123 / 103.f values that test_vi left in a2 / b2
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (long)123);
+        errn += verify("test_cp: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.f;
+        b2[i] = -1.f;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (long)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.f);
+      }
+      test_vi_neg(a2, b2, (long)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (long)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.f);
+      }
+      test_cp_neg(a1, a2, b1, b2); // copies the values test_vi_neg left in a2 / b2
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.f;
+        b2[i] = -1.f;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (long)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.f);
+      }
+      test_vi_oppos(a2, b2, (long)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (long)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.f);
+      }
+      test_cp_oppos(a1, a2, b1, b2); // a2 / b2 are uniform here, so the reversed copy still yields 123 / 103.f everywhere
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.f;
+        b2[i] = 123.f;
+      }
+      test_cp_alndst(a1, a2, b1, b2); // destination shifted: the first ALIGN_OFF elements of a1 / b1 must stay -1
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (long)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.f;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2); // source shifted: the last ALIGN_OFF elements keep the 123 written by test_cp_alndst
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (long)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.f);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_ci_aln(a1, b1); // a1 is written from index ALIGN_OFF up, b1 from index 0 up to ARRLEN-ALIGN_OFF
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (long)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.f);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_vi_aln(a1, b1, (long)123, 103.f); // mirror of test_ci_aln: here b1 carries the shift and a1 does not
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (long)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.f;
+        b2[i] = 123.f;
+      }
+      test_cp_unalndst(a1, a2, b1, b2); // destination shifted: the first UNALIGN_OFF elements of a1 / b1 must stay -1
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (long)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.f;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2); // source shifted: the last UNALIGN_OFF elements keep the 123 written by test_cp_unalndst
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (long)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (long)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1; // int literal widened to float; same value as the -1.f used by the other resets
+      }
+      test_ci_unaln(a1, b1); // a1 is written from index UNALIGN_OFF up, b1 from index 0 up to ARRLEN-UNALIGN_OFF
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (long)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1; // int literal widened to float; same value as -1.f
+      }
+      test_vi_unaln(a1, b1, (long)123, 103.f); // mirror of test_ci_unaln: here b1 carries the shift and a1 does not
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (long)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) { // seed the first ALIGN_OFF elements with 0..ALIGN_OFF-1
+        a1[i] = (long)i;
+        b1[i] = (float)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_cp_alndst(a1, a1, b1, b1); // source and destination are the same array: the seed must repeat with period ALIGN_OFF
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (long)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (float)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) { // overwrite elements [ALIGN_OFF, 2*ALIGN_OFF) with -1 before the aliased source-shifted copy
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.f;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1); // aliased copy moves everything down by ALIGN_OFF, so the -1 run lands in [0, ALIGN_OFF)
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (long)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (long)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (float)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) { // seed the first UNALIGN_OFF elements with 0..UNALIGN_OFF-1
+        a1[i] = (long)i;
+        b1[i] = (float)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_cp_unalndst(a1, a1, b1, b1); // source and destination are the same array: the seed must repeat with period UNALIGN_OFF
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (long)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (float)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) { // overwrite elements [UNALIGN_OFF, 2*UNALIGN_OFF) with -1 before the aliased source-shifted copy
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.f;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1); // aliased copy moves everything down by UNALIGN_OFF, so the -1 run lands in [0, UNALIGN_OFF)
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (long)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (long)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (float)v);
+      }
+
+    }
+
+    if (errn > 0) // skip the timing section entirely once any check has failed
+      return errn;
+
+    System.out.println("Time"); // timing section: ITERS calls per kernel, elapsed wall-clock milliseconds printed; results are not re-verified
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (long)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (long)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (long)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (long)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (long)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn; // always 0 here: a non-zero count returned before the timing section
+  }
+
+  static void test_ci(long[] a, float[] b) { // Constant init: stores -123 into every a[i] and -103.f into b[i] for i in [0, a.length).
+    for (int i = 0; i < a.length; i+=1) { // ascending, unit stride; b is indexed with the same i
+      a[i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi(long[] a, float[] b, long c, float d) { // Variable init: stores the caller-supplied c into every a[i] and d into b[i].
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(long[] a, long[] b, float[] c, float[] d) { // Copy: a[i] = b[i] and c[i] = d[i]; a and c are the destinations.
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(long[] a, float[] b) { // Constant init with negative stride: same stores as test_ci, from the last index down to 0.
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_neg(long[] a, float[] b, long c, float d) { // Variable init with negative stride.
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(long[] a, long[] b, float[] c, float[] d) { // Copy with negative stride.
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(long[] a, float[] b) { // Constant init in opposite directions: a is filled back-to-front while b is filled front-to-back.
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_oppos(long[] a, float[] b, long c, float d) { // Variable init in opposite directions: the loop descends, so a is filled back-to-front and b front-to-back.
+    int limit = a.length-1; // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c;
+      b[limit-i] = d;
+    }
+  }
+  static void test_cp_oppos(long[] a, long[] b, float[] c, float[] d) { // Reversing copy: a receives b in reverse order (mirrored read), c receives d in reverse order (mirrored write).
+    int limit = a.length-1; // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];
+      c[limit-i] = d[i];
+    }
+  }
+  static void test_ci_aln(long[] a, float[] b) { // Constant init with an ALIGN_OFF shift on a only: a[0..ALIGN_OFF) and the last ALIGN_OFF elements of b are not written.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened by ALIGN_OFF so the shifted store stays in bounds
+      a[i+ALIGN_OFF] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_aln(long[] a, float[] b, long c, float d) { // Variable init with an ALIGN_OFF shift on b only: the last ALIGN_OFF elements of a and b[0..ALIGN_OFF) are not written.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i] = c;
+      b[i+ALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_alndst(long[] a, long[] b, float[] c, float[] d) { // Copy with the destination shifted by ALIGN_OFF; the first ALIGN_OFF elements of a and c are not written.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i+ALIGN_OFF] = b[i];
+      c[i+ALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_alnsrc(long[] a, long[] b, float[] c, float[] d) { // Copy with the source shifted by ALIGN_OFF; the last ALIGN_OFF elements of a and c are not written.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i] = b[i+ALIGN_OFF];
+      c[i] = d[i+ALIGN_OFF];
+    }
+  }
+  static void test_ci_unaln(long[] a, float[] b) { // Constant init with an UNALIGN_OFF shift on a only: a[0..UNALIGN_OFF) and the last UNALIGN_OFF elements of b are not written.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened by UNALIGN_OFF so the shifted store stays in bounds
+      a[i+UNALIGN_OFF] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_unaln(long[] a, float[] b, long c, float d) { // Variable init with an UNALIGN_OFF shift on b only: the last UNALIGN_OFF elements of a and b[0..UNALIGN_OFF) are not written.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i] = c;
+      b[i+UNALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_unalndst(long[] a, long[] b, float[] c, float[] d) { // Copy with the destination shifted by UNALIGN_OFF; the first UNALIGN_OFF elements of a and c are not written.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i+UNALIGN_OFF] = b[i];
+      c[i+UNALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_unalnsrc(long[] a, long[] b, float[] c, float[] d) { // Copy with the source shifted by UNALIGN_OFF; the last UNALIGN_OFF elements of a and c are not written.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i] = b[i+UNALIGN_OFF];
+      c[i] = d[i+UNALIGN_OFF];
+    }
+  }
+
+  static int verify(String text, int i, long elem, long val) { // Element check for long arrays: returns 0 on a match, otherwise prints "text[i] = elem != val" to stderr and returns 1.
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+  static int verify(String text, int i, float elem, float val) { // Element check for float arrays: exact != comparison, so a NaN element is always reported as a mismatch.
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:30 2012
+++ new/test/compiler/7119644/TestLongVect.java	Sat Jun  2 20:04:30 2012
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestLongVect
+ */
+
+public class TestLongVect {
+  private static final int ARRLEN = 997; // length of the two arrays allocated in test()
+  private static final int ITERS  = 11000; // repetitions of the warmup loop and of each timing loop
+  private static final int OFFSET = 3; // shift used by the *_off kernels and passed as k to the *_inv kernels
+  private static final int SCALE = 2; // index multiplier used by the *_scl kernels
+  private static final int ALIGN_OFF = 8; // element shift used by the *_aln* kernels
+  private static final int UNALIGN_OFF = 5; // element shift used by the *_unaln* kernels
+
+  public static void main(String args[]) { // entry point; args is not read
+    System.out.println("Testing Long vectors");
+    int errn = test(); // number of mismatches counted during verification
+    if (errn > 0) {
+      System.err.println("FAILED: " + errn + " errors");
+      System.exit(97); // terminate with exit status 97 on failure
+    }
+    System.out.println("PASSED");
+  }
+
+  static int test() { // warms up every kernel, verifies each result, then prints timings; returns the mismatch count
+    long[] a1 = new long[ARRLEN];
+    long[] a2 = new long[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) { // run every kernel ITERS times before any result is checked
+      test_ci(a1);
+      test_vi(a2, (long)123);
+      test_cp(a1, a2);
+      test_2ci(a1, a2);
+      test_2vi(a1, a2, (long)123, (long)103);
+      test_ci_neg(a1);
+      test_vi_neg(a2, (long)123);
+      test_cp_neg(a1, a2);
+      test_2ci_neg(a1, a2);
+      test_2vi_neg(a1, a2, (long)123, (long)103);
+      test_ci_oppos(a1);
+      test_vi_oppos(a2, (long)123);
+      test_cp_oppos(a1, a2);
+      test_2ci_oppos(a1, a2);
+      test_2vi_oppos(a1, a2, (long)123, (long)103);
+      test_ci_off(a1);
+      test_vi_off(a2, (long)123);
+      test_cp_off(a1, a2);
+      test_2ci_off(a1, a2);
+      test_2vi_off(a1, a2, (long)123, (long)103);
+      test_ci_inv(a1, OFFSET);
+      test_vi_inv(a2, (long)123, OFFSET);
+      test_cp_inv(a1, a2, OFFSET);
+      test_2ci_inv(a1, a2, OFFSET);
+      test_2vi_inv(a1, a2, (long)123, (long)103, OFFSET);
+      test_ci_scl(a1);
+      test_vi_scl(a2, (long)123);
+      test_cp_scl(a1, a2);
+      test_2ci_scl(a1, a2);
+      test_2vi_scl(a1, a2, (long)123, (long)103);
+      test_cp_alndst(a1, a2);
+      test_cp_alnsrc(a1, a2);
+      test_2ci_aln(a1, a2);
+      test_2vi_aln(a1, a2, (long)123, (long)103);
+      test_cp_unalndst(a1, a2);
+      test_cp_unalnsrc(a1, a2);
+      test_2ci_unaln(a1, a2);
+      test_2vi_unaln(a1, a2, (long)123, (long)103);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (long)-123);
+      }
+      test_vi(a2, (long)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (long)123);
+      }
+      test_cp(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (long)123);
+      }
+      test_2ci(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci: a1", i, a1[i], (long)-123);
+        errn += verify("test_2ci: a2", i, a2[i], (long)-103);
+      }
+      test_2vi(a1, a2, (long)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi: a1", i, a1[i], (long)123);
+        errn += verify("test_2vi: a2", i, a2[i], (long)103);
+      }
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_neg(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (long)-123);
+      }
+      test_vi_neg(a2, (long)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (long)123);
+      }
+      test_cp_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (long)123);
+      }
+      test_2ci_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_neg: a1", i, a1[i], (long)-123);
+        errn += verify("test_2ci_neg: a2", i, a2[i], (long)-103);
+      }
+      test_2vi_neg(a1, a2, (long)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_neg: a1", i, a1[i], (long)123);
+        errn += verify("test_2vi_neg: a2", i, a2[i], (long)103);
+      }
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_oppos(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (long)-123);
+      }
+      test_vi_oppos(a2, (long)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (long)123);
+      }
+      test_cp_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (long)123);
+      }
+      test_2ci_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_oppos: a1", i, a1[i], (long)-123);
+        errn += verify("test_2ci_oppos: a2", i, a2[i], (long)-103);
+      }
+      test_2vi_oppos(a1, a2, (long)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_oppos: a1", i, a1[i], (long)123);
+        errn += verify("test_2vi_oppos: a2", i, a2[i], (long)103);
+      }
+      // Reset for indexing with offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_off(a1);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_off: a1", i, a1[i], (long)-123);
+      }
+      test_vi_off(a2, (long)123);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_off: a2", i, a2[i], (long)123);
+      }
+      test_cp_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_off: a1", i, a1[i], (long)123);
+      }
+      test_2ci_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_off: a1", i, a1[i], (long)-123);
+        errn += verify("test_2ci_off: a2", i, a2[i], (long)-103);
+      }
+      test_2vi_off(a1, a2, (long)123, (long)103);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], (long)123);
+        errn += verify("test_2vi_off: a2", i, a2[i], (long)103);
+      }
+      for (int i=0; i<OFFSET; i++) { // the first OFFSET elements must still hold the reset value
+        errn += verify("test_2vi_off: a1", i, a1[i], (long)-1);
+        errn += verify("test_2vi_off: a2", i, a2[i], (long)-1);
+      }
+      // Reset for indexing with invariant offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_inv(a1, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_inv: a1", i, a1[i], (long)-123);
+      }
+      test_vi_inv(a2, (long)123, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_inv: a2", i, a2[i], (long)123);
+      }
+      test_cp_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_inv: a1", i, a1[i], (long)123);
+      }
+      test_2ci_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_inv: a1", i, a1[i], (long)-123);
+        errn += verify("test_2ci_inv: a2", i, a2[i], (long)-103);
+      }
+      test_2vi_inv(a1, a2, (long)123, (long)103, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], (long)123);
+        errn += verify("test_2vi_inv: a2", i, a2[i], (long)103);
+      }
+      for (int i=0; i<OFFSET; i++) { // the first OFFSET elements must still hold the reset value
+        errn += verify("test_2vi_inv: a1", i, a1[i], (long)-1);
+        errn += verify("test_2vi_inv: a2", i, a2[i], (long)-1);
+      }
+      // Reset for indexing with scale
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_scl(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : -123;
+        errn += verify("test_ci_scl: a1", i, a1[i], (long)val);
+      }
+      test_vi_scl(a2, (long)123);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_vi_scl: a2", i, a2[i], (long)val);
+      }
+      test_cp_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_cp_scl: a1", i, a1[i], (long)val);
+      }
+      test_2ci_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a1", i, a1[i], (long)-1);
+        } else { // check every index the kernel writes, not only multiples of SCALE*SCALE
+          errn += verify("test_2ci_scl: a1", i, a1[i], (long)-123);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a2", i, a2[i], (long)-1);
+        } else { // check every index the kernel writes, not only multiples of SCALE*SCALE
+          errn += verify("test_2ci_scl: a2", i, a2[i], (long)-103);
+        }
+      }
+      test_2vi_scl(a1, a2, (long)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a1", i, a1[i], (long)-1);
+        } else { // check every index the kernel writes, not only multiples of SCALE*SCALE
+          errn += verify("test_2vi_scl: a1", i, a1[i], (long)123);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a2", i, a2[i], (long)-1);
+        } else { // check every index the kernel writes, not only multiples of SCALE*SCALE
+          errn += verify("test_2vi_scl: a2", i, a2[i], (long)103);
+        }
+      }
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, (long)123);
+      test_cp_alndst(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (long)123);
+      }
+      test_vi(a2, (long)-123);
+      test_cp_alnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (long)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) { // tail keeps the 123 left by test_cp_alndst
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_aln(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], (long)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], (long)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], (long)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_aln(a1, a2, (long)123, (long)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], (long)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], (long)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, (long)123);
+      test_cp_unalndst(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (long)123);
+      }
+      test_vi(a2, (long)-123);
+      test_cp_unalnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (long)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) { // tail keeps the 123 left by test_cp_unalndst
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_unaln(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], (long)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], (long)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], (long)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_unaln(a1, a2, (long)123, (long)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], (long)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], (long)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], (long)103);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (long)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_alndst(a1, a1); // same array as source and destination: the first ALIGN_OFF values must propagate forward
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (long)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1); // same array as source and destination, shifted the other way
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (long)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_aln(a1, a1); // both stores hit one array; the later store to each element must win
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], (long)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], (long)-123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_aln(a1, a1, (long)123, (long)103); // both stores hit one array; the later store to each element must win
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], (long)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], (long)103);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (long)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1); // same array as source and destination: the first UNALIGN_OFF values must propagate forward
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (long)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1); // same array as source and destination, shifted the other way
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (long)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_unaln(a1, a1); // both stores hit one array; the later store to each element must win
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], (long)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], (long)-123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_unaln(a1, a1, (long)123, (long)103); // both stores hit one array; the later store to each element must win
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], (long)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], (long)103);
+      }
+
+    }
+
+    if (errn > 0) // skip the timing runs once verification has failed
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, (long)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi(a1, a2, (long)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a2, (long)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_neg(a1, a2, (long)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_neg: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a2, (long)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_oppos(a1, a2, (long)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_oppos: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_off(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_off(a2, (long)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_off(a1, a2, (long)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_off: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_inv(a1, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_inv(a2, (long)123, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_inv(a1, a2, (long)123, (long)103, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_inv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_scl(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_scl(a2, (long)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_scl(a1, a2, (long)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_scl: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_aln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_aln(a1, a2, (long)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_aln: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_unaln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_unaln(a1, a2, (long)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_unaln: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_ci(long[] a) { // constant fill of the whole array, index counting up
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = -123;
+    }
+  }
+  static void test_vi(long[] a, long b) { // fill of the whole array with the argument b, index counting up
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b;
+    }
+  }
+  static void test_cp(long[] a, long[] b) { // element-wise copy b -> a; b is indexed up to a.length-1
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci(long[] a, long[] b) { // two constant fills in one loop; b is indexed up to a.length-1
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi(long[] a, long[] b, long c, long d) { // two variable fills in one loop: c into a, d into b
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_neg(long[] a) { // constant fill, index counting down from a.length-1 to 0
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = -123;
+    }
+  }
+  static void test_vi_neg(long[] a, long b) { // fill with the argument b, index counting down from a.length-1 to 0
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = b;
+    }
+  }
+  static void test_cp_neg(long[] a, long[] b) { // element-wise copy b -> a, index counting down from a.length-1 to 0
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci_neg(long[] a, long[] b) { // two constant fills in one loop, index counting down
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_neg(long[] a, long[] b, long c, long d) { // two variable fills in one loop, index counting down
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_oppos(long[] a) { // constant fill: the index counts up while the store position limit-i walks down
+    int limit = a.length-1;
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;
+    }
+  }
+  static void test_vi_oppos(long[] a, long b) { // variable fill: the index counts down while the store position limit-i walks up
+    int limit = a.length-1;
+    for (int i = limit; i >= 0; i-=1) {
+      a[limit-i] = b;
+    }
+  }
+  static void test_cp_oppos(long[] a, long[] b) { // reversed copy: a[i] receives b[limit-i], with limit taken from a.length
+    int limit = a.length-1;
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];
+    }
+  }
+  static void test_2ci_oppos(long[] a, long[] b) { // constant fills in one loop: a back-to-front, b front-to-back
+    int limit = a.length-1;
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_oppos(long[] a, long[] b, long c, long d) { // variable fills, index counting down: a back-to-front, b front-to-back
+    int limit = a.length-1;
+    for (int i = limit; i >= 0; i-=1) {
+      a[i] = c;
+      b[limit-i] = d;
+    }
+  }
+  static void test_ci_off(long[] a) { // constant fill starting at index OFFSET; earlier elements are not written
+    for (int i = 0; i < a.length-OFFSET; i+=1) {
+      a[i+OFFSET] = -123;
+    }
+  }
+  static void test_vi_off(long[] a, long b) { // variable fill starting at index OFFSET; earlier elements are not written
+    for (int i = 0; i < a.length-OFFSET; i+=1) {
+      a[i+OFFSET] = b;
+    }
+  }
+  static void test_cp_off(long[] a, long[] b) { // copy b -> a with the same OFFSET shift on source and destination
+    for (int i = 0; i < a.length-OFFSET; i+=1) {
+      a[i+OFFSET] = b[i+OFFSET];
+    }
+  }
+  static void test_2ci_off(long[] a, long[] b) { // two constant fills in one loop, both starting at index OFFSET
+    for (int i = 0; i < a.length-OFFSET; i+=1) {
+      a[i+OFFSET] = -123;
+      b[i+OFFSET] = -103;
+    }
+  }
+  static void test_2vi_off(long[] a, long[] b, long c, long d) { // two variable fills in one loop, both starting at index OFFSET
+    for (int i = 0; i < a.length-OFFSET; i+=1) {
+      a[i+OFFSET] = c;
+      b[i+OFFSET] = d;
+    }
+  }
+  static void test_ci_inv(long[] a, int k) { // constant fill shifted by k, a run-time argument that stays fixed during the loop
+    for (int i = 0; i < a.length-k; i+=1) {
+      a[i+k] = -123;
+    }
+  }
+  static void test_vi_inv(long[] a, long b, int k) { // variable fill shifted by k, a run-time argument that stays fixed during the loop
+    for (int i = 0; i < a.length-k; i+=1) {
+      a[i+k] = b;
+    }
+  }
+  static void test_cp_inv(long[] a, long[] b, int k) { // copy b -> a with the same run-time shift k on source and destination
+    for (int i = 0; i < a.length-k; i+=1) {
+      a[i+k] = b[i+k];
+    }
+  }
+  static void test_2ci_inv(long[] a, long[] b, int k) { // two constant fills in one loop, both shifted by the run-time argument k
+    for (int i = 0; i < a.length-k; i+=1) {
+      a[i+k] = -123;
+      b[i+k] = -103;
+    }
+  }
+  static void test_2vi_inv(long[] a, long[] b, long c, long d, int k) { // two variable fills in one loop, both shifted by the run-time argument k
+    for (int i = 0; i < a.length-k; i+=1) {
+      a[i+k] = c;
+      b[i+k] = d;
+    }
+  }
+  static void test_ci_scl(long[] a) { // constant fill of the elements at index i*SCALE only; the rest are not written
+    for (int i = 0; i*SCALE < a.length; i+=1) {
+      a[i*SCALE] = -123;
+    }
+  }
+  static void test_vi_scl(long[] a, long b) { // variable fill of the elements at index i*SCALE only; the rest are not written
+    for (int i = 0; i*SCALE < a.length; i+=1) {
+      a[i*SCALE] = b;
+    }
+  }
+  static void test_cp_scl(long[] a, long[] b) { // copy b -> a for the elements at index i*SCALE only
+    for (int i = 0; i*SCALE < a.length; i+=1) {
+      a[i*SCALE] = b[i*SCALE];
+    }
+  }
+  static void test_2ci_scl(long[] a, long[] b) { // two constant fills in one loop, touching only index i*SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) {
+      a[i*SCALE] = -123;
+      b[i*SCALE] = -103;
+    }
+  }
+  static void test_2vi_scl(long[] a, long[] b, long c, long d) { // two variable fills in one loop, touching only index i*SCALE
+    for (int i = 0; i*SCALE < a.length; i+=1) {
+      a[i*SCALE] = c;
+      b[i*SCALE] = d;
+    }
+  }
+  static void test_cp_alndst(long[] a, long[] b) { // copy b -> a with the destination ALIGN_OFF elements ahead of the source
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i+ALIGN_OFF] = b[i];
+    }
+  }
+  static void test_cp_alnsrc(long[] a, long[] b) { // copy b -> a with the source ALIGN_OFF elements ahead of the destination
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i] = b[i+ALIGN_OFF];
+    }
+  }
+  static void test_2ci_aln(long[] a, long[] b) { // constant fills in one loop; the store to a runs ALIGN_OFF elements ahead of the store to b
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i+ALIGN_OFF] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_aln(long[] a, long[] b, long c, long d) { // variable fills in one loop; the store to b runs ALIGN_OFF elements ahead of the store to a
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i] = c;
+      b[i+ALIGN_OFF] = d;
+    }
+  }
+  static void test_cp_unalndst(long[] a, long[] b) { // copy b -> a with the destination UNALIGN_OFF elements ahead of the source
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i+UNALIGN_OFF] = b[i];
+    }
+  }
+  static void test_cp_unalnsrc(long[] a, long[] b) { // copy b -> a with the source UNALIGN_OFF elements ahead of the destination
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i] = b[i+UNALIGN_OFF];
+    }
+  }
+  static void test_2ci_unaln(long[] a, long[] b) { // constant fills in one loop; the store to a runs UNALIGN_OFF elements ahead of the store to b
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i+UNALIGN_OFF] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_unaln(long[] a, long[] b, long c, long d) { // variable fills in one loop; the store to b runs UNALIGN_OFF elements ahead of the store to a
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i] = c;
+      b[i+UNALIGN_OFF] = d;
+    }
+  }
+
+  static int verify(String text, int i, long elem, long val) {
+    if (elem == val) {
+      return 0;  // match: nothing to add to the caller's error count
+    }
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;    // mismatch: reported on stderr and counted once
+  }
+}
--- /dev/null	Sat Jun  2 20:04:31 2012
+++ new/test/compiler/7119644/TestShortDoubleVect.java	Sat Jun  2 20:04:31 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestShortDoubleVect
+ */
+
+public class TestShortDoubleVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  // Entry point. Runs test(), prints PASSED when no errors were counted,
+  // otherwise reports the error count and exits with status 97.
+  public static void main(String args[]) {
+    System.out.println("Testing Short + Double vectors");
+    final int failures = test();
+    if (failures <= 0) { System.out.println("PASSED"); return; }
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() { // Warms up every kernel, verifies their results, then times each kernel; returns the error count.
+    short[] a1 = new short[ARRLEN];
+    short[] a2 = new short[ARRLEN];
+    double[] b1 = new double[ARRLEN];
+    double[] b2 = new double[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) { // repeated calls; presumably enough to get each kernel JIT-compiled (TODO confirm)
+      test_ci(a1, b1);
+      test_vi(a2, b2, (short)123, 103.);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (short)123, 103.);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (short)123, 103.);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (short)123, 103.);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (short)123, 103.);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize all four arrays to -1, overwriting whatever the warmup left behind
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.;
+      b2[i] = -1.;
+    }
+    // Test and verify results; errn gains one count per mismatching element
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci: b1", i, b1[i], -103.);
+      }
+      test_vi(a2, b2, (short)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (short)123);
+        errn += verify("test_vi: b2", i, b2[i], 103.);
+      }
+      test_cp(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (short)123);
+        errn += verify("test_cp: b1", i, b1[i], 103.);
+      }
+
+      // Reset for negative stride (kernels that iterate from the last index down to 0)
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.);
+      }
+      test_vi_neg(a2, b2, (short)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (short)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.);
+      }
+
+      // Reset for opposite stride (one array indexed upward while the other is indexed downward)
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.;
+        b2[i] = -1.;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.);
+      }
+      test_vi_oppos(a2, b2, (short)123, 103.);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (short)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative aligned offset (ALIGN_OFF): destinations -1, sources 123
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) { // tail not written by test_cp_alnsrc; still 123 from test_cp_alndst
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_ci_aln(a1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_vi_aln(a1, b1, (short)123, 103.);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset (UNALIGN_OFF): destinations -1, sources 123
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.;
+        b2[i] = 123.;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) { // tail not written by test_cp_unalnsrc; still 123 from test_cp_unalndst
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1; // int literal; widened to -1.0 on assignment to the double element
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1; // int literal; widened to -1.0 on assignment to the double element
+      }
+      test_vi_unaln(a1, b1, (short)123, 103.);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.);
+      }
+
+      // Reset for aligned overlap initialization: first ALIGN_OFF elements hold their own index, the rest -1
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (short)i;
+        b1[i] = (double)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_alndst(a1, a1, b1, b1); // source and destination alias; element i is expected to end up as i%ALIGN_OFF
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1); // aliased again; the -1 block at [ALIGN_OFF, 2*ALIGN_OFF) is expected to move down to the front
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+      // Reset for unaligned overlap initialization: first UNALIGN_OFF elements hold their own index, the rest -1
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (short)i;
+        b1[i] = (double)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.;
+      }
+      test_cp_unalndst(a1, a1, b1, b1); // source and destination alias; element i is expected to end up as i%UNALIGN_OFF
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (double)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1); // aliased again; the -1 block at [UNALIGN_OFF, 2*UNALIGN_OFF) is expected to move down to the front
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (double)v);
+      }
+
+    }
+
+    if (errn > 0) // verification failed: skip the timing runs
+      return errn;
+
+    System.out.println("Time");
+    long start, end; // System.currentTimeMillis() stamps; elapsed times are printed only, never checked
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (short)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (short)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (short)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (short)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (short)123, 103.);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(short[] a, double[] b) { // Kernel: constant fill of a short array and a double array in one loop.
+    for (int i = 0; i < a.length; i+=1) { // bound checks a.length only; b is indexed with the same i
+      a[i] = -123;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi(short[] a, double[] b, short c, double d) { // Kernel: fills a with c and b with d in one loop.
+    for (int i = 0; i < a.length; i+=1) { // bound checks a.length only; b is indexed with the same i
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(short[] a, short[] b, double[] c, double[] d) { // Kernel: element-wise copy b -> a and d -> c.
+    for (int i = 0; i < a.length; i+=1) { // bound checks a.length only; b, c and d are indexed with the same i
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(short[] a, double[] b) { // Kernel: constant fill of both arrays, iterating downward.
+    for (int i = a.length-1; i >= 0; i-=1) { // starts at the last index of a and counts down to 0
+      a[i] = -123;
+      b[i] = -103.;
+    }
+  }
+  static void test_vi_neg(short[] a, double[] b, short c, double d) { // Kernel: fills a with c and b with d, iterating downward.
+    for (int i = a.length-1; i >= 0; i-=1) { // starts at the last index of a and counts down to 0
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(short[] a, short[] b, double[] c, double[] d) { // Kernel: copy b -> a and d -> c, iterating downward.
+    for (int i = a.length-1; i >= 0; i-=1) { // starts at the last index of a and counts down to 0
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(short[] a, double[] b) { // Kernel: constant fill with the two arrays walked in opposite directions.
+    int limit = a.length-1; // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123; // a is written from the last element toward the first
+      b[i] = -103.; // b is written from the first element upward
+    }
+  }
+  static void test_vi_oppos(short[] a, double[] b, short c, double d) { // Kernel: variable fill with the arrays walked in opposite directions.
+    int limit = a.length-1; // last valid index of a
+    for (int i = a.length-1; i >= 0; i-=1) {
+      a[i] = c; // a is written from the last element toward the first
+      b[limit-i] = d; // b is written from the first element upward
+    }
+  }
+  static void test_cp_oppos(short[] a, short[] b, double[] c, double[] d) { // Kernel: reversing copies b -> a and d -> c.
+    int limit = a.length-1; // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i]; // a receives b in reverse order
+      c[limit-i] = d[i]; // c receives d in reverse order
+    }
+  }
+  static void test_ci_aln(short[] a, double[] b) { // Kernel: constant fill; a is shifted by ALIGN_OFF (8), b is not.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i+ALIGN_OFF] = -123; // a[0..ALIGN_OFF-1] is never written
+      b[i] = -103.; // only indices below a.length-ALIGN_OFF are written
+    }
+  }
+  static void test_vi_aln(short[] a, double[] b, short c, double d) { // Kernel: variable fill; b is shifted by ALIGN_OFF (8), a is not.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {
+      a[i] = c; // only indices below a.length-ALIGN_OFF are written
+      b[i+ALIGN_OFF] = d; // b[0..ALIGN_OFF-1] is never written
+    }
+  }
+  static void test_cp_alndst(short[] a, short[] b, double[] c, double[] d) { // Kernel: copies b -> a and d -> c with the destination shifted by ALIGN_OFF (8).
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // the first ALIGN_OFF elements of a and c are never written
+      a[i+ALIGN_OFF] = b[i];
+      c[i+ALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_alnsrc(short[] a, short[] b, double[] c, double[] d) { // Kernel: copies b -> a and d -> c with the source shifted by ALIGN_OFF (8).
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // only indices below a.length-ALIGN_OFF of a and c are written
+      a[i] = b[i+ALIGN_OFF];
+      c[i] = d[i+ALIGN_OFF];
+    }
+  }
+  static void test_ci_unaln(short[] a, double[] b) { // Kernel: constant fill; a is shifted by UNALIGN_OFF (5), b is not.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i+UNALIGN_OFF] = -123; // a[0..UNALIGN_OFF-1] is never written
+      b[i] = -103.; // only indices below a.length-UNALIGN_OFF are written
+    }
+  }
+  static void test_vi_unaln(short[] a, double[] b, short c, double d) { // Kernel: variable fill; b is shifted by UNALIGN_OFF (5), a is not.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {
+      a[i] = c; // only indices below a.length-UNALIGN_OFF are written
+      b[i+UNALIGN_OFF] = d; // b[0..UNALIGN_OFF-1] is never written
+    }
+  }
+  static void test_cp_unalndst(short[] a, short[] b, double[] c, double[] d) { // Kernel: copies b -> a and d -> c with the destination shifted by UNALIGN_OFF (5).
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // the first UNALIGN_OFF elements of a and c are never written
+      a[i+UNALIGN_OFF] = b[i];
+      c[i+UNALIGN_OFF] = d[i];
+    }
+  }
+  static void test_cp_unalnsrc(short[] a, short[] b, double[] c, double[] d) { // Kernel: copies b -> a and d -> c with the source shifted by UNALIGN_OFF (5).
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // only indices below a.length-UNALIGN_OFF of a and c are written
+      a[i] = b[i+UNALIGN_OFF];
+      c[i] = d[i+UNALIGN_OFF];
+    }
+  }
+
+  // Checks one short element against its expected value.
+  // Returns 0 on a match; otherwise reports the mismatch on stderr and returns 1.
+  static int verify(String text, int i, short elem, short val) {
+    if (elem == val) return 0;
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+  // Checks one double element against its expected value using exact comparison.
+  // Returns 0 on a match; otherwise (including NaN operands) reports on stderr and returns 1.
+  static int verify(String text, int i, double elem, double val) {
+    if (elem == val) return 0;
+    System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:31 2012
+++ new/test/compiler/7119644/TestShortFloatVect.java	Sat Jun  2 20:04:31 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestShortFloatVect
+ */
+
+public class TestShortFloatVect {
+  private static final int ARRLEN = 997;
+  private static final int ITERS  = 11000;
+  private static final int OFFSET = 3;
+  private static final int SCALE = 2;
+  private static final int ALIGN_OFF = 8;
+  private static final int UNALIGN_OFF = 5;
+
+  // Entry point. Runs test(), prints PASSED when no errors were counted,
+  // otherwise reports the error count and exits with status 97.
+  public static void main(String args[]) {
+    System.out.println("Testing Short + Float vectors");
+    final int failures = test();
+    if (failures <= 0) { System.out.println("PASSED"); return; }
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() {
+    short[] a1 = new short[ARRLEN];
+    short[] a2 = new short[ARRLEN];
+    float[] b1 = new float[ARRLEN];
+    float[] b2 = new float[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+      test_vi(a2, b2, (short)123, 103.f);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (short)123, 103.f);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (short)123, 103.f);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (short)123, 103.f);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (short)123, 103.f);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1.f;
+      b2[i] = -1.f;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci: b1", i, b1[i], -103.f);
+      }
+      test_vi(a2, b2, (short)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (short)123);
+        errn += verify("test_vi: b2", i, b2[i], 103.f);
+      }
+      test_cp(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (short)123);
+        errn += verify("test_cp: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.f;
+        b2[i] = -1.f;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], -103.f);
+      }
+      test_vi_neg(a2, b2, (short)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (short)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], 103.f);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1.f;
+        b2[i] = -1.f;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], -103.f);
+      }
+      test_vi_oppos(a2, b2, (short)123, 103.f);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (short)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], 103.f);
+      }
+      test_cp_oppos(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.f;
+        b2[i] = 123.f;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.f;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], -123.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], 123.f);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_ci_aln(a1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -103.f);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], -1.f);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_vi_aln(a1, b1, (short)123, 103.f);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1.f;
+        b2[i] = 123.f;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123.f;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], -123.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], 123.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -103.f);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], -1.f);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (short)123, 103.f);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], 103.f);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (short)i;
+        b1[i] = (float)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_cp_alndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (float)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1.f;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], -1.f);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (float)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (short)i;
+        b1[i] = (float)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1.f;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (float)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1.f;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], -1.f);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (float)v);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (short)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (short)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (short)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (short)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (short)123, 103.f);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;
+  }
+
+  static void test_ci(short[] a, float[] b) {  // stores fixed constants into every element of a and b
+    for (int i = 0; i < a.length; i+=1) {  // ascending index; a.length bounds the accesses to both arrays
+      a[i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi(short[] a, float[] b, short c, float d) {  // stores the caller-supplied c into a and d into b
+    for (int i = 0; i < a.length; i+=1) {  // ascending index; a.length bounds the accesses to both arrays
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(short[] a, short[] b, float[] c, float[] d) {  // element-wise copy: b into a, d into c
+    for (int i = 0; i < a.length; i+=1) {  // ascending index; a.length bounds the accesses to all four arrays
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(short[] a, float[] b) {  // same stores as test_ci, walked with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // from the last element of a down to index 0
+      a[i] = -123;
+      b[i] = -103.f;
+    }
+  }
+  static void test_vi_neg(short[] a, float[] b, short c, float d) {  // same stores as test_vi, walked with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // from the last element of a down to index 0
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(short[] a, short[] b, float[] c, float[] d) {  // same copies as test_cp, walked with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // from the last element of a down to index 0
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(short[] a, float[] b) {  // constant stores; the two arrays are walked in opposite directions
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;  // a is written from its last element towards index 0
+      b[i] = -103.f;  // b is written from index 0 upwards in the same iteration
+    }
+  }
+  static void test_vi_oppos(short[] a, float[] b, short c, float d) {  // stores c/d; the two arrays are walked in opposite directions
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) {  // descending loop index
+      a[i] = c;  // a is written from its last element towards index 0
+      b[limit-i] = d;  // b is written from index 0 upwards in the same iteration
+    }
+  }
+  static void test_cp_oppos(short[] a, short[] b, float[] c, float[] d) {  // copies with source and destination walked in opposite directions
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];  // a receives the elements of b in reversed order
+      c[limit-i] = d[i];  // c receives the elements of d in reversed order
+    }
+  }
+  static void test_ci_aln(short[] a, float[] b) {  // constant stores; the index into a is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short so a[i+ALIGN_OFF] stays in bounds
+      a[i+ALIGN_OFF] = -123;  // the first ALIGN_OFF elements of a are never written
+      b[i] = -103.f;  // elements of b at index >= a.length-ALIGN_OFF are never written
+    }
+  }
+  static void test_vi_aln(short[] a, float[] b, short c, float d) {  // stores c/d; the index into b is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length
+      a[i] = c;  // the last ALIGN_OFF elements of a are never written
+      b[i+ALIGN_OFF] = d;  // the first ALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_alndst(short[] a, short[] b, float[] c, float[] d) {  // copy with the destination index shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short so the shifted stores stay in bounds
+      a[i+ALIGN_OFF] = b[i];  // the first ALIGN_OFF elements of a are never written
+      c[i+ALIGN_OFF] = d[i];  // likewise for the first ALIGN_OFF elements of c
+    }
+  }
+  static void test_cp_alnsrc(short[] a, short[] b, float[] c, float[] d) {  // copy with the source index shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short so the shifted loads stay in bounds
+      a[i] = b[i+ALIGN_OFF];  // the last ALIGN_OFF elements of a are never written
+      c[i] = d[i+ALIGN_OFF];  // elements of c at index >= a.length-ALIGN_OFF are never written
+    }
+  }
+  static void test_ci_unaln(short[] a, float[] b) {  // constant stores; the index into a is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short so a[i+UNALIGN_OFF] stays in bounds
+      a[i+UNALIGN_OFF] = -123;  // the first UNALIGN_OFF elements of a are never written
+      b[i] = -103.f;  // elements of b at index >= a.length-UNALIGN_OFF are never written
+    }
+  }
+  static void test_vi_unaln(short[] a, float[] b, short c, float d) {  // stores c/d; the index into b is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length
+      a[i] = c;  // the last UNALIGN_OFF elements of a are never written
+      b[i+UNALIGN_OFF] = d;  // the first UNALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_unalndst(short[] a, short[] b, float[] c, float[] d) {  // copy with the destination index shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short so the shifted stores stay in bounds
+      a[i+UNALIGN_OFF] = b[i];  // the first UNALIGN_OFF elements of a are never written
+      c[i+UNALIGN_OFF] = d[i];  // likewise for the first UNALIGN_OFF elements of c
+    }
+  }
+  static void test_cp_unalnsrc(short[] a, short[] b, float[] c, float[] d) {  // copy with the source index shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short so the shifted loads stay in bounds
+      a[i] = b[i+UNALIGN_OFF];  // the last UNALIGN_OFF elements of a are never written
+      c[i] = d[i+UNALIGN_OFF];  // elements of c at index >= a.length-UNALIGN_OFF are never written
+    }
+  }
+
+  static int verify(String text, int i, short elem, short val) {
+    if (elem == val) return 0;  // match: contributes nothing to the caller's error count
+    // Mismatch: print "<text>[<i>] = <elem> != <val>" on stderr and count one error.
+    final String report = text + "[" + i + "] = " + elem + " != " + val;
+    System.err.println(report);
+    return 1;
+  }
+  static int verify(String text, int i, float elem, float val) {
+    if (elem == val) return 0;  // exact float equality: contributes nothing to the caller's error count
+    // Mismatch (NaN never compares equal): print "<text>[<i>] = <elem> != <val>" on stderr.
+    final String report = text + "[" + i + "] = " + elem + " != " + val;
+    System.err.println(report);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:32 2012
+++ new/test/compiler/7119644/TestShortIntVect.java	Sat Jun  2 20:04:31 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestShortIntVect
+ */
+
+public class TestShortIntVect {
+  private static final int ARRLEN = 997;  // length of every array allocated by test()
+  private static final int ITERS  = 11000;  // repetitions of each kernel in the warmup and timing loops
+  private static final int OFFSET = 3;  // not referenced anywhere in this class
+  private static final int SCALE = 2;  // not referenced anywhere in this class
+  private static final int ALIGN_OFF = 8;  // index shift used by the *_aln, *_alndst and *_alnsrc kernels
+  private static final int UNALIGN_OFF = 5;  // index shift used by the *_unaln, *_unalndst and *_unalnsrc kernels
+
+  public static void main(String args[]) {
+    // Announce the test, run it, then report the outcome on stdout/stderr.
+    System.out.println("Testing Short + Integer vectors");
+    final int failures = test();  // count of mismatches reported by test()
+    if (failures <= 0) { System.out.println("PASSED"); return; }
+    // At least one mismatch: fail the run with a nonzero exit status.
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() {  // runs warmup, verification and timing; returns the number of mismatches found
+    short[] a1 = new short[ARRLEN];
+    short[] a2 = new short[ARRLEN];
+    int[] b1 = new int[ARRLEN];
+    int[] b2 = new int[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {  // call every kernel ITERS times before any result is checked (presumably to get them JIT-compiled -- confirm)
+      test_ci(a1, b1);
+      test_vi(a2, b2, (short)123, (int)103);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (short)123, (int)103);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (short)123, (int)103);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (short)123, (int)103);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (short)123, (int)103);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize: overwrite whatever the warmup left in all four arrays with -1
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1;
+      b2[i] = -1;
+    }
+    // Test and verify results: each kernel runs once, then every element is compared with its expected value
+    System.out.println("Verification");
+    int errn = 0;  // running total of mismatches reported by verify()
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci: b1", i, b1[i], (int)-103);
+      }
+      test_vi(a2, b2, (short)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (short)123);
+        errn += verify("test_vi: b2", i, b2[i], (int)103);
+      }
+      test_cp(a1, a2, b1, b2);  // copies the values test_vi just stored in a2/b2 into a1/b1
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (short)123);
+        errn += verify("test_cp: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], (int)-103);
+      }
+      test_vi_neg(a2, b2, (short)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (short)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], (int)103);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], (int)-103);
+      }
+      test_vi_oppos(a2, b2, (short)123, (int)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (short)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], (int)103);
+      }
+      test_cp_oppos(a1, a2, b1, b2);  // sources are uniform, so the reversed copy still yields one value everywhere
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {  // the first ALIGN_OFF destination elements must keep their initial -1
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (int)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (int)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {  // the tail is not written by test_cp_alnsrc and still holds 123 from test_cp_alndst
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (int)123);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_aln(a1, b1);  // writes a1 at i+ALIGN_OFF and b1 at i, so the untouched ranges differ per array
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (int)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (int)-1);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_aln(a1, b1, (short)123, (int)103);  // writes a1 at i and b1 at i+ALIGN_OFF (mirror image of test_ci_aln)
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {  // the first UNALIGN_OFF destination elements must keep their initial -1
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (int)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (int)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {  // the tail is not written by test_cp_unalnsrc and still holds 123 from test_cp_unalndst
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (int)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);  // writes a1 at i+UNALIGN_OFF and b1 at i, so the untouched ranges differ per array
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (int)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (int)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (short)123, (int)103);  // writes a1 at i and b1 at i+UNALIGN_OFF (mirror image of test_ci_unaln)
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (int)103);
+      }
+
+      // Reset for aligned overlap initialization: seed the first ALIGN_OFF elements with 0..ALIGN_OFF-1, the rest with -1
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (short)i;
+        b1[i] = (int)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_alndst(a1, a1, b1, b1);  // source and destination are the same array: each store feeds a later load
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;  // the seed values are expected to repeat with period ALIGN_OFF
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (int)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {  // overwrite elements ALIGN_OFF..2*ALIGN_OFF-1 with -1 before the backward-shifting copy
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);  // same array on both sides: each element takes the value ALIGN_OFF positions ahead
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (int)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (int)v);
+      }
+
+      // Reset for unaligned overlap initialization: seed the first UNALIGN_OFF elements with 0..UNALIGN_OFF-1, the rest with -1
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (short)i;
+        b1[i] = (int)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);  // source and destination are the same array: each store feeds a later load
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;  // the seed values are expected to repeat with period UNALIGN_OFF
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (int)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {  // overwrite elements UNALIGN_OFF..2*UNALIGN_OFF-1 with -1 before the backward-shifting copy
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);  // same array on both sides: each element takes the value UNALIGN_OFF positions ahead
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (int)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (int)v);
+      }
+
+    }
+
+    if (errn > 0)  // skip the timing section when verification already failed
+      return errn;
+
+    System.out.println("Time");  // each kernel below is run ITERS times and its elapsed milliseconds printed; results are not checked
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (short)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (short)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (short)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (short)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (short)123, (int)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;  // always 0 here: a nonzero count returned before the timing section
+  }
+
+  static void test_ci(short[] a, int[] b) {  // stores fixed constants into every element of a and b
+    for (int i = 0; i < a.length; i+=1) {  // ascending index; a.length bounds the accesses to both arrays
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi(short[] a, int[] b, short c, int d) {  // stores the caller-supplied c into a and d into b
+    for (int i = 0; i < a.length; i+=1) {  // ascending index; a.length bounds the accesses to both arrays
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(short[] a, short[] b, int[] c, int[] d) {  // element-wise copy: b into a, d into c
+    for (int i = 0; i < a.length; i+=1) {  // ascending index; a.length bounds the accesses to all four arrays
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(short[] a, int[] b) {  // same stores as test_ci, walked with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // from the last element of a down to index 0
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi_neg(short[] a, int[] b, short c, int d) {  // same stores as test_vi, walked with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // from the last element of a down to index 0
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(short[] a, short[] b, int[] c, int[] d) {  // same copies as test_cp, walked with a descending index
+    for (int i = a.length-1; i >= 0; i-=1) {  // from the last element of a down to index 0
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(short[] a, int[] b) {  // constant stores; the two arrays are walked in opposite directions
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[limit-i] = -123;  // a is written from its last element towards index 0
+      b[i] = -103;  // b is written from index 0 upwards in the same iteration
+    }
+  }
+  static void test_vi_oppos(short[] a, int[] b, short c, int d) {  // stores c/d; the two arrays are walked in opposite directions
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = a.length-1; i >= 0; i-=1) {  // descending loop index
+      a[i] = c;  // a is written from its last element towards index 0
+      b[limit-i] = d;  // b is written from index 0 upwards in the same iteration
+    }
+  }
+  static void test_cp_oppos(short[] a, short[] b, int[] c, int[] d) {  // copies with source and destination walked in opposite directions
+    int limit = a.length-1;  // index of the last element of a
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];  // a receives the elements of b in reversed order
+      c[limit-i] = d[i];  // c receives the elements of d in reversed order
+    }
+  }
+  static void test_ci_aln(short[] a, int[] b) {  // constant stores; the index into a is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short so a[i+ALIGN_OFF] stays in bounds
+      a[i+ALIGN_OFF] = -123;  // the first ALIGN_OFF elements of a are never written
+      b[i] = -103;  // elements of b at index >= a.length-ALIGN_OFF are never written
+    }
+  }
+  static void test_vi_aln(short[] a, int[] b, short c, int d) {  // stores c/d; the index into b is shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short of a.length
+      a[i] = c;  // the last ALIGN_OFF elements of a are never written
+      b[i+ALIGN_OFF] = d;  // the first ALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_alndst(short[] a, short[] b, int[] c, int[] d) {  // copy with the destination index shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short so the shifted stores stay in bounds
+      a[i+ALIGN_OFF] = b[i];  // the first ALIGN_OFF elements of a are never written
+      c[i+ALIGN_OFF] = d[i];  // likewise for the first ALIGN_OFF elements of c
+    }
+  }
+  static void test_cp_alnsrc(short[] a, short[] b, int[] c, int[] d) {  // copy with the source index shifted by ALIGN_OFF
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // stops ALIGN_OFF short so the shifted loads stay in bounds
+      a[i] = b[i+ALIGN_OFF];  // the last ALIGN_OFF elements of a are never written
+      c[i] = d[i+ALIGN_OFF];  // elements of c at index >= a.length-ALIGN_OFF are never written
+    }
+  }
+  static void test_ci_unaln(short[] a, int[] b) {  // constant stores; the index into a is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short so a[i+UNALIGN_OFF] stays in bounds
+      a[i+UNALIGN_OFF] = -123;  // the first UNALIGN_OFF elements of a are never written
+      b[i] = -103;  // elements of b at index >= a.length-UNALIGN_OFF are never written
+    }
+  }
+  static void test_vi_unaln(short[] a, int[] b, short c, int d) {  // stores c/d; the index into b is shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short of a.length
+      a[i] = c;  // the last UNALIGN_OFF elements of a are never written
+      b[i+UNALIGN_OFF] = d;  // the first UNALIGN_OFF elements of b are never written
+    }
+  }
+  static void test_cp_unalndst(short[] a, short[] b, int[] c, int[] d) {  // copy with the destination index shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short so the shifted stores stay in bounds
+      a[i+UNALIGN_OFF] = b[i];  // the first UNALIGN_OFF elements of a are never written
+      c[i+UNALIGN_OFF] = d[i];  // likewise for the first UNALIGN_OFF elements of c
+    }
+  }
+  static void test_cp_unalnsrc(short[] a, short[] b, int[] c, int[] d) {  // copy with the source index shifted by UNALIGN_OFF
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // stops UNALIGN_OFF short so the shifted loads stay in bounds
+      a[i] = b[i+UNALIGN_OFF];  // the last UNALIGN_OFF elements of a are never written
+      c[i] = d[i+UNALIGN_OFF];  // elements of c at index >= a.length-UNALIGN_OFF are never written
+    }
+  }
+
+  static int verify(String text, int i, short elem, short val) {
+    if (elem == val) return 0;  // match: contributes nothing to the caller's error count
+    // Mismatch: print "<text>[<i>] = <elem> != <val>" on stderr and count one error.
+    final String report = text + "[" + i + "] = " + elem + " != " + val;
+    System.err.println(report);
+    return 1;
+  }
+  static int verify(String text, int i, int elem, int val) {
+    if (elem == val) return 0;  // match: contributes nothing to the caller's error count
+    // Mismatch: print "<text>[<i>] = <elem> != <val>" on stderr and count one error.
+    final String report = text + "[" + i + "] = " + elem + " != " + val;
+    System.err.println(report);
+    return 1;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:32 2012
+++ new/test/compiler/7119644/TestShortLongVect.java	Sat Jun  2 20:04:32 2012
@@ -0,0 +1,571 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestShortLongVect
+ */
+
+public class TestShortLongVect {
+  private static final int ARRLEN = 997;  // length of every array allocated in test()
+  private static final int ITERS  = 11000;  // calls per kernel in the warmup loop and in each timing loop
+  private static final int OFFSET = 3;  // not referenced anywhere in this class
+  private static final int SCALE = 2;  // not referenced anywhere in this class
+  private static final int ALIGN_OFF = 8;  // index shift used by the *_aln kernels and their checks
+  private static final int UNALIGN_OFF = 5;  // index shift used by the *_unaln kernels and their checks
+
+  // Entry point: runs every check once and reports the outcome on stdout/stderr.
+  public static void main(String args[]) {
+    System.out.println("Testing Short + Long vectors");
+    final int failures = test();
+    if (failures <= 0) { System.out.println("PASSED"); return; }
+    // Any mismatch fails the run with exit status 97.
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() {  // Warms up every kernel, checks each one against expected array contents, then prints timings; returns the mismatch count.
+    short[] a1 = new short[ARRLEN];
+    short[] a2 = new short[ARRLEN];
+    long[] b1 = new long[ARRLEN];
+    long[] b2 = new long[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {  // results are not checked here; presumably this gets each kernel JIT-compiled before verification
+      test_ci(a1, b1);
+      test_vi(a2, b2, (short)123, (long)103);
+      test_cp(a1, a2, b1, b2);
+      test_ci_neg(a1, b1);
+      test_vi_neg(a1, b1, (short)123, (long)103);
+      test_cp_neg(a1, a2, b1, b2);
+      test_ci_oppos(a1, b1);
+      test_vi_oppos(a1, b1, (short)123, (long)103);
+      test_cp_oppos(a1, a2, b1, b2);
+      test_ci_aln(a1, b1);
+      test_vi_aln(a1, b1, (short)123, (long)103);
+      test_cp_alndst(a1, a2, b1, b2);
+      test_cp_alnsrc(a1, a2, b1, b2);
+      test_ci_unaln(a1, b1);
+      test_vi_unaln(a1, b1, (short)123, (long)103);
+      test_cp_unalndst(a1, a2, b1, b2);
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {  // -1 is the "never written" marker the checks below look for
+      a1[i] = -1;
+      a2[i] = -1;
+      b1[i] = -1;
+      b2[i] = -1;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;  // running count of mismatches; each verify() call adds 0 or 1
+    {
+      test_ci(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci: b1", i, b1[i], (long)-103);
+      }
+      test_vi(a2, b2, (short)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (short)123);
+        errn += verify("test_vi: b2", i, b2[i], (long)103);
+      }
+      test_cp(a1, a2, b1, b2);  // copies the values test_vi just stored in a2/b2 into a1/b1
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (short)123);
+        errn += verify("test_cp: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_neg(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci_neg: b1", i, b1[i], (long)-103);
+      }
+      test_vi_neg(a2, b2, (short)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (short)123);
+        errn += verify("test_vi_neg: b2", i, b2[i], (long)103);
+      }
+      test_cp_neg(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_neg: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+        b1[i] = -1;
+        b2[i] = -1;
+      }
+      test_ci_oppos(a1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (short)-123);
+        errn += verify("test_ci_oppos: b1", i, b1[i], (long)-103);
+      }
+      test_vi_oppos(a2, b2, (short)123, (long)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (short)123);
+        errn += verify("test_vi_oppos: b2", i, b2[i], (long)103);
+      }
+      test_cp_oppos(a1, a2, b1, b2);  // sources are uniform, so the reversed copy is checked only for value, not for order
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_oppos: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {  // destinations hold the marker -1, sources hold 123
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_alndst(a1, a2, b1, b2);
+      for (int i=0; i<ALIGN_OFF; i++) {  // the first ALIGN_OFF destination slots must be untouched
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_alndst: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {  // change the source value so the next copy is distinguishable from the previous one
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_alnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)-123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (long)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {  // the tail still holds the 123 left by test_cp_alndst
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_alnsrc: b1", i, b1[i], (long)123);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_aln(a1, b1);  // shifts only the short store, so a1 and b1 have different untouched regions
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (long)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_aln: b1", i, b1[i], (long)-1);
+      }
+
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_aln(a1, b1, (short)123, (long)103);  // shifts only the long store
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_aln: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {  // same sequence as the aligned case above, with UNALIGN_OFF as the shift
+        a1[i] = -1;
+        a2[i] = 123;
+        b1[i] = -1;
+        b2[i] = 123;
+      }
+      test_cp_unalndst(a1, a2, b1, b2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_unalndst: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a2[i] = -123;
+        b2[i] = -123;
+      }
+      test_cp_unalnsrc(a1, a2, b1, b2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)-123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (long)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {  // the tail still holds the 123 left by test_cp_unalndst
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)123);
+        errn += verify("test_cp_unalnsrc: b1", i, b1[i], (long)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_ci_unaln(a1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (long)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_ci_unaln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_vi_unaln(a1, b1, (short)123, (long)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_vi_unaln: b1", i, b1[i], (long)103);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {  // seed the first ALIGN_OFF slots with 0..ALIGN_OFF-1
+        a1[i] = (short)i;
+        b1[i] = (long)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_alndst(a1, a1, b1, b1);  // source and destination are the same array: each store feeds a later load
+      for (int i=0; i<ARRLEN; i++) {  // in-order execution repeats the seed with period ALIGN_OFF
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_alndst_overlap: b1", i, b1[i], (long)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {  // overwrite the second period with the marker before copying backwards
+        a1[i+ALIGN_OFF] = -1;
+        b1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1, b1, b1);  // same-array copy again, this time reading ahead of the store
+      for (int i=0; i<ALIGN_OFF; i++) {  // the marker block must have moved down to the front
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (long)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_alnsrc_overlap: b1", i, b1[i], (long)v);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {  // same overlap scenario as above, with UNALIGN_OFF as the period
+        a1[i] = (short)i;
+        b1[i] = (long)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+        b1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1, b1, b1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_unalndst_overlap: b1", i, b1[i], (long)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+        b1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1, b1, b1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)-1);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (long)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)v);
+        errn += verify("test_cp_unalnsrc_overlap: b1", i, b1[i], (long)v);
+      }
+
+    }
+
+    if (errn > 0)  // skip the timing section entirely when verification found mismatches
+      return errn;
+
+    System.out.println("Time");  // from here on elapsed milliseconds are only printed, never checked
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, b2, (short)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a1, b1, (short)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a1, b1, (short)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_aln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_aln(a1, b1, (short)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_unaln(a1, b1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_unaln(a1, b1, (short)123, (long)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2, b1, b2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    return errn;  // errn is 0 here: any positive count already returned before the timing section
+  }
+
+  static void test_ci(short[] a, long[] b) {  // Fills a with the constant -123 and b with the constant -103.
+    for (int i = 0; i < a.length; i+=1) {  // bound comes from a only; assumes b.length >= a.length
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi(short[] a, long[] b, short c, long d) {  // Fills a with the caller-supplied c and b with the caller-supplied d.
+    for (int i = 0; i < a.length; i+=1) {  // bound comes from a only; assumes b.length >= a.length
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp(short[] a, short[] b, long[] c, long[] d) {  // Copies b into a and d into c, element by element.
+    for (int i = 0; i < a.length; i+=1) {  // bound comes from a only; assumes the other arrays are at least as long
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_neg(short[] a, long[] b) {  // Same constant fill as test_ci, but walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // starts at a's last index; assumes b.length >= a.length
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_vi_neg(short[] a, long[] b, short c, long d) {  // Same variable fill as test_vi, but walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // starts at a's last index; assumes b.length >= a.length
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_cp_neg(short[] a, short[] b, long[] c, long[] d) {  // Same copy as test_cp, but walking the index downwards.
+    for (int i = a.length-1; i >= 0; i-=1) {  // starts at a's last index; assumes the other arrays are at least as long
+      a[i] = b[i];
+      c[i] = d[i];
+    }
+  }
+  static void test_ci_oppos(short[] a, long[] b) {  // Constant fill where the two stores move through their arrays in opposite directions.
+    int limit = a.length-1;  // last valid index of a
+    for (int i = 0; i < a.length; i+=1) {  // assumes b.length >= a.length
+      a[limit-i] = -123;  // a is written from its last element down to index 0
+      b[i] = -103;  // b is written from index 0 upwards
+    }
+  }
+  static void test_vi_oppos(short[] a, long[] b, short c, long d) {  // Variable fill where the two stores move through their arrays in opposite directions.
+    int limit = a.length-1;  // last valid index of a
+    for (int i = a.length-1; i >= 0; i-=1) {  // unlike test_ci_oppos, the index itself counts down here
+      a[i] = c;  // a is written from its last element down to index 0
+      b[limit-i] = d;  // b is written from index 0 upwards; assumes b.length >= a.length
+    }
+  }
+  static void test_cp_oppos(short[] a, short[] b, long[] c, long[] d) {  // Copies b into a and d into c in reversed element order.
+    int limit = a.length-1;  // last valid index of a; also used to index b and c, which are assumed at least as long
+    for (int i = 0; i < a.length; i+=1) {
+      a[i] = b[limit-i];  // load runs backwards, store runs forwards
+      c[limit-i] = d[i];  // load runs forwards, store runs backwards
+    }
+  }
+  static void test_ci_aln(short[] a, long[] b) {  // Constant fill of both arrays; only the short store is shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // bound keeps a[i+ALIGN_OFF] inside a
+      a[i+ALIGN_OFF] = -123;  // a[0..ALIGN_OFF) is never written
+      b[i] = -103;  // assumes b has at least a.length-ALIGN_OFF elements
+    }
+  }
+  static void test_vi_aln(short[] a, long[] b, short c, long d) {  // Fill with caller-supplied values; only the long store is shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // bound is derived from a only
+      a[i] = c;  // the last ALIGN_OFF elements of a are never written
+      b[i+ALIGN_OFF] = d;  // b[0..ALIGN_OFF) is never written; assumes b.length >= a.length
+    }
+  }
+  static void test_cp_alndst(short[] a, short[] b, long[] c, long[] d) {  // Element copy b->a and d->c with the destination index shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // test() also calls this with a==b and c==d to exercise overlapping copies
+      a[i+ALIGN_OFF] = b[i];  // a[0..ALIGN_OFF) keeps its previous contents
+      c[i+ALIGN_OFF] = d[i];  // c[0..ALIGN_OFF) keeps its previous contents
+    }
+  }
+  static void test_cp_alnsrc(short[] a, short[] b, long[] c, long[] d) {  // Element copy b->a and d->c with the source index shifted by ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) {  // test() also calls this with a==b and c==d to exercise overlapping copies
+      a[i] = b[i+ALIGN_OFF];  // the last ALIGN_OFF elements of a keep their previous contents
+      c[i] = d[i+ALIGN_OFF];  // the last ALIGN_OFF elements of c keep their previous contents
+    }
+  }
+  static void test_ci_unaln(short[] a, long[] b) {  // Constant fill of both arrays; only the short store is shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // bound keeps a[i+UNALIGN_OFF] inside a
+      a[i+UNALIGN_OFF] = -123;  // a[0..UNALIGN_OFF) is never written
+      b[i] = -103;  // assumes b has at least a.length-UNALIGN_OFF elements
+    }
+  }
+  static void test_vi_unaln(short[] a, long[] b, short c, long d) {  // Fill with caller-supplied values; only the long store is shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // bound is derived from a only
+      a[i] = c;  // the last UNALIGN_OFF elements of a are never written
+      b[i+UNALIGN_OFF] = d;  // b[0..UNALIGN_OFF) is never written; assumes b.length >= a.length
+    }
+  }
+  static void test_cp_unalndst(short[] a, short[] b, long[] c, long[] d) {  // Element copy b->a and d->c with the destination index shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // test() also calls this with a==b and c==d to exercise overlapping copies
+      a[i+UNALIGN_OFF] = b[i];  // a[0..UNALIGN_OFF) keeps its previous contents
+      c[i+UNALIGN_OFF] = d[i];  // c[0..UNALIGN_OFF) keeps its previous contents
+    }
+  }
+  static void test_cp_unalnsrc(short[] a, short[] b, long[] c, long[] d) {  // Element copy b->a and d->c with the source index shifted by UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) {  // test() also calls this with a==b and c==d to exercise overlapping copies
+      a[i] = b[i+UNALIGN_OFF];  // the last UNALIGN_OFF elements of a keep their previous contents
+      c[i] = d[i+UNALIGN_OFF];  // the last UNALIGN_OFF elements of c keep their previous contents
+    }
+  }
+
+  static int verify(String text, int i, short elem, short val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+  static int verify(String text, int i, long elem, long val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}
--- /dev/null	Sat Jun  2 20:04:32 2012
+++ new/test/compiler/7119644/TestShortVect.java	Sat Jun  2 20:04:32 2012
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 7119644
+ * @summary Increase superword's vector size up to 256 bits
+ *
+ * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions -XX:-TieredCompilation -XX:-OptimizeFill TestShortVect
+ */
+
+public class TestShortVect {
+  private static final int ARRLEN = 997;  // length of the arrays allocated in test()
+  private static final int ITERS  = 11000;  // number of warmup calls made to each kernel in test()
+  private static final int OFFSET = 3;  // passed to the *_inv kernels; first index checked after the *_off and *_inv kernels
+  private static final int SCALE = 2;  // stride assumed by the *_scl checks (i%SCALE, i*SCALE)
+  private static final int ALIGN_OFF = 8;  // size of the untouched region expected by the *_aln / *_alndst / *_alnsrc checks
+  private static final int UNALIGN_OFF = 5;  // size of the untouched region expected by the *_unaln* checks
+
+  // Entry point: runs every check once and reports the outcome on stdout/stderr.
+  public static void main(String args[]) {
+    System.out.println("Testing Short vectors");
+    final int failures = test();
+    if (failures <= 0) { System.out.println("PASSED"); return; }
+    // Any mismatch fails the run with exit status 97.
+    System.err.println("FAILED: " + failures + " errors");
+    System.exit(97);
+  }
+
+  static int test() {
+    short[] a1 = new short[ARRLEN];
+    short[] a2 = new short[ARRLEN];
+    System.out.println("Warmup");
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+      test_vi(a2, (short)123);
+      test_cp(a1, a2);
+      test_2ci(a1, a2);
+      test_2vi(a1, a2, (short)123, (short)103);
+      test_ci_neg(a1);
+      test_vi_neg(a2, (short)123);
+      test_cp_neg(a1, a2);
+      test_2ci_neg(a1, a2);
+      test_2vi_neg(a1, a2, (short)123, (short)103);
+      test_ci_oppos(a1);
+      test_vi_oppos(a2, (short)123);
+      test_cp_oppos(a1, a2);
+      test_2ci_oppos(a1, a2);
+      test_2vi_oppos(a1, a2, (short)123, (short)103);
+      test_ci_off(a1);
+      test_vi_off(a2, (short)123);
+      test_cp_off(a1, a2);
+      test_2ci_off(a1, a2);
+      test_2vi_off(a1, a2, (short)123, (short)103);
+      test_ci_inv(a1, OFFSET);
+      test_vi_inv(a2, (short)123, OFFSET);
+      test_cp_inv(a1, a2, OFFSET);
+      test_2ci_inv(a1, a2, OFFSET);
+      test_2vi_inv(a1, a2, (short)123, (short)103, OFFSET);
+      test_ci_scl(a1);
+      test_vi_scl(a2, (short)123);
+      test_cp_scl(a1, a2);
+      test_2ci_scl(a1, a2);
+      test_2vi_scl(a1, a2, (short)123, (short)103);
+      test_cp_alndst(a1, a2);
+      test_cp_alnsrc(a1, a2);
+      test_2ci_aln(a1, a2);
+      test_2vi_aln(a1, a2, (short)123, (short)103);
+      test_cp_unalndst(a1, a2);
+      test_cp_unalnsrc(a1, a2);
+      test_2ci_unaln(a1, a2);
+      test_2vi_unaln(a1, a2, (short)123, (short)103);
+    }
+    // Initialize
+    for (int i=0; i<ARRLEN; i++) {
+      a1[i] = -1;
+      a2[i] = -1;
+    }
+    // Test and verify results
+    System.out.println("Verification");
+    int errn = 0;
+    {
+      test_ci(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci: a1", i, a1[i], (short)-123);
+      }
+      test_vi(a2, (short)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi: a2", i, a2[i], (short)123);
+      }
+      test_cp(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp: a1", i, a1[i], (short)123);
+      }
+      test_2ci(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci: a1", i, a1[i], (short)-123);
+        errn += verify("test_2ci: a2", i, a2[i], (short)-103);
+      }
+      test_2vi(a1, a2, (short)123, (short)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi: a1", i, a1[i], (short)123);
+        errn += verify("test_2vi: a2", i, a2[i], (short)103);
+      }
+      // Reset for negative stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_neg(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_neg: a1", i, a1[i], (short)-123);
+      }
+      test_vi_neg(a2, (short)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_neg: a2", i, a2[i], (short)123);
+      }
+      test_cp_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_neg: a1", i, a1[i], (short)123);
+      }
+      test_2ci_neg(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_neg: a1", i, a1[i], (short)-123);
+        errn += verify("test_2ci_neg: a2", i, a2[i], (short)-103);
+      }
+      test_2vi_neg(a1, a2, (short)123, (short)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_neg: a1", i, a1[i], (short)123);
+        errn += verify("test_2vi_neg: a2", i, a2[i], (short)103);
+      }
+      // Reset for opposite stride
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_oppos(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_ci_oppos: a1", i, a1[i], (short)-123);
+      }
+      test_vi_oppos(a2, (short)123);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_vi_oppos: a2", i, a2[i], (short)123);
+      }
+      test_cp_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_cp_oppos: a1", i, a1[i], (short)123);
+      }
+      test_2ci_oppos(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2ci_oppos: a1", i, a1[i], (short)-123);
+        errn += verify("test_2ci_oppos: a2", i, a2[i], (short)-103);
+      }
+      test_2vi_oppos(a1, a2, (short)123, (short)103);
+      for (int i=0; i<ARRLEN; i++) {
+        errn += verify("test_2vi_oppos: a1", i, a1[i], (short)123);
+        errn += verify("test_2vi_oppos: a2", i, a2[i], (short)103);
+      }
+      // Reset for indexing with offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_off(a1);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_off: a1", i, a1[i], (short)-123);
+      }
+      test_vi_off(a2, (short)123);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_off: a2", i, a2[i], (short)123);
+      }
+      test_cp_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_off: a1", i, a1[i], (short)123);
+      }
+      test_2ci_off(a1, a2);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_off: a1", i, a1[i], (short)-123);
+        errn += verify("test_2ci_off: a2", i, a2[i], (short)-103);
+      }
+      test_2vi_off(a1, a2, (short)123, (short)103);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], (short)123);
+        errn += verify("test_2vi_off: a2", i, a2[i], (short)103);
+      }
+      for (int i=0; i<OFFSET; i++) {
+        errn += verify("test_2vi_off: a1", i, a1[i], (short)-1);
+        errn += verify("test_2vi_off: a2", i, a2[i], (short)-1);
+      }
+      // Reset for indexing with invariant offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_inv(a1, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_ci_inv: a1", i, a1[i], (short)-123);
+      }
+      test_vi_inv(a2, (short)123, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_vi_inv: a2", i, a2[i], (short)123);
+      }
+      test_cp_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_cp_inv: a1", i, a1[i], (short)123);
+      }
+      test_2ci_inv(a1, a2, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2ci_inv: a1", i, a1[i], (short)-123);
+        errn += verify("test_2ci_inv: a2", i, a2[i], (short)-103);
+      }
+      test_2vi_inv(a1, a2, (short)123, (short)103, OFFSET);
+      for (int i=OFFSET; i<ARRLEN; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], (short)123);
+        errn += verify("test_2vi_inv: a2", i, a2[i], (short)103);
+      }
+      for (int i=0; i<OFFSET; i++) {
+        errn += verify("test_2vi_inv: a1", i, a1[i], (short)-1);
+        errn += verify("test_2vi_inv: a2", i, a2[i], (short)-1);
+      }
+      // Reset for indexing with scale
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_ci_scl(a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : -123;
+        errn += verify("test_ci_scl: a1", i, a1[i], (short)val);
+      }
+      test_vi_scl(a2, (short)123);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_vi_scl: a2", i, a2[i], (short)val);
+      }
+      test_cp_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        int val = (i%SCALE != 0) ? -1 : 123;
+        errn += verify("test_cp_scl: a1", i, a1[i], (short)val);
+      }
+      test_2ci_scl(a1, a2);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a1", i, a1[i], (short)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a1", i*SCALE, a1[i*SCALE], (short)-123);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2ci_scl: a2", i, a2[i], (short)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2ci_scl: a2", i*SCALE, a2[i*SCALE], (short)-103);
+        }
+      }
+      test_2vi_scl(a1, a2, (short)123, (short)103);
+      for (int i=0; i<ARRLEN; i++) {
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a1", i, a1[i], (short)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a1", i*SCALE, a1[i*SCALE], (short)123);
+        }
+        if (i%SCALE != 0) {
+          errn += verify("test_2vi_scl: a2", i, a2[i], (short)-1);
+        } else if (i*SCALE < ARRLEN) {
+          errn += verify("test_2vi_scl: a2", i*SCALE, a2[i*SCALE], (short)103);
+        }
+      }
+      // Reset for 2 arrays with relative aligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, (short)123);
+      test_cp_alndst(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alndst: a1", i, a1[i], (short)123);
+      }
+      test_vi(a2, (short)-123);
+      test_cp_alnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)-123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_alnsrc: a1", i, a1[i], (short)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_aln(a1, a2);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], (short)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln: a2", i, a2[i], (short)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_aln(a1, a2, (short)123, (short)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln: a2", i, a2[i], (short)103);
+      }
+
+      // Reset for 2 arrays with relative unaligned offset
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_vi(a2, (short)123);
+      test_cp_unalndst(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalndst: a1", i, a1[i], (short)123);
+      }
+      test_vi(a2, (short)-123);
+      test_cp_unalnsrc(a1, a2);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)-123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_cp_unalnsrc: a1", i, a1[i], (short)123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2ci_unaln(a1, a2);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], (short)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln: a2", i, a2[i], (short)-1);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+        a2[i] = -1;
+      }
+      test_2vi_unaln(a1, a2, (short)123, (short)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a1", i, a1[i], (short)-1);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln: a2", i, a2[i], (short)103);
+      }
+
+      // Reset for aligned overlap initialization
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i] = (short)i;
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_alndst(a1, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alndst_overlap: a1", i, a1[i], (short)v);
+      }
+      for (int i=0; i<ALIGN_OFF; i++) {
+        a1[i+ALIGN_OFF] = -1;
+      }
+      test_cp_alnsrc(a1, a1);
+      for (int i=0; i<ALIGN_OFF; i++) {
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)-1);
+      }
+      for (int i=ALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%ALIGN_OFF;
+        errn += verify("test_cp_alnsrc_overlap: a1", i, a1[i], (short)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_aln(a1, a1);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], (short)-103);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_aln_overlap: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_aln(a1, a1, (short)123, (short)103);
+      for (int i=0; i<ARRLEN-ALIGN_OFF; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-ALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_aln_overlap: a1", i, a1[i], (short)103);
+      }
+
+      // Reset for unaligned overlap initialization
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i] = (short)i;
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_cp_unalndst(a1, a1);
+      for (int i=0; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalndst_overlap: a1", i, a1[i], (short)v);
+      }
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        a1[i+UNALIGN_OFF] = -1;
+      }
+      test_cp_unalnsrc(a1, a1);
+      for (int i=0; i<UNALIGN_OFF; i++) {
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)-1);
+      }
+      for (int i=UNALIGN_OFF; i<ARRLEN; i++) {
+        int v = i%UNALIGN_OFF;
+        errn += verify("test_cp_unalnsrc_overlap: a1", i, a1[i], (short)v);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2ci_unaln(a1, a1);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], (short)-103);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2ci_unaln_overlap: a1", i, a1[i], (short)-123);
+      }
+      for (int i=0; i<ARRLEN; i++) {
+        a1[i] = -1;
+      }
+      test_2vi_unaln(a1, a1, (short)123, (short)103);
+      for (int i=0; i<ARRLEN-UNALIGN_OFF; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], (short)123);
+      }
+      for (int i=ARRLEN-UNALIGN_OFF; i<ARRLEN; i++) {
+        errn += verify("test_2vi_unaln_overlap: a1", i, a1[i], (short)103);
+      }
+
+    }
+
+    if (errn > 0)
+      return errn;
+
+    System.out.println("Time");
+    long start, end;
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi(a2, (short)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi(a1, a2, (short)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_neg(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_neg(a2, (short)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_neg(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_neg: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_neg(a1, a2, (short)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_neg: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_oppos(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_oppos(a2, (short)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_oppos(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_oppos: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_oppos(a1, a2, (short)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_oppos: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_off(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_off(a2, (short)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_off(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_off: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_off(a1, a2, (short)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_off: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_inv(a1, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_inv(a2, (short)123, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_inv(a1, a2, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_inv: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_inv(a1, a2, (short)123, (short)103, OFFSET);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_inv: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_ci_scl(a1);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_vi_scl(a2, (short)123);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_vi_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_scl(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_scl: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_scl(a1, a2, (short)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_scl: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_alnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_alnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_aln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_aln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_aln(a1, a2, (short)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_aln: " + (end - start));
+
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalndst(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalndst: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_cp_unalnsrc(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_cp_unalnsrc: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2ci_unaln(a1, a2);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2ci_unaln: " + (end - start));
+    start = System.currentTimeMillis();
+    for (int i=0; i<ITERS; i++) {
+      test_2vi_unaln(a1, a2, (short)123, (short)103);
+    }
+    end = System.currentTimeMillis();
+    System.out.println("test_2vi_unaln: " + (end - start));
+
+    return errn;
+  }
+
+  static void test_ci(short[] a) { // Fill all of a with the constant -123.
+    for (int i = 0; i < a.length; i+=1) { // ascending index, step 1
+      a[i] = -123;
+    }
+  }
+  static void test_vi(short[] a, short b) { // Fill all of a with the caller-supplied value b.
+    for (int i = 0; i < a.length; i+=1) { // ascending index, step 1
+      a[i] = b;
+    }
+  }
+  static void test_cp(short[] a, short[] b) { // Copy b[i] into a[i] for every index of a.
+    for (int i = 0; i < a.length; i+=1) { // bound is a.length only; b is read over the same index range
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci(short[] a, short[] b) { // Fill a with -123 and b with -103 in one loop.
+    for (int i = 0; i < a.length; i+=1) { // bound is a.length only; b is written over the same index range
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi(short[] a, short[] b, short c, short d) { // Fill a with c and b with d in one loop.
+    for (int i = 0; i < a.length; i+=1) { // bound is a.length only; b is written over the same index range
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_neg(short[] a) { // Fill all of a with -123, walking from the last element to the first.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index, step -1
+      a[i] = -123;
+    }
+  }
+  static void test_vi_neg(short[] a, short b) { // Fill all of a with b, walking from the last element to the first.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index, step -1
+      a[i] = b;
+    }
+  }
+  static void test_cp_neg(short[] a, short[] b) { // Copy b[i] into a[i], walking from the last index of a to 0.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index; start is derived from a.length only
+      a[i] = b[i];
+    }
+  }
+  static void test_2ci_neg(short[] a, short[] b) { // Fill a with -123 and b with -103, from the last index of a to 0.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index; start is derived from a.length only
+      a[i] = -123;
+      b[i] = -103;
+    }
+  }
+  static void test_2vi_neg(short[] a, short[] b, short c, short d) { // Fill a with c and b with d, from the last index of a to 0.
+    for (int i = a.length-1; i >= 0; i-=1) { // descending index; start is derived from a.length only
+      a[i] = c;
+      b[i] = d;
+    }
+  }
+  static void test_ci_oppos(short[] a) { // Fill all of a with -123; the store index runs opposite to the loop counter.
+    int limit = a.length-1; // last valid index of a
+    for (int i = 0; i < a.length; i+=1) { // counter ascends ...
+      a[limit-i] = -123; // ... while the written index descends from limit to 0
+    }
+  }
+  static void test_vi_oppos(short[] a, short b) { // Fill all of a with b; the store index runs opposite to the loop counter.
+    int limit = a.length-1; // last valid index of a
+    for (int i = limit; i >= 0; i-=1) { // counter descends ...
+      a[limit-i] = b; // ... while the written index ascends from 0 to limit
+    }
+  }
+  static void test_cp_oppos(short[] a, short[] b) { // Copy into a while reading b in the opposite direction.
+    int limit = a.length-1; // last valid index of a (also used to index b)
+    for (int i = 0; i < a.length; i+=1) { // a is written in ascending order
+      a[i] = b[limit-i]; // b is read in descending order: a[i] receives b[a.length-1-i]
+    }
+  }
+  static void test_2ci_oppos(short[] a, short[] b) { // Fill a with -123 and b with -103, traversing the two arrays in opposite directions.
+    int limit = a.length-1; // last valid index of a
+    for (int i = 0; i < a.length; i+=1) { // counter ascends
+      a[limit-i] = -123; // a is written from its last index down to 0
+      b[i] = -103; // b is written from 0 upwards
+    }
+  }
+  static void test_2vi_oppos(short[] a, short[] b, short c, short d) { // Fill a with c and b with d, traversing the two arrays in opposite directions.
+    int limit = a.length-1; // last valid index of a
+    for (int i = limit; i >= 0; i-=1) { // counter descends
+      a[i] = c; // a is written from its last index down to 0
+      b[limit-i] = d; // b is written from 0 upwards
+    }
+  }
+  static void test_ci_off(short[] a) { // Store -123 into a at indices shifted by the class constant OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // trip count shortened by OFFSET so i+OFFSET stays below a.length
+      a[i+OFFSET] = -123; // written indices are OFFSET .. a.length-1
+    }
+  }
+  static void test_vi_off(short[] a, short b) { // Store b into a at indices shifted by the class constant OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // trip count shortened by OFFSET so i+OFFSET stays below a.length
+      a[i+OFFSET] = b; // written indices are OFFSET .. a.length-1
+    }
+  }
+  static void test_cp_off(short[] a, short[] b) { // Copy b into a with both sides indexed at i+OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // trip count shortened by OFFSET so i+OFFSET stays below a.length
+      a[i+OFFSET] = b[i+OFFSET]; // same shifted index on source and destination
+    }
+  }
+  static void test_2ci_off(short[] a, short[] b) { // Store -123 into a and -103 into b, both at indices shifted by OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // trip count shortened by OFFSET; bound uses a.length only
+      a[i+OFFSET] = -123;
+      b[i+OFFSET] = -103;
+    }
+  }
+  static void test_2vi_off(short[] a, short[] b, short c, short d) { // Store c into a and d into b, both at indices shifted by OFFSET.
+    for (int i = 0; i < a.length-OFFSET; i+=1) { // trip count shortened by OFFSET; bound uses a.length only
+      a[i+OFFSET] = c;
+      b[i+OFFSET] = d;
+    }
+  }
+  static void test_ci_inv(short[] a, int k) { // Store -123 into a at indices shifted by the runtime argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop
+      a[i+k] = -123; // written indices are k .. a.length-1
+    }
+  }
+  static void test_vi_inv(short[] a, short b, int k) { // Store b into a at indices shifted by the runtime argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop
+      a[i+k] = b; // written indices are k .. a.length-1
+    }
+  }
+  static void test_cp_inv(short[] a, short[] b, int k) { // Copy b into a with both sides indexed at i+k, k being a runtime argument.
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop
+      a[i+k] = b[i+k]; // same shifted index on source and destination
+    }
+  }
+  static void test_2ci_inv(short[] a, short[] b, int k) { // Store -123 into a and -103 into b, both at indices shifted by the runtime argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop; bound uses a.length only
+      a[i+k] = -123;
+      b[i+k] = -103;
+    }
+  }
+  static void test_2vi_inv(short[] a, short[] b, short c, short d, int k) { // Store c into a and d into b, both at indices shifted by the runtime argument k.
+    for (int i = 0; i < a.length-k; i+=1) { // k is not modified inside the loop; bound uses a.length only
+      a[i+k] = c;
+      b[i+k] = d;
+    }
+  }
+  static void test_ci_scl(short[] a) { // Store -123 at indices that are multiples of the class constant SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // loop ends once the scaled index reaches a.length
+      a[i*SCALE] = -123; // indices 0, SCALE, 2*SCALE, ...; other elements are not written
+    }
+  }
+  static void test_vi_scl(short[] a, short b) { // Store b at indices that are multiples of the class constant SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // loop ends once the scaled index reaches a.length
+      a[i*SCALE] = b; // indices 0, SCALE, 2*SCALE, ...; other elements are not written
+    }
+  }
+  static void test_cp_scl(short[] a, short[] b) { // Copy b into a only at indices that are multiples of SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // loop ends once the scaled index reaches a.length
+      a[i*SCALE] = b[i*SCALE]; // same scaled index on source and destination
+    }
+  }
+  static void test_2ci_scl(short[] a, short[] b) { // Store -123 into a and -103 into b at indices that are multiples of SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // loop ends once the scaled index reaches a.length; bound uses a.length only
+      a[i*SCALE] = -123;
+      b[i*SCALE] = -103;
+    }
+  }
+  static void test_2vi_scl(short[] a, short[] b, short c, short d) { // Store c into a and d into b at indices that are multiples of SCALE.
+    for (int i = 0; i*SCALE < a.length; i+=1) { // loop ends once the scaled index reaches a.length; bound uses a.length only
+      a[i*SCALE] = c;
+      b[i*SCALE] = d;
+    }
+  }
+  static void test_cp_alndst(short[] a, short[] b) { // Copy b[i] into a[i+ALIGN_OFF]; only the destination index is shifted.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened by ALIGN_OFF so the destination index stays below a.length
+      a[i+ALIGN_OFF] = b[i]; // the test driver also calls this with a == b, making the copy overlap itself
+    }
+  }
+  static void test_cp_alnsrc(short[] a, short[] b) { // Copy b[i+ALIGN_OFF] into a[i]; only the source index is shifted.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened by ALIGN_OFF; bound uses a.length only
+      a[i] = b[i+ALIGN_OFF]; // the test driver also calls this with a == b, making the copy overlap itself
+    }
+  }
+  static void test_2ci_aln(short[] a, short[] b) { // Store -123 into a at i+ALIGN_OFF and -103 into b at i.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened by ALIGN_OFF; bound uses a.length only
+      a[i+ALIGN_OFF] = -123; // shifted store
+      b[i] = -103; // unshifted store; when the driver passes a == b, store order decides the overlapping elements
+    }
+  }
+  static void test_2vi_aln(short[] a, short[] b, short c, short d) { // Store c into a at i and d into b at i+ALIGN_OFF.
+    for (int i = 0; i < a.length-ALIGN_OFF; i+=1) { // trip count shortened by ALIGN_OFF; bound uses a.length only
+      a[i] = c; // unshifted store
+      b[i+ALIGN_OFF] = d; // shifted store; when the driver passes a == b, store order decides the overlapping elements
+    }
+  }
+  static void test_cp_unalndst(short[] a, short[] b) { // Copy b[i] into a[i+UNALIGN_OFF]; only the destination index is shifted.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened by UNALIGN_OFF so the destination index stays below a.length
+      a[i+UNALIGN_OFF] = b[i]; // the test driver also calls this with a == b, making the copy overlap itself
+    }
+  }
+  static void test_cp_unalnsrc(short[] a, short[] b) { // Copy b[i+UNALIGN_OFF] into a[i]; only the source index is shifted.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened by UNALIGN_OFF; bound uses a.length only
+      a[i] = b[i+UNALIGN_OFF]; // the test driver also calls this with a == b, making the copy overlap itself
+    }
+  }
+  static void test_2ci_unaln(short[] a, short[] b) { // Store -123 into a at i+UNALIGN_OFF and -103 into b at i.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened by UNALIGN_OFF; bound uses a.length only
+      a[i+UNALIGN_OFF] = -123; // shifted store
+      b[i] = -103; // unshifted store; when the driver passes a == b, store order decides the overlapping elements
+    }
+  }
+  static void test_2vi_unaln(short[] a, short[] b, short c, short d) { // Store c into a at i and d into b at i+UNALIGN_OFF.
+    for (int i = 0; i < a.length-UNALIGN_OFF; i+=1) { // trip count shortened by UNALIGN_OFF; bound uses a.length only
+      a[i] = c; // unshifted store
+      b[i+UNALIGN_OFF] = d; // shifted store; when the driver passes a == b, store order decides the overlapping elements
+    }
+  }
+
+  static int verify(String text, int i, short elem, short val) {
+    if (elem != val) {
+      System.err.println(text + "[" + i + "] = " + elem + " != " + val);
+      return 1;
+    }
+    return 0;
+  }
+}