src/share/vm/opto/superword.cpp
*** old/src/share/vm/opto/superword.cpp	Sat Jun  2 20:04:20 2012
--- new/src/share/vm/opto/superword.cpp	Sat Jun  2 20:04:20 2012

*** 1,7 ****
--- 1,7 ----
  /*
! * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
! * Copyright (c) 2007, 2012, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.
*** 65,74 ****
--- 65,78 ----
    _iv(NULL)                               // induction var
  {}

  //------------------------------transform_loop---------------------------
  void SuperWord::transform_loop(IdealLoopTree* lpt) {
+   assert(UseSuperWord, "should be");
+   // Do vectors exist on this architecture?
+   if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
+
    assert(lpt->_head->is_CountedLoop(), "must be");
    CountedLoopNode *cl = lpt->_head->as_CountedLoop();

    if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop
*** 87,99 ****
--- 91,100 ----
    CountedLoopEndNode* pre_end = get_pre_loop_end(cl);
    if (pre_end == NULL) return;
    Node *pre_opaq1 = pre_end->limit();
    if (pre_opaq1->Opcode() != Op_Opaque1) return;

    // Do vectors exist on this architecture?
    if (vector_width_in_bytes() == 0) return;

    init(); // initialize data structures
    set_lpt(lpt);
    set_lp(cl);
*** 175,241 ****
--- 176,321 ----
  void SuperWord::find_adjacent_refs() {
    // Get list of memory operations
    Node_List memops;
    for (int i = 0; i < _block.length(); i++) {
      Node* n = _block.at(i);
!     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) && is_java_primitive(n->as_Mem()->memory_type())) {
        int align = memory_alignment(n->as_Mem(), 0);
        if (align != bottom_align) {
          memops.push(n);
        }
      }
    }
    if (memops.size() == 0) return;

    // Find a memory reference to align to. The pre-loop trip count
    // is modified to align this reference to a vector-aligned address
!   find_align_to_ref(memops);
!   if (align_to_ref() == NULL) return;
+   Node_List align_to_refs;
+   const Type* best_vt = NULL;
!   int best_iv_adjustment = 0;
!   MemNode* best_align_to_mem_ref = NULL;

    SWPointer align_to_ref_p(align_to_ref(), this);
    int offset = align_to_ref_p.offset_in_bytes();
!   int scale  = align_to_ref_p.scale_in_bytes();
!   int vw = vector_width_in_bytes();
!   int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
!   int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
+   while (memops.size() != 0) {
+     // Find a memory reference to align to.
!     MemNode* mem_ref = find_align_to_ref(memops);
!     if (mem_ref == NULL) break;
!     align_to_refs.push(mem_ref);
!     const Type* vt = velt_type(mem_ref);
+     int iv_adjustment = get_iv_adjustment(mem_ref);

  #ifndef PRODUCT
    if (TraceSuperWord)
      tty->print_cr("\noffset = %d iv_adjustment = %d elt_align = %d scale = %d iv_stride = %d",
                    offset, iv_adjustment, align_to_ref_p.memory_size(),
                    align_to_ref_p.scale_in_bytes(), iv_stride());
  #endif
+     if (best_align_to_mem_ref == NULL) {
+       // Set memory reference which is the best from all memory operations
+       // to be used for alignment. The pre-loop trip count is modified to align
+       // this reference to a vector-aligned address.
+       best_vt = vt;
+       best_align_to_mem_ref = mem_ref;
+       best_iv_adjustment = iv_adjustment;
+     }

    // Set alignment relative to "align_to_ref"
+     SWPointer align_to_ref_p(mem_ref, this);
+     // Set alignment relative to "align_to_ref" for all related memory operations.
      for (int i = memops.size() - 1; i >= 0; i--) {
        MemNode* s = memops.at(i)->as_Mem();
+       if (isomorphic(s, mem_ref)) {
          SWPointer p2(s, this);
          if (p2.comparable(align_to_ref_p)) {
            int align = memory_alignment(s, iv_adjustment);
            set_alignment(s, align);
          } else {
            memops.remove(i);
          }
        }
+     }

!   // Create initial pack pairs of memory operations for which
+     // alignment is set and vectors will be aligned.
+     bool create_pack = true;
+     if (memory_alignment(mem_ref, best_iv_adjustment) != 0) {
+       if (vt == best_vt) {
+         // Can't allow vectorization of unaligned memory accesses with the
+         // same type since it could be overlapped accesses to the same array.
+         create_pack = false;
+       } else {
+         // Allow independent (different type) unaligned memory operations
+         // if HW supports them.
+         if (!Matcher::misaligned_vectors_ok()) {
+           create_pack = false;
+         } else {
+           // Check if packs of the same memory type but
+           // with a different alignment were created before.
+           for (uint i = 0; i < align_to_refs.size(); i++) {
+             MemNode* mr = align_to_refs.at(i)->as_Mem();
+             if (velt_type(mr) == vt && memory_alignment(mr, iv_adjustment) != 0)
+               create_pack = false;
+           }
+         }
+       }
+     }
+     if (create_pack) {
      for (uint i = 0; i < memops.size(); i++) {
        Node* s1 = memops.at(i);
+       int align = alignment(s1);
+       if (align == top_align) continue;
        for (uint j = 0; j < memops.size(); j++) {
          Node* s2 = memops.at(j);
+         if (alignment(s2) == top_align) continue;
          if (s1 != s2 && are_adjacent_refs(s1, s2)) {
            int align = alignment(s1);
            if (stmts_can_pack(s1, s2, align)) {
              Node_List* pair = new Node_List();
              pair->push(s1);
              pair->push(s2);
              _packset.append(pair);
            }
          }
        }
      }
+     } else { // Don't create unaligned pack
+       // First, remove remaining memory ops of the same type from the list.
+       for (int i = memops.size() - 1; i >= 0; i--) {
+         MemNode* s = memops.at(i)->as_Mem();
+         if (velt_type(s) == vt) {
+           memops.remove(i);
+         }
+       }
+       // Second, removed already constructed packs of the same type.
+       for (int i = _packset.length() - 1; i >= 0; i--) {
+         Node_List* p = _packset.at(i);
+         MemNode* s = p->at(0)->as_Mem();
+         if (velt_type(s) == vt) {
+           remove_pack_at(i);
+         }
+       }
+
+       // If needed find the best memory reference for loop alignment again.
+       if (best_vt == vt) {
+         // Put memory ops from remaining packs back on memops list for
+         // the best alignment search.
+         uint orig_msize = memops.size();
+         for (int i = 0; i < _packset.length(); i++) {
+           Node_List* p = _packset.at(i);
+           MemNode* s = p->at(0)->as_Mem();
+           assert(velt_type(s) != vt, "sanity");
+           memops.push(s);
+         }
+         MemNode* best_align_to_mem_ref = find_align_to_ref(memops);
+         if (best_align_to_mem_ref == NULL) break;
+         best_vt = velt_type(best_align_to_mem_ref);
+         best_iv_adjustment = get_iv_adjustment(best_align_to_mem_ref);
+         // Restore list.
+         while (memops.size() > orig_msize)
+           (void)memops.pop();
+       }
+     } // unaligned memory accesses
+
+     // Remove used mem nodes
+     for (int i = memops.size() - 1; i >= 0; i--) {
+       MemNode* m = memops.at(i)->as_Mem();
+       if (alignment(m) != top_align) {
+         memops.remove(i);
+       }
+     }
+
+   } // while (memops.size() != 0
+   set_align_to_ref(best_align_to_mem_ref);
+
  #ifndef PRODUCT
    if (TraceSuperWord) {
      tty->print_cr("\nAfter find_adjacent_refs");
      print_packset();
    }
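Note on the create_pack check in the hunk above: an unaligned pack is rejected when the candidate reference has the same vector element type as the best alignment reference, because two references of the same type may address the same array and overlap. A minimal, hypothetical C++ loop (not taken from this webrev) of the shape that motivates the check:

    // Hypothetical example only: the store b[i] and the load b[i + 1] address
    // the same int array with offsets one element apart, so their vector
    // accesses cannot both be aligned; packing them unaligned could overlap.
    #include <cstddef>
    void shift_left(int* b, size_t n) {
      for (size_t i = 0; i + 1 < n; i++) {
        b[i] = b[i + 1];
      }
    }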
*** 244,254 ****
--- 324,334 ----
  //------------------------------find_align_to_ref---------------------------
  // Find a memory reference to align the loop induction variable to.
  // Looks first at stores then at loads, looking for a memory reference
  // with the largest number of references similar to it.
! void SuperWord::find_align_to_ref(Node_List &memops) {
! MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
    GrowableArray<int> cmp_ct(arena(), memops.size(), memops.size(), 0);

    // Count number of comparable memory ops
    for (uint i = 0; i < memops.size(); i++) {
      MemNode* s1 = memops.at(i)->as_Mem();
*** 268,291 ****
--- 348,379 ----
        }
      }
    }

!   // Find Store (or Load) with the greatest number of "comparable" references,
+   // biggest vector size, smallest data size and smallest iv offset.
    int max_ct        = 0;
+   int max_vw        = 0;
    int max_idx       = -1;
    int min_size      = max_jint;
    int min_iv_offset = max_jint;
    for (uint j = 0; j < memops.size(); j++) {
      MemNode* s = memops.at(j)->as_Mem();
      if (s->is_Store()) {
+       int vw = vector_width_in_bytes(velt_basic_type(s));
+       assert(vw > 1, "sanity");
        SWPointer p(s, this);
        if (cmp_ct.at(j) > max_ct ||
-           cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
+           (vw > max_vw ||
+            vw == max_vw &&
+            (data_size(s) < min_size ||
              data_size(s) == min_size &&
!               p.offset_in_bytes() < min_iv_offset)) {
!               (p.offset_in_bytes() < min_iv_offset)))) {
          max_ct = cmp_ct.at(j);
+         max_vw = vw;
          max_idx = j;
          min_size = data_size(s);
          min_iv_offset = p.offset_in_bytes();
        }
      }
*** 293,328 ****
--- 381,430 ----
    // If no stores, look at loads
    if (max_ct == 0) {
      for (uint j = 0; j < memops.size(); j++) {
        MemNode* s = memops.at(j)->as_Mem();
        if (s->is_Load()) {
+         int vw = vector_width_in_bytes(velt_basic_type(s));
+         assert(vw > 1, "sanity");
          SWPointer p(s, this);
          if (cmp_ct.at(j) > max_ct ||
-             cmp_ct.at(j) == max_ct && (data_size(s) < min_size ||
+             (vw > max_vw ||
+              vw == max_vw &&
+              (data_size(s) < min_size ||
                data_size(s) == min_size &&
!                 p.offset_in_bytes() < min_iv_offset)) {
!                 (p.offset_in_bytes() < min_iv_offset)))) {
            max_ct = cmp_ct.at(j);
+           max_vw = vw;
            max_idx = j;
            min_size = data_size(s);
            min_iv_offset = p.offset_in_bytes();
          }
        }
      }
    }

    if (max_ct > 0)
      set_align_to_ref(memops.at(max_idx)->as_Mem());

  #ifndef PRODUCT
+ #ifdef ASSERT
    if (TraceSuperWord && Verbose) {
      tty->print_cr("\nVector memops after find_align_to_refs");
      for (uint i = 0; i < memops.size(); i++) {
        MemNode* s = memops.at(i)->as_Mem();
        s->dump();
      }
    }
  #endif
+
+   if (max_ct > 0) {
+ #ifdef ASSERT
+     if (TraceSuperWord) {
+       tty->print("\nVector align to node: ");
+       memops.at(max_idx)->as_Mem()->dump();
+     }
+ #endif
+     return memops.at(max_idx)->as_Mem();
+   }
+   return NULL;
  }

  //------------------------------ref_is_alignable---------------------------
  // Can the preloop align the reference to position zero in the vector?
  bool SuperWord::ref_is_alignable(SWPointer& p) {
*** 339,349 ****
--- 441,453 ----
    if (ABS(span) == p.memory_size())
      return true;

    // If initial offset from start of object is computable,
    // compute alignment within the vector.
!   int vw = vector_width_in_bytes();
!   BasicType bt = velt_basic_type(p.mem());
+   int vw = vector_width_in_bytes(bt);
+   assert(vw > 1, "sanity");
    if (vw % span == 0) {
      Node* init_nd = pre_end->init_trip();
      if (init_nd->is_Con() && p.invar() == NULL) {
        int init = init_nd->bottom_type()->is_int()->get_con();
*** 359,368 ****
--- 463,492 ----
      }
    }
    return false;
  }

+ //---------------------------get_iv_adjustment---------------------------
+ // Calculate loop's iv adjustment for this memory ops.
+ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
+   SWPointer align_to_ref_p(mem_ref, this);
+   int offset = align_to_ref_p.offset_in_bytes();
+   int scale  = align_to_ref_p.scale_in_bytes();
+   BasicType bt = velt_basic_type(mem_ref);
+   int vw = vector_width_in_bytes(bt);
+   assert(vw > 1, "sanity");
+   int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
+   int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
+
+ #ifndef PRODUCT
+   if (TraceSuperWord)
+     tty->print_cr("\noffset = %d iv_adjust = %d elt_size = %d scale = %d iv_stride = %d vect_size %d",
+                   offset, iv_adjustment, align_to_ref_p.memory_size(), scale, iv_stride(), vw);
+ #endif
+   return iv_adjustment;
+ }
+
  //---------------------------dependence_graph---------------------------
  // Construct dependency graph.
  // Add dependence edges to load/store nodes for memory dependence
  //    A.out()->DependNode.in(1) and DependNode.out()->B.prec(x)
  void SuperWord::dependence_graph() {
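A small standalone sketch of the iv_adjustment arithmetic introduced in get_iv_adjustment above, using assumed values (16-byte vectors of ints, positive stride, a reference starting 4 bytes past a vector boundary) rather than anything produced by a real compilation:

    #include <cstdio>

    int main() {
      // Assumed values: vector_width_in_bytes(T_INT) == 16, scale == 4 (int array),
      // iv_stride == 1, and the reference starts 4 bytes past a 16-byte boundary.
      int vw = 16;
      int offset = 4, scale = 4, iv_stride = 1;
      int stride_sign   = (scale * iv_stride) > 0 ? 1 : -1;
      int iv_adjustment = (stride_sign * vw - (offset % vw)) % vw;
      // (1 * 16 - 4) % 16 == 12 bytes, i.e. 3 int elements of pre-loop adjustment.
      printf("iv_adjustment = %d bytes\n", iv_adjustment);
      return 0;
    }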
*** 486,498 ****
--- 610,626 ----
  // Can s1 and s2 be in a pack with s1 immediately preceding s2 and
  // s1 aligned at "align"
  bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {

    // Do not use superword for non-primitives
    if((s1->is_Mem() && !is_java_primitive(s1->as_Mem()->memory_type())) ||
       (s2->is_Mem() && !is_java_primitive(s2->as_Mem()->memory_type())))
+   BasicType bt1 = velt_basic_type(s1);
+   BasicType bt2 = velt_basic_type(s2);
+   if(!is_java_primitive(bt1) || !is_java_primitive(bt2))
      return false;
+   if (Matcher::max_vector_size(bt1) < 2) {
+     return false; // No vectors for this type
+   }

    if (isomorphic(s1, s2)) {
      if (independent(s1, s2)) {
        if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
          if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
*** 593,610 ****
--- 721,740 ----
  }

  //------------------------------set_alignment---------------------------
  void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
    set_alignment(s1, align);
+   if (align == top_align || align == bottom_align) {
+     set_alignment(s2, align);
+   } else {
    set_alignment(s2, align + data_size(s1));
+   }
  }

  //------------------------------data_size---------------------------
  int SuperWord::data_size(Node* s) {
!   const Type* t = velt_type(s);
    BasicType bt = t->array_element_basic_type();
    int bsize = type2aelembytes(bt);
!   int bsize = type2aelembytes(velt_basic_type(s));
    assert(bsize != 0, "valid size");
    return bsize;
  }

  //------------------------------extend_packlist---------------------------
*** 629,641 ****
--- 759,771 ----
  }

  //------------------------------follow_use_defs---------------------------
  // Extend the packset by visiting operand definitions of nodes in pack p
  bool SuperWord::follow_use_defs(Node_List* p) {
+   assert(p->size() == 2, "just checking");
    Node* s1 = p->at(0);
    Node* s2 = p->at(1);
    assert(p->size() == 2, "just checking");
    assert(s1->req() == s2->req(), "just checking");
    assert(alignment(s1) + data_size(s1) == alignment(s2), "just checking");

    if (s1->is_Load()) return false;
*** 716,753 ****
--- 846,889 ----
    uint i2 = 0;
    do {
      for (i1++; i1 < ct; i1++) if (u1->in(i1) == d1) break;
      for (i2++; i2 < ct; i2++) if (u2->in(i2) == d2) break;
      if (i1 != i2) {
+       if ((i1 == (3-i2)) && (u2->is_Add() || u2->is_Mul())) {
+         // Further analysis relies on operands position matching.
+         u2->swap_edges(i1, i2);
+       } else {
        return false;
      }
+     }
    } while (i1 < ct);
    return true;
  }

  //------------------------------est_savings---------------------------
  // Estimate the savings from executing s1 and s2 as a pack
  int SuperWord::est_savings(Node* s1, Node* s2) {
!   int save_in = 2 - 1; // 2 operations per instruction in packed form

    // inputs
    for (uint i = 1; i < s1->req(); i++) {
      Node* x1 = s1->in(i);
      Node* x2 = s2->in(i);
      if (x1 != x2) {
        if (are_adjacent_refs(x1, x2)) {
!         save_in += adjacent_profit(x1, x2);
        } else if (!in_packset(x1, x2)) {
!         save_in -= pack_cost(2);
        } else {
!         save_in += unpack_cost(2);
        }
      }
    }

    // uses of result
    uint ct = 0;
+   int save_use = 0;
    for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
      Node* s1_use = s1->fast_out(i);
      for (int j = 0; j < _packset.length(); j++) {
        Node_List* p = _packset.at(j);
        if (p->at(0) == s1_use) {
*** 754,775 ****
--- 890,911 ----
          for (DUIterator_Fast kmax, k = s2->fast_outs(kmax); k < kmax; k++) {
            Node* s2_use = s2->fast_out(k);
            if (p->at(p->size()-1) == s2_use) {
              ct++;
              if (are_adjacent_refs(s1_use, s2_use)) {
!               save_use += adjacent_profit(s1_use, s2_use);
              }
            }
          }
        }
      }
    }

!   if (ct < s1->outcnt()) save_use += unpack_cost(1);
!   if (ct < s2->outcnt()) save_use += unpack_cost(1);

!   return MAX2(save_in, save_use);
  }

  //------------------------------costs---------------------------
  int SuperWord::adjacent_profit(Node* s1, Node* s2) { return 2; }
  int SuperWord::pack_cost(int ct)   { return ct; }
*** 776,805 ****
--- 912,974 ----
  int SuperWord::unpack_cost(int ct) { return ct; }

  //------------------------------combine_packs---------------------------
  // Combine packs A and B with A.last == B.first into A.first..,A.last,B.second,..B.last
  void SuperWord::combine_packs() {
!   bool changed = true;
    do {
+   // Combine packs regardless max vector size.
+   while (changed) {
      changed = false;
      for (int i = 0; i < _packset.length(); i++) {
        Node_List* p1 = _packset.at(i);
        if (p1 == NULL) continue;
        for (int j = 0; j < _packset.length(); j++) {
          Node_List* p2 = _packset.at(j);
          if (p2 == NULL) continue;
+         if (i == j) continue;
          if (p1->at(p1->size()-1) == p2->at(0)) {
            for (uint k = 1; k < p2->size(); k++) {
              p1->push(p2->at(k));
            }
            _packset.at_put(j, NULL);
            changed = true;
          }
        }
      }
!   } while (changed);
+   }
+
+   // Split packs which have size greater then max vector size.
+   for (int i = 0; i < _packset.length(); i++) {
+     Node_List* p1 = _packset.at(i);
+     if (p1 != NULL) {
+       BasicType bt = velt_basic_type(p1->at(0));
+       uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector
+       assert(is_power_of_2(max_vlen), "sanity");
+       uint psize = p1->size();
+       if (!is_power_of_2(psize)) {
+         // Skip pack which can't be vector.
+         // case1: for(...) { a[i] = i; } elements values are different (i+x)
+         // case2: for(...) { a[i] = b[i+1]; } can't align both, load and store
+         _packset.at_put(i, NULL);
+         continue;
+       }
+       if (psize > max_vlen) {
+         Node_List* pack = new Node_List();
+         for (uint j = 0; j < psize; j++) {
+           pack->push(p1->at(j));
+           if (pack->size() >= max_vlen) {
+             assert(is_power_of_2(pack->size()), "sanity");
+             _packset.append(pack);
+             pack = new Node_List();
+           }
+         }
+         _packset.at_put(i, NULL);
+       }
+     }
+   }
+
+   // Compress list.
    for (int i = _packset.length() - 1; i >= 0; i--) {
      Node_List* p1 = _packset.at(i);
      if (p1 == NULL) {
        _packset.remove_at(i);
      }
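The pack-splitting logic added to combine_packs above can be illustrated with a small sketch in plain C++ (std::vector stands in for Node_List; max_vlen is an assumed value, not queried from the Matcher): a combined pack of 8 statements with a maximum vector length of 4 elements is split into two 4-wide packs, while a pack whose size is not a power of two would be dropped.

    #include <cstdio>
    #include <vector>

    static bool is_power_of_2(size_t x) { return x != 0 && (x & (x - 1)) == 0; }

    int main() {
      std::vector<int> pack = {0, 1, 2, 3, 4, 5, 6, 7}; // 8 packed statements
      size_t max_vlen = 4;                              // assumed max elements per vector
      std::vector<std::vector<int>> packset;
      if (is_power_of_2(pack.size())) {                 // non-power-of-2 packs are dropped
        for (size_t j = 0; j < pack.size(); j += max_vlen) {
          packset.push_back(std::vector<int>(pack.begin() + j,
                                             pack.begin() + j + max_vlen));
        }
      }
      printf("split into %zu packs\n", packset.size()); // prints: split into 2 packs
      return 0;
    }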
*** 878,889 ****
--- 1047,1057 ----
  //------------------------------implemented---------------------------
  // Can code be generated for pack p?
  bool SuperWord::implemented(Node_List* p) {
    Node* p0 = p->at(0);
!   int vopc = VectorNode::opcode(p0->Opcode(), p->size(), velt_type(p0));
    return vopc > 0 && Matcher::has_match_rule(vopc);
!   return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
  }

  //------------------------------profitable---------------------------
  // For pack p, are all operands and all uses (with in the block) vector?
  bool SuperWord::profitable(Node_List* p) {
*** 937,977 ****
--- 1105,1150 ----
        co_locate_pack(_packset.at(i));
      }
    }
  }

  //-------------------------------remove_and_insert-------------------
! //remove "current" from its current position in the memory graph and insert
  //it after the appropriate insertion point (lip or uip)
! // Remove "current" from its current position in the memory graph and insert
+ // it after the appropriate insertion point (lip or uip).
  void SuperWord::remove_and_insert(MemNode *current, MemNode *prev, MemNode *lip,
                                    Node *uip, Unique_Node_List &sched_before) {
    Node* my_mem = current->in(MemNode::Memory);
!   _igvn.hash_delete(current);
    _igvn.hash_delete(my_mem);
!   bool sched_up = sched_before.member(current);
!   // remove current_store from its current position in the memmory graph
    for (DUIterator i = current->outs(); current->has_out(i); i++) {
      Node* use = current->out(i);
      if (use->is_Mem()) {
        assert(use->in(MemNode::Memory) == current, "must be");
        _igvn.hash_delete(use);
        if (use == prev) { // connect prev to my_mem
+         _igvn.hash_delete(use);
          use->set_req(MemNode::Memory, my_mem);
+         _igvn._worklist.push(use);
+         --i; //deleted this edge; rescan position
        } else if (sched_before.member(use)) {
          _igvn.hash_delete(uip);
+         if (!sched_up) { // Will be moved together with current
+           _igvn.hash_delete(use);
            use->set_req(MemNode::Memory, uip);
+           _igvn._worklist.push(use);
+           --i; //deleted this edge; rescan position
+         }
        } else {
          _igvn.hash_delete(lip);
+         if (sched_up) { // Will be moved together with current
+           _igvn.hash_delete(use);
            use->set_req(MemNode::Memory, lip);
          }
          _igvn._worklist.push(use);
          --i; //deleted this edge; rescan position
        }
      }
+   }
+ }

    bool sched_up = sched_before.member(current);
    Node *insert_pt = sched_up ? uip : lip;
    _igvn.hash_delete(insert_pt);

    // all uses of insert_pt's memory state should use current's instead
    for (DUIterator i = insert_pt->outs(); insert_pt->has_out(i); i++) {
      Node* use = insert_pt->out(i);
      if (use->is_Mem()) {
*** 980,1000 ****
--- 1153,1174 ----
        use->set_req(MemNode::Memory, current);
        _igvn._worklist.push(use);
        --i; //deleted this edge; rescan position
      } else if (!sched_up && use->is_Phi() && use->bottom_type() == Type::MEMORY) {
        uint pos; //lip (lower insert point) must be the last one in the memory slice
        _igvn.hash_delete(use);
        for (pos=1; pos < use->req(); pos++) {
          if (use->in(pos) == insert_pt) break;
        }
+       _igvn.hash_delete(use);
        use->set_req(pos, current);
        _igvn._worklist.push(use);
        --i;
      }
    }

    //connect current to insert_pt
+   _igvn.hash_delete(current);
    current->set_req(MemNode::Memory, insert_pt);
    _igvn._worklist.push(current);
  }

  //------------------------------co_locate_pack-----------------------------------
*** 1029,1039 ****
--- 1203,1213 ----
        for (DUIterator i = current->outs(); current->has_out(i); i++) {
          Node* use = current->out(i);
          if (use->is_Mem() && use != previous)
            memops.push(use);
        }
!       if (current == first) break;
        previous = current;
        current  = current->in(MemNode::Memory)->as_Mem();
      }

      // determine which memory operations should be scheduled before the pack
*** 1042,1090 ****
--- 1216,1271 ----
        if (!in_pack(s1, pk) && !schedule_before_pack.member(s1)) {
          for (uint j = 0; j< i; j++) {
            Node *s2 = memops.at(j);
            if (!independent(s1, s2)) {
              if (in_pack(s2, pk) || schedule_before_pack.member(s2)) {
!               schedule_before_pack.push(s1); // s1 must be scheduled before
                Node_List* mem_pk = my_pack(s1);
                if (mem_pk != NULL) {
                  for (uint ii = 0; ii < mem_pk->size(); ii++) {
                    Node* s = mem_pk->at(ii); // follow partner
                    if (memops.member(s) && !schedule_before_pack.member(s))
                      schedule_before_pack.push(s);
                  }
                }
+               break;
              }
            }
          }
        }
      }

    MemNode* lower_insert_pt = last;
    Node*    upper_insert_pt = first->in(MemNode::Memory);
+     // Following code moves loads connected to upper_insert_pt below aliased stores.
+     // Collect such loads here and reconnect them back to upper_insert_pt later.
+     memops.clear();
+     for (DUIterator i = upper_insert_pt->outs(); upper_insert_pt->has_out(i); i++) {
+       Node* use = upper_insert_pt->out(i);
+       if (!use->is_Store())
+         memops.push(use);
+     }
+
+     MemNode* lower_insert_pt = last;
      previous = last; //previous store in pk
      current  = last->in(MemNode::Memory)->as_Mem();

!     // start scheduling from "last" to "first"
      while (true) {
        assert(in_bb(current), "stay in block");
        assert(in_pack(previous, pk), "previous stays in pack");
        Node* my_mem = current->in(MemNode::Memory);

        if (in_pack(current, pk)) {
          // Forward users of my memory state (except "previous) to my input memory state
          _igvn.hash_delete(current);
          for (DUIterator i = current->outs(); current->has_out(i); i++) {
            Node* use = current->out(i);
            if (use->is_Mem() && use != previous) {
              assert(use->in(MemNode::Memory) == current, "must be");
              _igvn.hash_delete(use);
              if (schedule_before_pack.member(use)) {
                _igvn.hash_delete(upper_insert_pt);
                use->set_req(MemNode::Memory, upper_insert_pt);
              } else {
                _igvn.hash_delete(lower_insert_pt);
                use->set_req(MemNode::Memory, lower_insert_pt);
              }
              _igvn._worklist.push(use);
              --i; // deleted this edge; rescan position
            }
*** 1095,1104 ****
--- 1276,1295 ----
        }
        if (current == first) break;
        current = my_mem->as_Mem();
      } // end while
+
+     // Reconect loads back to upper_insert_pt.
+     for (uint i = 0; i < memops.size(); i++) {
+       Node *ld = memops.at(i);
+       if (ld->in(MemNode::Memory) != upper_insert_pt) {
+         _igvn.hash_delete(ld);
+         ld->set_req(MemNode::Memory, upper_insert_pt);
+         _igvn._worklist.push(ld);
+       }
+     }
    } else if (pk->at(0)->is_Load()) { //load
      // all loads in the pack should have the same memory state. By default,
      // we use the memory state of the last load. However, if any load could
      // not be moved down due to the dependence constraint, we use the memory
      // state of the first load.
*** 1157,1204 ****
--- 1348,1396 ----
      if (p && n == executed_last(p)) {
        uint vlen = p->size();
        Node* vn = NULL;
        Node* low_adr = p->at(0);
        Node* first   = executed_first(p);
        if (n->is_Load()) {
          int   opc = n->Opcode();
+       if (n->is_Load()) {
          Node* ctl = n->in(MemNode::Control);
          Node* mem = first->in(MemNode::Memory);
          Node* adr = low_adr->in(MemNode::Address);
          const TypePtr* atyp = n->adr_type();
!         vn = VectorLoadNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen);
!         vn = LoadVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));
        } else if (n->is_Store()) {
          // Promote value to be stored to vector
          Node* val = vector_opd(p, MemNode::ValueIn);
          int   opc = n->Opcode();
          Node* ctl = n->in(MemNode::Control);
          Node* mem = first->in(MemNode::Memory);
          Node* adr = low_adr->in(MemNode::Address);
          const TypePtr* atyp = n->adr_type();
!         vn = VectorStoreNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
!         vn = StoreVectorNode::make(_phase->C, opc, ctl, mem, adr, atyp, val, vlen);
        } else if (n->req() == 3) {
          // Promote operands to vector
          Node* in1 = vector_opd(p, 1);
          Node* in2 = vector_opd(p, 2);
!         vn = VectorNode::make(_phase->C, n->Opcode(), in1, in2, vlen, velt_type(n));
!         vn = VectorNode::make(_phase->C, opc, in1, in2, vlen, velt_basic_type(n));
        } else {
          ShouldNotReachHere();
        }
+       assert(vn != NULL, "sanity");
        _phase->_igvn.register_new_node_with_optimizer(vn);
        _phase->set_ctrl(vn, _phase->get_ctrl(p->at(0)));
        for (uint j = 0; j < p->size(); j++) {
          Node* pm = p->at(j);
          _igvn.replace_node(pm, vn);
        }
        _igvn._worklist.push(vn);
+ #ifdef ASSERT
+       if (TraceSuperWord) {
+         tty->print("\nnew Vector node: ");
+         vn->dump();
        }
+ #endif
      }
+   }
  }

  //------------------------------vector_opd---------------------------
  // Create a vector operand for the nodes in pack p for operand: in(opd_idx)
  Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
*** 1215,1228 ****
--- 1407,1420 ----
        break;
      }
    }
    if (same_opd) {
!     if (opd->is_Vector() || opd->is_VectorLoad()) {
!     if (opd->is_Vector() || opd->is_LoadVector()) {
        return opd; // input is matching vector
      }
!     assert(!opd->is_VectorStore(), "such vector is not expected here");
!     assert(!opd->is_StoreVector(), "such vector is not expected here");
      // Convert scalar input to vector with the same number of elements as
      // p0's vector. Use p0's type because size of operand's container in
      // vector should match p0's size regardless operand's size.
      const Type* p0_t = velt_type(p0);
      VectorNode* vn = VectorNode::scalar2vector(_phase->C, opd, vlen, p0_t);
*** 1231,1250 ****
--- 1423,1442 ----
      _phase->set_ctrl(vn, _phase->get_ctrl(opd));
      return vn;
    }

    // Insert pack operation
!   const Type* p0_t = velt_type(p0);
!   PackNode* pk = PackNode::make(_phase->C, opd, p0_t);
!   BasicType bt = velt_basic_type(p0);
!   PackNode* pk = PackNode::make(_phase->C, opd, vlen, bt);
    DEBUG_ONLY( const BasicType opd_bt = opd->bottom_type()->basic_type(); )

    for (uint i = 1; i < vlen; i++) {
      Node* pi = p->at(i);
      Node* in = pi->in(opd_idx);
      assert(my_pack(in) == NULL, "Should already have been unpacked");
      assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
!     pk->add_opd(i, in);
    }
    _phase->_igvn.register_new_node_with_optimizer(pk);
    _phase->set_ctrl(pk, _phase->get_ctrl(opd));
    return pk;
  }
*** 1282,1302 ****
--- 1474,1493 ----
          // Insert extract operation
          _igvn.hash_delete(def);
          _igvn.hash_delete(use);
          int def_pos = alignment(def) / data_size(def);
          const Type* def_t = velt_type(def);
!         Node* ex = ExtractNode::make(_phase->C, def, def_pos, def_t);
!         Node* ex = ExtractNode::make(_phase->C, def, def_pos, velt_basic_type(def));
          _phase->_igvn.register_new_node_with_optimizer(ex);
          _phase->set_ctrl(ex, _phase->get_ctrl(def));
          use->set_req(idx, ex);
          _igvn._worklist.push(def);
          _igvn._worklist.push(use);
          bb_insert_after(ex, bb_idx(def));
!         set_velt_type(ex, def_t);
!         set_velt_type(ex, velt_type(def));
        }
      }

  //------------------------------is_vector_use---------------------------
  // Is use->in(u_idx) a vector use?
*** 1585,1598 ****
--- 1776,1793 ----
  int SuperWord::memory_alignment(MemNode* s, int iv_adjust_in_bytes) {
    SWPointer p(s, this);
    if (!p.valid()) {
      return bottom_align;
    }
+   int vw = vector_width_in_bytes(velt_basic_type(s));
+   if (vw < 2) {
+     return bottom_align; // No vectors for this type
+   }
    int offset  = p.offset_in_bytes();
    offset     += iv_adjust_in_bytes;
!   int off_rem = offset % vector_width_in_bytes();
!   int off_mod = off_rem >= 0 ? off_rem : off_rem + vector_width_in_bytes();
!   int off_rem = offset % vw;
!   int off_mod = off_rem >= 0 ? off_rem : off_rem + vw;
    return off_mod;
  }

  //---------------------------container_type---------------------------
  // Smallest type containing range of values
*** 1613,1623 ****
--- 1808,1819 ----
  //-------------------------vector_opd_range-----------------------
  // (Start, end] half-open range defining which operands are vector
  void SuperWord::vector_opd_range(Node* n, uint* start, uint* end) {
    switch (n->Opcode()) {
!   case Op_LoadB:   case Op_LoadUS:
!   case Op_LoadB:   case Op_LoadUB:
+   case Op_LoadS:   case Op_LoadUS:
    case Op_LoadI:   case Op_LoadL:
    case Op_LoadF:   case Op_LoadD:
    case Op_LoadP:
      *start = 0;
      *end   = 0;
*** 1731,1740 ****
--- 1927,1937 ----
    // pre-loop Opaque1 node.
    Node *orig_limit = pre_opaq->original_loop_limit();
    assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");

    SWPointer align_to_ref_p(align_to_ref, this);
+   assert(align_to_ref_p.valid(), "sanity");

    // Given:
    //     lim0 == original pre loop limit
    //     V == v_align (power of 2)
    //     invar == extra invariant piece of the address expression
*** 1783,1796 ****
--- 1980,1995 ----
    //   Solving for lim:
    //     (e - lim0 + N) % V == 0
    //     N = (V - (e - lim0)) % V
    //     lim = lim0 - (V - (e - lim0)) % V

+   int vw = vector_width_in_bytes(velt_basic_type(align_to_ref));
+   assert(vw > 1, "sanity");
    int stride   = iv_stride();
    int scale    = align_to_ref_p.scale_in_bytes();
    int elt_size = align_to_ref_p.memory_size();
!   int v_align  = vector_width_in_bytes() / elt_size;
!   int v_align  = vw / elt_size;
    int k        = align_to_ref_p.offset_in_bytes() / elt_size;

    Node *kn   = _igvn.intcon(k);

    Node *e = kn;
*** 1805,1814 ****
--- 2004,2032 ----
    } else {
      e = new (_phase->C, 3) AddINode(e, aref);
    }
    _phase->_igvn.register_new_node_with_optimizer(e);
    _phase->set_ctrl(e, pre_ctrl);
+   }
+   if (vw > ObjectAlignmentInBytes) {
+     // incorporate base e +/- base && Mask >>> log2(elt)
+     Node* mask = _igvn.MakeConX(~(-1 << exact_log2(vw)));
+     Node* xbase = new(_phase->C, 2) CastP2XNode(NULL, align_to_ref_p.base());
+     _phase->_igvn.register_new_node_with_optimizer(xbase);
+     Node* masked_xbase  = new (_phase->C, 3) AndXNode(xbase, mask);
+     _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
+ #ifdef _LP64
+     masked_xbase = new (_phase->C, 2) ConvL2INode(masked_xbase);
+     _phase->_igvn.register_new_node_with_optimizer(masked_xbase);
+ #endif
+     Node* log2_elt = _igvn.intcon(exact_log2(elt_size));
+     Node* bref     = new (_phase->C, 3) URShiftINode(masked_xbase, log2_elt);
+     _phase->_igvn.register_new_node_with_optimizer(bref);
+     _phase->set_ctrl(bref, pre_ctrl);
+     e = new (_phase->C, 3) AddINode(e, bref);
+     _phase->_igvn.register_new_node_with_optimizer(e);
+     _phase->set_ctrl(e, pre_ctrl);
    }

    // compute e +/- lim0
    if (scale < 0) {
      e = new (_phase->C, 3) SubINode(e, lim0);
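The new block above folds the object's base address into the pre-loop limit when the vector width exceeds ObjectAlignmentInBytes, since an 8-byte-aligned base says nothing about 16- or 32-byte alignment. A standalone sketch of that bref term with assumed numbers (32-byte vectors, 4-byte elements, a base address 16 bytes past a 32-byte boundary), not taken from a real object layout:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Assumed example values.
      uintptr_t base = 0x7f0000001010;   // array base, 16 bytes past a 32-byte boundary
      int vw        = 32;                // vector width in bytes (256-bit)
      int elt_size  = 4;                 // int elements
      int log2_elt  = 2;                 // exact_log2(elt_size)
      // bref = (base & (vw - 1)) >>> log2(elt): misalignment of the base in elements.
      int bref = (int)((base & (uintptr_t)(vw - 1)) >> log2_elt);
      printf("bref = %d elements\n", bref); // 16 bytes / 4 = 4 elements
      return 0;
    }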
