hotspot Cdiff src/cpu/aarch64/vm/aarch64.ad

< prev index next >

src/cpu/aarch64/vm/aarch64.ad

rev 8832 : 8078743: AARCH64: Extend use of stlr to cater for volatile object stores
    Summary: The current use of stlr on AArch64 to implement volatile stores needs to be extended to cater for object stores.
    Reviewed-by: adinn
rev 8833 : 8080293: AARCH64: Remove unnecessary dmbs from generated CAS code
Summary: The current encoding for CAS generates unnecessary leading and trailing dmbs for the MemBarAcquire and MemBarRelease which ought to be elided
Reviewed-by: adinn


*** 1037,1046 ****
--- 1037,1047 ----
    MemBarNode *parent_membar(const Node *n);
    MemBarNode *child_membar(const MemBarNode *n);
    bool leading_membar(const MemBarNode *barrier);
  
    bool is_card_mark_membar(const MemBarNode *barrier);
+   bool is_CAS(int opcode);
  
    MemBarNode *leading_to_normal(MemBarNode *leading);
    MemBarNode *normal_to_leading(const MemBarNode *barrier);
    MemBarNode *card_mark_to_trailing(const MemBarNode *barrier);
    MemBarNode *trailing_to_card_mark(const MemBarNode *trailing);
*** 1055,1064 ****
--- 1056,1068 ----
  
    bool unnecessary_release(const Node *barrier);
    bool unnecessary_volatile(const Node *barrier);
    bool needs_releasing_store(const Node *store);
  
+   // predicate controlling translation of CompareAndSwapX
+   bool needs_acquiring_load_exclusive(const Node *load);
+ 
    // predicate controlling translation of StoreCM
    bool unnecessary_storestore(const Node *storecm);
  %}
  
  source %{
*** 1086,1104 ****
    //
    //   dmb ish
    //   str<x>
    //   dmb ish
    //
    // In order to generate the desired instruction sequence we need to
    // be able to identify specific 'signature' ideal graph node
    // sequences which i) occur as a translation of a volatile reads or
!   // writes and ii) do not occur through any other translation or
!   // graph transformation. We can then provide alternative aldc
!   // matching rules which translate these node sequences to the
!   // desired machine code sequences. Selection of the alternative
!   // rules can be implemented by predicates which identify the
!   // relevant node sequences.
    //
    // The ideal graph generator translates a volatile read to the node
    // sequence
    //
    //   LoadX[mo_acquire]
--- 1090,1151 ----
    //
    //   dmb ish
    //   str<x>
    //   dmb ish
    //
+   // We can also use ldaxr and stlxr to implement compare and swap CAS
+   // sequences. These are normally translated to an instruction
+   // sequence like the following
+   //
+   //   dmb      ish
+   // retry:
+   //   ldxr<x>   rval raddr
+   //   cmp       rval rold
+   //   b.ne done
+   //   stlxr<x>  rval, rnew, rold
+   //   cbnz      rval retry
+   // done:
+   //   cset      r0, eq
+   //   dmb ishld
+   //
+   // Note that the exclusive store is already using an stlxr
+   // instruction. That is required to ensure visibility to other
+   // threads of the exclusive write (assuming it succeeds) before that
+   // of any subsequent writes.
+   //
+   // The following instruction sequence is an improvement on the above
+   //
+   // retry:
+   //   ldaxr<x>  rval raddr
+   //   cmp       rval rold
+   //   b.ne done
+   //   stlxr<x>  rval, rnew, rold
+   //   cbnz      rval retry
+   // done:
+   //   cset      r0, eq
+   //
+   // We don't need the leading dmb ish since the stlxr guarantees
+   // visibility of prior writes in the case that the swap is
+   // successful. Crucially we don't have to worry about the case where
+   // the swap is not successful since no valid program should be
+   // relying on visibility of prior changes by the attempting thread
+   // in the case where the CAS fails.
+   //
+   // Similarly, we don't need the trailing dmb ishld if we substitute
+   // an ldaxr instruction since that will provide all the guarantees we
+   // require regarding observation of changes made by other threads
+   // before any change to the CAS address observed by the load.
+   //
    // In order to generate the desired instruction sequence we need to
    // be able to identify specific 'signature' ideal graph node
    // sequences which i) occur as a translation of a volatile reads or
!   // writes or CAS operations and ii) do not occur through any other
!   // translation or graph transformation. We can then provide
!   // alternative adlc matching rules which translate these node
!   // sequences to the desired machine code sequences. Selection of the
!   // alternative rules can be implemented by predicates which identify
!   // the relevant node sequences.
    //
    // The ideal graph generator translates a volatile read to the node
    // sequence
    //
    //   LoadX[mo_acquire]
*** 1161,1170 ****
--- 1208,1226 ----
    // sequence of membar nodes. Similarly, given an acquire membar we
    // can know that it was added because of an inlined unsafe volatile
    // get if it is fed and feeds a cpuorder membar and if its feed
    // membar also feeds an acquiring load.
    //
+   // Finally an inlined (Unsafe) CAS operation is translated to the
+   // following ideal graph
+   //
+   //   MemBarRelease
+   //   MemBarCPUOrder
+   //   CompareAndSwapX {CardMark}-optional
+   //   MemBarCPUOrder
+   //   MemBarAcquire
+   //
    // So, where we can identify these volatile read and write
    // signatures we can choose to plant either of the above two code
    // sequences. For a volatile read we can simply plant a normal
    // ldr<x> and translate the MemBarAcquire to a dmb. However, we can
    // also choose to inhibit translation of the MemBarAcquire and
*** 1175,1184 ****
--- 1231,1248 ----
    // normal str<x> and then a dmb ish for the MemBarVolatile.
    // Alternatively, we can inhibit translation of the MemBarRelease
    // and MemBarVolatile and instead plant a simple stlr<x>
    // instruction.
    //
+   // when we recognise a CAS signature we can choose to plant a dmb
+   // ish as a translation for the MemBarRelease, the conventional
+   // macro-instruction sequence for the CompareAndSwap node (which
+   // uses ldxr<x>) and then a dmb ishld for the MemBarAcquire.
+   // Alternatively, we can elide generation of the dmb instructions
+   // and plant the alternative CompareAndSwap macro-instruction
+   // sequence (which uses ldaxr<x>).
+   // 
    // Of course, the above only applies when we see these signature
    // configurations. We still want to plant dmb instructions in any
    // other cases where we may see a MemBarAcquire, MemBarRelease or
    // MemBarVolatile. For example, at the end of a constructor which
    // writes final/volatile fields we will see a MemBarRelease
*** 1192,1202 ****
    // always just translate the loads and stores to ldr<x> and str<x>
    // and translate acquire, release and volatile membars to the
    // relevant dmb instructions.
    //
  
!   // graph traversal helpers used for volatile put/get optimization
  
    // 1) general purpose helpers
  
    // if node n is linked to a parent MemBarNode by an intervening
    // Control and Memory ProjNode return the MemBarNode otherwise return
--- 1256,1267 ----
    // always just translate the loads and stores to ldr<x> and str<x>
    // and translate acquire, release and volatile membars to the
    // relevant dmb instructions.
    //
  
!   // graph traversal helpers used for volatile put/get and CAS
!   // optimization
  
    // 1) general purpose helpers
  
    // if node n is linked to a parent MemBarNode by an intervening
    // Control and Memory ProjNode return the MemBarNode otherwise return
*** 1331,1342 ****
    
      return false;
    }
  
  
!   // 3) helper predicates to traverse volatile put graphs which may
!   // contain GC barrier subgraphs
  
    // Preamble
    // --------
    //
    // for volatile writes we can omit generating barriers and employ a
--- 1396,1407 ----
    
      return false;
    }
  
  
!   // 3) helper predicates to traverse volatile put or CAS graphs which
!   // may contain GC barrier subgraphs
  
    // Preamble
    // --------
    //
    // for volatile writes we can omit generating barriers and employ a
*** 1402,1413 ****
    //
    // It is also possible to perform the card mark conditionally on it
    // currently being unmarked in which case the volatile put graph
    // will look slightly different
    //
!   //   MemBarRelease
!   //   MemBarCPUOrder___________________________________________
    //         ||    \\               Ctl \     Ctl \     \\  Mem \
    //         ||    StoreN/P[mo_release] CastP2X   If   LoadB     |
    //         | \     /                              \            |
    //         | MergeMem                            . . .      StoreB
    //         | /                                                /
--- 1467,1477 ----
    //
    // It is also possible to perform the card mark conditionally on it
    // currently being unmarked in which case the volatile put graph
    // will look slightly different
    //
!   //   MemBarRelease____________________________________________
    //         ||    \\               Ctl \     Ctl \     \\  Mem \
    //         ||    StoreN/P[mo_release] CastP2X   If   LoadB     |
    //         | \     /                              \            |
    //         | MergeMem                            . . .      StoreB
    //         | /                                                /
*** 1417,1427 ****
    // It is worth noting at this stage that both the above
    // configurations can be uniquely identified by checking that the
    // memory flow includes the following subgraph:
    //
    //   MemBarRelease
!   //   MemBarCPUOrder
    //          |  \      . . .
    //          |  StoreX[mo_release]  . . .
    //          |   /
    //         MergeMem
    //          |
--- 1481,1491 ----
    // It is worth noting at this stage that both the above
    // configurations can be uniquely identified by checking that the
    // memory flow includes the following subgraph:
    //
    //   MemBarRelease
!   //  {MemBarCPUOrder}
    //          |  \      . . .
    //          |  StoreX[mo_release]  . . .
    //          |   /
    //         MergeMem
    //          |
*** 1429,1440 ****
    //
    // This is referred to as a *normal* subgraph. It can easily be
    // detected starting from any candidate MemBarRelease,
    // StoreX[mo_release] or MemBarVolatile.
    //
    // the code below uses two helper predicates, leading_to_normal and
!   // normal_to_leading to identify this configuration, one validating
    // the layout starting from the top membar and searching down and
    // the other validating the layout starting from the lower membar
    // and searching up.
    //
    // There are two special case GC configurations when a normal graph
--- 1493,1544 ----
    //
    // This is referred to as a *normal* subgraph. It can easily be
    // detected starting from any candidate MemBarRelease,
    // StoreX[mo_release] or MemBarVolatile.
    //
+   // A simple variation on this normal case occurs for an unsafe CAS
+   // operation. The basic graph for a non-object CAS is
+   //
+   //   MemBarRelease
+   //         ||
+   //   MemBarCPUOrder
+   //         ||     \\   . . .
+   //         ||     CompareAndSwapX
+   //         ||       |
+   //         ||     SCMemProj
+   //         | \     /
+   //         | MergeMem
+   //         | /
+   //   MemBarCPUOrder
+   //         ||
+   //   MemBarAcquire
+   //
+   // The same basic variations on this arrangement (mutatis mutandis)
+   // occur when a card mark is introduced. i.e. we see the same basic
+   // shape but the StoreP/N is replaced with CompareAndSwapP/N and the
+   // tail of the graph is a pair comprising a MemBarCPUOrder +
+   // MemBarAcquire.
+   //
+   // So, in the case of a CAS the normal graph has the variant form
+   //
+   //   MemBarRelease
+   //   MemBarCPUOrder
+   //          |   \      . . .
+   //          |  CompareAndSwapX  . . .
+   //          |    |
+   //          |   SCMemProj
+   //          |   /  . . .
+   //         MergeMem
+   //          |
+   //   MemBarCPUOrder
+   //   MemBarAcquire
+   //
+   // This graph can also easily be detected starting from any
+   // candidate MemBarRelease, CompareAndSwapX or MemBarAcquire.
+   //
    // the code below uses two helper predicates, leading_to_normal and
!   // normal_to_leading to identify these normal graphs, one validating
    // the layout starting from the top membar and searching down and
    // the other validating the layout starting from the lower membar
    // and searching up.
    //
    // There are two special case GC configurations when a normal graph
*** 1448,1458 ****
    // employs a post-write GC barrier while G1 employs both a pre- and
    // post-write GC barrier. Of course the extra nodes may be absent --
    // they are only inserted for object puts. This significantly
    // complicates the task of identifying whether a MemBarRelease,
    // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
!   // when using these GC configurations (see below).
    //
    // In both cases the post-write subtree includes an auxiliary
    // MemBarVolatile (StoreLoad barrier) separating the object put and
    // the read of the corresponding card. This poses two additional
    // problems.
--- 1552,1564 ----
    // employs a post-write GC barrier while G1 employs both a pre- and
    // post-write GC barrier. Of course the extra nodes may be absent --
    // they are only inserted for object puts. This significantly
    // complicates the task of identifying whether a MemBarRelease,
    // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
!   // when using these GC configurations (see below). It adds similar
!   // complexity to the task of identifying whether a MemBarRelease,
!   // CompareAndSwapX or MemBarAcquire forms part of a CAS.
    //
    // In both cases the post-write subtree includes an auxiliary
    // MemBarVolatile (StoreLoad barrier) separating the object put and
    // the read of the corresponding card. This poses two additional
    // problems.
*** 1487,1497 ****
    // A CMS GC post-barrier wraps its card write (StoreCM) inside an If
    // which selects conditonal execution based on the value loaded
    // (LoadB) from the card. Ctl and Mem are fed to the If via an
    // intervening StoreLoad barrier (MemBarVolatile).
    //
!   // So, with CMS we may see a node graph which looks like this
    //
    //   MemBarRelease
    //   MemBarCPUOrder_(leading)__________________
    //     C |    M \       \\                   C \
    //       |       \    StoreN/P[mo_release]  CastP2X
--- 1593,1604 ----
    // A CMS GC post-barrier wraps its card write (StoreCM) inside an If
    // which selects conditonal execution based on the value loaded
    // (LoadB) from the card. Ctl and Mem are fed to the If via an
    // intervening StoreLoad barrier (MemBarVolatile).
    //
!   // So, with CMS we may see a node graph for a volatile object store
!   // which looks like this
    //
    //   MemBarRelease
    //   MemBarCPUOrder_(leading)__________________
    //     C |    M \       \\                   C \
    //       |       \    StoreN/P[mo_release]  CastP2X
*** 1522,1531 ****
--- 1629,1687 ----
    // card mark membar. The trailing MergeMem merges the AliasIdxBot
    // Mem slice from the card mark membar and the AliasIdxRaw slice
    // from the StoreCM into the trailing membar (n.b. the latter
    // proceeds via a Phi associated with the If region).
    //
+   // The graph for a CAS varies slightly, the obvious difference being
+   // that the StoreN/P node is replaced by a CompareAndSwapP/N node
+   // and the trailing MemBarVolatile by a MemBarCPUOrder +
+   // MemBarAcquire pair. The other important difference is that the
+   // CompareAndSwap node's SCMemProj is not merged into the card mark
+   // membar - it still feeds the trailing MergeMem. This also means
+   // that the card mark membar receives its Mem feed directly from the
+   // leading membar rather than via a MergeMem.
+   //
+   //   MemBarRelease
+   //   MemBarCPUOrder__(leading)_________________________
+   //       ||                       \\                 C \
+   //   MemBarVolatile (card mark)  CompareAndSwapN/P  CastP2X
+   //     C |  ||    M |              |
+   //       | LoadB    |       ______/|
+   //       |   |      |      /       |
+   //       | Cmp      |     /      SCMemProj
+   //       | /        |    /         |
+   //       If         |   /         /
+   //       | \        |  /         /
+   // IfFalse  IfTrue  | /         /
+   //       \     / \  |/ prec    /
+   //        \   / StoreCM       /
+   //         \ /      |        /
+   //        Region   . . .    /
+   //          | \            /
+   //          |  . . .  \   / Bot
+   //          |       MergeMem
+   //          |          |
+   //        MemBarCPUOrder
+   //        MemBarAcquire (trailing)
+   //
+   // This has a slightly different memory subgraph to the one seen
+   // previously but the core of it is the same as for the CAS normal
+   // subgraph
+   //
+   //   MemBarRelease
+   //   MemBarCPUOrder____
+   //      ||             \      . . .
+   //   MemBarVolatile  CompareAndSwapX  . . .
+   //      |  \            |
+   //        . . .   SCMemProj
+   //          |     /  . . .
+   //         MergeMem
+   //          |
+   //   MemBarCPUOrder
+   //   MemBarAcquire
+   //
+   //
    // G1 is quite a lot more complicated. The nodes inserted on behalf
    // of G1 may comprise: a pre-write graph which adds the old value to
    // the SATB queue; the releasing store itself; and, finally, a
    // post-write graph which performs a card mark.
    //
*** 1573,1588 ****
    //         MemBarVolatile (trailing)
    //
    // n.b. the LoadB in this subgraph is not the card read -- it's a
    // read of the SATB queue active flag.
    //
    // The G1 post-write subtree is also optional, this time when the
    // new value being written is either null or can be identified as a
    // newly allocated (young gen) object with no intervening control
    // flow. The latter cannot happen but the former may, in which case
!   // the card mark membar is omitted and the memory feeds from the
!   // leading membar and the StoreN/P are merged direct into the
    // trailing membar as per the normal subgraph. So, the only special
    // case which arises is when the post-write subgraph is generated.
    //
    // The kernel of the post-write G1 subgraph is the card mark itself
    // which includes a card mark memory barrier (MemBarVolatile), a
--- 1729,1748 ----
    //         MemBarVolatile (trailing)
    //
    // n.b. the LoadB in this subgraph is not the card read -- it's a
    // read of the SATB queue active flag.
    //
+   // Once again the CAS graph is a minor variant on the above with the
+   // expected substitutions of CompareAndSwapX for StoreN/P and
+   // MemBarCPUOrder + MemBarAcquire for trailing MemBarVolatile.
+   //
    // The G1 post-write subtree is also optional, this time when the
    // new value being written is either null or can be identified as a
    // newly allocated (young gen) object with no intervening control
    // flow. The latter cannot happen but the former may, in which case
!   // the card mark membar is omitted and the memory feeds from the
!   // leading membar and the StoreN/P are merged direct into the
    // trailing membar as per the normal subgraph. So, the only special
    // case which arises is when the post-write subgraph is generated.
    //
    // The kernel of the post-write G1 subgraph is the card mark itself
    // which includes a card mark memory barrier (MemBarVolatile), a
*** 1666,1716 ****
    // membar. Each Phi corresponds to one of the Ifs which may skip
    // around the card mark membar. So when the If implementing the NULL
    // value check has been elided the total number of Phis is 2
    // otherwise it is 3.
    //
    // So, the upshot is that in all cases the volatile put graph will
    // include a *normal* memory subgraph betwen the leading membar and
!   // its child membar. When that child is not a card mark membar then
!   // it marks the end of a volatile put subgraph. If the child is a
!   // card mark membar then the normal subgraph will form part of a
!   // volatile put subgraph if and only if the child feeds an
!   // AliasIdxBot Mem feed to a trailing barrier via a MergeMem. That
!   // feed is either direct (for CMS) or via 2 or 3 Phi nodes merging
!   // the leading barrier memory flow (for G1).
    // 
    // The predicates controlling generation of instructions for store
    // and barrier nodes employ a few simple helper functions (described
!   // below) which identify the presence or absence of these subgraph
!   // configurations and provide a means of traversing from one node in
!   // the subgraph to another.
  
    // leading_to_normal
    //
!   //graph traversal helper which detects the normal case Mem feed
!   // from a release membar (or, optionally, its cpuorder child) to a
!   // dependent volatile membar i.e. it ensures that the following Mem
!   // flow subgraph is present.
    //
    //   MemBarRelease
!   //   MemBarCPUOrder
    //          |  \      . . .
    //          |  StoreN/P[mo_release]  . . .
    //          |   /
    //         MergeMem
    //          |
!   //   MemBarVolatile
    //
!   // if the correct configuration is present returns the volatile
    // membar otherwise NULL.
    //
    // the input membar is expected to be either a cpuorder membar or a
    // release membar. in the latter case it should not have a cpu membar
    // child.
    //
!   // the returned membar may be a card mark membar rather than a
!   // trailing membar.
  
    MemBarNode *leading_to_normal(MemBarNode *leading)
    {
      assert((leading->Opcode() == Op_MemBarRelease ||
              leading->Opcode() == Op_MemBarCPUOrder),
--- 1826,1913 ----
    // membar. Each Phi corresponds to one of the Ifs which may skip
    // around the card mark membar. So when the If implementing the NULL
    // value check has been elided the total number of Phis is 2
    // otherwise it is 3.
    //
+   // The CAS graph when using G1GC also includes a pre-write subgraph
+   // and an optional post-write subgraph. The same variations are
+   // introduced as for CMS with conditional card marking i.e. the
+   // StoreP/N is swapped for a CompareAndSwapP/N, the trailing
+   // MemBarVolatile for a MemBarCPUOrder + MemBarAcquire pair and the
+   // Mem feed from the CompareAndSwapP/N includes a precedence
+   // dependency feed to the StoreCM and a feed via an SCMemProj to the
+   // trailing membar. So, as before the configuration includes the
+   // normal CAS graph as a subgraph of the memory flow.
+   //
    // So, the upshot is that in all cases the volatile put graph will
    // include a *normal* memory subgraph betwen the leading membar and
!   // its child membar, either a volatile put graph (including a
!   // releasing StoreX) or a CAS graph (including a CompareAndSwapX).
!   // When that child is not a card mark membar then it marks the end
!   // of the volatile put or CAS subgraph. If the child is a card mark
!   // membar then the normal subgraph will form part of a volatile put
!   // subgraph if and only if the child feeds an AliasIdxBot Mem feed
!   // to a trailing barrier via a MergeMem. That feed is either direct
!   // (for CMS) or via 2 or 3 Phi nodes merging the leading barrier
!   // memory flow (for G1).
    // 
    // The predicates controlling generation of instructions for store
    // and barrier nodes employ a few simple helper functions (described
!   // below) which identify the presence or absence of all these
!   // subgraph configurations and provide a means of traversing from
!   // one node in the subgraph to another.
! 
!   // is_CAS(int opcode)
!   //
!   // return true if opcode is one of the CompareAndSwapX opcodes
!   // (CompareAndSwapI/L/N/P) otherwise false.
! 
!   bool is_CAS(int opcode)
!   {
!     return (opcode == Op_CompareAndSwapI ||
!             opcode == Op_CompareAndSwapL ||
!             opcode == Op_CompareAndSwapN ||
!             opcode == Op_CompareAndSwapP);
!   }
  
    // leading_to_normal
    //
!   // graph traversal helper which detects the normal case Mem feed from
!   // a release membar (or, optionally, its cpuorder child) to a
!   // dependent volatile membar i.e. it ensures that one or other of
!   // the following Mem flow subgraphs is present.
    //
    //   MemBarRelease
!   //   MemBarCPUOrder {leading}
    //          |  \      . . .
    //          |  StoreN/P[mo_release]  . . .
    //          |   /
    //         MergeMem
    //          |
!   //   MemBarVolatile {trailing or card mark}
!   //
!   //   MemBarRelease
!   //   MemBarCPUOrder {leading}
!   //      |       \      . . .
!   //      |     CompareAndSwapX  . . .
!   //               |
!   //     . . .    SCMemProj
!   //           \   |
!   //      |    MergeMem
!   //      |       /
!   //    MemBarCPUOrder
!   //    MemBarAcquire {trailing}
    //
!   // if the correct configuration is present returns the trailing
    // membar otherwise NULL.
    //
    // the input membar is expected to be either a cpuorder membar or a
    // release membar. in the latter case it should not have a cpu membar
    // child.
    //
!   // the returned value may be a card mark or trailing membar
!   //
  
    MemBarNode *leading_to_normal(MemBarNode *leading)
    {
      assert((leading->Opcode() == Op_MemBarRelease ||
              leading->Opcode() == Op_MemBarCPUOrder),
*** 1722,1791 ****
      if (!mem)
        return NULL;
  
      Node *x = NULL;
      StoreNode * st = NULL;
      MergeMemNode *mm = NULL;
  
      for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
        x = mem->fast_out(i);
        if (x->is_MergeMem()) {
          if (mm != NULL)
            return NULL;
          // two merge mems is one too many
          mm = x->as_MergeMem();
        } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
!         // two releasing stores is one too many
!         if (st != NULL)
            return NULL;
          st = x->as_Store();
        }
      }
  
!     if (!mm || !st)
        return NULL;
  
!     bool found = false;
!     // ensure the store feeds the merge
      for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
        if (st->fast_out(i) == mm) {
!         found = true;
          break;
        }
      }
! 
!     if (!found)
        return NULL;
  
!     MemBarNode *mbvol = NULL;
!     // ensure the merge feeds a volatile membar
      for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
        x = mm->fast_out(i);
!       if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
!         mbvol = x->as_MemBar();
          break;
        }
      }
  
!     return mbvol;
    }
  
    // normal_to_leading
    //
    // graph traversal helper which detects the normal case Mem feed
    // from either a card mark or a trailing membar to a preceding
    // release membar (optionally its cpuorder child) i.e. it ensures
!   // that the following Mem flow subgraph is present.
    //
    //   MemBarRelease
    //   MemBarCPUOrder {leading}
    //          |  \      . . .
    //          |  StoreN/P[mo_release]  . . .
    //          |   /
    //         MergeMem
    //          |
!   //   MemBarVolatile
    //
    // this predicate checks for the same flow as the previous predicate
    // but starting from the bottom rather than the top.
    //
    // if the configuration is present returns the cpuorder member for
--- 1919,2041 ----
      if (!mem)
        return NULL;
  
      Node *x = NULL;
      StoreNode * st = NULL;
+     LoadStoreNode *cas = NULL;
      MergeMemNode *mm = NULL;
  
      for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
        x = mem->fast_out(i);
        if (x->is_MergeMem()) {
          if (mm != NULL)
            return NULL;
          // two merge mems is one too many
          mm = x->as_MergeMem();
        } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
!         // two releasing stores/CAS nodes is one too many
!         if (st != NULL || cas != NULL)
            return NULL;
          st = x->as_Store();
+       } else if (is_CAS(x->Opcode())) {
+         if (st != NULL || cas != NULL)
+           return NULL;
+         cas = x->as_LoadStore();
        }
      }
  
!     // must have a store or a cas
!     if (!st && !cas)
!       return NULL;
! 
!     // must have a merge if we also have st
!     if (st && !mm)
        return NULL;
  
!     Node *y = NULL;
!     if (cas) {
!       // look for an SCMemProj
!       for (DUIterator_Fast imax, i = cas->fast_outs(imax); i < imax; i++) {
!         x = cas->fast_out(i);
!         if (x->is_Proj()) {
!           y = x;
!           break;
!         }
!       }
!       if (y == NULL)
!         return NULL;
!       // the proj must feed a MergeMem
!       for (DUIterator_Fast imax, i = y->fast_outs(imax); i < imax; i++) {
!         x = y->fast_out(i);
!         if (x->is_MergeMem()) {
!           mm = x->as_MergeMem();
!           break;
!         }
!       }
!       if (mm == NULL)
!         return NULL;
!     } else {
!       // ensure the store feeds the existing mergemem;
        for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
          if (st->fast_out(i) == mm) {
!           y = st;
            break;
          }
        }
!       if (y == NULL)
          return NULL;
+     }
  
!     MemBarNode *mbar = NULL;
!     // ensure the merge feeds to the expected type of membar
      for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
        x = mm->fast_out(i);
!       if (x->is_MemBar()) {
!         int opcode = x->Opcode();
!         if (opcode == Op_MemBarVolatile && st) {
!           mbar = x->as_MemBar();
!         } else if (cas && opcode == Op_MemBarCPUOrder) {
!           MemBarNode *y =  x->as_MemBar();
!           y = child_membar(y);
!           if (y != NULL && y->Opcode() == Op_MemBarAcquire) {
!             mbar = y;
!           }
!         }
          break;
        }
      }
  
!     return mbar;
    }
  
    // normal_to_leading
    //
    // graph traversal helper which detects the normal case Mem feed
    // from either a card mark or a trailing membar to a preceding
    // release membar (optionally its cpuorder child) i.e. it ensures
!   // that one or other of the following Mem flow subgraphs is present.
    //
    //   MemBarRelease
    //   MemBarCPUOrder {leading}
    //          |  \      . . .
    //          |  StoreN/P[mo_release]  . . .
    //          |   /
    //         MergeMem
    //          |
!   //   MemBarVolatile {card mark or trailing}
!   //
!   //   MemBarRelease
!   //   MemBarCPUOrder {leading}
!   //      |       \      . . .
!   //      |     CompareAndSwapX  . . .
!   //               |
!   //     . . .    SCMemProj
!   //           \   |
!   //      |    MergeMem
!   //      |        /
!   //    MemBarCPUOrder
!   //    MemBarAcquire {trailing}
    //
    // this predicate checks for the same flow as the previous predicate
    // but starting from the bottom rather than the top.
    //
    // if the configuration is present returns the cpuorder member for
*** 1795,1816 ****
    // need not be a card mark membar.
  
    MemBarNode *normal_to_leading(const MemBarNode *barrier)
    {
      // input must be a volatile membar
!     assert(barrier->Opcode() == Op_MemBarVolatile, "expecting a volatile membar");
      Node *x;
  
      // the Mem feed to the membar should be a merge
!     x = barrier->in(TypeFunc::Memory);
      if (!x->is_MergeMem())
        return NULL;
  
      MergeMemNode *mm = x->as_MergeMem();
  
!     // the AliasIdxBot slice should be another MemBar projection
      x = mm->in(Compile::AliasIdxBot);
      // ensure this is a non control projection
      if (!x->is_Proj() || x->is_CFG())
        return NULL;
      // if it is fed by a membar that's the one we want
      x = x->in(0);
--- 2045,2103 ----
    // need not be a card mark membar.
  
    MemBarNode *normal_to_leading(const MemBarNode *barrier)
    {
      // input must be a volatile membar
!     assert((barrier->Opcode() == Op_MemBarVolatile ||
!             barrier->Opcode() == Op_MemBarAcquire),
!            "expecting a volatile or an acquire membar");
      Node *x;
+     bool is_cas = barrier->Opcode() == Op_MemBarAcquire;
+ 
+     // if we have an acquire membar then it must be fed via a CPUOrder
+     // membar
+ 
+     if (is_cas) {
+       // skip to parent barrier which must exist and be a cpuorder
+       x = parent_membar(barrier);
+       if (x == NULL || x->Opcode() != Op_MemBarCPUOrder)
+         return NULL;
+     } else {
+       // start from the supplied barrier
+       x = (Node *)barrier;
+     }
  
      // the Mem feed to the membar should be a merge
!     x = x ->in(TypeFunc::Memory);
      if (!x->is_MergeMem())
        return NULL;
  
      MergeMemNode *mm = x->as_MergeMem();
  
!     if (is_cas) {
!       // the merge should be fed from the CAS via an SCMemProj node
!       x = NULL;
!       for (uint idx = 1; idx < mm->req(); idx++) {
!         if (mm->in(idx)->Opcode() == Op_SCMemProj) {
!           x = mm->in(idx);
!           break;
!         }
!       }
!       if (x == NULL)
!         return NULL;
!       // check for a CAS feeding this proj
!       x = x->in(0);
!       int opcode = x->Opcode();
!       if (!is_CAS(opcode))
!         return NULL;
!       // the CAS should get its mem feed from the leading membar
!       x = x->in(MemNode::Memory);
!     } else {
!       // the merge should get its Bottom mem feed from the leading membar
        x = mm->in(Compile::AliasIdxBot);      
+     } 
+ 
      // ensure this is a non control projection
      if (!x->is_Proj() || x->is_CFG())
        return NULL;
      // if it is fed by a membar that's the one we want
      x = x->in(0);
*** 1821,1850 ****
      MemBarNode *leading = x->as_MemBar();
      // reject invalid candidates
      if (!leading_membar(leading))
        return NULL;
  
!     // ok, we have a leading ReleaseMembar, now for the sanity clauses
  
!     // the leading membar must feed Mem to a releasing store
      ProjNode *mem = leading->proj_out(TypeFunc::Memory);
      StoreNode *st = NULL;
      for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
        x = mem->fast_out(i);
        if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
          st = x->as_Store();
!         break;
        }
      }
!     if (st == NULL)
        return NULL;
  
!     // the releasing store has to feed the same merge
      for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
        if (st->fast_out(i) == mm)
          return leading;
      }
  
      return NULL;
    }
  
    // card_mark_to_trailing
--- 2108,2155 ----
      MemBarNode *leading = x->as_MemBar();
      // reject invalid candidates
      if (!leading_membar(leading))
        return NULL;
  
!     // ok, we have a leading membar, now for the sanity clauses
  
!     // the leading membar must feed Mem to a releasing store or CAS
      ProjNode *mem = leading->proj_out(TypeFunc::Memory);
      StoreNode *st = NULL;
+     LoadStoreNode *cas = NULL;
      for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
        x = mem->fast_out(i);
        if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+         // two stores or CASes is one too many
+         if (st != NULL || cas != NULL)
+           return NULL;
          st = x->as_Store();
!       } else if (is_CAS(x->Opcode())) {
!         if (st != NULL || cas != NULL)
!           return NULL;
!         cas = x->as_LoadStore();
        }
      }
! 
!     // we must have found a store or a cas (never both, see the loop)
!     if (st == NULL && cas == NULL)
!       return NULL;
! 
!     if (st == NULL) {
!       // nothing more to check
!       return leading;
!     } else {
!       // we should not have a store if we started from an acquire
!       if (is_cas)
          return NULL;
  
!       // the store should feed the merge we used to get here
        for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
          if (st->fast_out(i) == mm)
            return leading;
        }
+     }
  
      return NULL;
    }
  
    // card_mark_to_trailing
*** 1863,1874 ****
    //      |          |
    //      |        . . .
    //  Bot |  / 
    //   MergeMem 
    //      |
!   //   MemBarVolatile (trailing)
!   //
    //
    // 2)
    //   MemBarRelease/CPUOrder (leading)
    //    |
    //    | 
--- 2168,2179 ----
    //      |          |
    //      |        . . .
    //  Bot |  / 
    //   MergeMem 
    //      |
!   //      |
!   //    MemBarVolatile {trailing}
    //
    // 2)
    //   MemBarRelease/CPUOrder (leading)
    //    |
    //    | 
*** 1882,1892 ****
    //        \ /
    //        Phi  . . .
    //     Bot |   /
    //       MergeMem
    //         |
!   //   MemBarVolatile (trailing)
    //
    // 3)
    //   MemBarRelease/CPUOrder (leading)
    //    |
    //    |\
--- 2187,2198 ----
    //        \ /
    //        Phi  . . .
    //     Bot |   /
    //       MergeMem
    //         |
!   //    MemBarVolatile {trailing}
!   //
    //
    // 3)
    //   MemBarRelease/CPUOrder (leading)
    //    |
    //    |\
*** 1903,1913 ****
    //        \ /
    //        Phi  . . .
    //     Bot |   /
    //       MergeMem
    //         |
!   //   MemBarVolatile (trailing)
    //
    // configuration 1 is only valid if UseConcMarkSweepGC &&
    // UseCondCardMark
    //
    // configurations 2 and 3 are only valid if UseG1GC.
--- 2209,2220 ----
    //        \ /
    //        Phi  . . .
    //     Bot |   /
    //       MergeMem
    //         |
!   //         |
!   //    MemBarVolatile {trailing}
    //
    // configuration 1 is only valid if UseConcMarkSweepGC &&
    // UseCondCardMark
    //
    // configurations 2 and 3 are only valid if UseG1GC.
*** 1967,1977 ****
  
      // sanity check this feed turns up as the expected slice
      assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
  
      MemBarNode *trailing = NULL;
!     // be sure we have a volatile membar below the merge
      for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
        x = mm->fast_out(i);
        if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
          trailing = x->as_MemBar();
          break;
--- 2274,2284 ----
  
      // sanity check this feed turns up as the expected slice
      assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
  
      MemBarNode *trailing = NULL;
!     // be sure we have a trailing membar below the merge
      for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
        x = mm->fast_out(i);
        if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
          trailing = x->as_MemBar();
          break;
*** 1982,2007 ****
    }
  
    // trailing_to_card_mark
    //
    // graph traversal helper which detects extra, non-normal Mem feed
!   // from a trailing membar to a preceding card mark volatile membar
!   // i.e. it identifies whether one of the three possible extra GC
!   // post-write Mem flow subgraphs is present
    //
    // this predicate checks for the same flow as the previous predicate
    // but starting from the bottom rather than the top.
    //
!   // if the configurationis present returns the card mark membar
    // otherwise NULL
  
    MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
    {
!     assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
  
-     Node *x = trailing->in(TypeFunc::Memory);
      // the Mem feed to the membar should be a merge
      if (!x->is_MergeMem())
        return NULL;
  
      MergeMemNode *mm = x->as_MergeMem();
  
--- 2289,2321 ----
    }
  
    // trailing_to_card_mark
    //
    // graph traversal helper which detects extra, non-normal Mem feed
!   // from a trailing volatile membar to a preceding card mark volatile
!   // membar i.e. it identifies whether one of the three possible extra
!   // GC post-write Mem flow subgraphs is present
    //
    // this predicate checks for the same flow as the previous predicate
    // but starting from the bottom rather than the top.
    //
!   // if the configuration is present returns the card mark membar
    // otherwise NULL
+   //
+   // n.b. the supplied membar is expected to be a trailing
+   // MemBarVolatile i.e. the caller must ensure the input node has the
+   // correct opcode
  
    MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
    {
!     assert(trailing->Opcode() == Op_MemBarVolatile,
!            "expecting a volatile membar");
!     assert(!is_card_mark_membar(trailing),
!            "not expecting a card mark membar");
  
      // the Mem feed to the membar should be a merge
+     Node *x = trailing->in(TypeFunc::Memory);
      if (!x->is_MergeMem())
        return NULL;
  
      MergeMemNode *mm = x->as_MergeMem();
  
*** 2066,2076 ****
    }
  
    // trailing_to_leading
    //
    // graph traversal helper which checks the Mem flow up the graph
!   // from a (non-card mark) volatile membar attempting to locate and
    // return an associated leading membar. it first looks for a
    // subgraph in the normal configuration (relying on helper
    // normal_to_leading). failing that it then looks for one of the
    // possible post-write card mark subgraphs linking the trailing node
    // to a the card mark membar (relying on helper
--- 2380,2390 ----
    }
  
    // trailing_to_leading
    //
    // graph traversal helper which checks the Mem flow up the graph
!   // from a (non-card mark) trailing membar attempting to locate and
    // return an associated leading membar. it first looks for a
    // subgraph in the normal configuration (relying on helper
    // normal_to_leading). failing that it then looks for one of the
    // possible post-write card mark subgraphs linking the trailing node
    // to a the card mark membar (relying on helper
*** 2079,2100 ****
    // predicate normal_to_leading).
    //
    // if the configuration is valid returns the cpuorder member for
    // preference or when absent the release membar otherwise NULL.
    //
!   // n.b. the input membar is expected to be a volatile membar but
!   // must *not* be a card mark membar.
  
    MemBarNode *trailing_to_leading(const MemBarNode *trailing)
    {
!     assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
  
      MemBarNode *leading = normal_to_leading(trailing);
  
      if (leading)
        return leading;
  
      MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
  
      if (!card_mark_membar)
        return NULL;
  
--- 2393,2424 ----
    // predicate normal_to_leading).
    //
    // if the configuration is valid returns the cpuorder member for
    // preference or when absent the release membar otherwise NULL.
    //
!   // n.b. the input membar is expected to be either a volatile or
!   // acquire membar but in the former case must *not* be a card mark
!   // membar.
  
    MemBarNode *trailing_to_leading(const MemBarNode *trailing)
    {
!     assert((trailing->Opcode() == Op_MemBarAcquire ||
!             trailing->Opcode() == Op_MemBarVolatile),
!            "expecting an acquire or volatile membar");
!     assert((trailing->Opcode() != Op_MemBarVolatile ||
!             !is_card_mark_membar(trailing)),
!            "not expecting a card mark membar");
  
      MemBarNode *leading = normal_to_leading(trailing);
  
      if (leading)
        return leading;
  
+     // nothing more to do if this is an acquire
+     if (trailing->Opcode() == Op_MemBarAcquire)
+       return NULL;
+ 
      MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
  
      if (!card_mark_membar)
        return NULL;
  
*** 2103,2113 ****
  
    // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
  
  bool unnecessary_acquire(const Node *barrier)
  {
!   // assert barrier->is_MemBar();
    if (UseBarriersForVolatile)
      // we need to plant a dmb
      return false;
  
    // a volatile read derived from bytecode (or also from an inlined
--- 2427,2438 ----
  
    // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
  
  bool unnecessary_acquire(const Node *barrier)
  {
!   assert(barrier->is_MemBar(), "expecting a membar");
! 
    if (UseBarriersForVolatile)
      // we need to plant a dmb
      return false;
  
    // a volatile read derived from bytecode (or also from an inlined
*** 2178,2211 ****
        ld = x->as_Load();
        break;
      }
    }
    // it must be an acquiring load
!   if (! ld || ! ld->is_acquire())
!     return false;
    for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
      x = mem->fast_out(i);
      // if we see the same load we drop it and stop searching
      if (x == ld) {
        ld = NULL;
        break;
      }
    }
    // we must have dropped the load
!   if (ld)
!     return false;
    // check for a child cpuorder membar
    MemBarNode *child  = child_membar(barrier->as_MemBar());
!   if (!child || child->Opcode() != Op_MemBarCPUOrder)
!     return false;
! 
    return true;
  }
  
  bool needs_acquiring_load(const Node *n)
  {
!   // assert n->is_Load();
    if (UseBarriersForVolatile)
      // we use a normal load and a dmb
      return false;
  
    LoadNode *ld = n->as_Load();
--- 2503,2542 ----
        ld = x->as_Load();
        break;
      }
    }
    // it must be an acquiring load
!   if (ld && ld->is_acquire()) {
! 
      for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
        x = mem->fast_out(i);
        // if we see the same load we drop it and stop searching
        if (x == ld) {
          ld = NULL;
          break;
        }
      }
      // we must have dropped the load
!     if (ld == NULL) {
        // check for a child cpuorder membar
        MemBarNode *child  = child_membar(barrier->as_MemBar());
!       if (child && child->Opcode() == Op_MemBarCPUOrder)
          return true;
+     }
+   }
+ 
+   // final option for unnecessary membar is that it is a trailing node
+   // belonging to a CAS
+ 
+   MemBarNode *leading = trailing_to_leading(barrier->as_MemBar());
+ 
+   return leading != NULL;
  }
  
  bool needs_acquiring_load(const Node *n)
  {
!   assert(n->is_Load(), "expecting a load");
    if (UseBarriersForVolatile)
      // we use a normal load and a dmb
      return false;
  
    LoadNode *ld = n->as_Load();
*** 2388,2397 ****
--- 2719,2778 ----
    // we found a card mark -- just make sure we have a trailing barrier
  
    return (card_mark_to_trailing(mbvol) != NULL);
  }
  
+ // predicate controlling translation of CAS
+ //
+ // returns true if CAS needs to use an acquiring load otherwise false
+ 
+ bool needs_acquiring_load_exclusive(const Node *n)
+ {
+   assert(is_CAS(n->Opcode()), "expecting a compare and swap");
+   if (UseBarriersForVolatile)
+     return false; // barriers are retained so the acquiring variant is not needed
+ 
+   // CAS nodes only ought to turn up in inlined unsafe CAS operations
+ #ifndef PRODUCT
+ #ifdef ASSERT
+   LoadStoreNode *st = n->as_LoadStore();
+ 
+   // the CAS must be fed Mem by a membar via a memory projection
+ 
+   Node *x = st->lookup(StoreNode::Memory);
+ 
+   assert (x && x->is_Proj(), "CAS not fed by memory proj!");
+ 
+   ProjNode *proj = x->as_Proj();
+ 
+   x = proj->lookup(0);
+ 
+   assert (x && x->is_MemBar(), "CAS not fed by membar!");
+ 
+   MemBarNode *barrier = x->as_MemBar();
+ 
+   // the barrier must be a cpuorder membar fed by a release membar
+ 
+   assert(barrier->Opcode() == Op_MemBarCPUOrder,
+          "CAS not fed by cpuorder membar!");
+       
+   MemBarNode *b = parent_membar(barrier);
+   assert ((b != NULL && b->Opcode() == Op_MemBarRelease),
+           "CAS not fed by cpuorder+release membar pair!");
+ 
+   // does this lead a normal subgraph ending in an acquire membar?
+   MemBarNode *mbar = leading_to_normal(barrier);
+ 
+   assert(mbar != NULL, "CAS not embedded in normal graph!");
+ 
+   assert(mbar->Opcode() == Op_MemBarAcquire, "trailing membar should be an acquire");
+ #endif // ASSERT
+ #endif // !PRODUCT
+   // the checks above are debug-only so we can just return true here
+   return true;
+ }
+ 
  // predicate controlling translation of StoreCM
  //
  // returns true if a StoreStore must precede the card write otherwise
  // false
  
*** 3877,3886 ****
--- 4258,4351 ----
      __ stlxrw(rscratch1, new_reg, addr_reg);
      __ cbnzw(rscratch1, retry_load);
      __ bind(done);
    %}
  
+   // variant of cmpxchg employing an acquiring load (ldaxr) which is used
+   // by CompareAndSwap{L,P} (the Acq rules) when we are eliding barriers
+ 
+   enc_class aarch64_enc_cmpxchg_acq(memory mem, iRegLNoSp oldval, iRegLNoSp newval) %{
+     MacroAssembler _masm(&cbuf);
+     Register old_reg = as_Register($oldval$$reg);
+     Register new_reg = as_Register($newval$$reg);
+     Register base = as_Register($mem$$base);
+     Register addr_reg;
+     int index = $mem$$index;
+     int scale = $mem$$scale;
+     int disp = $mem$$disp;
+     if (index == -1) { // no index register
+        if (disp != 0) {
+         __ lea(rscratch2, Address(base, disp));
+         addr_reg = rscratch2;
+       } else {
+         // TODO
+         // should we ever get anything other than this case?
+         addr_reg = base;
+       }
+     } else {
+       Register index_reg = as_Register(index);
+       if (disp == 0) {
+         __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
+         addr_reg = rscratch2;
+       } else {
+         __ lea(rscratch2, Address(base, disp));
+         __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
+         addr_reg = rscratch2;
+       }
+     }
+     Label retry_load, done;
+     __ bind(retry_load);
+     __ ldaxr(rscratch1, addr_reg); // acquiring exclusive load of the current value
+     __ cmp(rscratch1, old_reg);
+     __ br(Assembler::NE, done); // current value != oldval so skip the store
+     __ stlxr(rscratch1, new_reg, addr_reg);
+     __ cbnzw(rscratch1, retry_load); // retry if the store exclusive did not succeed
+     __ bind(done);
+   %}
+ 
+   // variant of cmpxchgw employing an acquiring load (ldaxrw) which is used
+   // by CompareAndSwap{I,N} (the Acq rules) when we are eliding barriers
+ 
+   enc_class aarch64_enc_cmpxchgw_acq(memory mem, iRegINoSp oldval, iRegINoSp newval) %{
+     MacroAssembler _masm(&cbuf);
+     Register old_reg = as_Register($oldval$$reg);
+     Register new_reg = as_Register($newval$$reg);
+     Register base = as_Register($mem$$base);
+     Register addr_reg;
+     int index = $mem$$index;
+     int scale = $mem$$scale;
+     int disp = $mem$$disp;
+     if (index == -1) { // no index register
+        if (disp != 0) {
+         __ lea(rscratch2, Address(base, disp));
+         addr_reg = rscratch2;
+       } else {
+         // TODO
+         // should we ever get anything other than this case?
+         addr_reg = base;
+       }
+     } else {
+       Register index_reg = as_Register(index);
+       if (disp == 0) {
+         __ lea(rscratch2, Address(base, index_reg, Address::lsl(scale)));
+         addr_reg = rscratch2;
+       } else {
+         __ lea(rscratch2, Address(base, disp));
+         __ lea(rscratch2, Address(rscratch2, index_reg, Address::lsl(scale)));
+         addr_reg = rscratch2;
+       }
+     }
+     Label retry_load, done;
+     __ bind(retry_load);
+     __ ldaxrw(rscratch1, addr_reg); // acquiring exclusive load of the current value
+     __ cmpw(rscratch1, old_reg);
+     __ br(Assembler::NE, done); // current value != oldval so skip the store
+     __ stlxrw(rscratch1, new_reg, addr_reg);
+     __ cbnzw(rscratch1, retry_load); // retry if the store exclusive did not succeed
+     __ bind(done);
+   %}
+ 
    // auxiliary used for CompareAndSwapX to set result register
    enc_class aarch64_enc_cset_eq(iRegINoSp res) %{
      MacroAssembler _masm(&cbuf);
      Register res_reg = as_Register($res$$reg);
      __ cset(res_reg, Assembler::EQ);
*** 8404,8416 ****
--- 8869,8885 ----
  %}
  
  // XXX No flag versions for CompareAndSwap{I,L,P,N} because matcher
  // can't match them
  
+ // standard CompareAndSwapX when we are using barriers
+ 
  instruct compareAndSwapI(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{
  
+   predicate(!needs_acquiring_load_exclusive(n));
    match(Set res (CompareAndSwapI mem (Binary oldval newval)));
+   ins_cost(VOLATILE_REF_COST);
  
    effect(KILL cr);
  
   format %{
      "cmpxchgw $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
*** 8423,8433 ****
--- 8892,8904 ----
    ins_pipe(pipe_slow);
  %}
  
  instruct compareAndSwapL(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
  
+   predicate(!needs_acquiring_load_exclusive(n));
    match(Set res (CompareAndSwapL mem (Binary oldval newval)));
+   ins_cost(VOLATILE_REF_COST);
  
    effect(KILL cr);
  
   format %{
      "cmpxchg $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
*** 8440,8450 ****
--- 8911,8923 ----
    ins_pipe(pipe_slow);
  %}
  
  instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
  
+   predicate(!needs_acquiring_load_exclusive(n));
    match(Set res (CompareAndSwapP mem (Binary oldval newval)));
+   ins_cost(VOLATILE_REF_COST);
  
    effect(KILL cr);
  
   format %{
      "cmpxchg $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
*** 8457,8467 ****
--- 8930,8942 ----
    ins_pipe(pipe_slow);
  %}
  
  instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
  
+   predicate(!needs_acquiring_load_exclusive(n));
    match(Set res (CompareAndSwapN mem (Binary oldval newval)));
+   ins_cost(VOLATILE_REF_COST);
  
    effect(KILL cr);
  
   format %{
      "cmpxchgw $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
*** 8472,8481 ****
--- 8947,9030 ----
              aarch64_enc_cset_eq(res));
  
    ins_pipe(pipe_slow);
  %}
  
+ // alternative CompareAndSwapX when we are eliding barriers
+ 
+ instruct compareAndSwapIAcq(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{
+ 
+   match(Set res (CompareAndSwapI mem (Binary oldval newval)));
+   ins_cost(2 * VOLATILE_REF_COST); // twice the cost of the predicated rule compareAndSwapI
+ 
+   effect(KILL cr); // the cmpxchgw_acq encoding compares and so overwrites the flags
+ 
+  format %{
+     "cmpxchgw_acq $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
+     "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+  %}
+ 
+  ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval),
+             aarch64_enc_cset_eq(res));
+ 
+   ins_pipe(pipe_slow);
+ %}
+ 
+ instruct compareAndSwapLAcq(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
+ 
+   match(Set res (CompareAndSwapL mem (Binary oldval newval)));
+   ins_cost(2 * VOLATILE_REF_COST); // twice the cost of the predicated rule compareAndSwapL
+ 
+   effect(KILL cr); // the cmpxchg_acq encoding compares and so overwrites the flags
+ 
+  format %{
+     "cmpxchg_acq $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
+     "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+  %}
+ 
+  ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval),
+             aarch64_enc_cset_eq(res));
+ 
+   ins_pipe(pipe_slow);
+ %}
+ 
+ instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
+ 
+   match(Set res (CompareAndSwapP mem (Binary oldval newval)));
+   ins_cost(2 * VOLATILE_REF_COST); // twice the cost of the predicated rule compareAndSwapP
+ 
+   effect(KILL cr); // the cmpxchg_acq encoding compares and so overwrites the flags
+ 
+  format %{
+     "cmpxchg_acq $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
+     "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+  %}
+ 
+  ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval),
+             aarch64_enc_cset_eq(res));
+ 
+   ins_pipe(pipe_slow);
+ %}
+ 
+ instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
+ 
+   match(Set res (CompareAndSwapN mem (Binary oldval newval)));
+   ins_cost(2 * VOLATILE_REF_COST); // twice the cost of the predicated rule compareAndSwapN
+ 
+   effect(KILL cr); // the cmpxchgw_acq encoding compares and so overwrites the flags
+ 
+  format %{
+     "cmpxchgw_acq $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
+     "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+  %}
+ 
+  ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval),
+             aarch64_enc_cset_eq(res));
+ 
+   ins_pipe(pipe_slow);
+ %}
+ 
  
  instruct get_and_setI(indirect mem, iRegINoSp newv, iRegI prev) %{
    match(Set prev (GetAndSetI mem newv));
    format %{ "atomic_xchgw  $prev, $newv, [$mem]" %}
    ins_encode %{

< prev index next >