< prev index next >

src/cpu/aarch64/vm/aarch64.ad

Print this page
rev 8967 : 8080293: AARCH64: Remove unnecessary dmbs from generated CAS code
Summary: The current encoding for CAS generates unnecessary leading and trailing dmbs for the MemBarAcquire and MemBarRelease which ought to be elided
Reviewed-by: kvn

@@ -1037,10 +1037,11 @@
   MemBarNode *parent_membar(const Node *n);
   MemBarNode *child_membar(const MemBarNode *n);
   bool leading_membar(const MemBarNode *barrier);
 
   bool is_card_mark_membar(const MemBarNode *barrier);
+  bool is_CAS(int opcode);
 
   MemBarNode *leading_to_normal(MemBarNode *leading);
   MemBarNode *normal_to_leading(const MemBarNode *barrier);
   MemBarNode *card_mark_to_trailing(const MemBarNode *barrier);
   MemBarNode *trailing_to_card_mark(const MemBarNode *trailing);

@@ -1055,10 +1056,13 @@
 
   bool unnecessary_release(const Node *barrier);
   bool unnecessary_volatile(const Node *barrier);
   bool needs_releasing_store(const Node *store);
 
+  // predicate controlling translation of CompareAndSwapX
+  bool needs_acquiring_load_exclusive(const Node *load);
+
   // predicate controlling translation of StoreCM
   bool unnecessary_storestore(const Node *storecm);
 %}
 
 source %{

@@ -1086,19 +1090,62 @@
   //
   //   dmb ish
   //   str<x>
   //   dmb ish
   //
+  // We can also use ldaxr and stlxr to implement compare and swap CAS
+  // sequences. These are normally translated to an instruction
+  // sequence like the following
+  //
+  //   dmb      ish
+  // retry:
+  //   ldxr<x>   rval raddr
+  //   cmp       rval rold
+  //   b.ne done
+  //   stlxr<x>  rval, rnew, rold
+  //   cbnz      rval retry
+  // done:
+  //   cset      r0, eq
+  //   dmb ishld
+  //
+  // Note that the exclusive store is already using an stlxr
+  // instruction. That is required to ensure visibility to other
+  // threads of the exclusive write (assuming it succeeds) before that
+  // of any subsequent writes.
+  //
+  // The following instruction sequence is an improvement on the above
+  //
+  // retry:
+  //   ldaxr<x>  rval raddr
+  //   cmp       rval rold
+  //   b.ne done
+  //   stlxr<x>  rval, rnew, rold
+  //   cbnz      rval retry
+  // done:
+  //   cset      r0, eq
+  //
+  // We don't need the leading dmb ish since the stlxr guarantees
+  // visibility of prior writes in the case that the swap is
+  // successful. Crucially we don't have to worry about the case where
+  // the swap is not successful since no valid program should be
+  // relying on visibility of prior changes by the attempting thread
+  // in the case where the CAS fails.
+  //
+  // Similarly, we don't need the trailing dmb ishld if we substitute
+  // an ldaxr instruction since that will provide all the guarantees we
+  // require regarding observation of changes made by other threads
+  // before any change to the CAS address observed by the load.
+  //
   // In order to generate the desired instruction sequence we need to
   // be able to identify specific 'signature' ideal graph node
   // sequences which i) occur as a translation of volatile reads or
-  // writes and ii) do not occur through any other translation or
-  // graph transformation. We can then provide alternative aldc
-  // matching rules which translate these node sequences to the
-  // desired machine code sequences. Selection of the alternative
-  // rules can be implemented by predicates which identify the
-  // relevant node sequences.
+  // writes or CAS operations and ii) do not occur through any other
+  // translation or graph transformation. We can then provide
+  // alternative adlc matching rules which translate these node
+  // sequences to the desired machine code sequences. Selection of the
+  // alternative rules can be implemented by predicates which identify
+  // the relevant node sequences.
   //
   // The ideal graph generator translates a volatile read to the node
   // sequence
   //
   //   LoadX[mo_acquire]

@@ -1161,10 +1208,19 @@
   // sequence of membar nodes. Similarly, given an acquire membar we
   // can know that it was added because of an inlined unsafe volatile
   // get if it is fed and feeds a cpuorder membar and if its feed
   // membar also feeds an acquiring load.
   //
+  // Finally an inlined (Unsafe) CAS operation is translated to the
+  // following ideal graph
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //   CompareAndSwapX {CardMark}-optional
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
   // So, where we can identify these volatile read and write
   // signatures we can choose to plant either of the above two code
   // sequences. For a volatile read we can simply plant a normal
   // ldr<x> and translate the MemBarAcquire to a dmb. However, we can
   // also choose to inhibit translation of the MemBarAcquire and

@@ -1175,10 +1231,18 @@
   // normal str<x> and then a dmb ish for the MemBarVolatile.
   // Alternatively, we can inhibit translation of the MemBarRelease
   // and MemBarVolatile and instead plant a simple stlr<x>
   // instruction.
   //
+  // when we recognise a CAS signature we can choose to plant a dmb
+  // ish as a translation for the MemBarRelease, the conventional
+  // macro-instruction sequence for the CompareAndSwap node (which
+  // uses ldxr<x>) and then a dmb ishld for the MemBarAcquire.
+  // Alternatively, we can elide generation of the dmb instructions
+  // and plant the alternative CompareAndSwap macro-instruction
+  // sequence (which uses ldaxr<x>).
+  // 
   // Of course, the above only applies when we see these signature
   // configurations. We still want to plant dmb instructions in any
   // other cases where we may see a MemBarAcquire, MemBarRelease or
   // MemBarVolatile. For example, at the end of a constructor which
   // writes final/volatile fields we will see a MemBarRelease

@@ -1192,11 +1256,12 @@
   // always just translate the loads and stores to ldr<x> and str<x>
   // and translate acquire, release and volatile membars to the
   // relevant dmb instructions.
   //
 
-  // graph traversal helpers used for volatile put/get optimization
+  // graph traversal helpers used for volatile put/get and CAS
+  // optimization
 
   // 1) general purpose helpers
 
   // if node n is linked to a parent MemBarNode by an intervening
   // Control and Memory ProjNode return the MemBarNode otherwise return

@@ -1218,20 +1283,23 @@
       mem = n->lookup(TypeFunc::Memory);
     } else {
         return NULL;
     }
 
-    if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj())
+    if (!ctl || !mem || !ctl->is_Proj() || !mem->is_Proj()) {
       return NULL;
+    }
 
     membar = ctl->lookup(0);
 
-    if (!membar || !membar->is_MemBar())
+    if (!membar || !membar->is_MemBar()) {
       return NULL;
+    }
 
-    if (mem->lookup(0) != membar)
+    if (mem->lookup(0) != membar) {
       return NULL;
+    }
 
     return membar->as_MemBar();
   }
 
   // if n is linked to a child MemBarNode by intervening Control and

@@ -1257,12 +1325,13 @@
           child = x->as_MemBar();
           break;
       }
     }
 
-    if (child == NULL)
+    if (child == NULL) {
       return NULL;
+    }
 
     for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
       x = mem->fast_out(i);
       // if we see a membar we keep hold of it. we may also see a new
       // arena copy of the original but it will appear later

@@ -1281,19 +1350,22 @@
 
   bool leading_membar(const MemBarNode *barrier)
   {
     int opcode = barrier->Opcode();
     // if this is a release membar we are ok
-    if (opcode == Op_MemBarRelease)
+    if (opcode == Op_MemBarRelease) {
       return true;
+    }
     // if its a cpuorder membar . . .
-    if (opcode != Op_MemBarCPUOrder)
+    if (opcode != Op_MemBarCPUOrder) {
       return false;
+    }
     // then the parent has to be a release membar
     MemBarNode *parent = parent_membar(barrier);
-    if (!parent)
+    if (!parent) {
       return false;
+    }
     opcode = parent->Opcode();
     return opcode == Op_MemBarRelease;
   }
  
   // 2) card mark detection helper

@@ -1312,15 +1384,17 @@
   //
   // iii) the node's Mem projection feeds a StoreCM node.
   
   bool is_card_mark_membar(const MemBarNode *barrier)
   {
-    if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark))
+    if (!UseG1GC && !(UseConcMarkSweepGC && UseCondCardMark)) {
       return false;
+    }
 
-    if (barrier->Opcode() != Op_MemBarVolatile)
+    if (barrier->Opcode() != Op_MemBarVolatile) {
       return false;
+    }
 
     ProjNode *mem = barrier->proj_out(TypeFunc::Memory);
 
     for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax ; i++) {
       Node *y = mem->fast_out(i);

@@ -1331,12 +1405,12 @@
   
     return false;
   }
 
 
-  // 3) helper predicates to traverse volatile put graphs which may
-  // contain GC barrier subgraphs
+  // 3) helper predicates to traverse volatile put or CAS graphs which
+  // may contain GC barrier subgraphs
 
   // Preamble
   // --------
   //
   // for volatile writes we can omit generating barriers and employ a

@@ -1402,12 +1476,11 @@
   //
   // It is also possible to perform the card mark conditionally on it
   // currently being unmarked in which case the volatile put graph
   // will look slightly different
   //
-  //   MemBarRelease
-  //   MemBarCPUOrder___________________________________________
+  //   MemBarRelease____________________________________________
   //         ||    \\               Ctl \     Ctl \     \\  Mem \
   //         ||    StoreN/P[mo_release] CastP2X   If   LoadB     |
   //         | \     /                              \            |
   //         | MergeMem                            . . .      StoreB
   //         | /                                                /

@@ -1417,11 +1490,11 @@
   // It is worth noting at this stage that both the above
   // configurations can be uniquely identified by checking that the
   // memory flow includes the following subgraph:
   //
   //   MemBarRelease
-  //   MemBarCPUOrder
+  //  {MemBarCPUOrder}
   //          |  \      . . .
   //          |  StoreX[mo_release]  . . .
   //          |   /
   //         MergeMem
   //          |

@@ -1429,12 +1502,52 @@
   //
   // This is referred to as a *normal* subgraph. It can easily be
   // detected starting from any candidate MemBarRelease,
   // StoreX[mo_release] or MemBarVolatile.
   //
+  // A simple variation on this normal case occurs for an unsafe CAS
+  // operation. The basic graph for a non-object CAS is
+  //
+  //   MemBarRelease
+  //         ||
+  //   MemBarCPUOrder
+  //         ||     \\   . . .
+  //         ||     CompareAndSwapX
+  //         ||       |
+  //         ||     SCMemProj
+  //         | \     /
+  //         | MergeMem
+  //         | /
+  //   MemBarCPUOrder
+  //         ||
+  //   MemBarAcquire
+  //
+  // The same basic variations on this arrangement (mutatis mutandis)
+  // occur when a card mark is introduced. i.e. we see the same basic
+  // shape but the StoreP/N is replaced with CompareAndSwapP/N and the
+  // tail of the graph is a pair comprising a MemBarCPUOrder +
+  // MemBarAcquire.
+  //
+  // So, in the case of a CAS the normal graph has the variant form
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder
+  //          |   \      . . .
+  //          |  CompareAndSwapX  . . .
+  //          |    |
+  //          |   SCMemProj
+  //          |   /  . . .
+  //         MergeMem
+  //          |
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
+  // This graph can also easily be detected starting from any
+  // candidate MemBarRelease, CompareAndSwapX or MemBarAcquire.
+  //
   // the code below uses two helper predicates, leading_to_normal and
-  // normal_to_leading to identify this configuration, one validating
+  // normal_to_leading to identify these normal graphs, one validating
   // the layout starting from the top membar and searching down and
   // the other validating the layout starting from the lower membar
   // and searching up.
   //
   // There are two special case GC configurations when a normal graph

@@ -1448,11 +1561,13 @@
   // employs a post-write GC barrier while G1 employs both a pre- and
   // post-write GC barrier. Of course the extra nodes may be absent --
   // they are only inserted for object puts. This significantly
   // complicates the task of identifying whether a MemBarRelease,
   // StoreX[mo_release] or MemBarVolatile forms part of a volatile put
-  // when using these GC configurations (see below).
+  // when using these GC configurations (see below). It adds similar
+  // complexity to the task of identifying whether a MemBarRelease,
+  // CompareAndSwapX or MemBarAcquire forms part of a CAS.
   //
   // In both cases the post-write subtree includes an auxiliary
   // MemBarVolatile (StoreLoad barrier) separating the object put and
   // the read of the corresponding card. This poses two additional
   // problems.

@@ -1487,11 +1602,12 @@
   // A CMS GC post-barrier wraps its card write (StoreCM) inside an If
   // which selects conditional execution based on the value loaded
   // (LoadB) from the card. Ctl and Mem are fed to the If via an
   // intervening StoreLoad barrier (MemBarVolatile).
   //
-  // So, with CMS we may see a node graph which looks like this
+  // So, with CMS we may see a node graph for a volatile object store
+  // which looks like this
   //
   //   MemBarRelease
   //   MemBarCPUOrder_(leading)__________________
   //     C |    M \       \\                   C \
   //       |       \    StoreN/P[mo_release]  CastP2X

@@ -1522,10 +1638,59 @@
   // card mark membar. The trailing MergeMem merges the AliasIdxBot
   // Mem slice from the card mark membar and the AliasIdxRaw slice
   // from the StoreCM into the trailing membar (n.b. the latter
   // proceeds via a Phi associated with the If region).
   //
+  // The graph for a CAS varies slightly, the obvious difference being
+  // that the StoreN/P node is replaced by a CompareAndSwapP/N node
+  // and the trailing MemBarVolatile by a MemBarCPUOrder +
+  // MemBarAcquire pair. The other important difference is that the
+  // CompareAndSwap node's SCMemProj is not merged into the card mark
+  // membar - it still feeds the trailing MergeMem. This also means
+  // that the card mark membar receives its Mem feed directly from the
+  // leading membar rather than via a MergeMem.
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder__(leading)_________________________
+  //       ||                       \\                 C \
+  //   MemBarVolatile (card mark)  CompareAndSwapN/P  CastP2X
+  //     C |  ||    M |              |
+  //       | LoadB    |       ______/|
+  //       |   |      |      /       |
+  //       | Cmp      |     /      SCMemProj
+  //       | /        |    /         |
+  //       If         |   /         /
+  //       | \        |  /         /
+  // IfFalse  IfTrue  | /         /
+  //       \     / \  |/ prec    /
+  //        \   / StoreCM       /
+  //         \ /      |        /
+  //        Region   . . .    /
+  //          | \            /
+  //          |  . . .  \   / Bot
+  //          |       MergeMem
+  //          |          |
+  //        MemBarCPUOrder
+  //        MemBarAcquire (trailing)
+  //
+  // This has a slightly different memory subgraph to the one seen
+  // previously but the core of it is the same as for the CAS normal
+  // subgraph
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder____
+  //      ||             \      . . .
+  //   MemBarVolatile  CompareAndSwapX  . . .
+  //      |  \            |
+  //        . . .   SCMemProj
+  //          |     /  . . .
+  //         MergeMem
+  //          |
+  //   MemBarCPUOrder
+  //   MemBarAcquire
+  //
+  //
   // G1 is quite a lot more complicated. The nodes inserted on behalf
   // of G1 may comprise: a pre-write graph which adds the old value to
   // the SATB queue; the releasing store itself; and, finally, a
   // post-write graph which performs a card mark.
   //

@@ -1573,16 +1738,20 @@
   //         MemBarVolatile (trailing)
   //
   // n.b. the LoadB in this subgraph is not the card read -- it's a
   // read of the SATB queue active flag.
   //
+  // Once again the CAS graph is a minor variant on the above with the
+  // expected substitutions of CompareAndSwapX for StoreN/P and
+  // MemBarCPUOrder + MemBarAcquire for trailing MemBarVolatile.
+  //
   // The G1 post-write subtree is also optional, this time when the
   // new value being written is either null or can be identified as a
   // newly allocated (young gen) object with no intervening control
   // flow. The latter cannot happen but the former may, in which case
-  // the card mark membar is omitted and the memory feeds from the
-  // leading membar and the StoreN/P are merged direct into the
+  // the card mark membar is omitted and the memory feeds from the
+  // leading membar and the StoreN/P are merged direct into the
   // trailing membar as per the normal subgraph. So, the only special
   // case which arises is when the post-write subgraph is generated.
   //
   // The kernel of the post-write G1 subgraph is the card mark itself
   // which includes a card mark memory barrier (MemBarVolatile), a

@@ -1666,126 +1835,224 @@
   // membar. Each Phi corresponds to one of the Ifs which may skip
   // around the card mark membar. So when the If implementing the NULL
   // value check has been elided the total number of Phis is 2
   // otherwise it is 3.
   //
+  // The CAS graph when using G1GC also includes a pre-write subgraph
+  // and an optional post-write subgraph. The same variations are
+  // introduced as for CMS with conditional card marking i.e. the
+  // StoreP/N is swapped for a CompareAndSwapP/N, the trailing
+  // MemBarVolatile for a MemBarCPUOrder + MemBarAcquire pair and the
+  // Mem feed from the CompareAndSwapP/N includes a precedence
+  // dependency feed to the StoreCM and a feed via an SCMemProj to the
+  // trailing membar. So, as before the configuration includes the
+  // normal CAS graph as a subgraph of the memory flow.
+  //
   // So, the upshot is that in all cases the volatile put graph will
   // include a *normal* memory subgraph between the leading membar and
-  // its child membar. When that child is not a card mark membar then
-  // it marks the end of a volatile put subgraph. If the child is a
-  // card mark membar then the normal subgraph will form part of a
-  // volatile put subgraph if and only if the child feeds an
-  // AliasIdxBot Mem feed to a trailing barrier via a MergeMem. That
-  // feed is either direct (for CMS) or via 2 or 3 Phi nodes merging
-  // the leading barrier memory flow (for G1).
+  // its child membar, either a volatile put graph (including a
+  // releasing StoreX) or a CAS graph (including a CompareAndSwapX).
+  // When that child is not a card mark membar then it marks the end
+  // of the volatile put or CAS subgraph. If the child is a card mark
+  // membar then the normal subgraph will form part of a volatile put
+  // subgraph if and only if the child feeds an AliasIdxBot Mem feed
+  // to a trailing barrier via a MergeMem. That feed is either direct
+  // (for CMS) or via 2 or 3 Phi nodes merging the leading barrier
+  // memory flow (for G1).
   // 
   // The predicates controlling generation of instructions for store
   // and barrier nodes employ a few simple helper functions (described
-  // below) which identify the presence or absence of these subgraph
-  // configurations and provide a means of traversing from one node in
-  // the subgraph to another.
+  // below) which identify the presence or absence of all these
+  // subgraph configurations and provide a means of traversing from
+  // one node in the subgraph to another.
+
+  // is_CAS(int opcode)
+  //
+  // return true if opcode is one of the possible CompareAndSwapX
+  // values otherwise false.
+
+  bool is_CAS(int opcode)
+  {
+    return (opcode == Op_CompareAndSwapI ||
+            opcode == Op_CompareAndSwapL ||
+            opcode == Op_CompareAndSwapN ||
+            opcode == Op_CompareAndSwapP);
+  }
 
   // leading_to_normal
   //
-  //graph traversal helper which detects the normal case Mem feed
-  // from a release membar (or, optionally, its cpuorder child) to a
-  // dependent volatile membar i.e. it ensures that the following Mem
-  // flow subgraph is present.
+  // graph traversal helper which detects the normal case Mem feed from
+  // a release membar (or, optionally, its cpuorder child) to a
+  // dependent volatile membar i.e. it ensures that one or other of
+  // the following Mem flow subgraphs is present.
   //
   //   MemBarRelease
-  //   MemBarCPUOrder
+  //   MemBarCPUOrder {leading}
   //          |  \      . . .
   //          |  StoreN/P[mo_release]  . . .
   //          |   /
   //         MergeMem
   //          |
-  //   MemBarVolatile
+  //   MemBarVolatile {trailing or card mark}
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //      |       \      . . .
+  //      |     CompareAndSwapX  . . .
+  //               |
+  //     . . .    SCMemProj
+  //           \   |
+  //      |    MergeMem
+  //      |       /
+  //    MemBarCPUOrder
+  //    MemBarAcquire {trailing}
   //
-  // if the correct configuration is present returns the volatile
+  // if the correct configuration is present returns the trailing
   // membar otherwise NULL.
   //
   // the input membar is expected to be either a cpuorder membar or a
   // release membar. in the latter case it should not have a cpu membar
   // child.
   //
-  // the returned membar may be a card mark membar rather than a
-  // trailing membar.
+  // the returned value may be a card mark or trailing membar
+  //
 
   MemBarNode *leading_to_normal(MemBarNode *leading)
   {
     assert((leading->Opcode() == Op_MemBarRelease ||
             leading->Opcode() == Op_MemBarCPUOrder),
            "expecting a volatile or cpuroder membar!");
 
     // check the mem flow
     ProjNode *mem = leading->proj_out(TypeFunc::Memory);
 
-    if (!mem)
+    if (!mem) {
       return NULL;
+    }
 
     Node *x = NULL;
     StoreNode * st = NULL;
+    LoadStoreNode *cas = NULL;
     MergeMemNode *mm = NULL;
 
     for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
       x = mem->fast_out(i);
       if (x->is_MergeMem()) {
-        if (mm != NULL)
+        if (mm != NULL) {
           return NULL;
+        }
         // two merge mems is one too many
         mm = x->as_MergeMem();
       } else if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
-        // two releasing stores is one too many
-        if (st != NULL)
+        // two releasing stores/CAS nodes is one too many
+        if (st != NULL || cas != NULL) {
           return NULL;
+        }
         st = x->as_Store();
+      } else if (is_CAS(x->Opcode())) {
+        if (st != NULL || cas != NULL) {
+          return NULL;
+        }
+        cas = x->as_LoadStore();
       }
     }
 
-    if (!mm || !st)
+    // must have a store or a cas
+    if (!st && !cas) {
       return NULL;
+    }
 
-    bool found = false;
-    // ensure the store feeds the merge
+    // must have a merge if we also have st
+    if (st && !mm) {
+      return NULL;
+    }
+
+    Node *y = NULL;
+    if (cas) {
+      // look for an SCMemProj
+      for (DUIterator_Fast imax, i = cas->fast_outs(imax); i < imax; i++) {
+        x = cas->fast_out(i);
+        if (x->is_Proj()) {
+          y = x;
+          break;
+        }
+      }
+      if (y == NULL) {
+        return NULL;
+      }
+      // the proj must feed a MergeMem
+      for (DUIterator_Fast imax, i = y->fast_outs(imax); i < imax; i++) {
+        x = y->fast_out(i);
+        if (x->is_MergeMem()) {
+          mm = x->as_MergeMem();
+          break;
+        }
+      }
+      if (mm == NULL)
+        return NULL;
+    } else {
+      // ensure the store feeds the existing mergemem;
     for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
       if (st->fast_out(i) == mm) {
-        found = true;
+          y = st;
         break;
       }
     }
-
-    if (!found)
+      if (y == NULL) {
       return NULL;
+      }
+    }
 
-    MemBarNode *mbvol = NULL;
-    // ensure the merge feeds a volatile membar
+    MemBarNode *mbar = NULL;
+    // ensure the merge feeds to the expected type of membar
     for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
       x = mm->fast_out(i);
-      if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
-        mbvol = x->as_MemBar();
+      if (x->is_MemBar()) {
+        int opcode = x->Opcode();
+        if (opcode == Op_MemBarVolatile && st) {
+          mbar = x->as_MemBar();
+        } else if (cas && opcode == Op_MemBarCPUOrder) {
+          MemBarNode *y =  x->as_MemBar();
+          y = child_membar(y);
+          if (y != NULL && y->Opcode() == Op_MemBarAcquire) {
+            mbar = y;
+          }
+        }
         break;
       }
     }
 
-    return mbvol;
+    return mbar;
   }
 
   // normal_to_leading
   //
   // graph traversal helper which detects the normal case Mem feed
   // from either a card mark or a trailing membar to a preceding
   // release membar (optionally its cpuorder child) i.e. it ensures
-  // that the following Mem flow subgraph is present.
+  // that one or other of the following Mem flow subgraphs is present.
   //
   //   MemBarRelease
   //   MemBarCPUOrder {leading}
   //          |  \      . . .
   //          |  StoreN/P[mo_release]  . . .
   //          |   /
   //         MergeMem
   //          |
-  //   MemBarVolatile
+  //   MemBarVolatile {card mark or trailing}
+  //
+  //   MemBarRelease
+  //   MemBarCPUOrder {leading}
+  //      |       \      . . .
+  //      |     CompareAndSwapX  . . .
+  //               |
+  //     . . .    SCMemProj
+  //           \   |
+  //      |    MergeMem
+  //      |        /
+  //    MemBarCPUOrder
+  //    MemBarAcquire {trailing}
   //
   // this predicate checks for the same flow as the previous predicate
   // but starting from the bottom rather than the top.
   //
   // if the configuration is present returns the cpuorder member for

@@ -1795,56 +2062,121 @@
   // need not be a card mark membar.
 
   MemBarNode *normal_to_leading(const MemBarNode *barrier)
   {
     // input must be a volatile or an acquire membar
-    assert(barrier->Opcode() == Op_MemBarVolatile, "expecting a volatile membar");
+    assert((barrier->Opcode() == Op_MemBarVolatile ||
+            barrier->Opcode() == Op_MemBarAcquire),
+           "expecting a volatile or an acquire membar");
     Node *x;
+    bool is_cas = barrier->Opcode() == Op_MemBarAcquire;
+
+    // if we have an acquire membar then it must be fed via a CPUOrder
+    // membar
+
+    if (is_cas) {
+      // skip to parent barrier which must be a cpuorder
+      x = parent_membar(barrier);
+      if (x->Opcode() != Op_MemBarCPUOrder)
+        return NULL;
+    } else {
+      // start from the supplied barrier
+      x = (Node *)barrier;
+    }
 
     // the Mem feed to the membar should be a merge
-    x = barrier->in(TypeFunc::Memory);
+    x = x ->in(TypeFunc::Memory);
     if (!x->is_MergeMem())
       return NULL;
 
     MergeMemNode *mm = x->as_MergeMem();
 
-    // the AliasIdxBot slice should be another MemBar projection
+    if (is_cas) {
+      // the merge should be fed from the CAS via an SCMemProj node
+      x = NULL;
+      for (uint idx = 1; idx < mm->req(); idx++) {
+        if (mm->in(idx)->Opcode() == Op_SCMemProj) {
+          x = mm->in(idx);
+          break;
+        }
+      }
+      if (x == NULL) {
+        return NULL;
+      }
+      // check for a CAS feeding this proj
+      x = x->in(0);
+      int opcode = x->Opcode();
+      if (!is_CAS(opcode)) {
+        return NULL;
+      }
+      // the CAS should get its mem feed from the leading membar
+      x = x->in(MemNode::Memory);
+    } else {
+      // the merge should get its Bottom mem feed from the leading membar
     x = mm->in(Compile::AliasIdxBot);
+    } 
+
     // ensure this is a non control projection
-    if (!x->is_Proj() || x->is_CFG())
+    if (!x->is_Proj() || x->is_CFG()) {
       return NULL;
+    }
     // if it is fed by a membar that's the one we want
     x = x->in(0);
 
-    if (!x->is_MemBar())
+    if (!x->is_MemBar()) {
       return NULL;
+    }
 
     MemBarNode *leading = x->as_MemBar();
     // reject invalid candidates
-    if (!leading_membar(leading))
+    if (!leading_membar(leading)) {
       return NULL;
+    }
 
-    // ok, we have a leading ReleaseMembar, now for the sanity clauses
+    // ok, we have a leading membar, now for the sanity clauses
 
-    // the leading membar must feed Mem to a releasing store
+    // the leading membar must feed Mem to a releasing store or CAS
     ProjNode *mem = leading->proj_out(TypeFunc::Memory);
     StoreNode *st = NULL;
+    LoadStoreNode *cas = NULL;
     for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
       x = mem->fast_out(i);
       if (x->is_Store() && x->as_Store()->is_release() && x->Opcode() != Op_StoreCM) {
+        // two stores or CASes is one too many
+        if (st != NULL || cas != NULL) {
+          return NULL;
+        }
         st = x->as_Store();
-        break;
+      } else if (is_CAS(x->Opcode())) {
+        if (st != NULL || cas != NULL) {
+          return NULL;
+        }
+        cas = x->as_LoadStore();
       }
     }
-    if (st == NULL)
+
+    // we must have either a store or a cas
+    if (st == NULL & cas == NULL) {
       return NULL;
+    }
 
-    // the releasing store has to feed the same merge
+    if (st == NULL) {
+      // nothing more to check
+      return leading;
+    } else {
+      // we should not have a store if we started from an acquire
+      if (is_cas) {
+        return NULL;
+      }
+
+      // the store should feed the merge we used to get here
     for (DUIterator_Fast imax, i = st->fast_outs(imax); i < imax; i++) {
-      if (st->fast_out(i) == mm)
+        if (st->fast_out(i) == mm) {
         return leading;
     }
+      }
+    }
 
     return NULL;
   }
 
   // card_mark_to_trailing

@@ -1863,12 +2195,12 @@
   //      |          |
   //      |        . . .
   //  Bot |  / 
   //   MergeMem 
   //      |
-  //   MemBarVolatile (trailing)
-  //
+  //      |
+  //    MemBarVolatile {trailing}
   //
   // 2)
   //   MemBarRelease/CPUOrder (leading)
   //    |
   //    | 

@@ -1882,11 +2214,12 @@
   //        \ /
   //        Phi  . . .
   //     Bot |   /
   //       MergeMem
   //         |
-  //   MemBarVolatile (trailing)
+  //    MemBarVolatile {trailing}
+  //
   //
   // 3)
   //   MemBarRelease/CPUOrder (leading)
   //    |
   //    |\

@@ -1903,11 +2236,12 @@
   //        \ /
   //        Phi  . . .
   //     Bot |   /
   //       MergeMem
   //         |
-  //   MemBarVolatile (trailing)
+  //         |
+  //    MemBarVolatile {trailing}
   //
   // configuration 1 is only valid if UseConcMarkSweepGC &&
   // UseCondCardMark
   //
   // configurations 2 and 3 are only valid if UseG1GC.

@@ -1953,12 +2287,13 @@
           if (x->is_Phi() && x->adr_type() == TypePtr::BOTTOM) {
             phi = x->as_Phi();
             break;
           }
         }
-        if (!phi)
+        if (!phi) {
           return NULL;
+        }
         // look for another merge below this phi
         feed = phi;
       } else {
         // couldn't find a merge
         return NULL;

@@ -1967,11 +2302,11 @@
 
     // sanity check this feed turns up as the expected slice
     assert(mm->as_MergeMem()->in(Compile::AliasIdxBot) == feed, "expecting membar to feed AliasIdxBot slice to Merge");
 
     MemBarNode *trailing = NULL;
-    // be sure we have a volatile membar below the merge
+    // be sure we have a trailing membar below the merge
     for (DUIterator_Fast imax, i = mm->fast_outs(imax); i < imax; i++) {
       x = mm->fast_out(i);
       if (x->is_MemBar() && x->Opcode() == Op_MemBarVolatile) {
         trailing = x->as_MemBar();
         break;

@@ -1982,28 +2317,36 @@
   }
 
   // trailing_to_card_mark
   //
   // graph traversal helper which detects extra, non-normal Mem feed
-  // from a trailing membar to a preceding card mark volatile membar
-  // i.e. it identifies whether one of the three possible extra GC
-  // post-write Mem flow subgraphs is present
+  // from a trailing volatile membar to a preceding card mark volatile
+  // membar i.e. it identifies whether one of the three possible extra
+  // GC post-write Mem flow subgraphs is present
   //
   // this predicate checks for the same flow as the previous predicate
   // but starting from the bottom rather than the top.
   //
-  // if the configurationis present returns the card mark membar
+  // if the configuration is present returns the card mark membar
   // otherwise NULL
+  //
+  // n.b. the supplied membar is expected to be a trailing
+  // MemBarVolatile i.e. the caller must ensure the input node has the
+  // correct opcode
 
   MemBarNode *trailing_to_card_mark(const MemBarNode *trailing)
   {
-    assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+    assert(trailing->Opcode() == Op_MemBarVolatile,
+           "expecting a volatile membar");
+    assert(!is_card_mark_membar(trailing),
+           "not expecting a card mark membar");
 
-    Node *x = trailing->in(TypeFunc::Memory);
     // the Mem feed to the membar should be a merge
-    if (!x->is_MergeMem())
+    Node *x = trailing->in(TypeFunc::Memory);
+    if (!x->is_MergeMem()) {
       return NULL;
+    }
 
     MergeMemNode *mm = x->as_MergeMem();
 
     x = mm->in(Compile::AliasIdxBot);
     // with G1 we may possibly see a Phi or two before we see a Memory

@@ -2052,25 +2395,27 @@
         return NULL;
       }
     }
     // the proj has to come from the card mark membar
     x = x->in(0);
-    if (!x->is_MemBar())
+    if (!x->is_MemBar()) {
       return NULL;
+    }
 
     MemBarNode *card_mark_membar = x->as_MemBar();
 
-    if (!is_card_mark_membar(card_mark_membar))
+    if (!is_card_mark_membar(card_mark_membar)) {
       return NULL;
+    }
 
     return card_mark_membar;
   }
 
   // trailing_to_leading
   //
   // graph traversal helper which checks the Mem flow up the graph
-  // from a (non-card mark) volatile membar attempting to locate and
+  // from a (non-card mark) trailing membar attempting to locate and
   // return an associated leading membar. it first looks for a
   // subgraph in the normal configuration (relying on helper
   // normal_to_leading). failing that it then looks for one of the
   // possible post-write card mark subgraphs linking the trailing node
   // to a the card mark membar (relying on helper

@@ -2079,38 +2424,53 @@
   // predicate normal_to_leading).
   //
   // if the configuration is valid returns the cpuorder member for
   // preference or when absent the release membar otherwise NULL.
   //
-  // n.b. the input membar is expected to be a volatile membar but
-  // must *not* be a card mark membar.
+  // n.b. the input membar is expected to be either a volatile or
+  // acquire membar but in the former case must *not* be a card mark
+  // membar.
 
   MemBarNode *trailing_to_leading(const MemBarNode *trailing)
   {
-    assert(!is_card_mark_membar(trailing), "not expecting a card mark membar");
+    assert((trailing->Opcode() == Op_MemBarAcquire ||
+            trailing->Opcode() == Op_MemBarVolatile),
+           "expecting an acquire or volatile membar");
+    assert((trailing->Opcode() != Op_MemBarVolatile ||
+            !is_card_mark_membar(trailing)),
+           "not expecting a card mark membar");
 
     MemBarNode *leading = normal_to_leading(trailing);
 
-    if (leading)
+    if (leading) {
       return leading;
+    }
+
+    // an acquire stops here: trailing_to_card_mark only accepts a MemBarVolatile
+    if (trailing->Opcode() == Op_MemBarAcquire) {
+      return NULL;
+    }
 
     MemBarNode *card_mark_membar = trailing_to_card_mark(trailing);
 
-    if (!card_mark_membar)
+    if (!card_mark_membar) {
       return NULL;
+    }
 
     return normal_to_leading(card_mark_membar);
   }
 
   // predicates controlling emit of ldr<x>/ldar<x> and associated dmb
 
 bool unnecessary_acquire(const Node *barrier)
 {
-  // assert barrier->is_MemBar();
-  if (UseBarriersForVolatile)
+  assert(barrier->is_MemBar(), "expecting a membar");
+
+  if (UseBarriersForVolatile) {
     // we need to plant a dmb
     return false;
+  }
 
   // a volatile read derived from bytecode (or also from an inlined
   // SHA field read via LibraryCallKit::load_field_from_object)
   // manifests as a LoadX[mo_acquire] followed by an acquire membar
   // with a bogus read dependency on it's preceding load. so in those

@@ -2138,12 +2498,13 @@
     //      |Parms
     //   MemBarAcquire*
     //
     // where * tags node we were passed
     // and |k means input k
-    if (x->is_DecodeNarrowPtr())
+    if (x->is_DecodeNarrowPtr()) {
       x = x->in(1);
+    }
 
     return (x->is_Load() && x->as_Load()->is_acquire());
   }
   
   // now check for an unsafe volatile get

@@ -2165,12 +2526,13 @@
   MemBarNode *parent = parent_membar(barrier);
   if (!parent || parent->Opcode() != Op_MemBarCPUOrder)
     return false;
   ctl = parent->proj_out(TypeFunc::Control);
   mem = parent->proj_out(TypeFunc::Memory);
-  if (!ctl || !mem)
+  if (!ctl || !mem) {
     return false;
+  }
   // ensure the proj nodes both feed a LoadX[mo_acquire]
   LoadNode *ld = NULL;
   for (DUIterator_Fast imax, i = ctl->fast_outs(imax); i < imax; i++) {
     x = ctl->fast_out(i);
     // if we see a load we keep hold of it and stop searching

@@ -2178,42 +2540,50 @@
       ld = x->as_Load();
       break;
     }
   }
   // it must be an acquiring load
-  if (! ld || ! ld->is_acquire())
-    return false;
+  if (ld && ld->is_acquire()) {
+
   for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
     x = mem->fast_out(i);
     // if we see the same load we drop it and stop searching
     if (x == ld) {
       ld = NULL;
       break;
     }
   }
   // we must have dropped the load
-  if (ld)
-    return false;
+    if (ld == NULL) {
   // check for a child cpuorder membar
   MemBarNode *child  = child_membar(barrier->as_MemBar());
-  if (!child || child->Opcode() != Op_MemBarCPUOrder)
-    return false;
-
+      if (child && child->Opcode() == Op_MemBarCPUOrder)
   return true;
+    }
+  }
+
+  // final option for unnecessary membar is that it is a trailing node
+  // belonging to a CAS
+
+  MemBarNode *leading = trailing_to_leading(barrier->as_MemBar());
+
+  return leading != NULL;
 }
 
 bool needs_acquiring_load(const Node *n)
 {
-  // assert n->is_Load();
-  if (UseBarriersForVolatile)
+  assert(n->is_Load(), "expecting a load");
+  if (UseBarriersForVolatile) {
     // we use a normal load and a dmb
     return false;
+  }
 
   LoadNode *ld = n->as_Load();
 
-  if (!ld->is_acquire())
+  if (!ld->is_acquire()) {
     return false;
+  }
 
   // check if this load is feeding an acquire membar
   //
   //   LoadX[mo_acquire]
   //   {  |1   }

@@ -2259,37 +2629,41 @@
 
   MemBarNode *membar;
 
   membar = parent_membar(ld);
 
-  if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
+  if (!membar || membar->Opcode() != Op_MemBarCPUOrder) {
     return false;
+  }
 
   // ensure that there is a CPUOrder->Acquire->CPUOrder membar chain
 
   membar = child_membar(membar);
 
-  if (!membar || !membar->Opcode() == Op_MemBarAcquire)
+  if (!membar || membar->Opcode() != Op_MemBarAcquire) {
     return false;
+  }
 
   membar = child_membar(membar);
   
-  if (!membar || !membar->Opcode() == Op_MemBarCPUOrder)
+  if (!membar || membar->Opcode() != Op_MemBarCPUOrder) {
     return false;
+  }
 
   return true;
 }
 
 bool unnecessary_release(const Node *n)
 {
   assert((n->is_MemBar() &&
           n->Opcode() == Op_MemBarRelease),
          "expecting a release membar");
 
-  if (UseBarriersForVolatile)
+  if (UseBarriersForVolatile) {
     // we need to plant a dmb
     return false;
+  }
 
   // if there is a dependent CPUOrder barrier then use that as the
   // leading
 
   MemBarNode *barrier = n->as_MemBar();

@@ -2301,36 +2675,40 @@
   }
 
   // must start with a normal feed
   MemBarNode *child_barrier = leading_to_normal(barrier);
 
-  if (!child_barrier)
+  if (!child_barrier) {
     return false;
+  }
 
-  if (!is_card_mark_membar(child_barrier))
+  if (!is_card_mark_membar(child_barrier)) {
     // this is the trailing membar and we are done
     return true;
+  }
 
   // must be sure this card mark feeds a trailing membar
   MemBarNode *trailing = card_mark_to_trailing(child_barrier);
   return (trailing != NULL);
 }
 
 bool unnecessary_volatile(const Node *n)
 {
   // assert n->is_MemBar();
-  if (UseBarriersForVolatile)
+  if (UseBarriersForVolatile) {
     // we need to plant a dmb
     return false;
+  }
 
   MemBarNode *mbvol = n->as_MemBar();
 
   // first we check if this is part of a card mark. if so then we have
   // to generate a StoreLoad barrier
   
-  if (is_card_mark_membar(mbvol))
+  if (is_card_mark_membar(mbvol)) {
       return false;
+  }
 
   // ok, if it's not a card mark then we still need to check if it is
   // a trailing membar of a volatile put hgraph.
 
   return (trailing_to_leading(mbvol) != NULL);

@@ -2339,59 +2717,115 @@
 // predicates controlling emit of str<x>/stlr<x> and associated dmbs
 
 bool needs_releasing_store(const Node *n)
 {
   // assert n->is_Store();
-  if (UseBarriersForVolatile)
+  if (UseBarriersForVolatile) {
     // we use a normal store and dmb combination
     return false;
+  }
 
   StoreNode *st = n->as_Store();
 
   // the store must be marked as releasing
-  if (!st->is_release())
+  if (!st->is_release()) {
     return false;
+  }
 
   // the store must be fed by a membar
 
   Node *x = st->lookup(StoreNode::Memory);
 
-  if (! x || !x->is_Proj())
+  if (! x || !x->is_Proj()) {
     return false;
+  }
 
   ProjNode *proj = x->as_Proj();
 
   x = proj->lookup(0);
 
-  if (!x || !x->is_MemBar())
+  if (!x || !x->is_MemBar()) {
     return false;
+  }
 
   MemBarNode *barrier = x->as_MemBar();
 
   // if the barrier is a release membar or a cpuorder mmebar fed by a
   // release membar then we need to check whether that forms part of a
   // volatile put graph.
 
   // reject invalid candidates
-  if (!leading_membar(barrier))
+  if (!leading_membar(barrier)) {
     return false;
+  }
 
   // does this lead a normal subgraph?
   MemBarNode *mbvol = leading_to_normal(barrier);
 
-  if (!mbvol)
+  if (!mbvol) {
     return false;
+  }
 
   // all done unless this is a card mark
-  if (!is_card_mark_membar(mbvol))
+  if (!is_card_mark_membar(mbvol)) {
     return true;
+  }
   
   // we found a card mark -- just make sure we have a trailing barrier
 
   return (card_mark_to_trailing(mbvol) != NULL);
 }
 
+// predicate controlling translation of CAS
+//
+// returns false if UseBarriersForVolatile is set, otherwise true (use the acquiring form)
+
+bool needs_acquiring_load_exclusive(const Node *n)
+{
+  assert(is_CAS(n->Opcode()), "expecting a compare and swap");
+  if (UseBarriersForVolatile) {
+    return false;
+  }
+
+  // CAS nodes only ought to turn up in inlined unsafe CAS operations;
+#ifdef ASSERT
+  LoadStoreNode *st = n->as_LoadStore();
+
+  // the memory input of the CAS must be a Proj hanging off a membar
+
+  Node *x = st->lookup(StoreNode::Memory);
+
+  assert (x && x->is_Proj(), "CAS not fed by memory proj!");
+
+  ProjNode *proj = x->as_Proj();
+
+  x = proj->lookup(0);
+
+  assert (x && x->is_MemBar(), "CAS not fed by membar!");
+
+  MemBarNode *barrier = x->as_MemBar();
+
+  // the barrier must be a cpuorder membar fed by a release membar
+
+  assert(barrier->Opcode() == Op_MemBarCPUOrder,
+         "CAS not fed by cpuorder membar!");
+      
+  MemBarNode *b = parent_membar(barrier);
+  assert ((b != NULL && b->Opcode() == Op_MemBarRelease),
+          "CAS not fed by cpuorder+release membar pair!");
+
+  // the cpuorder membar must lead a normal subgraph ending in an acquire
+  MemBarNode *mbar = leading_to_normal(barrier);
+
+  assert(mbar != NULL, "CAS not embedded in normal graph!");
+
+  assert(mbar->Opcode() == Op_MemBarAcquire, "trailing membar should be an acquire");
+#endif // ASSERT
+  // the checks above are debug-only, so we can just return true here
+  return true;
+}
+
 // predicate controlling translation of StoreCM
 //
 // returns true if a StoreStore must precede the card write otherwise
 // false
 

@@ -2401,18 +2835,20 @@
 
   // we only ever need to generate a dmb ishst between an object put
   // and the associated card mark when we are using CMS without
   // conditional card marking
 
-  if (!UseConcMarkSweepGC || UseCondCardMark)
+  if (!UseConcMarkSweepGC || UseCondCardMark) {
     return true;
+  }
 
   // if we are implementing volatile puts using barriers then the
   // object put as an str so we must insert the dmb ishst
 
-  if (UseBarriersForVolatile)
+  if (UseBarriersForVolatile) {
     return false;
+  }
 
   // we can omit the dmb ishst if this StoreCM is part of a volatile
   // put because in thta case the put will be implemented by stlr
   //
   // we need to check for a normal subgraph feeding this StoreCM.

@@ -2420,23 +2856,26 @@
   // either a MemBarRelease or its dependent MemBarCPUOrder, and the
   // leading membar must be part of a normal subgraph
 
   Node *x = storecm->in(StoreNode::Memory);
 
-  if (!x->is_Proj())
+  if (!x->is_Proj()) {
     return false;
+  }
 
   x = x->in(0);
 
-  if (!x->is_MemBar())
+  if (!x->is_MemBar()) {
     return false;
+  }
 
   MemBarNode *leading = x->as_MemBar();
 
   // reject invalid candidates
-  if (!leading_membar(leading))
+  if (!leading_membar(leading)) {
     return false;
+  }
 
   // we can omit the StoreStore if it is the head of a normal subgraph
   return (leading_to_normal(leading) != NULL);
 }
 

@@ -8363,13 +8802,17 @@
 %}
 
 // XXX No flag versions for CompareAndSwap{I,L,P,N} because matcher
 // can't match them
 
+// standard CompareAndSwapX when we are using barriers
+// these cost more than the predicated rules below, which win when their predicate holds
+
 instruct compareAndSwapI(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapI mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
  format %{
     "cmpxchgw $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"

@@ -8383,10 +8826,11 @@
 %}
 
 instruct compareAndSwapL(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapL mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
  format %{
     "cmpxchg $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"

@@ -8400,10 +8844,11 @@
 %}
 
 instruct compareAndSwapP(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapP mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
  format %{
     "cmpxchg $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"

@@ -8417,10 +8862,11 @@
 %}
 
 instruct compareAndSwapN(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
 
   match(Set res (CompareAndSwapN mem (Binary oldval newval)));
+  ins_cost(2 * VOLATILE_REF_COST);
 
   effect(KILL cr);
 
  format %{
     "cmpxchgw $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"

@@ -8431,10 +8877,88 @@
             aarch64_enc_cset_eq(res));
 
   ins_pipe(pipe_slow);
 %}
 
+// alternative CompareAndSwapX when we are eliding barriers
+
+instruct compareAndSwapIAcq(iRegINoSp res, indirect mem, iRegINoSp oldval, iRegINoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapI mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchgw_acq $mem, $oldval, $newval\t# (int) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapLAcq(iRegINoSp res, indirect mem, iRegLNoSp oldval, iRegLNoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapL mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchg_acq $mem, $oldval, $newval\t# (long) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapPAcq(iRegINoSp res, indirect mem, iRegP oldval, iRegP newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapP mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchg_acq $mem, $oldval, $newval\t# (ptr) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchg_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
+instruct compareAndSwapNAcq(iRegINoSp res, indirect mem, iRegNNoSp oldval, iRegNNoSp newval, rFlagsReg cr) %{
+
+  predicate(needs_acquiring_load_exclusive(n));
+  match(Set res (CompareAndSwapN mem (Binary oldval newval)));
+  ins_cost(VOLATILE_REF_COST);
+
+  effect(KILL cr);
+
+ format %{
+    "cmpxchgw_acq $mem, $oldval, $newval\t# (narrow oop) if $mem == $oldval then $mem <-- $newval"
+    "cset $res, EQ\t# $res <-- (EQ ? 1 : 0)"
+ %}
+
+ ins_encode(aarch64_enc_cmpxchgw_acq(mem, oldval, newval),
+            aarch64_enc_cset_eq(res));
+
+  ins_pipe(pipe_slow);
+%}
+
 
 instruct get_and_setI(indirect mem, iRegINoSp newv, iRegI prev) %{
   match(Set prev (GetAndSetI mem newv));
   format %{ "atomic_xchgw  $prev, $newv, [$mem]" %}
   ins_encode %{
< prev index next >