[llvm] [SystemZ] Simplify f128 atomic load/store (PR #90977)

Fri May 3 08:58:03 PDT 2024

https://github.com/uweigand created https://github.com/llvm/llvm-project/pull/90977

Change definition of expandBitCastI128ToF128 and expandBitCastF128ToI128 to allow for simplified use in atomic load/store.

Update logic to split 128-bit loads and stores in DAGCombine to also handle the f128 case where appropriate.  This fixes the regressions introduced by recent atomic load/store patches.

>From dba69e52692b57206bb3dab813791731a3042f34 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand at de.ibm.com>
Date: Fri, 3 May 2024 16:59:12 +0200
Subject: [PATCH] [SystemZ] Simplify f128 atomic load/store

Change definition of expandBitCastI128ToF128 and expandBitCastF128ToI128
to allow for simplified use in atomic load/store.

Update logic to split 128-bit loads and stores in DAGCombine to
also handle the f128 case where appropriate.  This fixes the
regressions introduced by recent atomic load/store patches.
---
 .../Target/SystemZ/SystemZISelLowering.cpp    | 205 ++++++++++--------
 llvm/test/CodeGen/SystemZ/atomic-load-08.ll   |  21 +-
 llvm/test/CodeGen/SystemZ/atomic-store-08.ll  |  34 +--
 .../test/CodeGen/SystemZ/atomicrmw-fmax-03.ll |   6 +-
 .../test/CodeGen/SystemZ/atomicrmw-fmin-03.ll |   6 +-
 5 files changed, 131 insertions(+), 141 deletions(-)

diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 5612e4ced4f607..cb1d7d5388d3bc 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1551,6 +1551,8 @@ static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
     std::tie(Lo, Hi) = DAG.SplitScalar(In, DL, MVT::i64, MVT::i64);
   }
 
+  // FIXME: If v2i64 were a legal type, we could use it instead of
+  // Untyped here.  This might enable improved folding.
   SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
                                     MVT::Untyped, Hi, Lo);
   return SDValue(Pair, 0);
@@ -6247,14 +6249,18 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
   }
 }
 
-// Manually lower a bitcast to avoid introducing illegal types after type
-// legalization.
 static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
-                                       SDValue Chain, const SDLoc &SL) {
-  SDValue Hi =
-      DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::i64, Src);
-  SDValue Lo =
-      DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::i64, Src);
+                                       const SDLoc &SL) {
+  // If i128 is legal, just use a normal bitcast.
+  if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128))
+    return DAG.getBitcast(MVT::f128, Src);
+
+  // Otherwise, f128 must live in FP128, so do a partwise move.
+  assert(DAG.getTargetLoweringInfo().getRepRegClassFor(MVT::f128)
+         == &SystemZ::FP128BitRegClass);
+
+  SDValue Hi, Lo;
+  std::tie(Lo, Hi) = DAG.SplitScalar(Src, SL, MVT::i64, MVT::i64);
 
   Hi = DAG.getBitcast(MVT::f64, Hi);
   Lo = DAG.getBitcast(MVT::f64, Lo);
@@ -6267,8 +6273,16 @@ static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
   return SDValue(Pair, 0);
 }
 
-static std::pair<SDValue, SDValue>
-expandBitCastF128ToI128Parts(SelectionDAG &DAG, SDValue Src, const SDLoc &SL) {
+static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src,
+                                       const SDLoc &SL) {
+  // If i128 is legal, just use a normal bitcast.
+  if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128))
+    return DAG.getBitcast(MVT::i128, Src);
+
+  // Otherwise, f128 must live in FP128, so do a partwise move.
+  assert(DAG.getTargetLoweringInfo().getRepRegClassFor(MVT::f128)
+         == &SystemZ::FP128BitRegClass);
+
   SDValue LoFP =
       DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::f64, Src);
   SDValue HiFP =
@@ -6276,15 +6290,7 @@ expandBitCastF128ToI128Parts(SelectionDAG &DAG, SDValue Src, const SDLoc &SL) {
   SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i64, LoFP);
   SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i64, HiFP);
 
-  return {Hi, Lo};
-}
-
-static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src,
-                                       const SDLoc &SL) {
-
-  auto [Hi, Lo] = expandBitCastF128ToI128Parts(DAG, Src, SL);
-  SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, SL, MVT::Untyped, Hi, Lo);
-  return SDValue(Pair, 0);
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i128, Lo, Hi);
 }
 
 // Lower operations with invalid operand or result types (currently used
@@ -6302,38 +6308,20 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
     SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
                                           DL, Tys, Ops, MVT::i128, MMO);
 
-    EVT VT = N->getValueType(0);
-
-    if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
-      SDValue Lowered = lowerGR128ToI128(DAG, Res);
-      Results.push_back(DAG.getBitcast(VT, Lowered));
-      Results.push_back(Res.getValue(1));
-    } else {
-      // For the f128 case, after type legalization, we cannot produce a bitcast
-      // with an illegal type (i.e. i128), so manually lower it.
-      //
-      // FIXME: Really v2i64 should be legal, and should be used in place of
-      // unttyped. Then we could emit the bitcast which will potentially fold
-      // into the use.
-      SDValue Cast = expandBitCastI128ToF128(DAG, Res, Res.getValue(1), DL);
-      Results.push_back(Cast);
-      Results.push_back(Res.getValue(1));
-    }
-
+    SDValue Lowered = lowerGR128ToI128(DAG, Res);
+    if (N->getValueType(0) == MVT::f128)
+      Lowered = expandBitCastI128ToF128(DAG, Lowered, DL);
+    Results.push_back(Lowered);
+    Results.push_back(Res.getValue(1));
     break;
   }
   case ISD::ATOMIC_STORE: {
     SDLoc DL(N);
     SDVTList Tys = DAG.getVTList(MVT::Other);
     SDValue Val = N->getOperand(1);
-    EVT VT = Val.getValueType();
-
-    if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
-      Val = DAG.getBitcast(MVT::i128, Val);
-      Val = lowerI128ToGR128(DAG, Val);
-    } else {
+    if (Val.getValueType() == MVT::f128)
       Val = expandBitCastF128ToI128(DAG, Val, DL);
-    }
+    Val = lowerI128ToGR128(DAG, Val);
 
     SDValue Ops[] = {N->getOperand(0), Val, N->getOperand(2)};
     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
@@ -6370,21 +6358,7 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
     if (N->getValueType(0) == MVT::i128 && Src.getValueType() == MVT::f128 &&
         !useSoftFloat()) {
       SDLoc DL(N);
-      SDValue Lo, Hi;
-      if (getRepRegClassFor(MVT::f128) == &SystemZ::VR128BitRegClass) {
-        SDValue VecBC = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Src);
-        Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
-                         DAG.getConstant(1, DL, MVT::i32));
-        Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
-                         DAG.getConstant(0, DL, MVT::i32));
-      } else {
-        // FIXME: Assert should be moved into expandBitCastF128ToI128Parts
-        assert(getRepRegClassFor(MVT::f128) == &SystemZ::FP128BitRegClass &&
-               "Unrecognized register class for f128.");
-        std::tie(Hi, Lo) = expandBitCastF128ToI128Parts(DAG, Src, DL);
-      }
-
-      Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi));
+      Results.push_back(expandBitCastF128ToI128(DAG, Src, DL));
     }
     break;
   }
@@ -6829,47 +6803,79 @@ SDValue SystemZTargetLowering::combineMERGE(
   return SDValue();
 }
 
+static bool isI128MovedToParts(LoadSDNode *LD, SmallVector<std::pair<SDNode *, int>, 2> &Users) {
+  // Scan through all users.
+  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+       UI != UIEnd; ++UI) {
+    // Skip the uses of the chain.
+    if (UI.getUse().getResNo() != 0)
+      continue;
+
+    // Verify every user is a TRUNCATE to i64 of the low or high half.
+    SDNode *User = *UI;
+    int Index = 1;
+    if (User->getOpcode() == ISD::SRL &&
+        User->getOperand(1).getOpcode() == ISD::Constant &&
+        User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
+      User = *User->use_begin();
+      Index = 0;
+    }
+    if (User->getOpcode() != ISD::TRUNCATE ||
+        User->getValueType(0) != MVT::i64)
+      return false;
+
+    Users.push_back(std::make_pair(User, Index));
+  }
+  return true;
+}
+
+static bool isF128MovedToParts(LoadSDNode *LD, SmallVector<std::pair<SDNode *, int>, 2> &Users) {
+  // Scan through all users.
+  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+       UI != UIEnd; ++UI) {
+    // Skip the uses of the chain.
+    if (UI.getUse().getResNo() != 0)
+      continue;
+
+    // Verify every user is an EXTRACT_SUBREG of the low or high half.
+    SDNode *User = *UI;
+    if (!User->hasOneUse() || !User->isMachineOpcode() ||
+        User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+      return false;
+
+    int Index;
+    switch (User->getConstantOperandVal(1)) {
+      case SystemZ::subreg_l64: Index = 1; break;
+      case SystemZ::subreg_h64: Index = 0; break;
+      default: return false;
+    }
+
+    Users.push_back(std::make_pair(User, Index));
+  }
+  return true;
+}
+
 SDValue SystemZTargetLowering::combineLOAD(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   EVT LdVT = N->getValueType(0);
   SDLoc DL(N);
 
-  // Replace an i128 load that is used solely to move its value into GPRs
+  // Replace a 128-bit load that is used solely to move its value into GPRs
   // by separate loads of both halves.
-  if (LdVT == MVT::i128) {
-    LoadSDNode *LD = cast<LoadSDNode>(N);
-    if (!LD->isSimple() || !ISD::isNormalLoad(LD))
-      return SDValue();
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  SmallVector<std::pair<SDNode *, int>, 2> Users;
+  if (LD->isSimple() && ISD::isNormalLoad(LD) &&
+      ((LdVT == MVT::i128 && isI128MovedToParts(LD, Users)) ||
+       (LdVT == MVT::f128 && isF128MovedToParts(LD, Users)))) {
 
-    // Scan through all users.
-    SmallVector<std::pair<SDNode *, int>, 2> Users;
+    // Verify that no part is extracted twice.
     int UsedElements = 0;
-    for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
-         UI != UIEnd; ++UI) {
-      // Skip the uses of the chain.
-      if (UI.getUse().getResNo() != 0)
-        continue;
-
-      // Verify every user is a TRUNCATE to i64 of the low or high half ...
-      SDNode *User = *UI;
-      int Index = 1;
-      if (User->getOpcode() == ISD::SRL &&
-          User->getOperand(1).getOpcode() == ISD::Constant &&
-          User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
-        User = *User->use_begin();
-        Index = 0;
-      }
-      if (User->getOpcode() != ISD::TRUNCATE ||
-          User->getValueType(0) != MVT::i64)
-        return SDValue();
-
-      // ... and no half is extracted twice.
+    for (auto UserAndIndex : Users) {
+      unsigned Index = UserAndIndex.second;
       if (UsedElements & (1 << Index))
         return SDValue();
-
       UsedElements |= 1 << Index;
-      Users.push_back(std::make_pair(User, Index));
     }
 
     // Rewrite each extraction as an independent load.
@@ -6974,7 +6980,7 @@ static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) {
   return true;
 }
 
-static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
+static bool isI128MovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
   if (Val.getOpcode() != ISD::OR || !Val.getNode()->hasOneUse())
     return false;
 
@@ -7001,6 +7007,22 @@ static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
   return true;
 }
 
+static bool isF128MovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
+  if (!Val.getNode()->hasOneUse() || !Val.isMachineOpcode() ||
+      Val.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
+    return false;
+
+  if (Val->getNumOperands() != 5 ||
+      Val->getOperand(0)->getAsZExtVal() != SystemZ::FP128BitRegClassID ||
+      Val->getOperand(2)->getAsZExtVal() != SystemZ::subreg_l64 ||
+      Val->getOperand(4)->getAsZExtVal() != SystemZ::subreg_h64)
+    return false;
+
+  LoPart = Val->getOperand(1);
+  HiPart = Val->getOperand(3);
+  return true;
+}
+
 SDValue SystemZTargetLowering::combineSTORE(
     SDNode *N, DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -7070,10 +7092,11 @@ SDValue SystemZTargetLowering::combineSTORE(
                                      Ops, MemVT, SN->getMemOperand());
   }
 
-  // Transform a store of an i128 moved from GPRs into two separate stores.
-  if (MemVT == MVT::i128 && SN->isSimple() && ISD::isNormalStore(SN)) {
+  // Transform a store of a 128-bit value moved from parts into two stores.
+  if (SN->isSimple() && ISD::isNormalStore(SN)) {
     SDValue LoPart, HiPart;
-    if (isMovedFromParts(Op1, LoPart, HiPart)) {
+    if ((MemVT == MVT::i128 && isI128MovedFromParts(Op1, LoPart, HiPart)) ||
+        (MemVT == MVT::f128 && isF128MovedFromParts(Op1, LoPart, HiPart))) {
       SDLoc DL(SN);
       SDValue Chain0 =
         DAG.getStore(SN->getChain(), DL, HiPart, SN->getBasePtr(),
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 15ff2daa0a3846..90d4214037d222 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -1,27 +1,16 @@
-; Test long double atomic loads.
+; Test long double atomic loads - via i128.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
-; TODO: Is it worth testing softfp with vector?
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=SOFTFP %s
 
-; FIXME: Without vector support, v2i64 should be legal and we should
-; introduce a simple bitcast, which could fold into the store use
-; avoid the intermediate f registers.
 define void @f1(ptr %ret, ptr %src) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
-; Z13-NEXT:    lpq %r0, 0(%r3)
-; Z13-NEXT:    stg %r1, 8(%r2)
-; Z13-NEXT:    stg %r0, 0(%r2)
-; Z13-NEXT:    br %r14
-
-; BASE: lpq	%r0, 0(%r3)
-; BASE-NEXT: ldgr	%f0, %r0
-; BASE-NEXT: ldgr	%f2, %r1
-; BASE-NEXT: std	%f0, 0(%r2)
-; BASE-NEXT: std	%f2, 8(%r2)
-; BASE-NEXT: br %r14
+; CHECK-NEXT:    lpq %r0, 0(%r3)
+; CHECK-NEXT:    stg %r1, 8(%r2)
+; CHECK-NEXT:    stg %r0, 0(%r2)
+; CHECK-NEXT:    br %r14
 
 ; SOFTFP-LABEL: f1:
 ; SOFTFP:       # %bb.0:
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
index 422dcb63b531fe..57f1319365c4fd 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
@@ -1,30 +1,17 @@
-; Test long double atomic stores. The atomic store is converted to i128
+; Test long double atomic stores - via i128.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
-
-; TODO: Is it worth testing softfp with vector?
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=SOFTFP %s
 
-
-; FIXME: With legal 128-bit operation to bitcast, the base code would
-; be the same as z13
 define void @f1(ptr %dst, ptr %src) {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
-; Z13-NEXT:    lg %r1, 8(%r3)
-; Z13-NEXT:    lg %r0, 0(%r3)
-; Z13-NEXT:    stpq %r0, 0(%r2)
-; Z13-NEXT:    bcr 1{{[45]}}, %r0
-; Z13-NEXT:    br %r14
-
-; BASE-NEXT: ld	%f0, 0(%r3)
-; BASE-NEXT: ld	%f2, 8(%r3)
-; BASE-NEXT: lgdr	%r1, %f2
-; BASE-NEXT: lgdr	%r0, %f0
-; BASE-NEXT: stpq	%r0, 0(%r2)
-; BASE-NEXT: bcr	15, %r0
-; BASE-NEXT: br	%r14
+; CHECK-NEXT:    lg %r1, 8(%r3)
+; CHECK-NEXT:    lg %r0, 0(%r3)
+; CHECK-NEXT:    stpq %r0, 0(%r2)
+; CHECK-NEXT:    bcr 1{{[45]}}, %r0
+; CHECK-NEXT:    br %r14
 
 ; SOFTFP-LABEL: f1:
 ; SOFTFP:       # %bb.0:
@@ -99,13 +86,8 @@ define void @f2_fpuse(ptr %dst, ptr %src) {
 ; CHECK-NEXT:	.cfi_def_cfa_offset 336
 ; CHECK-NEXT:	ld	%f0, 0(%r3)
 ; CHECK-NEXT:	ld	%f2, 8(%r3)
-
-; BASE-NEXT:	lgr	%r3, %r2
-; BASE-NEXT:	axbr	%f0, %f0
-
-; Z13-NEXT:	axbr	%f0, %f0
-; Z13-NEXT:	lgr	%r3, %r2
-
+; CHECK-DAG:	lgr	%r3, %r2
+; CHECK-DAG:	axbr	%f0, %f0
 ; CHECK-NEXT:	la	%r4, 160(%r15)
 ; CHECK-NEXT:	lghi	%r2, 16
 ; CHECK-NEXT:	lhi	%r5, 5
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
index 3c8ea19f86f860..21e7c6e586dfe4 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
@@ -20,10 +20,8 @@ define void @f1(ptr %ret, ptr %src, ptr %b) {
 ; CHECK: std [[FSL]], 176(%r15)
 ; CHECK: std [[FSH]], 184(%r15)
 ; CHECK: brasl %r14, fmaxl at PLT
-; CHECK: ld [[FL:%f[0-9]+]], 192(%r15)
-; CHECK: ld [[FH:%f[0-9]+]], 200(%r15)
-; CHECK: lgdr [[RH:%r[0-9]+]], [[FH]]
-; CHECK: lgdr [[RL:%r[0-9]+]], [[FL]]
+; CHECK: lg [[RH:%r[0-9]+]], 200(%r15)
+; CHECK: lg [[RL:%r[0-9]+]], 192(%r15)
 ; CHECK: lgdr [[RSH:%r[0-9]+]], [[FSH]]
 ; CHECK: lgdr [[RSL:%r[0-9]+]], [[FSL]]
 ; CHECK: cdsg [[RSL]], [[RL]], 0([[SRC]])
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
index dfa2cc021d1667..1c6f8e20aa4f8d 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
@@ -20,10 +20,8 @@ define void @f1(ptr %ret, ptr %src, ptr %b) {
 ; CHECK: std [[FSL]], 176(%r15)
 ; CHECK: std [[FSH]], 184(%r15)
 ; CHECK: brasl %r14, fminl at PLT
-; CHECK: ld [[FL:%f[0-9]+]], 192(%r15)
-; CHECK: ld [[FH:%f[0-9]+]], 200(%r15)
-; CHECK: lgdr [[RH:%r[0-9]+]], [[FH]]
-; CHECK: lgdr [[RL:%r[0-9]+]], [[FL]]
+; CHECK: lg [[RH:%r[0-9]+]], 200(%r15)
+; CHECK: lg [[RL:%r[0-9]+]], 192(%r15)
 ; CHECK: lgdr [[RSH:%r[0-9]+]], [[FSH]]
 ; CHECK: lgdr [[RSL:%r[0-9]+]], [[FSL]]
 ; CHECK: cdsg [[RSL]], [[RL]], 0([[SRC]])