[llvm] [SystemZ] Simplify f128 atomic load/store (PR #90977)
Ulrich Weigand via llvm-commits
llvm-commits at lists.llvm.org
Fri May 3 08:58:03 PDT 2024
https://github.com/uweigand created https://github.com/llvm/llvm-project/pull/90977
Change definition of expandBitCastI128ToF128 and expandBitCastF128ToI128 to allow for simplified use in atomic load/store.
Update logic to split 128-bit loads and stores in DAGCombine to also handle the f128 case where appropriate. This fixes the regressions introduced by recent atomic load/store patches.
>From dba69e52692b57206bb3dab813791731a3042f34 Mon Sep 17 00:00:00 2001
From: Ulrich Weigand <ulrich.weigand at de.ibm.com>
Date: Fri, 3 May 2024 16:59:12 +0200
Subject: [PATCH] [SystemZ] Simplify f128 atomic load/store
Change definition of expandBitCastI128ToF128 and expandBitCastF128ToI128
to allow for simplified use in atomic load/store.
Update logic to split 128-bit loads and stores in DAGCombine to
also handle the f128 case where appropriate. This fixes the
regressions introduced by recent atomic load/store patches.
---
.../Target/SystemZ/SystemZISelLowering.cpp | 205 ++++++++++--------
llvm/test/CodeGen/SystemZ/atomic-load-08.ll | 21 +-
llvm/test/CodeGen/SystemZ/atomic-store-08.ll | 34 +--
.../test/CodeGen/SystemZ/atomicrmw-fmax-03.ll | 6 +-
.../test/CodeGen/SystemZ/atomicrmw-fmin-03.ll | 6 +-
5 files changed, 131 insertions(+), 141 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 5612e4ced4f607..cb1d7d5388d3bc 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1551,6 +1551,8 @@ static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
std::tie(Lo, Hi) = DAG.SplitScalar(In, DL, MVT::i64, MVT::i64);
}
+ // FIXME: If v2i64 were a legal type, we could use it instead of
+ // Untyped here. This might enable improved folding.
SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
MVT::Untyped, Hi, Lo);
return SDValue(Pair, 0);
@@ -6247,14 +6249,18 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
}
}
-// Manually lower a bitcast to avoid introducing illegal types after type
-// legalization.
static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
- SDValue Chain, const SDLoc &SL) {
- SDValue Hi =
- DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::i64, Src);
- SDValue Lo =
- DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::i64, Src);
+ const SDLoc &SL) {
+ // If i128 is legal, just use a normal bitcast.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128))
+ return DAG.getBitcast(MVT::f128, Src);
+
+ // Otherwise, f128 must live in FP128, so do a partwise move.
+ assert(DAG.getTargetLoweringInfo().getRepRegClassFor(MVT::f128)
+ == &SystemZ::FP128BitRegClass);
+
+ SDValue Hi, Lo;
+ std::tie(Lo, Hi) = DAG.SplitScalar(Src, SL, MVT::i64, MVT::i64);
Hi = DAG.getBitcast(MVT::f64, Hi);
Lo = DAG.getBitcast(MVT::f64, Lo);
@@ -6267,8 +6273,16 @@ static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
return SDValue(Pair, 0);
}
-static std::pair<SDValue, SDValue>
-expandBitCastF128ToI128Parts(SelectionDAG &DAG, SDValue Src, const SDLoc &SL) {
+static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src,
+ const SDLoc &SL) {
+ // If i128 is legal, just use a normal bitcast.
+ if (DAG.getTargetLoweringInfo().isTypeLegal(MVT::i128))
+ return DAG.getBitcast(MVT::i128, Src);
+
+ // Otherwise, f128 must live in FP128, so do a partwise move.
+ assert(DAG.getTargetLoweringInfo().getRepRegClassFor(MVT::f128)
+ == &SystemZ::FP128BitRegClass);
+
SDValue LoFP =
DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::f64, Src);
SDValue HiFP =
@@ -6276,15 +6290,7 @@ expandBitCastF128ToI128Parts(SelectionDAG &DAG, SDValue Src, const SDLoc &SL) {
SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i64, LoFP);
SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i64, HiFP);
- return {Hi, Lo};
-}
-
-static SDValue expandBitCastF128ToI128(SelectionDAG &DAG, SDValue Src,
- const SDLoc &SL) {
-
- auto [Hi, Lo] = expandBitCastF128ToI128Parts(DAG, Src, SL);
- SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, SL, MVT::Untyped, Hi, Lo);
- return SDValue(Pair, 0);
+ return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i128, Lo, Hi);
}
// Lower operations with invalid operand or result types (currently used
@@ -6302,38 +6308,20 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
DL, Tys, Ops, MVT::i128, MMO);
- EVT VT = N->getValueType(0);
-
- if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
- SDValue Lowered = lowerGR128ToI128(DAG, Res);
- Results.push_back(DAG.getBitcast(VT, Lowered));
- Results.push_back(Res.getValue(1));
- } else {
- // For the f128 case, after type legalization, we cannot produce a bitcast
- // with an illegal type (i.e. i128), so manually lower it.
- //
- // FIXME: Really v2i64 should be legal, and should be used in place of
- // unttyped. Then we could emit the bitcast which will potentially fold
- // into the use.
- SDValue Cast = expandBitCastI128ToF128(DAG, Res, Res.getValue(1), DL);
- Results.push_back(Cast);
- Results.push_back(Res.getValue(1));
- }
-
+ SDValue Lowered = lowerGR128ToI128(DAG, Res);
+ if (N->getValueType(0) == MVT::f128)
+ Lowered = expandBitCastI128ToF128(DAG, Lowered, DL);
+ Results.push_back(Lowered);
+ Results.push_back(Res.getValue(1));
break;
}
case ISD::ATOMIC_STORE: {
SDLoc DL(N);
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Val = N->getOperand(1);
- EVT VT = Val.getValueType();
-
- if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
- Val = DAG.getBitcast(MVT::i128, Val);
- Val = lowerI128ToGR128(DAG, Val);
- } else {
+ if (Val.getValueType() == MVT::f128)
Val = expandBitCastF128ToI128(DAG, Val, DL);
- }
+ Val = lowerI128ToGR128(DAG, Val);
SDValue Ops[] = {N->getOperand(0), Val, N->getOperand(2)};
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
@@ -6370,21 +6358,7 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
if (N->getValueType(0) == MVT::i128 && Src.getValueType() == MVT::f128 &&
!useSoftFloat()) {
SDLoc DL(N);
- SDValue Lo, Hi;
- if (getRepRegClassFor(MVT::f128) == &SystemZ::VR128BitRegClass) {
- SDValue VecBC = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Src);
- Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
- DAG.getConstant(1, DL, MVT::i32));
- Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
- DAG.getConstant(0, DL, MVT::i32));
- } else {
- // FIXME: Assert should be moved into expandBitCastF128ToI128Parts
- assert(getRepRegClassFor(MVT::f128) == &SystemZ::FP128BitRegClass &&
- "Unrecognized register class for f128.");
- std::tie(Hi, Lo) = expandBitCastF128ToI128Parts(DAG, Src, DL);
- }
-
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi));
+ Results.push_back(expandBitCastF128ToI128(DAG, Src, DL));
}
break;
}
@@ -6829,47 +6803,79 @@ SDValue SystemZTargetLowering::combineMERGE(
return SDValue();
}
+static bool isI128MovedToParts(LoadSDNode *LD, SmallVector<std::pair<SDNode *, int>, 2> &Users) {
+ // Scan through all users.
+ for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+ UI != UIEnd; ++UI) {
+ // Skip the uses of the chain.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ // Verify every user is a TRUNCATE to i64 of the low or high half.
+ SDNode *User = *UI;
+ int Index = 1;
+ if (User->getOpcode() == ISD::SRL &&
+ User->getOperand(1).getOpcode() == ISD::Constant &&
+ User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
+ User = *User->use_begin();
+ Index = 0;
+ }
+ if (User->getOpcode() != ISD::TRUNCATE ||
+ User->getValueType(0) != MVT::i64)
+ return false;
+
+ Users.push_back(std::make_pair(User, Index));
+ }
+ return true;
+}
+
+static bool isF128MovedToParts(LoadSDNode *LD, SmallVector<std::pair<SDNode *, int>, 2> &Users) {
+ // Scan through all users.
+ for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+ UI != UIEnd; ++UI) {
+ // Skip the uses of the chain.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ // Verify every user is an EXTRACT_SUBREG of the low or high half.
+ SDNode *User = *UI;
+ if (!User->hasOneUse() || !User->isMachineOpcode() ||
+ User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
+ return false;
+
+ int Index;
+ switch (User->getConstantOperandVal(1)) {
+ case SystemZ::subreg_l64: Index = 1; break;
+ case SystemZ::subreg_h64: Index = 0; break;
+ default: return false;
+ }
+
+ Users.push_back(std::make_pair(User, Index));
+ }
+ return true;
+}
+
SDValue SystemZTargetLowering::combineLOAD(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
EVT LdVT = N->getValueType(0);
SDLoc DL(N);
- // Replace an i128 load that is used solely to move its value into GPRs
+ // Replace a 128-bit load that is used solely to move its value into GPRs
// by separate loads of both halves.
- if (LdVT == MVT::i128) {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- if (!LD->isSimple() || !ISD::isNormalLoad(LD))
- return SDValue();
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SmallVector<std::pair<SDNode *, int>, 2> Users;
+ if (LD->isSimple() && ISD::isNormalLoad(LD) &&
+ ((LdVT == MVT::i128 && isI128MovedToParts(LD, Users)) ||
+ (LdVT == MVT::f128 && isF128MovedToParts(LD, Users)))) {
- // Scan through all users.
- SmallVector<std::pair<SDNode *, int>, 2> Users;
+ // Verify that no part is extracted twice.
int UsedElements = 0;
- for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
- UI != UIEnd; ++UI) {
- // Skip the uses of the chain.
- if (UI.getUse().getResNo() != 0)
- continue;
-
- // Verify every user is a TRUNCATE to i64 of the low or high half ...
- SDNode *User = *UI;
- int Index = 1;
- if (User->getOpcode() == ISD::SRL &&
- User->getOperand(1).getOpcode() == ISD::Constant &&
- User->getConstantOperandVal(1) == 64 && User->hasOneUse()) {
- User = *User->use_begin();
- Index = 0;
- }
- if (User->getOpcode() != ISD::TRUNCATE ||
- User->getValueType(0) != MVT::i64)
- return SDValue();
-
- // ... and no half is extracted twice.
+ for (auto UserAndIndex : Users) {
+ unsigned Index = UserAndIndex.second;
if (UsedElements & (1 << Index))
return SDValue();
-
UsedElements |= 1 << Index;
- Users.push_back(std::make_pair(User, Index));
}
// Rewrite each extraction as an independent load.
@@ -6974,7 +6980,7 @@ static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) {
return true;
}
-static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
+static bool isI128MovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
if (Val.getOpcode() != ISD::OR || !Val.getNode()->hasOneUse())
return false;
@@ -7001,6 +7007,22 @@ static bool isMovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
return true;
}
+static bool isF128MovedFromParts(SDValue Val, SDValue &LoPart, SDValue &HiPart) {
+ if (!Val.getNode()->hasOneUse() || !Val.isMachineOpcode() ||
+ Val.getMachineOpcode() != TargetOpcode::REG_SEQUENCE)
+ return false;
+
+ if (Val->getNumOperands() != 5 ||
+ Val->getOperand(0)->getAsZExtVal() != SystemZ::FP128BitRegClassID ||
+ Val->getOperand(2)->getAsZExtVal() != SystemZ::subreg_l64 ||
+ Val->getOperand(4)->getAsZExtVal() != SystemZ::subreg_h64)
+ return false;
+
+ LoPart = Val->getOperand(1);
+ HiPart = Val->getOperand(3);
+ return true;
+}
+
SDValue SystemZTargetLowering::combineSTORE(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -7070,10 +7092,11 @@ SDValue SystemZTargetLowering::combineSTORE(
Ops, MemVT, SN->getMemOperand());
}
- // Transform a store of an i128 moved from GPRs into two separate stores.
- if (MemVT == MVT::i128 && SN->isSimple() && ISD::isNormalStore(SN)) {
+ // Transform a store of a 128-bit value moved from parts into two stores.
+ if (SN->isSimple() && ISD::isNormalStore(SN)) {
SDValue LoPart, HiPart;
- if (isMovedFromParts(Op1, LoPart, HiPart)) {
+ if ((MemVT == MVT::i128 && isI128MovedFromParts(Op1, LoPart, HiPart)) ||
+ (MemVT == MVT::f128 && isF128MovedFromParts(Op1, LoPart, HiPart))) {
SDLoc DL(SN);
SDValue Chain0 =
DAG.getStore(SN->getChain(), DL, HiPart, SN->getBasePtr(),
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 15ff2daa0a3846..90d4214037d222 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -1,27 +1,16 @@
-; Test long double atomic loads.
+; Test long double atomic loads - via i128.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
-; TODO: Is it worth testing softfp with vector?
; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=SOFTFP %s
-; FIXME: Without vector support, v2i64 should be legal and we should
-; introduce a simple bitcast, which could fold into the store use
-; avoid the intermediate f registers.
define void @f1(ptr %ret, ptr %src) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; Z13-NEXT: lpq %r0, 0(%r3)
-; Z13-NEXT: stg %r1, 8(%r2)
-; Z13-NEXT: stg %r0, 0(%r2)
-; Z13-NEXT: br %r14
-
-; BASE: lpq %r0, 0(%r3)
-; BASE-NEXT: ldgr %f0, %r0
-; BASE-NEXT: ldgr %f2, %r1
-; BASE-NEXT: std %f0, 0(%r2)
-; BASE-NEXT: std %f2, 8(%r2)
-; BASE-NEXT: br %r14
+; CHECK-NEXT: lpq %r0, 0(%r3)
+; CHECK-NEXT: stg %r1, 8(%r2)
+; CHECK-NEXT: stg %r0, 0(%r2)
+; CHECK-NEXT: br %r14
; SOFTFP-LABEL: f1:
; SOFTFP: # %bb.0:
diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
index 422dcb63b531fe..57f1319365c4fd 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll
@@ -1,30 +1,17 @@
-; Test long double atomic stores. The atomic store is converted to i128
+; Test long double atomic stores - via i128.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
-
-; TODO: Is it worth testing softfp with vector?
; RUN: llc < %s -mtriple=s390x-linux-gnu -mattr=+soft-float | FileCheck -check-prefixes=SOFTFP %s
-
-; FIXME: With legal 128-bit operation to bitcast, the base code would
-; be the same as z13
define void @f1(ptr %dst, ptr %src) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; Z13-NEXT: lg %r1, 8(%r3)
-; Z13-NEXT: lg %r0, 0(%r3)
-; Z13-NEXT: stpq %r0, 0(%r2)
-; Z13-NEXT: bcr 1{{[45]}}, %r0
-; Z13-NEXT: br %r14
-
-; BASE-NEXT: ld %f0, 0(%r3)
-; BASE-NEXT: ld %f2, 8(%r3)
-; BASE-NEXT: lgdr %r1, %f2
-; BASE-NEXT: lgdr %r0, %f0
-; BASE-NEXT: stpq %r0, 0(%r2)
-; BASE-NEXT: bcr 15, %r0
-; BASE-NEXT: br %r14
+; CHECK-NEXT: lg %r1, 8(%r3)
+; CHECK-NEXT: lg %r0, 0(%r3)
+; CHECK-NEXT: stpq %r0, 0(%r2)
+; CHECK-NEXT: bcr 1{{[45]}}, %r0
+; CHECK-NEXT: br %r14
; SOFTFP-LABEL: f1:
; SOFTFP: # %bb.0:
@@ -99,13 +86,8 @@ define void @f2_fpuse(ptr %dst, ptr %src) {
; CHECK-NEXT: .cfi_def_cfa_offset 336
; CHECK-NEXT: ld %f0, 0(%r3)
; CHECK-NEXT: ld %f2, 8(%r3)
-
-; BASE-NEXT: lgr %r3, %r2
-; BASE-NEXT: axbr %f0, %f0
-
-; Z13-NEXT: axbr %f0, %f0
-; Z13-NEXT: lgr %r3, %r2
-
+; CHECK-DAG: lgr %r3, %r2
+; CHECK-DAG: axbr %f0, %f0
; CHECK-NEXT: la %r4, 160(%r15)
; CHECK-NEXT: lghi %r2, 16
; CHECK-NEXT: lhi %r5, 5
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
index 3c8ea19f86f860..21e7c6e586dfe4 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fmax-03.ll
@@ -20,10 +20,8 @@ define void @f1(ptr %ret, ptr %src, ptr %b) {
; CHECK: std [[FSL]], 176(%r15)
; CHECK: std [[FSH]], 184(%r15)
; CHECK: brasl %r14, fmaxl at PLT
-; CHECK: ld [[FL:%f[0-9]+]], 192(%r15)
-; CHECK: ld [[FH:%f[0-9]+]], 200(%r15)
-; CHECK: lgdr [[RH:%r[0-9]+]], [[FH]]
-; CHECK: lgdr [[RL:%r[0-9]+]], [[FL]]
+; CHECK: lg [[RH:%r[0-9]+]], 200(%r15)
+; CHECK: lg [[RL:%r[0-9]+]], 192(%r15)
; CHECK: lgdr [[RSH:%r[0-9]+]], [[FSH]]
; CHECK: lgdr [[RSL:%r[0-9]+]], [[FSL]]
; CHECK: cdsg [[RSL]], [[RL]], 0([[SRC]])
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
index dfa2cc021d1667..1c6f8e20aa4f8d 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-fmin-03.ll
@@ -20,10 +20,8 @@ define void @f1(ptr %ret, ptr %src, ptr %b) {
; CHECK: std [[FSL]], 176(%r15)
; CHECK: std [[FSH]], 184(%r15)
; CHECK: brasl %r14, fminl at PLT
-; CHECK: ld [[FL:%f[0-9]+]], 192(%r15)
-; CHECK: ld [[FH:%f[0-9]+]], 200(%r15)
-; CHECK: lgdr [[RH:%r[0-9]+]], [[FH]]
-; CHECK: lgdr [[RL:%r[0-9]+]], [[FL]]
+; CHECK: lg [[RH:%r[0-9]+]], 200(%r15)
+; CHECK: lg [[RL:%r[0-9]+]], 192(%r15)
; CHECK: lgdr [[RSH:%r[0-9]+]], [[FSH]]
; CHECK: lgdr [[RSL:%r[0-9]+]], [[FSL]]
; CHECK: cdsg [[RSL]], [[RL]], 0([[SRC]])
More information about the llvm-commits
mailing list