[llvm] SystemZ: Stop casting fp typed atomic loads in the IR (PR #90768)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu May 2 08:11:28 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/90768
>From ed796926ef1ab86789bbcedceff05a1ec859e1b8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 25 Apr 2024 21:51:52 +0200
Subject: [PATCH 1/3] SystemZ: Stop casting fp typed atomic loads in the IR
shouldCastAtomicLoadInIR is a hack that should be removed. Simple
bitcasting of operations should be in the domain of ordinary type
legalization and does not need to be done in the IR.
This introduces a code quality regression due to the hack
currently used to avoid using 128-bit values in the case where the
floating point value is ultimately used as an integer. This would be
avoidable if there were always a legal 128-bit type (like v2i64).
This is a pretty niche situation so I assume it's not important.
I implemented about 85% of the work necessary to make v2i64 legal,
but it was taking too long and I lack the necessary familiarity
with systemz to complete it. I've pushed it here for someone
to pick up:
https://github.com/arsenm/llvm-project/pull/new/systemz-legal-v2i64
---
.../Target/SystemZ/SystemZISelLowering.cpp | 64 +++++++++++++++++--
llvm/test/CodeGen/SystemZ/atomic-load-08.ll | 25 ++++++--
2 files changed, 76 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2da4431cf077eb..e62d742b221f53 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -294,6 +294,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// the atomic operations in order to exploit SystemZ instructions.
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::f128, Custom);
// Mark sign/zero extending atomic loads as legal, which will make
// DAGCombiner fold extensions into atomic loads if possible.
@@ -935,9 +936,6 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const
TargetLowering::AtomicExpansionKind
SystemZTargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
- // Lower fp128 the same way as i128.
- if (LI->getType()->isFP128Ty())
- return AtomicExpansionKind::CastToInteger;
return AtomicExpansionKind::None;
}
@@ -4550,7 +4548,9 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
- assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
+ assert(
+ (Node->getMemoryVT() == MVT::i128 || Node->getMemoryVT() == MVT::f128) &&
+ "Only custom lowering i128 or f128.");
// Use same code to handle both legal and non-legal i128 types.
SmallVector<SDValue, 2> Results;
LowerOperationWrapper(Node, Results, DAG);
@@ -6249,6 +6249,41 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
}
}
+// Manually lower a bitcast to avoid introducing illegal types after type
+// legalization.
+static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
+ SDValue Chain, const SDLoc &SL) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const DataLayout &DL = DAG.getDataLayout();
+
+ assert(DL.isBigEndian());
+
+ Align F128Align = DL.getPrefTypeAlign(Type::getFP128Ty(*DAG.getContext()));
+ SDValue StackTemp = DAG.CreateStackTemporary(MVT::f128, F128Align.value());
+ int FI = cast<FrameIndexSDNode>(StackTemp)->getIndex();
+ Align A = MF.getFrameInfo().getObjectAlign(FI);
+
+ SDValue Hi =
+ DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::i64, Src);
+ SDValue Lo =
+ DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::i64, Src);
+
+ TypeSize Offset = MVT(MVT::i64).getStoreSize();
+ SDValue StackTempOffset = DAG.getObjectPtrOffset(SL, StackTemp, Offset);
+
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+
+ SDValue StoreHi = DAG.getStore(Chain, SL, Lo, StackTempOffset, PtrInfo, A);
+ SDValue StoreLo =
+ DAG.getStore(Chain, SL, Hi, StackTemp, PtrInfo.getWithOffset(Offset),
+ commonAlignment(A, Offset));
+
+ SDValue StoreChain =
+ DAG.getNode(ISD::TokenFactor, SL, MVT::Other, {StoreLo, StoreHi});
+
+ return DAG.getLoad(MVT::f128, SL, StoreChain, StackTemp, PtrInfo, A);
+}
+
// Lower operations with invalid operand or result types (currently used
// only for 128-bit integer types).
void
@@ -6263,8 +6298,25 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
DL, Tys, Ops, MVT::i128, MMO);
- Results.push_back(lowerGR128ToI128(DAG, Res));
- Results.push_back(Res.getValue(1));
+
+ EVT VT = N->getValueType(0);
+
+ if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
+ SDValue Lowered = lowerGR128ToI128(DAG, Res);
+ Results.push_back(DAG.getBitcast(VT, Lowered));
+ Results.push_back(Res.getValue(1));
+ } else {
+ // For the f128 case, after type legalization, we cannot produce a bitcast
+ // with an illegal type (i.e. i128), so introduce a stack store and reload
+ //
+ // FIXME: Really v2i64 should be legal, and should be used in place of
+ // unttyped. Then we could emit the bitcast which will potentially avoid
+ // the stack usage after combining.
+ SDValue Cast = expandBitCastI128ToF128(DAG, Res, Res.getValue(1), DL);
+ Results.push_back(Cast);
+ Results.push_back(Cast.getValue(1));
+ }
+
break;
}
case ISD::ATOMIC_STORE: {
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 83050ef87591ae..540a95d558a341 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -1,17 +1,28 @@
-; Test long double atomic loads. These are emitted by the Clang FE as i128
-; loads with a bitcast, and this test case gets converted into that form as
-; well by the AtomicExpand pass.
+; Test long double atomic loads.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
+; FIXME: Without vector support, v2i64 should be legal and we should
+; introduce a simple bitcast instead of the stack temporary store and
+; reload
define void @f1(ptr %ret, ptr %src) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: lpq %r0, 0(%r3)
-; CHECK-NEXT: stg %r1, 8(%r2)
-; CHECK-NEXT: stg %r0, 0(%r2)
-; CHECK-NEXT: br %r14
+; Z13-NEXT: lpq %r0, 0(%r3)
+; Z13-NEXT: stg %r1, 8(%r2)
+; Z13-NEXT: stg %r0, 0(%r2)
+; Z13-NEXT: br %r14
+
+; BASE: aghi %r15, -176
+; BASE: lpq %r0, 0(%r3)
+; BASE-NEXT: stg %r1, 168(%r15)
+; BASE-NEXT: stg %r0, 160(%r15)
+; BASE-NEXT: ld %f0, 160(%r15)
+; BASE-NEXT: ld %f2, 168(%r15)
+; BASE-NEXT: std %f0, 0(%r2)
+; BASE-NEXT: std %f2, 8(%r2)
+; BASE-NEXT: aghi %r15, 176
%val = load atomic fp128, ptr %src seq_cst, align 16
store fp128 %val, ptr %ret, align 8
ret void
>From 200ce90f4cfd24cb5ec5cab2ca4485aa871f25d6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 2 May 2024 16:36:13 +0200
Subject: [PATCH 2/3] Avoid stack temp. this now produces better code in the
actually-used-as-fp case
---
.../Target/SystemZ/SystemZISelLowering.cpp | 30 ++++---------------
llvm/test/CodeGen/SystemZ/atomic-load-08.ll | 21 ++++---------
2 files changed, 11 insertions(+), 40 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index e62d742b221f53..87b78ed29899c3 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6253,35 +6253,17 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
// legalization.
static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
SDValue Chain, const SDLoc &SL) {
- MachineFunction &MF = DAG.getMachineFunction();
- const DataLayout &DL = DAG.getDataLayout();
-
- assert(DL.isBigEndian());
-
- Align F128Align = DL.getPrefTypeAlign(Type::getFP128Ty(*DAG.getContext()));
- SDValue StackTemp = DAG.CreateStackTemporary(MVT::f128, F128Align.value());
- int FI = cast<FrameIndexSDNode>(StackTemp)->getIndex();
- Align A = MF.getFrameInfo().getObjectAlign(FI);
-
SDValue Hi =
DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::i64, Src);
SDValue Lo =
DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::i64, Src);
- TypeSize Offset = MVT(MVT::i64).getStoreSize();
- SDValue StackTempOffset = DAG.getObjectPtrOffset(SL, StackTemp, Offset);
-
- MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ Hi = DAG.getBitcast(MVT::f64, Hi);
+ Lo = DAG.getBitcast(MVT::f64, Lo);
- SDValue StoreHi = DAG.getStore(Chain, SL, Lo, StackTempOffset, PtrInfo, A);
- SDValue StoreLo =
- DAG.getStore(Chain, SL, Hi, StackTemp, PtrInfo.getWithOffset(Offset),
- commonAlignment(A, Offset));
-
- SDValue StoreChain =
- DAG.getNode(ISD::TokenFactor, SL, MVT::Other, {StoreLo, StoreHi});
-
- return DAG.getLoad(MVT::f128, SL, StoreChain, StackTemp, PtrInfo, A);
+ SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, SL,
+ MVT::f128, Hi, Lo);
+ return SDValue(Pair, 0);
}
// Lower operations with invalid operand or result types (currently used
@@ -6314,7 +6296,7 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
// the stack usage after combining.
SDValue Cast = expandBitCastI128ToF128(DAG, Res, Res.getValue(1), DL);
Results.push_back(Cast);
- Results.push_back(Cast.getValue(1));
+ Results.push_back(Res.getValue(1));
}
break;
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 57440b66598e42..3c941b6dcb585b 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -16,16 +16,12 @@ define void @f1(ptr %ret, ptr %src) {
; Z13-NEXT: stg %r0, 0(%r2)
; Z13-NEXT: br %r14
-; BASE: aghi %r15, -176
; BASE: lpq %r0, 0(%r3)
-; BASE-NEXT: stg %r1, 168(%r15)
-; BASE-NEXT: stg %r0, 160(%r15)
-; BASE-NEXT: ld %f0, 160(%r15)
-; BASE-NEXT: ld %f2, 168(%r15)
+; BASE-NEXT: ldgr %f0, %r0
+; BASE-NEXT: ldgr %f2, %r1
; BASE-NEXT: std %f0, 0(%r2)
; BASE-NEXT: std %f2, 8(%r2)
-; BASE-NEXT: aghi %r15, 176
-
+; BASE-NEXT: br %r14
; SOFTFP-LABEL: f1:
; SOFTFP: # %bb.0:
@@ -41,24 +37,17 @@ define void @f1(ptr %ret, ptr %src) {
define void @f1_fpuse(ptr %ret, ptr %src) {
; CHECK-LABEL: f1_fpuse:
; CHECK: # %bb.0:
-; BASE-NEXT: aghi %r15, -176
-; BASE-NEXT: .cfi_def_cfa_offset 336
-
; CHECK-NEXT: lpq %r0, 0(%r3)
-; BASE-NEXT: stg %r1, 168(%r15)
-; BASE-NEXT: stg %r0, 160(%r15)
-; BASE-NEXT: ld %f0, 160(%r15)
-; BASE-NEXT: ld %f2, 168(%r15)
+; BASE-NEXT: ldgr %f0, %r0
+; BASE-NEXT: ldgr %f2, %r1
; Z13-NEXT: vlvgp %v0, %r0, %r1
; Z13-NEXT: vrepg %v2, %v0, 1
-
; CHECK-NEXT: axbr %f0, %f0
; CHECK-NEXT: std %f0, 0(%r2)
; CHECK-NEXT: std %f2, 8(%r2)
-; BASE-NEXT: aghi %r15, 176
; CHECK-NEXT: br %r14
>From c9fb212a39fec96a523af127fc52b323faa04602 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 2 May 2024 17:11:16 +0200
Subject: [PATCH 3/3] Fix formatting
---
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 87b78ed29899c3..c5df655e787e03 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -6261,8 +6261,7 @@ static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
Hi = DAG.getBitcast(MVT::f64, Hi);
Lo = DAG.getBitcast(MVT::f64, Lo);
- SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, SL,
- MVT::f128, Hi, Lo);
+ SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, SL, MVT::f128, Hi, Lo);
return SDValue(Pair, 0);
}
More information about the llvm-commits
mailing list