[llvm] SystemZ: Stop casting fp typed atomic loads in the IR (PR #90768)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed May 1 13:00:54 PDT 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/90768
shouldCastAtomicLoadInIR is a hack that should be removed. Simple bitcasting of operations should be in the domain of ordinary type legalization and does not need to be done in the IR.
This introduces a code quality regression due to the hack currently used to avoid using 128-bit values in the case where the floating-point value is ultimately used as an integer. This would be avoidable if there were always a legal 128-bit type (like v2i64). This is a pretty niche situation, so I assume it's not important.
I implemented about 85% of the work necessary to make v2i64 legal, but it was taking too long and I lack the necessary familiarity with SystemZ to complete it. I've pushed it here for someone to pick up:
https://github.com/arsenm/llvm-project/pull/new/systemz-legal-v2i64
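For context, the hook acts before instruction selection: when shouldCastAtomicLoadInIR returns CastToInteger, AtomicExpandPass rewrites the fp128 atomic load in the IR as an i128 atomic load followed by a bitcast. Roughly (an illustrative sketch, not actual compiler output):

    ; previously rewritten in the IR from
    ;   %val = load atomic fp128, ptr %src seq_cst, align 16
    ; to
    %int = load atomic i128, ptr %src seq_cst, align 16
    %val = bitcast i128 %int to fp128

After this change the fp128 atomic load survives to SelectionDAG and is handled by the custom lowering in the patch below.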
From b235a86c59fbbb85d9762e6797fa0ada48b4e9aa Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 25 Apr 2024 21:51:52 +0200
Subject: [PATCH] SystemZ: Stop casting fp typed atomic loads in the IR
shouldCastAtomicLoadInIR is a hack that should be removed. Simple
bitcasting of operations should be in the domain of ordinary type
legalization and does not need to be done in the IR.
This introduces a code quality regression due to the hack
currently used to avoid using 128-bit values in the case where the
floating-point value is ultimately used as an integer. This would be
avoidable if there were always a legal 128-bit type (like v2i64).
This is a pretty niche situation, so I assume it's not important.
I implemented about 85% of the work necessary to make v2i64 legal,
but it was taking too long and I lack the necessary familiarity
with SystemZ to complete it. I've pushed it here for someone
to pick up:
https://github.com/arsenm/llvm-project/pull/new/systemz-legal-v2i64
---
.../Target/SystemZ/SystemZISelLowering.cpp | 64 +++++++++++++++++--
llvm/test/CodeGen/SystemZ/atomic-load-08.ll | 21 ++++--
2 files changed, 75 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2da4431cf077eb..e62d742b221f53 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -294,6 +294,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// the atomic operations in order to exploit SystemZ instructions.
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::f128, Custom);
// Mark sign/zero extending atomic loads as legal, which will make
// DAGCombiner fold extensions into atomic loads if possible.
@@ -935,9 +936,6 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const
TargetLowering::AtomicExpansionKind
SystemZTargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const {
- // Lower fp128 the same way as i128.
- if (LI->getType()->isFP128Ty())
- return AtomicExpansionKind::CastToInteger;
return AtomicExpansionKind::None;
}
@@ -4550,7 +4548,9 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
- assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
+ assert(
+ (Node->getMemoryVT() == MVT::i128 || Node->getMemoryVT() == MVT::f128) &&
+ "Only custom lowering i128 or f128.");
// Use same code to handle both legal and non-legal i128 types.
SmallVector<SDValue, 2> Results;
LowerOperationWrapper(Node, Results, DAG);
@@ -6249,6 +6249,41 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
}
}
+// Manually lower a bitcast to avoid introducing illegal types after type
+// legalization.
+static SDValue expandBitCastI128ToF128(SelectionDAG &DAG, SDValue Src,
+ SDValue Chain, const SDLoc &SL) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const DataLayout &DL = DAG.getDataLayout();
+
+ assert(DL.isBigEndian());
+
+ Align F128Align = DL.getPrefTypeAlign(Type::getFP128Ty(*DAG.getContext()));
+ SDValue StackTemp = DAG.CreateStackTemporary(MVT::f128, F128Align.value());
+ int FI = cast<FrameIndexSDNode>(StackTemp)->getIndex();
+ Align A = MF.getFrameInfo().getObjectAlign(FI);
+
+ SDValue Hi =
+ DAG.getTargetExtractSubreg(SystemZ::subreg_h64, SL, MVT::i64, Src);
+ SDValue Lo =
+ DAG.getTargetExtractSubreg(SystemZ::subreg_l64, SL, MVT::i64, Src);
+
+ TypeSize Offset = MVT(MVT::i64).getStoreSize();
+ SDValue StackTempOffset = DAG.getObjectPtrOffset(SL, StackTemp, Offset);
+
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+
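+  // Big-endian layout: the high 64 bits of the i128 source go at offset 0
+  // and the low 64 bits at offset 8 within the in-memory f128.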
+  SDValue StoreHi = DAG.getStore(Chain, SL, Hi, StackTemp, PtrInfo, A);
+  SDValue StoreLo =
+      DAG.getStore(Chain, SL, Lo, StackTempOffset,
+                   PtrInfo.getWithOffset(Offset), commonAlignment(A, Offset));
+
+ SDValue StoreChain =
+ DAG.getNode(ISD::TokenFactor, SL, MVT::Other, {StoreLo, StoreHi});
+
+ return DAG.getLoad(MVT::f128, SL, StoreChain, StackTemp, PtrInfo, A);
+}
+
// Lower operations with invalid operand or result types (currently used
// only for 128-bit integer types).
void
@@ -6263,8 +6298,25 @@ SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
DL, Tys, Ops, MVT::i128, MMO);
- Results.push_back(lowerGR128ToI128(DAG, Res));
- Results.push_back(Res.getValue(1));
+
+ EVT VT = N->getValueType(0);
+
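+    // An i128 result needs no conversion, and when i128 is a legal type
+    // (subtargets with vector support) a plain bitcast to f128 suffices.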
+ if (VT == MVT::i128 || isTypeLegal(MVT::i128)) {
+ SDValue Lowered = lowerGR128ToI128(DAG, Res);
+ Results.push_back(DAG.getBitcast(VT, Lowered));
+ Results.push_back(Res.getValue(1));
+ } else {
+      // For the f128 case, after type legalization, we cannot produce a
+      // bitcast with an illegal type (i.e. i128), so introduce a stack store
+      // and reload.
+      //
+      // FIXME: Really v2i64 should be legal, and should be used in place of
+      // untyped. Then we could emit the bitcast, which would potentially
+      // avoid the stack usage after combining.
+ SDValue Cast = expandBitCastI128ToF128(DAG, Res, Res.getValue(1), DL);
+ Results.push_back(Cast);
+ Results.push_back(Cast.getValue(1));
+ }
+
break;
}
case ISD::ATOMIC_STORE: {
diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
index 83050ef87591ae..94b8595bef2933 100644
--- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
+++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll
@@ -5,13 +5,26 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefixes=CHECK,BASE %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck -check-prefixes=CHECK,Z13 %s
+; FIXME: Even without vector support, v2i64 should be legal and we
+; should introduce a simple bitcast instead of the stack temporary
+; store and reload.
define void @f1(ptr %ret, ptr %src) {
; CHECK-LABEL: f1:
; CHECK: # %bb.0:
-; CHECK-NEXT: lpq %r0, 0(%r3)
-; CHECK-NEXT: stg %r1, 8(%r2)
-; CHECK-NEXT: stg %r0, 0(%r2)
-; CHECK-NEXT: br %r14
+; Z13-NEXT: lpq %r0, 0(%r3)
+; Z13-NEXT: stg %r1, 8(%r2)
+; Z13-NEXT: stg %r0, 0(%r2)
+; Z13-NEXT: br %r14
+
+; BASE: aghi %r15, -176
+; BASE: lpq %r0, 0(%r3)
+; BASE-NEXT: stg %r1, 168(%r15)
+; BASE-NEXT: stg %r0, 160(%r15)
+; BASE-NEXT: ld %f0, 160(%r15)
+; BASE-NEXT: ld %f2, 168(%r15)
+; BASE-NEXT: std %f0, 0(%r2)
+; BASE-NEXT: std %f2, 8(%r2)
+; BASE-NEXT: aghi %r15, 176
%val = load atomic fp128, ptr %src seq_cst, align 16
store fp128 %val, ptr %ret, align 8
ret void