[llvm] [AMDGPU] Codegen for GFX12 VFLAT, VSCRATCH and VGLOBAL instructions (PR #75493)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 14 08:53:42 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-llvm-globalisel
Author: Mirko Brkušanin (mbrkusanin)
<details>
<summary>Changes</summary>
---
Patch is 1.14 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75493.diff
58 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+9)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+9-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructions.td (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+5-1)
- (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+46-11)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+17-3)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+11-8)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+233)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir (+70)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-flat.mir (+99-233)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomicrmw-add-global.mir (+106)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir (+71)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-local.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir (+634)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global-saddr.mir (+190)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.mir (+745)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-global.s96.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir (+401)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-local.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir (+563)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.mir (+614)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-global.s96.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir (+188)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-global.mir (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll (+53)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+73)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll (+52)
- (modified) llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll (+86)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll (+283)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll (+300)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+816)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+2141)
- (added) llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll (+53)
- (added) llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll (+59)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll (+176-59)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+1485-44)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-store.ll (+452)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+1898)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll (+19-10)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (+110)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+114)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-flat.ll (+1020)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-global.ll (+1133)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 09e88152e65d2a..7fec7e5a62bdcf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2460,6 +2460,15 @@ def int_amdgcn_s_wait_event_export_ready :
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
>;
+//===----------------------------------------------------------------------===//
+// GFX12 Intrinsics
+//===----------------------------------------------------------------------===//
+
+def int_amdgcn_flat_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_flat_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+def int_amdgcn_global_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
+
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 66ba08ef0dc12a..1904e04c8b8293 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1166,6 +1166,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
if (isNoUnsignedWrap(Addr))
return true;
+ // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+ // values.
+ if (AMDGPU::isGFX12Plus(*Subtarget))
+ return true;
+
auto LHS = Addr.getOperand(0);
auto RHS = Addr.getOperand(1);
@@ -1682,7 +1687,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
}
VAddr = Addr;
- Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
return true;
}
@@ -1750,7 +1755,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VOffset = SDValue(VMov, 0);
SAddr = LHS;
- Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
return true;
}
}
@@ -1790,7 +1795,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
}
if (SAddr) {
- Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
return true;
}
}
@@ -1806,7 +1811,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
VOffset = SDValue(VMov, 0);
- Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 75fac09d0b99fa..2fbe02eda682da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4541,6 +4541,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
if (isNoUnsignedWrap(AddrMI))
return true;
+ // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+ // values.
+ if (AMDGPU::isGFX12Plus(STI))
+ return true;
+
Register LHS = AddrMI->getOperand(1).getReg();
Register RHS = AddrMI->getOperand(2).getReg();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 121026aca60355..eaf72d7157ee2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -642,6 +642,10 @@ defm int_amdgcn_global_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
+defm int_amdgcn_flat_atomic_fmin_num : noret_op;
+defm int_amdgcn_flat_atomic_fmax_num : noret_op;
+defm int_amdgcn_global_atomic_fmin_num : noret_op;
+defm int_amdgcn_global_atomic_fmax_num : noret_op;
multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
let HasNoUse = true in
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 03b6d19b2b3c06..e0162cff4b7a28 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4638,9 +4638,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
return getDefaultMappingAllVGPR(MI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 317f3f21d24002..3cb68eb46eb0d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -241,9 +241,13 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index d754058d4f0645..53a9fc2d8e7f66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1026,6 +1026,8 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
OpIndexes.push_back(0);
return true;
default:
@@ -1100,7 +1102,9 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
- case Intrinsic::amdgcn_flat_atomic_fmin: {
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num: {
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
unsigned NewAS = SrcTy->getPointerAddressSpace();
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f3771bff247fd8..0dd2b3f5c2c912 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1072,19 +1072,43 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
-multiclass FlatAtomicPat <string inst, string node, ValueType vt,
- ValueType data_vt = vt> {
- defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size);
- defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
-
- def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> {
+ defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size));
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
+multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> {
+ defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size));
+
+ def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
+ (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatAtomicPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> :
+ FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>,
+ FlatAtomicNoRtnPat<inst, node, vt, data_vt, isIntr>;
+
+multiclass FlatAtomicIntrNoRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defm : FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
+multiclass FlatAtomicIntrRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defm : FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
+multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> :
+ FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>,
+ FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+
class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
@@ -1305,10 +1329,10 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
multiclass GlobalFLATAtomicPatsNoRtnBase<string inst, string node, ValueType vt,
ValueType data_vt = vt> {
let AddedComplexity = 11 in
- def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<PatFrags>(node), vt, data_vt>;
+ def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<SDPatternOperator>(node), vt, data_vt>;
let AddedComplexity = 13 in
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<PatFrags>(node), vt, data_vt>;
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), vt, data_vt>;
}
multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
@@ -1508,10 +1532,14 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
let OtherPredicates = [isGFX10Plus] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
+}
+
+let OtherPredicates = [isGFX10GFX11] in {
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
}
@@ -1527,6 +1555,13 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_f
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
}
+let OtherPredicates = [isGFX12Only] in {
+ defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
+ defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
+ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+}
+
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 708f212e204acf..7cf6186b334be2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1230,9 +1230,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1315,6 +1319,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_csub: {
@@ -8602,8 +8608,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
- case Intrinsic::amdgcn_flat_atomic_fmax: {
+ case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
@@ -8613,12 +8623,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned Opcode = 0;
switch (IntrID) {
case Intrinsic::amdgcn_global_atomic_fmin:
- case Intrinsic::amdgcn_flat_atomic_fmin: {
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
- case Intrinsic::amdgcn_flat_atomic_fmax: {
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4746b559d9256..4fa480c24f2c82 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8644,16 +8644,13 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
return false;
- bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
- if (ST.hasNegativeScratchOffsetBug() &&
- FlatVariant == SIInstrFlags::FlatScratch)
- AllowNegative = false;
if (ST.hasNegativeUnalignedScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
(Offset % 4) != 0) {
return false;
}
+ bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
}
@@ -8664,12 +8661,10 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
uint64_t FlatVariant) const {
int64_t RemainderOffset = COffsetVal;
int64_t ImmField = 0;
- bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
- if (ST.hasNegativeScratchOffsetBug() &&
- FlatVariant == SIInstrFlags::FlatScratch)
- AllowNegative = false;
+ bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
+
if (AllowNegative) {
// Use signed division by a power of two to truncate towards 0.
int64_t D = 1LL << NumBits;
@@ -8693,6 +8688,14 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
return {ImmField, RemainderOffset};
}
+bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
+ if (ST.hasNegativeScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch)
+ return false;
+
+ return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
+}
+
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
switch (ST.getGeneration()) {
default:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 32a81d67cc246c..ad4220f3ad8e92 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1270,6 +1270,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned AddrSpace,
uint64_t FlatVariant) const;
+ /// Returns true if negative offsets are allowed for the given \p FlatVariant.
+ bool allowNegativeFlatOffset(uint64_t FlatVariant) const;
+
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index 3f931412527300..83266f8d8386ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %data) {
; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index ec2cd43e5fb5df..4603fbcd525c78 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -3...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/75493
More information about the llvm-commits
mailing list