[llvm] [AMDGPU] Legalize 64bit elements for BUILD_VECTOR on gfx942 (PR #145052)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 20 08:32:07 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Janek van Oirschot (JanekvO)
<details>
<summary>Changes</summary>
Any subtarget with full 64-bit mov support may be able to take advantage of BUILD_VECTOR with 64-bit vector elements. This changes BUILD_VECTOR selection to support 64-bit elements and adds a BUILD_VECTOR combine that tries to fold a BUILD_VECTOR with 32-bit elements into its 64-bit equivalent.
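As a rough illustration (a hypothetical example, not taken from the patch or its tests), a function like the one below builds a `<2 x i64>` from scalars; on a subtarget with 64-bit movs the v2i64 BUILD_VECTOR is now kept Legal instead of being promoted to v4i32, so selection can form a REG_SEQUENCE from 64-bit sub-registers:

```llvm
; Hypothetical sketch: a <2 x i64> built from scalar arguments. With this
; change, BUILD_VECTOR v2i64 stays legal on subtargets with 64-bit movs
; (hasMovB64), so each i64 element maps onto a 64-bit sub-register of the
; REG_SEQUENCE instead of two 32-bit channels.
define <2 x i64> @build_v2i64(i64 %a, i64 %b) {
  %v0 = insertelement <2 x i64> poison, i64 %a, i32 0
  %v1 = insertelement <2 x i64> %v0, i64 %b, i32 1
  ret <2 x i64> %v1
}
```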
---
Patch is 3.28 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145052.diff
45 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+15-7)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+8)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+99-15)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1)
- (modified) llvm/test/CodeGen/AMDGPU/bf16-conversions.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll (+154-85)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll (+162-89)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f64.ll (+618-214)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll (+154-85)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-rtn.ll (+162-89)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+16-32)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll (+101-58)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll (+315-275)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll (+315-275)
- (modified) llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll (+12-17)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.ll (+988-473)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.ll (+988-473)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+288-141)
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+3-5)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v2i64.ll (+140-94)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v3i64.ll (+337-238)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll (+817-456)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll (+2575-1764)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v2p0.ll (+140-94)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v3p0.ll (+337-238)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll (+817-456)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v2i64.ll (+429-256)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v3i64.ll (+1151-691)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll (+2800-2259)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v2p0.ll (+429-256)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v3p0.ll (+1151-691)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll (+2800-2259)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v2i64.ll (+1839-962)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll (+1376-1013)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll (+4594-3830)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v2p0.ll (+1839-962)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll (+1376-1013)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll (+4594-3830)
- (modified) llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll (+11-14)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+2-3)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6e990cb2e160c..044c073e7f918 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -440,6 +440,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT EltVT = VT.getVectorElementType();
SDLoc DL(N);
SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
+ unsigned NumRegs = EltVT.getSizeInBits() / 32;
+ bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
if (NumVectorElts == 1) {
CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
@@ -449,12 +451,13 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
"supported yet");
+ assert((IsGCN || (!IsGCN && NumRegs == 1)) &&
+ "R600 does not support 64-bit reg_seq elements");
// 32 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
- bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN();
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
unsigned NOps = N->getNumOperands();
@@ -464,8 +467,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
IsRegSeq = false;
break;
}
- unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
- : R600RegisterInfo::getSubRegFromChannel(i);
+ unsigned Sub =
+ IsGCN ? SIRegisterInfo::getSubRegFromChannel(i * NumRegs, NumRegs)
+ : R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
@@ -475,8 +479,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
- unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
- : R600RegisterInfo::getSubRegFromChannel(i);
+ unsigned Sub =
+ IsGCN ? SIRegisterInfo::getSubRegFromChannel(i * NumRegs, NumRegs)
+ : R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
CurDAG->getTargetConstant(Sub, DL, MVT::i32);
@@ -644,9 +649,12 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
}
- assert(VT.getVectorElementType().bitsEq(MVT::i32));
+ EVT VET = VT.getVectorElementType();
+ assert(VET.bitsEq(MVT::i32) || VET.bitsEq(MVT::i64));
+ unsigned EltSize = VET.getSizeInBits();
unsigned RegClassID =
- SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
+ SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * EltSize)
+ ->getID();
SelectBuildVector(N, RegClassID);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 134adc681215f..8a12b9b25d713 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5206,6 +5206,14 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BITCAST: {
EVT DestVT = N->getValueType(0);
+ // Avoid undoing build_vector with 64b elements if subtarget supports 64b
+ // movs (i.e., avoid inf loop through combines).
+ if (Subtarget->isGCN()) {
+ const GCNSubtarget &ST = DAG.getSubtarget<GCNSubtarget>();
+ if (ST.hasMovB64())
+ break;
+ }
+
// Push casts through vector builds. This helps avoid emitting a large
// number of copies when materializing floating point vector constants.
//
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 07d79d677104a..2dc12357cb4b0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -357,9 +357,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Most operations are naturally 32-bit vector operations. We only support
// load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
- setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
- AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
-
+ if (STI.hasMovB64())
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+ else {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
+ }
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
@@ -371,9 +374,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
- setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
- AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
-
+ if (STI.hasMovB64())
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+ else {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
+ }
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
@@ -385,9 +391,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
- setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
- AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
-
+ if (STI.hasMovB64())
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+ else {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
+ }
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
@@ -399,9 +408,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
- setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
- AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
-
+ if (STI.hasMovB64())
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+ else {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
+ }
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
@@ -413,9 +425,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
- setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
- AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
-
+ if (STI.hasMovB64())
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Legal);
+ else {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
+ }
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
@@ -945,6 +960,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
setTargetDAGCombine({ISD::ADD,
+ ISD::BUILD_VECTOR,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
@@ -15486,6 +15502,72 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
return SDValue(CSrc, 0);
}
+SDValue
+SITargetLowering::performBuildVectorCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ const GCNSubtarget *ST = getSubtarget();
+ if (DCI.Level < AfterLegalizeDAG || !ST->hasMovB64())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ unsigned SizeBits = VT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+ // Skip if:
+ // - Value type isn't multiplication of 64 bit (e.g., v3i32), or
+ // - BuildVector instruction has non-constants, or
+ // - Element type has already been combined into i64 elements
+ if ((SizeBits % 64) != 0 || !BVN->isConstant() || EltVT == MVT::i64)
+ return SDValue();
+
+ // Construct the 64b values.
+ SmallVector<uint64_t, 8> ImmVals;
+ uint64_t ImmVal = 0;
+ uint64_t ImmSize = 0;
+ for (SDValue Opand : N->ops()) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Opand);
+ if (!C)
+ return SDValue();
+
+ ImmVal |= C->getZExtValue() << ImmSize;
+ ImmSize += EltSize;
+ if (ImmSize > 64)
+ return SDValue();
+ if (ImmSize == 64) {
+ if (!isUInt<32>(ImmVal))
+ return SDValue();
+ ImmVals.push_back(ImmVal);
+ ImmVal = 0;
+ ImmSize = 0;
+ }
+ }
+
+ // Avoid emitting build_vector with 1 element and directly emit value.
+ if (ImmVals.size() == 1) {
+ SDValue Val = DAG.getConstant(ImmVals[0], SL, MVT::i64);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Val);
+ }
+
+ // Construct and return build_vector with 64b elements.
+ if (!ImmVals.empty()) {
+ SmallVector<SDValue, 8> VectorConsts;
+ for (uint64_t I : ImmVals)
+ VectorConsts.push_back(DAG.getConstant(I, SL, MVT::i64));
+ unsigned NewNumElts = SizeBits / 64;
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT NewVT = EVT::getVectorVT(Ctx, MVT::i64, NewNumElts);
+ SDValue BV = DAG.getBuildVector(
+ NewVT, SL, ArrayRef(VectorConsts.begin(), VectorConsts.end()));
+ return DAG.getBitcast(VT, BV);
+ }
+ return SDValue();
+}
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -15573,6 +15655,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFCanonicalizeCombine(N, DCI);
case AMDGPUISD::RCP:
return performRcpCombine(N, DCI);
+ case ISD::BUILD_VECTOR:
+ return performBuildVectorCombine(N, DCI);
case ISD::FLDEXP:
case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 89fb12b52c3e6..8be0631b37dd2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -231,6 +231,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index a597faa028f22..8c48f9a4a7b5b 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -152,24 +152,24 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
; GFX-950: ; %bb.0:
; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3]
; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
-; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]|
+; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[2:3]|, |v[4:5]|
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5]
-; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7
-; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3]
-; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2
+; GFX-950-NEXT: v_and_b32_e32 v2, 1, v6
+; GFX-950-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1]
+; GFX-950-NEXT: v_cvt_f32_f64_e32 v8, v[0:1]
+; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2
+; GFX-950-NEXT: v_add_u32_e32 v7, v6, v7
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1]
-; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
-; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
-; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5
-; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]|
+; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
+; GFX-950-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc
+; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]|
; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3]
-; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6
-; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0
+; GFX-950-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX-950-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[0:1]
+; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX-950-NEXT: v_add_u32_e32 v5, v8, v5
; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1]
-; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX-950-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
; GFX-950-NEXT: ; return to shader part epilog
%res = fptrunc <2 x double> %src to <2 x bfloat>
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
index a14114358433a..9b783ed9d7772 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GFX90A_GFX942,GFX942 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) {
@@ -167,25 +167,41 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr
; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
; GFX908_GFX11-NEXT: S_ENDPGM 0
;
- ; GFX90A_GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
- ; GFX90A_GFX942: bb.0 (%ir-block.0):
- ; GFX90A_GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
- ; GFX90A_GFX942-NEXT: {{ $}}
- ; GFX90A_GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
- ; GFX90A_GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
- ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
- ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
- ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
- ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
- ; GFX90A_GFX942-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
- ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
- ; GFX90A_GFX942-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
- ; GFX90A_GFX942-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
- ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
- ; GFX90A_GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
- ; GFX90A_GFX942-NEXT: S_ENDPGM 0
+ ; GFX90A-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
+ ; GFX90A: bb.0 (%ir-block.0):
+ ; GFX90A-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1
+ ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0
+ ; GFX90A-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub1
+ ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0
+ ; GFX90A-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3
+ ; GFX90A-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8)
+ ; GFX90A-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn
+ ; GFX942: bb.0 (%ir-block.0):
+ ; GFX942-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX942-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX942-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX942-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX942-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX942-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[REG_SEQUENCE1]], %subreg.sub0_sub1, killed [[REG_SEQUENCE]], %subreg.sub2_sub3
+ ; GFX942-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenc...
[truncated]
``````````
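For the new combine itself: performBuildVectorCombine only fires after legalization, on all-constant BUILD_VECTORs whose total width is a multiple of 64 bits and whose packed 64-bit values each still fit an unsigned 32-bit immediate. A minimal sketch of a case that meets those conditions (hypothetical; whether a BUILD_VECTOR is actually formed here depends on how the constant gets materialized):

```llvm
; Adjacent pairs of 32-bit elements pack little-endian into one 64-bit
; value: <1, 0> -> 1 and <2, 0> -> 2. Both pass the isUInt<32> check, so
; the combine can rebuild this as a <2 x i64> BUILD_VECTOR of constants
; and bitcast it back to <4 x i32>.
define <4 x i32> @const_v4i32() {
  ret <4 x i32> <i32 1, i32 0, i32 2, i32 0>
}
```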
</details>
https://github.com/llvm/llvm-project/pull/145052