[llvm] [AMDGPU][GlobalISel] Enable vector reductions (PR #131413)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 14 18:31:01 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Tim Gymnich (tgymnich)
<details>
<summary>Changes</summary>
fixes https://github.com/llvm/llvm-project/issues/114816
---
Patch is 1.64 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131413.diff
20 Files Affected:
- (modified) llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h (+1)
- (modified) llvm/include/llvm/CodeGen/GlobalISel/Utils.h (+2)
- (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+61)
- (modified) llvm/lib/CodeGen/GlobalISel/Utils.cpp (+10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+15-1)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/add.ll (+2388)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/and.ll (+2352)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/fadd.ll (+1461)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/fmax.ll (+1806)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/fmaximum.ll (+2671)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/fmin.ll (+1805)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/fminimum.ll (+2675)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/fmul.ll (+1461)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/mul.ll (+2711)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/or.ll (+2347)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/smax.ll (+3098)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/smin.ll (+3098)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/umax.ll (+3031)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/umin.ll (+2678)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/xor.ll (+2276)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 4e18f5cc913a7..3b02b23dcd7b9 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -426,6 +426,7 @@ class LegalizerHelper {
LegalizeResult lowerMinMax(MachineInstr &MI);
LegalizeResult lowerFCopySign(MachineInstr &MI);
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI);
+ LegalizeResult lowerFMinimum_FMaximum(MachineInstr &MI);
LegalizeResult lowerFMad(MachineInstr &MI);
LegalizeResult lowerIntrinsicRound(MachineInstr &MI);
LegalizeResult lowerFFloor(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
index a35ecae5d18bf..ad9ca2ad26f6a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -343,6 +343,8 @@ inline bool isKnownNeverSNaN(Register Val, const MachineRegisterInfo &MRI) {
return isKnownNeverNaN(Val, MRI, true);
}
+bool isKnownNeverZeroFloat(Register Val, const MachineRegisterInfo &MRI);
+
Align inferAlignFromPtrInfo(MachineFunction &MF, const MachinePointerInfo &MPO);
/// Return a virtual register corresponding to the incoming argument register \p
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ed8bd25698c03..b20a79a24d70c 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
@@ -32,6 +33,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
@@ -4594,6 +4596,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
case G_FMINNUM:
case G_FMAXNUM:
return lowerFMinNumMaxNum(MI);
+ case G_FMINIMUM:
+ case G_FMAXIMUM:
+ return lowerFMinimum_FMaximum(MI);
case G_MERGE_VALUES:
return lowerMergeValues(MI);
case G_UNMERGE_VALUES:
@@ -8165,6 +8170,62 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
return Legalized;
}
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFMinimum_FMaximum(MachineInstr &MI) {
+ auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Dst);
+ unsigned Opc = MI.getOpcode();
+ bool IsMax = Opc == TargetOpcode::G_FMAXIMUM;
+
+ Register MinMax;
+ unsigned CompOpcIeee = IsMax ? TargetOpcode::G_FMAXNUM_IEEE : TargetOpcode::G_FMINNUM_IEEE;
+ unsigned CompOpc = IsMax ? TargetOpcode::G_FMAXNUM : TargetOpcode::G_FMINNUM;
+ CmpInst::Predicate CompPred = IsMax ? CmpInst::FCMP_OGT : CmpInst::FCMP_OLT;
+ LLT S1 = LLT::scalar(1);
+ const fltSemantics &FPSem = getFltSemanticForLLT(Ty);
+
+ bool MinMaxMustRespectOrderedZero = false;
+
+ if (LI.isLegalOrCustom({CompOpcIeee, Ty})) {
+ MinMax = MIRBuilder.buildInstr(CompOpcIeee, {Ty}, {Src0, Src1}).getReg(0);
+ MinMaxMustRespectOrderedZero = true;
+ } else if (LI.isLegalOrCustom({CompOpc, Ty})) {
+ MinMax = MIRBuilder.buildInstr(CompOpc, {Ty}, {Src0, Src1}).getReg(0);
+ } else {
+ // NaN (if exists) will be propagated later, so orderness doesn't matter.
+ auto Comp = MIRBuilder.buildFCmp(CompPred, S1, Src0, Src1);
+ MinMax = MIRBuilder.buildSelect(Ty, Comp,Src0, Src1).getReg(0);
+ }
+
+ // Propagate any NaN of both operands
+ if (!MI.getFlag(MachineInstr::FmNoNans) && (!isKnownNeverNaN(Src0, MRI) || !isKnownNeverNaN(Src1, MRI))) {
+ auto FPNaN = MIRBuilder.buildFConstant(Ty, APFloat::getNaN(FPSem));
+ auto Comp = MIRBuilder.buildFCmp(CmpInst::Predicate::FCMP_UNO, S1, Src0, Src1);
+ MinMax = MIRBuilder.buildSelect(Ty, Comp, FPNaN, MinMax).getReg(0);
+ }
+
+ // fminimum/fmaximum requires -0.0 less than +0.0
+ if (!MinMaxMustRespectOrderedZero && !MI.getFlag(MachineInstr::FmNsz) &&
+ !isKnownNeverZeroFloat(Src0, MRI) && !isKnownNeverZeroFloat(Src1, MRI)) {
+ auto Zero = MIRBuilder.buildFConstant(Ty, APFloat::getZero(FPSem));
+ auto IsZero = MIRBuilder.buildFCmp(CmpInst::Predicate::FCMP_OEQ, S1,MinMax, Zero);
+
+ unsigned TestZeroMask = IsMax ? fcPosZero : fcNegZero;
+
+ auto Src0Zero = MIRBuilder.buildIsFPClass(S1, Src0, TestZeroMask);
+ auto Src0Comp = MIRBuilder.buildSelect(Ty, Src0Zero, Src0, MinMax);
+
+ auto Src1Zero = MIRBuilder.buildIsFPClass(S1, Src1, TestZeroMask);
+ auto Src1Comp = MIRBuilder.buildSelect(Ty, Src1Zero, Src1, Src0Comp);
+
+ MinMax = MIRBuilder.buildSelect(Ty, IsZero, Src1Comp, MinMax).getReg(0);
+ }
+
+ MRI.replaceRegWith(Dst, MinMax);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
// Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 625d556e3ff5e..59764fc74e928 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -890,6 +890,16 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
return false;
}
+bool llvm::isKnownNeverZeroFloat(Register Reg, const MachineRegisterInfo &MRI) {
+ std::optional<FPValueAndVReg> FPValReg;
+ if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
+ if (!FPValReg->Value.isZero())
+ return true;
+ }
+
+ return false;
+}
+
Align llvm::inferAlignFromPtrInfo(MachineFunction &MF,
const MachinePointerInfo &MPO) {
auto PSV = dyn_cast_if_present<const PseudoSourceValue *>(MPO.V);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b3a8183beeacf..fc2d4954df8c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -350,6 +350,12 @@ static std::initializer_list<LLT> AllS32Vectors = {
static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
V6S64, V7S64, V8S64, V16S64};
+static std::initializer_list<LLT> AllVectors{
+ V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
+ V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
+ V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
+ V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
+
// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
if (Ty.isPointerOrPointerVector())
@@ -2090,7 +2096,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampMaxNumElements(0, S16, 2)
.scalarize(0);
} else {
- // TODO: Implement
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
}
@@ -2106,6 +2111,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
+ getActionDefinitionsBuilder(
+ {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
+ G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
+ G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
+ G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
+ .legalFor(AllVectors)
+ .scalarize(1)
+ .lower();
+
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/add.ll
new file mode 100644
index 0000000000000..76c97f4327aa3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm-intrinsics/reduce/add.ll
@@ -0,0 +1,2388 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
+
+define i8 @test_vector_reduce_add_v2i8(<2 x i8> %v) {
+; GFX7-LABEL: test_vector_reduce_add_v2i8:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_vector_reduce_add_v2i8:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_vector_reduce_add_v2i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_vector_reduce_add_v2i8:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_vector_reduce_add_v2i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_nc_u16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_vector_reduce_add_v2i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_nc_u16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %res = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v)
+ ret i8 %res
+}
+
+define i8 @test_vector_reduce_add_v3i8(<3 x i8> %v) {
+; GFX7-LABEL: test_vector_reduce_add_v3i8:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_vector_reduce_add_v3i8:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT: v_add_u16_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_vector_reduce_add_v3i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_add_u16_e32 v0, v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_vector_reduce_add_v3i8:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_vector_reduce_add_v3i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_nc_u16 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u16 v0, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_vector_reduce_add_v3i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_nc_u16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %res = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %v)
+ ret i8 %res
+}
+
+define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) {
+; GFX7-LABEL: test_vector_reduce_add_v4i8:
+; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 8
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: test_vector_reduce_add_v4i8:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, 8
+; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX8-NEXT: v_or_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX8-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX8-NEXT: v_bfe_u32 v0, v0, 0, 8
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: test_vector_reduce_add_v4i8:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xff
+; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX9-NEXT: v_and_or_b32 v6, v0, v4, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX9-NEXT: v_or3_b32 v2, v6, v2, v3
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_or_b32 v1, v0, v4, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX9-NEXT: v_add_u16_e32 v0, v0, v1
+; GFX9-NEXT: v_bfe_u32 v0, v0, 0, 8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: test_vector_reduce_add_v4i8:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, 8
+; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT: v_and_or_b32 v5, 0xff, v0, v5
+; GFX10-NEXT: v_or3_b32 v2, v5, v2, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_add_nc_u16 v1, v1, v3
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_or_b32 v1, 0xff, v0, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-NEXT: v_add_nc_u16 v0, v0, v1
+; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_vector_reduce_add_v4i8:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX11-NEXT: v_and_or_b32 v4, 0xff, v0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_nc_u16 v1, v1, v3
+; GFX11-NEXT: v_add_nc_u16 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v1, 0xff, v0, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u16 v0, v0, v1
+; GFX11-NEXT: v_bfe_u32 v0, v0, 0, 8
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_vector_reduce_add_v4i8:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v4, 0xff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX12-NEXT: v_and_or_b32 v4, 0xff, v0, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_or3_b32 v2, v4, v2, v3
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_nc_u16 v1, v1, v3
+; GFX12-NEXT: v_add_nc_u16 v0, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX12-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v1, 0xff, v0, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX12-NEXT: s_delay_alu instid0(VA...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/131413
More information about the llvm-commits
mailing list