[llvm] [AMDGPU] [DO NOT MERGE] Nonsuccessful Attempt At Using SelectionDAG Hooks for abs i8/i16 (PR #167064)
Patrick Simmons via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 7 17:01:50 PST 2025
https://github.com/linuxrocks123 updated https://github.com/llvm/llvm-project/pull/167064
>From eb489a6519dc71f316c4ceb40f020dc6b4ee9002 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Mon, 27 Oct 2025 17:39:50 -0500
Subject: [PATCH 01/19] This doesn't work.
---
llvm/lib/Target/AMDGPU/SOPInstructions.td | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..731dfece8f3fe 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1895,6 +1895,11 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;
+def : GCNPat <
+ (i32 (UniformUnaryFrag<anyext> (i16 (UniformBinFrag<smax> i16:$src, (i16 (UniformBinFrag<sub> 0, i16:$src)))))),
+ (S_ABS_I32 (i32 (S_SEXT_I32_I16 $src)))
+>;
+
def : GCNPat <
(v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
(S_AND_B64 SReg_64:$x, SReg_64:$y)
>From 3fbeb0d3a01973a3396e240f4f62e4e140a8faf3 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Mon, 27 Oct 2025 21:26:37 -0500
Subject: [PATCH 02/19] Finally something that works
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 ++--
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 16 ++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 ++
llvm/lib/Target/AMDGPU/SOPInstructions.td | 5 -----
6 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2550c2bee5f71..cbd68f64059f0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5576,8 +5576,8 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// \param N Node to expand
/// \param IsNegative indicate negated abs
/// \returns The expansion result or SDValue() if it fails.
- SDValue expandABS(SDNode *N, SelectionDAG &DAG,
- bool IsNegative = false) const;
+ virtual SDValue expandABS(SDNode *N, SelectionDAG &DAG,
+ bool IsNegative = false) const;
/// Expand ABDS/ABDU nodes. Expands vector/scalar ABDS/ABDU nodes.
/// \param N Node to expand
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b8b419d93021a..b963b8f83070b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -23,6 +23,7 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 4fa0d3f72e1c7..6d8697834e536 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -282,6 +282,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectADD_SUB_I64(SDNode *N);
void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
+ bool SelectABS(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
void SelectMAD_64_32(SDNode *N);
void SelectMUL_LOHI(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f5081a9d2dd56..ef8b7e0f7d323 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5287,6 +5287,22 @@ SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
}
+SDValue AMDGPUTargetLowering::expandABS(SDNode *N, SelectionDAG &CurDAG,
+ bool IsNegative) const {
+ assert(N->getOpcode() == ISD::ABS &&
+ "Tried to select abs with non-abs opcode.");
+
+ if (N->getValueSizeInBits(0) != 16 || IsNegative)
+ return TargetLowering::expandABS(N, CurDAG, IsNegative);
+
+ SDValue Src = N->getOperand(0);
+ SDLoc DL(Src);
+
+ SDValue SExtSrc = CurDAG.getSExtOrTrunc(Src, DL, MVT::i32);
+ SDValue ExtAbs = CurDAG.getNode(ISD::ABS, DL, MVT::i32, SExtSrc);
+ return CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i16, ExtAbs);
+}
+
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bdaf48652d107..06327051667fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -135,6 +135,8 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ virtual SDValue expandABS(SDNode *N, SelectionDAG &CurDAG,
+ bool IsNegative) const override;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 731dfece8f3fe..1931e0be15152 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1895,11 +1895,6 @@ def : GCNPat <
(S_MOV_B32 imm:$imm)
>;
-def : GCNPat <
- (i32 (UniformUnaryFrag<anyext> (i16 (UniformBinFrag<smax> i16:$src, (i16 (UniformBinFrag<sub> 0, i16:$src)))))),
- (S_ABS_I32 (i32 (S_SEXT_I32_I16 $src)))
->;
-
def : GCNPat <
(v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
(S_AND_B64 SReg_64:$x, SReg_64:$y)
>From 6f630bd91f0e9019707cc404248957434efe5af2 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 28 Oct 2025 17:25:42 -0500
Subject: [PATCH 03/19] This doesn't work.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +++++++--
llvm/test/CodeGen/AMDGPU/s_abs_i16.ll | 22 +++++++++++++++++++
2 files changed, 30 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ef8b7e0f7d323..a31dbf1d4a894 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPUMemoryUtils.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -5292,7 +5293,7 @@ SDValue AMDGPUTargetLowering::expandABS(SDNode *N, SelectionDAG &CurDAG,
assert(N->getOpcode() == ISD::ABS &&
"Tried to select abs with non-abs opcode.");
- if (N->getValueSizeInBits(0) != 16 || IsNegative)
+ if (N->getValueSizeInBits(0) != 16 || getRegClassFor(N->getSimpleValueType(0)) != &AMDGPU::SReg_32RegClass)
return TargetLowering::expandABS(N, CurDAG, IsNegative);
SDValue Src = N->getOperand(0);
@@ -5300,7 +5301,12 @@ SDValue AMDGPUTargetLowering::expandABS(SDNode *N, SelectionDAG &CurDAG,
SDValue SExtSrc = CurDAG.getSExtOrTrunc(Src, DL, MVT::i32);
SDValue ExtAbs = CurDAG.getNode(ISD::ABS, DL, MVT::i32, SExtSrc);
- return CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i16, ExtAbs);
+ SDValue TruncResult = CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i16, ExtAbs);
+
+ if (!IsNegative)
+ return TruncResult;
+ return CurDAG.getNode(ISD::SUB, DL, MVT::i16,
+ CurDAG.getConstant(0, DL, MVT::i16), TruncResult);
}
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll b/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
new file mode 100644
index 0000000000000..e61abb7173d78
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck %s
+
+define amdgpu_ps i16 @abs_i16(i16 inreg %arg) {
+; CHECK-LABEL: abs_i16:
+; CHECK: %bb.0:
+; CHECK-NEXT: s_sext_i32_i16 s0, s0
+; CHECK-NEXT: s_abs_i32 s0, s0
+
+ %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ ret i16 %res
+}
+
+define amdgpu_ps i16 @abs_i16_neg(i16 inreg %arg) {
+; CHECK-LABEL: abs_i16_neg:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_sext_i32_i16 s0, s0
+; CHECK-NEXT: s_abs_i32 s0, s0
+; CHECK-NEXT: s_sub_i32 s0, 0, s0
+ %res1 = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ %res2 = sub i16 0, %res1
+ ret i16 %res2
+}
\ No newline at end of file
>From 1136040bd5d29b4b7a6ab522f37e1363881bea29 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 29 Oct 2025 12:55:07 -0500
Subject: [PATCH 04/19] Revert to master
---
llvm/include/llvm/CodeGen/TargetLowering.h | 4 ++--
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 1 -
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 -
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 22 -------------------
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 --
5 files changed, 2 insertions(+), 28 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cbd68f64059f0..2550c2bee5f71 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5576,8 +5576,8 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// \param N Node to expand
/// \param IsNegative indicate negated abs
/// \returns The expansion result or SDValue() if it fails.
- virtual SDValue expandABS(SDNode *N, SelectionDAG &DAG,
- bool IsNegative = false) const;
+ SDValue expandABS(SDNode *N, SelectionDAG &DAG,
+ bool IsNegative = false) const;
/// Expand ABDS/ABDU nodes. Expands vector/scalar ABDS/ABDU nodes.
/// \param N Node to expand
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b963b8f83070b..b8b419d93021a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -23,7 +23,6 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 6d8697834e536..4fa0d3f72e1c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -282,7 +282,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectADD_SUB_I64(SDNode *N);
void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
- bool SelectABS(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
void SelectMAD_64_32(SDNode *N);
void SelectMUL_LOHI(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a31dbf1d4a894..f5081a9d2dd56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -18,7 +18,6 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPUMemoryUtils.h"
#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -5288,27 +5287,6 @@ SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
}
-SDValue AMDGPUTargetLowering::expandABS(SDNode *N, SelectionDAG &CurDAG,
- bool IsNegative) const {
- assert(N->getOpcode() == ISD::ABS &&
- "Tried to select abs with non-abs opcode.");
-
- if (N->getValueSizeInBits(0) != 16 || getRegClassFor(N->getSimpleValueType(0)) != &AMDGPU::SReg_32RegClass)
- return TargetLowering::expandABS(N, CurDAG, IsNegative);
-
- SDValue Src = N->getOperand(0);
- SDLoc DL(Src);
-
- SDValue SExtSrc = CurDAG.getSExtOrTrunc(Src, DL, MVT::i32);
- SDValue ExtAbs = CurDAG.getNode(ISD::ABS, DL, MVT::i32, SExtSrc);
- SDValue TruncResult = CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i16, ExtAbs);
-
- if (!IsNegative)
- return TruncResult;
- return CurDAG.getNode(ISD::SUB, DL, MVT::i16,
- CurDAG.getConstant(0, DL, MVT::i16), TruncResult);
-}
-
SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 06327051667fe..bdaf48652d107 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -135,8 +135,6 @@ class AMDGPUTargetLowering : public TargetLowering {
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- virtual SDValue expandABS(SDNode *N, SelectionDAG &CurDAG,
- bool IsNegative) const override;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
>From 02e54ce6750588587a08cdfc7727723a808224ef Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 29 Oct 2025 16:18:19 -0500
Subject: [PATCH 05/19] Machine-Level Implementation
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 11 +++++++++++
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 +++
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
3 files changed, 15 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 67042b700c047..f7e46430d658f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -39,6 +39,7 @@ FunctionPass *createSIAnnotateControlFlowLegacyPass();
FunctionPass *createSIFoldOperandsLegacyPass();
FunctionPass *createSIPeepholeSDWALegacyPass();
FunctionPass *createSILowerI1CopiesLegacyPass();
+FunctionPass *createSISAbs16FixupLegacyPass();
FunctionPass *createSIShrinkInstructionsLegacyPass();
FunctionPass *createSILoadStoreOptimizerLegacyPass();
FunctionPass *createSIWholeQuadModeLegacyPass();
@@ -93,6 +94,13 @@ class SILowerI1CopiesPass : public PassInfoMixin<SILowerI1CopiesPass> {
MachineFunctionAnalysisManager &MFAM);
};
+class SISAbs16FixupPass : public PassInfoMixin<SISAbs16FixupPass> {
+public:
+ SISAbs16FixupPass() = default;
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &);
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
@@ -197,6 +205,9 @@ extern char &SILowerWWMCopiesLegacyID;
void initializeSILowerI1CopiesLegacyPass(PassRegistry &);
extern char &SILowerI1CopiesLegacyID;
+void initializeSISAbs16FixupLegacyPass(PassRegistry &);
+extern char &SISAbs16FixupLegacyID;
+
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
extern char &AMDGPUGlobalISelDivergenceLoweringID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index b87b54ffc4f12..3065658f4d8f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -551,6 +551,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
initializeGCNDPPCombineLegacyPass(*PR);
initializeSILowerI1CopiesLegacyPass(*PR);
+ initializeSISAbs16FixupLegacyPass(*PR);
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
initializeAMDGPURegBankSelectPass(*PR);
initializeAMDGPURegBankLegalizePass(*PR);
@@ -1521,6 +1522,7 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesLegacyID);
addPass(createSILowerI1CopiesLegacyPass());
+ addPass(createSISAbs16FixupLegacyPass());
return false;
}
@@ -2215,6 +2217,7 @@ Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
addPass(AMDGPUISelDAGToDAGPass(TM));
addPass(SIFixSGPRCopiesPass());
addPass(SILowerI1CopiesPass());
+ addPass(SISAbs16FixupPass());
return Error::success();
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index a1e0e5293c706..cd9225acdb002 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -185,6 +185,7 @@ add_llvm_target(AMDGPUCodeGen
SIPreEmitPeephole.cpp
SIProgramInfo.cpp
SIRegisterInfo.cpp
+ SISAbs16Fixup.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
>From eafd0ce0c0d72d7a263ea7be17849c7728768eb8 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 30 Oct 2025 02:44:22 -0500
Subject: [PATCH 06/19] Add new file
---
llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp | 168 +++++++++++++++++++++++
1 file changed, 168 insertions(+)
create mode 100644 llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp
diff --git a/llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp b/llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp
new file mode 100644
index 0000000000000..fd305b6ffc061
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp
@@ -0,0 +1,168 @@
+//===-- SISAbs16Fixup.cpp - Lower I1 Copies -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass matches the pattern for 16-bit ABS instructions after they have
+// been lowered to for execution on the Scalar Unit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "si-abs16-pattern"
+
+using namespace llvm;
+
+static Register pierceCopies(Register R, MachineRegisterInfo& MRI) {
+ MachineInstr *CopyMI = MRI.getVRegDef(R);
+ while (CopyMI && CopyMI->getOpcode() == AMDGPU::COPY) {
+ Register T = CopyMI->getOperand(1).getReg();
+ if (!T.isVirtual())
+ break;
+
+ R = T;
+ CopyMI = MRI.getVRegDef(R);
+ }
+
+ return R;
+}
+
+static MachineInstr *matchExpandAbsPattern(MachineInstr &MI,
+ MachineRegisterInfo &MRI) {
+ std::array<MachineInstr *, 2> SextInstructions;
+ for (unsigned I = 0; I < SextInstructions.size(); I++)
+ {
+ SextInstructions[I] = MRI.getVRegDef(MI.getOperand(I + 1).getReg());
+ if (SextInstructions[I]->getOpcode() != AMDGPU::S_SEXT_I32_I16)
+ return nullptr;
+ }
+
+ Register AbsSource;
+ MachineInstr* SubIns = nullptr;
+ for (MachineInstr *SextMI : SextInstructions) {
+ Register SextReg = SextMI->getOperand(1).getReg();
+ MachineInstr* OperandMI = MRI.getVRegDef(SextReg);
+ if (OperandMI->getOpcode() == AMDGPU::S_SUB_I32)
+ if(!SubIns)
+ SubIns = OperandMI;
+ else
+ return nullptr;
+ else
+ AbsSource = pierceCopies(SextReg,MRI);
+ }
+
+ if (!SubIns)
+ return nullptr;
+
+ if (MRI.getRegClass(AbsSource) != &AMDGPU::SGPR_32RegClass)
+ return nullptr;
+
+ MachineInstr &MustBeZero =
+ *MRI.getVRegDef(pierceCopies(SubIns->getOperand(1).getReg(), MRI));
+ if (MustBeZero.getOpcode() != AMDGPU::S_MOV_B32 ||
+ MustBeZero.getOperand(1).getImm())
+ return nullptr;
+
+ if (pierceCopies(SubIns->getOperand(2).getReg(), MRI) != AbsSource)
+ return nullptr;
+
+ return MRI.getVRegDef(AbsSource);
+}
+
+static bool runSAbs16Fixup(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF)
+ for (MachineInstr &MI : make_early_inc_range(MBB)) {
+ bool IsPositive = MI.getOpcode() == AMDGPU::S_MAX_I32;
+ bool IsNegative = MI.getOpcode() == AMDGPU::S_MIN_I32;
+ MachineInstr* AbsSourceMI;
+ if ((!IsPositive && !IsNegative) ||
+ !(AbsSourceMI = matchExpandAbsPattern(MI, MRI)))
+ continue;
+
+ Register SextDestReg =
+ MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register AbsDestReg =
+ IsNegative ? MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass)
+ : MI.getOperand(0).getReg();
+
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_SEXT_I32_I16),
+ SextDestReg)
+ .addReg(AbsSourceMI->getOperand(0).getReg());
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_ABS_I32), AbsDestReg)
+ .addReg(SextDestReg);
+
+ if(IsNegative)
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_SUB_I32),
+ MI.getOperand(0).getReg())
+ .addImm(0)
+ .addReg(AbsDestReg);
+
+ MI.eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses SISAbs16FixupPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ bool Changed = runSAbs16Fixup(MF);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // TODO: Probably preserves most.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+class SISAbs16FixupLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SISAbs16FixupLegacy() : MachineFunctionPass(ID) {
+ initializeSISAbs16FixupLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI SAbs16 Fixup"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+bool SISAbs16FixupLegacy::runOnMachineFunction(MachineFunction &MF) {
+ return runSAbs16Fixup(MF);
+}
+
+INITIALIZE_PASS_BEGIN(SISAbs16FixupLegacy, DEBUG_TYPE, "SI SAbs16 Fixup",
+ false, false)
+INITIALIZE_PASS_END(SISAbs16FixupLegacy, DEBUG_TYPE, "SI SAbs16 Fixup",
+ false, false)
+
+char SISAbs16FixupLegacy::ID = 0;
+
+char &llvm::SISAbs16FixupLegacyID = SISAbs16FixupLegacy::ID;
+
+FunctionPass *llvm::createSISAbs16FixupLegacyPass() {
+ return new SISAbs16FixupLegacy();
+}
>From d74b1f25de3ef525f1f25550746aac4b73ace4a9 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Fri, 31 Oct 2025 14:29:42 -0500
Subject: [PATCH 07/19] Run update_llc_test_checks.py
---
llvm/test/CodeGen/AMDGPU/s_abs_i16.ll | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll b/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
index e61abb7173d78..0cdbedd837396 100644
--- a/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
@@ -1,10 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck %s
define amdgpu_ps i16 @abs_i16(i16 inreg %arg) {
; CHECK-LABEL: abs_i16:
-; CHECK: %bb.0:
-; CHECK-NEXT: s_sext_i32_i16 s0, s0
-; CHECK-NEXT: s_abs_i32 s0, s0
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_sext_i32_i16 s0, s0
+; CHECK-NEXT: s_abs_i32 s0, s0
+; CHECK-NEXT: ; return to shader part epilog
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
@@ -12,11 +15,12 @@ define amdgpu_ps i16 @abs_i16(i16 inreg %arg) {
define amdgpu_ps i16 @abs_i16_neg(i16 inreg %arg) {
; CHECK-LABEL: abs_i16_neg:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sext_i32_i16 s0, s0
-; CHECK-NEXT: s_abs_i32 s0, s0
-; CHECK-NEXT: s_sub_i32 s0, 0, s0
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_sext_i32_i16 s0, s0
+; CHECK-NEXT: s_abs_i32 s0, s0
+; CHECK-NEXT: s_sub_i32 s0, 0, s0
+; CHECK-NEXT: ; return to shader part epilog
%res1 = call i16 @llvm.abs.i16(i16 %arg, i1 false)
%res2 = sub i16 0, %res1
ret i16 %res2
-}
\ No newline at end of file
+}
>From f02cfeba672aad4d4a88e736d4fe862843411e3c Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Mon, 3 Nov 2025 16:05:57 -0500
Subject: [PATCH 08/19] Attempt #4, with DAG again
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 11 --
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 -
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 26 +++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp | 168 ------------------
6 files changed, 27 insertions(+), 183 deletions(-)
delete mode 100644 llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index f7e46430d658f..67042b700c047 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -39,7 +39,6 @@ FunctionPass *createSIAnnotateControlFlowLegacyPass();
FunctionPass *createSIFoldOperandsLegacyPass();
FunctionPass *createSIPeepholeSDWALegacyPass();
FunctionPass *createSILowerI1CopiesLegacyPass();
-FunctionPass *createSISAbs16FixupLegacyPass();
FunctionPass *createSIShrinkInstructionsLegacyPass();
FunctionPass *createSILoadStoreOptimizerLegacyPass();
FunctionPass *createSIWholeQuadModeLegacyPass();
@@ -94,13 +93,6 @@ class SILowerI1CopiesPass : public PassInfoMixin<SILowerI1CopiesPass> {
MachineFunctionAnalysisManager &MFAM);
};
-class SISAbs16FixupPass : public PassInfoMixin<SISAbs16FixupPass> {
-public:
- SISAbs16FixupPass() = default;
- PreservedAnalyses run(MachineFunction &MF,
- MachineFunctionAnalysisManager &MFAM);
-};
-
void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &);
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
@@ -205,9 +197,6 @@ extern char &SILowerWWMCopiesLegacyID;
void initializeSILowerI1CopiesLegacyPass(PassRegistry &);
extern char &SILowerI1CopiesLegacyID;
-void initializeSISAbs16FixupLegacyPass(PassRegistry &);
-extern char &SISAbs16FixupLegacyID;
-
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
extern char &AMDGPUGlobalISelDivergenceLoweringID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3065658f4d8f3..b87b54ffc4f12 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -551,7 +551,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR);
initializeGCNDPPCombineLegacyPass(*PR);
initializeSILowerI1CopiesLegacyPass(*PR);
- initializeSISAbs16FixupLegacyPass(*PR);
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
initializeAMDGPURegBankSelectPass(*PR);
initializeAMDGPURegBankLegalizePass(*PR);
@@ -1522,7 +1521,6 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesLegacyID);
addPass(createSILowerI1CopiesLegacyPass());
- addPass(createSISAbs16FixupLegacyPass());
return false;
}
@@ -2217,7 +2215,6 @@ Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
addPass(AMDGPUISelDAGToDAGPass(TM));
addPass(SIFixSGPRCopiesPass());
addPass(SILowerI1CopiesPass());
- addPass(SISAbs16FixupPass());
return Error::success();
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index cd9225acdb002..a1e0e5293c706 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -185,7 +185,6 @@ add_llvm_target(AMDGPUCodeGen
SIPreEmitPeephole.cpp
SIProgramInfo.cpp
SIRegisterInfo.cpp
- SISAbs16Fixup.cpp
SIShrinkInstructions.cpp
SIWholeQuadMode.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8bb28084159e8..1d3eff2cdc1aa 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -177,6 +177,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
+
+ // We don't want the default expansion of 16-bit ABS since we can
+ // sign-extend and use the 32-bit ABS operation for 16-bit ABS with SGPRs
+ setOperationAction(ISD::ABS, MVT::i16, Custom);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -6774,6 +6778,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);
case ISD::ABS:
+ if (Op.getValueType() == MVT::i16)
+ return lowerABSi16(Op, DAG);
+ // fall through
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
@@ -8139,6 +8146,25 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
+// sign-extend and use the 32-bit ABS operation for 16-bit ABS with SGPRs
+SDValue SITargetLowering::lowerABSi16(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::ABS &&
+ "Tried to select abs with non-abs opcode.");
+ assert(Op.getValueType() == MVT::i16 &&
+ "Tried to select abs i16 lowering with non-i16 type.");
+
+ // Divergent values do not end up in SGPRs, so skip the custom lowering.
+ if (Op->isDivergent())
+ return SDValue();
+
+ // (abs i16 (i16 op1)) -> (trunc i16 (abs i32 (sext i32 (i16 op1))))
+ SDValue Src = Op.getOperand(0);
+ SDLoc DL(Src);
+ SDValue SExtSrc = DAG.getSExtOrTrunc(Src, DL, MVT::i32);
+ SDValue ExtAbs = DAG.getNode(ISD::ABS, DL, MVT::i32, SExtSrc);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, ExtAbs);
+}
+
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
if (Subtarget->hasApertureRegs()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 74e58f4272e10..25e94851c24df 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -184,6 +184,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerTrapHsaQueuePtr(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerABSi16(SDValue Op, SelectionDAG &DAG) const;
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp b/llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp
deleted file mode 100644
index fd305b6ffc061..0000000000000
--- a/llvm/lib/Target/AMDGPU/SISAbs16Fixup.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-//===-- SISAbs16Fixup.cpp - Lower I1 Copies -----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass matches the pattern for 16-bit ABS instructions after they have
-// been lowered to for execution on the Scalar Unit.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineSSAUpdater.h"
-#include "llvm/InitializePasses.h"
-
-#define DEBUG_TYPE "si-abs16-pattern"
-
-using namespace llvm;
-
-static Register pierceCopies(Register R, MachineRegisterInfo& MRI) {
- MachineInstr *CopyMI = MRI.getVRegDef(R);
- while (CopyMI && CopyMI->getOpcode() == AMDGPU::COPY) {
- Register T = CopyMI->getOperand(1).getReg();
- if (!T.isVirtual())
- break;
-
- R = T;
- CopyMI = MRI.getVRegDef(R);
- }
-
- return R;
-}
-
-static MachineInstr *matchExpandAbsPattern(MachineInstr &MI,
- MachineRegisterInfo &MRI) {
- std::array<MachineInstr *, 2> SextInstructions;
- for (unsigned I = 0; I < SextInstructions.size(); I++)
- {
- SextInstructions[I] = MRI.getVRegDef(MI.getOperand(I + 1).getReg());
- if (SextInstructions[I]->getOpcode() != AMDGPU::S_SEXT_I32_I16)
- return nullptr;
- }
-
- Register AbsSource;
- MachineInstr* SubIns = nullptr;
- for (MachineInstr *SextMI : SextInstructions) {
- Register SextReg = SextMI->getOperand(1).getReg();
- MachineInstr* OperandMI = MRI.getVRegDef(SextReg);
- if (OperandMI->getOpcode() == AMDGPU::S_SUB_I32)
- if(!SubIns)
- SubIns = OperandMI;
- else
- return nullptr;
- else
- AbsSource = pierceCopies(SextReg,MRI);
- }
-
- if (!SubIns)
- return nullptr;
-
- if (MRI.getRegClass(AbsSource) != &AMDGPU::SGPR_32RegClass)
- return nullptr;
-
- MachineInstr &MustBeZero =
- *MRI.getVRegDef(pierceCopies(SubIns->getOperand(1).getReg(), MRI));
- if (MustBeZero.getOpcode() != AMDGPU::S_MOV_B32 ||
- MustBeZero.getOperand(1).getImm())
- return nullptr;
-
- if (pierceCopies(SubIns->getOperand(2).getReg(), MRI) != AbsSource)
- return nullptr;
-
- return MRI.getVRegDef(AbsSource);
-}
-
-static bool runSAbs16Fixup(MachineFunction &MF) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
-
- bool Changed = false;
-
- for (MachineBasicBlock &MBB : MF)
- for (MachineInstr &MI : make_early_inc_range(MBB)) {
- bool IsPositive = MI.getOpcode() == AMDGPU::S_MAX_I32;
- bool IsNegative = MI.getOpcode() == AMDGPU::S_MIN_I32;
- MachineInstr* AbsSourceMI;
- if ((!IsPositive && !IsNegative) ||
- !(AbsSourceMI = matchExpandAbsPattern(MI, MRI)))
- continue;
-
- Register SextDestReg =
- MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register AbsDestReg =
- IsNegative ? MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass)
- : MI.getOperand(0).getReg();
-
- BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_SEXT_I32_I16),
- SextDestReg)
- .addReg(AbsSourceMI->getOperand(0).getReg());
- BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_ABS_I32), AbsDestReg)
- .addReg(SextDestReg);
-
- if(IsNegative)
- BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_SUB_I32),
- MI.getOperand(0).getReg())
- .addImm(0)
- .addReg(AbsDestReg);
-
- MI.eraseFromParent();
- Changed = true;
- }
-
- return Changed;
-}
-
-PreservedAnalyses SISAbs16FixupPass::run(MachineFunction &MF,
- MachineFunctionAnalysisManager &MFAM) {
- bool Changed = runSAbs16Fixup(MF);
- if (!Changed)
- return PreservedAnalyses::all();
-
- // TODO: Probably preserves most.
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-class SISAbs16FixupLegacy : public MachineFunctionPass {
-public:
- static char ID;
-
- SISAbs16FixupLegacy() : MachineFunctionPass(ID) {
- initializeSISAbs16FixupLegacyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- StringRef getPassName() const override { return "SI SAbs16 Fixup"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-bool SISAbs16FixupLegacy::runOnMachineFunction(MachineFunction &MF) {
- return runSAbs16Fixup(MF);
-}
-
-INITIALIZE_PASS_BEGIN(SISAbs16FixupLegacy, DEBUG_TYPE, "SI SAbs16 Fixup",
- false, false)
-INITIALIZE_PASS_END(SISAbs16FixupLegacy, DEBUG_TYPE, "SI SAbs16 Fixup",
- false, false)
-
-char SISAbs16FixupLegacy::ID = 0;
-
-char &llvm::SISAbs16FixupLegacyID = SISAbs16FixupLegacy::ID;
-
-FunctionPass *llvm::createSISAbs16FixupLegacyPass() {
- return new SISAbs16FixupLegacy();
-}
>From f92fac1da3e70b4ae776582f2ece7243d292758e Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Mon, 3 Nov 2025 17:36:51 -0500
Subject: [PATCH 09/19] For real?
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1d3eff2cdc1aa..5f9a8d6f9c59f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Transforms/Utils/LowerAtomic.h"
@@ -6780,7 +6781,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ABS:
if (Op.getValueType() == MVT::i16)
return lowerABSi16(Op, DAG);
- // fall through
+ LLVM_FALLTHROUGH;
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
>From 43b7fa2435af3c13fd3fb340b439c22220b002b9 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 4 Nov 2025 18:06:05 -0500
Subject: [PATCH 10/19] Add testcase
---
llvm/test/CodeGen/AMDGPU/llvm.abs.ll | 692 +++++++++++++++++++++++++++
1 file changed, 692 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.abs.ll
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
new file mode 100644
index 0000000000000..8b8448e260d73
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
@@ -0,0 +1,692 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=SDAG6
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=SDAG8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=SDAG10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=SDAG1250
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250
+
+declare i16 @llvm.abs.i16(i16, i1)
+declare i32 @llvm.abs.i32(i32, i1)
+declare i64 @llvm.abs.i64(i64, i1)
+declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
+declare <3 x i8> @llvm.abs.v3i8(<3 x i8>, i1)
+declare <2 x i16> @llvm.abs.v2i16(<2 x i16>, i1)
+declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
+declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
+
+define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_i16:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i16 s0, s0
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_i16:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_i16:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_i16:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: ; return to shader part epilog
+
+; GFX6-LABEL: abs_sgpr_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i16 s0, s0
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i16 s0, s0
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: ; return to shader part epilog
+ %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ ret i16 %res
+}
+
+define amdgpu_ps i16 @abs_sgpr_i16_neg(i16 inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_i16_neg:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i16 s0, s0
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: s_sub_i32 s0, 0, s0
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_i16_neg:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: s_sub_i32 s0, 0, s0
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_i16_neg:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: s_sub_i32 s0, 0, s0
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_i16_neg:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: s_sub_co_i32 s0, 0, s0
+; SDAG1250-NEXT: ; return to shader part epilog
+;
+; GFX6-LABEL: abs_sgpr_i16_neg:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i16 s0, s0
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_sub_i32 s0, 0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i16_neg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_sub_i32 s0, 0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i16_neg:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i16 s0, s0
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: s_sub_i32 s0, 0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i16_neg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_sub_co_i32 s0, 0, s0
+; GFX1250-NEXT: ; return to shader part epilog
+ %res1 = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ %res2 = sub i16 0, %res1
+ ret i16 %res2
+}
+
+define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
+; GFX-LABEL: abs_sgpr_i32:
+; GFX: ; %bb.0:
+; GFX-NEXT: s_abs_i32 s0, s0
+; GFX-NEXT: ; return to shader part epilog
+ %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
+ ret i32 %res
+}
+
+define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
+; GFX6-LABEL: abs_sgpr_i64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_ashr_i32 s2, s1, 31
+; GFX6-NEXT: s_add_u32 s0, s0, s2
+; GFX6-NEXT: s_mov_b32 s3, s2
+; GFX6-NEXT: s_addc_u32 s1, s1, s2
+; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_ashr_i32 s2, s1, 31
+; GFX8-NEXT: s_add_u32 s0, s0, s2
+; GFX8-NEXT: s_mov_b32 s3, s2
+; GFX8-NEXT: s_addc_u32 s1, s1, s2
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_ashr_i32 s2, s1, 31
+; GFX10-NEXT: s_add_u32 s0, s0, s2
+; GFX10-NEXT: s_mov_b32 s3, s2
+; GFX10-NEXT: s_addc_u32 s1, s1, s2
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_ashr_i32 s2, s1, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_mov_b32 s3, s2
+; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT: ; return to shader part epilog
+ %res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
+ ret i64 %res
+}
+
+define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
+; GFX-LABEL: abs_sgpr_v4i32:
+; GFX: ; %bb.0:
+; GFX-NEXT: s_abs_i32 s0, s0
+; GFX-NEXT: s_abs_i32 s1, s1
+; GFX-NEXT: s_abs_i32 s2, s2
+; GFX-NEXT: s_abs_i32 s3, s3
+; GFX-NEXT: ; return to shader part epilog
+ %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
+ ret <4 x i32> %res
+}
+
+define i16 @abs_vgpr_i16(i16 %arg) {
+; GFX6-LABEL: abs_vgpr_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_vgpr_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_vgpr_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX10-NEXT: v_max_i16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: abs_vgpr_i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_max_i16 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ ret i16 %res
+}
+
+define i32 @abs_vgpr_i32(i32 %arg) {
+; GFX6-LABEL: abs_vgpr_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_vgpr_i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
+; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_vgpr_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: abs_vgpr_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
+ ret i32 %res
+}
+
+define i64 @abs_vgpr_i64(i64 %arg) {
+; GFX6-LABEL: abs_vgpr_i64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_vgpr_i64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_vgpr_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: abs_vgpr_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
+ ret i64 %res
+}
+
+define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+; GFX6-LABEL: abs_vgpr_v4i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_vgpr_v4i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
+; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
+; GFX8-NEXT: v_max_i32_e32 v1, v1, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
+; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
+; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_vgpr_v4i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2
+; GFX10-NEXT: v_sub_nc_u32_e32 v7, 0, v3
+; GFX10-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX10-NEXT: v_max_i32_e32 v1, v1, v5
+; GFX10-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX10-NEXT: v_max_i32_e32 v3, v3, v7
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: abs_vgpr_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
+; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_max_i32_e32 v0, v0, v4
+; GFX1250-NEXT: v_max_i32_e32 v1, v1, v5
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
+ ret <4 x i32> %res
+}
+
+define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
+; GFX-LABEL: abs_sgpr_v2i8:
+; GFX: ; %bb.0:
+; GFX-NEXT: s_sext_i32_i8 s0, s0
+; GFX-NEXT: s_sext_i32_i8 s1, s1
+; GFX-NEXT: s_abs_i32 s0, s0
+; GFX-NEXT: s_abs_i32 s1, s1
+; GFX-NEXT: ; return to shader part epilog
+ %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
+ ret <2 x i8> %res
+}
+
+define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
+; GFX6-LABEL: abs_vgpr_v2i8:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_vgpr_v2i8:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_vgpr_v2i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0
+; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
+; GFX10-NEXT: v_max_i16 v0, v0, v2
+; GFX10-NEXT: v_max_i16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: abs_vgpr_v2i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_sub_nc_u16 v2, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_max_i16 v0, v0, v2
+; GFX1250-NEXT: v_max_i16 v1, v1, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
+ ret <2 x i8> %res
+}
+
+define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
+; GFX-LABEL: abs_sgpr_v3i8:
+; GFX: ; %bb.0:
+; GFX-NEXT: s_sext_i32_i8 s0, s0
+; GFX-NEXT: s_sext_i32_i8 s1, s1
+; GFX-NEXT: s_sext_i32_i8 s2, s2
+; GFX-NEXT: s_abs_i32 s0, s0
+; GFX-NEXT: s_abs_i32 s1, s1
+; GFX-NEXT: s_abs_i32 s2, s2
+; GFX-NEXT: ; return to shader part epilog
+ %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
+ ret <3 x i8> %res
+}
+
+define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
+; GFX6-LABEL: abs_vgpr_v3i8:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_vgpr_v3i8:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_vgpr_v3i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX10-NEXT: v_sub_nc_u16 v3, 0, v0
+; GFX10-NEXT: v_sub_nc_u16 v4, 0, v1
+; GFX10-NEXT: v_sub_nc_u16 v5, 0, v2
+; GFX10-NEXT: v_max_i16 v0, v0, v3
+; GFX10-NEXT: v_max_i16 v1, v1, v4
+; GFX10-NEXT: v_max_i16 v2, v2, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: abs_vgpr_v3i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v4, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_sub_nc_u16 v5, 0, v2
+; GFX1250-NEXT: v_max_i16 v0, v0, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_max_i16 v1, v1, v4
+; GFX1250-NEXT: v_max_i16 v2, v2, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
+ ret <3 x i8> %res
+}
+
+define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
+; GFX6-LABEL: abs_sgpr_v2i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i16 s0, s0
+; GFX6-NEXT: s_sext_i32_i16 s1, s1
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_abs_i32 s1, s1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_v2i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s1, s0, 16
+; GFX8-NEXT: s_sext_i32_i16 s1, s1
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_abs_i32 s1, s1
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i16 s1, s0
+; GFX10-NEXT: s_ashr_i32 s0, s0, 16
+; GFX10-NEXT: s_abs_i32 s1, s1
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s1, s0
+; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX1250-NEXT: ; return to shader part epilog
+ %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
+ ret <2 x i16> %res
+}
+
+define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
+; GFX6-LABEL: abs_vgpr_v2i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_vgpr_v2i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, 0
+; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
+; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_max_i16_e32 v1, v0, v1
+; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_vgpr_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: abs_vgpr_v2i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
+ ret <2 x i16> %res
+}
+
+define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
+; GFX6-LABEL: abs_sgpr_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i16 s0, s0
+; GFX6-NEXT: s_sext_i32_i16 s1, s1
+; GFX6-NEXT: s_sext_i32_i16 s2, s2
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_abs_i32 s1, s1
+; GFX6-NEXT: s_abs_i32 s2, s2
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_v3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_abs_i32 s2, s2
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_sext_i32_i16 s1, s1
+; GFX8-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NEXT: s_abs_i32 s1, s1
+; GFX8-NEXT: s_or_b32 s0, s0, s2
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_v3i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i16 s2, s0
+; GFX10-NEXT: s_ashr_i32 s0, s0, 16
+; GFX10-NEXT: s_abs_i32 s2, s2
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: s_sext_i32_i16 s1, s1
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX10-NEXT: s_abs_i32 s1, s1
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v3i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s2, s0
+; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
+; GFX1250-NEXT: s_abs_i32 s2, s2
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_sext_i32_i16 s1, s1
+; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s2, s0
+; GFX1250-NEXT: s_abs_i32 s1, s1
+; GFX1250-NEXT: ; return to shader part epilog
+ %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
+ ret <3 x i16> %res
+}
+
+define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
+; GFX6-LABEL: abs_vgpr_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_vgpr_v3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, 0
+; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0
+; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_sub_u16_e32 v4, 0, v1
+; GFX8-NEXT: v_max_i16_e32 v2, v0, v2
+; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_vgpr_v3i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX10-NEXT: v_max_i16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: abs_vgpr_v3i16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX1250-NEXT: v_max_i16 v1, v1, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
+ ret <3 x i16> %res
+}
>From 348729ad0633bb2976f363a712fb5c66781683ea Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 4 Nov 2025 18:11:33 -0500
Subject: [PATCH 11/19] Delete new testcase
---
llvm/test/CodeGen/AMDGPU/s_abs_i16.ll | 26 --------------------------
1 file changed, 26 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
diff --git a/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll b/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
deleted file mode 100644
index 0cdbedd837396..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/s_abs_i16.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck %s
-
-define amdgpu_ps i16 @abs_i16(i16 inreg %arg) {
-; CHECK-LABEL: abs_i16:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sext_i32_i16 s0, s0
-; CHECK-NEXT: s_abs_i32 s0, s0
-; CHECK-NEXT: ; return to shader part epilog
-
- %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
- ret i16 %res
-}
-
-define amdgpu_ps i16 @abs_i16_neg(i16 inreg %arg) {
-; CHECK-LABEL: abs_i16_neg:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sext_i32_i16 s0, s0
-; CHECK-NEXT: s_abs_i32 s0, s0
-; CHECK-NEXT: s_sub_i32 s0, 0, s0
-; CHECK-NEXT: ; return to shader part epilog
- %res1 = call i16 @llvm.abs.i16(i16 %arg, i1 false)
- %res2 = sub i16 0, %res1
- ret i16 %res2
-}
>From 2f2affd94e6bb3d21d7d7057a5d129e6baec575a Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 4 Nov 2025 18:25:20 -0500
Subject: [PATCH 12/19] Fix testcase
---
llvm/test/CodeGen/AMDGPU/llvm.abs.ll | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
index 8b8448e260d73..bac900cf0f3a9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
@@ -72,11 +72,13 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
}
define amdgpu_ps i16 @abs_sgpr_i16_neg(i16 inreg %arg) {
+; COM: Suboptimal code generation on Tahiti.
; SDAG6-LABEL: abs_sgpr_i16_neg:
; SDAG6: ; %bb.0:
-; SDAG6-NEXT: s_sext_i32_i16 s0, s0
-; SDAG6-NEXT: s_abs_i32 s0, s0
-; SDAG6-NEXT: s_sub_i32 s0, 0, s0
+; SDAG6-NEXT: s_sext_i32_i16 s1, s0
+; SDAG6-NEXT: s_ashr_i32 s1, s1, 15
+; SDAG6-NEXT: s_xor_b32 s0, s0, s1
+; SDAG6-NEXT: s_sub_i32 s0, s1, s0
; SDAG6-NEXT: ; return to shader part epilog
;
; SDAG8-LABEL: abs_sgpr_i16_neg:
>From d0a4fa4536afc9daf5d7c54ad4c540e6043035ca Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 5 Nov 2025 15:56:14 -0500
Subject: [PATCH 13/19] Update testcase
---
llvm/test/CodeGen/AMDGPU/absdiff.ll | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/absdiff.ll b/llvm/test/CodeGen/AMDGPU/absdiff.ll
index 9cb397fb9d1c6..ee8241e355e26 100644
--- a/llvm/test/CodeGen/AMDGPU/absdiff.ll
+++ b/llvm/test/CodeGen/AMDGPU/absdiff.ll
@@ -5,10 +5,8 @@ define amdgpu_ps i16 @absdiff_i16_false(i16 inreg %arg0, i16 inreg %arg1) {
; CHECK-LABEL: absdiff_i16_false:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_i32 s0, s0, s1
-; CHECK-NEXT: s_sext_i32_i16 s1, s0
-; CHECK-NEXT: s_sub_i32 s0, 0, s0
; CHECK-NEXT: s_sext_i32_i16 s0, s0
-; CHECK-NEXT: s_max_i32 s0, s1, s0
+; CHECK-NEXT: s_abs_i32 s0, s0
; CHECK-NEXT: ; return to shader part epilog
%diff = sub i16 %arg0, %arg1
%res = call i16 @llvm.abs.i16(i16 %diff, i1 false) ; INT_MIN input returns INT_MIN
@@ -19,10 +17,8 @@ define amdgpu_ps i16 @absdiff_i16_true(i16 inreg %arg0, i16 inreg %arg1) {
; CHECK-LABEL: absdiff_i16_true:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_i32 s0, s0, s1
-; CHECK-NEXT: s_sext_i32_i16 s1, s0
-; CHECK-NEXT: s_sub_i32 s0, 0, s0
; CHECK-NEXT: s_sext_i32_i16 s0, s0
-; CHECK-NEXT: s_max_i32 s0, s1, s0
+; CHECK-NEXT: s_abs_i32 s0, s0
; CHECK-NEXT: ; return to shader part epilog
%diff = sub i16 %arg0, %arg1
%res = call i16 @llvm.abs.i16(i16 %diff, i1 true) ; INT_MIN input returns poison
>From 0b6e6cf8a0731359610eee76b9a4882e707118b7 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Thu, 6 Nov 2025 13:10:56 -0500
Subject: [PATCH 14/19] Delete testcase
---
llvm/test/CodeGen/AMDGPU/llvm.abs.ll | 694 ---------------------------
1 file changed, 694 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.abs.ll
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
deleted file mode 100644
index bac900cf0f3a9..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
+++ /dev/null
@@ -1,694 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=SDAG6
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=SDAG8
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=SDAG10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=SDAG1250
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX1250
-
-declare i16 @llvm.abs.i16(i16, i1)
-declare i32 @llvm.abs.i32(i32, i1)
-declare i64 @llvm.abs.i64(i64, i1)
-declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
-declare <3 x i8> @llvm.abs.v3i8(<3 x i8>, i1)
-declare <2 x i16> @llvm.abs.v2i16(<2 x i16>, i1)
-declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
-declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
-
-define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
-; SDAG6-LABEL: abs_sgpr_i16:
-; SDAG6: ; %bb.0:
-; SDAG6-NEXT: s_sext_i32_i16 s0, s0
-; SDAG6-NEXT: s_abs_i32 s0, s0
-; SDAG6-NEXT: ; return to shader part epilog
-;
-; SDAG8-LABEL: abs_sgpr_i16:
-; SDAG8: ; %bb.0:
-; SDAG8-NEXT: s_sext_i32_i16 s0, s0
-; SDAG8-NEXT: s_abs_i32 s0, s0
-; SDAG8-NEXT: ; return to shader part epilog
-;
-; SDAG10-LABEL: abs_sgpr_i16:
-; SDAG10: ; %bb.0:
-; SDAG10-NEXT: s_sext_i32_i16 s0, s0
-; SDAG10-NEXT: s_abs_i32 s0, s0
-; SDAG10-NEXT: ; return to shader part epilog
-;
-; SDAG1250-LABEL: abs_sgpr_i16:
-; SDAG1250: ; %bb.0:
-; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
-; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; SDAG1250-NEXT: s_abs_i32 s0, s0
-; SDAG1250-NEXT: ; return to shader part epilog
-
-; GFX6-LABEL: abs_sgpr_i16:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sext_i32_i16 s0, s0
-; GFX6-NEXT: s_abs_i32 s0, s0
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_abs_i32 s0, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_abs_i32 s0, s0
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX1250-LABEL: abs_sgpr_i16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_sext_i32_i16 s0, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_abs_i32 s0, s0
-; GFX1250-NEXT: ; return to shader part epilog
- %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
- ret i16 %res
-}
-
-define amdgpu_ps i16 @abs_sgpr_i16_neg(i16 inreg %arg) {
-; COM: Suboptimal code generation on Tahiti.
-; SDAG6-LABEL: abs_sgpr_i16_neg:
-; SDAG6: ; %bb.0:
-; SDAG6-NEXT: s_sext_i32_i16 s1, s0
-; SDAG6-NEXT: s_ashr_i32 s1, s1, 15
-; SDAG6-NEXT: s_xor_b32 s0, s0, s1
-; SDAG6-NEXT: s_sub_i32 s0, s1, s0
-; SDAG6-NEXT: ; return to shader part epilog
-;
-; SDAG8-LABEL: abs_sgpr_i16_neg:
-; SDAG8: ; %bb.0:
-; SDAG8-NEXT: s_sext_i32_i16 s0, s0
-; SDAG8-NEXT: s_abs_i32 s0, s0
-; SDAG8-NEXT: s_sub_i32 s0, 0, s0
-; SDAG8-NEXT: ; return to shader part epilog
-;
-; SDAG10-LABEL: abs_sgpr_i16_neg:
-; SDAG10: ; %bb.0:
-; SDAG10-NEXT: s_sext_i32_i16 s0, s0
-; SDAG10-NEXT: s_abs_i32 s0, s0
-; SDAG10-NEXT: s_sub_i32 s0, 0, s0
-; SDAG10-NEXT: ; return to shader part epilog
-;
-; SDAG1250-LABEL: abs_sgpr_i16_neg:
-; SDAG1250: ; %bb.0:
-; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
-; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; SDAG1250-NEXT: s_abs_i32 s0, s0
-; SDAG1250-NEXT: s_sub_co_i32 s0, 0, s0
-; SDAG1250-NEXT: ; return to shader part epilog
-;
-; GFX6-LABEL: abs_sgpr_i16_neg:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sext_i32_i16 s0, s0
-; GFX6-NEXT: s_abs_i32 s0, s0
-; GFX6-NEXT: s_sub_i32 s0, 0, s0
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_i16_neg:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_abs_i32 s0, s0
-; GFX8-NEXT: s_sub_i32 s0, 0, s0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_i16_neg:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s0, s0
-; GFX10-NEXT: s_abs_i32 s0, s0
-; GFX10-NEXT: s_sub_i32 s0, 0, s0
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX1250-LABEL: abs_sgpr_i16_neg:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_sext_i32_i16 s0, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_abs_i32 s0, s0
-; GFX1250-NEXT: s_sub_co_i32 s0, 0, s0
-; GFX1250-NEXT: ; return to shader part epilog
- %res1 = call i16 @llvm.abs.i16(i16 %arg, i1 false)
- %res2 = sub i16 0, %res1
- ret i16 %res2
-}
-
-define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
-; GFX-LABEL: abs_sgpr_i32:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: ; return to shader part epilog
- %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
- ret i32 %res
-}
-
-define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
-; GFX6-LABEL: abs_sgpr_i64:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_ashr_i32 s2, s1, 31
-; GFX6-NEXT: s_add_u32 s0, s0, s2
-; GFX6-NEXT: s_mov_b32 s3, s2
-; GFX6-NEXT: s_addc_u32 s1, s1, s2
-; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_i64:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_ashr_i32 s2, s1, 31
-; GFX8-NEXT: s_add_u32 s0, s0, s2
-; GFX8-NEXT: s_mov_b32 s3, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, s2
-; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_i64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_ashr_i32 s2, s1, 31
-; GFX10-NEXT: s_add_u32 s0, s0, s2
-; GFX10-NEXT: s_mov_b32 s3, s2
-; GFX10-NEXT: s_addc_u32 s1, s1, s2
-; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX1250-LABEL: abs_sgpr_i64:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_ashr_i32 s2, s1, 31
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_mov_b32 s3, s2
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX1250-NEXT: ; return to shader part epilog
- %res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
- ret i64 %res
-}
-
-define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v4i32:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: s_abs_i32 s1, s1
-; GFX-NEXT: s_abs_i32 s2, s2
-; GFX-NEXT: s_abs_i32 s3, s3
-; GFX-NEXT: ; return to shader part epilog
- %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
- ret <4 x i32> %res
-}
-
-define i16 @abs_vgpr_i16(i16 %arg) {
-; GFX6-LABEL: abs_vgpr_i16:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: abs_vgpr_i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
-; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: abs_vgpr_i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX10-NEXT: v_max_i16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250-LABEL: abs_vgpr_i16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_max_i16 v0, v0, v1
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
- ret i16 %res
-}
-
-define i32 @abs_vgpr_i32(i32 %arg) {
-; GFX6-LABEL: abs_vgpr_i32:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: abs_vgpr_i32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
-; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: abs_vgpr_i32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250-LABEL: abs_vgpr_i32:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
- ret i32 %res
-}
-
-define i64 @abs_vgpr_i64(i64 %arg) {
-; GFX6-LABEL: abs_vgpr_i64:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
-; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: abs_vgpr_i64:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
-; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: abs_vgpr_i64:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
-; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
-; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250-LABEL: abs_vgpr_i64:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mov_b32_e32 v3, v2
-; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
-; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
- ret i64 %res
-}
-
-define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
-; GFX6-LABEL: abs_vgpr_v4i32:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v4
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: abs_vgpr_v4i32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
-; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
-; GFX8-NEXT: v_max_i32_e32 v1, v1, v4
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
-; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
-; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
-; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: abs_vgpr_v4i32:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2
-; GFX10-NEXT: v_sub_nc_u32_e32 v7, 0, v3
-; GFX10-NEXT: v_max_i32_e32 v0, v0, v4
-; GFX10-NEXT: v_max_i32_e32 v1, v1, v5
-; GFX10-NEXT: v_max_i32_e32 v2, v2, v6
-; GFX10-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250-LABEL: abs_vgpr_v4i32:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
-; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_max_i32_e32 v0, v0, v4
-; GFX1250-NEXT: v_max_i32_e32 v1, v1, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
-; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
- ret <4 x i32> %res
-}
-
-define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v2i8:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_sext_i32_i8 s0, s0
-; GFX-NEXT: s_sext_i32_i8 s1, s1
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: s_abs_i32 s1, s1
-; GFX-NEXT: ; return to shader part epilog
- %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
- ret <2 x i8> %res
-}
-
-define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
-; GFX6-LABEL: abs_vgpr_v2i8:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: abs_vgpr_v2i8:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: abs_vgpr_v2i8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0
-; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
-; GFX10-NEXT: v_max_i16 v0, v0, v2
-; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250-LABEL: abs_vgpr_v2i8:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_sub_nc_u16 v2, 0, v0
-; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_max_i16 v0, v0, v2
-; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
- ret <2 x i8> %res
-}
-
-define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v3i8:
-; GFX: ; %bb.0:
-; GFX-NEXT: s_sext_i32_i8 s0, s0
-; GFX-NEXT: s_sext_i32_i8 s1, s1
-; GFX-NEXT: s_sext_i32_i8 s2, s2
-; GFX-NEXT: s_abs_i32 s0, s0
-; GFX-NEXT: s_abs_i32 s1, s1
-; GFX-NEXT: s_abs_i32 s2, s2
-; GFX-NEXT: ; return to shader part epilog
- %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
- ret <3 x i8> %res
-}
-
-define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
-; GFX6-LABEL: abs_vgpr_v3i8:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: abs_vgpr_v3i8:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: abs_vgpr_v3i8:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX10-NEXT: v_sub_nc_u16 v3, 0, v0
-; GFX10-NEXT: v_sub_nc_u16 v4, 0, v1
-; GFX10-NEXT: v_sub_nc_u16 v5, 0, v2
-; GFX10-NEXT: v_max_i16 v0, v0, v3
-; GFX10-NEXT: v_max_i16 v1, v1, v4
-; GFX10-NEXT: v_max_i16 v2, v2, v5
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250-LABEL: abs_vgpr_v3i8:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v0
-; GFX1250-NEXT: v_sub_nc_u16 v4, 0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_sub_nc_u16 v5, 0, v2
-; GFX1250-NEXT: v_max_i16 v0, v0, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_max_i16 v1, v1, v4
-; GFX1250-NEXT: v_max_i16 v2, v2, v5
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
- ret <3 x i8> %res
-}
-
-define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
-; GFX6-LABEL: abs_sgpr_v2i16:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sext_i32_i16 s0, s0
-; GFX6-NEXT: s_sext_i32_i16 s1, s1
-; GFX6-NEXT: s_abs_i32 s0, s0
-; GFX6-NEXT: s_abs_i32 s1, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_v2i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s1, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_abs_i32 s1, s1
-; GFX8-NEXT: s_abs_i32 s0, s0
-; GFX8-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_v2i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s1, s0
-; GFX10-NEXT: s_ashr_i32 s0, s0, 16
-; GFX10-NEXT: s_abs_i32 s1, s1
-; GFX10-NEXT: s_abs_i32 s0, s0
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s0
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX1250-LABEL: abs_sgpr_v2i16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_sext_i32_i16 s1, s0
-; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
-; GFX1250-NEXT: s_abs_i32 s1, s1
-; GFX1250-NEXT: s_abs_i32 s0, s0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s1, s0
-; GFX1250-NEXT: ; return to shader part epilog
- %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
- ret <2 x i16> %res
-}
-
-define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
-; GFX6-LABEL: abs_vgpr_v2i16:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
-; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: abs_vgpr_v2i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, 0
-; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
-; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_max_i16_e32 v1, v0, v1
-; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: abs_vgpr_v2i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0
-; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250-LABEL: abs_vgpr_v2i16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
- ret <2 x i16> %res
-}
-
-define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
-; GFX6-LABEL: abs_sgpr_v3i16:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_sext_i32_i16 s0, s0
-; GFX6-NEXT: s_sext_i32_i16 s1, s1
-; GFX6-NEXT: s_sext_i32_i16 s2, s2
-; GFX6-NEXT: s_abs_i32 s0, s0
-; GFX6-NEXT: s_abs_i32 s1, s1
-; GFX6-NEXT: s_abs_i32 s2, s2
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: abs_sgpr_v3i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: s_sext_i32_i16 s0, s0
-; GFX8-NEXT: s_abs_i32 s2, s2
-; GFX8-NEXT: s_abs_i32 s0, s0
-; GFX8-NEXT: s_sext_i32_i16 s1, s1
-; GFX8-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NEXT: s_abs_i32 s1, s1
-; GFX8-NEXT: s_or_b32 s0, s0, s2
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: abs_sgpr_v3i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_sext_i32_i16 s2, s0
-; GFX10-NEXT: s_ashr_i32 s0, s0, 16
-; GFX10-NEXT: s_abs_i32 s2, s2
-; GFX10-NEXT: s_abs_i32 s0, s0
-; GFX10-NEXT: s_sext_i32_i16 s1, s1
-; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0
-; GFX10-NEXT: s_abs_i32 s1, s1
-; GFX10-NEXT: ; return to shader part epilog
-;
-; GFX1250-LABEL: abs_sgpr_v3i16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_sext_i32_i16 s2, s0
-; GFX1250-NEXT: s_ashr_i32 s0, s0, 16
-; GFX1250-NEXT: s_abs_i32 s2, s2
-; GFX1250-NEXT: s_abs_i32 s0, s0
-; GFX1250-NEXT: s_sext_i32_i16 s1, s1
-; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s2, s0
-; GFX1250-NEXT: s_abs_i32 s1, s1
-; GFX1250-NEXT: ; return to shader part epilog
- %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
- ret <3 x i16> %res
-}
-
-define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
-; GFX6-LABEL: abs_vgpr_v3i16:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: abs_vgpr_v3i16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0
-; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_sub_u16_e32 v4, 0, v1
-; GFX8-NEXT: v_max_i16_e32 v2, v0, v2
-; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: abs_vgpr_v3i16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
-; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
-; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
-; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX1250-LABEL: abs_vgpr_v3i16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
-; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
-; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
- %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
- ret <3 x i16> %res
-}
>From 294552188b3491282dfbac6e10df0f02986066e4 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Thu, 6 Nov 2025 13:11:24 -0500
Subject: [PATCH 15/19] Move file
---
llvm/test/CodeGen/AMDGPU/{GlobalISel => }/llvm.abs.ll | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename llvm/test/CodeGen/AMDGPU/{GlobalISel => }/llvm.abs.ll (100%)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
rename to llvm/test/CodeGen/AMDGPU/llvm.abs.ll
>From 68298cf3cc022137e85cd7f288634e0a13842ff9 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Thu, 6 Nov 2025 13:12:07 -0500
Subject: [PATCH 16/19] Restore testcase
---
llvm/test/CodeGen/AMDGPU/llvm.abs.ll | 887 +++++++++++++++++++++++++++
1 file changed, 887 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
index 6facdfdec64ae..dd7d2fbc931b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
@@ -1,4 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=SDAG6
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=SDAG8
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=SDAG10
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -o - < %s | FileCheck %s --check-prefixes=SDAG1250
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -o - < %s | FileCheck %s --check-prefixes=GFX,GFX6
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -o - < %s | FileCheck %s --check-prefixes=GFX,GFX8
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -o - < %s | FileCheck %s --check-prefixes=GFX,GFX10
@@ -13,7 +17,152 @@ declare <2 x i16> @llvm.abs.v2i16(<2 x i16>, i1)
declare <3 x i16> @llvm.abs.v3i16(<3 x i16>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
+define amdgpu_cs i8 @abs_sgpr_i8(i8 inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_i8:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i8 s0, s0
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_i8:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_i8:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_i8:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: ; return to shader part epilog
+;
+; GFX6-LABEL: abs_sgpr_i8:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i8 s0, s0
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i8:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i8 s0, s0
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i8 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: ; return to shader part epilog
+ %res = call i8 @llvm.abs.i8(i8 %arg, i1 false)
+ ret i8 %res
+}
+
+define amdgpu_cs i8 @abs_sgpr_i8_neg(i8 inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_i8_neg:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i8 s0, s0
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: s_sub_i32 s0, 0, s0
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_i8_neg:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_sext_i32_i8 s0, s0
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: s_sub_i32 s0, 0, s0
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_i8_neg:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_sext_i32_i8 s0, s0
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: s_sub_i32 s0, 0, s0
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_i8_neg:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_sext_i32_i8 s0, s0
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: s_sub_co_i32 s0, 0, s0
+; SDAG1250-NEXT: ; return to shader part epilog
+;
+; GFX6-LABEL: abs_sgpr_i8_neg:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i8 s0, s0
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_sub_i32 s0, 0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i8_neg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_sub_i32 s0, 0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i8_neg:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i8 s0, s0
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: s_sub_i32 s0, 0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i8_neg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i8 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_sub_co_i32 s0, 0, s0
+; GFX1250-NEXT: ; return to shader part epilog
+ %res1 = call i8 @llvm.abs.i8(i8 %arg, i1 false)
+ %res2 = sub i8 0, %res1
+ ret i8 %res2
+}
+
define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_i16:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i16 s0, s0
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_i16:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_i16:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_i16:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: ; return to shader part epilog
+;
; GFX6-LABEL: abs_sgpr_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sext_i32_i16 s0, s0
@@ -38,11 +187,95 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_abs_i32 s0, s0
; GFX1250-NEXT: ; return to shader part epilog
+
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
+define amdgpu_ps i16 @abs_sgpr_i16_neg(i16 inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_i16_neg:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i16 s0, s0
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: s_sub_i32 s0, 0, s0
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_i16_neg:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: s_sub_i32 s0, 0, s0
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_i16_neg:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: s_sub_i32 s0, 0, s0
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_i16_neg:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: s_sub_co_i32 s0, 0, s0
+; SDAG1250-NEXT: ; return to shader part epilog
+;
+; GFX6-LABEL: abs_sgpr_i16_neg:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sext_i32_i16 s0, s0
+; GFX6-NEXT: s_abs_i32 s0, s0
+; GFX6-NEXT: s_sub_i32 s0, 0, s0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i16_neg:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: s_abs_i32 s0, s0
+; GFX8-NEXT: s_sub_i32 s0, 0, s0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i16_neg:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_sext_i32_i16 s0, s0
+; GFX10-NEXT: s_abs_i32 s0, s0
+; GFX10-NEXT: s_sub_i32 s0, 0, s0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i16_neg:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_sext_i32_i16 s0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_abs_i32 s0, s0
+; GFX1250-NEXT: s_sub_co_i32 s0, 0, s0
+; GFX1250-NEXT: ; return to shader part epilog
+ %res1 = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ %res2 = sub i16 0, %res1
+ ret i16 %res2
+}
+
define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_i32:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_i32:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_i32:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_i32:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: ; return to shader part epilog
+;
; GFX-LABEL: abs_sgpr_i32:
; GFX: ; %bb.0:
; GFX-NEXT: s_abs_i32 s0, s0
@@ -52,6 +285,43 @@ define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
}
define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_i64:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_ashr_i32 s2, s1, 31
+; SDAG6-NEXT: s_mov_b32 s3, s2
+; SDAG6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SDAG6-NEXT: s_sub_u32 s0, s0, s2
+; SDAG6-NEXT: s_subb_u32 s1, s1, s2
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_i64:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_ashr_i32 s2, s1, 31
+; SDAG8-NEXT: s_mov_b32 s3, s2
+; SDAG8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SDAG8-NEXT: s_sub_u32 s0, s0, s2
+; SDAG8-NEXT: s_subb_u32 s1, s1, s2
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_i64:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_ashr_i32 s2, s1, 31
+; SDAG10-NEXT: s_mov_b32 s3, s2
+; SDAG10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SDAG10-NEXT: s_sub_u32 s0, s0, s2
+; SDAG10-NEXT: s_subb_u32 s1, s1, s2
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_i64:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_sub_nc_u64 s[2:3], 0, s[0:1]
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG1250-NEXT: v_max_i64 v[0:1], s[0:1], s[2:3]
+; SDAG1250-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG1250-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG1250-NEXT: ; return to shader part epilog
+;
; GFX6-LABEL: abs_sgpr_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_ashr_i32 s2, s1, 31
@@ -93,6 +363,38 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
}
define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_v4i32:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_abs_i32 s3, s3
+; SDAG6-NEXT: s_abs_i32 s2, s2
+; SDAG6-NEXT: s_abs_i32 s1, s1
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_v4i32:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_abs_i32 s3, s3
+; SDAG8-NEXT: s_abs_i32 s2, s2
+; SDAG8-NEXT: s_abs_i32 s1, s1
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_v4i32:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_abs_i32 s3, s3
+; SDAG10-NEXT: s_abs_i32 s2, s2
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: s_abs_i32 s1, s1
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_v4i32:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_abs_i32 s3, s3
+; SDAG1250-NEXT: s_abs_i32 s2, s2
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: s_abs_i32 s1, s1
+; SDAG1250-NEXT: ; return to shader part epilog
+;
; GFX-LABEL: abs_sgpr_v4i32:
; GFX: ; %bb.0:
; GFX-NEXT: s_abs_i32 s0, s0
@@ -105,6 +407,37 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
}
define i16 @abs_vgpr_i16(i16 %arg) {
+; SDAG6-LABEL: abs_vgpr_i16:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SDAG6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
+; SDAG6-NEXT: v_max_i32_e32 v0, v1, v0
+; SDAG6-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG8-LABEL: abs_vgpr_i16:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG8-NEXT: v_sub_u16_e32 v1, 0, v0
+; SDAG8-NEXT: v_max_i16_e32 v0, v0, v1
+; SDAG8-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG10-LABEL: abs_vgpr_i16:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG10-NEXT: v_sub_nc_u16 v1, 0, v0
+; SDAG10-NEXT: v_max_i16 v0, v0, v1
+; SDAG10-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG1250-LABEL: abs_vgpr_i16:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG1250-NEXT: s_wait_kmcnt 0x0
+; SDAG1250-NEXT: v_sub_nc_u16 v1, 0, v0
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG1250-NEXT: v_max_i16 v0, v0, v1
+; SDAG1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX6-LABEL: abs_vgpr_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -140,6 +473,36 @@ define i16 @abs_vgpr_i16(i16 %arg) {
}
define i32 @abs_vgpr_i32(i32 %arg) {
+; SDAG6-LABEL: abs_vgpr_i32:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
+; SDAG6-NEXT: v_max_i32_e32 v0, v1, v0
+; SDAG6-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG8-LABEL: abs_vgpr_i32:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
+; SDAG8-NEXT: v_max_i32_e32 v0, v1, v0
+; SDAG8-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG10-LABEL: abs_vgpr_i32:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; SDAG10-NEXT: v_max_i32_e32 v0, v1, v0
+; SDAG10-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG1250-LABEL: abs_vgpr_i32:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG1250-NEXT: s_wait_kmcnt 0x0
+; SDAG1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG1250-NEXT: v_max_i32_e32 v0, v1, v0
+; SDAG1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX6-LABEL: abs_vgpr_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -174,6 +537,45 @@ define i32 @abs_vgpr_i32(i32 %arg) {
}
define i64 @abs_vgpr_i64(i64 %arg) {
+; SDAG6-LABEL: abs_vgpr_i64:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; SDAG6-NEXT: v_xor_b32_e32 v0, v0, v2
+; SDAG6-NEXT: v_xor_b32_e32 v1, v1, v2
+; SDAG6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; SDAG6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; SDAG6-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG8-LABEL: abs_vgpr_i64:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; SDAG8-NEXT: v_xor_b32_e32 v0, v0, v2
+; SDAG8-NEXT: v_xor_b32_e32 v1, v1, v2
+; SDAG8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
+; SDAG8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; SDAG8-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG10-LABEL: abs_vgpr_i64:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; SDAG10-NEXT: v_xor_b32_e32 v0, v0, v2
+; SDAG10-NEXT: v_xor_b32_e32 v1, v1, v2
+; SDAG10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; SDAG10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; SDAG10-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG1250-LABEL: abs_vgpr_i64:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG1250-NEXT: s_wait_kmcnt 0x0
+; SDAG1250-NEXT: v_sub_nc_u64_e32 v[2:3], 0, v[0:1]
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG1250-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3]
+; SDAG1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX6-LABEL: abs_vgpr_i64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -221,6 +623,59 @@ define i64 @abs_vgpr_i64(i64 %arg) {
}
define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+; SDAG6-LABEL: abs_vgpr_v4i32:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
+; SDAG6-NEXT: v_max_i32_e32 v0, v4, v0
+; SDAG6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; SDAG6-NEXT: v_max_i32_e32 v1, v4, v1
+; SDAG6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; SDAG6-NEXT: v_max_i32_e32 v2, v4, v2
+; SDAG6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; SDAG6-NEXT: v_max_i32_e32 v3, v4, v3
+; SDAG6-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG8-LABEL: abs_vgpr_v4i32:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
+; SDAG8-NEXT: v_max_i32_e32 v0, v4, v0
+; SDAG8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
+; SDAG8-NEXT: v_max_i32_e32 v1, v4, v1
+; SDAG8-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
+; SDAG8-NEXT: v_max_i32_e32 v2, v4, v2
+; SDAG8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
+; SDAG8-NEXT: v_max_i32_e32 v3, v4, v3
+; SDAG8-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG10-LABEL: abs_vgpr_v4i32:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG10-NEXT: v_sub_nc_u32_e32 v4, 0, v0
+; SDAG10-NEXT: v_sub_nc_u32_e32 v5, 0, v1
+; SDAG10-NEXT: v_sub_nc_u32_e32 v6, 0, v2
+; SDAG10-NEXT: v_sub_nc_u32_e32 v7, 0, v3
+; SDAG10-NEXT: v_max_i32_e32 v0, v4, v0
+; SDAG10-NEXT: v_max_i32_e32 v1, v5, v1
+; SDAG10-NEXT: v_max_i32_e32 v2, v6, v2
+; SDAG10-NEXT: v_max_i32_e32 v3, v7, v3
+; SDAG10-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG1250-LABEL: abs_vgpr_v4i32:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG1250-NEXT: s_wait_kmcnt 0x0
+; SDAG1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
+; SDAG1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; SDAG1250-NEXT: v_max_i32_e32 v0, v4, v0
+; SDAG1250-NEXT: v_max_i32_e32 v1, v5, v1
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; SDAG1250-NEXT: v_max_i32_e32 v2, v6, v2
+; SDAG1250-NEXT: v_max_i32_e32 v3, v7, v3
+; SDAG1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX6-LABEL: abs_vgpr_v4i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -278,6 +733,53 @@ define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
}
define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_v2i8:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i8 s1, s1
+; SDAG6-NEXT: s_sext_i32_i8 s0, s0
+; SDAG6-NEXT: s_abs_i32 s1, s1
+; SDAG6-NEXT: s_lshl_b32 s2, s1, 8
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: s_or_b32 s0, s0, s2
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_v2i8:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_bfe_i32 s1, s1, 0x80000
+; SDAG8-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG8-NEXT: s_sext_i32_i16 s1, s1
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_abs_i32 s1, s1
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: s_lshl_b32 s2, s1, 8
+; SDAG8-NEXT: s_or_b32 s0, s0, s2
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_v2i8:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_bfe_i32 s1, s1, 0x80000
+; SDAG10-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG10-NEXT: s_sext_i32_i16 s1, s1
+; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_abs_i32 s1, s1
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: s_lshl_b32 s2, s1, 8
+; SDAG10-NEXT: s_or_b32 s0, s0, s2
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_v2i8:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_bfe_i32 s1, s1, 0x80000
+; SDAG1250-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG1250-NEXT: s_sext_i32_i16 s1, s1
+; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_abs_i32 s1, s1
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: s_lshl_b32 s2, s1, 8
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_or_b32 s0, s0, s2
+; SDAG1250-NEXT: ; return to shader part epilog
+;
; GFX-LABEL: abs_sgpr_v2i8:
; GFX: ; %bb.0:
; GFX-NEXT: s_sext_i32_i8 s0, s0
@@ -290,6 +792,64 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
}
define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
+; SDAG6-LABEL: abs_vgpr_v2i8:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG6-NEXT: v_bfe_i32 v0, v0, 0, 8
+; SDAG6-NEXT: v_bfe_i32 v1, v1, 0, 8
+; SDAG6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
+; SDAG6-NEXT: v_max_i32_e32 v0, v2, v0
+; SDAG6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
+; SDAG6-NEXT: v_max_i32_e32 v1, v2, v1
+; SDAG6-NEXT: v_lshlrev_b32_e32 v2, 8, v1
+; SDAG6-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG6-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG8-LABEL: abs_vgpr_v2i8:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG8-NEXT: v_mov_b32_e32 v2, 0
+; SDAG8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; SDAG8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; SDAG8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG8-NEXT: v_lshlrev_b16_e32 v2, 8, v1
+; SDAG8-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG8-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SDAG8-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG10-LABEL: abs_vgpr_v2i8:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG10-NEXT: v_bfe_i32 v1, v1, 0, 8
+; SDAG10-NEXT: v_bfe_i32 v0, v0, 0, 8
+; SDAG10-NEXT: v_sub_nc_u16 v2, 0, v1
+; SDAG10-NEXT: v_sub_nc_u16 v3, 0, v0
+; SDAG10-NEXT: v_max_i16 v1, v1, v2
+; SDAG10-NEXT: v_max_i16 v0, v0, v3
+; SDAG10-NEXT: v_lshlrev_b16 v2, 8, v1
+; SDAG10-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SDAG10-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG10-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG1250-LABEL: abs_vgpr_v2i8:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG1250-NEXT: s_wait_kmcnt 0x0
+; SDAG1250-NEXT: v_bfe_i32 v1, v1, 0, 8
+; SDAG1250-NEXT: v_bfe_i32 v0, v0, 0, 8
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG1250-NEXT: v_sub_nc_u16 v2, 0, v1
+; SDAG1250-NEXT: v_sub_nc_u16 v3, 0, v0
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG1250-NEXT: v_max_i16 v1, v1, v2
+; SDAG1250-NEXT: v_max_i16 v0, v0, v3
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG1250-NEXT: v_lshlrev_b16 v2, 8, v1
+; SDAG1250-NEXT: v_and_b32_e32 v1, 0xff, v1
+; SDAG1250-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX6-LABEL: abs_vgpr_v2i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -340,6 +900,79 @@ define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
}
define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_v3i8:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i8 s1, s1
+; SDAG6-NEXT: s_sext_i32_i8 s0, s0
+; SDAG6-NEXT: s_sext_i32_i8 s2, s2
+; SDAG6-NEXT: s_abs_i32 s1, s1
+; SDAG6-NEXT: s_abs_i32 s2, s2
+; SDAG6-NEXT: s_lshl_b32 s1, s1, 8
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: s_lshl_b32 s3, s2, 16
+; SDAG6-NEXT: s_or_b32 s0, s0, s1
+; SDAG6-NEXT: s_or_b32 s0, s0, s3
+; SDAG6-NEXT: s_lshr_b32 s1, s0, 8
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_v3i8:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_bfe_i32 s1, s1, 0x80000
+; SDAG8-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG8-NEXT: s_sext_i32_i16 s1, s1
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_abs_i32 s1, s1
+; SDAG8-NEXT: s_bfe_i32 s2, s2, 0x80000
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: s_sext_i32_i16 s2, s2
+; SDAG8-NEXT: s_lshl_b32 s1, s1, 8
+; SDAG8-NEXT: s_abs_i32 s2, s2
+; SDAG8-NEXT: s_or_b32 s0, s0, s1
+; SDAG8-NEXT: s_lshl_b32 s3, s2, 16
+; SDAG8-NEXT: s_and_b32 s1, s0, 0xffff
+; SDAG8-NEXT: s_or_b32 s1, s1, s3
+; SDAG8-NEXT: s_lshr_b32 s1, s1, 8
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_v3i8:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_bfe_i32 s1, s1, 0x80000
+; SDAG10-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG10-NEXT: s_sext_i32_i16 s1, s1
+; SDAG10-NEXT: s_bfe_i32 s2, s2, 0x80000
+; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_abs_i32 s1, s1
+; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: s_lshl_b32 s1, s1, 8
+; SDAG10-NEXT: s_sext_i32_i16 s2, s2
+; SDAG10-NEXT: s_or_b32 s0, s0, s1
+; SDAG10-NEXT: s_abs_i32 s2, s2
+; SDAG10-NEXT: s_and_b32 s1, s0, 0xffff
+; SDAG10-NEXT: s_lshl_b32 s3, s2, 16
+; SDAG10-NEXT: s_or_b32 s1, s1, s3
+; SDAG10-NEXT: s_lshr_b32 s1, s1, 8
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_v3i8:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_bfe_i32 s1, s1, 0x80000
+; SDAG1250-NEXT: s_bfe_i32 s0, s0, 0x80000
+; SDAG1250-NEXT: s_sext_i32_i16 s1, s1
+; SDAG1250-NEXT: s_bfe_i32 s2, s2, 0x80000
+; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_abs_i32 s1, s1
+; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: s_lshl_b32 s1, s1, 8
+; SDAG1250-NEXT: s_sext_i32_i16 s2, s2
+; SDAG1250-NEXT: s_or_b32 s0, s0, s1
+; SDAG1250-NEXT: s_abs_i32 s2, s2
+; SDAG1250-NEXT: s_and_b32 s1, s0, 0xffff
+; SDAG1250-NEXT: s_lshl_b32 s3, s2, 16
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_or_b32 s1, s1, s3
+; SDAG1250-NEXT: s_lshr_b32 s1, s1, 8
+; SDAG1250-NEXT: ; return to shader part epilog
+;
; GFX-LABEL: abs_sgpr_v3i8:
; GFX: ; %bb.0:
; GFX-NEXT: s_sext_i32_i8 s0, s0
@@ -354,6 +987,86 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
}
define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
+; SDAG6-LABEL: abs_vgpr_v3i8:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG6-NEXT: v_bfe_i32 v0, v0, 0, 8
+; SDAG6-NEXT: v_bfe_i32 v1, v1, 0, 8
+; SDAG6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; SDAG6-NEXT: v_max_i32_e32 v0, v3, v0
+; SDAG6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; SDAG6-NEXT: v_max_i32_e32 v1, v3, v1
+; SDAG6-NEXT: v_bfe_i32 v2, v2, 0, 8
+; SDAG6-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; SDAG6-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG6-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
+; SDAG6-NEXT: v_max_i32_e32 v2, v1, v2
+; SDAG6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; SDAG6-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG6-NEXT: v_lshrrev_b32_e32 v1, 8, v0
+; SDAG6-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG8-LABEL: abs_vgpr_v3i8:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG8-NEXT: v_mov_b32_e32 v3, 0
+; SDAG8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; SDAG8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG8-NEXT: v_sub_u16_sdwa v4, v3, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; SDAG8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG8-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG8-NEXT: v_sub_u16_sdwa v1, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; SDAG8-NEXT: v_max_i16_sdwa v2, sext(v2), v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; SDAG8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; SDAG8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG8-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; SDAG8-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG10-LABEL: abs_vgpr_v3i8:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG10-NEXT: v_bfe_i32 v1, v1, 0, 8
+; SDAG10-NEXT: v_bfe_i32 v0, v0, 0, 8
+; SDAG10-NEXT: v_bfe_i32 v2, v2, 0, 8
+; SDAG10-NEXT: v_sub_nc_u16 v3, 0, v1
+; SDAG10-NEXT: v_sub_nc_u16 v4, 0, v0
+; SDAG10-NEXT: v_sub_nc_u16 v5, 0, v2
+; SDAG10-NEXT: v_max_i16 v1, v1, v3
+; SDAG10-NEXT: v_max_i16 v0, v0, v4
+; SDAG10-NEXT: v_max_i16 v2, v2, v5
+; SDAG10-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; SDAG10-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG10-NEXT: v_or_b32_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; SDAG10-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; SDAG10-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG1250-LABEL: abs_vgpr_v3i8:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG1250-NEXT: s_wait_kmcnt 0x0
+; SDAG1250-NEXT: v_bfe_i32 v1, v1, 0, 8
+; SDAG1250-NEXT: v_bfe_i32 v0, v0, 0, 8
+; SDAG1250-NEXT: v_bfe_i32 v2, v2, 0, 8
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; SDAG1250-NEXT: v_sub_nc_u16 v3, 0, v1
+; SDAG1250-NEXT: v_sub_nc_u16 v4, 0, v0
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; SDAG1250-NEXT: v_max_i16 v1, v1, v3
+; SDAG1250-NEXT: v_sub_nc_u16 v3, 0, v2
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; SDAG1250-NEXT: v_max_i16 v0, v0, v4
+; SDAG1250-NEXT: v_lshlrev_b16 v1, 8, v1
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG1250-NEXT: v_max_i16 v2, v2, v3
+; SDAG1250-NEXT: v_dual_lshlrev_b32 v1, 16, v2 :: v_dual_bitop2_b32 v0, v0, v1 bitop3:0x54
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG1250-NEXT: v_and_b32_e32 v3, 0xffff, v0
+; SDAG1250-NEXT: v_or_b32_e32 v1, v3, v1
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG1250-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; SDAG1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX6-LABEL: abs_vgpr_v3i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -416,6 +1129,41 @@ define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
}
define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_v2i16:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i16 s1, s1
+; SDAG6-NEXT: s_sext_i32_i16 s0, s0
+; SDAG6-NEXT: s_abs_i32 s1, s1
+; SDAG6-NEXT: s_lshl_b32 s2, s1, 16
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: s_or_b32 s0, s0, s2
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_v2i16:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_ashr_i32 s1, s0, 16
+; SDAG8-NEXT: s_abs_i32 s1, s1
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_lshl_b32 s1, s1, 16
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: s_or_b32 s0, s0, s1
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_v2i16:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: v_pk_sub_i16 v0, 0, s0
+; SDAG10-NEXT: v_pk_max_i16 v0, s0, v0
+; SDAG10-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_v2i16:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: v_pk_sub_i16 v0, 0, s0
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG1250-NEXT: v_pk_max_i16 v0, s0, v0
+; SDAG1250-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG1250-NEXT: ; return to shader part epilog
+;
; GFX6-LABEL: abs_sgpr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sext_i32_i16 s0, s0
@@ -458,6 +1206,46 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
}
define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
+; SDAG6-LABEL: abs_vgpr_v2i16:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SDAG6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; SDAG6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
+; SDAG6-NEXT: v_max_i32_e32 v0, v2, v0
+; SDAG6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
+; SDAG6-NEXT: v_max_i32_e32 v1, v2, v1
+; SDAG6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; SDAG6-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG6-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG8-LABEL: abs_vgpr_v2i16:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG8-NEXT: v_mov_b32_e32 v1, 0
+; SDAG8-NEXT: v_sub_u16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; SDAG8-NEXT: v_sub_u16_e32 v2, 0, v0
+; SDAG8-NEXT: v_max_i16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; SDAG8-NEXT: v_max_i16_e32 v0, v0, v2
+; SDAG8-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG8-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG10-LABEL: abs_vgpr_v2i16:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG10-NEXT: v_pk_sub_i16 v1, 0, v0
+; SDAG10-NEXT: v_pk_max_i16 v0, v0, v1
+; SDAG10-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG1250-LABEL: abs_vgpr_v2i16:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG1250-NEXT: s_wait_kmcnt 0x0
+; SDAG1250-NEXT: v_pk_sub_i16 v1, 0, v0
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG1250-NEXT: v_pk_max_i16 v0, v0, v1
+; SDAG1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX6-LABEL: abs_vgpr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -500,6 +1288,55 @@ define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
}
define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
+; SDAG6-LABEL: abs_sgpr_v3i16:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_sext_i32_i16 s1, s1
+; SDAG6-NEXT: s_sext_i32_i16 s3, s2
+; SDAG6-NEXT: s_abs_i32 s1, s1
+; SDAG6-NEXT: s_sext_i32_i16 s0, s0
+; SDAG6-NEXT: s_lshl_b32 s2, s1, 16
+; SDAG6-NEXT: s_abs_i32 s3, s3
+; SDAG6-NEXT: s_abs_i32 s0, s0
+; SDAG6-NEXT: s_lshr_b64 s[4:5], s[2:3], 16
+; SDAG6-NEXT: s_or_b32 s0, s0, s2
+; SDAG6-NEXT: s_mov_b32 s1, s4
+; SDAG6-NEXT: s_mov_b32 s2, s3
+; SDAG6-NEXT: ; return to shader part epilog
+;
+; SDAG8-LABEL: abs_sgpr_v3i16:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_ashr_i32 s2, s0, 16
+; SDAG8-NEXT: s_abs_i32 s2, s2
+; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_sext_i32_i16 s1, s1
+; SDAG8-NEXT: s_abs_i32 s0, s0
+; SDAG8-NEXT: s_lshl_b32 s2, s2, 16
+; SDAG8-NEXT: s_abs_i32 s1, s1
+; SDAG8-NEXT: s_or_b32 s0, s0, s2
+; SDAG8-NEXT: ; return to shader part epilog
+;
+; SDAG10-LABEL: abs_sgpr_v3i16:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: v_pk_sub_i16 v0, 0, s0
+; SDAG10-NEXT: v_pk_sub_i16 v1, 0, s1
+; SDAG10-NEXT: v_pk_max_i16 v0, s0, v0
+; SDAG10-NEXT: v_pk_max_i16 v1, s1, v1
+; SDAG10-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG10-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG10-NEXT: ; return to shader part epilog
+;
+; SDAG1250-LABEL: abs_sgpr_v3i16:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: v_pk_sub_i16 v0, 0, s0
+; SDAG1250-NEXT: v_pk_sub_i16 v1, 0, s1
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG1250-NEXT: v_pk_max_i16 v0, s0, v0
+; SDAG1250-NEXT: v_pk_max_i16 v1, s1, v1
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG1250-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG1250-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG1250-NEXT: ; return to shader part epilog
+;
; GFX6-LABEL: abs_sgpr_v3i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sext_i32_i16 s0, s0
@@ -549,6 +1386,56 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
}
define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
+; SDAG6-LABEL: abs_vgpr_v3i16:
+; SDAG6: ; %bb.0:
+; SDAG6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; SDAG6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; SDAG6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; SDAG6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; SDAG6-NEXT: v_max_i32_e32 v0, v3, v0
+; SDAG6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; SDAG6-NEXT: v_max_i32_e32 v1, v3, v1
+; SDAG6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
+; SDAG6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SDAG6-NEXT: v_max_i32_e32 v2, v3, v2
+; SDAG6-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; SDAG6-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG8-LABEL: abs_vgpr_v3i16:
+; SDAG8: ; %bb.0:
+; SDAG8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG8-NEXT: v_mov_b32_e32 v2, 0
+; SDAG8-NEXT: v_sub_u16_e32 v3, 0, v1
+; SDAG8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; SDAG8-NEXT: v_max_i16_e32 v1, v1, v3
+; SDAG8-NEXT: v_sub_u16_e32 v3, 0, v0
+; SDAG8-NEXT: v_max_i16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; SDAG8-NEXT: v_max_i16_e32 v0, v0, v3
+; SDAG8-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG8-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG10-LABEL: abs_vgpr_v3i16:
+; SDAG10: ; %bb.0:
+; SDAG10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG10-NEXT: v_pk_sub_i16 v2, 0, v0
+; SDAG10-NEXT: v_pk_sub_i16 v3, 0, v1
+; SDAG10-NEXT: v_pk_max_i16 v0, v0, v2
+; SDAG10-NEXT: v_pk_max_i16 v1, v1, v3
+; SDAG10-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG1250-LABEL: abs_vgpr_v3i16:
+; SDAG1250: ; %bb.0:
+; SDAG1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG1250-NEXT: s_wait_kmcnt 0x0
+; SDAG1250-NEXT: v_pk_sub_i16 v2, 0, v0
+; SDAG1250-NEXT: v_pk_sub_i16 v3, 0, v1
+; SDAG1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG1250-NEXT: v_pk_max_i16 v0, v0, v2
+; SDAG1250-NEXT: v_pk_max_i16 v1, v1, v3
+; SDAG1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX6-LABEL: abs_vgpr_v3i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
>From 09b39391e8b22ede4c6fc86b4243d7a2449bb88b Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Fri, 7 Nov 2025 19:42:31 -0500
Subject: [PATCH 17/19] Doesn't work
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 24 ++++++++++++++++-------
1 file changed, 17 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5f9a8d6f9c59f..c56ce443f963c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -181,7 +181,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We don't want the default expansion of 16-bit ABS since we can
// sign-extend and use the 32-bit ABS operation for 16-bit ABS with SGPRs
- setOperationAction(ISD::ABS, MVT::i16, Custom);
+ setOperationAction(ISD::ABS, {MVT::i8,MVT::i16}, Custom);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -979,7 +979,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
Custom);
}
- setTargetDAGCombine({ISD::ADD,
+ setTargetDAGCombine({ISD::ABS,
+ ISD::ADD,
ISD::PTRADD,
ISD::UADDO_CARRY,
ISD::SUB,
@@ -6779,7 +6780,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);
case ISD::ABS:
- if (Op.getValueType() == MVT::i16)
+ if (Op.getValueType() == MVT::i16 || Op.getValueType() == MVT::i8)
return lowerABSi16(Op, DAG);
LLVM_FALLTHROUGH;
case ISD::FABS:
@@ -7280,7 +7281,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
- switch (N->getOpcode()) {
+ switch (N->getOpcode()) {
case ISD::INSERT_VECTOR_ELT: {
if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
Results.push_back(Res);
@@ -7458,6 +7459,15 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
break;
}
+ case ISD::ABS:
+ if (N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i8) {
+ SDValue result = lowerABSi16(SDValue(N, 0), DAG);
+ if(result!=SDValue()) {
+ Results.push_back(result);
+ return;
+ }
+ }
+ LLVM_FALLTHROUGH;
default:
AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
break;
@@ -8151,7 +8161,7 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::lowerABSi16(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::ABS &&
"Tried to select abs with non-abs opcode.");
- assert(Op.getValueType() == MVT::i16 &&
+ assert((Op.getValueType() == MVT::i16 || Op.getValueType() == MVT::i8) &&
"Tried to select abs i16 lowering with non-i16 type.");
// divergent means will not end up using SGPRs
@@ -8163,7 +8173,7 @@ SDValue SITargetLowering::lowerABSi16(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Src);
SDValue SExtSrc = DAG.getSExtOrTrunc(Src, DL, MVT::i32);
SDValue ExtAbs = DAG.getNode(ISD::ABS, DL, MVT::i32, SExtSrc);
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, ExtAbs);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), ExtAbs);
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
@@ -16882,7 +16892,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
switch (N->getOpcode()) {
- case ISD::ADD:
+ case ISD::ADD:
return performAddCombine(N, DCI);
case ISD::PTRADD:
return performPtrAddCombine(N, DCI);
>From e134577df56cad6d3ef27b738a7ac072b6380bc3 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Fri, 7 Nov 2025 19:45:45 -0500
Subject: [PATCH 18/19] Update testcases to test for the inferior code
generation resulting from this approach.
---
llvm/test/CodeGen/AMDGPU/llvm.abs.ll | 111 ++++++++++++---------------
1 file changed, 51 insertions(+), 60 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
index dd7d2fbc931b6..54c6f4148e328 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.abs.ll
@@ -26,23 +26,20 @@ define amdgpu_cs i8 @abs_sgpr_i8(i8 inreg %arg) {
;
; SDAG8-LABEL: abs_sgpr_i8:
; SDAG8: ; %bb.0:
-; SDAG8-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_sext_i32_i8 s0, s0
; SDAG8-NEXT: s_abs_i32 s0, s0
; SDAG8-NEXT: ; return to shader part epilog
;
; SDAG10-LABEL: abs_sgpr_i8:
; SDAG10: ; %bb.0:
-; SDAG10-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_sext_i32_i8 s0, s0
; SDAG10-NEXT: s_abs_i32 s0, s0
; SDAG10-NEXT: ; return to shader part epilog
;
; SDAG1250-LABEL: abs_sgpr_i8:
; SDAG1250: ; %bb.0:
-; SDAG1250-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_sext_i32_i8 s0, s0
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; SDAG1250-NEXT: s_abs_i32 s0, s0
; SDAG1250-NEXT: ; return to shader part epilog
;
@@ -77,31 +74,39 @@ define amdgpu_cs i8 @abs_sgpr_i8(i8 inreg %arg) {
define amdgpu_cs i8 @abs_sgpr_i8_neg(i8 inreg %arg) {
; SDAG6-LABEL: abs_sgpr_i8_neg:
; SDAG6: ; %bb.0:
-; SDAG6-NEXT: s_sext_i32_i8 s0, s0
-; SDAG6-NEXT: s_abs_i32 s0, s0
-; SDAG6-NEXT: s_sub_i32 s0, 0, s0
+; SDAG6-NEXT: s_sext_i32_i8 s1, s0
+; SDAG6-NEXT: s_ashr_i32 s1, s1, 7
+; SDAG6-NEXT: s_xor_b32 s0, s0, s1
+; SDAG6-NEXT: s_sub_i32 s0, s1, s0
; SDAG6-NEXT: ; return to shader part epilog
;
; SDAG8-LABEL: abs_sgpr_i8_neg:
; SDAG8: ; %bb.0:
-; SDAG8-NEXT: s_sext_i32_i8 s0, s0
-; SDAG8-NEXT: s_abs_i32 s0, s0
-; SDAG8-NEXT: s_sub_i32 s0, 0, s0
+; SDAG8-NEXT: s_bfe_i32 s1, s0, 0x80000
+; SDAG8-NEXT: s_sext_i32_i16 s1, s1
+; SDAG8-NEXT: s_ashr_i32 s1, s1, 7
+; SDAG8-NEXT: s_xor_b32 s0, s0, s1
+; SDAG8-NEXT: s_sub_i32 s0, s1, s0
; SDAG8-NEXT: ; return to shader part epilog
;
; SDAG10-LABEL: abs_sgpr_i8_neg:
; SDAG10: ; %bb.0:
-; SDAG10-NEXT: s_sext_i32_i8 s0, s0
-; SDAG10-NEXT: s_abs_i32 s0, s0
-; SDAG10-NEXT: s_sub_i32 s0, 0, s0
+; SDAG10-NEXT: s_bfe_i32 s1, s0, 0x80000
+; SDAG10-NEXT: s_sext_i32_i16 s1, s1
+; SDAG10-NEXT: s_ashr_i32 s1, s1, 7
+; SDAG10-NEXT: s_xor_b32 s0, s0, s1
+; SDAG10-NEXT: s_sub_i32 s0, s1, s0
; SDAG10-NEXT: ; return to shader part epilog
;
; SDAG1250-LABEL: abs_sgpr_i8_neg:
; SDAG1250: ; %bb.0:
-; SDAG1250-NEXT: s_sext_i32_i8 s0, s0
+; SDAG1250-NEXT: s_bfe_i32 s1, s0, 0x80000
; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; SDAG1250-NEXT: s_abs_i32 s0, s0
-; SDAG1250-NEXT: s_sub_co_i32 s0, 0, s0
+; SDAG1250-NEXT: s_sext_i32_i16 s1, s1
+; SDAG1250-NEXT: s_ashr_i32 s1, s1, 7
+; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG1250-NEXT: s_xor_b32 s0, s0, s1
+; SDAG1250-NEXT: s_sub_co_i32 s0, s1, s0
; SDAG1250-NEXT: ; return to shader part epilog
;
; GFX6-LABEL: abs_sgpr_i8_neg:
@@ -195,9 +200,10 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
define amdgpu_ps i16 @abs_sgpr_i16_neg(i16 inreg %arg) {
; SDAG6-LABEL: abs_sgpr_i16_neg:
; SDAG6: ; %bb.0:
-; SDAG6-NEXT: s_sext_i32_i16 s0, s0
-; SDAG6-NEXT: s_abs_i32 s0, s0
-; SDAG6-NEXT: s_sub_i32 s0, 0, s0
+; SDAG6-NEXT: s_sext_i32_i16 s1, s0
+; SDAG6-NEXT: s_ashr_i32 s1, s1, 15
+; SDAG6-NEXT: s_xor_b32 s0, s0, s1
+; SDAG6-NEXT: s_sub_i32 s0, s1, s0
; SDAG6-NEXT: ; return to shader part epilog
;
; SDAG8-LABEL: abs_sgpr_i16_neg:
@@ -745,10 +751,8 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
;
; SDAG8-LABEL: abs_sgpr_v2i8:
; SDAG8: ; %bb.0:
-; SDAG8-NEXT: s_bfe_i32 s1, s1, 0x80000
-; SDAG8-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG8-NEXT: s_sext_i32_i16 s1, s1
-; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_sext_i32_i8 s1, s1
+; SDAG8-NEXT: s_sext_i32_i8 s0, s0
; SDAG8-NEXT: s_abs_i32 s1, s1
; SDAG8-NEXT: s_abs_i32 s0, s0
; SDAG8-NEXT: s_lshl_b32 s2, s1, 8
@@ -757,10 +761,8 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
;
; SDAG10-LABEL: abs_sgpr_v2i8:
; SDAG10: ; %bb.0:
-; SDAG10-NEXT: s_bfe_i32 s1, s1, 0x80000
-; SDAG10-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG10-NEXT: s_sext_i32_i16 s1, s1
-; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_sext_i32_i8 s1, s1
+; SDAG10-NEXT: s_sext_i32_i8 s0, s0
; SDAG10-NEXT: s_abs_i32 s1, s1
; SDAG10-NEXT: s_abs_i32 s0, s0
; SDAG10-NEXT: s_lshl_b32 s2, s1, 8
@@ -769,10 +771,8 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
;
; SDAG1250-LABEL: abs_sgpr_v2i8:
; SDAG1250: ; %bb.0:
-; SDAG1250-NEXT: s_bfe_i32 s1, s1, 0x80000
-; SDAG1250-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG1250-NEXT: s_sext_i32_i16 s1, s1
-; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_sext_i32_i8 s1, s1
+; SDAG1250-NEXT: s_sext_i32_i8 s0, s0
; SDAG1250-NEXT: s_abs_i32 s1, s1
; SDAG1250-NEXT: s_abs_i32 s0, s0
; SDAG1250-NEXT: s_lshl_b32 s2, s1, 8
@@ -917,14 +917,11 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
;
; SDAG8-LABEL: abs_sgpr_v3i8:
; SDAG8: ; %bb.0:
-; SDAG8-NEXT: s_bfe_i32 s1, s1, 0x80000
-; SDAG8-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG8-NEXT: s_sext_i32_i16 s1, s1
-; SDAG8-NEXT: s_sext_i32_i16 s0, s0
+; SDAG8-NEXT: s_sext_i32_i8 s1, s1
+; SDAG8-NEXT: s_sext_i32_i8 s0, s0
; SDAG8-NEXT: s_abs_i32 s1, s1
-; SDAG8-NEXT: s_bfe_i32 s2, s2, 0x80000
; SDAG8-NEXT: s_abs_i32 s0, s0
-; SDAG8-NEXT: s_sext_i32_i16 s2, s2
+; SDAG8-NEXT: s_sext_i32_i8 s2, s2
; SDAG8-NEXT: s_lshl_b32 s1, s1, 8
; SDAG8-NEXT: s_abs_i32 s2, s2
; SDAG8-NEXT: s_or_b32 s0, s0, s1
@@ -936,40 +933,34 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
;
; SDAG10-LABEL: abs_sgpr_v3i8:
; SDAG10: ; %bb.0:
-; SDAG10-NEXT: s_bfe_i32 s1, s1, 0x80000
-; SDAG10-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG10-NEXT: s_sext_i32_i16 s1, s1
-; SDAG10-NEXT: s_bfe_i32 s2, s2, 0x80000
-; SDAG10-NEXT: s_sext_i32_i16 s0, s0
+; SDAG10-NEXT: s_sext_i32_i8 s1, s1
+; SDAG10-NEXT: s_sext_i32_i8 s0, s0
; SDAG10-NEXT: s_abs_i32 s1, s1
; SDAG10-NEXT: s_abs_i32 s0, s0
+; SDAG10-NEXT: s_sext_i32_i8 s2, s2
; SDAG10-NEXT: s_lshl_b32 s1, s1, 8
-; SDAG10-NEXT: s_sext_i32_i16 s2, s2
-; SDAG10-NEXT: s_or_b32 s0, s0, s1
; SDAG10-NEXT: s_abs_i32 s2, s2
-; SDAG10-NEXT: s_and_b32 s1, s0, 0xffff
-; SDAG10-NEXT: s_lshl_b32 s3, s2, 16
-; SDAG10-NEXT: s_or_b32 s1, s1, s3
+; SDAG10-NEXT: s_or_b32 s0, s0, s1
+; SDAG10-NEXT: s_lshl_b32 s1, s2, 16
+; SDAG10-NEXT: s_and_b32 s3, s0, 0xffff
+; SDAG10-NEXT: s_or_b32 s1, s3, s1
; SDAG10-NEXT: s_lshr_b32 s1, s1, 8
; SDAG10-NEXT: ; return to shader part epilog
;
; SDAG1250-LABEL: abs_sgpr_v3i8:
; SDAG1250: ; %bb.0:
-; SDAG1250-NEXT: s_bfe_i32 s1, s1, 0x80000
-; SDAG1250-NEXT: s_bfe_i32 s0, s0, 0x80000
-; SDAG1250-NEXT: s_sext_i32_i16 s1, s1
-; SDAG1250-NEXT: s_bfe_i32 s2, s2, 0x80000
-; SDAG1250-NEXT: s_sext_i32_i16 s0, s0
+; SDAG1250-NEXT: s_sext_i32_i8 s1, s1
+; SDAG1250-NEXT: s_sext_i32_i8 s0, s0
; SDAG1250-NEXT: s_abs_i32 s1, s1
; SDAG1250-NEXT: s_abs_i32 s0, s0
+; SDAG1250-NEXT: s_sext_i32_i8 s2, s2
; SDAG1250-NEXT: s_lshl_b32 s1, s1, 8
-; SDAG1250-NEXT: s_sext_i32_i16 s2, s2
-; SDAG1250-NEXT: s_or_b32 s0, s0, s1
; SDAG1250-NEXT: s_abs_i32 s2, s2
-; SDAG1250-NEXT: s_and_b32 s1, s0, 0xffff
-; SDAG1250-NEXT: s_lshl_b32 s3, s2, 16
+; SDAG1250-NEXT: s_or_b32 s0, s0, s1
+; SDAG1250-NEXT: s_lshl_b32 s1, s2, 16
+; SDAG1250-NEXT: s_and_b32 s3, s0, 0xffff
; SDAG1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; SDAG1250-NEXT: s_or_b32 s1, s1, s3
+; SDAG1250-NEXT: s_or_b32 s1, s3, s1
; SDAG1250-NEXT: s_lshr_b32 s1, s1, 8
; SDAG1250-NEXT: ; return to shader part epilog
;
>From 97c9dddc96f3576fed0762344ce84b2c48e16671 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <patrick.simmons at amd.com>
Date: Fri, 7 Nov 2025 20:01:15 -0500
Subject: [PATCH 19/19] Add a nonfunctional target DAG combine hook
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c56ce443f963c..95910b68f9c7d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16892,7 +16892,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
switch (N->getOpcode()) {
- case ISD::ADD:
+ case ISD::ABS:
+ if (N->getValueType(0) == MVT::i16 || N->getValueType(0) == MVT::i8)
+ return lowerABSi16(SDValue(N,0), DCI.DAG);
+ break;
+ case ISD::ADD:
return performAddCombine(N, DCI);
case ISD::PTRADD:
return performPtrAddCombine(N, DCI);
More information about the llvm-commits
mailing list