[llvm] [AMDGPU] Optimize rotate instruction selection patterns (PR #143551)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 10 08:23:55 PDT 2025
https://github.com/aleksandar-amd created https://github.com/llvm/llvm-project/pull/143551
This patch improves rotate instruction selection for AMDGPU by adding
optimized patterns for the rotate-right (rotr) operation. For uniform
rotates it now selects s_lshr + s_lshl + s_or (three SALU instructions,
plus an s_sub when the shift amount is not a constant) instead of the
previous v_alignbit + v_readfirstlane (two VALU instructions), keeping
uniform values on the scalar unit.
From 9b4ecca539cfa7b872e4abd7f4578168d04c6ba7 Mon Sep 17 00:00:00 2001
From: Aleksandar Spasojevic <aleksandar.spasojevic at amd.com>
Date: Tue, 10 Jun 2025 17:17:39 +0200
Subject: [PATCH] [AMDGPU] Optimize rotate instruction selection patterns
This patch improves rotate instruction selection for AMDGPU by adding
optimized patterns for the rotate-right (rotr) operation. For uniform
rotates it now selects s_lshr + s_lshl + s_or (three SALU instructions,
plus an s_sub when the shift amount is not a constant) instead of the
previous v_alignbit + v_readfirstlane (two VALU instructions), keeping
uniform values on the scalar unit.
---
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 20 ++
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 25 ++
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 7 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 13 +-
.../AMDGPU/GlobalISel/legalize-rotl-rotr.mir | 28 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 215 ++++++++++----
llvm/test/CodeGen/AMDGPU/rotr.ll | 275 +++++++++++++++---
12 files changed, 477 insertions(+), 123 deletions(-)
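Note on constant rotate amounts: SelectImmSub folds 32 - amount into an
immediate at selection time, so the s_sub disappears when the amount is a
literal. A hypothetical example (this exact case is not among the tests
below):

  %r = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 7)  ; rotr(x, 7)
  ; expected: s_lshr_b32 (by 7) + s_lshl_b32 (by 25) + s_or_b32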
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc555..061764ff4d269 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -95,6 +95,10 @@ def gi_vinterpmods_hi :
GIComplexOperandMatcher<s32, "selectVINTERPModsHi">,
GIComplexPatternEquiv<VINTERPModsHi>;
+def gi_immsub :
+ GIComplexOperandMatcher<s32, "selectImmSub">,
+ GIComplexPatternEquiv<ImmSub>;
+
// FIXME: Why do we have both VOP3OpSel and VOP3OpSelMods?
def gi_vop3opsel :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e52c2d7fde436..22d3274d565f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3610,6 +3610,26 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectImmSub(SDValue In, SDValue &Src,
+ SDValue &InvSrc) const {
+ Src = In;
+
+  // Constant shift amount: fold 32 - amount into a target constant at
+  // selection time.
+  if (ConstantSDNode *ImmVal = dyn_cast<ConstantSDNode>(In)) {
+    InvSrc = CurDAG->getTargetConstant(32 - ImmVal->getZExtValue(), SDLoc(In),
+                                       MVT::i32);
+  } else {
+    // Non-constant shift amount: materialize 32 - amount with s_sub_u32.
+    SDNode *Sub = CurDAG->getMachineNode(
+        AMDGPU::S_SUB_U32, SDLoc(In), MVT::i32,
+        {CurDAG->getTargetConstant(32, SDLoc(In), MVT::i32), In});
+    InvSrc = SDValue(Sub, 0);
+  }
+
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f3b9364fdb92b..82586329a369c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -246,6 +246,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
+ bool SelectImmSub(SDValue In, SDValue &Src, SDValue &InvSrc) const;
+
SDValue getHi16Elt(SDValue In) const;
SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7e72f6ca478fd..b43e4c1093a16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5831,6 +5831,31 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectImmSub(MachineOperand &Root) const {
+
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+ Register SrcInv = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+  // Fold 32 - amount to an immediate for constant amounts; otherwise emit an
+  // s_sub. std::optional<uint64_t> Val holds the known constant, if any.
+  std::optional<uint64_t> Val = getConstantZext32Val(Root.getReg(), *MRI);
+
+ if (!Val) {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_SUB_U32), SrcInv)
+ .addImm(32)
+ .add(Root);
+ } else {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SrcInv)
+ .addImm(32 - *Val);
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SrcInv); },
+ }};
+}
+
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 6c3f3026e877a..6371b861ae55c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -224,6 +224,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectImmSub(MachineOperand &Root) const;
+
bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
int64_t *Offset) const;
InstructionSelector::ComplexRendererFns
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e8dff85064383..6e74ea56b16ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2046,7 +2046,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S32, S64)
.lower();
- getActionDefinitionsBuilder({G_ROTR, G_ROTL})
+ getActionDefinitionsBuilder(G_ROTR)
+ .legalFor({S32})
+ .scalarize(0)
+ .lower();
+
+ getActionDefinitionsBuilder(G_ROTL)
.scalarize(0)
.lower();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..6242eefb15ad8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4073,6 +4073,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_ROTR:
+ case AMDGPU::G_ROTL: {
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 2c20475726a48..56d2d99079acb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1630,6 +1630,8 @@ def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
+def ImmSub : ComplexPattern<untyped, 2, "SelectImmSub">;
+
//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 360fd05cb3d96..e9801f7ce6823 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2388,8 +2388,19 @@ def : AMDGPUPat <
$src1), sub1)
>;
+// Uniform rotr: keep the rotate on the SALU as an lshr/lshl/or sequence.
+def : AMDGPUPat <
+ (UniformBinFrag<rotr> i32:$src0, (i32 (ImmSub i32:$src1, i32:$src1_inv))),
+ (S_OR_B32 (S_LSHR_B32 i32:$src0, i32:$src1), (S_LSHL_B32 i32:$src0, i32:$src1_inv))
+>;
+
let True16Predicate = NotHasTrue16BitInsts in {
-def : ROTRPattern <V_ALIGNBIT_B32_e64>;
+
+// Divergent rotr: still selects v_alignbit_b32.
+def : AMDGPUPat <
+ (DivergentBinFrag<rotr> i32:$src0, i32:$src1),
+ (V_ALIGNBIT_B32_e64 $src0, $src0, $src1)
+>;
def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
index 7fdee12315754..9610caa1f2012 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir
@@ -181,8 +181,8 @@ body: |
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY1]]
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[SUB]](s32)
- ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
+ ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[SUB]](s32)
+ ; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTL %0, %1(s32)
@@ -301,14 +301,14 @@ body: |
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
; GFX-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV4]]
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[SUB]](s32)
+ ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[SUB]](s32)
; GFX-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV5]]
- ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[SUB1]](s32)
+ ; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[SUB1]](s32)
; GFX-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV6]]
- ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[SUB2]](s32)
+ ; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[SUB2]](s32)
; GFX-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[C]], [[UV7]]
- ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[SUB3]](s32)
- ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
+ ; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[SUB3]](s32)
+ ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
@@ -391,8 +391,8 @@ body: |
; GFX-NEXT: {{ $}}
; GFX-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY]], [[COPY1]](s32)
- ; GFX-NEXT: $sgpr0 = COPY [[FSHR]](s32)
+ ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[COPY]], [[COPY1]](s32)
+ ; GFX-NEXT: $sgpr0 = COPY [[ROTR]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s32) = G_ROTR %0, %1(s32)
@@ -452,11 +452,11 @@ body: |
; GFX-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
; GFX-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
; GFX-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>)
- ; GFX-NEXT: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV]], [[UV4]](s32)
- ; GFX-NEXT: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV1]], [[UV5]](s32)
- ; GFX-NEXT: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV2]], [[UV2]], [[UV6]](s32)
- ; GFX-NEXT: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[UV3]], [[UV3]], [[UV7]](s32)
- ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[FSHR]](s32), [[FSHR1]](s32), [[FSHR2]](s32), [[FSHR3]](s32)
+ ; GFX-NEXT: [[ROTR:%[0-9]+]]:_(s32) = G_ROTR [[UV]], [[UV4]](s32)
+ ; GFX-NEXT: [[ROTR1:%[0-9]+]]:_(s32) = G_ROTR [[UV1]], [[UV5]](s32)
+ ; GFX-NEXT: [[ROTR2:%[0-9]+]]:_(s32) = G_ROTR [[UV2]], [[UV6]](s32)
+ ; GFX-NEXT: [[ROTR3:%[0-9]+]]:_(s32) = G_ROTR [[UV3]], [[UV7]](s32)
+ ; GFX-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ROTR]](s32), [[ROTR1]](s32), [[ROTR2]](s32), [[ROTR3]](s32)
; GFX-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:_(<4 x s32>) = COPY $sgpr4_sgpr5_sgpr6_sgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 0a746b0a3f572..bd245c45025db 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -26,11 +26,14 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
+; SI-NEXT: s_sub_u32 s4, 32, s3
+; SI-NEXT: s_lshr_b32 s3, s2, s3
+; SI-NEXT: s_lshl_b32 s2, s2, s4
+; SI-NEXT: s_or_b32 s2, s3, s2
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -39,10 +42,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sub_i32 s3, 32, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0
+; GFX8-NEXT: s_sub_u32 s4, 32, s3
+; GFX8-NEXT: s_lshr_b32 s3, s2, s3
+; GFX8-NEXT: s_lshl_b32 s2, s2, s4
+; GFX8-NEXT: s_or_b32 s2, s3, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -52,18 +58,26 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_i32 s3, 32, s3
-; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT: s_sub_u32 s4, 32, s3
+; GFX10-NEXT: s_lshr_b32 s3, s2, s3
+; GFX10-NEXT: s_lshl_b32 s2, s2, s4
+; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotl_i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_i32 s3, 32, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_sub_u32 s4, 32, s3
+; GFX11-NEXT: s_lshr_b32 s3, s2, s3
+; GFX11-NEXT: s_lshl_b32 s2, s2, s4
+; GFX11-NEXT: s_or_b32 s2, s3, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
entry:
@@ -97,14 +111,20 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_sub_i32 s2, 32, s2
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: s_sub_i32 s3, 32, s3
+; SI-NEXT: s_sub_u32 s6, 32, s2
+; SI-NEXT: s_sub_u32 s8, 32, s3
+; SI-NEXT: s_lshr_b32 s3, s1, s3
+; SI-NEXT: s_lshr_b32 s2, s0, s2
+; SI-NEXT: s_lshl_b32 s1, s1, s8
+; SI-NEXT: s_lshl_b32 s0, s0, s6
+; SI-NEXT: s_or_b32 s1, s3, s1
+; SI-NEXT: s_or_b32 s0, s2, s0
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -115,11 +135,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sub_i32 s2, 32, s2
; GFX8-NEXT: s_sub_i32 s3, 32, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
+; GFX8-NEXT: s_sub_u32 s6, 32, s2
+; GFX8-NEXT: s_sub_u32 s7, 32, s3
+; GFX8-NEXT: s_lshr_b32 s3, s1, s3
+; GFX8-NEXT: s_lshl_b32 s1, s1, s7
+; GFX8-NEXT: s_lshr_b32 s2, s0, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s6
+; GFX8-NEXT: s_or_b32 s1, s3, s1
+; GFX8-NEXT: s_or_b32 s0, s2, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
@@ -131,10 +157,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sub_i32 s3, 32, s3
; GFX10-NEXT: s_sub_i32 s2, 32, s2
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX10-NEXT: s_sub_i32 s3, 32, s3
+; GFX10-NEXT: s_sub_u32 s4, 32, s2
+; GFX10-NEXT: s_sub_u32 s5, 32, s3
+; GFX10-NEXT: s_lshr_b32 s3, s1, s3
+; GFX10-NEXT: s_lshr_b32 s2, s0, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s4
+; GFX10-NEXT: s_lshl_b32 s1, s1, s5
+; GFX10-NEXT: s_or_b32 s0, s2, s0
+; GFX10-NEXT: s_or_b32 s1, s3, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
@@ -143,12 +177,20 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s3, 32, s3
; GFX11-NEXT: s_sub_i32 s2, 32, s2
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX11-NEXT: s_sub_i32 s3, 32, s3
+; GFX11-NEXT: s_sub_u32 s6, 32, s2
+; GFX11-NEXT: s_sub_u32 s7, 32, s3
+; GFX11-NEXT: s_lshr_b32 s3, s1, s3
+; GFX11-NEXT: s_lshr_b32 s2, s0, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s6
+; GFX11-NEXT: s_lshl_b32 s1, s1, s7
+; GFX11-NEXT: s_or_b32 s0, s2, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
entry:
@@ -188,20 +230,32 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_sub_i32 s4, 32, s12
-; SI-NEXT: s_sub_i32 s5, 32, s13
+; SI-NEXT: s_sub_i32 s2, 32, s12
+; SI-NEXT: s_sub_i32 s4, 32, s13
+; SI-NEXT: s_sub_i32 s5, 32, s14
; SI-NEXT: s_sub_i32 s6, 32, s15
-; SI-NEXT: s_sub_i32 s7, 32, s14
-; SI-NEXT: v_mov_b32_e32 v0, s6
-; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
+; SI-NEXT: s_sub_u32 s7, 32, s2
+; SI-NEXT: s_sub_u32 s12, 32, s4
+; SI-NEXT: s_sub_u32 s13, 32, s5
+; SI-NEXT: s_sub_u32 s14, 32, s6
+; SI-NEXT: s_lshr_b32 s6, s11, s6
+; SI-NEXT: s_lshr_b32 s5, s10, s5
+; SI-NEXT: s_lshr_b32 s4, s9, s4
+; SI-NEXT: s_lshr_b32 s2, s8, s2
+; SI-NEXT: s_lshl_b32 s11, s11, s14
+; SI-NEXT: s_lshl_b32 s10, s10, s13
+; SI-NEXT: s_lshl_b32 s9, s9, s12
+; SI-NEXT: s_lshl_b32 s7, s8, s7
+; SI-NEXT: s_or_b32 s6, s6, s11
+; SI-NEXT: s_or_b32 s5, s5, s10
+; SI-NEXT: s_or_b32 s4, s4, s9
+; SI-NEXT: s_or_b32 s7, s2, s7
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s7
-; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
-; SI-NEXT: v_mov_b32_e32 v0, s5
-; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
-; SI-NEXT: v_mov_b32_e32 v0, s4
-; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s6
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -210,19 +264,31 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s5, 32, s15
-; GFX8-NEXT: s_sub_i32 s4, 32, s14
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: s_sub_i32 s3, 32, s13
-; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_sub_i32 s2, 32, s12
-; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_sub_i32 s3, 32, s13
+; GFX8-NEXT: s_sub_i32 s4, 32, s14
+; GFX8-NEXT: s_sub_i32 s12, 32, s15
+; GFX8-NEXT: s_sub_u32 s5, 32, s2
+; GFX8-NEXT: s_sub_u32 s6, 32, s3
+; GFX8-NEXT: s_sub_u32 s7, 32, s4
+; GFX8-NEXT: s_sub_u32 s13, 32, s12
+; GFX8-NEXT: s_lshr_b32 s12, s11, s12
+; GFX8-NEXT: s_lshl_b32 s11, s11, s13
+; GFX8-NEXT: s_lshr_b32 s4, s10, s4
+; GFX8-NEXT: s_lshl_b32 s7, s10, s7
+; GFX8-NEXT: s_lshr_b32 s3, s9, s3
+; GFX8-NEXT: s_lshl_b32 s6, s9, s6
+; GFX8-NEXT: s_lshr_b32 s2, s8, s2
+; GFX8-NEXT: s_lshl_b32 s5, s8, s5
+; GFX8-NEXT: s_or_b32 s11, s12, s11
+; GFX8-NEXT: s_or_b32 s4, s4, s7
+; GFX8-NEXT: s_or_b32 s3, s3, s6
+; GFX8-NEXT: s_or_b32 s2, s2, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s11
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -236,12 +302,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_i32 s2, 32, s12
; GFX10-NEXT: s_sub_i32 s3, 32, s13
-; GFX10-NEXT: s_sub_i32 s4, 32, s15
-; GFX10-NEXT: s_sub_i32 s5, 32, s14
-; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s4
-; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s5
-; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s2
+; GFX10-NEXT: s_sub_i32 s4, 32, s14
+; GFX10-NEXT: s_sub_i32 s5, 32, s15
+; GFX10-NEXT: s_sub_u32 s6, 32, s2
+; GFX10-NEXT: s_sub_u32 s7, 32, s3
+; GFX10-NEXT: s_sub_u32 s12, 32, s4
+; GFX10-NEXT: s_sub_u32 s13, 32, s5
+; GFX10-NEXT: s_lshr_b32 s5, s11, s5
+; GFX10-NEXT: s_lshr_b32 s4, s10, s4
+; GFX10-NEXT: s_lshr_b32 s3, s9, s3
+; GFX10-NEXT: s_lshr_b32 s2, s8, s2
+; GFX10-NEXT: s_lshl_b32 s11, s11, s13
+; GFX10-NEXT: s_lshl_b32 s10, s10, s12
+; GFX10-NEXT: s_lshl_b32 s7, s9, s7
+; GFX10-NEXT: s_lshl_b32 s6, s8, s6
+; GFX10-NEXT: s_or_b32 s5, s5, s11
+; GFX10-NEXT: s_or_b32 s4, s4, s10
+; GFX10-NEXT: s_or_b32 s2, s2, s6
+; GFX10-NEXT: s_or_b32 s3, s3, s7
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -250,16 +332,31 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_i32 s2, 32, s12
; GFX11-NEXT: s_sub_i32 s3, 32, s13
-; GFX11-NEXT: s_sub_i32 s4, 32, s15
-; GFX11-NEXT: s_sub_i32 s5, 32, s14
-; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s4
-; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s5
-; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s2
+; GFX11-NEXT: s_sub_i32 s4, 32, s14
+; GFX11-NEXT: s_sub_i32 s5, 32, s15
+; GFX11-NEXT: s_sub_u32 s6, 32, s2
+; GFX11-NEXT: s_sub_u32 s7, 32, s3
+; GFX11-NEXT: s_sub_u32 s12, 32, s4
+; GFX11-NEXT: s_sub_u32 s13, 32, s5
+; GFX11-NEXT: s_lshr_b32 s5, s11, s5
+; GFX11-NEXT: s_lshr_b32 s4, s10, s4
+; GFX11-NEXT: s_lshr_b32 s3, s9, s3
+; GFX11-NEXT: s_lshr_b32 s2, s8, s2
+; GFX11-NEXT: s_lshl_b32 s11, s11, s13
+; GFX11-NEXT: s_lshl_b32 s10, s10, s12
+; GFX11-NEXT: s_lshl_b32 s7, s9, s7
+; GFX11-NEXT: s_lshl_b32 s6, s8, s6
+; GFX11-NEXT: s_or_b32 s5, s5, s11
+; GFX11-NEXT: s_or_b32 s4, s4, s10
+; GFX11-NEXT: s_or_b32 s2, s2, s6
+; GFX11-NEXT: s_or_b32 s3, s3, s7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6e361d6e297e..75ff838dd7449 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -5,6 +5,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; R600-LABEL: rotr_i32:
@@ -22,12 +23,15 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sub_u32 s4, 32, s3
+; SI-NEXT: s_lshr_b32 s3, s2, s3
+; SI-NEXT: s_lshl_b32 s2, s2, s4
+; SI-NEXT: s_or_b32 s2, s3, s2
+; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -35,10 +39,13 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0
+; GFX8-NEXT: s_sub_u32 s4, 32, s3
+; GFX8-NEXT: s_lshr_b32 s3, s2, s3
+; GFX8-NEXT: s_lshl_b32 s2, s2, s4
+; GFX8-NEXT: s_or_b32 s2, s3, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -47,18 +54,39 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX10-NEXT: s_sub_u32 s4, 32, s3
+; GFX10-NEXT: s_lshr_b32 s3, s2, s3
+; GFX10-NEXT: s_lshl_b32 s2, s2, s4
+; GFX10-NEXT: s_or_b32 s2, s3, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: rotr_i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3
+; GFX11-NEXT: s_sub_u32 s4, 32, s3
+; GFX11-NEXT: s_lshr_b32 s3, s2, s3
+; GFX11-NEXT: s_lshl_b32 s2, s2, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: rotr_i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_sub_co_u32 s4, 32, s3
+; GFX12-NEXT: s_lshr_b32 s3, s2, s3
+; GFX12-NEXT: s_lshl_b32 s2, s2, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_or_b32 s2, s3, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_endpgm
entry:
%tmp0 = sub i32 32, %y
%tmp1 = shl i32 %x, %tmp0
@@ -86,12 +114,18 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
+; SI-NEXT: s_sub_u32 s6, 32, s2
+; SI-NEXT: s_sub_u32 s8, 32, s3
+; SI-NEXT: s_lshr_b32 s3, s1, s3
+; SI-NEXT: s_lshr_b32 s2, s0, s2
+; SI-NEXT: s_lshl_b32 s1, s1, s8
+; SI-NEXT: s_lshl_b32 s0, s0, s6
+; SI-NEXT: s_or_b32 s1, s3, s1
+; SI-NEXT: s_or_b32 s0, s2, s0
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -100,11 +134,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
-; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
+; GFX8-NEXT: s_sub_u32 s6, 32, s2
+; GFX8-NEXT: s_sub_u32 s7, 32, s3
+; GFX8-NEXT: s_lshr_b32 s3, s1, s3
+; GFX8-NEXT: s_lshr_b32 s2, s0, s2
+; GFX8-NEXT: s_lshl_b32 s1, s1, s7
+; GFX8-NEXT: s_lshl_b32 s0, s0, s6
+; GFX8-NEXT: s_or_b32 s1, s3, s1
+; GFX8-NEXT: s_or_b32 s0, s2, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
@@ -116,8 +156,16 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX10-NEXT: s_sub_u32 s4, 32, s2
+; GFX10-NEXT: s_sub_u32 s5, 32, s3
+; GFX10-NEXT: s_lshr_b32 s3, s1, s3
+; GFX10-NEXT: s_lshr_b32 s2, s0, s2
+; GFX10-NEXT: s_lshl_b32 s0, s0, s4
+; GFX10-NEXT: s_lshl_b32 s1, s1, s5
+; GFX10-NEXT: s_or_b32 s0, s2, s0
+; GFX10-NEXT: s_or_b32 s1, s3, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
@@ -126,12 +174,40 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
+; GFX11-NEXT: s_sub_u32 s6, 32, s2
+; GFX11-NEXT: s_sub_u32 s7, 32, s3
+; GFX11-NEXT: s_lshr_b32 s3, s1, s3
+; GFX11-NEXT: s_lshr_b32 s2, s0, s2
+; GFX11-NEXT: s_lshl_b32 s0, s0, s6
+; GFX11-NEXT: s_lshl_b32 s1, s1, s7
+; GFX11-NEXT: s_or_b32 s0, s2, s0
+; GFX11-NEXT: s_or_b32 s1, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: rotr_v2i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_sub_co_u32 s6, 32, s2
+; GFX12-NEXT: s_sub_co_u32 s7, 32, s3
+; GFX12-NEXT: s_lshr_b32 s3, s1, s3
+; GFX12-NEXT: s_lshr_b32 s2, s0, s2
+; GFX12-NEXT: s_lshl_b32 s0, s0, s6
+; GFX12-NEXT: s_lshl_b32 s1, s1, s7
+; GFX12-NEXT: s_or_b32 s0, s2, s0
+; GFX12-NEXT: s_or_b32 s1, s3, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX12-NEXT: s_endpgm
entry:
%tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
%tmp1 = shl <2 x i32> %x, %tmp0
@@ -161,16 +237,28 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s15
-; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
-; SI-NEXT: v_mov_b32_e32 v0, s14
-; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
-; SI-NEXT: v_mov_b32_e32 v0, s13
-; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
-; SI-NEXT: v_mov_b32_e32 v0, s12
-; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; SI-NEXT: s_sub_u32 s2, 32, s12
+; SI-NEXT: s_sub_u32 s4, 32, s13
+; SI-NEXT: s_sub_u32 s5, 32, s14
+; SI-NEXT: s_sub_u32 s6, 32, s15
+; SI-NEXT: s_lshr_b32 s7, s11, s15
+; SI-NEXT: s_lshr_b32 s14, s10, s14
+; SI-NEXT: s_lshr_b32 s13, s9, s13
+; SI-NEXT: s_lshr_b32 s12, s8, s12
+; SI-NEXT: s_lshl_b32 s6, s11, s6
+; SI-NEXT: s_lshl_b32 s5, s10, s5
+; SI-NEXT: s_lshl_b32 s4, s9, s4
+; SI-NEXT: s_lshl_b32 s2, s8, s2
+; SI-NEXT: s_or_b32 s6, s7, s6
+; SI-NEXT: s_or_b32 s5, s14, s5
+; SI-NEXT: s_or_b32 s4, s13, s4
+; SI-NEXT: s_or_b32 s7, s12, s2
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, s7
+; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_mov_b32_e32 v2, s5
+; SI-NEXT: v_mov_b32_e32 v3, s6
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -179,15 +267,27 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s15
-; GFX8-NEXT: v_mov_b32_e32 v1, s14
-; GFX8-NEXT: v_mov_b32_e32 v4, s13
-; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
-; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1
-; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4
-; GFX8-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NEXT: s_sub_u32 s5, 32, s15
+; GFX8-NEXT: s_sub_u32 s4, 32, s14
+; GFX8-NEXT: s_lshr_b32 s6, s11, s15
+; GFX8-NEXT: s_lshl_b32 s5, s11, s5
+; GFX8-NEXT: s_sub_u32 s3, 32, s13
+; GFX8-NEXT: s_or_b32 s5, s6, s5
+; GFX8-NEXT: s_lshr_b32 s6, s10, s14
+; GFX8-NEXT: s_lshl_b32 s4, s10, s4
+; GFX8-NEXT: s_sub_u32 s2, 32, s12
+; GFX8-NEXT: s_or_b32 s4, s6, s4
+; GFX8-NEXT: s_lshr_b32 s6, s9, s13
+; GFX8-NEXT: s_lshl_b32 s3, s9, s3
+; GFX8-NEXT: s_or_b32 s3, s6, s3
+; GFX8-NEXT: s_lshr_b32 s6, s8, s12
+; GFX8-NEXT: s_lshl_b32 s2, s8, s2
+; GFX8-NEXT: s_or_b32 s2, s6, s2
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -199,10 +299,26 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s15
-; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s14
-; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s13
-; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s12
+; GFX10-NEXT: s_sub_u32 s2, 32, s12
+; GFX10-NEXT: s_sub_u32 s3, 32, s13
+; GFX10-NEXT: s_sub_u32 s4, 32, s14
+; GFX10-NEXT: s_sub_u32 s5, 32, s15
+; GFX10-NEXT: s_lshr_b32 s6, s11, s15
+; GFX10-NEXT: s_lshr_b32 s7, s10, s14
+; GFX10-NEXT: s_lshr_b32 s13, s9, s13
+; GFX10-NEXT: s_lshr_b32 s12, s8, s12
+; GFX10-NEXT: s_lshl_b32 s5, s11, s5
+; GFX10-NEXT: s_lshl_b32 s4, s10, s4
+; GFX10-NEXT: s_lshl_b32 s3, s9, s3
+; GFX10-NEXT: s_lshl_b32 s2, s8, s2
+; GFX10-NEXT: s_or_b32 s5, s6, s5
+; GFX10-NEXT: s_or_b32 s4, s7, s4
+; GFX10-NEXT: s_or_b32 s2, s12, s2
+; GFX10-NEXT: s_or_b32 s3, s13, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v2, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -211,14 +327,58 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s15
-; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s14
-; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s13
-; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s12
+; GFX11-NEXT: s_sub_u32 s2, 32, s12
+; GFX11-NEXT: s_sub_u32 s3, 32, s13
+; GFX11-NEXT: s_sub_u32 s4, 32, s14
+; GFX11-NEXT: s_sub_u32 s5, 32, s15
+; GFX11-NEXT: s_lshr_b32 s6, s11, s15
+; GFX11-NEXT: s_lshr_b32 s7, s10, s14
+; GFX11-NEXT: s_lshr_b32 s13, s9, s13
+; GFX11-NEXT: s_lshr_b32 s12, s8, s12
+; GFX11-NEXT: s_lshl_b32 s5, s11, s5
+; GFX11-NEXT: s_lshl_b32 s4, s10, s4
+; GFX11-NEXT: s_lshl_b32 s3, s9, s3
+; GFX11-NEXT: s_lshl_b32 s2, s8, s2
+; GFX11-NEXT: s_or_b32 s5, s6, s5
+; GFX11-NEXT: s_or_b32 s4, s7, s4
+; GFX11-NEXT: s_or_b32 s2, s12, s2
+; GFX11-NEXT: s_or_b32 s3, s13, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: rotr_v4i32:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_sub_co_u32 s2, 32, s12
+; GFX12-NEXT: s_sub_co_u32 s3, 32, s13
+; GFX12-NEXT: s_sub_co_u32 s4, 32, s14
+; GFX12-NEXT: s_sub_co_u32 s5, 32, s15
+; GFX12-NEXT: s_lshr_b32 s6, s11, s15
+; GFX12-NEXT: s_lshr_b32 s7, s10, s14
+; GFX12-NEXT: s_lshr_b32 s13, s9, s13
+; GFX12-NEXT: s_lshr_b32 s12, s8, s12
+; GFX12-NEXT: s_lshl_b32 s5, s11, s5
+; GFX12-NEXT: s_lshl_b32 s4, s10, s4
+; GFX12-NEXT: s_lshl_b32 s3, s9, s3
+; GFX12-NEXT: s_lshl_b32 s2, s8, s2
+; GFX12-NEXT: s_or_b32 s5, s6, s5
+; GFX12-NEXT: s_or_b32 s4, s7, s4
+; GFX12-NEXT: s_or_b32 s2, s12, s2
+; GFX12-NEXT: s_or_b32 s3, s13, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_endpgm
entry:
%tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
%tmp1 = shl <4 x i32> %x, %tmp0
@@ -357,6 +517,25 @@ define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr add
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v2, v0
; GFX11-FAKE16-NEXT: global_store_b16 v[4:5], v0, off offset:8
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: test_rotr_i16:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_u16 v2, v[2:3], off offset:48
+; GFX12-NEXT: global_load_u16 v0, v[0:1], off offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x1
+; GFX12-NEXT: v_sub_nc_u16 v1, 0, v2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b16 v2, v2, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b16 v0, v1, v0
+; GFX12-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX12-NEXT: global_store_b16 v[4:5], v0, off offset:8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
entry:
%arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
%a = load i16, ptr addrspace(1) %arrayidx