[llvm] [AMDGPU] Allow sinking of free vector ops (PR #162580)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 8 18:51:14 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Gheorghe-Teodor Bercea (doru1004)
Changes:
Sinking free shufflevector / extractelement / insertelement instructions into their user blocks gives SelectionDAG direct visibility of the values, enabling combines that would otherwise be blocked by the CopyToReg/CopyFromReg pairs emitted at block boundaries. The Sink IR pass refuses to sink into loops, so this PR instead extends the CodeGenPrepare target hook isProfitableToSinkOperands, as sketched in the IR example below.
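For a concrete sense of the shape this targets, here is a minimal, hypothetical IR sketch (the function and its names are illustrative, not taken from the patch or its tests):

```llvm
; Hypothetical example: %x is a free extract (32-bit element 0) defined in
; the entry block but only used inside the loop. The Sink pass will not move
; it into the loop; with this change, CodeGenPrepare can sink it next to the
; fadd, so ISel sees the extract and its user in the same block instead of
; going through CopyToReg/CopyFromReg.
define float @sink_free_extract(<2 x float> %v, i32 %n) {
entry:
  %x = extractelement <2 x float> %v, i64 0
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi float [ 0.0, %entry ], [ %acc.next, %loop ]
  %acc.next = fadd float %acc, %x
  %i.next = add nsw i32 %i, 1
  %cont = icmp slt i32 %i.next, %n
  br i1 %cont, label %loop, label %exit

exit:
  ret float %acc.next
}
```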
---
Patch is 119.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162580.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+84)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll (+60-60)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+423-436)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+382-434)
- (modified) llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll (-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..def8ba47c9c37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1301,6 +1301,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
Ops.push_back(&Op);
+
+  // Zero-cost vector instructions (e.g. extractelement 0 of an i32 vector)
+  // will be optimized away, and sinking them can enable SDAG combines.
+  const DataLayout &DL = I->getModule()->getDataLayout();
+ auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
+ unsigned VecIndex) {
+ unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
+ return EltSize >= 32 ||
+ (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
+ };
+
+ uint64_t VecIndex;
+ Value *Vec;
+ if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
+      Instruction *VecOpInst = dyn_cast<Instruction>(Vec);
+      // If a zero-cost extractelement instruction is the only use of the
+      // vector, it may instead be combined with the vector's def.
+ if (VecOpInst && VecOpInst->hasOneUse())
+ continue;
+
+ if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+ Ops.push_back(&Op);
+
+ continue;
+ }
+
+ if (match(Op.get(),
+ m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
+ if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+ Ops.push_back(&Op);
+
+ continue;
+ }
+
+ if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+ if (Shuffle->isIdentity()) {
+ Ops.push_back(&Op);
+ continue;
+ }
+
+      unsigned EltSize = DL.getTypeSizeInBits(
+          cast<VectorType>(Shuffle->getType())->getElementType());
+
+ // For i32 (or greater) shufflevectors, these will be lowered into a
+ // series of insert / extract elements, which will be coalesced away.
+ if (EltSize >= 32) {
+ Ops.push_back(&Op);
+ continue;
+ }
+
+ if (EltSize < 16 || !ST->has16BitInsts())
+ continue;
+
+ int NumSubElts, SubIndex;
+ if (Shuffle->changesLength()) {
+ if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+ Ops.push_back(&Op);
+ continue;
+ }
+
+ if (Shuffle->isExtractSubvectorMask(SubIndex) ||
+ Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+ if (!(SubIndex % 2)) {
+ Ops.push_back(&Op);
+ continue;
+ }
+ }
+ }
+
+ if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+ Shuffle->isSingleSource()) {
+ Ops.push_back(&Op);
+ continue;
+ }
+
+ if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+ if (!(SubIndex % 2)) {
+ Ops.push_back(&Op);
+ continue;
+ }
+ }
+ }
}
return !Ops.empty();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 302b2395642d0..74b31913abb7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -2149,11 +2149,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccz .LBB11_2
; CI-NEXT: ; %bb.1: ; %frem.else
; CI-NEXT: s_and_b32 s6, s2, 0x80000000
-; CI-NEXT: v_mov_b32_e32 v1, s4
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; CI-NEXT: v_mov_b32_e32 v1, s6
-; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v0, s4
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT: v_mov_b32_e32 v0, s6
+; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: .LBB11_2: ; %Flow53
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2224,11 +2224,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccz .LBB11_10
; CI-NEXT: ; %bb.9: ; %frem.else16
; CI-NEXT: s_and_b32 s6, s3, 0x80000000
-; CI-NEXT: v_mov_b32_e32 v2, s5
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; CI-NEXT: v_mov_b32_e32 v2, s6
-; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: v_mov_b32_e32 v1, s5
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s6
+; CI-NEXT: v_mov_b32_e32 v2, s3
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: .LBB11_10: ; %Flow49
; CI-NEXT: s_xor_b32 s6, s6, 1
@@ -2322,11 +2322,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccz .LBB11_2
; VI-NEXT: ; %bb.1: ; %frem.else
; VI-NEXT: s_and_b32 s6, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; VI-NEXT: s_mov_b32 s6, 0
; VI-NEXT: .LBB11_2: ; %Flow53
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2397,11 +2397,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccz .LBB11_10
; VI-NEXT: ; %bb.9: ; %frem.else16
; VI-NEXT: s_and_b32 s6, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s3|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: s_mov_b32 s6, 0
; VI-NEXT: .LBB11_10: ; %Flow49
; VI-NEXT: s_xor_b32 s6, s6, 1
@@ -2503,11 +2503,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccz .LBB12_2
; CI-NEXT: ; %bb.1: ; %frem.else
; CI-NEXT: s_and_b32 s2, s4, 0x80000000
-; CI-NEXT: v_mov_b32_e32 v1, s8
-; CI-NEXT: v_mov_b32_e32 v0, s4
-; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
-; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v0, s8
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s4
+; CI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: .LBB12_2: ; %Flow127
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2578,11 +2578,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccz .LBB12_10
; CI-NEXT: ; %bb.9: ; %frem.else16
; CI-NEXT: s_and_b32 s2, s5, 0x80000000
-; CI-NEXT: v_mov_b32_e32 v2, s9
-; CI-NEXT: v_mov_b32_e32 v1, s5
-; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT: v_mov_b32_e32 v1, s9
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
+; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: v_mov_b32_e32 v2, s5
+; CI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: .LBB12_10: ; %Flow123
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2653,11 +2653,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccz .LBB12_18
; CI-NEXT: ; %bb.17: ; %frem.else47
; CI-NEXT: s_and_b32 s2, s6, 0x80000000
-; CI-NEXT: v_mov_b32_e32 v3, s10
-; CI-NEXT: v_mov_b32_e32 v2, s6
-; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
-; CI-NEXT: v_mov_b32_e32 v3, s2
-; CI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-NEXT: v_mov_b32_e32 v2, s10
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s6
+; CI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: .LBB12_18: ; %Flow119
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2728,11 +2728,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccz .LBB12_26
; CI-NEXT: ; %bb.25: ; %frem.else78
; CI-NEXT: s_and_b32 s2, s7, 0x80000000
-; CI-NEXT: v_mov_b32_e32 v4, s11
-; CI-NEXT: v_mov_b32_e32 v3, s7
-; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
-; CI-NEXT: v_mov_b32_e32 v4, s2
-; CI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s11
+; CI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: v_mov_b32_e32 v4, s7
+; CI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: .LBB12_26: ; %Flow115
; CI-NEXT: s_xor_b32 s2, s2, 1
@@ -2834,11 +2834,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccz .LBB12_2
; VI-NEXT: ; %bb.1: ; %frem.else
; VI-NEXT: s_and_b32 s2, s4, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s8
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v1|
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s4|, |v0|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; VI-NEXT: s_mov_b32 s2, 0
; VI-NEXT: .LBB12_2: ; %Flow127
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2909,11 +2909,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccz .LBB12_10
; VI-NEXT: ; %bb.9: ; %frem.else16
; VI-NEXT: s_and_b32 s2, s5, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v2, s9
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v2|
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s5|, |v1|
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: s_mov_b32 s2, 0
; VI-NEXT: .LBB12_10: ; %Flow123
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -2984,11 +2984,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccz .LBB12_18
; VI-NEXT: ; %bb.17: ; %frem.else47
; VI-NEXT: s_and_b32 s2, s6, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v3, s10
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v3|
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s6|, |v2|
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s6
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; VI-NEXT: s_mov_b32 s2, 0
; VI-NEXT: .LBB12_18: ; %Flow119
; VI-NEXT: s_xor_b32 s2, s2, 1
@@ -3059,11 +3059,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccz .LBB12_26
; VI-NEXT: ; %bb.25: ; %frem.else78
; VI-NEXT: s_and_b32 s2, s7, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v4, s11
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v4|
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_cmp_eq_f32_e64 vcc, |s7|, |v3|
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; VI-NEXT: s_mov_b32 s2, 0
; VI-NEXT: .LBB12_26: ; %Flow115
; VI-NEXT: s_xor_b32 s2, s2, 1
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 78a961ea0da17..d75d2597685d6 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -5783,11 +5783,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v5, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v5
; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_3
; GFX11-TRUE16-NEXT: s_branch .LBB9_8
@@ -6221,12 +6221,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2
; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5
; GFX1150-TRUE16-NEXT: s_cselect_b32 s7, -1, 0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s7
; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB9_3
; GFX1150-TRUE16-NEXT: s_branch .LBB9_8
@@ -6691,12 +6691,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2
; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5
; GFX1200-TRUE16-NEXT: s_cselect_b32 s7, -1, 0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s7
; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB9_3
; GFX1200-TRUE16-NEXT: s_branch .LBB9_8
@@ -8964,11 +8964,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v7, v4
+; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v7
; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_3
; GFX11-TRUE16-NEXT: s_branch .LBB10_8
@@ -9805,12 +9805,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2
; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6
; GFX1150-TRUE16-NEXT: s_cselect_b32 s9, -1, 0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s9
; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_3
; GFX1150-TRUE16-NEXT: s_branch .LBB10_8
@@ -10713,12 +10713,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2
; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6
; GFX1200-TRUE16-NEXT: s_cselect_b32 s9, -1, 0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s9
; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_3
; GFX1200-TRUE16-NEXT: s_branch .LBB10_8
@@ -12714,18 +12714,18 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1150-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v2, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_readfirstlane_b32 s5, v1
-; GFX1150-NEXT: global_load_b64 v[1:2], v2, s[6:7] offset:32
; GFX1150-NEXT: v_readfirstlane_b32 s6, v0
+; GFX1150-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1150-NEXT: global_load_b64 v[0:1], v2, s[8:9] offset:32
; GFX1150-NEXT: s_and_b32 s3, s6, 0x7fffffff
; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1150-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1150-NEXT: v_readfirstlane_b32 s4, v0
+; GFX1150-NEXT: v_readfirstlane_b32 s2, v1
; GFX1150-NEXT: s_and_b32 s8, s4, 0x7fffffff
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s3, s8
@@ -12933,232 +12933,221 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-LABEL: frem_v2f32:
; GFX1200: ; %bb.0:
; GFX1200-NEXT: s_clause 0x1
-; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1200-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX1200-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
; GFX1200-NEXT: s_wait_kmcnt 0x0
-; GFX1200-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX1200-NEXT: s_wait_loadcnt 0x0
-; GFX1200-NEXT: v_readfirstlane_b32 s5, v1
-; GFX1200-NEXT: global_load_b64 v[1:2], v2, s[6:7] offset:32
-; GFX1200-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1200-NEXT: s_and_b32 s3, s6, 0x7fffffff
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[2:3], v0, s[10:11]
+; GFX1200-NEXT: global_load_b64 v[0:1], v0, s[0:1] offset:32
+; GFX1200-NEXT: s_wait_loadcnt 0x1
+; GFX1200-NEXT: v_and_b32_e32 v4, 0x7fffffff, v2
; GFX1200-NEXT: s_wait_loadcnt 0x0
-; GFX1200-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1200-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1200-NEXT: s_and_b32 s8, s4, 0x7fffffff
-; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1200-NEXT: s_cmp_ngt_f32 s3, s8
-; GFX1200-NE...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/162580