[llvm] [TargetLowering][AMDGPU][ARM][RISCV][X86] Teach SimplifyDemandedBits to combine (srl (sra X, C1), ShAmt) -> sra(X, C1+ShAmt) (PR #101751)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 2 13:59:18 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Craig Topper (topperc)
Changes:
This combine applies when the upper bits of the srl result aren't demanded.

This helps with cases where the outer srl was originally an sra and was converted to an srl by SimplifyDemandedBits before it had a chance to combine with the inner sra. This can occur when the inner sra was part of a sign_extend_inreg expansion.
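As an illustration (a made-up sketch, not IR taken from this patch or its tests), the shape being targeted looks like:

```llvm
; Hypothetical example. %a is the inner sra (standing in for a
; sign_extend_inreg expansion) and %b is the outer srl. The `and` only
; demands the low 8 bits, so the top ShAmt = 4 bits of %b are dead and
; the pair can fold to a single ashr i32 %x, 12.
define i32 @srl_of_sra(i32 %x) {
  %a = ashr i32 %x, 8   ; inner sra, C1 = 8
  %b = lshr i32 %a, 4   ; outer srl, ShAmt = 4
  %c = and i32 %b, 255  ; high 4 bits of %b are not demanded
  ret i32 %c
}
```

The combined shift reads the same low bits of %x; the two forms differ only in the undemanded high bits.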
There are some regressions in AMDGPU, ARM, and Thumb that need investigating.
---
Patch is 60.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101751.diff
23 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+16)
- (modified) llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll (+24-16)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+34-34)
- (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/itofp.i128.ll (+3-5)
- (modified) llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+11-13)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/shift-i128.ll (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll (+2-2)
- (modified) llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll (+14-12)
- (modified) llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll (+2-4)
- (modified) llvm/test/CodeGen/NVPTX/idioms.ll (+2-2)
- (modified) llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll (+4-4)
- (modified) llvm/test/CodeGen/RISCV/div.ll (+16-24)
- (modified) llvm/test/CodeGen/RISCV/rv64-legal-i32/div.ll (+8-12)
- (modified) llvm/test/CodeGen/RISCV/rv64zba.ll (+8-10)
- (modified) llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll (+12-18)
- (modified) llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll (+3-2)
- (modified) llvm/test/CodeGen/X86/scmp.ll (+84-97)
- (modified) llvm/test/CodeGen/X86/sdiv_fix_sat.ll (+104-92)
- (modified) llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll (+2-4)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 8ab3103fda23f..ea1102a14cc59 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1958,6 +1958,22 @@ bool TargetLowering::SimplifyDemandedBits(
}
}
+ // If this is (shr (sra X, C1), ShAmt), see if we can combine this into a
+ // single sra. We can do this if the top bits are never demanded.
+ if (Op0.getOpcode() == ISD::SRA) {
+ if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) {
+ if (std::optional<uint64_t> InnerSA =
+ TLO.DAG.getValidShiftAmount(Op0, DemandedElts, Depth + 2)) {
+ unsigned C1 = *InnerSA;
+ // Clamp the combined shift amount if it exceeds the bit width.
+ unsigned Combined = std::min(C1 + ShAmt, BitWidth - 1);
+ SDValue NewSA = TLO.DAG.getConstant(Combined, dl, ShiftVT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRA, dl, VT,
+ Op0.getOperand(0), NewSA));
+ }
+ }
+ }
+
APInt InDemandedMask = (DemandedBits << ShAmt);
// If the shift is exact, then it does demand the low bits (and knows that
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
index 595991e86a91c..9fbce05eee177 100644
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -41,8 +41,8 @@ define i1 @test_srem_even(i4 %X) nounwind {
define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; CHECK-LABEL: test_srem_pow2_setne:
; CHECK: // %bb.0:
-; CHECK-NEXT: sbfx w8, w0, #0, #6
-; CHECK-NEXT: ubfx w8, w8, #9, #2
+; CHECK-NEXT: sbfx w8, w0, #5, #1
+; CHECK-NEXT: and w8, w8, #0x3
; CHECK-NEXT: add w8, w0, w8
; CHECK-NEXT: and w8, w8, #0x3c
; CHECK-NEXT: sub w8, w0, w8
diff --git a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
index 113c6d01c99a1..dd888433e32e8 100644
--- a/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll
@@ -661,9 +661,10 @@ define noundef i64 @sdiv64_2(i64 noundef %i) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 31, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT: v_alignbit_b32 v0, v3, v2, 1
+; GFX9-NEXT: v_ashrrev_i64 v[1:2], 33, v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: sdiv64_2:
@@ -671,17 +672,20 @@ define noundef i64 @sdiv64_2(i64 noundef %i) {
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_lshrrev_b32_e32 v2, 31, v1
; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX942-NEXT: v_alignbit_b32 v0, v3, v2, 1
+; GFX942-NEXT: v_ashrrev_i64 v[2:3], 33, v[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX1030-LABEL: sdiv64_2:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 31, v1
-; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_alignbit_b32 v0, v3, v2, 1
+; GFX1030-NEXT: v_ashrrev_i64 v[1:2], 33, v[2:3]
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%div = sdiv i64 %i, 2
@@ -788,9 +792,10 @@ define noundef i64 @sdiv64_64(i64 noundef %i) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 26, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: v_ashrrev_i64 v[0:1], 6, v[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT: v_alignbit_b32 v0, v3, v2, 6
+; GFX9-NEXT: v_ashrrev_i64 v[1:2], 38, v[2:3]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: sdiv64_64:
@@ -799,8 +804,10 @@ define noundef i64 @sdiv64_64(i64 noundef %i) {
; GFX942-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX942-NEXT: v_lshrrev_b32_e32 v2, 26, v2
; GFX942-NEXT: v_mov_b32_e32 v3, 0
-; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942-NEXT: v_ashrrev_i64 v[0:1], 6, v[0:1]
+; GFX942-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX942-NEXT: v_alignbit_b32 v0, v3, v2, 6
+; GFX942-NEXT: v_ashrrev_i64 v[2:3], 38, v[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX1030-LABEL: sdiv64_64:
@@ -808,9 +815,10 @@ define noundef i64 @sdiv64_64(i64 noundef %i) {
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 26, v2
-; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT: v_ashrrev_i64 v[0:1], 6, v[0:1]
+; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX1030-NEXT: v_alignbit_b32 v0, v3, v2, 6
+; GFX1030-NEXT: v_ashrrev_i64 v[1:2], 38, v[2:3]
; GFX1030-NEXT: s_setpc_b64 s[30:31]
entry:
%div = sdiv i64 %i, 64
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index fea1303d0a2b7..071aae98c9685 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -4448,13 +4448,14 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-NEXT: v_mov_b32_e32 v5, v4
; GFX9-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5]
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_ashrrev_i64 v[2:3], 33, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v5, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[4:5]
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v6
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_ashrrev_i64 v[2:3], 33, v[4:5]
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-O0-LABEL: v_sdiv_i128_v_pow2k:
@@ -4481,41 +4482,40 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
; GFX9-O0-NEXT: s_mov_b32 s4, 63
; GFX9-O0-NEXT: v_ashrrev_i64 v[4:5], s4, v[4:5]
-; GFX9-O0-NEXT: s_mov_b32 s5, 31
-; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s5, v[4:5]
+; GFX9-O0-NEXT: s_mov_b32 s6, 31
+; GFX9-O0-NEXT: v_lshrrev_b64 v[6:7], s6, v[4:5]
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v7
; GFX9-O0-NEXT: s_mov_b64 s[8:9], 0
-; GFX9-O0-NEXT: s_mov_b32 s6, s8
-; GFX9-O0-NEXT: s_mov_b32 s4, s9
+; GFX9-O0-NEXT: s_mov_b32 s7, s8
+; GFX9-O0-NEXT: s_mov_b32 s5, s9
; GFX9-O0-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v4, vcc
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v4, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-O0-NEXT: v_addc_co_u32_e32 v3, vcc, v2, v3, vcc
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
; GFX9-O0-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
-; GFX9-O0-NEXT: ; implicit-def: $sgpr4
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_mov_b32 s4, 33
-; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-O0-NEXT: s_mov_b32 s5, 33
+; GFX9-O0-NEXT: v_lshrrev_b64 v[0:1], s5, v[0:1]
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
-; GFX9-O0-NEXT: v_lshl_or_b32 v0, v2, s5, v0
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v6
+; GFX9-O0-NEXT: v_lshl_or_b32 v0, v2, s6, v0
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT: v_ashrrev_i64 v[5:6], s5, v[5:6]
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: s_mov_b32 s5, 1
+; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v2, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5
; GFX9-O0-NEXT: v_ashrrev_i64 v[3:4], s4, v[3:4]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-O0-NEXT: s_mov_b32 s4, 1
-; GFX9-O0-NEXT: v_alignbit_b32 v1, v1, v2, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3
-; GFX9-O0-NEXT: s_mov_b32 s4, 32
-; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4]
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 401cbce00ac9a..925987df8c8de 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -334,23 +334,23 @@ define signext i63 @i63_signext_func_void(i63 %val) #0 {
; CI-LABEL: i63_signext_func_void:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
-; CI-NEXT: v_ashr_i64 v[0:1], v[0:1], 1
+; CI-NEXT: v_lshl_b64 v[1:2], v[0:1], 1
+; CI-NEXT: v_ashr_i64 v[1:2], v[1:2], 33
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: i63_signext_func_void:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX89-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX89-NEXT: v_lshlrev_b64 v[1:2], 1, v[0:1]
+; GFX89-NEXT: v_ashrrev_i64 v[1:2], 33, v[1:2]
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: i63_signext_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX11-NEXT: v_lshlrev_b64 v[1:2], 1, v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX11-NEXT: v_ashrrev_i64 v[1:2], 33, v[1:2]
; GFX11-NEXT: s_setpc_b64 s[30:31]
ret i63 %val
}
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index c5198cdb421a5..1fe8c0aabb6b6 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -631,17 +631,15 @@ define double @sitofp_i128_to_f64(i128 %x) {
; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4
; SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 1, v0
; SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; SDAG-NEXT: v_lshrrev_b64 v[0:1], 2, v[4:5]
-; SDAG-NEXT: v_lshlrev_b32_e32 v7, 30, v6
-; SDAG-NEXT: v_or_b32_e32 v10, v1, v7
+; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; SDAG-NEXT: v_and_b32_e32 v1, 0x800000, v5
; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SDAG-NEXT: v_alignbit_b32 v10, v6, v5, 2
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20
; SDAG-NEXT: v_lshrrev_b64 v[0:1], 3, v[4:5]
-; SDAG-NEXT: v_lshlrev_b32_e32 v2, 29, v6
-; SDAG-NEXT: v_or_b32_e32 v10, v1, v2
+; SDAG-NEXT: v_alignbit_b32 v10, v6, v5, 3
; SDAG-NEXT: v_mov_b32_e32 v2, v8
; SDAG-NEXT: ; %bb.12: ; %Flow
; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
index 5fc1a87e71a1a..49b569bf2154e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-range-metadata-sign-bits.ll
@@ -110,10 +110,11 @@ define i64 @range_metadata_sext_i8_signed_range_i64(ptr addrspace(1) %ptr) {
; SDAG-LABEL: range_metadata_sext_i8_signed_range_i64:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
+; SDAG-NEXT: global_load_dwordx2 v[2:3], v[0:1], off glc
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_lshlrev_b32_e32 v1, 23, v0
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 23, v2
; SDAG-NEXT: v_ashrrev_i64 v[0:1], 55, v[0:1]
+; SDAG-NEXT: v_bfe_i32 v1, v2, 8, 1
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: range_metadata_sext_i8_signed_range_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index bf98af33dc7b0..050300a69c46b 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -1049,15 +1049,14 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8
-; GFX10-NEXT: v_ashrrev_i32_e32 v3, 24, v9
; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_ashrrev_i32_e32 v3, 25, v9
; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1
-; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3
+; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0
; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707
; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
-; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: global_store_dword v[5:6], v1, off
@@ -1075,23 +1074,22 @@ define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v9, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v0, 26
-; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: v_mov_b32_e32 v2, 7
+; GFX9-NEXT: v_mov_b32_e32 v1, 7
; GFX9-NEXT: s_mov_b32 s4, 0x4010707
+; GFX9-NEXT: v_mov_b32_e32 v0, 26
; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_perm_b32 v3, v4, v9, s4
+; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4
+; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 25, v4
; GFX9-NEXT: v_ashrrev_i16_e32 v9, 10, v9
-; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v2
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
+; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: global_store_dword v[5:6], v0, off
-; GFX9-NEXT: global_store_dword v[7:8], v3, off
+; GFX9-NEXT: global_store_dword v[7:8], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index f4776747f16ac..8f9417f875e8e 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1759,9 +1759,10 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 15
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GCN-NEXT: v_alignbit_b32 v0, v3, v2, 15
+; GCN-NEXT: v_ashr_i64 v[1:2], v[2:3], 47
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IR-LABEL: v_test_sdiv_pow2_k_den_i64:
@@ -2064,9 +2065,10 @@ define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40
; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v1
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 15
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GCN-NEXT: v_alignbit_b32 v0, v3, v2, 15
+; GCN-NEXT: v_ashr_i64 v[1:2], v[2:3], 47
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IR-LABEL: v_test_sdiv24_pow2_k_den_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index ebc916b5c889b..d603b41341402 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -113,8 +113,10 @@ define i128 @v_ashr_i128_vk(i128 %lhs) {
; GCN-NEXT: v_mov_b32_e32 v4, v1
; GCN-NEXT: v_lshl_b64 v[0:1], v[2:3], 31
; GCN-NEXT: v_lshrrev_b32_e32 v4, 1, v4
-; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], 33
; GCN-NEXT: v_or_b32_e32 v0, v4, v0
+; GCN-NEXT: v_ashr_i64 v[4:5], v[2:3], 33
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GCN-NEXT: v_mov_b32_e32 v2, v4
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = ashr i128 %lhs, 33
ret i128 %shl
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
index 126b17e718b59..2efe27df2d10d 100644
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -43,8 +43,8 @@ define i1 @test_srem_pow2_setne(i6 %X) nounwind {
; CHECK-LABEL: test_srem_pow2_setne:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_bfe_i32 v1, v0, 0, 6
-; CHECK-NEXT: v_...
[truncated]
``````````
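A note on the clamp in the TargetLowering.cpp hunk (`std::min(C1 + ShAmt, BitWidth - 1)`): when the combined amount would be out of range, clamping to BitWidth - 1 is still sound, because an sra by BitWidth - 1 fills every result bit with the sign bit, and the bits above the clamp point aren't demanded anyway. A made-up sketch of that case:

```llvm
; Hypothetical example, not a test from this patch. C1 + ShAmt = 40
; exceeds the 32-bit width, so the fold would emit ashr i32 %x, 31.
; The demanded low 16 bits of %b are all copies of the sign bit, which
; is exactly what ashr i32 %x, 31 produces in those positions.
define i32 @srl_of_sra_clamped(i32 %x) {
  %a = ashr i32 %x, 24    ; inner sra, C1 = 24
  %b = lshr i32 %a, 16    ; outer srl, ShAmt = 16
  %c = and i32 %b, 65535  ; high 16 bits of %b are not demanded
  ret i32 %c
}
```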
https://github.com/llvm/llvm-project/pull/101751
More information about the llvm-commits mailing list