[llvm] [GISel] Combine out-of-range shifts with value to 0 or -1 (PR #123510)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 19 09:53:58 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: lialan (lialan)
Changes:

This resolves #123212.
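
The fold reasons with `KnownBits` on the shifted value: once the shift amount is at least `bitwidth - significant bits`, every bit of information has been shifted out and the result is a known constant, namely 0 for `G_SHL` and `G_LSHR` (and for `G_ASHR` of a known-non-negative value), or -1 for `G_ASHR` of a known-negative value. Below is a minimal standalone sketch of that reasoning for the `G_ASHR` case; `foldAshr` is an illustrative helper (not part of this patch), and the example assumes LLVM's Support library is available.

```cpp
// Illustrative sketch only (not the patch itself): decide whether an
// arithmetic shift right by ShiftAmt folds to a constant, given the
// known bits of the shifted value. Build against LLVM's Support library.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cstdio>
#include <optional>

using namespace llvm;

static std::optional<int64_t> foldAshr(const KnownBits &Value,
                                       unsigned ShiftAmt) {
  unsigned BitWidth = Value.getBitWidth();
  if (Value.isNonNegative()) {
    // Everything of interest sits below the known leading zeros.
    if (ShiftAmt >= BitWidth - Value.countMinLeadingZeros())
      return 0;
  } else if (Value.isNegative()) {
    // Known sign bits shift in from the left; the result saturates to -1.
    if (ShiftAmt >= BitWidth - Value.countMinLeadingOnes())
      return -1;
  }
  return std::nullopt; // Sign unknown, or the shift amount is too small.
}

int main() {
  // i8 -2 is 0b11111110: seven known leading ones, one significant bit.
  KnownBits K = KnownBits::makeConstant(APInt(8, -2, /*isSigned=*/true));
  if (std::optional<int64_t> C = foldAshr(K, 1))
    std::printf("ashr -2, 1 folds to %lld\n", (long long)*C); // prints -1
  return 0;
}
```

With `i8 -2` (`0b11111110`, seven known sign bits), any arithmetic shift right by at least 1 folds to `i8 -1`; the `combine_ashr2` test below exercises exactly this case.
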
---
Patch is 72.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/123510.diff
13 Files Affected:
- (modified) llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h (+4-2)
- (modified) llvm/include/llvm/Target/GlobalISel/Combine.td (+12-3)
- (modified) llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (+48-2)
- (added) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir (+107)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir (+4-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir (+4-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll (+6-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll (+21-48)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+206-233)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll (+45-69)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+180-207)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll (+9-9)
``````````diff
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 94e36e412b0cf7..a51aa876e1deb0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -840,8 +840,10 @@ class CombinerHelper {
bool matchRedundantBinOpInEquality(MachineInstr &MI,
BuildFnTy &MatchInfo) const;
- /// Match shifts greater or equal to the bitwidth of the operation.
- bool matchShiftsTooBig(MachineInstr &MI) const;
+  /// Match shifts greater or equal to the bitwidth of the operation, or to
+  /// the number of significant bits of the shifted value.
+ bool matchShiftsTooBig(MachineInstr &MI,
+ std::optional<int64_t> &MatchInfo) const;
/// Match constant LHS ops that should be commuted.
bool matchCommuteConstantToRHS(MachineInstr &MI) const;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 8641eabbdd84c6..ae0856550d9356 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -306,11 +306,20 @@ def ptr_add_immed_chain : GICombineRule<
[{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]),
(apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>;
+def shift_result_matchdata : GIDefMatchData<"std::optional<int64_t>">;
def shifts_too_big : GICombineRule<
- (defs root:$root),
+ (defs root:$root, shift_result_matchdata:$matchinfo),
(match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root,
- [{ return Helper.matchShiftsTooBig(*${root}); }]),
- (apply [{ Helper.replaceInstWithUndef(*${root}); }])>;
+ [{ return Helper.matchShiftsTooBig(*${root}, ${matchinfo}); }]),
+ (apply [{
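+    // Fold to the matched constant (0 or -1) when one was determined;
+    // otherwise the shift amount is >= the bitwidth and the result is undef.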
+ if (${matchinfo}) {
+ Helper.replaceInstWithConstant(*${root}, *${matchinfo});
+ } else {
+ Helper.replaceInstWithUndef(*${root});
+ }
+ }])>;
// Fold shift (shift base x), y -> shift base, (x+y), if shifts are same
def shift_immed_matchdata : GIDefMatchData<"RegisterImmPair">;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 4e3aaf5da7198c..4821c21301946b 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6590,12 +6590,58 @@ bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI,
return CmpInst::isEquality(Pred) && Y.isValid();
}
-bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) const {
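+/// Compute the smallest shift amount that discards all significant bits of
+/// the shifted value (any shift by at least that much folds to a constant).
+/// Sets \p Result to the folded constant (0 or -1) when it can be decided.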
+static std::optional<unsigned>
+getMaxUsefulShift(KnownBits ValueKB, unsigned Opcode,
+ std::optional<int64_t> &Result) {
+  assert((Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR ||
+          Opcode == TargetOpcode::G_ASHR) && "Expect G_SHL, G_LSHR or G_ASHR.");
+  unsigned SignificantBits = 0;
+ switch (Opcode) {
+ case TargetOpcode::G_SHL:
+ SignificantBits = ValueKB.countMinTrailingZeros();
+ Result = 0;
+ break;
+ case TargetOpcode::G_LSHR:
+ Result = 0;
+ SignificantBits = ValueKB.countMinLeadingZeros();
+ break;
+ case TargetOpcode::G_ASHR:
+ if (ValueKB.isNonNegative()) {
+ SignificantBits = ValueKB.countMinLeadingZeros();
+ Result = 0;
+ } else if (ValueKB.isNegative()) {
+ SignificantBits = ValueKB.countMinLeadingOnes();
+ Result = -1;
+    } else {
+      // The sign is unknown, so the shift result cannot be determined.
+      return std::nullopt;
+    }
+ break;
+ default:
+ break;
+ }
+ return ValueKB.getBitWidth() - SignificantBits;
+}
+
+bool CombinerHelper::matchShiftsTooBig(
+ MachineInstr &MI, std::optional<int64_t> &MatchInfo) const {
+ Register ShiftVal = MI.getOperand(1).getReg();
Register ShiftReg = MI.getOperand(2).getReg();
LLT ResTy = MRI.getType(MI.getOperand(0).getReg());
auto IsShiftTooBig = [&](const Constant *C) {
auto *CI = dyn_cast<ConstantInt>(C);
- return CI && CI->uge(ResTy.getScalarSizeInBits());
+ if (!CI)
+ return false;
+ if (CI->uge(ResTy.getScalarSizeInBits())) {
+ MatchInfo = std::nullopt;
+ return true;
+ }
+ auto OptMaxUsefulShift = getMaxUsefulShift(KB->getKnownBits(ShiftVal),
+ MI.getOpcode(), MatchInfo);
+ return OptMaxUsefulShift && CI->uge(*OptMaxUsefulShift);
};
return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
new file mode 100644
index 00000000000000..d3a3827f35a18a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
@@ -0,0 +1,107 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: combine_ashr
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr31
+
+    ; CHECK-LABEL: name: combine_ashr
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 10
+ %11:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %13:_(s32) = G_ASHR %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_lshr
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr31
+
+    ; CHECK-LABEL: name: combine_lshr
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 10
+ %11:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x)
+ %13:_(s32) = G_LSHR %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_shl
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr31
+
+    ; CHECK-LABEL: name: combine_shl
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: G_STORE [[C]](s32), [[MV]](p0) :: (store (s32))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 16
+ %11:_(s32) = G_CONSTANT i32 4294901760
+ %13:_(s32) = G_SHL %11, %12(s32)
+ G_STORE %13(s32), %0(p0) :: (store (s32))
+ SI_RETURN
+
+...
+---
+name: combine_ashr2
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr31
+
+    ; CHECK-LABEL: name: combine_ashr2
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr31
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -1
+ ; CHECK-NEXT: G_STORE [[C]](s8), [[MV]](p0) :: (store (s8))
+ ; CHECK-NEXT: SI_RETURN
+ %9:_(s32) = COPY $vgpr0
+ %10:_(s32) = COPY $vgpr1
+ %0:_(p0) = G_MERGE_VALUES %9(s32), %10(s32)
+ %12:_(s32) = G_CONSTANT i32 1
+ %11:_(s8) = G_CONSTANT i8 -2
+ %13:_(s8) = G_ASHR %11, %12(s32)
+ G_STORE %13(s8), %0(p0) :: (store (s8))
+ SI_RETURN
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
index 6ae8895322d6f9..a8cd974b01ab4b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
@@ -374,23 +374,15 @@ body: |
; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX6: liveins: $vgpr0
; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX6-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
;
; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX9-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
%zero:_(s16) = G_CONSTANT i16 0
%zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero, %zero:_(s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
index 6ceb41199af6da..3780542cd87993 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
@@ -246,23 +246,15 @@ body: |
; GFX6-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX6: liveins: $vgpr0, $vgpr1
; GFX6-NEXT: {{ $}}
- ; GFX6-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX6-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX6-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX6-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX6-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX6-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX6-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX6-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
;
; GFX9-LABEL: name: do_not_shl_v2s32_zero_by_16_from_zext_v2s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: %zero:_(s16) = G_CONSTANT i16 0
- ; GFX9-NEXT: %zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero(s16), %zero(s16)
- ; GFX9-NEXT: %shiftamt:_(s16) = G_CONSTANT i16 16
- ; GFX9-NEXT: %shiftamtvector:_(<2 x s16>) = G_BUILD_VECTOR %shiftamt(s16), %shiftamt(s16)
- ; GFX9-NEXT: %extend:_(<2 x s32>) = G_ZEXT %zerovector(<2 x s16>)
- ; GFX9-NEXT: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvector(<2 x s16>)
+ ; GFX9-NEXT: %6:_(s32) = G_CONSTANT i32 0
+ ; GFX9-NEXT: %shl:_(<2 x s32>) = G_BUILD_VECTOR %6(s32), %6(s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(<2 x s32>)
%zero:_(s16) = G_CONSTANT i16 0
%zerovector:_(<2 x s16>) = G_BUILD_VECTOR %zero, %zero:_(s16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index b9cd330ee2b5f9..4ddbb0afd7fc58 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1434,13 +1434,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_ffbh_i32_e32 v2, 0
+; SI-NEXT: v_add_i32_e32 v2, vcc, -1, v2
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; SI-NEXT: v_ffbh_i32_e32 v3, 0
-; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2
-; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v3
; SI-NEXT: v_mov_b32_e32 v1, 0
-; SI-NEXT: v_min_u32_e32 v2, v3, v2
+; SI-NEXT: v_min_u32_e32 v2, 32, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2
; SI-NEXT: v_min_u32_e32 v0, 1, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -1452,13 +1450,11 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_ffbh_i32_e32 v2, 0
+; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v2
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; VI-NEXT: v_ffbh_i32_e32 v3, 0
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3
; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: v_min_u32_e32 v2, v3, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT: v_min_u32_e32 v0, 1, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index cc185aff9eff22..784611cf68dd23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1800,9 +1800,9 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1
; GCN-NEXT: s_lshr_b32 s0, s1, 1
; GCN-NEXT: s_mov_b32 s1, 0
-; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
-; GCN-NEXT: s_lshr_b32 s2, s3, 1
+; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i65_33:
@@ -1810,9 +1810,9 @@ define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1
; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1
; GFX10PLUS-NEXT: s_mov_b32 s1, 0
-; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1
-; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31
+; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10PLUS-NEXT: s_mov_b32 s2, 0
; GFX10PLUS-NEXT: ; return to shader part epilog
%result = lshr i65 %value, 33
ret i65 %result
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 88eb0e4b848c95..2fa5492c8a2b72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -608,34 +608,25 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v1
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_mul_hi_u32 v2, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v3, v2, v1
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v2, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_i32_24bit:
@@ -677,20 +668,6 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v9, v6, v7
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
@@ -711,15 +688,15 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v7, s[6...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/123510
More information about the llvm-commits mailing list