[llvm] 1c21d5c - [GlobalISel] Remove GI known bits cache (#157352)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 14 23:32:05 PDT 2025
Author: David Green
Date: 2025-09-15T07:32:00+01:00
New Revision: 1c21d5cb9b8e48ab928919a6f358eba8ffd3b49c
URL: https://github.com/llvm/llvm-project/commit/1c21d5cb9b8e48ab928919a6f358eba8ffd3b49c
DIFF: https://github.com/llvm/llvm-project/commit/1c21d5cb9b8e48ab928919a6f358eba8ffd3b49c.diff
LOG: [GlobalISel] Remove GI known bits cache (#157352)
There is a cache on the known bits computed by global-isel. It only works
inside a single query to computeKnownBits, which limits its usefulness,
and according to the tests can sometimes limit the effectiveness of
known-bits queries. (Although some AMD tests look longer). Keeping the
cache valid and clearing it at the correct times can also require being
careful about the functions called inside known-bits queries.
I measured the compile-time impact of removing it and came up with:
```
7zip 2.06405E+11 2.06436E+11 0.015018992
Bullet 1.01298E+11 1.01186E+11 -0.110236169
ClamAV 57942466667 57848066667 -0.16292023
SPASS 45444466667 45402966667 -0.091320249
consumer 35432466667 35381233333 -0.144594317
kimwitu++ 40858833333 40927933333 0.169118877
lencod 70022366667 69950633333 -0.102443457
mafft 38439900000 38413233333 -0.069372362
sqlite3 35822266667 35770033333 -0.145812474
tramp3d 82083133333 82045600000 -0.045726
Average -0.068828739
```
The last column is the % difference between with / without the cache. So in
total it seems to be costing slightly more to keep the current
known-bits cache than if it were removed. (Measured in instruction count,
similar to llvm-compile-time-tracker).
The hit rate wasn't terrible - higher than I expected. In the
llvm-test-suite+external projects it was hit 4791030 times out of
91107008 queries, slightly more than 5%.
Note that as globalisel increases in complexity, more known bits calls
might be made and the numbers might shift. If that is the case it might
be better to have a cache that works across calls, providing it doesn't
make effectiveness worse.
Added:
Modified:
llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
llvm/test/CodeGen/AArch64/rem-by-const.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
index 490d1a34cc846..3bf9d694b1b21 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
@@ -37,8 +37,6 @@ class LLVM_ABI GISelValueTracking : public GISelChangeObserver {
const TargetLowering &TL;
const DataLayout &DL;
unsigned MaxDepth;
- /// Cache maintained during a computeKnownBits request.
- SmallDenseMap<Register, KnownBits, 16> ComputeKnownBitsCache;
void computeKnownBitsMin(Register Src0, Register Src1, KnownBits &Known,
const APInt &DemandedElts, unsigned Depth = 0);
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 0cf44e02254de..9b4c103763d74 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -93,12 +93,8 @@ KnownBits GISelValueTracking::getKnownBits(Register R) {
KnownBits GISelValueTracking::getKnownBits(Register R,
const APInt &DemandedElts,
unsigned Depth) {
- // For now, we only maintain the cache during one request.
- assert(ComputeKnownBitsCache.empty() && "Cache should have been cleared");
-
KnownBits Known;
computeKnownBitsImpl(R, Known, DemandedElts, Depth);
- ComputeKnownBitsCache.clear();
return Known;
}
@@ -187,14 +183,6 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known,
#endif
unsigned BitWidth = DstTy.getScalarSizeInBits();
- auto CacheEntry = ComputeKnownBitsCache.find(R);
- if (CacheEntry != ComputeKnownBitsCache.end()) {
- Known = CacheEntry->second;
- LLVM_DEBUG(dbgs() << "Cache hit at ");
- LLVM_DEBUG(dumpResult(MI, Known, Depth));
- assert(Known.getBitWidth() == BitWidth && "Cache entry size doesn't match");
- return;
- }
Known = KnownBits(BitWidth); // Don't know anything
// Depth may get bigger than max depth if it gets passed to a
different
@@ -254,16 +242,6 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known,
// point of the pipeline, otherwise the main live-range will be
// defined more than once, which is against SSA.
assert(MI.getOperand(0).getSubReg() == 0 && "Is this code in SSA?");
- // Record in the cache that we know nothing for MI.
- // This will get updated later and in the meantime, if we reach that
- // phi again, because of a loop, we will cut the search thanks to this
- // cache entry.
- // We could actually build up more information on the phi by not cutting
- // the search, but that additional information is more a side effect
- // than an intended choice.
- // Therefore, for now, save on compile time until we derive a proper way
- // to derive known bits for PHIs within loops.
- ComputeKnownBitsCache[R] = KnownBits(BitWidth);
// PHI's operand are a mix of registers and basic blocks interleaved.
// We only care about the register ones.
for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
@@ -700,9 +678,6 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known,
}
LLVM_DEBUG(dumpResult(MI, Known, Depth));
-
- // Update the cache.
- ComputeKnownBitsCache[R] = Known;
}
static bool outputDenormalIsIEEEOrPosZero(const MachineFunction &MF, LLT Ty) {
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 599fa510d4aea..1cb92e46cbcd1 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -88,7 +88,7 @@ define i8 @ui8_7(i8 %a, i8 %b) {
; CHECK-GI-NEXT: sub w9, w0, w8
; CHECK-GI-NEXT: ubfx w9, w9, #1, #7
; CHECK-GI-NEXT: add w8, w9, w8
-; CHECK-GI-NEXT: ubfx w8, w8, #2, #6
+; CHECK-GI-NEXT: lsr w8, w8, #2
; CHECK-GI-NEXT: lsl w9, w8, #3
; CHECK-GI-NEXT: sub w8, w9, w8
; CHECK-GI-NEXT: sub w0, w0, w8
@@ -207,7 +207,7 @@ define i16 @ui16_7(i16 %a, i16 %b) {
; CHECK-GI-NEXT: sub w9, w0, w8
; CHECK-GI-NEXT: ubfx w9, w9, #1, #15
; CHECK-GI-NEXT: add w8, w9, w8
-; CHECK-GI-NEXT: ubfx w8, w8, #2, #14
+; CHECK-GI-NEXT: lsr w8, w8, #2
; CHECK-GI-NEXT: lsl w9, w8, #3
; CHECK-GI-NEXT: sub w8, w9, w8
; CHECK-GI-NEXT: sub w0, w0, w8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index f57fc005b994b..9ffc565d9d47a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1186,77 +1186,77 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v8, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_mov_b32_e32 v9, v5
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
+; GISEL-NEXT: v_trunc_f32_e32 v7, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14
; GISEL-NEXT: s_subb_u32 s6, 0, 0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15
+; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17
+; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
+; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
@@ -1271,144 +1271,147 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15
; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13
; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
+; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
+; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
+; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
+; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
-; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v7
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v6, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 19dc20c510041..82279e641ed63 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -1112,67 +1112,67 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v8, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_mov_b32_e32 v9, v5
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
+; GISEL-NEXT: v_trunc_f32_e32 v7, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14
; GISEL-NEXT: s_subb_u32 s6, 0, 0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13
; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
@@ -1195,13 +1195,14 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
@@ -1217,94 +1218,96 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
+; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
+; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
+; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
+; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
-; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
@@ -1327,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1705,67 +1708,67 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v8, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_mov_b32_e32 v9, v5
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
-; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
+; GISEL-NEXT: v_trunc_f32_e32 v7, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v7
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v8, v[10:11]
+; GISEL-NEXT: v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v8, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14
; GISEL-NEXT: s_subb_u32 s6, 0, 0
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
+; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13
; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
+; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
@@ -1788,13 +1791,14 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14]
; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
@@ -1810,94 +1814,96 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
+; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0
; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v8, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v9, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v0
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
+; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5
+; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10
+; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
-; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2
+; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
@@ -1920,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_oddk_denom:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
index 6864afe3855f4..225ceed9627b7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/div-by-constant.ll
@@ -240,7 +240,6 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV32-NEXT: zext.b a0, a0
; RV32-NEXT: srli a0, a0, 1
; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: zext.b a0, a0
; RV32-NEXT: srli a0, a0, 2
; RV32-NEXT: ret
;
@@ -254,7 +253,6 @@ define i8 @udiv8_constant_add(i8 %a) nounwind {
; RV64-NEXT: zext.b a0, a0
; RV64-NEXT: srli a0, a0, 1
; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: zext.b a0, a0
; RV64-NEXT: srli a0, a0, 2
; RV64-NEXT: ret
%1 = udiv i8 %a, 7
@@ -317,7 +315,6 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV32IM-NEXT: and a0, a0, a2
; RV32IM-NEXT: srli a0, a0, 1
; RV32IM-NEXT: add a0, a0, a1
-; RV32IM-NEXT: and a0, a0, a2
; RV32IM-NEXT: srli a0, a0, 2
; RV32IM-NEXT: ret
;
@@ -332,7 +329,6 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV32IMZB-NEXT: zext.h a0, a0
; RV32IMZB-NEXT: srli a0, a0, 1
; RV32IMZB-NEXT: add a0, a0, a1
-; RV32IMZB-NEXT: zext.h a0, a0
; RV32IMZB-NEXT: srli a0, a0, 2
; RV32IMZB-NEXT: ret
;
@@ -349,7 +345,6 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV64IM-NEXT: and a0, a0, a2
; RV64IM-NEXT: srli a0, a0, 1
; RV64IM-NEXT: add a0, a0, a1
-; RV64IM-NEXT: and a0, a0, a2
; RV64IM-NEXT: srli a0, a0, 2
; RV64IM-NEXT: ret
;
@@ -364,7 +359,6 @@ define i16 @udiv16_constant_add(i16 %a) nounwind {
; RV64IMZB-NEXT: zext.h a0, a0
; RV64IMZB-NEXT: srli a0, a0, 1
; RV64IMZB-NEXT: add a0, a0, a1
-; RV64IMZB-NEXT: zext.h a0, a0
; RV64IMZB-NEXT: srli a0, a0, 2
; RV64IMZB-NEXT: ret
%1 = udiv i16 %a, 7
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
index 089fb00d6080d..8563d7f1f15c9 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp
@@ -190,7 +190,7 @@ TEST_F(AArch64GISelMITest, TestKnownBitsDecreasingCstPHIWithLoop) {
// Therefore, %14's known zero are 0x80 shifted by one 0xC0.
// If we had simulated the loop we could have more zero bits, basically
// up to 0xFC (count leading zero of 5, + 1).
- EXPECT_EQ((uint64_t)0xC0, Res.Zero.getZExtValue());
+ EXPECT_EQ((uint64_t)0xFC, Res.Zero.getZExtValue());
KnownBits Res2 = Info.getKnownBits(DstReg);
EXPECT_EQ(Res.One.getZExtValue(), Res2.One.getZExtValue());
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
index 73ddf0c88d3ed..6b70ae9739179 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsVectorTest.cpp
@@ -220,7 +220,7 @@ TEST_F(AArch64GISelMITest, TestKnownBitsVectorDecreasingCstPHIWithLoop) {
GISelValueTracking Info(*MF, /*MaxDepth=*/24);
KnownBits Res = Info.getKnownBits(SrcReg);
EXPECT_EQ((uint64_t)0, Res.One.getZExtValue());
- EXPECT_EQ((uint64_t)0xC0, Res.Zero.getZExtValue());
+ EXPECT_EQ((uint64_t)0xFC, Res.Zero.getZExtValue());
KnownBits Res2 = Info.getKnownBits(DstReg);
EXPECT_EQ(Res.One.getZExtValue(), Res2.One.getZExtValue());
More information about the llvm-commits
mailing list