[llvm] TargetSchedule: correct latency by cycles elapsed from def to use (PR #74088)

via llvm-commits <llvm-commits at lists.llvm.org>
Fri Dec 1 07:06:07 PST 2023


llvmbot wrote:



@llvm/pr-subscribers-backend-x86

Author: Ramkumar Ramachandra (artagnon)

Changes:

The getOperandLatency function falls back to the default def/instruction latency in many cases, but this fallback latency doesn't account for the cycles elapsed between the def and the use. Solving this exactly is hard, since we don't have the scheduling model in the fallback cases. As a conservative approximation, set the fallback latency to zero when it is much less than the distance between the def and the use; concretely, when the latency scaled by an experimentally chosen factor of 22 is still smaller than that distance. Improvements are observed on standard benchmarks.
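
For concreteness, here is a minimal standalone sketch of the threshold check described above (not the patch itself; the function name and the sample values are hypothetical). With a factor of 22, a fallback latency of 1 cycle is zeroed out only when more than 22 instructions separate the def from the use within the same basic block:

```cpp
// Hypothetical illustration of the heuristic; not part of the patch.
// FallbackLatency stands in for InstrLatency/DefaultDefLatency, and
// DefUseDist for the number of instructions between DefMI and UseMI.
unsigned adjustFallbackLatency(unsigned FallbackLatency, unsigned DefUseDist) {
  const unsigned MulFactor = 22; // chosen experimentally in the patch
  if (MulFactor * FallbackLatency < DefUseDist)
    return 0; // latency is negligible relative to the def-use distance
  return FallbackLatency;
}

// adjustFallbackLatency(1, 23) == 0  (22 * 1 = 22 < 23)
// adjustFallbackLatency(1, 22) == 1  (22 * 1 = 22, not less than 22)
// adjustFallbackLatency(4, 60) == 4  (22 * 4 = 88, not less than 60)
```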

---

Patch is 986.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74088.diff


44 Files Affected:

- (modified) llvm/lib/CodeGen/TargetSchedule.cpp (+25-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll (+9-10) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll (+197-197) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll (+91-91) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll (+112-112) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+456-405) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+418-418) 
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+153-154) 
- (modified) llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+811-821) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+509-517) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+144-143) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+626-634) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+392-403) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+112-119) 
- (modified) llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll (+63-63) 
- (modified) llvm/test/CodeGen/PowerPC/aix-cc-abi.ll (+41-41) 
- (modified) llvm/test/CodeGen/PowerPC/inc-of-add.ll (+94-94) 
- (modified) llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll (+2-2) 
- (modified) llvm/test/CodeGen/PowerPC/sat-add.ll (+1-1) 
- (modified) llvm/test/CodeGen/PowerPC/sub-of-not.ll (+94-94) 
- (modified) llvm/test/CodeGen/PowerPC/testBitReverse.ll (+26-26) 
- (modified) llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll (+76-77) 
- (modified) llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll (+2928-2928) 
- (modified) llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll (+62-62) 
- (modified) llvm/test/CodeGen/RISCV/bswap-bitreverse.ll (+12-13) 
- (modified) llvm/test/CodeGen/RISCV/mul.ll (+2-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll (+84-93) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll (+48-48) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll (+14-14) 
- (modified) llvm/test/CodeGen/RISCV/srem-vector-lkk.ll (+38-38) 
- (modified) llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll (+5-5) 
- (modified) llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll (+16-16) 
- (modified) llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll (+3-3) 
- (modified) llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll (+1-1) 
- (modified) llvm/test/CodeGen/Thumb2/mve-phireg.ll (+5-5) 
- (modified) llvm/test/CodeGen/Thumb2/mve-shuffle.ll (+2-2) 
- (modified) llvm/test/CodeGen/Thumb2/mve-vldst4.ll (+1-1) 
- (modified) llvm/test/CodeGen/Thumb2/mve-vst3.ll (+8-8) 
- (modified) llvm/test/CodeGen/Thumb2/mve-vst4.ll (+3-3) 
- (modified) llvm/test/CodeGen/X86/mul-constant-result.ll (+61-66) 
- (modified) llvm/test/CodeGen/X86/sad.ll (+4-3) 
- (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+2-2) 


``````````diff
diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp
index a25d4ff78f4d967..7c353eb196a44cf 100644
--- a/llvm/lib/CodeGen/TargetSchedule.cpp
+++ b/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -174,8 +174,31 @@ unsigned TargetSchedModel::computeOperandLatency(
   const MachineInstr *DefMI, unsigned DefOperIdx,
   const MachineInstr *UseMI, unsigned UseOperIdx) const {
 
-  const unsigned InstrLatency = computeInstrLatency(DefMI);
-  const unsigned DefaultDefLatency = TII->defaultDefLatency(SchedModel, *DefMI);
+  unsigned InstrLatency = computeInstrLatency(DefMI);
+  unsigned DefaultDefLatency = TII->defaultDefLatency(SchedModel, *DefMI);
+
+  // We fall back to computing the default latency in many cases. However, this
+  // doesn't take into account the distance between DefMI and UseMI, which would
+  // approximately be the number of cycles elapsed between the def and the use
+  // (quite approximate, since we don't have the SchedModel). A conservative
+  // approximation would then be to check that this fallback latency is much
+  // less than this distance, and set it to zero if so.
+  const MachineBasicBlock *DefBB = DefMI->getParent();
+  const MachineBasicBlock *UseBB = UseMI ? UseMI->getParent() : nullptr;
+  if (DefBB && DefBB == UseBB) {
+    auto DefIt = find_if(DefBB->instrs(), [DefMI](const MachineInstr &MI) {
+      return &MI == DefMI;
+    });
+    auto UseIt = find_if(DefBB->instrs(), [UseMI](const MachineInstr &MI) {
+      return &MI == UseMI;
+    });
+    unsigned DefUseDist = std::distance(DefIt, UseIt) - 1;
+    const unsigned MulFactor = 22; // Chosen experimentally
+    if (MulFactor * InstrLatency < DefUseDist)
+      InstrLatency = 0;
+    if (MulFactor * DefaultDefLatency < DefUseDist)
+      DefaultDefLatency = 0;
+  }
 
   if (!hasInstrSchedModel() && !hasInstrItineraries())
     return InstrLatency;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index 6b054556135156f..4a2ee6aed39ebbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -17,12 +17,10 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    v_mov_b32_e32 v8, s6
 ; LOOP-NEXT:  .LBB0_1: ; %load-store-loop
 ; LOOP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    buffer_load_ubyte v9, v[4:5], s[4:7], 0 addr64
-; LOOP-NEXT:    s_waitcnt expcnt(6)
 ; LOOP-NEXT:    buffer_load_ubyte v10, v[4:5], s[4:7], 0 addr64 offset:1
-; LOOP-NEXT:    s_waitcnt expcnt(3)
 ; LOOP-NEXT:    buffer_load_ubyte v11, v[4:5], s[4:7], 0 addr64 offset:2
-; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    buffer_load_ubyte v12, v[4:5], s[4:7], 0 addr64 offset:3
 ; LOOP-NEXT:    buffer_load_ubyte v13, v[4:5], s[4:7], 0 addr64 offset:4
 ; LOOP-NEXT:    buffer_load_ubyte v14, v[4:5], s[4:7], 0 addr64 offset:5
@@ -87,11 +85,8 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:8
 ; LOOP-NEXT:    s_waitcnt expcnt(0)
 ; LOOP-NEXT:    v_lshrrev_b32_e32 v11, 24, v11
-; LOOP-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
-; LOOP-NEXT:    v_bfe_u32 v20, v12, 8, 8
 ; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:12
-; LOOP-NEXT:    s_waitcnt expcnt(0)
-; LOOP-NEXT:    v_lshrrev_b32_e32 v12, 24, v12
+; LOOP-NEXT:    v_lshrrev_b32_e32 v19, 24, v12
 ; LOOP-NEXT:    buffer_store_byte v14, v[6:7], s[4:7], 0 addr64 offset:1
 ; LOOP-NEXT:    buffer_store_byte v13, v[6:7], s[4:7], 0 addr64 offset:2
 ; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[4:7], 0 addr64 offset:3
@@ -101,9 +96,13 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
 ; LOOP-NEXT:    buffer_store_byte v18, v[6:7], s[4:7], 0 addr64 offset:9
 ; LOOP-NEXT:    buffer_store_byte v17, v[6:7], s[4:7], 0 addr64 offset:10
 ; LOOP-NEXT:    buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:11
-; LOOP-NEXT:    buffer_store_byte v20, v[6:7], s[4:7], 0 addr64 offset:13
-; LOOP-NEXT:    buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:14
-; LOOP-NEXT:    buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:15
+; LOOP-NEXT:    s_waitcnt expcnt(6)
+; LOOP-NEXT:    v_lshrrev_b32_e32 v9, 16, v12
+; LOOP-NEXT:    s_waitcnt expcnt(3)
+; LOOP-NEXT:    v_bfe_u32 v10, v12, 8, 8
+; LOOP-NEXT:    buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:15
+; LOOP-NEXT:    buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:13
+; LOOP-NEXT:    buffer_store_byte v9, v[6:7], s[4:7], 0 addr64 offset:14
 ; LOOP-NEXT:    v_add_i32_e64 v6, s[0:1], 16, v6
 ; LOOP-NEXT:    v_addc_u32_e64 v7, s[0:1], 0, v7, s[0:1]
 ; LOOP-NEXT:    v_add_i32_e64 v4, s[0:1], 16, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 1061f0003bd4896..6329fcde8ba14ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -154,60 +154,60 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v6, v4, v5
+; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v9, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT:    v_xor_b32_e32 v9, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v7
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v3
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v3
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v4, v2
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v3
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v7, s[4:5], v0, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[6:7], v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v9
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32:
@@ -267,8 +267,8 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v9
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i32> %num, %den
   ret <2 x i32> %result
@@ -577,60 +577,60 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v7, v5, v6
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; GISEL-NEXT:    v_xor_b32_e32 v9, v4, v8
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v2
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v2
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v9
-; GISEL-NEXT:    v_mul_hi_u32 v6, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v2
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v6
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v3
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v7
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v2
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v9, s[6:7], v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v7
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v3
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v2
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v9, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v1, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[6:7], v0, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v7
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v9
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_pow2_shl_denom:
@@ -640,60 +640,60 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v2
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; CGP-NEXT:    v_xor_b32_e32 v8, v5, v6
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; CGP-NEXT:    v_xor_b32_e32 v8, v4, v6
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_xor_b32_e32 v9, v4, v7
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT:    v_xor_b32_e32 v4, v5, v7
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v7
-; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, 0, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, v3
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
-; CGP-NEXT:    v_rcp_f32_e32 v5, v5
-; CGP-NEXT:    v_rcp_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, v6, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, v9, v7
-; CGP-NEXT:    v_mul_hi_u32 v6, v5, v6
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v9
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v7
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; CGP-NEXT:    v_sub_i32_e32 v7, vcc, 0, v2
+; CGP-NEXT:    v_rcp_f32_e32 v4, v4
+; CGP-NEXT:    v_rcp_f32_e32 v6, v6
+; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; CGP-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
+; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
+; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; CGP-NEXT:    v_mul_hi_u32 v5, v0, v5
-; CGP-NEXT:    v_mul_hi_u32 v6, v1, v6
-; CGP-NEXT:    v_mul_lo_u32 v7, v5, v2
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v3
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v6
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v10
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
-; CGP-NEXT:    v_sub_i32_e64 v7, s[4:5], v0, v2
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v9, s[6:7], v1, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT:    v_mul_lo_u32 v6, v4, v3
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v2
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; CGP-NEXT:    v_sub_i32_e64 v6, s[4:5], v1, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v0, v2
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v6
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v9, vcc
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v8
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v8
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v9
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v9
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
@@ -775,62 +775,62 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT:    v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT:    v_xor_b32_e32 v9, v6, v7
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
-; GISEL...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/74088

