[llvm] TargetSchedule: correct latency by cycles elapsed from def to use (PR #74088)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 1 07:06:07 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Ramkumar Ramachandra (artagnon)
<details>
<summary>Changes</summary>
The getOperandLatency function falls back to the default def/instruction latency in many cases, but this fallback latency doesn't account for the cycles that elapse between the def and the use. Accounting for them exactly is hard, since the scheduling model isn't available in the fallback cases. As a conservative approximation, set the fallback latency to zero when it is much smaller than the distance (in instructions) between the def and the use, where "much smaller" means the latency multiplied by a factor of 22 (chosen experimentally) is still less than that distance. Improvements are observed on standard benchmarks.
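
For illustration only, here is a minimal standalone sketch of the heuristic described above, assuming the experimentally chosen factor of 22 from the patch. The function name `adjustFallbackLatency` is hypothetical; the actual change operates on `MachineInstr` positions within a basic block, as shown in the TargetSchedule.cpp hunk below.

```cpp
// Illustrative sketch, not the patched LLVM code.
#include <cstdio>

unsigned adjustFallbackLatency(unsigned FallbackLatency, unsigned DefUseDist) {
  const unsigned MulFactor = 22; // chosen experimentally in the patch
  // If even 22x the fallback latency is smaller than the number of
  // instructions between the def and the use, the def has almost certainly
  // completed by the time the use issues, so treat the latency as zero.
  return (MulFactor * FallbackLatency < DefUseDist) ? 0 : FallbackLatency;
}

int main() {
  // A 2-cycle fallback latency with 50 instructions between def and use:
  // 22 * 2 = 44 < 50, so the latency is zeroed.
  std::printf("%u\n", adjustFallbackLatency(2, 50)); // prints 0
  // With only 30 instructions in between, 44 >= 30, so it is kept.
  std::printf("%u\n", adjustFallbackLatency(2, 30)); // prints 2
  return 0;
}
```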
---
Patch is 986.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74088.diff
44 Files Affected:
- (modified) llvm/lib/CodeGen/TargetSchedule.cpp (+25-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll (+9-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll (+197-197)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll (+91-91)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll (+112-112)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+456-405)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+418-418)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+153-154)
- (modified) llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+811-821)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+509-517)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+144-143)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+626-634)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+392-403)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+112-119)
- (modified) llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll (+63-63)
- (modified) llvm/test/CodeGen/PowerPC/aix-cc-abi.ll (+41-41)
- (modified) llvm/test/CodeGen/PowerPC/inc-of-add.ll (+94-94)
- (modified) llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll (+2-2)
- (modified) llvm/test/CodeGen/PowerPC/sat-add.ll (+1-1)
- (modified) llvm/test/CodeGen/PowerPC/sub-of-not.ll (+94-94)
- (modified) llvm/test/CodeGen/PowerPC/testBitReverse.ll (+26-26)
- (modified) llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll (+76-77)
- (modified) llvm/test/CodeGen/PowerPC/vector-popcnt-128-ult-ugt.ll (+2928-2928)
- (modified) llvm/test/CodeGen/PowerPC/wide-scalar-shift-legalization.ll (+62-62)
- (modified) llvm/test/CodeGen/RISCV/bswap-bitreverse.ll (+12-13)
- (modified) llvm/test/CodeGen/RISCV/mul.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll (+84-93)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll (+48-48)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-stepvector.ll (+14-14)
- (modified) llvm/test/CodeGen/RISCV/srem-vector-lkk.ll (+38-38)
- (modified) llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll (+5-5)
- (modified) llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll (+16-16)
- (modified) llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll (+3-3)
- (modified) llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-phireg.ll (+5-5)
- (modified) llvm/test/CodeGen/Thumb2/mve-shuffle.ll (+2-2)
- (modified) llvm/test/CodeGen/Thumb2/mve-vldst4.ll (+1-1)
- (modified) llvm/test/CodeGen/Thumb2/mve-vst3.ll (+8-8)
- (modified) llvm/test/CodeGen/Thumb2/mve-vst4.ll (+3-3)
- (modified) llvm/test/CodeGen/X86/mul-constant-result.ll (+61-66)
- (modified) llvm/test/CodeGen/X86/sad.ll (+4-3)
- (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+2-2)
``````````diff
diff --git a/llvm/lib/CodeGen/TargetSchedule.cpp b/llvm/lib/CodeGen/TargetSchedule.cpp
index a25d4ff78f4d967..7c353eb196a44cf 100644
--- a/llvm/lib/CodeGen/TargetSchedule.cpp
+++ b/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -174,8 +174,31 @@ unsigned TargetSchedModel::computeOperandLatency(
const MachineInstr *DefMI, unsigned DefOperIdx,
const MachineInstr *UseMI, unsigned UseOperIdx) const {
- const unsigned InstrLatency = computeInstrLatency(DefMI);
- const unsigned DefaultDefLatency = TII->defaultDefLatency(SchedModel, *DefMI);
+ unsigned InstrLatency = computeInstrLatency(DefMI);
+ unsigned DefaultDefLatency = TII->defaultDefLatency(SchedModel, *DefMI);
+
+ // We fall back to computing the default latency in many cases. However, this
+ // doesn't take into account the distance between DefMI and UseMI, which would
+ // approximately be the number of cycles elapsed between the def and the use
+ // (quite an approximate, since we don't have the SchedModel). A conservative
+ // approximation would then be to check that this fallback latency is much
+ // less than this distance, and set it to zero if so.
+ const MachineBasicBlock *DefBB = DefMI->getParent();
+ const MachineBasicBlock *UseBB = UseMI ? UseMI->getParent() : nullptr;
+ if (DefBB && DefBB == UseBB) {
+ auto DefIt = find_if(DefBB->instrs(), [DefMI](const MachineInstr &MI) {
+ return &MI == DefMI;
+ });
+ auto UseIt = find_if(DefBB->instrs(), [UseMI](const MachineInstr &MI) {
+ return &MI == UseMI;
+ });
+ unsigned DefUseDist = std::distance(DefIt, UseIt) - 1;
+ const unsigned MulFactor = 22; // Chosen experimentally
+ if (MulFactor * InstrLatency < DefUseDist)
+ InstrLatency = 0;
+ if (MulFactor * DefaultDefLatency < DefUseDist)
+ DefaultDefLatency = 0;
+ }
if (!hasInstrSchedModel() && !hasInstrItineraries())
return InstrLatency;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index 6b054556135156f..4a2ee6aed39ebbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -17,12 +17,10 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: v_mov_b32_e32 v8, s6
; LOOP-NEXT: .LBB0_1: ; %load-store-loop
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
+; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: buffer_load_ubyte v9, v[4:5], s[4:7], 0 addr64
-; LOOP-NEXT: s_waitcnt expcnt(6)
; LOOP-NEXT: buffer_load_ubyte v10, v[4:5], s[4:7], 0 addr64 offset:1
-; LOOP-NEXT: s_waitcnt expcnt(3)
; LOOP-NEXT: buffer_load_ubyte v11, v[4:5], s[4:7], 0 addr64 offset:2
-; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: buffer_load_ubyte v12, v[4:5], s[4:7], 0 addr64 offset:3
; LOOP-NEXT: buffer_load_ubyte v13, v[4:5], s[4:7], 0 addr64 offset:4
; LOOP-NEXT: buffer_load_ubyte v14, v[4:5], s[4:7], 0 addr64 offset:5
@@ -87,11 +85,8 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:8
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v11, 24, v11
-; LOOP-NEXT: v_lshrrev_b32_e32 v19, 16, v12
-; LOOP-NEXT: v_bfe_u32 v20, v12, 8, 8
; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:12
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v12, 24, v12
+; LOOP-NEXT: v_lshrrev_b32_e32 v19, 24, v12
; LOOP-NEXT: buffer_store_byte v14, v[6:7], s[4:7], 0 addr64 offset:1
; LOOP-NEXT: buffer_store_byte v13, v[6:7], s[4:7], 0 addr64 offset:2
; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[4:7], 0 addr64 offset:3
@@ -101,9 +96,13 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: buffer_store_byte v18, v[6:7], s[4:7], 0 addr64 offset:9
; LOOP-NEXT: buffer_store_byte v17, v[6:7], s[4:7], 0 addr64 offset:10
; LOOP-NEXT: buffer_store_byte v11, v[6:7], s[4:7], 0 addr64 offset:11
-; LOOP-NEXT: buffer_store_byte v20, v[6:7], s[4:7], 0 addr64 offset:13
-; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:14
-; LOOP-NEXT: buffer_store_byte v12, v[6:7], s[4:7], 0 addr64 offset:15
+; LOOP-NEXT: s_waitcnt expcnt(6)
+; LOOP-NEXT: v_lshrrev_b32_e32 v9, 16, v12
+; LOOP-NEXT: s_waitcnt expcnt(3)
+; LOOP-NEXT: v_bfe_u32 v10, v12, 8, 8
+; LOOP-NEXT: buffer_store_byte v19, v[6:7], s[4:7], 0 addr64 offset:15
+; LOOP-NEXT: buffer_store_byte v10, v[6:7], s[4:7], 0 addr64 offset:13
+; LOOP-NEXT: buffer_store_byte v9, v[6:7], s[4:7], 0 addr64 offset:14
; LOOP-NEXT: v_add_i32_e64 v6, s[0:1], 16, v6
; LOOP-NEXT: v_addc_u32_e64 v7, s[0:1], 0, v7, s[0:1]
; LOOP-NEXT: v_add_i32_e64 v4, s[0:1], 16, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 1061f0003bd4896..6329fcde8ba14ea 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -154,60 +154,60 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GISEL-NEXT: v_xor_b32_e32 v6, v4, v5
+; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3
+; GISEL-NEXT: v_xor_b32_e32 v9, v7, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v9, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v7
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
+; GISEL-NEXT: v_mul_lo_u32 v8, v8, v7
; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v8
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v4, v2
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v3
; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
+; GISEL-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v9
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i32:
@@ -267,8 +267,8 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v0, v8
; CGP-NEXT: v_xor_b32_e32 v1, v1, v9
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i32> %num, %den
ret <2 x i32> %result
@@ -577,60 +577,60 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT: v_xor_b32_e32 v7, v5, v6
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v4, v4, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GISEL-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v6, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v8, v6, v2
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, v7, v3
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v7
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v2
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v9, s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v7
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v8, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT: v_mul_hi_u32 v8, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5
+; GISEL-NEXT: v_mul_lo_u32 v6, v4, v3
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT: v_mul_lo_u32 v10, v5, v2
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v1, v3
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v8, s[6:7], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v7
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i32_pow2_shl_denom:
@@ -640,60 +640,60 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2
+; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v2
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3
+; CGP-NEXT: v_xor_b32_e32 v8, v5, v6
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; CGP-NEXT: v_xor_b32_e32 v8, v4, v6
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT: v_xor_b32_e32 v9, v4, v7
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT: v_xor_b32_e32 v4, v5, v7
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v7
-; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; CGP-NEXT: v_rcp_f32_e32 v5, v5
-; CGP-NEXT: v_rcp_f32_e32 v7, v7
-; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
-; CGP-NEXT: v_mul_lo_u32 v9, v9, v7
-; CGP-NEXT: v_mul_hi_u32 v6, v5, v6
-; CGP-NEXT: v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v9
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v7
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v3
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
+; CGP-NEXT: v_rcp_f32_e32 v4, v4
+; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
+; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
; CGP-NEXT: v_mul_hi_u32 v5, v0, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v1, v6
-; CGP-NEXT: v_mul_lo_u32 v7, v5, v2
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; CGP-NEXT: v_mul_lo_u32 v10, v6, v3
-; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v9, s[6:7], v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v4, v3
+; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v5, v2
+; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v1, v3
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
+; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
+; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
+; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5]
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v6
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v8
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v4
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v8
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v9
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = sdiv <2 x i32> %x, %shl.y
@@ -775,62 +775,62 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v9, v6, v7
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/74088