[llvm] PeepholeOpt: Do not add subregister indexes to reg_sequence operands (PR #124111)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 23 04:44:21 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel
Author: Matt Arsenault (arsenm)
Changes
Given that the rest of the pass simply gives up when it needs to compose
subregisters, folding a subregister extract directly into a reg_sequence
is counterproductive. Later fold attempts in the function give up on the
subregister operand, which prevents looking through the reg_sequence.
It may still be profitable to do these folds once composing subregister
indices is handled. There are some test regressions, but this mostly
looks better.
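
To illustrate the situation the patch avoids, here is a hypothetical MIR-style sketch (register names and classes are invented for illustration; this is not taken from the patch or its tests):

```
; The second REG_SEQUENCE source is a copy of a subregister of a wider value.
%lo:vgpr_32 = COPY %wide.sub0
%rs:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1

; The rewriter could previously fold the COPY away and put the subregister
; index directly on the REG_SEQUENCE source operand:
%rs:vreg_64 = REG_SEQUENCE %wide.sub0, %subreg.sub0, %hi, %subreg.sub1

; But later fold attempts bail out on operands that already carry a
; subregister index, so values can no longer be traced through %rs.
; With this change the COPY is left in place and those later folds
; remain possible.
```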
---
Patch is 301.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124111.diff
29 Files Affected:
- (modified) llvm/lib/CodeGen/PeepholeOptimizer.cpp (+6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+174-174)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+113-113)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+3-5)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+49-55)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+15-19)
- (modified) llvm/test/CodeGen/AMDGPU/div-rem-by-constant-64.ll (+128-184)
- (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+71-80)
- (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+196-204)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.ll (+37-37)
- (modified) llvm/test/CodeGen/AMDGPU/idot2.ll (+91-91)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+73-84)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.mulo.ll (+58-69)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+71-71)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+7-9)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+256-197)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+58-66)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll (+9-10)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+88-99)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+27-33)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+157-159)
- (modified) llvm/test/CodeGen/AMDGPU/spill-vgpr.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/sra.ll (+34-34)
- (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+3-9)
- (modified) llvm/test/CodeGen/AMDGPU/v_cndmask.ll (+2-7)
``````````diff
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 48c25d5039bfd4..af4f2dc49b690b 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -436,6 +436,12 @@ class RegSequenceRewriter : public Rewriter {
if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands())
return false;
+ // Do not introduce new subregister uses in a reg_sequence. Until composing
+ // subregister indices is supported while folding, we're just blocking
+ // folding of subregister copies later in the function.
+ if (NewSubReg)
+ return false;
+
MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
MO.setReg(NewReg);
MO.setSubReg(NewSubReg);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 756eb2788607bf..b92d9c74342748 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2074,208 +2074,208 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7-LABEL: v_mul_i256:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v16, v0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX7-NEXT: v_mov_b32_e32 v17, v1
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX7-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mov_b32_e32 v18, v23
-; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX7-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX7-NEXT: v_mov_b32_e32 v0, v20
-; GFX7-NEXT: v_mov_b32_e32 v1, v23
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX7-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v14, 0
+; GFX7-NEXT: v_mov_b32_e32 v16, v1
+; GFX7-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v10, 0
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v13, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v16, v9, v[17:18]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v2, v12, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[17:18], vcc, v2, v8, v[17:18]
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v3, v11, v[19:20]
+; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v1, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v4, v10, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v11, v[21:22]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v9, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], vcc, v2, v10, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], vcc, v3, v9, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v8, v[19:20]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], vcc, v4, v8, v[21:22]
+; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX7-NEXT: v_mov_b32_e32 v23, v19
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v0, v13, v[22:23]
+; GFX7-NEXT: v_mov_b32_e32 v19, v21
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v0, v11, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v12, v[22:23]
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v10, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v2, v11, v[21:22]
+; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT: v_mul_lo_u32 v23, v4, v11
+; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[21:22]
+; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v0, v8, 0
+; GFX7-NEXT: v_mul_lo_u32 v26, v6, v9
+; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v1, s[8:9]
; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12
; GFX7-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX7-NEXT: v_mov_b32_e32 v2, v22
-; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX7-NEXT: v_mov_b32_e32 v2, v17
+; GFX7-NEXT: v_mov_b32_e32 v1, v11
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v4, v9, v[21:22]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v0, v9, v[1:2]
; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mul_lo_u32 v12, v3, v12
; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX7-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
+; GFX7-NEXT: v_addc_u32_e64 v17, s[8:9], 0, v6, s[8:9]
+; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v16, v8, v[1:2]
; GFX7-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX7-NEXT: v_mul_lo_u32 v10, v16, v15
-; GFX7-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX7-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX7-NEXT: v_mul_lo_u32 v9, v16, v14
+; GFX7-NEXT: v_addc_u32_e64 v4, s[8:9], v24, v4, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v5, s[8:9], v17, v5, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[8:9], v25, v6, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v20, v0, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v9, s[14:15]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v13, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v23, s[6:7]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v26, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v0, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v16, v0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v24, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX8-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mov_b32_e32 v18, v23
-; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
-; GFX8-NEXT: v_mov_b32_e32 v0, v20
-; GFX8-NEXT: v_mov_b32_e32 v1, v23
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v14, 0
+; GFX8-NEXT: v_mov_b32_e32 v16, v1
+; GFX8-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v10, 0
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v13, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v16, v9, v[17:18]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v2, v12, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[17:18], vcc, v2, v8, v[17:18]
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v3, v11, v[19:20]
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v4, v10, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v11, v[21:22]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v9, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], vcc, v2, v10, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], vcc, v3, v9, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v8, v[19:20]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], vcc, v4, v8, v[21:22]
+; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v23, v19
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v0, v13, v[22:23]
+; GFX8-NEXT: v_mov_b32_e32 v19, v21
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v0, v11, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v12, v[22:23]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v10, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v2, v11, v[21:22]
+; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT: v_mul_lo_u32 v23, v4, v11
+; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[21:22]
+; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v0, v8, 0
+; GFX8-NEXT: v_mul_lo_u32 v26, v6, v9
+; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v1, s[8:9]
; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12
; GFX8-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v2, v22
-; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v2, v17
+; GFX8-NEXT: v_mov_b32_e32 v1, v11
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v4, v9, v[21:22]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v0, v9, v[1:2]
; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mul_lo_u32 v12, v3, v12
; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v18, s[8:9], 0, v6, s[8:9]
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v17, v8, v[1:2]
+; GFX8-NEXT: v_addc_u32_e64 v17, s[8:9], 0, v6, s[8:9]
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[11:12]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v16, v8, v[1:2]
; GFX8-NEXT: v_addc_u32_e64 v3, s[8:9], v9, v3, s[8:9]
-; GFX8-NEXT: v_mul_lo_u32 v10, v16, v15
-; GFX8-NEXT: v_mul_lo_u32 v9, v17, v14
-; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v25, v4, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v18, v5, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v21, v6, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v10, s[8:9], v24, v10, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v10, v9, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v13, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[8:9], v9, v12, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[6:7], v9, v26, s[6:7]
-; GFX8-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v23, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v20, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[9:10]
+; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
+; GFX8-NEXT: v_mul_lo_u32 v9, v16, v14
+; GFX8-NEXT: v_addc_u32_e64 v4, s[8:9], v24, v4, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v5, s[8:9], v17, v5, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[8:9], v25, v6, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v20, v0, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v9, s[14:15]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v13, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v23, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v26, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, v10
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v16, v0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v16, v14, 0
-; GFX9-NEXT: v_mov_b32_e32 v17, v1
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v17, v13, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v12, 0
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v9, v[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v2, v12, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v8, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v11, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v24, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v17, v11, v[20:21]
-; GFX9-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v5, v9, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v2, v10, v[19:20]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v9, v[19:20]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mov_b32_e32 v18, v23
-; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v4, v8, v[19:20]
-; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v6, v8, v[0:1]
-; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
-; GFX9-NEXT: v_mov_b32_e32 v0, v20
-; GFX9-NEXT: v_mov_b32_e32 v1, v23
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], vcc, v16, v13, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v11, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v12, v[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v20, v6, v9
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[8:9]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v2, v11, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v17, v10, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v23, v5, v10
-; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[10:11], v3, v10, v[0:1]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v16, v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v14, 0
+; GFX9-NEXT: v_mov_b32_e32 v16, v1
+; GFX9-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v10, 0
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v16, v13, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v16, v9, v[17:18]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v2, v12, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[17:18], vcc, v2, v8, v[17:18]
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v3, v11, v[19:20]
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v4, v10, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v11, v[21:22]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[6:7], v5, v9, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], vcc, v2, v10, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], vcc, v3, v9, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v8, v[19:20]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], vcc, v4, v8, v[21:22]
+; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v23, v19
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v0, v13, v[22:23]
+; GFX9-NEXT: v_mov_b32_e32 v19, v21
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v0, v11, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v16, v12, v[22:23]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v16, v10, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v2, v11, v[21:22]
+; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT: v_mul_lo_u32 v23, v4, v11
+; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[21:22]
+; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v0, v8, 0
+; GFX9-NEXT: v_mul_lo_u32 v26, v6, v9
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v1, s[8:9]
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[8:9], v2, v9, v[18:19]
+; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12
; GFX9-NEXT: v_mul_lo_u32 v13, v2, v13
-; GFX9-NEXT: v_mov_b32_e32 v2, v22
-; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v4, v9, v[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v16, v9, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v2, v17
+; GFX9-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[12:13], v4, v9, v[21:22]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[14:15], v0, v9, v[1:2]
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[8:9], 0, v6, s[8:9]
-; GFX9-NEXT: v_mul_lo_u32 v12, v3, v12
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v3, v8, v[18:19]
; GFX9-NEXT...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/124111
More information about the llvm-commits mailing list