[llvm] AMDGPU: Use v_mov_b32 to implement divergent zext i32->i64 (PR #168166)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 14 18:09:53 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
Some cases rely on SIFixSGPRCopies to rewrite a VALU reg_sequence
that has SGPR inputs so that all of its inputs are VGPRs, but this
doesn't always happen when the reg_sequence is not actually
invalid. Make sure we use a VGPR up-front here so we don't
rely on a later pass to fix it up.
---
Patch is 241.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168166.diff
21 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+6-1)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+75-74)
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (-2)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+311-322)
- (modified) llvm/test/CodeGen/AMDGPU/ds_write2.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll (+39-19)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll (+39-19)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll (+45-22)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll (+45-22)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll (-8)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (-24)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (-16)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (-6)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (-6)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+626-651)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (-1)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6dd4b1d7bd000..b7256b81ee826 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2941,10 +2941,15 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (zext i32:$src)),
+ (i64 (UniformUnaryFrag<zext> i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
>;
+def : GCNPat <
+ (i64 (zext i32:$src)),
+ (REG_SEQUENCE VReg_64, $src, sub0, (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
def : GCNPat <
(i64 (anyext i32:$src)),
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index ff74d1f71616d..88e3c86c791de 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2549,17 +2549,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1164_ITERATIVE-LABEL: add_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s2
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
@@ -2606,7 +2606,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1132_ITERATIVE-LABEL: add_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2614,8 +2614,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
@@ -2659,8 +2659,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1264_ITERATIVE-LABEL: add_i64_varying:
; GFX1264_ITERATIVE: ; %bb.0: ; %entry
-; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2668,8 +2668,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -2714,7 +2714,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1232_ITERATIVE-LABEL: add_i64_varying:
; GFX1232_ITERATIVE: ; %bb.0: ; %entry
-; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2723,8 +2723,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
@@ -6930,15 +6930,15 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[8:9], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX1164_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
@@ -7087,8 +7087,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1264_ITERATIVE-LABEL: sub_i64_varying:
; GFX1264_ITERATIVE: ; %bb.0: ; %entry
-; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1264_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1264_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[0:1], exec
; GFX1264_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -7096,8 +7096,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s8
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -7142,7 +7142,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1232_ITERATIVE-LABEL: sub_i64_varying:
; GFX1232_ITERATIVE: ; %bb.0: ; %entry
-; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1232_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s0, exec_lo
; GFX1232_ITERATIVE-NEXT: s_mov_b64 s[6:7], 0
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -7151,8 +7151,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s1
+; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v3, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index f5ca24f59a286..12517c2bc1b5d 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2181,17 +2181,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_ITERATIVE-LABEL: add_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
@@ -2233,7 +2233,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_ITERATIVE-LABEL: add_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2241,8 +2241,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
@@ -2982,19 +2982,20 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
;
; GFX1164_ITERATIVE-LABEL: add_i64_varying_nouse:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: .LBB7_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s4, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s5, v0, s4
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s4
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s5
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5594,17 +5595,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_ITERATIVE-LABEL: sub_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB14_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v2, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
@@ -5646,7 +5647,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_ITERATIVE-LABEL: sub_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -5654,8 +5655,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
@@ -7063,17 +7064,17 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_ITERATIVE-LABEL: and_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -7113,7 +7114,7 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_ITERATIVE-LABEL: and_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], -1
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -7121,8 +7122,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s3
+; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
@@ -8411,17 +8412,17 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_ITERATIVE-LABEL: or_i64_varying:
; GFX1164_ITERATIVE: ; %bb.0: ; %entry
-; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v2, 0x3ff, v0
-; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v3, 0
+; GFX1164_ITERATIVE-NEXT: v_mov_b32_e32 v2, 0
+; GFX1164_ITERATIVE-NEXT: v_and_b32_e32 v3, 0x3ff, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[2:3], exec
; GFX1164_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v3, s8
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
@@ -8461,7 +8462,7 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_ITERATIVE-LABEL: or_i64_varying:
; GFX1132_ITERATIVE: ; %bb.0: ; %entry
-; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3ff, v0
+; GFX1132_ITERATIVE-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v3, 0x3ff, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, exec_lo
; GFX1132_ITERATIVE-NEXT: s_mov_b64 s[0:1], 0
; GFX1132_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -8469,8 +8470,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s3, s2
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/168166
More information about the llvm-commits mailing list