[llvm] gfx12 mad intra fwd (PR #77927)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 12 05:37:16 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
- Precommit extra GFX12 test coverage
- [AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12
---
The patch is 46.03 KiB and has been truncated to 20.00 KiB below; the full version is available at https://github.com/llvm/llvm-project/pull/77927.diff
9 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir (+21)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+18-22)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.mulo.ll (+163)
- (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+191)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+22-21)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index b27edb1e9e14bb..023a4260d76a37 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1501,7 +1501,6 @@ def FeatureISAVersion12 : FeatureSet<
FeaturePseudoScalarTrans,
FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
- FeatureMADIntraFwdBug,
FeatureScalarDwordx3Loads]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
index 698281caca245e..59f6114ca5cd3a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX10 %s
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX12 %s
---
name: mad_u64_u32_vvv
@@ -18,6 +19,7 @@ body: |
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
; GFX10-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit [[V_MAD_U64_U32_e64_1]]
+ ;
; GFX11-LABEL: name: mad_u64_u32_vvv
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -26,6 +28,15 @@ body: |
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
; GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]]
+ ;
+ ; GFX12-LABEL: name: mad_u64_u32_vvv
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
+ ; GFX12-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit [[V_MAD_U64_U32_e64_1]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s32) = COPY $vgpr2
@@ -51,6 +62,7 @@ body: |
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
; GFX10-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit [[V_MAD_I64_I32_e64_1]]
+ ;
; GFX11-LABEL: name: mad_i64_i32_vvv
; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX11-NEXT: {{ $}}
@@ -59,6 +71,15 @@ body: |
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]]
+ ;
+ ; GFX12-LABEL: name: mad_i64_i32_vvv
+ ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3
+ ; GFX12-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec
+ ; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit [[V_MAD_I64_I32_e64_1]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s32) = COPY $vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index e3d2ecefbda30d..b63a50dc4e6e7a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -520,13 +520,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -553,12 +552,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index b4c8da44337ae5..42bef4faf8a49b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -572,13 +572,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB1_2:
; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0
; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[0:1]
; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -610,13 +609,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB1_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3]
; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1671,12 +1669,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1264-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1712,12 +1709,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1232-NEXT: v_mul_lo_u32 v3, s1, v2
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_add_nc_u32_e32 v1, v3, v1
+; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3608,16 +3604,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: .LBB10_2:
; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1264-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1264-NEXT: v_mul_lo_u32 v5, s1, v2
; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX1264-NEXT: s_wait_alu 0xfff
+; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: v_mov_b32_e32 v1, v4
; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
@@ -3652,16 +3648,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: .LBB10_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1232-NEXT: v_mul_lo_u32 v5, s1, v2
; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1232-NEXT: v_add_nc_u32_e32 v1, v4, v5
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-NEXT: v_mov_b32_e32 v1, v4
; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index 79f8b3a1d5d84c..280b3c13f410db 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -519,13 +519,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -552,12 +551,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index edf6fbadf1a60a..f2eea9d2218417 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -535,13 +535,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_waitcnt vmcnt(0)
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -568,12 +567,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_waitcnt vmcnt(0)
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index 249acec639540b..9afe3e0f97551f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -3,6 +3,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s
define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; SI-LABEL: umulo_i64_v_v:
@@ -97,6 +98,32 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: umulo_i64_v_v:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v4, v3, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v5, v3, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mov_b32_e32 v4, v1
+; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6
+; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo
+; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
+; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX12-NEXT: s_setpc_b64 s[30:31]
bb:
%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
ret { i64, i1 } %umulo
@@ -248,6 +275,47 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) {
; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3]
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: smulo_i64_v_v:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v4, v3, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0
+; GFX12-NEXT: v_mad_co_i64_i32 v[10:11], null, v5, v3, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mov_b32_e32 v12, v1
+; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v6
+; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
+; GFX12-NEXT: v_add_co_u...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/77927
More information about the llvm-commits mailing list