[llvm] [AMDGPU] Update uses of new VOP2 pseudos for GFX12 (PR #78155)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 15 04:39:35 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
- Add some GFX12 test coverage
- [AMDGPU] Update uses of new VOP2 pseudos for GFX12
---
The patch is 98.88 KiB and has been truncated to 20.00 KiB below; the full version is available at: https://github.com/llvm/llvm-project/pull/78155.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+9-5)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+12-1)
- (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+640-308)
- (modified) llvm/test/CodeGen/AMDGPU/omod.ll (+303-187)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index aa7639a0f18665..2862a7787e75a3 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1498,6 +1498,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F16_t16_e64:
case AMDGPU::V_MAX_F16_fake16_e64:
case AMDGPU::V_MAX_F64_e64:
+ case AMDGPU::V_MAX_NUM_F64_e64:
case AMDGPU::V_PK_MAX_F16: {
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
return nullptr;
@@ -1567,7 +1568,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
static int getOModValue(unsigned Opc, int64_t Val) {
switch (Opc) {
- case AMDGPU::V_MUL_F64_e64: {
+ case AMDGPU::V_MUL_F64_e64:
+ case AMDGPU::V_MUL_F64_pseudo_e64: {
switch (Val) {
case 0x3fe0000000000000: // 0.5
return SIOutMods::DIV2;
@@ -1618,6 +1620,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
unsigned Op = MI.getOpcode();
switch (Op) {
case AMDGPU::V_MUL_F64_e64:
+ case AMDGPU::V_MUL_F64_pseudo_e64:
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_t16_e64:
case AMDGPU::V_MUL_F16_fake16_e64:
@@ -1625,8 +1628,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_MUL_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
- ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
- Op == AMDGPU::V_MUL_F16_t16_e64 ||
+ ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
+ Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
Op == AMDGPU::V_MUL_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
@@ -1655,6 +1658,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
return std::pair(RegOp, OMod);
}
case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_ADD_F64_pseudo_e64:
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64:
case AMDGPU::V_ADD_F16_t16_e64:
@@ -1662,8 +1666,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_ADD_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
- ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
- Op == AMDGPU::V_ADD_F16_t16_e64 ||
+ ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
+ Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
Op == AMDGPU::V_ADD_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b4bd46d33c1f10..ffa7952888c355 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1862,7 +1862,10 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat <
>;
def : ClampPat<V_MAX_F32_e64, f32>;
+let SubtargetPredicate = isNotGFX12Plus in
def : ClampPat<V_MAX_F64_e64, f64>;
+let SubtargetPredicate = isGFX12Plus in
+def : ClampPat<V_MAX_NUM_F64_e64, f64>;
let SubtargetPredicate = NotHasTrue16BitInsts in
def : ClampPat<V_MAX_F16_e64, f16>;
let SubtargetPredicate = UseRealTrue16Insts in
@@ -2990,10 +2993,12 @@ def : GCNPat<
}
// TODO: Handle fneg like other types.
+let SubtargetPredicate = isNotGFX12Plus in {
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
>;
+}
} // End AddedComplexity = -5
multiclass SelectCanonicalizeAsMax<
@@ -3009,7 +3014,13 @@ multiclass SelectCanonicalizeAsMax<
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
- let OtherPredicates = f64_preds;
+ let OtherPredicates = !listconcat(f64_preds, [isNotGFX12Plus]);
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MAX_NUM_F64_e64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = !listconcat(f64_preds, [isGFX12Plus]);
}
def : GCNPat<
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index b95231fd8880f5..92660b9a646ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -857,7 +857,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
+; GFX12-NEXT: v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -938,7 +938,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
+; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1020,7 +1020,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
+; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 41b436473f6521..fc4bc7595da5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11PLUS,GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11PLUS,GFX12 %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
@@ -55,6 +56,19 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, ptr addrspace(1) %out
@@ -106,6 +120,17 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_test_canonicalize_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -147,6 +172,19 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fabs_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1|
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fabs = call float @llvm.fabs.f32(float %val)
%canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
@@ -190,6 +228,19 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1|
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fabs = call float @llvm.fabs.f32(float %val)
%val.fabs.fneg = fneg float %val.fabs
@@ -234,6 +285,19 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fneg_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fneg = fneg float %val
%canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
@@ -260,15 +324,15 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_undef_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_undef_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float undef)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -293,15 +357,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_p0_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_p0_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 0.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -327,16 +391,16 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_n0_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_n0_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float -0.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -362,15 +426,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_p1_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_p1_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 1.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -396,15 +460,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_n1_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_n1_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float -1.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -430,15 +494,15 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_literal_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_literal_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 16.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -463,15 +527,15 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -509,6 +573,17 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -546,6 +621,17 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/78155
More information about the llvm-commits mailing list