[llvm] [CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf (PR #81572)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 12 21:34:59 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-backend-aarch64
Author: Yingwei Zheng (dtcxzyw)
<details>
<summary>Changes</summary>
In commit https://github.com/llvm/llvm-project/commit/2b582440c16c72b6b021ea5c212ceda3bdfb2b9b, we canonicalize the isInf/isNanOrInf idiom into fabs+fcmp for better analysis/codegen (See also the discussion in https://github.com/llvm/llvm-project/pull/76338).
This patch reverses the fabs+fcmp to `is.fpclass`. If the `is.fpclass` is not supported by the target, it will be expanded by TLI.
Fixes the regression introduced by https://github.com/llvm/llvm-project/commit/2b582440c16c72b6b021ea5c212ceda3bdfb2b9b and https://github.com/llvm/llvm-project/pull/80414#issuecomment-1936374206.
It is an alternative to #81404.
---
Patch is 66.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81572.diff
7 Files Affected:
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+27)
- (modified) llvm/test/CodeGen/AArch64/isinf.ll (+7-15)
- (modified) llvm/test/CodeGen/AMDGPU/fp-classify.ll (+114-82)
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+130-129)
- (added) llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll (+134)
- (added) llvm/test/Transforms/CodeGenPrepare/RISCV/fpclass-test.ll (+134)
- (added) llvm/test/Transforms/CodeGenPrepare/X86/fpclass-test.ll (+178)
``````````diff
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 09c4922d8822cc..33e4aedb392d75 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1943,6 +1943,30 @@ static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {
return false;
}
+static bool foldFCmpToFPClassTest(CmpInst *Cmp) {
+ FCmpInst *FCmp = dyn_cast<FCmpInst>(Cmp);
+ if (!FCmp)
+ return false;
+
+ // Reverse the canonicalization if it is a FP class test
+ auto ShouldReverseTransform = [](FPClassTest ClassTest) {
+ return ClassTest == fcInf || ClassTest == (fcInf | fcNan);
+ };
+ auto [ClassVal, ClassTest] =
+ fcmpToClassTest(FCmp->getPredicate(), *FCmp->getParent()->getParent(),
+ FCmp->getOperand(0), FCmp->getOperand(1));
+ if (ClassVal && (ShouldReverseTransform(ClassTest) ||
+ ShouldReverseTransform(~ClassTest))) {
+ IRBuilder<> Builder(Cmp);
+ Value *IsFPClass = Builder.createIsFPClass(ClassVal, ClassTest);
+ Cmp->replaceAllUsesWith(IsFPClass);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ return false;
+}
+
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
@@ -1959,6 +1983,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (swapICmpOperandsToExposeCSEOpportunities(Cmp))
return true;
+ if (foldFCmpToFPClassTest(Cmp))
+ return true;
+
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll
index 458bd7eeba16cf..834417b98743a8 100644
--- a/llvm/test/CodeGen/AArch64/isinf.ll
+++ b/llvm/test/CodeGen/AArch64/isinf.ll
@@ -58,22 +58,14 @@ define i32 @replace_isinf_call_f64(double %x) {
define i32 @replace_isinf_call_f128(fp128 %x) {
; CHECK-LABEL: replace_isinf_call_f128:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: ldrb w8, [sp, #15]
-; CHECK-NEXT: and w8, w8, #0x7f
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: adrp x8, .LCPI3_0
-; CHECK-NEXT: ldr q0, [sp]
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: bl __eqtf2
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #-16]!
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldp x9, x8, [sp], #16
+; CHECK-NEXT: and x8, x8, #0x7fffffffffffffff
+; CHECK-NEXT: eor x8, x8, #0x7fff000000000000
+; CHECK-NEXT: orr x8, x9, x8
+; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w0, eq
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
%abs = tail call fp128 @llvm.fabs.f128(fp128 %x)
%cmpinf = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 6fa7df913812a3..ed9ce4d62383b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -61,10 +61,10 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT: v_mov_b32_e32 v0, 0x207
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s0|, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -72,11 +72,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT: v_mov_b32_e32 v0, 0x207
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -88,7 +88,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x207
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -143,25 +143,29 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
; SI-LABEL: test_isfinite_pattern_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x1f8
+; SI-NEXT: v_mov_b32_e32 v0, 0x1fb
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT: v_mov_b32_e32 v0, 0x1fb
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -173,8 +177,10 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
+; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1fb
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -349,13 +355,13 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT: v_mov_b32_e32 v0, 0x1fb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2
-; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0
-; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s3, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -363,11 +369,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; VI-LABEL: test_isfinite_not_pattern_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT: v_mov_b32_e32 v0, 0x1fb
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2
-; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0
-; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s3, v0
+; VI-NEXT: s_and_b64 s[2:3], s[4:5], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -380,7 +386,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
+; GFX11-NEXT: v_cmp_class_f32_e64 s3, s3, 0x1fb
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s2, s3
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
@@ -405,11 +411,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT: v_mov_b32_e32 v0, 0x1fb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_u_f32_e64 s[0:1], s2, s2
-; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s2|, v0
-; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -418,11 +424,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT: v_mov_b32_e32 v0, 0x1fb
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4
-; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s4|, v0
-; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -437,7 +443,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1fb
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
@@ -458,25 +464,29 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
; SI-LABEL: test_isfinite_pattern_4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x1f8
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -488,8 +498,10 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -508,25 +520,29 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
; SI-LABEL: test_isfinite_pattern_4_commute_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x1f8
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_4_commute_and:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], vcc, s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -538,8 +554,10 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s2, s3
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -618,16 +636,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isinf_pattern_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s1, 0x7f800000
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
-; SI-NEXT: v_cmp_eq_f32_e32 vcc, s1, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: s_cmpk_eq_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isinf_pattern_f16:
@@ -667,27 +685,32 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isfinite_pattern_0_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_movk_i32 s1, 0x1f8
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT: v_mov_b32_e32 v0, 0x1fb
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
+; VI-NEXT: v_cmp_o_f16_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -699,8 +722,10 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_o_f16_e64 s3, s2, s2
+; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1fb
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -718,27 +743,32 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isfinite_pattern_4_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_movk_i32 s1, 0x1f8
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT: s_cmpk_lt_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/81572
More information about the llvm-commits
mailing list