[llvm] [AMDGPU] fmuladd.f32.ll - clean up prefixes and regenerate checks (PR #151832)

via llvm-commits llvm-commits at lists.llvm.org
Sat Aug 2 10:39:33 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>

Automate the fmuladd.f32.ll test checks, as manually updating them while working on the topological DAG patches was doing my head in.

---

Patch is 147.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151832.diff


1 File Affected:

- (modified) llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll (+2428-323) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
index ceacdf5e254aa..ab8d902fb862b 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -1,45 +1,171 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-FASTFMA,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-SLOWFMA,SI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde  -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-FASTFMA,SI-DENORM-FASTFMA-STRICT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde  -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-SLOWFMA %s
 
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde  -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-FASTFMA,SI-DENORM-FASTFMA-CONTRACT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde  -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-SLOWFMA %s
 
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FLUSH,GFX9-FLUSH-MAD %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DENORM,GFX9-DENORM-FASTFMA-MAD %s
 
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FMAC,GFX9-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FMAC,GFX9-DENORM %s
 
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
-
-; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
-; XUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
-
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10-DENORM %s
 
 ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
 
 target triple = "amdgcn--"
 
-
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @llvm.fmuladd.f32(float, float, float) #1
 declare half @llvm.fmuladd.f16(half, half, half) #1
 declare float @llvm.fabs.f32(float) #1
 
-; GCN-LABEL: {{^}}fmuladd_f32:
-; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
-                         ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+; SI-FLUSH-LABEL: fmuladd_f32:
+; SI-FLUSH:       ; %bb.0:
+; SI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-FLUSH-NEXT:    s_mov_b32 s11, 0xf000
+; SI-FLUSH-NEXT:    s_mov_b32 s10, -1
+; SI-FLUSH-NEXT:    s_mov_b32 s14, s10
+; SI-FLUSH-NEXT:    s_mov_b32 s15, s11
+; SI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT:    s_mov_b32 s12, s2
+; SI-FLUSH-NEXT:    s_mov_b32 s13, s3
+; SI-FLUSH-NEXT:    s_mov_b32 s16, s4
+; SI-FLUSH-NEXT:    s_mov_b32 s17, s5
+; SI-FLUSH-NEXT:    s_mov_b32 s18, s10
+; SI-FLUSH-NEXT:    s_mov_b32 s19, s11
+; SI-FLUSH-NEXT:    s_mov_b32 s4, s6
+; SI-FLUSH-NEXT:    s_mov_b32 s5, s7
+; SI-FLUSH-NEXT:    s_mov_b32 s6, s10
+; SI-FLUSH-NEXT:    s_mov_b32 s7, s11
+; SI-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; SI-FLUSH-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; SI-FLUSH-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-FLUSH-NEXT:    s_mov_b32 s8, s0
+; SI-FLUSH-NEXT:    s_mov_b32 s9, s1
+; SI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
+; SI-FLUSH-NEXT:    buffer_store_dword v2, off, s[8:11], 0
+; SI-FLUSH-NEXT:    s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmuladd_f32:
+; SI-DENORM-FASTFMA:       ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s11, 0xf000
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s10, -1
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s14, s10
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s15, s11
+; SI-DENORM-FASTFMA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s12, s2
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s13, s3
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s16, s4
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s17, s5
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s18, s10
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s19, s11
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s4, s6
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s5, s7
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s6, s10
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s7, s11
+; SI-DENORM-FASTFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; SI-DENORM-FASTFMA-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; SI-DENORM-FASTFMA-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s8, s0
+; SI-DENORM-FASTFMA-NEXT:    s_mov_b32 s9, s1
+; SI-DENORM-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v2
+; SI-DENORM-FASTFMA-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-FASTFMA-NEXT:    s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmuladd_f32:
+; SI-DENORM-SLOWFMA:       ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s11, 0xf000
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s10, -1
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s14, s10
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s15, s11
+; SI-DENORM-SLOWFMA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s12, s2
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s13, s3
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s16, s4
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s17, s5
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s18, s10
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s19, s11
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s4, s6
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s5, s7
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s6, s10
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s7, s11
+; SI-DENORM-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; SI-DENORM-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; SI-DENORM-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s8, s0
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s9, s1
+; SI-DENORM-SLOWFMA-NEXT:    s_waitcnt vmcnt(1)
+; SI-DENORM-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-DENORM-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; SI-DENORM-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-SLOWFMA-NEXT:    s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmuladd_f32:
+; GFX9-FLUSH-MAD:       ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-FLUSH-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT:    global_load_dword v1, v0, s[10:11]
+; GFX9-FLUSH-MAD-NEXT:    global_load_dword v2, v0, s[12:13]
+; GFX9-FLUSH-MAD-NEXT:    global_load_dword v3, v0, s[14:15]
+; GFX9-FLUSH-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT:    v_mac_f32_e32 v3, v1, v2
+; GFX9-FLUSH-MAD-NEXT:    global_store_dword v0, v3, s[8:9]
+; GFX9-FLUSH-MAD-NEXT:    s_endpgm
+;
+; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_f32:
+; GFX9-DENORM-FASTFMA-MAD:       ; %bb.0:
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    global_load_dword v1, v0, s[10:11]
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    global_load_dword v2, v0, s[12:13]
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    global_load_dword v3, v0, s[14:15]
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    v_fma_f32 v1, v1, v2, v3
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX9-DENORM-FASTFMA-MAD-NEXT:    s_endpgm
+;
+; GFX9-FMAC-LABEL: fmuladd_f32:
+; GFX9-FMAC:       ; %bb.0:
+; GFX9-FMAC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-FMAC-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-FMAC-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT:    global_load_dword v1, v0, s[10:11]
+; GFX9-FMAC-NEXT:    global_load_dword v2, v0, s[12:13]
+; GFX9-FMAC-NEXT:    global_load_dword v3, v0, s[14:15]
+; GFX9-FMAC-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT:    v_fmac_f32_e32 v3, v1, v2
+; GFX9-FMAC-NEXT:    global_store_dword v0, v3, s[8:9]
+; GFX9-FMAC-NEXT:    s_endpgm
+;
+; GFX10-LABEL: fmuladd_f32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_clause 0x2
+; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT:    global_load_dword v2, v0, s[4:5]
+; GFX10-NEXT:    global_load_dword v3, v0, s[6:7]
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_fmac_f32_e32 v3, v1, v2
+; GFX10-NEXT:    global_store_dword v0, v3, s[0:1]
+; GFX10-NEXT:    s_endpgm
   %r0 = load float, ptr addrspace(1) %in1
   %r1 = load float, ptr addrspace(1) %in2
   %r2 = load float, ptr addrspace(1) %in3
@@ -48,18 +174,190 @@ define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %
   ret void
 }
 
-; GCN-LABEL: {{^}}fmul_fadd_f32:
-; GCN-FLUSH: v_mac_f32
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
-
-; GCN-DENORM-STRICT: v_mul_f32_e32
-; GCN-DENORM-STRICT: v_add_f32_e32
-define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
-                           ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+; SI-FLUSH-LABEL: fmul_fadd_f32:
+; SI-FLUSH:       ; %bb.0:
+; SI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-FLUSH-NEXT:    s_mov_b32 s11, 0xf000
+; SI-FLUSH-NEXT:    s_mov_b32 s10, -1
+; SI-FLUSH-NEXT:    s_mov_b32 s14, s10
+; SI-FLUSH-NEXT:    s_mov_b32 s15, s11
+; SI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT:    s_mov_b32 s12, s2
+; SI-FLUSH-NEXT:    s_mov_b32 s13, s3
+; SI-FLUSH-NEXT:    s_mov_b32 s16, s4
+; SI-FLUSH-NEXT:    s_mov_b32 s17, s5
+; SI-FLUSH-NEXT:    s_mov_b32 s18, s10
+; SI-FLUSH-NEXT:    s_mov_b32 s19, s11
+; SI-FLUSH-NEXT:    s_mov_b32 s4, s6
+; SI-FLUSH-NEXT:    s_mov_b32 s5, s7
+; SI-FLUSH-NEXT:    s_mov_b32 s6, s10
+; SI-FLUSH-NEXT:    s_mov_b32 s7, s11
+; SI-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT:    s_mov_b32 s8, s0
+; SI-FLUSH-NEXT:    s_mov_b32 s9, s1
+; SI-FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
+; SI-FLUSH-NEXT:    buffer_store_dword v2, off, s[8:11], 0
+; SI-FLUSH-NEXT:    s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: fmul_fadd_f32:
+; SI-DENORM-FASTFMA-STRICT:       ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s11, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s10, -1
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s14, s10
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s15, s11
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s12, s2
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s13, s3
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s16, s4
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s17, s5
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s18, s10
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s19, s11
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s4, s6
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s5, s7
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s6, s10
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s7, s11
+; SI-DENORM-FASTFMA-STRICT-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s8, s0
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_mov_b32 s9, s1
+; SI-DENORM-FASTFMA-STRICT-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-DENORM-FASTFMA-STRICT-NEXT:    v_add_f32_e32 v0, v0, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-FASTFMA-STRICT-NEXT:    s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmul_fadd_f32:
+; SI-DENORM-SLOWFMA:       ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s11, 0xf000
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s10, -1
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s14, s10
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s15, s11
+; SI-DENORM-SLOWFMA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s12, s2
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s13, s3
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s16, s4
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s17, s5
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s18, s10
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s19, s11
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s4, s6
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s5, s7
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s6, s10
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s7, s11
+; SI-DENORM-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-DENORM-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-DENORM-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-DENORM-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s8, s0
+; SI-DENORM-SLOWFMA-NEXT:    s_mov_b32 s9, s1
+; SI-DENORM-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-DENORM-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; SI-DENORM-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-SLOWFMA-NEXT:    s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: fmul_fadd_f32:
+; SI-DENORM-FASTFMA-CONTRACT:       ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s11, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s10, -1
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s14, s10
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s15, s11
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s12, s2
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s13, s3
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s16, s4
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s17, s5
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s18, s10
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s19, s11
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s4, s6
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s5, s7
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s6, s10
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_mov_b32 s7, s11
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRA...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/151832


More information about the llvm-commits mailing list