[llvm] [AMDGPU] fmuladd.f32.ll - clean up prefixes and regenerate checks (PR #151832)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Aug 2 13:11:53 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/151832
>From 035728f3e82726e8f6c57cc55eb55fa3b2d6cb2f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Sat, 2 Aug 2025 18:38:07 +0100
Subject: [PATCH] [AMDGPU] fmuladd.f32.ll - clean up prefixes and regenerate
checks
Automate the fmuladd.f32.ll test checks with utils/update_llc_test_checks.py, as manually fixing up the checks while working on the topological DAG patches was doing my head in.
---
llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll | 2751 ++++++++++++++++++++---
1 file changed, 2428 insertions(+), 323 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
index ceacdf5e254aa..ab8d902fb862b 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -1,45 +1,171 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-FASTFMA,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-SLOWFMA,SI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-FASTFMA,SI-DENORM-FASTFMA-STRICT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-SLOWFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-FASTFMA,SI-DENORM-FASTFMA-CONTRACT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-SLOWFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FLUSH,GFX9-FLUSH-MAD %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DENORM,GFX9-DENORM-FASTFMA-MAD %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FMAC,GFX9-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FMAC,GFX9-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
-
-; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
-; XUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s
-
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10-DENORM %s
; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
target triple = "amdgcn--"
-
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fmuladd.f32(float, float, float) #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare float @llvm.fabs.f32(float) #1
-; GCN-LABEL: {{^}}fmuladd_f32:
-; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
- ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+; SI-FLUSH-LABEL: fmuladd_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s10, -1
+; SI-FLUSH-NEXT: s_mov_b32 s14, s10
+; SI-FLUSH-NEXT: s_mov_b32 s15, s11
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b32 s12, s2
+; SI-FLUSH-NEXT: s_mov_b32 s13, s3
+; SI-FLUSH-NEXT: s_mov_b32 s16, s4
+; SI-FLUSH-NEXT: s_mov_b32 s17, s5
+; SI-FLUSH-NEXT: s_mov_b32 s18, s10
+; SI-FLUSH-NEXT: s_mov_b32 s19, s11
+; SI-FLUSH-NEXT: s_mov_b32 s4, s6
+; SI-FLUSH-NEXT: s_mov_b32 s5, s7
+; SI-FLUSH-NEXT: s_mov_b32 s6, s10
+; SI-FLUSH-NEXT: s_mov_b32 s7, s11
+; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-FLUSH-NEXT: s_mov_b32 s8, s0
+; SI-FLUSH-NEXT: s_mov_b32 s9, s1
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
+; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmuladd_f32:
+; SI-DENORM-FASTFMA: ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s11, 0xf000
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s10, -1
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s14, s10
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s15, s11
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s12, s2
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s13, s3
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s16, s4
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s17, s5
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s18, s10
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s19, s11
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s4, s6
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s5, s7
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s6, s10
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s7, s11
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s8, s0
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s9, s1
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2
+; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-FASTFMA-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmuladd_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(1)
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmuladd_f32:
+; GFX9-FLUSH-MAD: ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[10:11]
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[12:13]
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v3, v0, s[14:15]
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v3, v1, v2
+; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v3, s[8:9]
+; GFX9-FLUSH-MAD-NEXT: s_endpgm
+;
+; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_f32:
+; GFX9-DENORM-FASTFMA-MAD: ; %bb.0:
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[10:11]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[12:13]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v3, v0, s[14:15]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, v2, v3
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm
+;
+; GFX9-FMAC-LABEL: fmuladd_f32:
+; GFX9-FMAC: ; %bb.0:
+; GFX9-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-FMAC-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-FMAC-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v1, v0, s[10:11]
+; GFX9-FMAC-NEXT: global_load_dword v2, v0, s[12:13]
+; GFX9-FMAC-NEXT: global_load_dword v3, v0, s[14:15]
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2
+; GFX9-FMAC-NEXT: global_store_dword v0, v3, s[8:9]
+; GFX9-FMAC-NEXT: s_endpgm
+;
+; GFX10-LABEL: fmuladd_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX10-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v2
+; GFX10-NEXT: global_store_dword v0, v3, s[0:1]
+; GFX10-NEXT: s_endpgm
%r0 = load float, ptr addrspace(1) %in1
%r1 = load float, ptr addrspace(1) %in2
%r2 = load float, ptr addrspace(1) %in3
@@ -48,18 +174,190 @@ define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %
ret void
}
-; GCN-LABEL: {{^}}fmul_fadd_f32:
-; GCN-FLUSH: v_mac_f32
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
-
-; GCN-DENORM-STRICT: v_mul_f32_e32
-; GCN-DENORM-STRICT: v_add_f32_e32
-define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
- ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+; SI-FLUSH-LABEL: fmul_fadd_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s10, -1
+; SI-FLUSH-NEXT: s_mov_b32 s14, s10
+; SI-FLUSH-NEXT: s_mov_b32 s15, s11
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b32 s12, s2
+; SI-FLUSH-NEXT: s_mov_b32 s13, s3
+; SI-FLUSH-NEXT: s_mov_b32 s16, s4
+; SI-FLUSH-NEXT: s_mov_b32 s17, s5
+; SI-FLUSH-NEXT: s_mov_b32 s18, s10
+; SI-FLUSH-NEXT: s_mov_b32 s19, s11
+; SI-FLUSH-NEXT: s_mov_b32 s4, s6
+; SI-FLUSH-NEXT: s_mov_b32 s5, s7
+; SI-FLUSH-NEXT: s_mov_b32 s6, s10
+; SI-FLUSH-NEXT: s_mov_b32 s7, s11
+; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b32 s8, s0
+; SI-FLUSH-NEXT: s_mov_b32 s9, s1
+; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
+; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: fmul_fadd_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s11, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s10, -1
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s14, s10
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s15, s11
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s12, s2
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s13, s3
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s16, s4
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s17, s5
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s18, s10
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s19, s11
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s4, s6
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s5, s7
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, s10
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, s11
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s8, s0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s9, s1
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v0, v0, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmul_fadd_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: fmul_fadd_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s11, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s10, -1
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s14, s10
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s15, s11
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s12, s2
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s13, s3
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s16, s4
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s17, s5
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s18, s10
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s19, s11
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s4, s6
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s5, s7
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, s10
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, s11
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s8, s0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s9, s1
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: fmul_fadd_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2
+; GFX9-FLUSH-NEXT: global_store_dword v0, v3, s[8:9]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: fmul_fadd_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmul_fadd_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2
+; GFX10-FLUSH-NEXT: global_store_dword v0, v3, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: fmul_fadd_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%r0 = load volatile float, ptr addrspace(1) %in1
%r1 = load volatile float, ptr addrspace(1) %in2
%r2 = load volatile float, ptr addrspace(1) %in3
@@ -69,15 +367,157 @@ define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1)
ret void
}
-; GCN-LABEL: {{^}}fmul_fadd_contract_f32:
-; GCN-FLUSH-FMAC: v_fmac_f32_e32
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32
-
-; GCN-DENORM-FASTFMA: v_fma_f32
-define amdgpu_kernel void @fmul_fadd_contract_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1,
- ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+define amdgpu_kernel void @fmul_fadd_contract_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
+; SI-FLUSH-LABEL: fmul_fadd_contract_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s10, -1
+; SI-FLUSH-NEXT: s_mov_b32 s14, s10
+; SI-FLUSH-NEXT: s_mov_b32 s15, s11
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b32 s12, s2
+; SI-FLUSH-NEXT: s_mov_b32 s13, s3
+; SI-FLUSH-NEXT: s_mov_b32 s16, s4
+; SI-FLUSH-NEXT: s_mov_b32 s17, s5
+; SI-FLUSH-NEXT: s_mov_b32 s18, s10
+; SI-FLUSH-NEXT: s_mov_b32 s19, s11
+; SI-FLUSH-NEXT: s_mov_b32 s4, s6
+; SI-FLUSH-NEXT: s_mov_b32 s5, s7
+; SI-FLUSH-NEXT: s_mov_b32 s6, s10
+; SI-FLUSH-NEXT: s_mov_b32 s7, s11
+; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b32 s8, s0
+; SI-FLUSH-NEXT: s_mov_b32 s9, s1
+; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1
+; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmul_fadd_contract_f32:
+; SI-DENORM-FASTFMA: ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s11, 0xf000
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s10, -1
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s14, s10
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s15, s11
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s12, s2
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s13, s3
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s16, s4
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s17, s5
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s18, s10
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s19, s11
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s4, s6
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s5, s7
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s6, s10
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s7, s11
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s8, s0
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s9, s1
+; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2
+; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-FASTFMA-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmul_fadd_contract_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmul_fadd_contract_f32:
+; GFX9-FLUSH-MAD: ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v3, v1, v2
+; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v3, s[8:9]
+; GFX9-FLUSH-MAD-NEXT: s_endpgm
+;
+; GFX9-DENORM-FASTFMA-MAD-LABEL: fmul_fadd_contract_f32:
+; GFX9-DENORM-FASTFMA-MAD: ; %bb.0:
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, v2, v3
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm
+;
+; GFX9-FMAC-LABEL: fmul_fadd_contract_f32:
+; GFX9-FMAC: ; %bb.0:
+; GFX9-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX9-FMAC-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-FMAC-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v1, v0, s[10:11] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v2, v0, s[12:13] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v3, v0, s[14:15] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2
+; GFX9-FMAC-NEXT: global_store_dword v0, v3, s[8:9]
+; GFX9-FMAC-NEXT: s_endpgm
+;
+; GFX10-LABEL: fmul_fadd_contract_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v2
+; GFX10-NEXT: global_store_dword v0, v3, s[0:1]
+; GFX10-NEXT: s_endpgm
%r0 = load volatile float, ptr addrspace(1) %in1
%r1 = load volatile float, ptr addrspace(1) %in2
%r2 = load volatile float, ptr addrspace(1) %in3
@@ -87,23 +527,107 @@ define amdgpu_kernel void @fmul_fadd_contract_f32(ptr addrspace(1) %out, ptr add
ret void
}
-; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-
-; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
-; SI-FLUSH: buffer_store_dword [[R2]]
-; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-FLUSH-LABEL: fmuladd_2.0_a_b_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2
+; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_a_b_f32:
+; SI-DENORM-FASTFMA: ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3
+; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_a_b_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_a_b_f32:
+; GFX9-FLUSH-MAD: ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1
+; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-MAD-NEXT: s_endpgm
+;
+; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_2.0_a_b_f32:
+; GFX9-DENORM-FASTFMA-MAD: ; %bb.0:
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm
+;
+; GFX9-FMAC-LABEL: fmuladd_2.0_a_b_f32:
+; GFX9-FMAC: ; %bb.0:
+; GFX9-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FMAC-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1
+; GFX9-FMAC-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FMAC-NEXT: s_endpgm
+;
+; GFX10-LABEL: fmuladd_2.0_a_b_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1
+; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -117,24 +641,107 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-
-; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI-FLUSH: buffer_store_dword [[R2]]
-; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-FLUSH-LABEL: fmuladd_a_2.0_b_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2
+; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmuladd_a_2.0_b_f32:
+; SI-DENORM-FASTFMA: ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3
+; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmuladd_a_2.0_b_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmuladd_a_2.0_b_f32:
+; GFX9-FLUSH-MAD: ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1
+; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-MAD-NEXT: s_endpgm
+;
+; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_a_2.0_b_f32:
+; GFX9-DENORM-FASTFMA-MAD: ; %bb.0:
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm
+;
+; GFX9-FMAC-LABEL: fmuladd_a_2.0_b_f32:
+; GFX9-FMAC: ; %bb.0:
+; GFX9-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FMAC-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1
+; GFX9-FMAC-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FMAC-NEXT: s_endpgm
+;
+; GFX10-LABEL: fmuladd_a_2.0_b_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1
+; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -148,28 +755,126 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}fadd_a_a_b_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-
-; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI-FLUSH: buffer_store_dword [[R2]]
-; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out,
- ptr addrspace(1) %in1,
- ptr addrspace(1) %in2) #0 {
+define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
+; SI-FLUSH-LABEL: fadd_a_a_b_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2
+; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: fadd_a_a_b_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fadd_a_a_b_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: fadd_a_a_b_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, v3
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: fadd_a_a_b_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1
+; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: fadd_a_a_b_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: fadd_a_a_b_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1
+; GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: fadd_a_a_b_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -184,28 +889,126 @@ define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out,
ret void
}
-; GCN-LABEL: {{^}}fadd_b_a_a_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-
-; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI-FLUSH: buffer_store_dword [[R2]]
-; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-
-; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out,
- ptr addrspace(1) %in1,
- ptr addrspace(1) %in2) #0 {
+define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 {
+; SI-FLUSH-LABEL: fadd_b_a_a_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2
+; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: fadd_b_a_a_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v3, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fadd_b_a_a_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v3, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: fadd_b_a_a_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, v3
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: fadd_b_a_a_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1
+; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: fadd_b_a_a_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: fadd_b_a_a_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1
+; GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: fadd_b_a_a_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v2, v1
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -220,20 +1023,107 @@ define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out,
ret void
}
-; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
-; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]
-
-; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
-
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v3, -2.0, v2
+; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmuladd_neg_2.0_a_b_f32:
+; SI-DENORM-FASTFMA: ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, -2.0, v3
+; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmuladd_neg_2.0_a_b_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmuladd_neg_2.0_a_b_f32:
+; GFX9-FLUSH-MAD: ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, -2.0, v1
+; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-MAD-NEXT: s_endpgm
+;
+; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_neg_2.0_a_b_f32:
+; GFX9-DENORM-FASTFMA-MAD: ; %bb.0:
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, -2.0, v2
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm
+;
+; GFX9-FMAC-LABEL: fmuladd_neg_2.0_a_b_f32:
+; GFX9-FMAC: ; %bb.0:
+; GFX9-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FMAC-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1
+; GFX9-FMAC-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FMAC-NEXT: s_endpgm
+;
+; GFX10-LABEL: fmuladd_neg_2.0_a_b_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v2, -2.0, v1
+; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -247,25 +1137,107 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(ptr addrspace(1) %out, ptr ad
ret void
}
-; XXX
-; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-
-; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]]
-; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]]
-
-; SI-FLUSH: buffer_store_dword [[R2]]
-; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2
+; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmuladd_neg_2.0_neg_a_b_f32:
+; SI-DENORM-FASTFMA: ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3
+; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmuladd_neg_2.0_neg_a_b_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v3, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmuladd_neg_2.0_neg_a_b_f32:
+; GFX9-FLUSH-MAD: ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1
+; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-MAD-NEXT: s_endpgm
+;
+; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_neg_2.0_neg_a_b_f32:
+; GFX9-DENORM-FASTFMA-MAD: ; %bb.0:
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm
+;
+; GFX9-FMAC-LABEL: fmuladd_neg_2.0_neg_a_b_f32:
+; GFX9-FMAC: ; %bb.0:
+; GFX9-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FMAC-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1
+; GFX9-FMAC-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FMAC-NEXT: s_endpgm
+;
+; GFX10-LABEL: fmuladd_neg_2.0_neg_a_b_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1
+; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -281,24 +1253,107 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, pt
ret void
}
-; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-
-; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
-; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]]
-
-; SI-FLUSH: buffer_store_dword [[R2]]
-; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
-
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v3, -2.0, v2
+; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_neg_a_b_f32:
+; SI-DENORM-FASTFMA: ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, -2.0, v3
+; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_neg_a_b_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_neg_a_b_f32:
+; GFX9-FLUSH-MAD: ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, -2.0, v1
+; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-MAD-NEXT: s_endpgm
+;
+; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_2.0_neg_a_b_f32:
+; GFX9-DENORM-FASTFMA-MAD: ; %bb.0:
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, -2.0, v2
+; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm
+;
+; GFX9-FMAC-LABEL: fmuladd_2.0_neg_a_b_f32:
+; GFX9-FMAC: ; %bb.0:
+; GFX9-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FMAC-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1
+; GFX9-FMAC-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FMAC-NEXT: s_endpgm
+;
+; GFX10-LABEL: fmuladd_2.0_neg_a_b_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fmac_f32_e32 v2, -2.0, v1
+; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -314,23 +1369,107 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad
ret void
}
-; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-
-; SI-FLUSH: buffer_store_dword [[RESULT]]
-; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-
-; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mad_f32 v2, v2, 2.0, -v3
+; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_a_neg_b_f32:
+; SI-DENORM-FASTFMA: ; %bb.0:
+; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, -v3
+; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_a_neg_b_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_a_neg_b_f32:
+; GFX9-FLUSH-MAD: ; %bb.0:
+; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-MAD-NEXT: v_mad_f32 v1, v1, 2.0, -v2
+; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-FLUSH-MAD-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: fmuladd_2.0_a_neg_b_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_fma_f32 v1, v1, 2.0, -v2
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX9-FMAC-LABEL: fmuladd_2.0_a_neg_b_f32:
+; GFX9-FMAC: ; %bb.0:
+; GFX9-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FMAC-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FMAC-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FMAC-NEXT: v_fma_f32 v1, v1, 2.0, -v2
+; GFX9-FMAC-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-FMAC-NEXT: s_endpgm
+;
+; GFX10-LABEL: fmuladd_2.0_a_neg_b_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_fma_f32 v1, v1, 2.0, -v2
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -346,23 +1485,150 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(ptr addrspace(1) %out, ptr ad
ret void
}
-; GCN-LABEL: {{^}}mad_sub_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
-; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 {
+; SI-FLUSH-LABEL: mad_sub_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s6, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-FLUSH-NEXT: v_mad_f32 v2, v2, v3, -v4
+; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v4
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: mad_sub_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v4
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, -v4
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: mad_sub_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -v3
+; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: mad_sub_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_sub_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -v3
+; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: mad_sub_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -380,24 +1646,150 @@ define amdgpu_kernel void @mad_sub_f32(ptr addrspace(1) noalias nocapture %out,
ret void
}
-; GCN-LABEL: {{^}}mad_sub_inv_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
-
-; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 {
+; SI-FLUSH-LABEL: mad_sub_inv_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s6, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-FLUSH-NEXT: v_mad_f32 v2, -v2, v3, v4
+; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_inv_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v4, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: mad_sub_inv_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v4, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_inv_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, -v2, v3, v4
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: mad_sub_inv_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, v3
+; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: mad_sub_inv_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v3, v1
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_sub_inv_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, v3
+; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: mad_sub_inv_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v3, v1
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -415,23 +1807,150 @@ define amdgpu_kernel void @mad_sub_inv_f32(ptr addrspace(1) noalias nocapture %o
ret void
}
-; GCN-LABEL: {{^}}mad_sub_fabs_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
-; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
-
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 {
+; SI-FLUSH-LABEL: mad_sub_fabs_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s6, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-FLUSH-NEXT: v_mad_f32 v2, v2, v3, -|v4|
+; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_fabs_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e64 v2, v2, |v4|
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: mad_sub_fabs_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e64 v2, v2, |v4|
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_fabs_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, -|v4|
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: mad_sub_fabs_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -|v3|
+; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: mad_sub_fabs_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX9-DENORM-NEXT: v_sub_f32_e64 v1, v1, |v3|
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_sub_fabs_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -|v3|
+; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: mad_sub_fabs_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX10-DENORM-NEXT: v_sub_f32_e64 v1, v1, |v3|
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -450,24 +1969,150 @@ define amdgpu_kernel void @mad_sub_fabs_f32(ptr addrspace(1) noalias nocapture %
ret void
}
-; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
-; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
-
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 {
+; SI-FLUSH-LABEL: mad_sub_fabs_inv_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s6, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-FLUSH-NEXT: v_mad_f32 v2, -v2, v3, |v4|
+; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_fabs_inv_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e64 v2, |v4|, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: mad_sub_fabs_inv_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e64 v2, |v4|, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_fabs_inv_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, -v2, v3, |v4|
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: mad_sub_fabs_inv_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, |v3|
+; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: mad_sub_fabs_inv_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX9-DENORM-NEXT: v_sub_f32_e64 v1, |v3|, v1
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, |v3|
+; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: mad_sub_fabs_inv_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX10-DENORM-NEXT: v_sub_f32_e64 v1, |v3|, v1
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -486,26 +2131,150 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(ptr addrspace(1) noalias nocaptu
ret void
}
-; GCN-LABEL: {{^}}neg_neg_mad_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
-
-; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]]
-; SI-FLUSH: buffer_store_dword [[REGC]]
-; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-
-; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 {
+; SI-FLUSH-LABEL: neg_neg_mad_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s6, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-FLUSH-NEXT: v_mac_f32_e32 v4, v2, v3
+; SI-FLUSH-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: neg_neg_mad_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v4, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: neg_neg_mad_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v4, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: neg_neg_mad_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, v4
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: neg_neg_mad_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2
+; GFX9-FLUSH-NEXT: global_store_dword v0, v3, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: neg_neg_mad_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: neg_neg_mad_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2
+; GFX10-FLUSH-NEXT: global_store_dword v0, v3, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: neg_neg_mad_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v1
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -525,23 +2294,150 @@ define amdgpu_kernel void @neg_neg_mad_f32(ptr addrspace(1) noalias nocapture %o
ret void
}
-; GCN-LABEL: {{^}}mad_fabs_sub_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]]
-; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-
-; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 {
+; SI-FLUSH-LABEL: mad_fabs_sub_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s6, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-FLUSH-NEXT: v_mad_f32 v2, v2, |v3|, -v4
+; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: mad_fabs_sub_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e64 v2, v2, |v3|
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v4
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: mad_fabs_sub_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e64 v2, v2, |v3|
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v4
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_fabs_sub_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, |v3|, -v4
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: mad_fabs_sub_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, |v2|, -v3
+; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: mad_fabs_sub_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_mul_f32_e64 v1, v1, |v2|
+; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_fabs_sub_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, |v2|, -v3
+; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: mad_fabs_sub_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_mul_f32_e64 v1, v1, |v2|
+; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -560,24 +2456,126 @@ define amdgpu_kernel void @mad_fabs_sub_f32(ptr addrspace(1) noalias nocapture %
ret void
}
-; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]]
-; SI-FLUSH: buffer_store_dword [[R2]]
-; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-
-; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-
-; SI-DENORM: buffer_store_dword [[RESULT]]
-; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-FLUSH-LABEL: fsub_c_fadd_a_a_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mac_f32_e32 v3, -2.0, v2
+; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: fsub_c_fadd_a_a_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v3, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fsub_c_fadd_a_a_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: fsub_c_fadd_a_a_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, -2.0, v3
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: fsub_c_fadd_a_a_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, -2.0, v1
+; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: fsub_c_fadd_a_a_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v2, v1
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, -2.0, v1
+; GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: fsub_c_fadd_a_a_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v2, v1
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -593,22 +2591,126 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(ptr addrspace(1) %out, ptr addrsp
ret void
}
-; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32:
-; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
-; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-
-; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-
-; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; SI: buffer_store_dword [[RESULT]]
-; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-FLUSH-LABEL: fsub_fadd_a_a_c_f32:
+; SI-FLUSH: ; %bb.0:
+; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; SI-FLUSH-NEXT: s_mov_b32 s2, 0
+; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0
+; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; SI-FLUSH-NEXT: v_mad_f32 v2, v2, 2.0, -v3
+; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-FLUSH-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-STRICT-LABEL: fsub_fadd_a_a_c_f32:
+; SI-DENORM-FASTFMA-STRICT: ; %bb.0:
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v3
+; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm
+;
+; SI-DENORM-SLOWFMA-LABEL: fsub_fadd_a_a_c_f32:
+; SI-DENORM-SLOWFMA: ; %bb.0:
+; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2
+; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v3
+; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-SLOWFMA-NEXT: s_endpgm
+;
+; SI-DENORM-FASTFMA-CONTRACT-LABEL: fsub_fadd_a_a_c_f32:
+; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0:
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0)
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, -v3
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm
+;
+; GFX9-FLUSH-LABEL: fsub_fadd_a_a_c_f32:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, 2.0, -v2
+; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-FLUSH-NEXT: s_endpgm
+;
+; GFX9-DENORM-LABEL: fsub_fadd_a_a_c_f32:
+; GFX9-DENORM: ; %bb.0:
+; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc
+; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DENORM-NEXT: s_endpgm
+;
+; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f32:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, 2.0, -v2
+; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: s_endpgm
+;
+; GFX10-DENORM-LABEL: fsub_fadd_a_a_c_f32:
+; GFX10-DENORM: ; %bb.0:
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc
+; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v2
+; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -626,3 +2728,6 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f32(ptr addrspace(1) %out, ptr addrsp
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9-DENORM-FASTFMA-FMAC: {{.*}}
+; GFX9-FLUSH-FMAC: {{.*}}
More information about the llvm-commits mailing list