[llvm] [AMDGPU][True16][CodeGen] true16 codegen pattern for v_mul_f16 (PR #122000)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 09:39:30 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/122000
>From 9091464d61cf4e179e5860fe5cfe6f92dc002f15 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 7 Jan 2025 16:46:44 -0500
Subject: [PATCH] true16 for v_mul_f16
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 20 +-
llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 179 ++++++++++++------
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 5 +-
4 files changed, 144 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f97ea40caa6704..fa02b9e6d41afc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5493,7 +5493,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
- case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
+ case AMDGPU::S_MUL_F16:
+ return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64
+ : AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index e388efe73cddbb..b696267eb1b386 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3067,7 +3067,7 @@ foreach vt = [f16, v2f16, f32, v2f32, f64] in {
// Prefer selecting to max when legal, but using mul is always valid.
let AddedComplexity = -5 in {
-let OtherPredicates = [NotHasTrue16BitInsts] in {
+let True16Predicate = NotHasTrue16BitInsts in {
def : GCNPat<
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
(V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
@@ -3077,9 +3077,21 @@ def : GCNPat<
(fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
(V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
>;
-} // End OtherPredicates
+} // End True16Predicate
-let OtherPredicates = [HasTrue16BitInsts] in {
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat<
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0)
+>;
+
+def : GCNPat<
+ (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
+ (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0)
+>;
+} // End True16Predicate
+
+let True16Predicate = UseFakeTrue16Insts in {
def : GCNPat<
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
(V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
@@ -3089,7 +3101,7 @@ def : GCNPat<
(fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
(V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
>;
-} // End OtherPredicates
+} // End True16Predicate
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 2cc5159c29f7ff..60cca89a1441e7 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
define amdgpu_kernel void @fmul_f16(
; SI-LABEL: fmul_f16:
@@ -54,29 +55,55 @@ define amdgpu_kernel void @fmul_f16(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fmul_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s6, s10
-; GFX11-NEXT: s_mov_b32 s7, s11
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-FAKE16-LABEL: fmul_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: fmul_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
@@ -127,23 +154,41 @@ define amdgpu_kernel void @fmul_f16_imm_a(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fmul_f16_imm_a:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-FAKE16-LABEL: fmul_f16_imm_a:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0x4200, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: fmul_f16_imm_a:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x4200, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) {
entry:
@@ -192,23 +237,41 @@ define amdgpu_kernel void @fmul_f16_imm_b(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fmul_f16_imm_b:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-FAKE16-LABEL: fmul_f16_imm_b:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 4.0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: fmul_f16_imm_b:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 4.0, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b32630a97b3ad0..68e2f953dd6400 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -5,7 +5,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-SAFE %s
; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-NSZ %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE,GFX11-SAFE-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE,GFX11-SAFE-TRUE16 %s
; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ %s
; --------------------------------------------------------------------------------
@@ -4761,3 +4762,5 @@ attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-SAFE-FAKE16: {{.*}}
More information about the llvm-commits
mailing list