[llvm] AMDGPU: Add more tests for v_dot2_f32_f16/bf16 (PR #179223)
Petar Avramovic via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 12 09:08:14 PST 2026
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/179223
>From 9df94a3f7925a7bd825b09b90c09fe06006d4e7c Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Mon, 2 Feb 2026 12:46:48 +0100
Subject: [PATCH] AMDGPU: Add more tests for v_dot2_f32_f16/bf16
Tests for source modifiers, inline constants, and VOPD codegen.
---
.../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 1015 ++++++++++--
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 1455 ++++++++++++++++-
2 files changed, 2300 insertions(+), 170 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index dda2e15652597..46f44b230e56f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -1,126 +1,897 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GFX950
-; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GFX950-ISEL
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "s_wait" --filter-out "s_nop" --filter-out "s_delay_alu" --filter-out "s_setpc_b64" --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GCN,GFX950
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX12
declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
-define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
-;
-; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
-; GFX950: ; %bb.0: ; %entry
-; GFX950-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_load_dword s0, s[12:13], 0x0
-; GFX950-NEXT: s_load_dword s1, s[14:15], 0x0
-; GFX950-NEXT: s_load_dword s2, s[10:11], 0x0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v1, s0
-; GFX950-NEXT: v_mov_b32_e32 v2, s1
-; GFX950-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2 clamp
-; GFX950-NEXT: s_nop 2
-; GFX950-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX950-NEXT: s_endpgm
-;
-; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
-; GFX950-ISEL: ; %bb.0: ; %entry
-; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0
-; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0
-; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0
-; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX950-ISEL-NEXT: v_mov_b32_e32 v2, s1
-; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2 clamp
-; GFX950-ISEL-NEXT: s_nop 2
-; GFX950-ISEL-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX950-ISEL-NEXT: s_endpgm
- ptr addrspace(1) %r,
- ptr addrspace(1) %a,
- ptr addrspace(1) %b,
- ptr addrspace(1) %c) {
-entry:
- %a.val = load <2 x bfloat>, ptr addrspace(1) %a
- %b.val = load <2 x bfloat>, ptr addrspace(1) %b
- %c.val = load float, ptr addrspace(1) %c
- %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 1)
- store float %r.val, ptr addrspace(1) %r
- ret void
-}
-
-
-define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
-; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
-;
-; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
-; GFX950: ; %bb.0: ; %entry
-; GFX950-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: s_load_dword s0, s[12:13], 0x0
-; GFX950-NEXT: s_load_dword s1, s[14:15], 0x0
-; GFX950-NEXT: s_load_dword s2, s[10:11], 0x0
-; GFX950-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-NEXT: v_mov_b32_e32 v1, s0
-; GFX950-NEXT: v_mov_b32_e32 v2, s1
-; GFX950-NEXT: v_dot2c_f32_bf16_e32 v2, s2, v1
-; GFX950-NEXT: s_nop 2
-; GFX950-NEXT: global_store_dword v0, v2, s[8:9]
-; GFX950-NEXT: s_endpgm
-;
-; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
-; GFX950-ISEL: ; %bb.0: ; %entry
-; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0
-; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0
-; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0
-; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s0
-; GFX950-ISEL-NEXT: v_mov_b32_e32 v2, s1
-; GFX950-ISEL-NEXT: v_dot2c_f32_bf16_e32 v2, s2, v1
-; GFX950-ISEL-NEXT: s_nop 2
-; GFX950-ISEL-NEXT: global_store_dword v0, v2, s[8:9]
-; GFX950-ISEL-NEXT: s_endpgm
- ptr addrspace(1) %r,
- ptr addrspace(1) %a,
- ptr addrspace(1) %b,
- ptr addrspace(1) %c) {
-entry:
- %a.val = load <2 x bfloat>, ptr addrspace(1) %a
- %b.val = load <2 x bfloat>, ptr addrspace(1) %b
- %c.val = load float, ptr addrspace(1) %c
- %r.val = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a.val, <2 x bfloat> %b.val, float %c.val, i1 0)
- store float %r.val, ptr addrspace(1) %r
- ret void
+; Baseline case: a single fdot2.f32.bf16 call on unmodified arguments with the
+; clamp bit cleared (i1 false). The gfx950 checks expect v_dot2c_f32_bf16_e32
+; writing v2 followed by a copy into v0; the gfx1100/gfx1200 checks (GFX11PLUS)
+; expect one three-source v_dot2_f32_bf16.
+define float @v_fdot2_f32_bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
+ ret float %r
+}
+
+; fneg applied to the whole <2 x bfloat> %a, clamp off. The GFX11PLUS checks
+; expect the negation folded into the dot as neg_lo:[1,0,0] neg_hi:[1,0,0];
+; the gfx950 checks expect an explicit v_xor_b32 with 0x80008000 ahead of
+; v_dot2c_f32_bf16_e32.
+define float @v_fdot2_f32_bf16_neg_a(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_a:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+ %neg.a = fneg <2 x bfloat> %a
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
+ ret float %r
+}
+
+; Negates only element 0 of %a (extractelement / fneg / insertelement), clamp
+; off. None of the checked dot instructions carries a neg modifier here: the
+; sign flip is expected as separate instructions (v_xor_b32 + v_bfi_b32 on
+; gfx950 and gfx1200, v_xor_b16 on v0.l for gfx1100).
+define float @v_fdot2_f32_bf16_neg_a_lo(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_lo:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v3, 0x8000, v0
+; GFX950: s_mov_b32 s0, 0xffff
+; GFX950: v_bfi_b32 v0, s0, v3, v0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_lo:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_lo:
+; GFX12: ; %bb.0:
+; GFX12: v_xor_b32_e32 v3, 0x8000, v0
+; GFX12: v_bfi_b32 v0, 0xffff, v3, v0
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+ %a_lo = extractelement <2 x bfloat> %a, i32 0
+ %neg.a_lo = fneg bfloat %a_lo
+ %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg_lo.a, <2 x bfloat> %b, float %c, i1 false)
+ ret float %r
+}
+
+; Negates only element 1 of %a (extractelement / fneg / insertelement), clamp
+; off. The checked dot instructions carry no neg modifier; the sign flip is
+; expected as separate instructions: SDWA v_xor_b32 + v_perm_b32 on gfx950,
+; v_xor_b16 on v0.h for gfx1100, and v_lshrrev_b32 + v_xor_b32 + v_perm_b32
+; on gfx1200.
+define float @v_fdot2_f32_bf16_neg_a_hi(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_hi:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x8000
+; GFX950: v_xor_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v3, v0, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_hi:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v0.h, 0x8000, v0.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_hi:
+; GFX12: ; %bb.0:
+; GFX12: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12: v_xor_b32_e32 v3, 0x8000, v3
+; GFX12: v_perm_b32 v0, v3, v0, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+ %a_hi = extractelement <2 x bfloat> %a, i32 1
+ %neg.a_hi = fneg bfloat %a_hi
+ %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg_hi.a, <2 x bfloat> %b, float %c, i1 false)
+ ret float %r
+}
+
+; fneg applied to the whole <2 x bfloat> %b, clamp off. The GFX11PLUS checks
+; expect the negation folded into the dot as neg_lo:[0,1,0] neg_hi:[0,1,0];
+; the gfx950 checks expect an explicit v_xor_b32 with 0x80008000 on v1 ahead
+; of v_dot2c_f32_bf16_e32.
+define float @v_fdot2_f32_bf16_neg_b(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_b:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+ %neg.b = fneg <2 x bfloat> %b
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
+ ret float %r
+}
+
+; Negates only element 0 of %b (extractelement / fneg / insertelement), clamp
+; off. The checked dot instructions carry no neg modifier; the sign flip is
+; expected as separate instructions (v_xor_b32 + v_bfi_b32 on gfx950 and
+; gfx1200, v_xor_b16 on v1.l for gfx1100).
+define float @v_fdot2_f32_bf16_neg_b_lo(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_lo:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v3, 0x8000, v1
+; GFX950: s_mov_b32 s0, 0xffff
+; GFX950: v_bfi_b32 v1, s0, v3, v1
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_lo:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_lo:
+; GFX12: ; %bb.0:
+; GFX12: v_xor_b32_e32 v3, 0x8000, v1
+; GFX12: v_bfi_b32 v1, 0xffff, v3, v1
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+ %b_lo = extractelement <2 x bfloat> %b, i32 0
+ %neg.b_lo = fneg bfloat %b_lo
+ %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg_lo.b, float %c, i1 false)
+ ret float %r
+}
+
+; Negates only element 1 of %b (extractelement / fneg / insertelement), clamp
+; off. The checked dot instructions carry no neg modifier; the sign flip is
+; expected as separate instructions: SDWA v_xor_b32 + v_perm_b32 on gfx950,
+; v_xor_b16 on v1.h for gfx1100, and v_lshrrev_b32 + v_xor_b32 + v_perm_b32
+; on gfx1200.
+define float @v_fdot2_f32_bf16_neg_b_hi(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_hi:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x8000
+; GFX950: v_xor_b32_sdwa v3, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v3, v1, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_hi:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v1.h, 0x8000, v1.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_hi:
+; GFX12: ; %bb.0:
+; GFX12: v_lshrrev_b32_e32 v3, 16, v1
+; GFX12: v_xor_b32_e32 v3, 0x8000, v3
+; GFX12: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+ %b_hi = extractelement <2 x bfloat> %b, i32 1
+ %neg.b_hi = fneg bfloat %b_hi
+ %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg_hi.b, float %c, i1 false)
+ ret float %r
+}
+
+; fneg applied to the float accumulator %c, clamp off. The GFX11PLUS checks
+; expect the negation folded into the dot as neg_lo:[0,0,1] neg_hi:[0,0,1];
+; the gfx950 checks expect an explicit v_xor_b32 with 0x80000000 on v2 ahead
+; of v_dot2c_f32_bf16_e32.
+define float @v_fdot2_f32_bf16_neg_c(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_c:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+ %neg.c = fneg float %c
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
+ ret float %r
+}
+
+; llvm.fabs.f32 applied to the accumulator %c, clamp off. Every checked target
+; expects the absolute value as an explicit v_and_b32 with 0x7fffffff on v2;
+; no checked dot instruction carries a source modifier for it.
+define float @v_fdot2_f32_bf16_abs_c(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_abs_c:
+; GFX950: ; %bb.0:
+; GFX950: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+ %abs.c = call float @llvm.fabs.f32(float %c)
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
+ ret float %r
+}
+
+; %a is replaced by a shufflevector that broadcasts its element 1 into both
+; lanes (mask <1, 1>), clamp off. The checks expect the shuffle as a separate
+; instruction (v_perm_b32 with selector 0x7060302 on gfx950/gfx1200,
+; v_mov_b16 v0.l, v0.h on gfx1100); the dot itself is checked without any
+; op_sel modifier.
+define float @v_fdot2_f32_bf16_opsel_lo_a(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_lo_a:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_a:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.l, v0.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_a:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+ %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
+ ret float %r
+}
+
+; %a is replaced by a shufflevector that broadcasts its element 0 into both
+; lanes (mask <0, 0>), clamp off. The checks expect the shuffle as a separate
+; instruction (v_perm_b32 with selector 0x5040100 on gfx950/gfx1200,
+; v_mov_b16 v0.h, v0.l on gfx1100); the dot itself is checked without any
+; op_sel modifier.
+define float @v_fdot2_f32_bf16_opsel_hi_a(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_hi_a:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_a:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.h, v0.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_a:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+ %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
+ ret float %r
+}
+
+; %b is replaced by a shufflevector that broadcasts its element 1 into both
+; lanes (mask <1, 1>), clamp off. The checks expect the shuffle as a separate
+; instruction on v1 (v_perm_b32 with selector 0x7060302 on gfx950/gfx1200,
+; v_mov_b16 v1.l, v1.h on gfx1100); the dot itself is checked without any
+; op_sel modifier.
+define float @v_fdot2_f32_bf16_opsel_lo_b(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_lo_b:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_b:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.l, v1.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_b:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+ %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
+ ret float %r
+}
+
+; %b is replaced by a shufflevector that broadcasts its element 0 into both
+; lanes (mask <0, 0>), clamp off. The checks expect the shuffle as a separate
+; instruction on v1 (v_perm_b32 with selector 0x5040100 on gfx950/gfx1200,
+; v_mov_b16 v1.h, v1.l on gfx1100); the dot itself is checked without any
+; op_sel modifier.
+define float @v_fdot2_f32_bf16_opsel_hi_b(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_hi_b:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_b:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.h, v1.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_b:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+ %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
+ ret float %r
+}
+
+; The first vector operand is the constant splat <bfloat 0.5, bfloat 0.5>,
+; clamp off. Every checked target expects the constant to appear directly as
+; the 32-bit literal 0x3f003f00 in the first source slot of the dot
+; instruction, with no separate move.
+define float @v_fdot2_f32_bf16_inline_literal_a(<2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_inline_literal_a:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_bf16_e32 v1, 0x3f003f00, v0
+; GFX950: v_mov_b32_e32 v0, v1
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, 0x3f003f00, v0, v1
+ %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 0.5, bfloat 0.5>, <2 x bfloat> %b, float %c, i1 false)
+ ret float %ret
+}
+
+; The second vector operand is the constant splat <bfloat 2.0, bfloat 2.0>,
+; clamp off. The GFX11PLUS checks expect the literal 0x40004000 in the second
+; source slot; the gfx950 checks expect it in the first source slot of
+; v_dot2c_f32_bf16_e32, with the %a register (v0) second.
+define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_inline_literal_b:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_bf16_e32 v1, 0x40004000, v0
+; GFX950: v_mov_b32_e32 v0, v1
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1
+ %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
+ ret float %ret
+}
+
+; The accumulator operand is the constant float 2.0, clamp off. The GFX11PLUS
+; checks expect 2.0 directly as the third source of v_dot2_f32_bf16; the
+; gfx950 checks expect it to be moved into v2 first (v_mov_b32_e32 v2, 2.0)
+; and then accumulated into by v_dot2c_f32_bf16_e32.
+define float @v_fdot2_f32_bf16_inline_literal_c(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GFX950-LABEL: v_fdot2_f32_bf16_inline_literal_c:
+; GFX950: ; %bb.0:
+; GFX950: v_mov_b32_e32 v2, 2.0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, 2.0
+ %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false)
+ ret float %ret
+}
+
+; Plain call with the clamp bit set (i1 true). One shared GCN prefix covers
+; all three RUN lines: every target, including gfx950, is expected to emit
+; the three-source v_dot2_f32_bf16 with the clamp modifier.
+define float @v_fdot2_f32_bf16_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GCN-LABEL: v_fdot2_f32_bf16_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 true)
+ ret float %r
+}
+
+; fneg on the whole %a vector with the clamp bit set. The shared GCN prefix
+; expects every target to fold the negation into the dot as
+; neg_lo:[1,0,0] neg_hi:[1,0,0] alongside clamp.
+define float @v_fdot2_f32_bf16_neg_a_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GCN-LABEL: v_fdot2_f32_bf16_neg_a_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] clamp
+ %neg.a = fneg <2 x bfloat> %a
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 true)
+ ret float %r
+}
+
+; fneg on the whole %b vector with the clamp bit set. The shared GCN prefix
+; expects every target to fold the negation into the dot as
+; neg_lo:[0,1,0] neg_hi:[0,1,0] alongside clamp.
+define float @v_fdot2_f32_bf16_neg_b_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GCN-LABEL: v_fdot2_f32_bf16_neg_b_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] clamp
+ %neg.b = fneg <2 x bfloat> %b
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 true)
+ ret float %r
+}
+
+; fneg on the float accumulator %c with the clamp bit set. The shared GCN
+; prefix expects every target to fold the negation into the dot as
+; neg_lo:[0,0,1] neg_hi:[0,0,1] alongside clamp.
+define float @v_fdot2_f32_bf16_neg_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GCN-LABEL: v_fdot2_f32_bf16_neg_c_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] clamp
+ %neg.c = fneg float %c
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 true)
+ ret float %r
+}
+
+; llvm.fabs.f32 on the accumulator %c with the clamp bit set. The shared GCN
+; prefix expects every target to compute the absolute value with an explicit
+; v_and_b32 (mask 0x7fffffff) and then issue v_dot2_f32_bf16 with clamp only.
+define float @v_fdot2_f32_bf16_abs_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GCN-LABEL: v_fdot2_f32_bf16_abs_c_clamp:
+; GCN: ; %bb.0:
+; GCN: v_and_b32_e32 v2, 0x7fffffff, v2
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+ %abs.c = call float @llvm.fabs.f32(float %c)
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 true)
+ ret float %r
+}
+
+; Broadcast of element 1 of %a into both lanes (mask <1, 1>) with the clamp
+; bit set. The checks expect the shuffle as a separate instruction
+; (v_perm_b32 with selector 0x7060302 on gfx950/gfx1200, v_mov_b16 v0.l, v0.h
+; on gfx1100) followed by v_dot2_f32_bf16 ... clamp on every target.
+define float @v_fdot2_f32_bf16_opsel_lo_a_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_lo_a_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_a_clamp:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.l, v0.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_a_clamp:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+ %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 true)
+ ret float %r
+}
+
+; Broadcast of element 0 of %a into both lanes (mask <0, 0>) with the clamp
+; bit set. The checks expect the shuffle as a separate instruction
+; (v_perm_b32 with selector 0x5040100 on gfx950/gfx1200, v_mov_b16 v0.h, v0.l
+; on gfx1100) followed by v_dot2_f32_bf16 ... clamp on every target.
+define float @v_fdot2_f32_bf16_opsel_hi_a_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_hi_a_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_a_clamp:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.h, v0.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_a_clamp:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+ %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 true)
+ ret float %r
+}
+
+; Broadcast of element 1 of %b into both lanes (mask <1, 1>) with the clamp
+; bit set. The checks expect the shuffle as a separate instruction on v1
+; (v_perm_b32 with selector 0x7060302 on gfx950/gfx1200, v_mov_b16 v1.l, v1.h
+; on gfx1100) followed by v_dot2_f32_bf16 ... clamp on every target.
+define float @v_fdot2_f32_bf16_opsel_lo_b_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_lo_b_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_b_clamp:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.l, v1.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_b_clamp:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+ %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 true)
+ ret float %r
+}
+
+; Broadcast of element 0 of %b into both lanes (mask <0, 0>) with the clamp
+; bit set. The checks expect the shuffle as a separate instruction on v1
+; (v_perm_b32 with selector 0x5040100 on gfx950/gfx1200, v_mov_b16 v1.h, v1.l
+; on gfx1100) followed by v_dot2_f32_bf16 ... clamp on every target.
+define float @v_fdot2_f32_bf16_opsel_hi_b_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_hi_b_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_b_clamp:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.h, v1.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_b_clamp:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+ %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
+ %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 true)
+ ret float %r
+}
+
+; The first vector operand is the constant splat <bfloat 2.0, bfloat 2.0>
+; with the clamp bit set. The GFX11PLUS checks expect the literal 0x40004000
+; directly in the first source slot; the gfx950 checks expect it to be
+; materialized into s0 with s_mov_b32 and passed as a register.
+define float @v_fdot2_f32_bf16_inline_literal_a_clamp(<2 x bfloat> %b, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_inline_literal_a_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x40004000
+; GFX950: v_dot2_f32_bf16 v0, s0, v0, v1 clamp
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_clamp:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp
+ %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %b, float %c, i1 true)
+ ret float %ret
+}
+
+; The second vector operand is the constant splat <bfloat 2.0, bfloat 2.0>
+; with the clamp bit set. The GFX11PLUS checks expect the literal 0x40004000
+; directly in the second source slot; the gfx950 checks expect it to be
+; materialized into s0 with s_mov_b32 and passed as a register.
+define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x bfloat> %a, float %c) {
+; GFX950-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x40004000
+; GFX950: v_dot2_f32_bf16 v0, v0, s0, v1 clamp
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp
+ %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 true)
+ ret float %ret
+}
+
+; The accumulator operand is the constant float 2.0 with the clamp bit set.
+; The shared GCN prefix expects every target to take 2.0 directly as the
+; third source of v_dot2_f32_bf16 ... clamp, with no preceding move.
+define float @v_fdot2_f32_bf16_inline_literal_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b) {
+; GCN-LABEL: v_fdot2_f32_bf16_inline_literal_c_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_bf16 v0, v0, v1, 2.0 clamp
+ %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 true)
+ ret float %ret
+}
+
+; Two independent fdot2 calls (on %a,%b,%c and on %d,%e,%f, both clamp off)
+; whose results are combined with fadd. The checks expect two separate dot
+; instructions followed by v_add_f32 on every target.
+; NOTE(review): the _dual suffix presumably marks this as a VOPD (dual-issue)
+; candidate; no v_dual_* instruction appears in the checks -- confirm intent.
+define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two independent fdot2 calls summed with fadd; only the first call's %a is
+; negated (fneg on the whole vector). The GFX11PLUS checks expect
+; neg_lo:[1,0,0] neg_hi:[1,0,0] on the first dot only; the gfx950 checks
+; expect a v_xor_b32 with 0x80008000 before the first v_dot2c.
+; NOTE(review): the _dual suffix presumably marks this as a VOPD (dual-issue)
+; candidate; no v_dual_* instruction appears in the checks -- confirm intent.
+define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
+ %neg.a = fneg <2 x bfloat> %a
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two independent fdot2 calls summed with fadd; only element 0 of the first
+; call's %a is negated (extractelement / fneg / insertelement). The checks
+; expect the sign flip as separate instructions (v_xor_b32 + v_bfi_b32 on
+; gfx950/gfx1200, v_xor_b16 on v0.l for gfx1100) and two unmodified dots.
+; NOTE(review): the _dual suffix presumably marks this as a VOPD (dual-issue)
+; candidate; no v_dual_* instruction appears in the checks -- confirm intent.
+define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v6, 0x8000, v0
+; GFX950: s_mov_b32 s0, 0xffff
+; GFX950: v_bfi_b32 v0, s0, v6, v0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11: v_add_f32_e32 v0, v0, v1
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_xor_b32_e32 v6, 0x8000, v0
+; GFX12: v_bfi_b32 v0, 0xffff, v6, v0
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %a_lo = extractelement <2 x bfloat> %a, i32 0
+ %neg.a_lo = fneg bfloat %a_lo
+ %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg_lo.a, <2 x bfloat> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two independent fdot2 calls summed with fadd; only element 1 of the first
+; call's %a is negated (extractelement / fneg / insertelement). The checks
+; expect the sign flip as separate instructions (SDWA v_xor_b32 + v_perm_b32
+; on gfx950, v_xor_b16 on v0.h for gfx1100, v_lshrrev_b32 + v_xor_b32 +
+; v_perm_b32 on gfx1200) and two unmodified dots.
+; NOTE(review): the _dual suffix presumably marks this as a VOPD (dual-issue)
+; candidate; no v_dual_* instruction appears in the checks -- confirm intent.
+define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x8000
+; GFX950: v_xor_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v6, v0, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v0.h, 0x8000, v0.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11: v_add_f32_e32 v0, v0, v1
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_lshrrev_b32_e32 v6, 16, v0
+; GFX12: v_xor_b32_e32 v6, 0x8000, v6
+; GFX12: v_perm_b32 v0, v6, v0, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %a_hi = extractelement <2 x bfloat> %a, i32 1
+ %neg.a_hi = fneg bfloat %a_hi
+ %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg_hi.a, <2 x bfloat> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two independent fdot2 calls summed with fadd; only the first call's %b is
+; negated (fneg on the whole vector). The GFX11PLUS checks expect
+; neg_lo:[0,1,0] neg_hi:[0,1,0] on the first dot only; the gfx950 checks
+; expect a v_xor_b32 with 0x80008000 on v1 before the first v_dot2c.
+; NOTE(review): the _dual suffix presumably marks this as a VOPD (dual-issue)
+; candidate; no v_dual_* instruction appears in the checks -- confirm intent.
+define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
+ %neg.b = fneg <2 x bfloat> %b
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two independent fdot2 calls summed with fadd; only element 0 of the first
+; call's %b is negated (extractelement / fneg / insertelement). The checks
+; expect the sign flip as separate instructions (v_xor_b32 + v_bfi_b32 on
+; gfx950/gfx1200, v_xor_b16 on v1.l for gfx1100) and two unmodified dots.
+; NOTE(review): the _dual suffix presumably marks this as a VOPD (dual-issue)
+; candidate; no v_dual_* instruction appears in the checks -- confirm intent.
+define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v6, 0x8000, v1
+; GFX950: s_mov_b32 s0, 0xffff
+; GFX950: v_bfi_b32 v1, s0, v6, v1
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11: v_add_f32_e32 v0, v0, v1
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_xor_b32_e32 v6, 0x8000, v1
+; GFX12: v_bfi_b32 v1, 0xffff, v6, v1
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %b_lo = extractelement <2 x bfloat> %b, i32 0
+ %neg.b_lo = fneg bfloat %b_lo
+ %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg_lo.b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two independent fdot2 calls summed with fadd; only element 1 of the first
+; call's %b is negated (extractelement / fneg / insertelement). The checks
+; expect the sign flip as separate instructions (SDWA v_xor_b32 + v_perm_b32
+; on gfx950, v_xor_b16 on v1.h for gfx1100, v_lshrrev_b32 + v_xor_b32 +
+; v_perm_b32 on gfx1200) and two unmodified dots.
+; NOTE(review): the _dual suffix presumably marks this as a VOPD (dual-issue)
+; candidate; no v_dual_* instruction appears in the checks -- confirm intent.
+define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x8000
+; GFX950: v_xor_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v6, v1, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v1.h, 0x8000, v1.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11: v_add_f32_e32 v0, v0, v1
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12: v_xor_b32_e32 v6, 0x8000, v6
+; GFX12: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %b_hi = extractelement <2 x bfloat> %b, i32 1
+ %neg.b_hi = fneg bfloat %b_hi
+ %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg_hi.b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; fneg on the float accumulator %c of the first of two dot2 calls (clamp off).
+; GFX950 expects an explicit xor with 0x80000000 before v_dot2c, while the
+; GFX11PLUS checks expect the negation folded as neg_lo/neg_hi [0,0,1].
+define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
+ %neg.c = fneg float %c
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; fabs on the accumulator %c of the first of two dot2 calls (clamp off). Both
+; check groups expect a separate v_and_b32 with 0x7fffffff ahead of the dot2;
+; no abs modifier appears on the dot2 itself.
+define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
+ %abs.c = call float @llvm.fabs.f32(float %c)
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; The first of two dot2 calls takes %a with element 1 broadcast to both lanes
+; (shuffle mask <1, 1>). Expected outputs build that operand up front -
+; v_perm_b32 with selector 0x7060302 on GFX950 and GFX12, a 16-bit move
+; v0.h -> v0.l on GFX11 - and the dot2 carries no op_sel.
+define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.l, v0.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11: v_add_f32_e32 v0, v0, v1
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; The first of two dot2 calls takes %a with element 0 broadcast to both lanes
+; (shuffle mask <0, 0>). Expected outputs build that operand up front -
+; v_perm_b32 with selector 0x5040100 on GFX950 and GFX12, a 16-bit move
+; v0.l -> v0.h on GFX11 - and the dot2 carries no op_sel_hi.
+define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.h, v0.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11: v_add_f32_e32 v0, v0, v1
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; The first of two dot2 calls takes %b with element 1 broadcast to both lanes
+; (shuffle mask <1, 1>). Expected outputs build that operand up front -
+; v_perm_b32 with selector 0x7060302 on GFX950 and GFX12, a 16-bit move
+; v1.h -> v1.l on GFX11 - and the dot2 carries no op_sel.
+define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.l, v1.h
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11: v_add_f32_e32 v0, v0, v1
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; The first of two dot2 calls takes %b with element 0 broadcast to both lanes
+; (shuffle mask <0, 0>). Expected outputs build that operand up front -
+; v_perm_b32 with selector 0x5040100 on GFX950 and GFX12, a 16-bit move
+; v1.l -> v1.h on GFX11 - and the dot2 carries no op_sel_hi.
+define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.h, v1.l
+; GFX11: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11: v_add_f32_e32 v0, v0, v1
+;
+; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX12: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; First source of the first of two dot2 calls is the constant <2.0, 2.0>.
+; Every expected output encodes it as the 32-bit literal 0x40004000 in the
+; first source slot.
+define float @v_fdot2_f32_bf16_inline_literal_a_dual(<2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_inline_literal_a_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_bf16_e32 v1, 0x40004000, v0
+; GFX950: v_dot2c_f32_bf16_e32 v4, v2, v3
+; GFX950: v_add_f32_e32 v0, v1, v4
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_dual:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1
+; GFX11PLUS: v_dot2_f32_bf16 v1, v2, v3, v4
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Second source of the first of two dot2 calls is the constant <2.0, 2.0>.
+; GFX950 expects the literal 0x40004000 as the first source of v_dot2c (with
+; v0 second); the GFX11PLUS checks keep it in the second source slot.
+define float @v_fdot2_f32_bf16_inline_literal_b_dual(<2 x bfloat> %a, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_inline_literal_b_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_bf16_e32 v1, 0x40004000, v0
+; GFX950: v_dot2c_f32_bf16_e32 v4, v2, v3
+; GFX950: v_add_f32_e32 v0, v1, v4
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_dual:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1
+; GFX11PLUS: v_dot2_f32_bf16 v1, v2, v3, v4
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Accumulator of the first of two dot2 calls is the constant 2.0. GFX950
+; first moves 2.0 into v5 and accumulates into it with v_dot2c; the GFX11PLUS
+; checks pass 2.0 directly as the third source.
+define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GFX950-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_mov_b32_e32 v5, 2.0
+; GFX950: v_dot2c_f32_bf16_e32 v5, v0, v1
+; GFX950: v_dot2c_f32_bf16_e32 v4, v2, v3
+; GFX950: v_add_f32_e32 v0, v5, v4
+;
+; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, 2.0
+; GFX11PLUS: v_dot2_f32_bf16 v1, v2, v3, v4
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
+ %r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Both dot2 calls pass clamp = true. A single set of checks under the shared
+; GCN prefix expects the three-source v_dot2_f32_bf16 with the clamp modifier
+; for each call, followed by the add.
+define float @v_fdot2_f32_bf16_clamp_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
+; GCN-LABEL: v_fdot2_f32_bf16_clamp_dual:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+; GCN: v_dot2_f32_bf16 v1, v3, v4, v5 clamp
+; GCN: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 true)
+ %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 true)
+ %r = fadd float %r0, %r1
+ ret float %r
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 98cb09642511e..c488cbcc623fc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -1,54 +1,1413 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX942
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "s_wait" --filter-out "s_nop" --filter-out "s_delay_alu" --filter-out "s_setpc_b64" --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck %s --check-prefixes=GCN,GFX906
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GCN,GFX950
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX10
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 < %s | FileCheck %s --check-prefixes=GCN,GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX12
declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
-; GCN-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp
-; GFX9: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
-; GFX10: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
-; GFX12: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
-define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp(
- ptr addrspace(1) %r,
- ptr addrspace(1) %a,
- ptr addrspace(1) %b,
- ptr addrspace(1) %c) {
-entry:
- %a.val = load <2 x half>, ptr addrspace(1) %a
- %b.val = load <2 x half>, ptr addrspace(1) %b
- %c.val = load float, ptr addrspace(1) %c
- %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 1)
- store float %r.val, ptr addrspace(1) %r
- ret void
-}
-
-; GCN-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp
-; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX942: v_dot2c_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX10: {{v_dot2c_f32_f16|v_dot2acc_f32_f16}} v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX12: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
-define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp(
- ptr addrspace(1) %r,
- ptr addrspace(1) %a,
- ptr addrspace(1) %b,
- ptr addrspace(1) %c) {
-entry:
- %a.val = load <2 x half>, ptr addrspace(1) %a
- %b.val = load <2 x half>, ptr addrspace(1) %b
- %c.val = load float, ptr addrspace(1) %c
- %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 0)
- store float %r.val, ptr addrspace(1) %r
- ret void
-}
-
-; GFX9-LABEL: {{^}}fdot2_inline_literal
-; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-; GFX942: v_dot2c_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX12: v_dot2_f32_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0{{$}}
-define float @fdot2_inline_literal(<2 x half> %a, <2 x half> %b) {
- %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false)
+; Baseline f16 dot2 with clamp off and no source modifiers. GFX906 and GFX12
+; expect the three-source v_dot2_f32_f16; GFX950, GFX10 and GFX11 expect the
+; accumulating form (v_dot2c / v_dot2acc) writing v2, then a copy into v0.
+define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2
+;
+; GFX950-LABEL: v_fdot2:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2:
+; GFX11: ; %bb.0:
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
+ ret float %r
+}
+
+; fneg of the whole <2 x half> %a, clamp off. GFX906 and GFX12 expect it
+; folded as neg_lo/neg_hi [1,0,0]; GFX950, GFX10 and GFX11 expect an xor with
+; 0x80008000 ahead of the accumulating dot2.
+define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_a:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+;
+; GFX950-LABEL: v_fdot2_neg_a:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_neg_a:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_a:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_neg_a:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+ %neg.a = fneg <2 x half> %a
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
+ ret float %r
+}
+
+; Only element 0 of %a is negated, clamp off. GFX906 expects neg_lo [1,0,0] on
+; the dot2; the other targets flip the sign with a separate xor of 0x8000
+; (plus v_bfi_b32 where the xor is 32 bits wide).
+define float @v_fdot2_neg_a_lo(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_a_lo:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0]
+;
+; GFX950-LABEL: v_fdot2_neg_a_lo:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v3, 0x8000, v0
+; GFX950: s_mov_b32 s0, 0xffff
+; GFX950: v_bfi_b32 v0, s0, v3, v0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_neg_a_lo:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v3, 0x8000, v0
+; GFX10: v_bfi_b32 v0, 0xffff, v3, v0
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_a_lo:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_neg_a_lo:
+; GFX12: ; %bb.0:
+; GFX12: v_xor_b32_e32 v3, 0x8000, v0
+; GFX12: v_bfi_b32 v0, 0xffff, v3, v0
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %a_lo = extractelement <2 x half> %a, i32 0
+ %neg.a_lo = fneg half %a_lo
+ %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg_lo.a, <2 x half> %b, float %c, i1 false)
+ ret float %r
+}
+
+; Only element 1 of %a is negated, clamp off. GFX906 expects neg_hi [1,0,0] on
+; the dot2; the other targets negate the high half separately and, except
+; GFX11 which xors v0.h in place, merge it back with v_perm_b32.
+define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_a_hi:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0]
+;
+; GFX950-LABEL: v_fdot2_neg_a_hi:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x8000
+; GFX950: v_xor_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v3, v0, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_neg_a_hi:
+; GFX10: ; %bb.0:
+; GFX10: v_mov_b32_e32 v3, 0x8000
+; GFX10: v_xor_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10: v_perm_b32 v0, v3, v0, 0x5040100
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_a_hi:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v0.h, 0x8000, v0.h
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_neg_a_hi:
+; GFX12: ; %bb.0:
+; GFX12: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12: v_xor_b32_e32 v3, 0x8000, v3
+; GFX12: v_perm_b32 v0, v3, v0, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %a_hi = extractelement <2 x half> %a, i32 1
+ %neg.a_hi = fneg half %a_hi
+ %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg_hi.a, <2 x half> %b, float %c, i1 false)
+ ret float %r
+}
+
+; fneg of the whole <2 x half> %b, clamp off. GFX906 and GFX12 expect it
+; folded as neg_lo/neg_hi [0,1,0]; GFX950, GFX10 and GFX11 expect an xor with
+; 0x80008000 ahead of the accumulating dot2.
+define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_b:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+;
+; GFX950-LABEL: v_fdot2_neg_b:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_neg_b:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_b:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_neg_b:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+ %neg.b = fneg <2 x half> %b
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
+ ret float %r
+}
+
+; Only element 0 of %b is negated, clamp off. GFX906 expects neg_lo [0,1,0] on
+; the dot2; the other targets flip the sign with a separate xor of 0x8000
+; (plus v_bfi_b32 where the xor is 32 bits wide).
+define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_b_lo:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0]
+;
+; GFX950-LABEL: v_fdot2_neg_b_lo:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v3, 0x8000, v1
+; GFX950: s_mov_b32 s0, 0xffff
+; GFX950: v_bfi_b32 v1, s0, v3, v1
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_neg_b_lo:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v3, 0x8000, v1
+; GFX10: v_bfi_b32 v1, 0xffff, v3, v1
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_b_lo:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_neg_b_lo:
+; GFX12: ; %bb.0:
+; GFX12: v_xor_b32_e32 v3, 0x8000, v1
+; GFX12: v_bfi_b32 v1, 0xffff, v3, v1
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %b_lo = extractelement <2 x half> %b, i32 0
+ %neg.b_lo = fneg half %b_lo
+ %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg_lo.b, float %c, i1 false)
+ ret float %r
+}
+
+; Only element 1 of %b is negated, clamp off. GFX906 expects neg_hi [0,1,0] on
+; the dot2; the other targets negate the high half separately and, except
+; GFX11 which xors v1.h in place, merge it back with v_perm_b32.
+define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_b_hi:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0]
+;
+; GFX950-LABEL: v_fdot2_neg_b_hi:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x8000
+; GFX950: v_xor_b32_sdwa v3, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v3, v1, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_neg_b_hi:
+; GFX10: ; %bb.0:
+; GFX10: v_mov_b32_e32 v3, 0x8000
+; GFX10: v_xor_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_b_hi:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v1.h, 0x8000, v1.h
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_neg_b_hi:
+; GFX12: ; %bb.0:
+; GFX12: v_lshrrev_b32_e32 v3, 16, v1
+; GFX12: v_xor_b32_e32 v3, 0x8000, v3
+; GFX12: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %b_hi = extractelement <2 x half> %b, i32 1
+ %neg.b_hi = fneg half %b_hi
+ %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg_hi.b, float %c, i1 false)
+ ret float %r
+}
+
+; fneg on the float accumulator %c, clamp off. GFX906 and GFX12 expect it
+; folded as neg_lo/neg_hi [0,0,1]; GFX950, GFX10 and GFX11 expect an xor with
+; 0x80000000 ahead of the accumulating dot2.
+define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_neg_c:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+;
+; GFX950-LABEL: v_fdot2_neg_c:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_neg_c:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_c:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_neg_c:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+ %neg.c = fneg float %c
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
+ ret float %r
+}
+
+; fabs on the accumulator %c, clamp off. All five check groups expect a
+; separate v_and_b32 with 0x7fffffff before the dot2; none shows an abs
+; modifier on the dot2.
+define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_abs_c:
+; GFX906: ; %bb.0:
+; GFX906: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2
+;
+; GFX950-LABEL: v_fdot2_abs_c:
+; GFX950: ; %bb.0:
+; GFX950: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_abs_c:
+; GFX10: ; %bb.0:
+; GFX10: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_abs_c:
+; GFX11: ; %bb.0:
+; GFX11: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_abs_c:
+; GFX12: ; %bb.0:
+; GFX12: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %abs.c = call float @llvm.fabs.f32(float %c)
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false)
+ ret float %r
+}
+
+; %a with element 1 broadcast to both lanes (shuffle mask <1, 1>), clamp off.
+; GFX906 expects this folded as op_sel [1,0,0]; GFX950, GFX10 and GFX12 expect
+; v_perm_b32 with selector 0x7060302 and GFX11 a 16-bit move v0.h -> v0.l.
+define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_opsel_lo_a:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0]
+;
+; GFX950-LABEL: v_fdot2_opsel_lo_a:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_opsel_lo_a:
+; GFX10: ; %bb.0:
+; GFX10: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_opsel_lo_a:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.l, v0.h
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_opsel_lo_a:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
+ ret float %r
+}
+
+; %a with element 0 broadcast to both lanes (shuffle mask <0, 0>), clamp off.
+; GFX906 expects this folded as op_sel_hi [0,1,1]; GFX950, GFX10 and GFX12
+; expect v_perm_b32 with selector 0x5040100 and GFX11 a 16-bit move
+; v0.l -> v0.h.
+define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_opsel_hi_a:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
+;
+; GFX950-LABEL: v_fdot2_opsel_hi_a:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_opsel_hi_a:
+; GFX10: ; %bb.0:
+; GFX10: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_opsel_hi_a:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.h, v0.l
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_opsel_hi_a:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
+ ret float %r
+}
+
+; %b with element 1 broadcast to both lanes (shuffle mask <1, 1>), clamp off.
+; GFX906 expects this folded as op_sel [0,1,0]; GFX950, GFX10 and GFX12 expect
+; v_perm_b32 with selector 0x7060302 and GFX11 a 16-bit move v1.h -> v1.l.
+define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_opsel_lo_b:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0]
+;
+; GFX950-LABEL: v_fdot2_opsel_lo_b:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_opsel_lo_b:
+; GFX10: ; %bb.0:
+; GFX10: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_opsel_lo_b:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.l, v1.h
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_opsel_lo_b:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
+ ret float %r
+}
+
+; %b with element 0 broadcast to both lanes (shuffle mask <0, 0>), clamp off.
+; GFX906 expects this folded as op_sel_hi [1,0,1]; GFX950, GFX10 and GFX12
+; expect v_perm_b32 with selector 0x5040100 and GFX11 a 16-bit move
+; v1.l -> v1.h.
+define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_opsel_hi_b:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1]
+;
+; GFX950-LABEL: v_fdot2_opsel_hi_b:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_opsel_hi_b:
+; GFX10: ; %bb.0:
+; GFX10: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_opsel_hi_b:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.h, v1.l
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_opsel_hi_b:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+ %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
+ ret float %r
+}
+
+; First source is the constant <2.0, 2.0>, clamp off. GFX906 expects the
+; inline constant 2.0 with op_sel_hi [0,1,1]; every other target expects the
+; 32-bit literal 0x40004000.
+define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_inline_literal_a:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
+;
+; GFX950-LABEL: v_fdot2_inline_literal_a:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_f16_e32 v1, 0x40004000, v0
+; GFX950: v_mov_b32_e32 v0, v1
+;
+; GFX10-LABEL: v_fdot2_inline_literal_a:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v1, 0x40004000, v0
+; GFX10: v_mov_b32_e32 v0, v1
+;
+; GFX11-LABEL: v_fdot2_inline_literal_a:
+; GFX11: ; %bb.0:
+; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0
+; GFX11: v_mov_b32_e32 v0, v1
+;
+; GFX12-LABEL: v_fdot2_inline_literal_a:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, 0x40004000, v0, v1
+ %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
+ ret float %ret
+}
+
+; Second source is the constant <2.0, 2.0>, clamp off. GFX906 expects inline
+; 2.0 in the second slot with op_sel_hi [1,0,1]; GFX950, GFX10 and GFX11
+; expect literal 0x40004000 as the first source of the accumulating dot2;
+; GFX12 keeps the literal in the second slot.
+define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
+; GFX906-LABEL: v_fdot2_inline_literal_b:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
+;
+; GFX950-LABEL: v_fdot2_inline_literal_b:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_f16_e32 v1, 0x40004000, v0
+; GFX950: v_mov_b32_e32 v0, v1
+;
+; GFX10-LABEL: v_fdot2_inline_literal_b:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v1, 0x40004000, v0
+; GFX10: v_mov_b32_e32 v0, v1
+;
+; GFX11-LABEL: v_fdot2_inline_literal_b:
+; GFX11: ; %bb.0:
+; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0
+; GFX11: v_mov_b32_e32 v0, v1
+;
+; GFX12-LABEL: v_fdot2_inline_literal_b:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, 0x40004000, v1
+ %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
+ ret float %ret
+}
+
+; Accumulator is the constant 2.0, clamp off. GFX906 and GFX12 expect 2.0
+; directly as the third source; GFX950, GFX10 and GFX11 first move 2.0 into v2
+; and accumulate into it.
+define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
+; GFX906-LABEL: v_fdot2_inline_literal_c:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, 2.0
+;
+; GFX950-LABEL: v_fdot2_inline_literal_c:
+; GFX950: ; %bb.0:
+; GFX950: v_mov_b32_e32 v2, 2.0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_mov_b32_e32 v0, v2
+;
+; GFX10-LABEL: v_fdot2_inline_literal_c:
+; GFX10: ; %bb.0:
+; GFX10: v_mov_b32_e32 v2, 2.0
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_inline_literal_c:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b32_e32 v2, 2.0
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
+;
+; GFX12-LABEL: v_fdot2_inline_literal_c:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0
+ %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false)
ret float %ret
}
+
+; clamp = true with no source modifiers. The shared GCN prefix gives one
+; expectation for every run configuration, the three-source v_dot2_f32_f16
+; with clamp.
+define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GCN-LABEL: v_fdot2_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true)
+ ret float %r
+}
+
+; fneg of %a together with clamp = true. Under the shared GCN prefix the
+; negation is expected folded as neg_lo/neg_hi [1,0,0] alongside clamp.
+define float @v_fdot2_neg_a_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GCN-LABEL: v_fdot2_neg_a_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] clamp
+ %neg.a = fneg <2 x half> %a
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 true)
+ ret float %r
+}
+
+; fneg of %b together with clamp = true. Under the shared GCN prefix the
+; negation is expected folded as neg_lo/neg_hi [0,1,0] alongside clamp.
+define float @v_fdot2_neg_b_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GCN-LABEL: v_fdot2_neg_b_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] clamp
+ %neg.b = fneg <2 x half> %b
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 true)
+ ret float %r
+}
+
+; fneg of the accumulator %c together with clamp = true. Under the shared GCN
+; prefix the negation is expected folded as neg_lo/neg_hi [0,0,1] alongside
+; clamp.
+define float @v_fdot2_neg_c_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GCN-LABEL: v_fdot2_neg_c_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] clamp
+ %neg.c = fneg float %c
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 true)
+ ret float %r
+}
+
+; The float (third) operand goes through llvm.fabs.f32 and the i1 argument is true.
+; The shared GCN expectation keeps the absolute value as a separate v_and_b32 with
+; 0x7fffffff ahead of the clamped v_dot2_f32_f16, i.e. no source modifier is used.
+define float @v_fdot2_abs_c_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GCN-LABEL: v_fdot2_abs_c_clamp:
+; GCN: ; %bb.0:
+; GCN: v_and_b32_e32 v2, 0x7fffffff, v2
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+ %abs.c = call float @llvm.fabs.f32(float %c)
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 true)
+ ret float %r
+}
+
+; The first operand is a shufflevector that places element 1 of %a in both lanes, and
+; the i1 argument is true. The GFX906 and GFX10 expectations fold the shuffle into
+; op_sel:[1,0,0]; the GFX950 and GFX12 expectations use v_perm_b32 with selector
+; 0x7060302, and the GFX11 expectation copies v0.h into v0.l with v_mov_b16.
+define float @v_fdot2_opsel_lo_a_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_opsel_lo_a_clamp:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] clamp
+;
+; GFX950-LABEL: v_fdot2_opsel_lo_a_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+;
+; GFX10-LABEL: v_fdot2_opsel_lo_a_clamp:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] clamp
+;
+; GFX11-LABEL: v_fdot2_opsel_lo_a_clamp:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.l, v0.h
+; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+;
+; GFX12-LABEL: v_fdot2_opsel_lo_a_clamp:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+ %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 true)
+ ret float %r
+}
+
+; The first operand is a shufflevector that places element 0 of %a in both lanes, and
+; the i1 argument is true. The GFX906 and GFX10 expectations fold the shuffle into
+; op_sel_hi:[0,1,1]; the GFX950 and GFX12 expectations use v_perm_b32 with selector
+; 0x5040100, and the GFX11 expectation copies v0.l into v0.h with v_mov_b16.
+define float @v_fdot2_opsel_hi_a_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_opsel_hi_a_clamp:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] clamp
+;
+; GFX950-LABEL: v_fdot2_opsel_hi_a_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+;
+; GFX10-LABEL: v_fdot2_opsel_hi_a_clamp:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] clamp
+;
+; GFX11-LABEL: v_fdot2_opsel_hi_a_clamp:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.h, v0.l
+; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+;
+; GFX12-LABEL: v_fdot2_opsel_hi_a_clamp:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+ %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 true)
+ ret float %r
+}
+
+; The second operand is a shufflevector that places element 1 of %b in both lanes, and
+; the i1 argument is true. The GFX906 and GFX10 expectations fold the shuffle into
+; op_sel:[0,1,0]; the GFX950 and GFX12 expectations use v_perm_b32 with selector
+; 0x7060302 on v1, and the GFX11 expectation copies v1.h into v1.l with v_mov_b16.
+define float @v_fdot2_opsel_lo_b_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_opsel_lo_b_clamp:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] clamp
+;
+; GFX950-LABEL: v_fdot2_opsel_lo_b_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+;
+; GFX10-LABEL: v_fdot2_opsel_lo_b_clamp:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] clamp
+;
+; GFX11-LABEL: v_fdot2_opsel_lo_b_clamp:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.l, v1.h
+; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+;
+; GFX12-LABEL: v_fdot2_opsel_lo_b_clamp:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+ %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 true)
+ ret float %r
+}
+
+; The second operand is a shufflevector that places element 0 of %b in both lanes, and
+; the i1 argument is true. The GFX906 and GFX10 expectations fold the shuffle into
+; op_sel_hi:[1,0,1]; the GFX950 and GFX12 expectations use v_perm_b32 with selector
+; 0x5040100 on v1, and the GFX11 expectation copies v1.l into v1.h with v_mov_b16.
+define float @v_fdot2_opsel_hi_b_clamp(<2 x half> %a, <2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_opsel_hi_b_clamp:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] clamp
+;
+; GFX950-LABEL: v_fdot2_opsel_hi_b_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+;
+; GFX10-LABEL: v_fdot2_opsel_hi_b_clamp:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] clamp
+;
+; GFX11-LABEL: v_fdot2_opsel_hi_b_clamp:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.h, v1.l
+; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+;
+; GFX12-LABEL: v_fdot2_opsel_hi_b_clamp:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+ %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
+ %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 true)
+ ret float %r
+}
+
+; The first operand is the constant vector <2.0, 2.0> and the i1 argument is true.
+; The GFX906 and GFX10 expectations encode it as inline 2.0 with op_sel_hi:[0,1,1];
+; the GFX950 expectation loads 0x40004000 into s0 first, and the GFX11PLUS expectation
+; uses 0x40004000 directly as the source operand.
+define float @v_fdot2_inline_literal_a_clamp(<2 x half> %b, float %c) {
+; GFX906-LABEL: v_fdot2_inline_literal_a_clamp:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] clamp
+;
+; GFX950-LABEL: v_fdot2_inline_literal_a_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x40004000
+; GFX950: v_dot2_f32_f16 v0, s0, v0, v1 clamp
+;
+; GFX10-LABEL: v_fdot2_inline_literal_a_clamp:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1] clamp
+;
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_clamp:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_f16 v0, 0x40004000, v0, v1 clamp
+ %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 true)
+ ret float %ret
+}
+
+; The second operand is the constant vector <2.0, 2.0> and the i1 argument is true.
+; The GFX906 and GFX10 expectations encode it as inline 2.0 with op_sel_hi:[1,0,1];
+; the GFX950 expectation loads 0x40004000 into s0 first, and the GFX11PLUS expectation
+; uses 0x40004000 directly as the source operand.
+define float @v_fdot2_inline_literal_b_clamp(<2 x half> %a, float %c) {
+; GFX906-LABEL: v_fdot2_inline_literal_b_clamp:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] clamp
+;
+; GFX950-LABEL: v_fdot2_inline_literal_b_clamp:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x40004000
+; GFX950: v_dot2_f32_f16 v0, v0, s0, v1 clamp
+;
+; GFX10-LABEL: v_fdot2_inline_literal_b_clamp:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1] clamp
+;
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_clamp:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_f16 v0, v0, 0x40004000, v1 clamp
+ %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 true)
+ ret float %ret
+}
+
+; The float (third) operand is the constant 2.0 and the i1 argument is true.
+; The shared GCN expectation uses 2.0 directly as the last operand of a clamped
+; v_dot2_f32_f16.
+define float @v_fdot2_inline_literal_c_clamp(<2 x half> %a, <2 x half> %b) {
+; GCN-LABEL: v_fdot2_inline_literal_c_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, 2.0 clamp
+ %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 true)
+ ret float %ret
+}
+
+; Two fdot2 calls on disjoint argument sets (both with i1 false) whose results are
+; combined by an fadd. The GFX11 expectation puts both into one
+; v_dual_dot2acc_f32_f16 :: v_dual_dot2acc_f32_f16 line; the GFX950 and GFX10
+; expectations use two dot2c instructions, and the GFX906 and GFX12 expectations use
+; two three-source v_dot2_f32_f16 instructions.
+define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's first operand is a vector
+; fneg of %a. The GFX906 and GFX12 expectations fold the negation into neg_lo/neg_hi
+; for source 0; the GFX950, GFX10 and GFX11 expectations use an explicit v_xor_b32
+; with 0x80008000 ahead of the dot2c / dual dot2acc form.
+define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_neg_a_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_neg_a_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_neg_a_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_neg_a_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_neg_a_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %neg.a = fneg <2 x half> %a
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where only element 0 of %a is negated
+; (extractelement, scalar fneg, insertelement) before the first call. Only the GFX906
+; expectation folds this into neg_lo:[1,0,0]; the other expectations flip the sign
+; with separate instructions (v_xor_b32 with 0x8000 plus v_bfi_b32, or v_xor_b16 on
+; v0.l for GFX11) and use an unmodified dot instruction.
+define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_neg_a_lo_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_neg_a_lo_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v6, 0x8000, v0
+; GFX950: s_mov_b32 s0, 0xffff
+; GFX950: v_bfi_b32 v0, s0, v6, v0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_neg_a_lo_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v6, 0x8000, v0
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_bfi_b32 v0, 0xffff, v6, v0
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_neg_a_lo_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_neg_a_lo_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_xor_b32_e32 v6, 0x8000, v0
+; GFX12: v_bfi_b32 v0, 0xffff, v6, v0
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %a_lo = extractelement <2 x half> %a, i32 0
+ %neg.a_lo = fneg half %a_lo
+ %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg_lo.a, <2 x half> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where only element 1 of %a is negated
+; (extractelement, scalar fneg, insertelement) before the first call. Only the GFX906
+; expectation folds this into neg_hi:[1,0,0]; the other expectations flip the sign of
+; the high half with separate instructions (an xor with 0x8000 followed by
+; v_perm_b32, or v_xor_b16 on v0.h for GFX11) and use an unmodified dot instruction.
+define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_neg_a_hi_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_neg_a_hi_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x8000
+; GFX950: v_xor_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v6, v0, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_neg_a_hi_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_mov_b32_e32 v6, 0x8000
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_xor_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10: v_perm_b32 v0, v6, v0, 0x5040100
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_neg_a_hi_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v0.h, 0x8000, v0.h
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_neg_a_hi_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_lshrrev_b32_e32 v6, 16, v0
+; GFX12: v_xor_b32_e32 v6, 0x8000, v6
+; GFX12: v_perm_b32 v0, v6, v0, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %a_hi = extractelement <2 x half> %a, i32 1
+ %neg.a_hi = fneg half %a_hi
+ %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg_hi.a, <2 x half> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's second operand is a vector
+; fneg of %b. The GFX906 and GFX12 expectations fold the negation into neg_lo/neg_hi
+; for source 1; the GFX950, GFX10 and GFX11 expectations use an explicit v_xor_b32
+; with 0x80008000 on v1 ahead of the dot2c / dual dot2acc form.
+define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_neg_b_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_neg_b_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_neg_b_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_neg_b_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_neg_b_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %neg.b = fneg <2 x half> %b
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where only element 0 of %b is negated
+; (extractelement, scalar fneg, insertelement) before the first call. Only the GFX906
+; expectation folds this into neg_lo:[0,1,0]; the other expectations flip the sign
+; with separate instructions (v_xor_b32 with 0x8000 plus v_bfi_b32, or v_xor_b16 on
+; v1.l for GFX11) and use an unmodified dot instruction.
+define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_neg_b_lo_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_neg_b_lo_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v6, 0x8000, v1
+; GFX950: s_mov_b32 s0, 0xffff
+; GFX950: v_bfi_b32 v1, s0, v6, v1
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_neg_b_lo_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v6, 0x8000, v1
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_bfi_b32 v1, 0xffff, v6, v1
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_neg_b_lo_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_neg_b_lo_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_xor_b32_e32 v6, 0x8000, v1
+; GFX12: v_bfi_b32 v1, 0xffff, v6, v1
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %b_lo = extractelement <2 x half> %b, i32 0
+ %neg.b_lo = fneg half %b_lo
+ %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg_lo.b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where only element 1 of %b is negated
+; (extractelement, scalar fneg, insertelement) before the first call. Only the GFX906
+; expectation folds this into neg_hi:[0,1,0]; the other expectations flip the sign of
+; the high half with separate instructions (an xor with 0x8000 followed by
+; v_perm_b32, or v_xor_b16 on v1.h for GFX11) and use an unmodified dot instruction.
+define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_neg_b_hi_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_neg_b_hi_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x8000
+; GFX950: v_xor_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v6, v1, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_neg_b_hi_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_mov_b32_e32 v6, 0x8000
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_xor_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_neg_b_hi_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b16 v1.h, 0x8000, v1.h
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_neg_b_hi_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12: v_xor_b32_e32 v6, 0x8000, v6
+; GFX12: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %b_hi = extractelement <2 x half> %b, i32 1
+ %neg.b_hi = fneg half %b_hi
+ %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg_hi.b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's float operand is a scalar
+; fneg of %c. The GFX906 and GFX12 expectations fold the negation into neg_lo/neg_hi
+; for source 2; the GFX950, GFX10 and GFX11 expectations use an explicit v_xor_b32
+; with 0x80000000 on v2 ahead of the dot2c / dual dot2acc form.
+define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_neg_c_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_neg_c_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_neg_c_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_neg_c_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_neg_c_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %neg.c = fneg float %c
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's float operand goes through
+; llvm.fabs.f32. Every expectation keeps the absolute value as an explicit AND with
+; 0x7fffffff. In the GFX11 expectation that AND is the second half of a v_dual line
+; whose first half is the second dot product, and the first dot product follows as a
+; standalone v_dot2acc_f32_f16.
+define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_abs_c_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_abs_c_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_abs_c_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_abs_c_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_and_b32 v2, 0x7fffffff, v2
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_abs_c_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %abs.c = call float @llvm.fabs.f32(float %c)
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's first operand is a
+; shufflevector placing element 1 of %a in both lanes. Only the GFX906 expectation
+; folds the shuffle into op_sel:[1,0,0]; the others perform it explicitly (v_perm_b32
+; with selector 0x7060302, or v_mov_b16 v0.l, v0.h for GFX11) before the dot products.
+define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_opsel_lo_a_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_opsel_lo_a_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_opsel_lo_a_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_opsel_lo_a_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.l, v0.h
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_opsel_lo_a_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's first operand is a
+; shufflevector placing element 0 of %a in both lanes. Only the GFX906 expectation
+; folds the shuffle into op_sel_hi:[0,1,1]; the others perform it explicitly
+; (v_perm_b32 with selector 0x5040100, or v_mov_b16 v0.h, v0.l for GFX11) before the
+; dot products.
+define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_opsel_hi_a_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_opsel_hi_a_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v0, v0, v0, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_opsel_hi_a_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_opsel_hi_a_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v0.h, v0.l
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_opsel_hi_a_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's second operand is a
+; shufflevector placing element 1 of %b in both lanes. Only the GFX906 expectation
+; folds the shuffle into op_sel:[0,1,0]; the others perform it explicitly (v_perm_b32
+; with selector 0x7060302, or v_mov_b16 v1.l, v1.h for GFX11) before the dot products.
+define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_opsel_lo_b_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_opsel_lo_b_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x7060302
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_opsel_lo_b_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_opsel_lo_b_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.l, v1.h
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_opsel_lo_b_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's second operand is a
+; shufflevector placing element 0 of %b in both lanes. Only the GFX906 expectation
+; folds the shuffle into op_sel_hi:[1,0,1]; the others perform it explicitly
+; (v_perm_b32 with selector 0x5040100, or v_mov_b16 v1.h, v1.l for GFX11) before the
+; dot products.
+define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_opsel_hi_b_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1]
+; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_opsel_hi_b_dual:
+; GFX950: ; %bb.0:
+; GFX950: s_mov_b32 s0, 0x5040100
+; GFX950: v_perm_b32 v1, v1, v1, s0
+; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
+; GFX950: v_add_f32_e32 v0, v2, v5
+;
+; GFX10-LABEL: v_fdot2_opsel_hi_b_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX10: v_dot2c_f32_f16 v5, v3, v4
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_add_f32_e32 v0, v2, v5
+;
+; GFX11-LABEL: v_fdot2_opsel_hi_b_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b16_e32 v1.h, v1.l
+; GFX11: v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_add_f32_e32 v0, v2, v5
+;
+; GFX12-LABEL: v_fdot2_opsel_hi_b_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's first operand is the
+; constant vector <2.0, 2.0>. The GFX906 expectation encodes it as inline 2.0 with
+; op_sel_hi:[0,1,1]; every other expectation uses the 32-bit value 0x40004000 as the
+; source operand, including inside the GFX11 v_dual_dot2acc_f32_f16 pair.
+define float @v_fdot2_inline_literal_a_dual(<2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_inline_literal_a_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
+; GFX906: v_dot2_f32_f16 v1, v2, v3, v4
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_inline_literal_a_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_f16_e32 v1, 0x40004000, v0
+; GFX950: v_dot2c_f32_f16_e32 v4, v2, v3
+; GFX950: v_add_f32_e32 v0, v1, v4
+;
+; GFX10-LABEL: v_fdot2_inline_literal_a_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v1, 0x40004000, v0
+; GFX10: v_dot2c_f32_f16 v4, v2, v3
+; GFX10: v_add_f32_e32 v0, v1, v4
+;
+; GFX11-LABEL: v_fdot2_inline_literal_a_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_dual_dot2acc_f32_f16 v1, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v4, v2, v3
+; GFX11: v_add_f32_e32 v0, v1, v4
+;
+; GFX12-LABEL: v_fdot2_inline_literal_a_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, 0x40004000, v0, v1
+; GFX12: v_dot2_f32_f16 v1, v2, v3, v4
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's second operand is the
+; constant vector <2.0, 2.0>. The GFX906 expectation encodes it as inline 2.0 with
+; op_sel_hi:[1,0,1] and the GFX12 expectation keeps 0x40004000 in the second source
+; slot. The GFX950, GFX10 and GFX11 expectations instead show 0x40004000 in the first
+; source slot of the dot2c / dot2acc form, with the register operand second.
+define float @v_fdot2_inline_literal_b_dual(<2 x half> %a, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_inline_literal_b_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
+; GFX906: v_dot2_f32_f16 v1, v2, v3, v4
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_inline_literal_b_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_dot2c_f32_f16_e32 v1, 0x40004000, v0
+; GFX950: v_dot2c_f32_f16_e32 v4, v2, v3
+; GFX950: v_add_f32_e32 v0, v1, v4
+;
+; GFX10-LABEL: v_fdot2_inline_literal_b_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v1, 0x40004000, v0
+; GFX10: v_dot2c_f32_f16 v4, v2, v3
+; GFX10: v_add_f32_e32 v0, v1, v4
+;
+; GFX11-LABEL: v_fdot2_inline_literal_b_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_dual_dot2acc_f32_f16 v1, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v4, v2, v3
+; GFX11: v_add_f32_e32 v0, v1, v4
+;
+; GFX12-LABEL: v_fdot2_inline_literal_b_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, 0x40004000, v1
+; GFX12: v_dot2_f32_f16 v1, v2, v3, v4
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls (i1 false) where the first call's float operand is the
+; constant 2.0. The GFX906 and GFX12 expectations use 2.0 directly as the last operand
+; of v_dot2_f32_f16. The GFX950 and GFX10 expectations first move 2.0 into v5 for the
+; dot2c form. The GFX11 expectation pairs that move with the second dot product in
+; one v_dual line and emits the first dot product as a standalone v_dot2acc_f32_f16.
+; Both calls are plain calls: the first one is followed by further instructions, so
+; it is not in tail position.
+define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x half> %d, <2 x half> %e, float %f) {
+; GFX906-LABEL: v_fdot2_inline_literal_c_dual:
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX906: v_dot2_f32_f16 v1, v2, v3, v4
+; GFX906: v_add_f32_e32 v0, v0, v1
+;
+; GFX950-LABEL: v_fdot2_inline_literal_c_dual:
+; GFX950: ; %bb.0:
+; GFX950: v_mov_b32_e32 v5, 2.0
+; GFX950: v_dot2c_f32_f16_e32 v5, v0, v1
+; GFX950: v_dot2c_f32_f16_e32 v4, v2, v3
+; GFX950: v_add_f32_e32 v0, v5, v4
+;
+; GFX10-LABEL: v_fdot2_inline_literal_c_dual:
+; GFX10: ; %bb.0:
+; GFX10: v_mov_b32_e32 v5, 2.0
+; GFX10: v_dot2c_f32_f16 v4, v2, v3
+; GFX10: v_dot2c_f32_f16 v5, v0, v1
+; GFX10: v_add_f32_e32 v0, v5, v4
+;
+; GFX11-LABEL: v_fdot2_inline_literal_c_dual:
+; GFX11: ; %bb.0:
+; GFX11: v_dual_mov_b32 v5, 2.0 :: v_dual_dot2acc_f32_f16 v4, v2, v3
+; GFX11: v_dot2acc_f32_f16 v5, v0, v1
+; GFX11: v_add_f32_e32 v0, v5, v4
+;
+; GFX12-LABEL: v_fdot2_inline_literal_c_dual:
+; GFX12: ; %bb.0:
+; GFX12: v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX12: v_dot2_f32_f16 v1, v2, v3, v4
+; GFX12: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
+
+; Two summed fdot2 calls that both pass true as the final i1 argument. The shared GCN
+; expectation is two three-source v_dot2_f32_f16 instructions with the clamp modifier
+; followed by v_add_f32; no dot2c, dot2acc or v_dual form appears in it.
+define float @v_fdot2_clamp_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
+; GCN-LABEL: v_fdot2_clamp_dual:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GCN: v_dot2_f32_f16 v1, v3, v4, v5 clamp
+; GCN: v_add_f32_e32 v0, v0, v1
+ %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true)
+ %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 true)
+ %r = fadd float %r0, %r1
+ ret float %r
+}
More information about the llvm-commits
mailing list