[llvm] 3ba8dab - [AMDGPU] Add sdot4 / sdot8 intrinsics for gfx11
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 25 11:46:28 PDT 2023
Author: Jeffrey Byrnes
Date: 2023-08-25T11:45:55-07:00
New Revision: 3ba8dabbf31bbfff226ed03c0948386d58ab190a
URL: https://github.com/llvm/llvm-project/commit/3ba8dabbf31bbfff226ed03c0948386d58ab190a
DIFF: https://github.com/llvm/llvm-project/commit/3ba8dabbf31bbfff226ed03c0948386d58ab190a.diff
LOG: [AMDGPU] Add sdot4 / sdot8 intrinsics for gfx11
This provides a uniform way to lower into the relevant instructions across all generations.
Differential Revision: https://reviews.llvm.org/D158468
Change-Id: I1f7ba4b15ee470738535cf1c7d177a11fc471e43
Added:
Modified:
llvm/docs/AMDGPUUsage.rst
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
Removed:
################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 506d193d9806a0..c3b3927c4f0f7e 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1019,6 +1019,65 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
reduction will be performed using default iterative strategy.
Intrinsic is currently only implemented for i32.
+ llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
+ support such instructions. This performs unsigned dot product
+ with two v2i16 operands, summed with the third i32 operand. The
+ i1 fourth operand is used to clamp the output.
+
+ llvm.amdgcn.udot4 Provides direct access to v_dot4_u32_u8 across targets which
+ support such instructions. This performs unsigned dot product
+ with two i32 operands (holding a vector of 4 8bit values), summed
+ with the third i32 operand. The i1 fourth operand is used to clamp
+ the output.
+
+ llvm.amdgcn.udot8 Provides direct access to v_dot8_u32_u4 across targets which
+ support such instructions. This performs unsigned dot product
+ with two i32 operands (holding a vector of 8 4bit values), summed
+ with the third i32 operand. The i1 fourth operand is used to clamp
+ the output.
+
+ llvm.amdgcn.sdot2 Provides direct access to v_dot2_i32_i16 across targets which
+ support such instructions. This performs signed dot product
+ with two v2i16 operands, summed with the third i32 operand. The
+ i1 fourth operand is used to clamp the output.
+ When applicable (e.g. no clamping), this is lowered into
+ v_dot2c_i32_i16 for targets which support it.
+
+ llvm.amdgcn.sdot4 Provides direct access to v_dot4_i32_i8 across targets which
+ support such instructions. This performs signed dot product
+ with two i32 operands (holding a vector of 4 8bit values), summed
+ with the third i32 operand. The i1 fourth operand is used to clamp
+ the output.
+ When applicable (i.e. no clamping / operand modifiers), this is lowered
+ into v_dot4c_i32_i8 for targets which support it.
+ RDNA3 does not offer v_dot4_i32_i8, and rather offers
+ v_dot4_i32_iu8 which has operands to hold the signedness of the
+ vector operands. Thus, this intrinsic lowers to the signed version
+ of this instruction for gfx11 targets.
+
+ llvm.amdgcn.sdot8 Provides direct access to v_dot8_i32_i4 across targets which
+ support such instructions. This performs signed dot product
+ with two i32 operands (holding a vector of 8 4bit values), summed
+ with the third i32 operand. The i1 fourth operand is used to clamp
+ the output.
+ When applicable (i.e. no clamping / operand modifiers), this is lowered
+ into v_dot8c_i32_i4 for targets which support it.
+ RDNA3 does not offer v_dot8_i32_i4, and rather offers
+ v_dot8_i32_iu4 which has operands to hold the signedness of the
+ vector operands. Thus, this intrinsic lowers to the signed version
+ of this instruction for gfx11 targets.
+
+ llvm.amdgcn.sudot4 Provides direct access to v_dot4_i32_iu8 on gfx11 targets. This performs
+ dot product with two i32 operands (holding a vector of 4 8bit values), summed
+ with the fifth i32 operand. The i1 sixth operand is used to clamp
+ the output. The i1s preceding the vector operands decide the signedness.
+
+ llvm.amdgcn.sudot8 Provides direct access to v_dot8_i32_iu4 on gfx11 targets. This performs
+ dot product with two i32 operands (holding a vector of 8 4bit values), summed
+ with the fifth i32 operand. The i1 sixth operand is used to clamp
+ the output. The i1s preceding the vector operands decide the signedness.
+
+
============================================== ==========================================================
.. TODO::
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 67fdcb87d5791f..8844b36c6222f2 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -436,6 +436,20 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
let SubtargetPredicate = HasDot8Insts in {
defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;
+
+def : GCNPat < (int_amdgcn_sdot8 i32:$src0,
+ i32:$src1,
+ i32:$src2, (i1 timm:$clamp)),
+ (V_DOT8_I32_IU4 (i32 9), i32:$src0,
+ (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp)
+>;
+
+def : GCNPat < (int_amdgcn_sdot4 i32:$src0,
+ i32:$src1,
+ i32:$src2, (i1 timm:$clamp)),
+ (V_DOT4_I32_IU8 (i32 9), i32:$src0,
+ (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp)
+>;
} // End SubtargetPredicate = HasDot8Insts
def : UDot2Pat<V_DOT2_U32_U16>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
index aa20f3546d6520..08dbe29c5de4e5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
@@ -3,12 +3,14 @@
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp
; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+; GFX11: v_dot4_i32_iu8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
@@ -28,6 +30,7 @@ entry:
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp
; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: v_dot4c_i32_i8_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX11: v_dot4_i32_iu8 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0]{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
index a8cadaa8aaace0..0db62d4a74e4ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
@@ -4,6 +4,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
@@ -11,6 +12,7 @@ declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+; GFX11: v_dot8_i32_iu4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
@@ -31,6 +33,7 @@ entry:
; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; GFX11: v_dot8_i32_iu4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0]{{$}}
define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp(
ptr addrspace(1) %r,
ptr addrspace(1) %a,
More information about the llvm-commits
mailing list