[llvm] [X86][AVX-VNNI] Fix VNNI intrinsics argument types (PR #122649)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 12 14:17:39 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir
Author: BaiXilin (BaiXilin)
<details>
<summary>Changes</summary>
Fixed the mismatched VNNI intrinsic argument types so they align with the ISA specification.
The affected VNNI intrinsics are:
VPDPBUSD[,S]_128/256/512 and VPDPB[SS,SU,UU]D[,S]_128/256
Resolves issue #97271
---
Patch is 107.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122649.diff
10 Files Affected:
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+25-24)
- (modified) llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll (+36-36)
- (modified) llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics-upgrade.ll (+32-32)
- (modified) llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll (+28-28)
- (modified) llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll (+16-16)
- (modified) llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll (+14-14)
- (modified) llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll (+12-12)
- (modified) llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll (+60-60)
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll (+20-20)
- (modified) llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll (+60-60)
``````````diff
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 42b211e0e1f75a..dc08e3f06919cf 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1867,29 +1867,29 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_vpdpbusd_128 :
ClangBuiltin<"__builtin_ia32_vpdpbusd128">,
- DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v16i8_ty,
+ llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusd_256 :
ClangBuiltin<"__builtin_ia32_vpdpbusd256">,
- DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v32i8_ty,
+ llvm_v32i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusd_512 :
ClangBuiltin<"__builtin_ia32_vpdpbusd512">,
- DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v64i8_ty,
+ llvm_v64i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusds_128 :
ClangBuiltin<"__builtin_ia32_vpdpbusds128">,
- DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
- llvm_v4i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v16i8_ty,
+ llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusds_256 :
ClangBuiltin<"__builtin_ia32_vpdpbusds256">,
- DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v32i8_ty,
+ llvm_v32i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpbusds_512 :
ClangBuiltin<"__builtin_ia32_vpdpbusds512">,
- DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
- llvm_v16i32_ty], [IntrNoMem]>;
+ DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v64i8_ty,
+ llvm_v64i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpdpwssd_128 :
ClangBuiltin<"__builtin_ia32_vpdpwssd128">,
@@ -1916,65 +1916,66 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
ClangBuiltin<"__builtin_ia32_vpdpwssds512">,
DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
llvm_v16i32_ty], [IntrNoMem]>;
+
def int_x86_avx2_vpdpbssd_128
: ClangBuiltin<"__builtin_ia32_vpdpbssd128">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbssd_256
: ClangBuiltin<"__builtin_ia32_vpdpbssd256">,
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbssds_128
: ClangBuiltin<"__builtin_ia32_vpdpbssds128">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbssds_256
: ClangBuiltin<"__builtin_ia32_vpdpbssds256">,
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbsud_128
: ClangBuiltin<"__builtin_ia32_vpdpbsud128">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbsud_256
: ClangBuiltin<"__builtin_ia32_vpdpbsud256">,
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbsuds_128
: ClangBuiltin<"__builtin_ia32_vpdpbsuds128">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbsuds_256
: ClangBuiltin<"__builtin_ia32_vpdpbsuds256">,
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbuud_128
: ClangBuiltin<"__builtin_ia32_vpdpbuud128">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbuud_256
: ClangBuiltin<"__builtin_ia32_vpdpbuud256">,
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbuuds_128
: ClangBuiltin<"__builtin_ia32_vpdpbuuds128">,
DefaultAttrsIntrinsic<[llvm_v4i32_ty],
- [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpbuuds_256
: ClangBuiltin<"__builtin_ia32_vpdpbuuds256">,
DefaultAttrsIntrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty],
[IntrNoMem]>;
def int_x86_avx2_vpdpwsud_128
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index 31cec891c4cf38..ded1ddf8f64c82 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -101,7 +101,7 @@ declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x
; VNNI INT8
-define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) {
; X86-LABEL: test_mm_mask_dpbssd_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -113,13 +113,13 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
%bst = bitcast i4 %__U to <4 x i1>
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
ret <4 x i32> %res
}
-define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) {
; X86-LABEL: test_mm_maskz_dpbssds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -131,13 +131,13 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
%bst = bitcast i4 %__U to <4 x i1>
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
-define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) {
; X86-LABEL: test_mm256_maskz_dpbssds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -149,13 +149,13 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
%bst = bitcast i8 %__U to <8 x i1>
%res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
ret <8 x i32> %res
}
-define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) {
; X86-LABEL: test_mm256_mask_dpbssd_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -167,18 +167,18 @@ define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W,
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
%bst = bitcast i8 %__U to <8 x i1>
%res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) {
; X86-LABEL: test_mm_mask_dpbsud_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -190,13 +190,13 @@ define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
%bst = bitcast i4 %__U to <4 x i1>
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
ret <4 x i32> %res
}
-define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) {
; X86-LABEL: test_mm_maskz_dpbsuds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -208,13 +208,13 @@ define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
%bst = bitcast i4 %__U to <4 x i1>
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
-define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) {
; X86-LABEL: test_mm256_maskz_dpbsuds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -226,13 +226,13 @@ define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x51,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
%bst = bitcast i8 %__U to <8 x i1>
%res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
ret <8 x i32> %res
}
-define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) {
; X86-LABEL: test_mm256_mask_dpbsud_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -244,18 +244,18 @@ define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W,
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
%bst = bitcast i8 %__U to <8 x i1>
%res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) {
; X86-LABEL: test_mm_mask_dpbuud_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -267,13 +267,13 @@ define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
%bst = bitcast i4 %__U to <4 x i1>
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
ret <4 x i32> %res
}
-define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) {
; X86-LABEL: test_mm_maskz_dpbuuds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -285,13 +285,13 @@ define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
%bst = bitcast i4 %__U to <4 x i1>
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
-define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) {
; X86-LABEL: test_mm256_maskz_dpbuuds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -303,13 +303,13 @@ define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
%bst = bitcast i8 %__U to <8 x i1>
%res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
ret <8 x i32> %res
}
-define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) {
; X86-LABEL: test_mm256_mask_dpbuud_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -321...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/122649
More information about the llvm-commits
mailing list