[llvm] 7746596 - [SLP][X86] Add VBMI2 coverage for funnel shift tests
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 24 01:48:09 PST 2025
Author: Simon Pilgrim
Date: 2025-01-24T09:47:40Z
New Revision: 77465967130a502eb092a710a2f18be23ef2efff
URL: https://github.com/llvm/llvm-project/commit/77465967130a502eb092a710a2f18be23ef2efff
DIFF: https://github.com/llvm/llvm-project/commit/77465967130a502eb092a710a2f18be23ef2efff.diff
LOG: [SLP][X86] Add VBMI2 coverage for funnel shift tests
VBMI2 CPUs actually have native vector funnel shift (VPSHLD*/VPSHRD*) instruction support.
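
As context for the new AVX512VBMI2 RUN lines below, here is a minimal, standalone IR sketch (not part of this patch) of why VBMI2 coverage is interesting: with +avx512vbmi2 a variable vector funnel shift can be selected directly to the VPSHLDV* family instead of being expanded into shifts and ors, which is what makes vectorized llvm.fshl/llvm.fshr calls attractive to the SLP cost model on CPUs such as znver4. The function name, RUN invocation and CHECK line are illustrative assumptions, not taken from the commit.

; Hypothetical standalone codegen check (assumed invocation), for illustration only:
; RUN: llc -mtriple=x86_64-unknown -mattr=+avx512vbmi2 < %s | FileCheck %s
define <8 x i64> @fshl_demo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
; CHECK-LABEL: fshl_demo:
; CHECK: vpshldvq %zmm2, %zmm1, %zmm0
  ; With AVX512VBMI2 available this is expected to select a single variable
  ; funnel-shift instruction rather than a shift/or expansion.
  %r = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c)
  ret <8 x i64> %r
}
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)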
Added:
Modified:
llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll
Removed:
################################################################################
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 45294e581e6aea..856601d94fbfc2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -5,6 +5,7 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@a64 = common global [8 x i64] zeroinitializer, align 64
@@ -128,6 +129,13 @@ define void @fshl_v8i64() {
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX512-NEXT: store <8 x i64> [[TMP3]], ptr @d64, align 8
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v8i64(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
+; AVX512VBMI2-NEXT: store <8 x i64> [[TMP3]], ptr @d64, align 8
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -249,6 +257,13 @@ define void @fshl_v16i32() {
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; AVX512-NEXT: store <16 x i32> [[TMP3]], ptr @d32, align 4
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v16i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; AVX512VBMI2-NEXT: store <16 x i32> [[TMP3]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -335,6 +350,13 @@ define void @fshl_v32i16() {
; AVX512-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
; AVX512-NEXT: store <32 x i16> [[TMP3]], ptr @d16, align 2
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v32i16(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
+; AVX512VBMI2-NEXT: store <32 x i16> [[TMP3]], ptr @d16, align 2
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -504,6 +526,13 @@ define void @fshl_v64i8() {
; AVX512-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
; AVX512-NEXT: store <64 x i8> [[TMP3]], ptr @d8, align 1
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v64i8(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
+; AVX512VBMI2-NEXT: store <64 x i8> [[TMP3]], ptr @d8, align 1
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -811,6 +840,13 @@ define void @fshl_v2i32() {
; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v2i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; AVX512VBMI2-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -863,6 +899,12 @@ define void @fshl_v2i32_uniformconst() {
; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v2i32_uniformconst(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
+; AVX512VBMI2-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
index 609a9024e5bf7b..cd9348fcb5eed2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
@@ -5,6 +5,7 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@a64 = common global [8 x i64] zeroinitializer, align 64
@@ -129,6 +130,14 @@ define void @fshl_v8i64() {
; AVX512-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i64> [[TMP3]])
; AVX512-NEXT: store <8 x i64> [[TMP4]], ptr @d64, align 8
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v8i64(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @c64, align 8
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i64> [[TMP3]])
+; AVX512VBMI2-NEXT: store <8 x i64> [[TMP4]], ptr @d64, align 8
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -277,6 +286,14 @@ define void @fshl_v16i32() {
; AVX512-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]])
; AVX512-NEXT: store <16 x i32> [[TMP4]], ptr @d32, align 4
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v16i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @c32, align 4
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]])
+; AVX512VBMI2-NEXT: store <16 x i32> [[TMP4]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -405,6 +422,14 @@ define void @fshl_v32i16() {
; AVX512-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i16> [[TMP3]])
; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @d16, align 2
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v32i16(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr @c16, align 2
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i16> [[TMP3]])
+; AVX512VBMI2-NEXT: store <32 x i16> [[TMP4]], ptr @d16, align 2
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -613,6 +638,14 @@ define void @fshl_v64i8() {
; AVX512-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i8> [[TMP3]])
; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @d8, align 1
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshl_v64i8(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr @c8, align 1
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i8> [[TMP3]])
+; AVX512VBMI2-NEXT: store <64 x i8> [[TMP4]], ptr @d8, align 1
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
index d2002b4eedaf40..0eaa55e7ace602 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
@@ -5,6 +5,7 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@a64 = common global [8 x i64] zeroinitializer, align 64
@@ -128,6 +129,13 @@ define void @fshr_v8i64() {
; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
; AVX512-NEXT: store <8 x i64> [[TMP3]], ptr @d64, align 8
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v8i64(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
+; AVX512VBMI2-NEXT: store <8 x i64> [[TMP3]], ptr @d64, align 8
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -249,6 +257,13 @@ define void @fshr_v16i32() {
; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
; AVX512-NEXT: store <16 x i32> [[TMP3]], ptr @d32, align 4
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v16i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; AVX512VBMI2-NEXT: store <16 x i32> [[TMP3]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -335,6 +350,13 @@ define void @fshr_v32i16() {
; AVX512-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
; AVX512-NEXT: store <32 x i16> [[TMP3]], ptr @d16, align 2
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v32i16(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
+; AVX512VBMI2-NEXT: store <32 x i16> [[TMP3]], ptr @d16, align 2
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -504,6 +526,13 @@ define void @fshr_v64i8() {
; AVX512-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
; AVX512-NEXT: store <64 x i8> [[TMP3]], ptr @d8, align 1
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v64i8(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP1]], <64 x i8> [[TMP2]])
+; AVX512VBMI2-NEXT: store <64 x i8> [[TMP3]], ptr @d8, align 1
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1
@@ -811,6 +840,13 @@ define void @fshr_v2i32() {
; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v2i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]])
+; AVX512VBMI2-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -863,6 +899,12 @@ define void @fshr_v2i32_uniformconst() {
; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v2i32_uniformconst(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1))
+; AVX512VBMI2-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll
index 3dc7d164f5bc94..217f5e06c2983f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll
@@ -5,6 +5,7 @@
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
@a64 = common global [8 x i64] zeroinitializer, align 64
@@ -129,6 +130,14 @@ define void @fshr_v8i64() {
; AVX512-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i64> [[TMP3]])
; AVX512-NEXT: store <8 x i64> [[TMP4]], ptr @d64, align 8
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v8i64(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @a64, align 8
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @b64, align 8
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @c64, align 8
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i64> [[TMP3]])
+; AVX512VBMI2-NEXT: store <8 x i64> [[TMP4]], ptr @d64, align 8
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i64, ptr @a64, align 8
%a1 = load i64, ptr getelementptr inbounds ([8 x i64], ptr @a64, i32 0, i64 1), align 8
@@ -277,6 +286,14 @@ define void @fshr_v16i32() {
; AVX512-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]])
; AVX512-NEXT: store <16 x i32> [[TMP4]], ptr @d32, align 4
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v16i32(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @b32, align 4
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @c32, align 4
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]])
+; AVX512VBMI2-NEXT: store <16 x i32> [[TMP4]], ptr @d32, align 4
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4
%a1 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1 ), align 4
@@ -405,6 +422,14 @@ define void @fshr_v32i16() {
; AVX512-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i16> [[TMP3]])
; AVX512-NEXT: store <32 x i16> [[TMP4]], ptr @d16, align 2
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v32i16(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @a16, align 2
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @b16, align 2
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr @c16, align 2
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i16> [[TMP3]])
+; AVX512VBMI2-NEXT: store <32 x i16> [[TMP4]], ptr @d16, align 2
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 0 ), align 2
%a1 = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1 ), align 2
@@ -613,6 +638,14 @@ define void @fshr_v64i8() {
; AVX512-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i8> [[TMP3]])
; AVX512-NEXT: store <64 x i8> [[TMP4]], ptr @d8, align 1
; AVX512-NEXT: ret void
+;
+; AVX512VBMI2-LABEL: @fshr_v64i8(
+; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @a8, align 1
+; AVX512VBMI2-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @b8, align 1
+; AVX512VBMI2-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr @c8, align 1
+; AVX512VBMI2-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i8> [[TMP3]])
+; AVX512VBMI2-NEXT: store <64 x i8> [[TMP4]], ptr @d8, align 1
+; AVX512VBMI2-NEXT: ret void
;
%a0 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 0 ), align 1
%a1 = load i8, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 1 ), align 1