[llvm] 625e0a4 - [SLP][X86] Add missing SSE2/SSE4 checks from vector rotate tests
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 24 02:13:07 PST 2025
Author: Simon Pilgrim
Date: 2025-01-24T10:12:19Z
New Revision: 625e0a40f1a97d181a4641f604495b6aea433bd8
URL: https://github.com/llvm/llvm-project/commit/625e0a40f1a97d181a4641f604495b6aea433bd8
DIFF: https://github.com/llvm/llvm-project/commit/625e0a40f1a97d181a4641f604495b6aea433bd8.diff
LOG: [SLP][X86] Add missing SSE2/SSE4 checks from vector rotate tests
Added:
Modified:
llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
Removed:
################################################################################
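This change splits the shared SSE check prefix so the two SSE RUN lines diverge where their codegen differs: on the plain x86_64 baseline (SSE2) the v32i16 rotate tests remain scalar @llvm.fshl/@llvm.fshr calls, while the slm configuration (SSE4) vectorizes them into <8 x i16> intrinsic calls. As the NOTE lines in the tests state, these assertions are autogenerated by utils/update_test_checks.py; a typical regeneration command would look like the sketch below (the build/bin/opt path is an assumption about the local build layout):

    # Regenerate FileCheck assertions for both rotate tests
    # (opt binary location is hypothetical; point it at your build)
    llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
        llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll \
        llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll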
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 856601d94fbfc2..153191b1eea084 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
@@ -333,6 +333,156 @@ define void @fshl_v16i32() {
}
define void @fshl_v32i16() {
+; SSE2-LABEL: @fshl_v32i16(
+; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; SSE2-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; SSE2-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshl.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]])
+; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshl.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]])
+; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshl.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]])
+; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshl.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]])
+; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshl.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]])
+; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshl.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]])
+; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshl.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]])
+; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshl.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]])
+; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshl.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]])
+; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshl.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]])
+; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshl.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]])
+; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshl.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]])
+; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshl.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]])
+; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshl.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]])
+; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshl.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]])
+; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshl.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]])
+; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshl.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]])
+; SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshl.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]])
+; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshl.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]])
+; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshl.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]])
+; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshl.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]])
+; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshl.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]])
+; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshl.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]])
+; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshl.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]])
+; SSE2-NEXT: [[R24:%.*]] = call i16 @llvm.fshl.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]])
+; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshl.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]])
+; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshl.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]])
+; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshl.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]])
+; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshl.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]])
+; SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshl.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]])
+; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshl.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]])
+; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshl.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]])
+; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2
+; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2
+; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2
+; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2
+; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2
+; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2
+; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2
+; SSE2-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2
+; SSE2-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
+; SSE2-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2
+; SSE2-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2
+; SSE2-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2
+; SSE2-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2
+; SSE2-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2
+; SSE2-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2
+; SSE2-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2
+; SSE2-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
+; SSE2-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2
+; SSE2-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2
+; SSE2-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2
+; SSE2-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2
+; SSE2-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2
+; SSE2-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2
+; SSE2-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2
+; SSE2-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
+; SSE2-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2
+; SSE2-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2
+; SSE2-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2
+; SSE2-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2
+; SSE2-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2
+; SSE2-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2
+; SSE2-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2
+; SSE2-NEXT: ret void
+;
+; SSE4-LABEL: @fshl_v32i16(
+; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
+; SSE4-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
+; SSE4-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+; SSE4-NEXT: store <8 x i16> [[TMP3]], ptr @d16, align 2
+; SSE4-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; SSE4-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; SSE4-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
+; SSE4-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; SSE4-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; SSE4-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
+; SSE4-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
+; SSE4-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; SSE4-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; SSE4-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE4-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
+; SSE4-NEXT: ret void
+;
; AVX-LABEL: @fshl_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
index 0eaa55e7ace602..4d50ffad7f8b59 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
@@ -333,6 +333,156 @@ define void @fshr_v16i32() {
}
define void @fshr_v32i16() {
+; SSE2-LABEL: @fshr_v32i16(
+; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2
+; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2
+; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2
+; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2
+; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2
+; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2
+; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2
+; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2
+; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2
+; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2
+; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2
+; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2
+; SSE2-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2
+; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2
+; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2
+; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2
+; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2
+; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2
+; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2
+; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2
+; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2
+; SSE2-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2
+; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2
+; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2
+; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2
+; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2
+; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2
+; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2
+; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2
+; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2
+; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2
+; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2
+; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2
+; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2
+; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2
+; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2
+; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2
+; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2
+; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2
+; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2
+; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2
+; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2
+; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2
+; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2
+; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2
+; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2
+; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2
+; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2
+; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2
+; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2
+; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2
+; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2
+; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2
+; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2
+; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2
+; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2
+; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2
+; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2
+; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshr.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]])
+; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshr.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]])
+; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshr.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]])
+; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshr.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]])
+; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshr.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]])
+; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshr.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]])
+; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshr.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]])
+; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshr.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]])
+; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshr.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]])
+; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshr.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]])
+; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshr.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]])
+; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshr.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]])
+; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshr.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]])
+; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshr.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]])
+; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshr.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]])
+; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshr.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]])
+; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshr.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]])
+; SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshr.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]])
+; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshr.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]])
+; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshr.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]])
+; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshr.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]])
+; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshr.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]])
+; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshr.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]])
+; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshr.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]])
+; SSE2-NEXT: [[R24:%.*]] = call i16 @llvm.fshr.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]])
+; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshr.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]])
+; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshr.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]])
+; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshr.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]])
+; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshr.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]])
+; SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshr.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]])
+; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshr.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]])
+; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshr.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]])
+; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2
+; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2
+; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2
+; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2
+; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2
+; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2
+; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2
+; SSE2-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2
+; SSE2-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
+; SSE2-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2
+; SSE2-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2
+; SSE2-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2
+; SSE2-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2
+; SSE2-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2
+; SSE2-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2
+; SSE2-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2
+; SSE2-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
+; SSE2-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2
+; SSE2-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2
+; SSE2-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2
+; SSE2-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2
+; SSE2-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2
+; SSE2-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2
+; SSE2-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2
+; SSE2-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
+; SSE2-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2
+; SSE2-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2
+; SSE2-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2
+; SSE2-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2
+; SSE2-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2
+; SSE2-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2
+; SSE2-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2
+; SSE2-NEXT: ret void
+;
+; SSE4-LABEL: @fshr_v32i16(
+; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2
+; SSE4-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2
+; SSE4-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
+; SSE4-NEXT: store <8 x i16> [[TMP3]], ptr @d16, align 2
+; SSE4-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2
+; SSE4-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2
+; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; SSE4-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2
+; SSE4-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2
+; SSE4-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2
+; SSE4-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
+; SSE4-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2
+; SSE4-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2
+; SSE4-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2
+; SSE4-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
+; SSE4-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2
+; SSE4-NEXT: ret void
+;
; AVX-LABEL: @fshr_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @b16, align 2