[llvm-branch-commits] [clang] [compiler-rt] [llvm] [mlir] [mlir][test] Shard the Test Dialect (NFC) (PR #89628)
Jeff Niu via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Apr 22 10:03:01 PDT 2024
https://github.com/Mogball updated https://github.com/llvm/llvm-project/pull/89628
>From 6ad22c879aab88b6bb0531eeb3a6708a82f88cf6 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Mon, 22 Apr 2024 09:24:22 -0700
Subject: [PATCH 1/7] [compiler-rt][ctx_instr] Add `ctx_profile` component
(#89304)
Add the component structure for contextual instrumented PGO and the bump allocator + test.
(Tracking Issue: #89287, RFC referenced there)
---
compiler-rt/CMakeLists.txt | 2 +
.../cmake/Modules/AllSupportedArchDefs.cmake | 1 +
compiler-rt/cmake/config-ix.cmake | 11 +++
compiler-rt/lib/CMakeLists.txt | 4 ++
compiler-rt/lib/ctx_profile/CMakeLists.txt | 28 ++++++++
.../lib/ctx_profile/CtxInstrProfiling.cpp | 40 +++++++++++
.../lib/ctx_profile/CtxInstrProfiling.h | 55 +++++++++++++++
.../lib/ctx_profile/tests/CMakeLists.txt | 70 +++++++++++++++++++
.../tests/CtxInstrProfilingTest.cpp | 22 ++++++
compiler-rt/lib/ctx_profile/tests/driver.cpp | 14 ++++
10 files changed, 247 insertions(+)
create mode 100644 compiler-rt/lib/ctx_profile/CMakeLists.txt
create mode 100644 compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
create mode 100644 compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
create mode 100644 compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
create mode 100644 compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
create mode 100644 compiler-rt/lib/ctx_profile/tests/driver.cpp
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 8649507ce1c79b..6ce451e3cac2e3 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -50,6 +50,8 @@ option(COMPILER_RT_BUILD_LIBFUZZER "Build libFuzzer" ON)
mark_as_advanced(COMPILER_RT_BUILD_LIBFUZZER)
option(COMPILER_RT_BUILD_PROFILE "Build profile runtime" ON)
mark_as_advanced(COMPILER_RT_BUILD_PROFILE)
+option(COMPILER_RT_BUILD_CTX_PROFILE "Build ctx profile runtime" ON)
+mark_as_advanced(COMPILER_RT_BUILD_CTX_PROFILE)
option(COMPILER_RT_BUILD_MEMPROF "Build memory profiling runtime" ON)
mark_as_advanced(COMPILER_RT_BUILD_MEMPROF)
option(COMPILER_RT_BUILD_XRAY_NO_PREINIT "Build xray with no preinit patching" OFF)
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index 423171532c2028..2fe06273a814c7 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -66,6 +66,7 @@ set(ALL_MEMPROF_SUPPORTED_ARCH ${X86_64})
set(ALL_PROFILE_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${PPC32} ${PPC64}
${MIPS32} ${MIPS64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
${RISCV32} ${RISCV64} ${LOONGARCH64})
+set(ALL_CTX_PROFILE_SUPPORTED_ARCH ${X86_64})
set(ALL_TSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${PPC64} ${S390X}
${LOONGARCH64} ${RISCV64})
set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index b281ac64f5d5c7..ba740af9e1d60f 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -632,6 +632,9 @@ if(APPLE)
list_intersect(PROFILE_SUPPORTED_ARCH
ALL_PROFILE_SUPPORTED_ARCH
SANITIZER_COMMON_SUPPORTED_ARCH)
+ list_intersect(CTX_PROFILE_SUPPORTED_ARCH
+ ALL_CTX_PROFILE_SUPPORTED_ARCH
+ SANITIZER_COMMON_SUPPORTED_ARCH)
list_intersect(TSAN_SUPPORTED_ARCH
ALL_TSAN_SUPPORTED_ARCH
SANITIZER_COMMON_SUPPORTED_ARCH)
@@ -678,6 +681,7 @@ else()
filter_available_targets(HWASAN_SUPPORTED_ARCH ${ALL_HWASAN_SUPPORTED_ARCH})
filter_available_targets(MEMPROF_SUPPORTED_ARCH ${ALL_MEMPROF_SUPPORTED_ARCH})
filter_available_targets(PROFILE_SUPPORTED_ARCH ${ALL_PROFILE_SUPPORTED_ARCH})
+ filter_available_targets(CTX_PROFILE_SUPPORTED_ARCH ${ALL_CTX_PROFILE_SUPPORTED_ARCH})
filter_available_targets(TSAN_SUPPORTED_ARCH ${ALL_TSAN_SUPPORTED_ARCH})
filter_available_targets(UBSAN_SUPPORTED_ARCH ${ALL_UBSAN_SUPPORTED_ARCH})
filter_available_targets(SAFESTACK_SUPPORTED_ARCH
@@ -803,6 +807,13 @@ else()
set(COMPILER_RT_HAS_PROFILE FALSE)
endif()
+if (COMPILER_RT_HAS_SANITIZER_COMMON AND CTX_PROFILE_SUPPORTED_ARCH AND
+ OS_NAME MATCHES "Linux")
+ set(COMPILER_RT_HAS_CTX_PROFILE TRUE)
+else()
+ set(COMPILER_RT_HAS_CTX_PROFILE FALSE)
+endif()
+
if (COMPILER_RT_HAS_SANITIZER_COMMON AND TSAN_SUPPORTED_ARCH)
if (OS_NAME MATCHES "Linux|Darwin|FreeBSD|NetBSD")
set(COMPILER_RT_HAS_TSAN TRUE)
diff --git a/compiler-rt/lib/CMakeLists.txt b/compiler-rt/lib/CMakeLists.txt
index 43ba9a102c8487..f9e96563b88090 100644
--- a/compiler-rt/lib/CMakeLists.txt
+++ b/compiler-rt/lib/CMakeLists.txt
@@ -51,6 +51,10 @@ if(COMPILER_RT_BUILD_PROFILE AND COMPILER_RT_HAS_PROFILE)
compiler_rt_build_runtime(profile)
endif()
+if(COMPILER_RT_BUILD_CTX_PROFILE AND COMPILER_RT_HAS_CTX_PROFILE)
+ compiler_rt_build_runtime(ctx_profile)
+endif()
+
if(COMPILER_RT_BUILD_XRAY)
compiler_rt_build_runtime(xray)
endif()
diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
new file mode 100644
index 00000000000000..621b7d30b76d41
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -0,0 +1,28 @@
+add_compiler_rt_component(ctx_profile)
+
+set(CTX_PROFILE_SOURCES
+ CtxInstrProfiling.cpp
+ )
+
+set(CTX_PROFILE_HEADERS
+ CtxInstrProfiling.h
+ )
+
+include_directories(..)
+include_directories(../../include)
+
+# We don't use the C++ Standard Library here, so avoid including it by mistake.
+append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
+
+add_compiler_rt_runtime(clang_rt.ctx_profile
+ STATIC
+ ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
+ OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
+ CFLAGS ${EXTRA_FLAGS}
+ SOURCES ${CTX_PROFILE_SOURCES}
+ ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}
+ PARENT_TARGET ctx_profile)
+
+if(COMPILER_RT_INCLUDE_TESTS)
+ add_subdirectory(tests)
+endif()
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
new file mode 100644
index 00000000000000..7620ce92f7ebde
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -0,0 +1,40 @@
+//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CtxInstrProfiling.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_mutex.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
+#include "sanitizer_common/sanitizer_thread_safety.h"
+
+#include <assert.h>
+
+using namespace __ctx_profile;
+
+// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
+// the dependency on the latter.
+Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
+ assert(!Prev || Prev->Next == nullptr);
+ Arena *NewArena =
+ new (__sanitizer::InternalAlloc(Size + sizeof(Arena))) Arena(Size);
+ if (Prev)
+ Prev->Next = NewArena;
+ return NewArena;
+}
+
+void Arena::freeArenaList(Arena *&A) {
+ assert(A);
+ for (auto *I = A; I != nullptr;) {
+ auto *Current = I;
+ I = I->Next;
+ __sanitizer::InternalFree(Current);
+ }
+ A = nullptr;
+}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
new file mode 100644
index 00000000000000..c1789c32a64c25
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -0,0 +1,55 @@
+/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO ---------===*\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+
+#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
+#define CTX_PROFILE_CTXINSTRPROFILING_H_
+
+#include <sanitizer/common_interface_defs.h>
+
+namespace __ctx_profile {
+
+/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
+/// Allocation and de-allocation happen using sanitizer APIs. We make that
+/// explicit.
+class Arena final {
+public:
+ // When allocating a new Arena, optionally specify an existing one to append
+ // to, assumed to be the last in the Arena list. We only need to support
+ // appending to the arena list.
+ static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
+ static void freeArenaList(Arena *&A);
+
+ uint64_t size() const { return Size; }
+
+ // Allocate S bytes or return nullptr if we don't have that many available.
+ char *tryBumpAllocate(size_t S) {
+ if (Pos + S > Size)
+ return nullptr;
+ Pos += S;
+ return start() + (Pos - S);
+ }
+
+ Arena *next() const { return Next; }
+
+ // the beginning of allocatable memory.
+ const char *start() const { return const_cast<Arena *>(this)->start(); }
+ const char *pos() const { return start() + Pos; }
+
+private:
+ explicit Arena(uint32_t Size) : Size(Size) {}
+ ~Arena() = delete;
+
+ char *start() { return reinterpret_cast<char *>(&this[1]); }
+
+ Arena *Next = nullptr;
+ uint64_t Pos = 0;
+ const uint64_t Size;
+};
+
+} // namespace __ctx_profile
+#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
diff --git a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
new file mode 100644
index 00000000000000..93b41b838445d1
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
@@ -0,0 +1,70 @@
+include(CheckCXXCompilerFlag)
+include(CompilerRTCompile)
+include(CompilerRTLink)
+
+set(CTX_PROFILE_UNITTEST_CFLAGS
+ ${COMPILER_RT_UNITTEST_CFLAGS}
+ ${COMPILER_RT_GTEST_CFLAGS}
+ ${COMPILER_RT_GMOCK_CFLAGS}
+ ${SANITIZER_TEST_CXX_CFLAGS}
+ -I${COMPILER_RT_SOURCE_DIR}/lib/
+ -DSANITIZER_COMMON_NO_REDEFINE_BUILTINS
+ -O2
+ -g
+ -fno-rtti
+ -Wno-pedantic
+ -fno-omit-frame-pointer)
+
+# Suppress warnings for gmock variadic macros for clang and gcc respectively.
+append_list_if(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG -Wno-gnu-zero-variadic-macro-arguments CTX_PROFILE_UNITTEST_CFLAGS)
+append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros CTX_PROFILE_UNITTEST_CFLAGS)
+
+file(GLOB PROFILE_HEADERS ../*.h)
+
+set(CTX_PROFILE_SOURCES
+ ../CtxInstrProfiling.cpp)
+
+set(CTX_PROFILE_UNITTESTS
+ CtxInstrProfilingTest.cpp
+ driver.cpp)
+
+include_directories(../../../include)
+
+set(CTX_PROFILE_UNIT_TEST_HEADERS
+ ${CTX_PROFILE_HEADERS})
+
+set(CTX_PROFILE_UNITTEST_LINK_FLAGS
+ ${COMPILER_RT_UNITTEST_LINK_FLAGS})
+
+list(APPEND CTX_PROFILE_UNITTEST_LINK_FLAGS -pthread)
+
+set(CTX_PROFILE_UNITTEST_LINK_LIBRARIES
+ ${COMPILER_RT_UNWINDER_LINK_LIBS}
+ ${SANITIZER_TEST_CXX_LIBRARIES})
+list(APPEND CTX_PROFILE_UNITTEST_LINK_LIBRARIES "dl")
+
+if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST CTX_PROFILE_SUPPORTED_ARCH)
+ # Profile unit tests are only run on the host machine.
+ set(arch ${COMPILER_RT_DEFAULT_TARGET_ARCH})
+
+ add_executable(CtxProfileUnitTests
+ ${CTX_PROFILE_UNITTESTS}
+ ${COMPILER_RT_GTEST_SOURCE}
+ ${COMPILER_RT_GMOCK_SOURCE}
+ ${CTX_PROFILE_SOURCES}
+ $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommonCoverage.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.${arch}>)
+ set_target_compile_flags(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_CFLAGS})
+ set_target_link_flags(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_LINK_FLAGS})
+ target_link_libraries(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_LINK_LIBRARIES})
+
+ if (TARGET cxx-headers OR HAVE_LIBCXX)
+ add_dependencies(CtxProfileUnitTests cxx-headers)
+ endif()
+
+ set_target_properties(CtxProfileUnitTests PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
new file mode 100644
index 00000000000000..44f37d25763206
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -0,0 +1,22 @@
+#include "../CtxInstrProfiling.h"
+#include "gtest/gtest.h"
+
+using namespace __ctx_profile;
+
+TEST(ArenaTest, Basic) {
+ Arena *A = Arena::allocateNewArena(1024);
+ EXPECT_EQ(A->size(), 1024U);
+ EXPECT_EQ(A->next(), nullptr);
+
+ auto *M1 = A->tryBumpAllocate(1020);
+ EXPECT_NE(M1, nullptr);
+ auto *M2 = A->tryBumpAllocate(4);
+ EXPECT_NE(M2, nullptr);
+ EXPECT_EQ(M1 + 1020, M2);
+ EXPECT_EQ(A->tryBumpAllocate(1), nullptr);
+ Arena *A2 = Arena::allocateNewArena(2024, A);
+ EXPECT_EQ(A->next(), A2);
+ EXPECT_EQ(A2->next(), nullptr);
+ Arena::freeArenaList(A);
+ EXPECT_EQ(A, nullptr);
+}
diff --git a/compiler-rt/lib/ctx_profile/tests/driver.cpp b/compiler-rt/lib/ctx_profile/tests/driver.cpp
new file mode 100644
index 00000000000000..b402cec1126b33
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/tests/driver.cpp
@@ -0,0 +1,14 @@
+//===-- driver.cpp ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+
+int main(int argc, char **argv) {
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
>From 0ab0c1d982876662a45adb9bafaa3c2d3bdf1939 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 22 Apr 2024 12:31:57 -0400
Subject: [PATCH 2/7] [SLP]Introduce transformNodes() and transform loads +
reverse to strided loads.
Introduced transformNodes() function to perform transformation of the
nodes (cost-based, instruction count based, etc.).
Implemented transformation of consecutive loads + reverse order to
strided loads with stride -1, if profitable.
Reviewers: RKSimon, preames, topperc
Reviewed By: RKSimon
Pull Request: https://github.com/llvm/llvm-project/pull/88530
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 43 +++++++++++++++++++
.../RISCV/strided-loads-vectorized.ll | 5 +--
2 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 685a5907d94fe7..6ac380a6ab6c6c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1126,6 +1126,9 @@ class BoUpSLP {
void
buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
+ /// Transforms graph nodes to target specific representations, if profitable.
+ void transformNodes();
+
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
VectorizableTree.clear();
@@ -7813,6 +7816,43 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
return std::make_pair(ScalarCost, VecCost);
}
+void BoUpSLP::transformNodes() {
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+ TreeEntry &E = *TE.get();
+ switch (E.getOpcode()) {
+ case Instruction::Load: {
+ Type *ScalarTy = E.getMainOp()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
+ Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
+ // Check if profitable to represent consecutive load + reverse as strided
+ // load with stride -1.
+ if (isReverseOrder(E.ReorderIndices) &&
+ TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
+ SmallVector<int> Mask;
+ inversePermutation(E.ReorderIndices, Mask);
+ auto *BaseLI = cast<LoadInst>(E.Scalars.back());
+ InstructionCost OriginalVecCost =
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
+ BaseLI->getPointerAddressSpace(), CostKind,
+ TTI::OperandValueInfo()) +
+ ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+ InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+ Instruction::Load, VecTy, BaseLI->getPointerOperand(),
+ /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
+ if (StridedCost < OriginalVecCost)
+ // Strided load is more profitable than consecutive load + reverse -
+ // transform the node to strided load.
+ E.State = TreeEntry::StridedVectorize;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
@@ -15189,6 +15229,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.buildExternalUses();
R.computeMinimumValueSizes();
+ R.transformNodes();
InstructionCost Cost = R.getTreeCost();
@@ -15567,6 +15608,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
R.buildExternalUses();
R.computeMinimumValueSizes();
+ R.transformNodes();
InstructionCost Cost = R.getTreeCost();
CandidateFound = true;
MinCost = std::min(MinCost, Cost);
@@ -16563,6 +16605,7 @@ class HorizontalReduction {
V.buildExternalUses(LocalExternallyUsedValues);
V.computeMinimumValueSizes();
+ V.transformNodes();
// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 03acc0009fb04c..44d320c75fedd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -240,11 +240,10 @@ define void @test3(ptr %p, ptr noalias %s) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
; CHECK-NEXT: [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
; CHECK-NEXT: store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT: ret void
>From 832d3a42c34eee2a6ca323ef97a1c6fe14c1f651 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 22 Apr 2024 17:18:43 +0100
Subject: [PATCH 3/7] [X86] gfni-funnel-shifts.ll - add vXi8
variable/splat/constant test coverage
Once #89115 has landed, we can handle per-element rotates as well using (V)GF2P8MULB
---
llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 2686 ++++++++++++++++++-
1 file changed, 2684 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index a98983e44d3d0c..0c341dc63a9ecc 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -9,6 +9,486 @@
; 128 Bit Vector Funnel Shifts
;
+define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshl_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT: pxor %xmm3, %xmm3
+; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; GFNISSE-NEXT: paddd %xmm6, %xmm2
+; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm2
+; GFNISSE-NEXT: pslld $23, %xmm3
+; GFNISSE-NEXT: paddd %xmm6, %xmm3
+; GFNISSE-NEXT: cvttps2dq %xmm3, %xmm3
+; GFNISSE-NEXT: packusdw %xmm2, %xmm3
+; GFNISSE-NEXT: movdqa %xmm1, %xmm7
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; GFNISSE-NEXT: pmullw %xmm3, %xmm7
+; GFNISSE-NEXT: psrlw $8, %xmm7
+; GFNISSE-NEXT: pslld $23, %xmm4
+; GFNISSE-NEXT: paddd %xmm6, %xmm4
+; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm2
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm5
+; GFNISSE-NEXT: paddd %xmm6, %xmm5
+; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm3
+; GFNISSE-NEXT: packusdw %xmm3, %xmm2
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT: pmullw %xmm1, %xmm2
+; GFNISSE-NEXT: psrlw $8, %xmm2
+; GFNISSE-NEXT: packuswb %xmm7, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: var_fshl_v16i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vcvttps2dq %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: var_fshl_v16i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; GFNIAVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNIAVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; GFNIAVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; GFNIAVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; GFNIAVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; GFNIAVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; GFNIAVX2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vzeroupper
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: var_fshl_v16i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; GFNIAVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; GFNIAVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; GFNIAVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; GFNIAVX512VL-NEXT: vzeroupper
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: var_fshl_v16i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GFNIAVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT: vzeroupper
+; GFNIAVX512BW-NEXT: retq
+ %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshr_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm2, %xmm3
+; GFNISSE-NEXT: movdqa %xmm0, %xmm2
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: pand %xmm5, %xmm0
+; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: movdqa %xmm0, %xmm4
+; GFNISSE-NEXT: paddb %xmm0, %xmm4
+; GFNISSE-NEXT: movdqa %xmm1, %xmm6
+; GFNISSE-NEXT: psrlw $4, %xmm6
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm6
+; GFNISSE-NEXT: psrlw $2, %xmm6
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT: movdqa %xmm4, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm6
+; GFNISSE-NEXT: psrlw $1, %xmm6
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT: paddb %xmm4, %xmm4
+; GFNISSE-NEXT: movdqa %xmm4, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm6, %xmm1
+; GFNISSE-NEXT: pandn %xmm5, %xmm3
+; GFNISSE-NEXT: psllw $5, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm4
+; GFNISSE-NEXT: paddb %xmm3, %xmm4
+; GFNISSE-NEXT: paddb %xmm2, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm5
+; GFNISSE-NEXT: psllw $4, %xmm5
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm5, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm3
+; GFNISSE-NEXT: psllw $2, %xmm3
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT: movdqa %xmm4, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm3
+; GFNISSE-NEXT: paddb %xmm2, %xmm3
+; GFNISSE-NEXT: paddb %xmm4, %xmm4
+; GFNISSE-NEXT: movdqa %xmm4, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT: por %xmm1, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: var_fshr_v16i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX1-NEXT: vpsllw $5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm6
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm4
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm2
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; GFNIAVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: var_fshr_v16i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX2-NEXT: vpsllw $5, %xmm4, %xmm4
+; GFNIAVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5
+; GFNIAVX2-NEXT: vpsrlw $4, %xmm1, %xmm6
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; GFNIAVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpsrlw $2, %xmm1, %xmm4
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpsrlw $1, %xmm1, %xmm4
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpsllw $5, %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpsllw $4, %xmm0, %xmm4
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpsllw $2, %xmm0, %xmm2
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
+; GFNIAVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3
+; GFNIAVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: var_fshr_v16i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; GFNIAVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; GFNIAVX512VL-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; GFNIAVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; GFNIAVX512VL-NEXT: vzeroupper
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: var_fshr_v16i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GFNIAVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT: vzeroupper
+; GFNIAVX512BW-NEXT: retq
+ %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @splatvar_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshl_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm1, %xmm3
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT: psllw %xmm2, %xmm3
+; GFNISSE-NEXT: psrlw $8, %xmm3
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT: psllw %xmm2, %xmm1
+; GFNISSE-NEXT: psrlw $8, %xmm1
+; GFNISSE-NEXT: packuswb %xmm3, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX-LABEL: splatvar_fshl_v16i8:
+; GFNIAVX: # %bb.0:
+; GFNIAVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX-NEXT: vpsllw %xmm2, %xmm3, %xmm3
+; GFNIAVX-NEXT: vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; GFNIAVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT: retq
+ %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %splat)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @splatvar_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshr_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm1, %xmm4
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT: psrlw %xmm2, %xmm4
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; GFNISSE-NEXT: pand %xmm3, %xmm4
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT: psrlw %xmm2, %xmm1
+; GFNISSE-NEXT: pand %xmm1, %xmm3
+; GFNISSE-NEXT: packuswb %xmm4, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatvar_fshr_v16i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatvar_fshr_v16i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; GFNIAVX2-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: splatvar_fshr_v16i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; GFNIAVX512VL-NEXT: vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatvar_fshr_v16i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT: vzeroupper
+; GFNIAVX512BW-NEXT: retq
+ %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %splat)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @constant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshl_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm1, %xmm2
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT: psrlw $8, %xmm2
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: psrlw $8, %xmm1
+; GFNISSE-NEXT: packuswb %xmm2, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1OR2-LABEL: constant_fshl_v16i8:
+; GFNIAVX1OR2: # %bb.0:
+; GFNIAVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: constant_fshl_v16i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: constant_fshl_v16i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GFNIAVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT: vzeroupper
+; GFNIAVX512BW-NEXT: retq
+ %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @constant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshr_v16i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm1, %xmm2
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT: psrlw $8, %xmm2
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT: psrlw $8, %xmm1
+; GFNISSE-NEXT: packuswb %xmm2, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1OR2-LABEL: constant_fshr_v16i8:
+; GFNIAVX1OR2: # %bb.0:
+; GFNIAVX1OR2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: constant_fshr_v16i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: constant_fshr_v16i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GFNIAVX512BW-NEXT: vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT: vzeroupper
+; GFNIAVX512BW-NEXT: retq
+ %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+ ret <16 x i8> %res
+}
+
define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; GFNISSE-LABEL: splatconstant_fshl_v16i8:
; GFNISSE: # %bb.0:
@@ -71,6 +551,788 @@ declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
; 256 Bit Vector Funnel Shifts
;
+define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshl_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm0, %xmm6
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: pand %xmm8, %xmm4
+; GFNISSE-NEXT: pxor %xmm7, %xmm7
+; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNISSE-NEXT: movdqa %xmm4, %xmm10
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm10
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; GFNISSE-NEXT: paddd %xmm4, %xmm10
+; GFNISSE-NEXT: cvttps2dq %xmm10, %xmm10
+; GFNISSE-NEXT: pslld $23, %xmm11
+; GFNISSE-NEXT: paddd %xmm4, %xmm11
+; GFNISSE-NEXT: cvttps2dq %xmm11, %xmm11
+; GFNISSE-NEXT: packusdw %xmm10, %xmm11
+; GFNISSE-NEXT: movdqa %xmm2, %xmm10
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15]
+; GFNISSE-NEXT: pmullw %xmm11, %xmm10
+; GFNISSE-NEXT: psrlw $8, %xmm10
+; GFNISSE-NEXT: pslld $23, %xmm0
+; GFNISSE-NEXT: paddd %xmm4, %xmm0
+; GFNISSE-NEXT: cvttps2dq %xmm0, %xmm0
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm9
+; GFNISSE-NEXT: paddd %xmm4, %xmm9
+; GFNISSE-NEXT: cvttps2dq %xmm9, %xmm9
+; GFNISSE-NEXT: packusdw %xmm9, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; GFNISSE-NEXT: pmullw %xmm2, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: packuswb %xmm10, %xmm0
+; GFNISSE-NEXT: pand %xmm8, %xmm5
+; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm5
+; GFNISSE-NEXT: paddd %xmm4, %xmm5
+; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm5
+; GFNISSE-NEXT: pslld $23, %xmm7
+; GFNISSE-NEXT: paddd %xmm4, %xmm7
+; GFNISSE-NEXT: cvttps2dq %xmm7, %xmm7
+; GFNISSE-NEXT: packusdw %xmm5, %xmm7
+; GFNISSE-NEXT: movdqa %xmm3, %xmm5
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; GFNISSE-NEXT: pmullw %xmm7, %xmm5
+; GFNISSE-NEXT: psrlw $8, %xmm5
+; GFNISSE-NEXT: pslld $23, %xmm2
+; GFNISSE-NEXT: paddd %xmm4, %xmm2
+; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm2
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm6
+; GFNISSE-NEXT: paddd %xmm4, %xmm6
+; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm4
+; GFNISSE-NEXT: packusdw %xmm4, %xmm2
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT: pmullw %xmm3, %xmm2
+; GFNISSE-NEXT: psrlw $8, %xmm2
+; GFNISSE-NEXT: packuswb %xmm5, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: var_fshl_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; GFNIAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm3, %xmm7
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
+; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vcvttps2dq %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vcvttps2dq %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpackusdw %xmm4, %xmm9, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; GFNIAVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpackusdw %xmm6, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: var_fshl_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
+; GFNIAVX2-NEXT: vpsllvd %ymm7, %ymm5, %ymm5
+; GFNIAVX2-NEXT: vpsrld $16, %ymm5, %ymm5
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
+; GFNIAVX2-NEXT: vpsllvd %ymm6, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpackusdw %ymm5, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
+; GFNIAVX2-NEXT: vpsllvd %ymm5, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
+; GFNIAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: var_fshl_v32i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
+; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: var_fshl_v32i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshr_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm4, %xmm6
+; GFNISSE-NEXT: movdqa %xmm0, %xmm4
+; GFNISSE-NEXT: movdqa %xmm2, %xmm9
+; GFNISSE-NEXT: psrlw $4, %xmm9
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNISSE-NEXT: pand %xmm8, %xmm9
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
+; GFNISSE-NEXT: pand %xmm7, %xmm0
+; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm9, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm10
+; GFNISSE-NEXT: psrlw $2, %xmm10
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNISSE-NEXT: pand %xmm9, %xmm10
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm10, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm11
+; GFNISSE-NEXT: psrlw $1, %xmm11
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNISSE-NEXT: pand %xmm10, %xmm11
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm11, %xmm2
+; GFNISSE-NEXT: paddb %xmm4, %xmm4
+; GFNISSE-NEXT: movdqa %xmm4, %xmm12
+; GFNISSE-NEXT: psllw $4, %xmm12
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNISSE-NEXT: pand %xmm11, %xmm12
+; GFNISSE-NEXT: pandn %xmm7, %xmm6
+; GFNISSE-NEXT: psllw $5, %xmm6
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm4
+; GFNISSE-NEXT: movdqa %xmm4, %xmm13
+; GFNISSE-NEXT: psllw $2, %xmm13
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNISSE-NEXT: pand %xmm12, %xmm13
+; GFNISSE-NEXT: paddb %xmm6, %xmm6
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
+; GFNISSE-NEXT: movdqa %xmm4, %xmm13
+; GFNISSE-NEXT: paddb %xmm4, %xmm13
+; GFNISSE-NEXT: paddb %xmm6, %xmm6
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm4
+; GFNISSE-NEXT: por %xmm2, %xmm4
+; GFNISSE-NEXT: movdqa %xmm3, %xmm2
+; GFNISSE-NEXT: psrlw $4, %xmm2
+; GFNISSE-NEXT: pand %xmm8, %xmm2
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: pand %xmm7, %xmm0
+; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm2
+; GFNISSE-NEXT: psrlw $2, %xmm2
+; GFNISSE-NEXT: pand %xmm9, %xmm2
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm2
+; GFNISSE-NEXT: psrlw $1, %xmm2
+; GFNISSE-NEXT: pand %xmm10, %xmm2
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm3
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm2
+; GFNISSE-NEXT: psllw $4, %xmm2
+; GFNISSE-NEXT: pand %xmm11, %xmm2
+; GFNISSE-NEXT: pandn %xmm7, %xmm5
+; GFNISSE-NEXT: psllw $5, %xmm5
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm2
+; GFNISSE-NEXT: psllw $2, %xmm2
+; GFNISSE-NEXT: pand %xmm12, %xmm2
+; GFNISSE-NEXT: paddb %xmm5, %xmm5
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm2
+; GFNISSE-NEXT: paddb %xmm1, %xmm2
+; GFNISSE-NEXT: paddb %xmm5, %xmm5
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; GFNISSE-NEXT: por %xmm3, %xmm1
+; GFNISSE-NEXT: movdqa %xmm4, %xmm0
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: var_fshr_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm5, %xmm3
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm3, %xmm6
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
+; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm6, %xmm5, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm6, %xmm9
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm9, %xmm6, %xmm9
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm9, %xmm10
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm10, %xmm9, %xmm8
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpsllw $4, %xmm9, %xmm10
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX1-NEXT: vpand %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpsllw $5, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm10, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpsllw $2, %xmm9, %xmm10
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX1-NEXT: vpand %xmm12, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm10, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm10
+; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm10, %xmm9, %xmm7
+; GFNIAVX1-NEXT: vpor %xmm7, %xmm8, %xmm7
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm1, %xmm8
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm8, %xmm4
+; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm8
+; GFNIAVX1-NEXT: vpblendvb %xmm8, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm1, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpaddb %xmm8, %xmm8, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm1, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm11, %xmm4
+; GFNIAVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm3, %xmm12, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: var_fshr_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
+; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX2-NEXT: vpsrlw $4, %ymm1, %ymm6
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsrlw $2, %ymm1, %ymm4
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsrlw $1, %ymm1, %ymm4
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsllw $5, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsllw $4, %ymm0, %ymm4
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsllw $2, %ymm0, %ymm2
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; GFNIAVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: var_fshr_v32i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm4
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm4
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: var_fshr_v32i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @splatvar_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshl_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm2, %xmm5
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; GFNISSE-NEXT: psllw %xmm4, %xmm5
+; GFNISSE-NEXT: psrlw $8, %xmm5
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNISSE-NEXT: psllw %xmm4, %xmm2
+; GFNISSE-NEXT: psrlw $8, %xmm2
+; GFNISSE-NEXT: packuswb %xmm5, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT: psllw %xmm4, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT: psllw %xmm4, %xmm3
+; GFNISSE-NEXT: psrlw $8, %xmm3
+; GFNISSE-NEXT: packuswb %xmm0, %xmm3
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa %xmm3, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatvar_fshl_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatvar_fshl_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpsllw %xmm2, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512-LABEL: splatvar_fshl_v32i8:
+; GFNIAVX512: # %bb.0:
+; GFNIAVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512-NEXT: vpsllw %xmm2, %ymm3, %ymm3
+; GFNIAVX512-NEXT: vpsrlw $8, %ymm3, %ymm3
+; GFNIAVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX512-NEXT: retq
+ %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
+ %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %splat)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @splatvar_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshr_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm2, %xmm6
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; GFNISSE-NEXT: psrlw %xmm4, %xmm6
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; GFNISSE-NEXT: pand %xmm5, %xmm6
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNISSE-NEXT: psrlw %xmm4, %xmm2
+; GFNISSE-NEXT: pand %xmm5, %xmm2
+; GFNISSE-NEXT: packuswb %xmm6, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT: psrlw %xmm4, %xmm0
+; GFNISSE-NEXT: pand %xmm5, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT: psrlw %xmm4, %xmm3
+; GFNISSE-NEXT: pand %xmm3, %xmm5
+; GFNISSE-NEXT: packuswb %xmm0, %xmm5
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa %xmm5, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: splatvar_fshr_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: splatvar_fshr_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: splatvar_fshr_v32i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: splatvar_fshr_v32i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
+; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT: retq
+ %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
+ %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %splat)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @constant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshl_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm2, %xmm5
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT: pmullw %xmm6, %xmm5
+; GFNISSE-NEXT: psrlw $8, %xmm5
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT: pmullw %xmm4, %xmm2
+; GFNISSE-NEXT: psrlw $8, %xmm2
+; GFNISSE-NEXT: packuswb %xmm5, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT: pmullw %xmm6, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT: pmullw %xmm3, %xmm4
+; GFNISSE-NEXT: psrlw $8, %xmm4
+; GFNISSE-NEXT: packuswb %xmm0, %xmm4
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa %xmm4, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: constant_fshl_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: constant_fshl_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: constant_fshl_v32i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: constant_fshl_v32i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshr_v32i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm2, %xmm5
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT: pmullw %xmm6, %xmm5
+; GFNISSE-NEXT: psrlw $8, %xmm5
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT: pmullw %xmm4, %xmm2
+; GFNISSE-NEXT: psrlw $8, %xmm2
+; GFNISSE-NEXT: packuswb %xmm5, %xmm2
+; GFNISSE-NEXT: movdqa %xmm3, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT: pmullw %xmm6, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT: pmullw %xmm3, %xmm4
+; GFNISSE-NEXT: psrlw $8, %xmm4
+; GFNISSE-NEXT: packuswb %xmm0, %xmm4
+; GFNISSE-NEXT: movdqa %xmm2, %xmm0
+; GFNISSE-NEXT: movdqa %xmm4, %xmm1
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: constant_fshr_v32i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: constant_fshr_v32i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: constant_fshr_v32i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: constant_fshr_v32i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+ ret <32 x i8> %res
+}
+
define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; GFNISSE-LABEL: splatconstant_fshl_v32i8:
; GFNISSE: # %bb.0:
@@ -187,6 +1449,1428 @@ declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
; 512 Bit Vector Funnel Shifts
;
+define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshl_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm1, %xmm8
+; GFNISSE-NEXT: movdqa %xmm0, %xmm1
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; GFNISSE-NEXT: pand %xmm9, %xmm0
+; GFNISSE-NEXT: pxor %xmm10, %xmm10
+; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm0
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [1065353216,1065353216,1065353216,1065353216]
+; GFNISSE-NEXT: paddd %xmm11, %xmm0
+; GFNISSE-NEXT: cvttps2dq %xmm0, %xmm0
+; GFNISSE-NEXT: pslld $23, %xmm14
+; GFNISSE-NEXT: paddd %xmm11, %xmm14
+; GFNISSE-NEXT: cvttps2dq %xmm14, %xmm14
+; GFNISSE-NEXT: packusdw %xmm0, %xmm14
+; GFNISSE-NEXT: movdqa %xmm4, %xmm15
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; GFNISSE-NEXT: pmullw %xmm14, %xmm15
+; GFNISSE-NEXT: psrlw $8, %xmm15
+; GFNISSE-NEXT: pslld $23, %xmm12
+; GFNISSE-NEXT: paddd %xmm11, %xmm12
+; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm0
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm13
+; GFNISSE-NEXT: paddd %xmm11, %xmm13
+; GFNISSE-NEXT: cvttps2dq %xmm13, %xmm12
+; GFNISSE-NEXT: packusdw %xmm12, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; GFNISSE-NEXT: pmullw %xmm4, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: packuswb %xmm15, %xmm0
+; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
+; GFNISSE-NEXT: pand %xmm9, %xmm1
+; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm1
+; GFNISSE-NEXT: paddd %xmm11, %xmm1
+; GFNISSE-NEXT: cvttps2dq %xmm1, %xmm1
+; GFNISSE-NEXT: pslld $23, %xmm13
+; GFNISSE-NEXT: paddd %xmm11, %xmm13
+; GFNISSE-NEXT: cvttps2dq %xmm13, %xmm13
+; GFNISSE-NEXT: packusdw %xmm1, %xmm13
+; GFNISSE-NEXT: movdqa %xmm5, %xmm14
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15]
+; GFNISSE-NEXT: pmullw %xmm13, %xmm14
+; GFNISSE-NEXT: psrlw $8, %xmm14
+; GFNISSE-NEXT: pslld $23, %xmm4
+; GFNISSE-NEXT: paddd %xmm11, %xmm4
+; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm1
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm12
+; GFNISSE-NEXT: paddd %xmm11, %xmm12
+; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm4
+; GFNISSE-NEXT: packusdw %xmm4, %xmm1
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; GFNISSE-NEXT: pmullw %xmm5, %xmm1
+; GFNISSE-NEXT: psrlw $8, %xmm1
+; GFNISSE-NEXT: packuswb %xmm14, %xmm1
+; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4
+; GFNISSE-NEXT: pand %xmm9, %xmm4
+; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm4
+; GFNISSE-NEXT: paddd %xmm11, %xmm4
+; GFNISSE-NEXT: cvttps2dq %xmm4, %xmm4
+; GFNISSE-NEXT: pslld $23, %xmm12
+; GFNISSE-NEXT: paddd %xmm11, %xmm12
+; GFNISSE-NEXT: cvttps2dq %xmm12, %xmm12
+; GFNISSE-NEXT: packusdw %xmm4, %xmm12
+; GFNISSE-NEXT: movdqa %xmm6, %xmm13
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15]
+; GFNISSE-NEXT: pmullw %xmm12, %xmm13
+; GFNISSE-NEXT: psrlw $8, %xmm13
+; GFNISSE-NEXT: pslld $23, %xmm5
+; GFNISSE-NEXT: paddd %xmm11, %xmm5
+; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm4
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm8
+; GFNISSE-NEXT: paddd %xmm11, %xmm8
+; GFNISSE-NEXT: cvttps2dq %xmm8, %xmm5
+; GFNISSE-NEXT: packusdw %xmm5, %xmm4
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT: pmullw %xmm6, %xmm4
+; GFNISSE-NEXT: psrlw $8, %xmm4
+; GFNISSE-NEXT: packuswb %xmm13, %xmm4
+; GFNISSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
+; GFNISSE-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm9
+; GFNISSE-NEXT: paddd %xmm11, %xmm9
+; GFNISSE-NEXT: cvttps2dq %xmm9, %xmm8
+; GFNISSE-NEXT: pslld $23, %xmm5
+; GFNISSE-NEXT: paddd %xmm11, %xmm5
+; GFNISSE-NEXT: cvttps2dq %xmm5, %xmm5
+; GFNISSE-NEXT: packusdw %xmm8, %xmm5
+; GFNISSE-NEXT: movdqa %xmm7, %xmm8
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm3[8],xmm8[9],xmm3[9],xmm8[10],xmm3[10],xmm8[11],xmm3[11],xmm8[12],xmm3[12],xmm8[13],xmm3[13],xmm8[14],xmm3[14],xmm8[15],xmm3[15]
+; GFNISSE-NEXT: pmullw %xmm5, %xmm8
+; GFNISSE-NEXT: psrlw $8, %xmm8
+; GFNISSE-NEXT: pslld $23, %xmm2
+; GFNISSE-NEXT: paddd %xmm11, %xmm2
+; GFNISSE-NEXT: cvttps2dq %xmm2, %xmm5
+; GFNISSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT: pslld $23, %xmm6
+; GFNISSE-NEXT: paddd %xmm11, %xmm6
+; GFNISSE-NEXT: cvttps2dq %xmm6, %xmm2
+; GFNISSE-NEXT: packusdw %xmm2, %xmm5
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT: pmullw %xmm7, %xmm5
+; GFNISSE-NEXT: psrlw $8, %xmm5
+; GFNISSE-NEXT: packuswb %xmm8, %xmm5
+; GFNISSE-NEXT: movdqa %xmm4, %xmm2
+; GFNISSE-NEXT: movdqa %xmm5, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: var_fshl_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX1-NEXT: vandps %ymm7, %ymm4, %ymm8
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm8, %xmm9
+; GFNIAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm4, %xmm11
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpackusdw %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm11
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm12
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
+; GFNIAVX1-NEXT: vpmullw %xmm10, %xmm13, %xmm10
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vcvttps2dq %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vcvttps2dq %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpackusdw %xmm9, %xmm13, %xmm9
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; GFNIAVX1-NEXT: vpmullw %xmm9, %xmm11, %xmm9
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpackuswb %xmm10, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpackusdw %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vcvttps2dq %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm11, %xmm8
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT: vpmullw %xmm0, %xmm8, %xmm0
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm10, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vandps %ymm7, %ymm5, %ymm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vcvttps2dq %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpackusdw %xmm8, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm10, %xmm7
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vcvttps2dq %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vcvttps2dq %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpackusdw %xmm5, %xmm10, %xmm5
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm8, %xmm5
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpackuswb %xmm7, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNIAVX1-NEXT: vpslld $23, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; GFNIAVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT: vpslld $23, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vcvttps2dq %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpackusdw %xmm2, %xmm7, %xmm2
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: var_fshl_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15]
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT: vpand %ymm4, %ymm9, %ymm4
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[12],ymm6[12],ymm10[13],ymm6[13],ymm10[14],ymm6[14],ymm10[15],ymm6[15]
+; GFNIAVX2-NEXT: vpsllvd %ymm11, %ymm8, %ymm8
+; GFNIAVX2-NEXT: vpsrld $16, %ymm8, %ymm8
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11]
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11]
+; GFNIAVX2-NEXT: vpsllvd %ymm10, %ymm7, %ymm7
+; GFNIAVX2-NEXT: vpsrld $16, %ymm7, %ymm7
+; GFNIAVX2-NEXT: vpackusdw %ymm8, %ymm7, %ymm7
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm7, %ymm7
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15]
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15]
+; GFNIAVX2-NEXT: vpsllvd %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11]
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; GFNIAVX2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; GFNIAVX2-NEXT: vpand %ymm5, %ymm9, %ymm5
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15]
+; GFNIAVX2-NEXT: vpsllvd %ymm8, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpsrld $16, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
+; GFNIAVX2-NEXT: vpsllvd %ymm7, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpackusdw %ymm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15]
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
+; GFNIAVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15]
+; GFNIAVX2-NEXT: vpsllvd %ymm5, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpsrld $16, %ymm3, %ymm3
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11]
+; GFNIAVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; GFNIAVX2-NEXT: vpsllvd %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpackusdw %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: var_fshl_v64i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm5
+; GFNIAVX512VL-NEXT: vpsrlw $4, %ymm5, %ymm3
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm7
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT: vpandq %zmm8, %zmm2, %zmm2
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; GFNIAVX512VL-NEXT: vpxor %ymm3, %ymm8, %ymm9
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm9, %ymm9
+; GFNIAVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpsrlw $2, %ymm5, %ymm7
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX512VL-NEXT: vpand %ymm7, %ymm10, %ymm7
+; GFNIAVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
+; GFNIAVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm5, %ymm7
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
+; GFNIAVX512VL-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm7
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm6
+; GFNIAVX512VL-NEXT: vpxor %ymm2, %ymm8, %ymm7
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
+; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm6, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm6
+; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; GFNIAVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: var_fshl_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
+; GFNIAVX512BW-NEXT: vpsllvw %zmm5, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
+; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt)
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshr_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm7, %xmm10
+; GFNISSE-NEXT: movdqa %xmm6, %xmm7
+; GFNISSE-NEXT: movdqa %xmm5, %xmm6
+; GFNISSE-NEXT: movdqa %xmm4, %xmm5
+; GFNISSE-NEXT: movdqa %xmm3, %xmm4
+; GFNISSE-NEXT: movdqa %xmm2, %xmm3
+; GFNISSE-NEXT: movdqa %xmm1, %xmm2
+; GFNISSE-NEXT: movdqa %xmm0, %xmm1
+; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT: movdqa %xmm5, %xmm12
+; GFNISSE-NEXT: psrlw $4, %xmm12
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pand %xmm11, %xmm0
+; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm12, %xmm5
+; GFNISSE-NEXT: movdqa %xmm5, %xmm13
+; GFNISSE-NEXT: psrlw $2, %xmm13
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm13
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm13, %xmm5
+; GFNISSE-NEXT: movdqa %xmm5, %xmm14
+; GFNISSE-NEXT: psrlw $1, %xmm14
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm14, %xmm5
+; GFNISSE-NEXT: paddb %xmm1, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm15
+; GFNISSE-NEXT: psllw $4, %xmm15
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
+; GFNISSE-NEXT: movdqa %xmm11, %xmm12
+; GFNISSE-NEXT: pandn %xmm11, %xmm9
+; GFNISSE-NEXT: psllw $5, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm15, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm8
+; GFNISSE-NEXT: psllw $2, %xmm8
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT: paddb %xmm9, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
+; GFNISSE-NEXT: movdqa %xmm1, %xmm8
+; GFNISSE-NEXT: paddb %xmm1, %xmm8
+; GFNISSE-NEXT: paddb %xmm9, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm1
+; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT: movdqa %xmm6, %xmm8
+; GFNISSE-NEXT: psrlw $4, %xmm8
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm11 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNISSE-NEXT: pand %xmm11, %xmm8
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pand %xmm12, %xmm0
+; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6
+; GFNISSE-NEXT: movdqa %xmm6, %xmm8
+; GFNISSE-NEXT: psrlw $2, %xmm8
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm13 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNISSE-NEXT: pand %xmm13, %xmm8
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6
+; GFNISSE-NEXT: movdqa %xmm6, %xmm8
+; GFNISSE-NEXT: psrlw $1, %xmm8
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm14 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNISSE-NEXT: pand %xmm14, %xmm8
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm6
+; GFNISSE-NEXT: paddb %xmm2, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm8
+; GFNISSE-NEXT: psllw $4, %xmm8
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm15 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNISSE-NEXT: pand %xmm15, %xmm8
+; GFNISSE-NEXT: pandn %xmm12, %xmm9
+; GFNISSE-NEXT: psllw $5, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm8
+; GFNISSE-NEXT: psllw $2, %xmm8
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm0 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNISSE-NEXT: pand %xmm0, %xmm8
+; GFNISSE-NEXT: paddb %xmm9, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT: movdqa %xmm2, %xmm8
+; GFNISSE-NEXT: paddb %xmm2, %xmm8
+; GFNISSE-NEXT: paddb %xmm9, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT: movdqa %xmm7, %xmm8
+; GFNISSE-NEXT: psrlw $4, %xmm8
+; GFNISSE-NEXT: pand %xmm11, %xmm8
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pand %xmm12, %xmm0
+; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7
+; GFNISSE-NEXT: movdqa %xmm7, %xmm8
+; GFNISSE-NEXT: psrlw $2, %xmm8
+; GFNISSE-NEXT: pand %xmm13, %xmm8
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7
+; GFNISSE-NEXT: movdqa %xmm7, %xmm8
+; GFNISSE-NEXT: psrlw $1, %xmm8
+; GFNISSE-NEXT: pand %xmm14, %xmm8
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm7
+; GFNISSE-NEXT: paddb %xmm3, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm8
+; GFNISSE-NEXT: psllw $4, %xmm8
+; GFNISSE-NEXT: pand %xmm15, %xmm8
+; GFNISSE-NEXT: pandn %xmm12, %xmm9
+; GFNISSE-NEXT: psllw $5, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm8
+; GFNISSE-NEXT: psllw $2, %xmm8
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT: paddb %xmm9, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3
+; GFNISSE-NEXT: movdqa %xmm3, %xmm8
+; GFNISSE-NEXT: paddb %xmm3, %xmm8
+; GFNISSE-NEXT: paddb %xmm9, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm3
+; GFNISSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT: movdqa %xmm10, %xmm8
+; GFNISSE-NEXT: psrlw $4, %xmm8
+; GFNISSE-NEXT: pand %xmm11, %xmm8
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pand %xmm12, %xmm0
+; GFNISSE-NEXT: psllw $5, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm10
+; GFNISSE-NEXT: movdqa %xmm10, %xmm8
+; GFNISSE-NEXT: psrlw $2, %xmm8
+; GFNISSE-NEXT: pand %xmm13, %xmm8
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm10
+; GFNISSE-NEXT: movdqa %xmm10, %xmm8
+; GFNISSE-NEXT: psrlw $1, %xmm8
+; GFNISSE-NEXT: pand %xmm14, %xmm8
+; GFNISSE-NEXT: paddb %xmm0, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm10
+; GFNISSE-NEXT: paddb %xmm4, %xmm4
+; GFNISSE-NEXT: movdqa %xmm4, %xmm8
+; GFNISSE-NEXT: psllw $4, %xmm8
+; GFNISSE-NEXT: pand %xmm15, %xmm8
+; GFNISSE-NEXT: pandn %xmm12, %xmm9
+; GFNISSE-NEXT: psllw $5, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4
+; GFNISSE-NEXT: movdqa %xmm4, %xmm8
+; GFNISSE-NEXT: psllw $2, %xmm8
+; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT: paddb %xmm9, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4
+; GFNISSE-NEXT: movdqa %xmm4, %xmm8
+; GFNISSE-NEXT: paddb %xmm4, %xmm8
+; GFNISSE-NEXT: paddb %xmm9, %xmm9
+; GFNISSE-NEXT: movdqa %xmm9, %xmm0
+; GFNISSE-NEXT: pblendvb %xmm0, %xmm8, %xmm4
+; GFNISSE-NEXT: por %xmm5, %xmm1
+; GFNISSE-NEXT: por %xmm6, %xmm2
+; GFNISSE-NEXT: por %xmm7, %xmm3
+; GFNISSE-NEXT: por %xmm10, %xmm4
+; GFNISSE-NEXT: movdqa %xmm1, %xmm0
+; GFNISSE-NEXT: movdqa %xmm2, %xmm1
+; GFNISSE-NEXT: movdqa %xmm3, %xmm2
+; GFNISSE-NEXT: movdqa %xmm4, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: var_fshr_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm8, %xmm6
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX1-NEXT: vpand %xmm7, %xmm6, %xmm9
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX1-NEXT: vandps %ymm6, %ymm4, %ymm11
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm11, %xmm10
+; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm12
+; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm9, %xmm8, %xmm8
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm8, %xmm9
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm9, %xmm9
+; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12
+; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm9, %xmm8, %xmm9
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm9, %xmm13
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX1-NEXT: vpand %xmm8, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12
+; GFNIAVX1-NEXT: vpblendvb %xmm12, %xmm13, %xmm9, %xmm12
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; GFNIAVX1-NEXT: vpaddb %xmm9, %xmm9, %xmm13
+; GFNIAVX1-NEXT: vpsllw $4, %xmm13, %xmm14
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX1-NEXT: vpand %xmm9, %xmm14, %xmm14
+; GFNIAVX1-NEXT: vpxor %xmm6, %xmm10, %xmm10
+; GFNIAVX1-NEXT: vpsllw $5, %xmm10, %xmm15
+; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpsllw $2, %xmm13, %xmm14
+; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} xmm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX1-NEXT: vpand %xmm10, %xmm14, %xmm14
+; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15
+; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm14
+; GFNIAVX1-NEXT: vpaddb %xmm15, %xmm15, %xmm15
+; GFNIAVX1-NEXT: vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpor %xmm12, %xmm13, %xmm12
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm2, %xmm13
+; GFNIAVX1-NEXT: vpand %xmm7, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm14
+; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm2, %xmm13
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpaddb %xmm14, %xmm14, %xmm14
+; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm2, %xmm13
+; GFNIAVX1-NEXT: vpand %xmm8, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpaddb %xmm14, %xmm14, %xmm14
+; GFNIAVX1-NEXT: vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsllw $4, %xmm0, %xmm13
+; GFNIAVX1-NEXT: vpand %xmm9, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpxor %xmm6, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpsllw $5, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsllw $2, %xmm0, %xmm13
+; GFNIAVX1-NEXT: vpand %xmm10, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm13
+; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm11
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm11, %xmm2
+; GFNIAVX1-NEXT: vpand %xmm7, %xmm2, %xmm12
+; GFNIAVX1-NEXT: vandps %ymm6, %ymm5, %ymm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm13
+; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm11, %xmm12
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm12, %xmm12
+; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm11, %xmm12
+; GFNIAVX1-NEXT: vpand %xmm8, %xmm12, %xmm12
+; GFNIAVX1-NEXT: vpaddb %xmm13, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
+; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm12
+; GFNIAVX1-NEXT: vpsllw $4, %xmm12, %xmm13
+; GFNIAVX1-NEXT: vpand %xmm9, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpsllw $5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm12
+; GFNIAVX1-NEXT: vpsllw $2, %xmm12, %xmm13
+; GFNIAVX1-NEXT: vpand %xmm10, %xmm13, %xmm13
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm12
+; GFNIAVX1-NEXT: vpaddb %xmm12, %xmm12, %xmm13
+; GFNIAVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT: vpblendvb %xmm5, %xmm13, %xmm12, %xmm5
+; GFNIAVX1-NEXT: vpor %xmm5, %xmm11, %xmm5
+; GFNIAVX1-NEXT: vpsrlw $4, %xmm3, %xmm11
+; GFNIAVX1-NEXT: vpand %xmm7, %xmm11, %xmm7
+; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm11
+; GFNIAVX1-NEXT: vpblendvb %xmm11, %xmm7, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpsrlw $2, %xmm3, %xmm7
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm7, %xmm4
+; GFNIAVX1-NEXT: vpaddb %xmm11, %xmm11, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpsrlw $1, %xmm3, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm8, %xmm4
+; GFNIAVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT: vpblendvb %xmm7, %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsllw $4, %xmm1, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm9, %xmm4
+; GFNIAVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsllw $5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsllw $2, %xmm1, %xmm4
+; GFNIAVX1-NEXT: vpand %xmm4, %xmm10, %xmm4
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm4
+; GFNIAVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpor %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: var_fshr_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpsrlw $4, %ymm2, %ymm6
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX2-NEXT: vpand %ymm7, %ymm6, %ymm8
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT: vpand %ymm6, %ymm4, %ymm9
+; GFNIAVX2-NEXT: vpsllw $5, %ymm9, %ymm9
+; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsrlw $2, %ymm2, %ymm8
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX2-NEXT: vpand %ymm10, %ymm8, %ymm8
+; GFNIAVX2-NEXT: vpaddb %ymm9, %ymm9, %ymm9
+; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsrlw $1, %ymm2, %ymm8
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX2-NEXT: vpand %ymm11, %ymm8, %ymm8
+; GFNIAVX2-NEXT: vpaddb %ymm9, %ymm9, %ymm9
+; GFNIAVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsllw $4, %ymm0, %ymm8
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX2-NEXT: vpand %ymm9, %ymm8, %ymm8
+; GFNIAVX2-NEXT: vpandn %ymm6, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsllw $2, %ymm0, %ymm8
+; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX2-NEXT: vpand %ymm12, %ymm8, %ymm8
+; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm8
+; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $4, %ymm3, %ymm2
+; GFNIAVX2-NEXT: vpand %ymm7, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpand %ymm6, %ymm5, %ymm4
+; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; GFNIAVX2-NEXT: vpsrlw $2, %ymm2, %ymm3
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm10, %ymm3
+; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpsrlw $1, %ymm2, %ymm3
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm11, %ymm3
+; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsllw $4, %ymm1, %ymm3
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm9, %ymm3
+; GFNIAVX2-NEXT: vpandn %ymm6, %ymm5, %ymm4
+; GFNIAVX2-NEXT: vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsllw $2, %ymm1, %ymm3
+; GFNIAVX2-NEXT: vpand %ymm3, %ymm12, %ymm3
+; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3
+; GFNIAVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: var_fshr_v64i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm6
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT: vpandq %zmm7, %zmm2, %zmm2
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm3, %ymm8
+; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm6
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm6
+; GFNIAVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm6
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm10, %ymm6
+; GFNIAVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm8
+; GFNIAVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm6
+; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm5
+; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm9, %ymm5
+; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5
+; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm10, %ymm5
+; GFNIAVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsllw $4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT: vpxor %ymm7, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsllw $2, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX512VL-NEXT: vpand %ymm5, %ymm8, %ymm5
+; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpxor %ymm7, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT: vpand %ymm4, %ymm8, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: var_fshr_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm5, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT: vpandq %zmm5, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpandq %zmm5, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt)
+ ret <64 x i8> %res
+}
+
+; Funnel-shift-left of <64 x i8> where every lane uses the same shift amount
+; (a splat of lane 0 of %amt). The CHECK lines below are autogenerated
+; expected-asm for the GFNISSE / GFNIAVX1 / GFNIAVX2 / GFNIAVX512VL /
+; GFNIAVX512BW run configurations — regenerate rather than hand-edit.
+define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshl_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm9
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT:    psllw %xmm8, %xmm9
+; GFNISSE-NEXT:    psrlw $8, %xmm9
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm4
+; GFNISSE-NEXT:    psrlw $8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm9, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm6
+; GFNISSE-NEXT:    psrlw $8, %xmm6
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm7
+; GFNISSE-NEXT:    psrlw $8, %xmm7
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: splatvar_fshl_v64i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: splatvar_fshl_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpsllw %xmm4, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT:    vpsllw %xmm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT:    vpsllw %xmm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: splatvar_fshl_v64i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: splatvar_fshl_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512BW-NEXT:    vpsllw %xmm2, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT:    vpsllw %xmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
+; Broadcast lane 0 of %amt to all 64 lanes, then funnel-shift left.
+  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %splat)
+  ret <64 x i8> %res
+}
+
+; Funnel-shift-right counterpart of the test above: all 64 lanes shift by the
+; same splatted amount. Autogenerated expected-asm per GFNI configuration;
+; note the lowering masks each 16-bit half back to 8 bits (the 255-splat
+; constant) before repacking, unlike the fshl case which uses psrlw $8.
+define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshr_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm10
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm10
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; GFNISSE-NEXT:    pand %xmm8, %xmm10
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm4
+; GFNISSE-NEXT:    pand %xmm8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm5
+; GFNISSE-NEXT:    pand %xmm8, %xmm5
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm6
+; GFNISSE-NEXT:    pand %xmm8, %xmm6
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm7
+; GFNISSE-NEXT:    pand %xmm7, %xmm8
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm8, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: splatvar_fshr_v64i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm8
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpackuswb %xmm8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm7, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackuswb %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: splatvar_fshr_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpsrlw %xmm4, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpsrlw %xmm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT:    vpsrlw %xmm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: splatvar_fshr_v64i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: splatvar_fshr_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512BW-NEXT:    vpsrlw %xmm2, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
+; Broadcast lane 0 of %amt to all 64 lanes, then funnel-shift right.
+  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %splat)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @constant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshl_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm4, %xmm10
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT: pmullw %xmm9, %xmm10
+; GFNISSE-NEXT: psrlw $8, %xmm10
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT: pmullw %xmm8, %xmm4
+; GFNISSE-NEXT: psrlw $8, %xmm4
+; GFNISSE-NEXT: packuswb %xmm10, %xmm4
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT: pmullw %xmm9, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; GFNISSE-NEXT: pmullw %xmm8, %xmm5
+; GFNISSE-NEXT: psrlw $8, %xmm5
+; GFNISSE-NEXT: packuswb %xmm0, %xmm5
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; GFNISSE-NEXT: pmullw %xmm9, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT: pmullw %xmm8, %xmm6
+; GFNISSE-NEXT: psrlw $8, %xmm6
+; GFNISSE-NEXT: packuswb %xmm0, %xmm6
+; GFNISSE-NEXT: movdqa %xmm7, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; GFNISSE-NEXT: pmullw %xmm9, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT: pmullw %xmm7, %xmm8
+; GFNISSE-NEXT: psrlw $8, %xmm8
+; GFNISSE-NEXT: packuswb %xmm0, %xmm8
+; GFNISSE-NEXT: movdqa %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa %xmm5, %xmm1
+; GFNISSE-NEXT: movdqa %xmm6, %xmm2
+; GFNISSE-NEXT: movdqa %xmm8, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: constant_fshl_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: constant_fshl_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX2-NEXT: # ymm5 = mem[0,1,0,1]
+; GFNIAVX2-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: constant_fshl_v64i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31]
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: constant_fshl_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: retq
+ %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshr_v64i8:
+; GFNISSE: # %bb.0:
+; GFNISSE-NEXT: movdqa %xmm4, %xmm10
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT: pmullw %xmm9, %xmm10
+; GFNISSE-NEXT: psrlw $8, %xmm10
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT: pmullw %xmm8, %xmm4
+; GFNISSE-NEXT: psrlw $8, %xmm4
+; GFNISSE-NEXT: packuswb %xmm10, %xmm4
+; GFNISSE-NEXT: movdqa %xmm5, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT: pmullw %xmm9, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; GFNISSE-NEXT: pmullw %xmm8, %xmm5
+; GFNISSE-NEXT: psrlw $8, %xmm5
+; GFNISSE-NEXT: packuswb %xmm0, %xmm5
+; GFNISSE-NEXT: movdqa %xmm6, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; GFNISSE-NEXT: pmullw %xmm9, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT: pmullw %xmm8, %xmm6
+; GFNISSE-NEXT: psrlw $8, %xmm6
+; GFNISSE-NEXT: packuswb %xmm0, %xmm6
+; GFNISSE-NEXT: movdqa %xmm7, %xmm0
+; GFNISSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; GFNISSE-NEXT: pmullw %xmm9, %xmm0
+; GFNISSE-NEXT: psrlw $8, %xmm0
+; GFNISSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT: pmullw %xmm7, %xmm8
+; GFNISSE-NEXT: psrlw $8, %xmm8
+; GFNISSE-NEXT: packuswb %xmm0, %xmm8
+; GFNISSE-NEXT: movdqa %xmm4, %xmm0
+; GFNISSE-NEXT: movdqa %xmm5, %xmm1
+; GFNISSE-NEXT: movdqa %xmm6, %xmm2
+; GFNISSE-NEXT: movdqa %xmm8, %xmm3
+; GFNISSE-NEXT: retq
+;
+; GFNIAVX1-LABEL: constant_fshr_v64i8:
+; GFNIAVX1: # %bb.0:
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT: retq
+;
+; GFNIAVX2-LABEL: constant_fshr_v64i8:
+; GFNIAVX2: # %bb.0:
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX2-NEXT: # ymm5 = mem[0,1,0,1]
+; GFNIAVX2-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT: retq
+;
+; GFNIAVX512VL-LABEL: constant_fshr_v64i8:
+; GFNIAVX512VL: # %bb.0:
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31]
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT: retq
+;
+; GFNIAVX512BW-LABEL: constant_fshr_v64i8:
+; GFNIAVX512BW: # %bb.0:
+; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: retq
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+ ret <64 x i8> %res
+}
+
define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; GFNISSE-LABEL: splatconstant_fshl_v64i8:
; GFNISSE: # %bb.0:
@@ -372,5 +3056,3 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
ret <64 x i8> %res
}
declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFNIAVX: {{.*}}
>From 8b2ba6a144e728ee4116e2804e9b5aed8824e726 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Mon, 22 Apr 2024 09:35:49 -0700
Subject: [PATCH 4/7] Revert "[compiler-rt][ctx_instr] Add `ctx_profile`
component" (#89625)
Reverts llvm/llvm-project#89304
Some build bot failures - will fix and reland.
Example: https://lab.llvm.org/buildbot/#/builders/165/builds/52789
---
compiler-rt/CMakeLists.txt | 2 -
.../cmake/Modules/AllSupportedArchDefs.cmake | 1 -
compiler-rt/cmake/config-ix.cmake | 11 ---
compiler-rt/lib/CMakeLists.txt | 4 --
compiler-rt/lib/ctx_profile/CMakeLists.txt | 28 --------
.../lib/ctx_profile/CtxInstrProfiling.cpp | 40 -----------
.../lib/ctx_profile/CtxInstrProfiling.h | 55 ---------------
.../lib/ctx_profile/tests/CMakeLists.txt | 70 -------------------
.../tests/CtxInstrProfilingTest.cpp | 22 ------
compiler-rt/lib/ctx_profile/tests/driver.cpp | 14 ----
10 files changed, 247 deletions(-)
delete mode 100644 compiler-rt/lib/ctx_profile/CMakeLists.txt
delete mode 100644 compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
delete mode 100644 compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
delete mode 100644 compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
delete mode 100644 compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
delete mode 100644 compiler-rt/lib/ctx_profile/tests/driver.cpp
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 6ce451e3cac2e3..8649507ce1c79b 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -50,8 +50,6 @@ option(COMPILER_RT_BUILD_LIBFUZZER "Build libFuzzer" ON)
mark_as_advanced(COMPILER_RT_BUILD_LIBFUZZER)
option(COMPILER_RT_BUILD_PROFILE "Build profile runtime" ON)
mark_as_advanced(COMPILER_RT_BUILD_PROFILE)
-option(COMPILER_RT_BUILD_CTX_PROFILE "Build ctx profile runtime" ON)
-mark_as_advanced(COMPILER_RT_BUILD_CTX_PROFILE)
option(COMPILER_RT_BUILD_MEMPROF "Build memory profiling runtime" ON)
mark_as_advanced(COMPILER_RT_BUILD_MEMPROF)
option(COMPILER_RT_BUILD_XRAY_NO_PREINIT "Build xray with no preinit patching" OFF)
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index 2fe06273a814c7..423171532c2028 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -66,7 +66,6 @@ set(ALL_MEMPROF_SUPPORTED_ARCH ${X86_64})
set(ALL_PROFILE_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${PPC32} ${PPC64}
${MIPS32} ${MIPS64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
${RISCV32} ${RISCV64} ${LOONGARCH64})
-set(ALL_CTX_PROFILE_SUPPORTED_ARCH ${X86_64})
set(ALL_TSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${PPC64} ${S390X}
${LOONGARCH64} ${RISCV64})
set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index ba740af9e1d60f..b281ac64f5d5c7 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -632,9 +632,6 @@ if(APPLE)
list_intersect(PROFILE_SUPPORTED_ARCH
ALL_PROFILE_SUPPORTED_ARCH
SANITIZER_COMMON_SUPPORTED_ARCH)
- list_intersect(CTX_PROFILE_SUPPORTED_ARCH
- ALL_CTX_PROFILE_SUPPORTED_ARCH
- SANITIZER_COMMON_SUPPORTED_ARCH)
list_intersect(TSAN_SUPPORTED_ARCH
ALL_TSAN_SUPPORTED_ARCH
SANITIZER_COMMON_SUPPORTED_ARCH)
@@ -681,7 +678,6 @@ else()
filter_available_targets(HWASAN_SUPPORTED_ARCH ${ALL_HWASAN_SUPPORTED_ARCH})
filter_available_targets(MEMPROF_SUPPORTED_ARCH ${ALL_MEMPROF_SUPPORTED_ARCH})
filter_available_targets(PROFILE_SUPPORTED_ARCH ${ALL_PROFILE_SUPPORTED_ARCH})
- filter_available_targets(CTX_PROFILE_SUPPORTED_ARCH ${ALL_CTX_PROFILE_SUPPORTED_ARCH})
filter_available_targets(TSAN_SUPPORTED_ARCH ${ALL_TSAN_SUPPORTED_ARCH})
filter_available_targets(UBSAN_SUPPORTED_ARCH ${ALL_UBSAN_SUPPORTED_ARCH})
filter_available_targets(SAFESTACK_SUPPORTED_ARCH
@@ -807,13 +803,6 @@ else()
set(COMPILER_RT_HAS_PROFILE FALSE)
endif()
-if (COMPILER_RT_HAS_SANITIZER_COMMON AND CTX_PROFILE_SUPPORTED_ARCH AND
- OS_NAME MATCHES "Linux")
- set(COMPILER_RT_HAS_CTX_PROFILE TRUE)
-else()
- set(COMPILER_RT_HAS_CTX_PROFILE FALSE)
-endif()
-
if (COMPILER_RT_HAS_SANITIZER_COMMON AND TSAN_SUPPORTED_ARCH)
if (OS_NAME MATCHES "Linux|Darwin|FreeBSD|NetBSD")
set(COMPILER_RT_HAS_TSAN TRUE)
diff --git a/compiler-rt/lib/CMakeLists.txt b/compiler-rt/lib/CMakeLists.txt
index f9e96563b88090..43ba9a102c8487 100644
--- a/compiler-rt/lib/CMakeLists.txt
+++ b/compiler-rt/lib/CMakeLists.txt
@@ -51,10 +51,6 @@ if(COMPILER_RT_BUILD_PROFILE AND COMPILER_RT_HAS_PROFILE)
compiler_rt_build_runtime(profile)
endif()
-if(COMPILER_RT_BUILD_CTX_PROFILE AND COMPILER_RT_HAS_CTX_PROFILE)
- compiler_rt_build_runtime(ctx_profile)
-endif()
-
if(COMPILER_RT_BUILD_XRAY)
compiler_rt_build_runtime(xray)
endif()
diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
deleted file mode 100644
index 621b7d30b76d41..00000000000000
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-add_compiler_rt_component(ctx_profile)
-
-set(CTX_PROFILE_SOURCES
- CtxInstrProfiling.cpp
- )
-
-set(CTX_PROFILE_HEADERS
- CtxInstrProfiling.h
- )
-
-include_directories(..)
-include_directories(../../include)
-
-# We don't use the C++ Standard Library here, so avoid including it by mistake.
-append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
-
-add_compiler_rt_runtime(clang_rt.ctx_profile
- STATIC
- ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
- OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
- CFLAGS ${EXTRA_FLAGS}
- SOURCES ${CTX_PROFILE_SOURCES}
- ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}
- PARENT_TARGET ctx_profile)
-
-if(COMPILER_RT_INCLUDE_TESTS)
- add_subdirectory(tests)
-endif()
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
deleted file mode 100644
index 7620ce92f7ebde..00000000000000
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "CtxInstrProfiling.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_dense_map.h"
-#include "sanitizer_common/sanitizer_mutex.h"
-#include "sanitizer_common/sanitizer_placement_new.h"
-#include "sanitizer_common/sanitizer_thread_safety.h"
-
-#include <assert.h>
-
-using namespace __ctx_profile;
-
-// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
-// the dependency on the latter.
-Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
- assert(!Prev || Prev->Next == nullptr);
- Arena *NewArena =
- new (__sanitizer::InternalAlloc(Size + sizeof(Arena))) Arena(Size);
- if (Prev)
- Prev->Next = NewArena;
- return NewArena;
-}
-
-void Arena::freeArenaList(Arena *&A) {
- assert(A);
- for (auto *I = A; I != nullptr;) {
- auto *Current = I;
- I = I->Next;
- __sanitizer::InternalFree(Current);
- }
- A = nullptr;
-}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
deleted file mode 100644
index c1789c32a64c25..00000000000000
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO ---------===*\
-|*
-|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-|* See https://llvm.org/LICENSE.txt for license information.
-|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-|*
-\*===----------------------------------------------------------------------===*/
-
-#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
-#define CTX_PROFILE_CTXINSTRPROFILING_H_
-
-#include <sanitizer/common_interface_defs.h>
-
-namespace __ctx_profile {
-
-/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
-/// Allocation and de-allocation happen using sanitizer APIs. We make that
-/// explicit.
-class Arena final {
-public:
- // When allocating a new Arena, optionally specify an existing one to append
- // to, assumed to be the last in the Arena list. We only need to support
- // appending to the arena list.
- static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
- static void freeArenaList(Arena *&A);
-
- uint64_t size() const { return Size; }
-
- // Allocate S bytes or return nullptr if we don't have that many available.
- char *tryBumpAllocate(size_t S) {
- if (Pos + S > Size)
- return nullptr;
- Pos += S;
- return start() + (Pos - S);
- }
-
- Arena *next() const { return Next; }
-
- // the beginning of allocatable memory.
- const char *start() const { return const_cast<Arena *>(this)->start(); }
- const char *pos() const { return start() + Pos; }
-
-private:
- explicit Arena(uint32_t Size) : Size(Size) {}
- ~Arena() = delete;
-
- char *start() { return reinterpret_cast<char *>(&this[1]); }
-
- Arena *Next = nullptr;
- uint64_t Pos = 0;
- const uint64_t Size;
-};
-
-} // namespace __ctx_profile
-#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
diff --git a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
deleted file mode 100644
index 93b41b838445d1..00000000000000
--- a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-include(CheckCXXCompilerFlag)
-include(CompilerRTCompile)
-include(CompilerRTLink)
-
-set(CTX_PROFILE_UNITTEST_CFLAGS
- ${COMPILER_RT_UNITTEST_CFLAGS}
- ${COMPILER_RT_GTEST_CFLAGS}
- ${COMPILER_RT_GMOCK_CFLAGS}
- ${SANITIZER_TEST_CXX_CFLAGS}
- -I${COMPILER_RT_SOURCE_DIR}/lib/
- -DSANITIZER_COMMON_NO_REDEFINE_BUILTINS
- -O2
- -g
- -fno-rtti
- -Wno-pedantic
- -fno-omit-frame-pointer)
-
-# Suppress warnings for gmock variadic macros for clang and gcc respectively.
-append_list_if(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG -Wno-gnu-zero-variadic-macro-arguments CTX_PROFILE_UNITTEST_CFLAGS)
-append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros CTX_PROFILE_UNITTEST_CFLAGS)
-
-file(GLOB PROFILE_HEADERS ../*.h)
-
-set(CTX_PROFILE_SOURCES
- ../CtxInstrProfiling.cpp)
-
-set(CTX_PROFILE_UNITTESTS
- CtxInstrProfilingTest.cpp
- driver.cpp)
-
-include_directories(../../../include)
-
-set(CTX_PROFILE_UNIT_TEST_HEADERS
- ${CTX_PROFILE_HEADERS})
-
-set(CTX_PROFILE_UNITTEST_LINK_FLAGS
- ${COMPILER_RT_UNITTEST_LINK_FLAGS})
-
-list(APPEND CTX_PROFILE_UNITTEST_LINK_FLAGS -pthread)
-
-set(CTX_PROFILE_UNITTEST_LINK_LIBRARIES
- ${COMPILER_RT_UNWINDER_LINK_LIBS}
- ${SANITIZER_TEST_CXX_LIBRARIES})
-list(APPEND CTX_PROFILE_UNITTEST_LINK_LIBRARIES "dl")
-
-if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST CTX_PROFILE_SUPPORTED_ARCH)
- # Profile unit tests are only run on the host machine.
- set(arch ${COMPILER_RT_DEFAULT_TARGET_ARCH})
-
- add_executable(CtxProfileUnitTests
- ${CTX_PROFILE_UNITTESTS}
- ${COMPILER_RT_GTEST_SOURCE}
- ${COMPILER_RT_GMOCK_SOURCE}
- ${CTX_PROFILE_SOURCES}
- $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
- $<TARGET_OBJECTS:RTSanitizerCommonCoverage.${arch}>
- $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
- $<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
- $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.${arch}>)
- set_target_compile_flags(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_CFLAGS})
- set_target_link_flags(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_LINK_FLAGS})
- target_link_libraries(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_LINK_LIBRARIES})
-
- if (TARGET cxx-headers OR HAVE_LIBCXX)
- add_dependencies(CtxProfileUnitTests cxx-headers)
- endif()
-
- set_target_properties(CtxProfileUnitTests PROPERTIES
- RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif()
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
deleted file mode 100644
index 44f37d25763206..00000000000000
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "../CtxInstrProfiling.h"
-#include "gtest/gtest.h"
-
-using namespace __ctx_profile;
-
-TEST(ArenaTest, Basic) {
- Arena *A = Arena::allocateNewArena(1024);
- EXPECT_EQ(A->size(), 1024U);
- EXPECT_EQ(A->next(), nullptr);
-
- auto *M1 = A->tryBumpAllocate(1020);
- EXPECT_NE(M1, nullptr);
- auto *M2 = A->tryBumpAllocate(4);
- EXPECT_NE(M2, nullptr);
- EXPECT_EQ(M1 + 1020, M2);
- EXPECT_EQ(A->tryBumpAllocate(1), nullptr);
- Arena *A2 = Arena::allocateNewArena(2024, A);
- EXPECT_EQ(A->next(), A2);
- EXPECT_EQ(A2->next(), nullptr);
- Arena::freeArenaList(A);
- EXPECT_EQ(A, nullptr);
-}
diff --git a/compiler-rt/lib/ctx_profile/tests/driver.cpp b/compiler-rt/lib/ctx_profile/tests/driver.cpp
deleted file mode 100644
index b402cec1126b33..00000000000000
--- a/compiler-rt/lib/ctx_profile/tests/driver.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//===-- driver.cpp ----------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-int main(int argc, char **argv) {
- testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
-}
>From c4c54af569f7c17bc89ae73c3e5c5c4be0a586b9 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <1802579+farzonl at users.noreply.github.com>
Date: Mon, 22 Apr 2024 12:40:21 -0400
Subject: [PATCH 5/7] [SPIRV][HLSL] map lerp to Fmix (#88976)
- `clang/lib/CodeGen/CGBuiltin.cpp` - switch to using
`getLerpIntrinsic()` to abstract backend intrinsic
- `clang/lib/CodeGen/CGHLSLRuntime.h` - add `getLerpIntrinsic()`
- `llvm/include/llvm/IR/IntrinsicsSPIRV.td` - add SPIRV intrinsic for
lerp
- `llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp` - add mapping of
HLSL's lerp to GLSL's Fmix.
resolves #88940
---
clang/lib/CodeGen/CGBuiltin.cpp | 4 +-
clang/lib/CodeGen/CGHLSLRuntime.h | 1 +
.../CodeGenHLSL/builtins/lerp-builtin.hlsl | 8 +-
clang/test/CodeGenHLSL/builtins/lerp.hlsl | 96 ++++++++++++-------
llvm/include/llvm/IR/IntrinsicsSPIRV.td | 2 +
.../Target/SPIRV/SPIRVInstructionSelector.cpp | 26 +++++
.../test/CodeGen/SPIRV/hlsl-intrinsics/all.ll | 76 +++++++--------
.../test/CodeGen/SPIRV/hlsl-intrinsics/any.ll | 76 +++++++--------
.../CodeGen/SPIRV/hlsl-intrinsics/lerp.ll | 56 +++++++++++
.../test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll | 66 ++++++-------
10 files changed, 260 insertions(+), 151 deletions(-)
create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index afe2de5d00ac5d..7e5f2edfc732cc 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18267,8 +18267,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
if (!E->getArg(0)->getType()->hasFloatingRepresentation())
llvm_unreachable("lerp operand must have a float representation");
return Builder.CreateIntrinsic(
- /*ReturnType=*/X->getType(), Intrinsic::dx_lerp,
- ArrayRef<Value *>{X, Y, S}, nullptr, "dx.lerp");
+ /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getLerpIntrinsic(),
+ ArrayRef<Value *>{X, Y, S}, nullptr, "hlsl.lerp");
}
case Builtin::BI__builtin_hlsl_elementwise_frac: {
Value *Op0 = EmitScalarExpr(E->getArg(0));
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 506b364f5b2ec7..0abe39dedcb96f 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -74,6 +74,7 @@ class CGHLSLRuntime {
GENERATE_HLSL_INTRINSIC_FUNCTION(All, all)
GENERATE_HLSL_INTRINSIC_FUNCTION(Any, any)
+ GENERATE_HLSL_INTRINSIC_FUNCTION(Lerp, lerp)
GENERATE_HLSL_INTRINSIC_FUNCTION(ThreadId, thread_id)
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
index 2fd5a19fc33521..cdc9abbd70e40b 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
@@ -1,15 +1,15 @@
// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
// CHECK-LABEL: builtin_lerp_half_vector
-// CHECK: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
-// CHECK: ret <3 x half> %dx.lerp
+// CHECK: %hlsl.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// CHECK: ret <3 x half> %hlsl.lerp
half3 builtin_lerp_half_vector (half3 p0) {
return __builtin_hlsl_lerp ( p0, p0, p0 );
}
// CHECK-LABEL: builtin_lerp_floar_vector
-// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %dx.lerp
+// CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
float2 builtin_lerp_floar_vector ( float2 p0) {
return __builtin_hlsl_lerp ( p0, p0, p0 );
}
diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
index 49cd04a10115ae..634b20be3a28d6 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
@@ -1,69 +1,92 @@
// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \
// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN: --check-prefixes=CHECK,NATIVE_HALF
+// RUN: --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+// RUN: -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN: --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_NO_HALF,SPIR_CHECK
-// NATIVE_HALF: %dx.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
-// NATIVE_HALF: ret half %dx.lerp
-// NO_HALF: %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
-// NO_HALF: ret float %dx.lerp
+// DXIL_NATIVE_HALF: %hlsl.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call half @llvm.spv.lerp.f16(half %0, half %1, half %2)
+// NATIVE_HALF: ret half %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
+// SPIR_NO_HALF: %hlsl.lerp = call float @llvm.spv.lerp.f32(float %0, float %1, float %2)
+// NO_HALF: ret float %hlsl.lerp
half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
-// NATIVE_HALF: %dx.lerp = call <2 x half> @llvm.dx.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
-// NATIVE_HALF: ret <2 x half> %dx.lerp
-// NO_HALF: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// NO_HALF: ret <2 x float> %dx.lerp
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <2 x half> @llvm.dx.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <2 x half> @llvm.spv.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+// NATIVE_HALF: ret <2 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// NO_HALF: ret <2 x float> %hlsl.lerp
half2 test_lerp_half2(half2 p0) { return lerp(p0, p0, p0); }
-// NATIVE_HALF: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
-// NATIVE_HALF: ret <3 x half> %dx.lerp
-// NO_HALF: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// NO_HALF: ret <3 x float> %dx.lerp
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <3 x half> @llvm.spv.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// NATIVE_HALF: ret <3 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// NO_HALF: ret <3 x float> %hlsl.lerp
half3 test_lerp_half3(half3 p0) { return lerp(p0, p0, p0); }
-// NATIVE_HALF: %dx.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
-// NATIVE_HALF: ret <4 x half> %dx.lerp
-// NO_HALF: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// NO_HALF: ret <4 x float> %dx.lerp
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <4 x half> @llvm.spv.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
+// NATIVE_HALF: ret <4 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// NO_HALF: ret <4 x float> %hlsl.lerp
half4 test_lerp_half4(half4 p0) { return lerp(p0, p0, p0); }
-// CHECK: %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
-// CHECK: ret float %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
+// SPIR_CHECK: %hlsl.lerp = call float @llvm.spv.lerp.f32(float %0, float %1, float %2)
+// CHECK: ret float %hlsl.lerp
float test_lerp_float(float p0) { return lerp(p0, p0, p0); }
-// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
float2 test_lerp_float2(float2 p0) { return lerp(p0, p0, p0); }
-// CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// CHECK: ret <3 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %hlsl.lerp
float3 test_lerp_float3(float3 p0) { return lerp(p0, p0, p0); }
-// CHECK: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// CHECK: ret <4 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// CHECK: ret <4 x float> %hlsl.lerp
float4 test_lerp_float4(float4 p0) { return lerp(p0, p0, p0); }
-// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
float2 test_lerp_float2_splat(float p0, float2 p1) { return lerp(p0, p1, p1); }
-// CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
-// CHECK: ret <3 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %hlsl.lerp
float3 test_lerp_float3_splat(float p0, float3 p1) { return lerp(p0, p1, p1); }
-// CHECK: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
-// CHECK: ret <4 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
+// CHECK: ret <4 x float> %hlsl.lerp
float4 test_lerp_float4_splat(float p0, float4 p1) { return lerp(p0, p1, p1); }
// CHECK: %conv = sitofp i32 %2 to float
// CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
// CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
-// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
-// CHECK: ret <2 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
+// CHECK: ret <2 x float> %hlsl.lerp
float2 test_lerp_float2_int_splat(float2 p0, int p1) {
return lerp(p0, p0, p1);
}
@@ -71,8 +94,9 @@ float2 test_lerp_float2_int_splat(float2 p0, int p1) {
// CHECK: %conv = sitofp i32 %2 to float
// CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0
// CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
-// CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
-// CHECK: ret <3 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
+// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
+// CHECK: ret <3 x float> %hlsl.lerp
float3 test_lerp_float3_int_splat(float3 p0, int p1) {
return lerp(p0, p0, p1);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index b6618baceb5608..8660782d71d950 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -58,4 +58,6 @@ let TargetPrefix = "spv" in {
Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>;
def int_spv_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>;
def int_spv_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>;
+ def int_spv_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>],
+ [IntrNoMem, IntrWillReturn] >;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 72e5a7bcac9834..21a69fc3ad9b44 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -170,6 +170,9 @@ class SPIRVInstructionSelector : public InstructionSelector {
bool selectFCmp(Register ResVReg, const SPIRVType *ResType,
MachineInstr &I) const;
+ bool selectFmix(Register ResVReg, const SPIRVType *ResType,
+ MachineInstr &I) const;
+
void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
int OpIdx) const;
void renderFImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -1242,6 +1245,27 @@ bool SPIRVInstructionSelector::selectAny(Register ResVReg,
return selectAnyOrAll(ResVReg, ResType, I, SPIRV::OpAny);
}
+bool SPIRVInstructionSelector::selectFmix(Register ResVReg,
+ const SPIRVType *ResType,
+ MachineInstr &I) const {
+
+ assert(I.getNumOperands() == 5);
+ assert(I.getOperand(2).isReg());
+ assert(I.getOperand(3).isReg());
+ assert(I.getOperand(4).isReg());
+ MachineBasicBlock &BB = *I.getParent();
+
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450))
+ .addImm(GL::FMix)
+ .addUse(I.getOperand(2).getReg())
+ .addUse(I.getOperand(3).getReg())
+ .addUse(I.getOperand(4).getReg())
+ .constrainAllUses(TII, TRI, RBI);
+}
+
bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
@@ -1902,6 +1926,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
return selectAll(ResVReg, ResType, I);
case Intrinsic::spv_any:
return selectAny(ResVReg, ResType, I);
+ case Intrinsic::spv_lerp:
+ return selectFmix(ResVReg, ResType, I);
case Intrinsic::spv_lifetime_start:
case Intrinsic::spv_lifetime_end: {
unsigned Op = IID == Intrinsic::spv_lifetime_start ? SPIRV::OpLifetimeStart
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll
index ef8d463cbd815e..8c5410aa54a433 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll
@@ -26,32 +26,32 @@
; CHECK-HLSL-DAG: %[[#const_i32_0:]] = OpConstant %[[#int_32]] 0
; CHECK-HLSL-DAG: %[[#const_i16_0:]] = OpConstant %[[#int_16]] 0
; CHECK-HLSL-DAG: %[[#const_f64_0:]] = OpConstant %[[#float_64]] 0
-; CHECK-HLSL-DAG: %[[#const_f32_0:]] = OpConstant %[[#float_32:]] 0
-; CHECK-HLSL-DAG: %[[#const_f16_0:]] = OpConstant %[[#float_16:]] 0
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantComposite %[[#vec4_16:]] %[[#const_i16_0:]] %[[#const_i16_0:]] %[[#const_i16_0:]] %[[#const_i16_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantComposite %[[#vec4_32:]] %[[#const_i32_0:]] %[[#const_i32_0:]] %[[#const_i32_0:]] %[[#const_i32_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantComposite %[[#vec4_64:]] %[[#const_i64_0:]] %[[#const_i64_0:]] %[[#const_i64_0:]] %[[#const_i64_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantComposite %[[#vec4_float_16:]] %[[#const_f16_0:]] %[[#const_f16_0:]] %[[#const_f16_0:]] %[[#const_f16_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantComposite %[[#vec4_float_32:]] %[[#const_f32_0:]] %[[#const_f32_0:]] %[[#const_f32_0:]] %[[#const_f32_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantComposite %[[#vec4_float_64:]] %[[#const_f64_0:]] %[[#const_f64_0:]] %[[#const_f64_0:]] %[[#const_f64_0:]]
+; CHECK-HLSL-DAG: %[[#const_f32_0:]] = OpConstant %[[#float_32]] 0
+; CHECK-HLSL-DAG: %[[#const_f16_0:]] = OpConstant %[[#float_16]] 0
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantComposite %[[#vec4_16]] %[[#const_i16_0]] %[[#const_i16_0]] %[[#const_i16_0]] %[[#const_i16_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantComposite %[[#vec4_32]] %[[#const_i32_0]] %[[#const_i32_0]] %[[#const_i32_0]] %[[#const_i32_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantComposite %[[#vec4_64]] %[[#const_i64_0]] %[[#const_i64_0]] %[[#const_i64_0]] %[[#const_i64_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantComposite %[[#vec4_float_16]] %[[#const_f16_0]] %[[#const_f16_0]] %[[#const_f16_0]] %[[#const_f16_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantComposite %[[#vec4_float_32]] %[[#const_f32_0]] %[[#const_f32_0]] %[[#const_f32_0]] %[[#const_f32_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantComposite %[[#vec4_float_64]] %[[#const_f64_0]] %[[#const_f64_0]] %[[#const_f64_0]] %[[#const_f64_0]]
; CHECK-OCL-DAG: %[[#const_i64_0:]] = OpConstantNull %[[#int_64]]
; CHECK-OCL-DAG: %[[#const_i32_0:]] = OpConstantNull %[[#int_32]]
; CHECK-OCL-DAG: %[[#const_i16_0:]] = OpConstantNull %[[#int_16]]
; CHECK-OCL-DAG: %[[#const_f64_0:]] = OpConstantNull %[[#float_64]]
-; CHECK-OCL-DAG: %[[#const_f32_0:]] = OpConstantNull %[[#float_32:]]
-; CHECK-OCL-DAG: %[[#const_f16_0:]] = OpConstantNull %[[#float_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantNull %[[#vec4_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantNull %[[#vec4_32:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantNull %[[#vec4_64:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantNull %[[#vec4_float_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantNull %[[#vec4_float_32:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantNull %[[#vec4_float_64:]]
+; CHECK-OCL-DAG: %[[#const_f32_0:]] = OpConstantNull %[[#float_32]]
+; CHECK-OCL-DAG: %[[#const_f16_0:]] = OpConstantNull %[[#float_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantNull %[[#vec4_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantNull %[[#vec4_32]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantNull %[[#vec4_64]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantNull %[[#vec4_float_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantNull %[[#vec4_float_32]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantNull %[[#vec4_float_64]]
define noundef i1 @all_int64_t(i64 noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i64_0:]]
+ ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i64_0]]
%hlsl.all = call i1 @llvm.spv.all.i64(i64 %p0)
ret i1 %hlsl.all
}
@@ -60,7 +60,7 @@ entry:
define noundef i1 @all_int(i32 noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i32_0:]]
+ ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i32_0]]
%hlsl.all = call i1 @llvm.spv.all.i32(i32 %p0)
ret i1 %hlsl.all
}
@@ -69,7 +69,7 @@ entry:
define noundef i1 @all_int16_t(i16 noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i16_0:]]
+ ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i16_0]]
%hlsl.all = call i1 @llvm.spv.all.i16(i16 %p0)
ret i1 %hlsl.all
}
@@ -77,7 +77,7 @@ entry:
define noundef i1 @all_double(double noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f64_0:]]
+ ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f64_0]]
%hlsl.all = call i1 @llvm.spv.all.f64(double %p0)
ret i1 %hlsl.all
}
@@ -86,7 +86,7 @@ entry:
define noundef i1 @all_float(float noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f32_0:]]
+ ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f32_0]]
%hlsl.all = call i1 @llvm.spv.all.f32(float %p0)
ret i1 %hlsl.all
}
@@ -95,7 +95,7 @@ entry:
define noundef i1 @all_half(half noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f16_0:]]
+ ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f16_0]]
%hlsl.all = call i1 @llvm.spv.all.f16(half %p0)
ret i1 %hlsl.all
}
@@ -103,8 +103,8 @@ entry:
define noundef i1 @all_bool4(<4 x i1> noundef %p0) {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpAll %[[#vec4_bool:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_bool]]
+ ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#arg0]]
%hlsl.all = call i1 @llvm.spv.all.v4i1(<4 x i1> %p0)
ret i1 %hlsl.all
}
@@ -112,8 +112,8 @@ entry:
define noundef i1 @all_short4(<4 x i16> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#shortVecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i16:]]
- ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#shortVecNotEq:]]
+ ; CHECK: %[[#shortVecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i16]]
+ ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#shortVecNotEq]]
%hlsl.all = call i1 @llvm.spv.all.v4i16(<4 x i16> %p0)
ret i1 %hlsl.all
}
@@ -121,8 +121,8 @@ entry:
define noundef i1 @all_int4(<4 x i32> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#i32VecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i32:]]
- ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#i32VecNotEq:]]
+ ; CHECK: %[[#i32VecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i32]]
+ ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#i32VecNotEq]]
%hlsl.all = call i1 @llvm.spv.all.v4i32(<4 x i32> %p0)
ret i1 %hlsl.all
}
@@ -130,8 +130,8 @@ entry:
define noundef i1 @all_int64_t4(<4 x i64> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#i64VecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i64:]]
- ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#i64VecNotEq]]
+ ; CHECK: %[[#i64VecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i64]]
+ ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#i64VecNotEq]]
%hlsl.all = call i1 @llvm.spv.all.v4i64(<4 x i64> %p0)
ret i1 %hlsl.all
}
@@ -139,8 +139,8 @@ entry:
define noundef i1 @all_half4(<4 x half> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#f16VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f16:]]
- ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#f16VecNotEq:]]
+ ; CHECK: %[[#f16VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f16]]
+ ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#f16VecNotEq]]
%hlsl.all = call i1 @llvm.spv.all.v4f16(<4 x half> %p0)
ret i1 %hlsl.all
}
@@ -148,8 +148,8 @@ entry:
define noundef i1 @all_float4(<4 x float> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#f32VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f32:]]
- ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#f32VecNotEq:]]
+ ; CHECK: %[[#f32VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f32]]
+ ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#f32VecNotEq]]
%hlsl.all = call i1 @llvm.spv.all.v4f32(<4 x float> %p0)
ret i1 %hlsl.all
}
@@ -157,16 +157,16 @@ entry:
define noundef i1 @all_double4(<4 x double> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#f64VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f64:]]
- ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#f64VecNotEq:]]
+ ; CHECK: %[[#f64VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f64]]
+ ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#f64VecNotEq]]
%hlsl.all = call i1 @llvm.spv.all.v4f64(<4 x double> %p0)
ret i1 %hlsl.all
}
define noundef i1 @all_bool(i1 noundef %a) {
entry:
- ; CHECK: %[[#all_bool_arg:]] = OpFunctionParameter %[[#bool:]]
- ; CHECK: OpReturnValue %[[#all_bool_arg:]]
+ ; CHECK: %[[#all_bool_arg:]] = OpFunctionParameter %[[#bool]]
+ ; CHECK: OpReturnValue %[[#all_bool_arg]]
%hlsl.all = call i1 @llvm.spv.all.i1(i1 %a)
ret i1 %hlsl.all
}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll
index b1dd388f5c6e36..7a74a335a659d4 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll
@@ -26,32 +26,32 @@
; CHECK-HLSL-DAG: %[[#const_i32_0:]] = OpConstant %[[#int_32]] 0
; CHECK-HLSL-DAG: %[[#const_i16_0:]] = OpConstant %[[#int_16]] 0
; CHECK-HLSL-DAG: %[[#const_f64_0:]] = OpConstant %[[#float_64]] 0
-; CHECK-HLSL-DAG: %[[#const_f32_0:]] = OpConstant %[[#float_32:]] 0
-; CHECK-HLSL-DAG: %[[#const_f16_0:]] = OpConstant %[[#float_16:]] 0
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantComposite %[[#vec4_16:]] %[[#const_i16_0:]] %[[#const_i16_0:]] %[[#const_i16_0:]] %[[#const_i16_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantComposite %[[#vec4_32:]] %[[#const_i32_0:]] %[[#const_i32_0:]] %[[#const_i32_0:]] %[[#const_i32_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantComposite %[[#vec4_64:]] %[[#const_i64_0:]] %[[#const_i64_0:]] %[[#const_i64_0:]] %[[#const_i64_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantComposite %[[#vec4_float_16:]] %[[#const_f16_0:]] %[[#const_f16_0:]] %[[#const_f16_0:]] %[[#const_f16_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantComposite %[[#vec4_float_32:]] %[[#const_f32_0:]] %[[#const_f32_0:]] %[[#const_f32_0:]] %[[#const_f32_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantComposite %[[#vec4_float_64:]] %[[#const_f64_0:]] %[[#const_f64_0:]] %[[#const_f64_0:]] %[[#const_f64_0:]]
+; CHECK-HLSL-DAG: %[[#const_f32_0:]] = OpConstant %[[#float_32]] 0
+; CHECK-HLSL-DAG: %[[#const_f16_0:]] = OpConstant %[[#float_16]] 0
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantComposite %[[#vec4_16]] %[[#const_i16_0]] %[[#const_i16_0]] %[[#const_i16_0]] %[[#const_i16_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantComposite %[[#vec4_32]] %[[#const_i32_0]] %[[#const_i32_0]] %[[#const_i32_0]] %[[#const_i32_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantComposite %[[#vec4_64]] %[[#const_i64_0]] %[[#const_i64_0]] %[[#const_i64_0]] %[[#const_i64_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantComposite %[[#vec4_float_16]] %[[#const_f16_0]] %[[#const_f16_0]] %[[#const_f16_0]] %[[#const_f16_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantComposite %[[#vec4_float_32]] %[[#const_f32_0]] %[[#const_f32_0]] %[[#const_f32_0]] %[[#const_f32_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantComposite %[[#vec4_float_64]] %[[#const_f64_0]] %[[#const_f64_0]] %[[#const_f64_0]] %[[#const_f64_0]]
; CHECK-OCL-DAG: %[[#const_i64_0:]] = OpConstantNull %[[#int_64]]
; CHECK-OCL-DAG: %[[#const_i32_0:]] = OpConstantNull %[[#int_32]]
; CHECK-OCL-DAG: %[[#const_i16_0:]] = OpConstantNull %[[#int_16]]
; CHECK-OCL-DAG: %[[#const_f64_0:]] = OpConstantNull %[[#float_64]]
-; CHECK-OCL-DAG: %[[#const_f32_0:]] = OpConstantNull %[[#float_32:]]
-; CHECK-OCL-DAG: %[[#const_f16_0:]] = OpConstantNull %[[#float_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantNull %[[#vec4_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantNull %[[#vec4_32:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantNull %[[#vec4_64:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantNull %[[#vec4_float_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantNull %[[#vec4_float_32:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantNull %[[#vec4_float_64:]]
+; CHECK-OCL-DAG: %[[#const_f32_0:]] = OpConstantNull %[[#float_32]]
+; CHECK-OCL-DAG: %[[#const_f16_0:]] = OpConstantNull %[[#float_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantNull %[[#vec4_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantNull %[[#vec4_32]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantNull %[[#vec4_64]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantNull %[[#vec4_float_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantNull %[[#vec4_float_32]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantNull %[[#vec4_float_64]]
define noundef i1 @any_int64_t(i64 noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i64_0:]]
+ ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i64_0]]
%hlsl.any = call i1 @llvm.spv.any.i64(i64 %p0)
ret i1 %hlsl.any
}
@@ -60,7 +60,7 @@ entry:
define noundef i1 @any_int(i32 noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i32_0:]]
+ ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i32_0]]
%hlsl.any = call i1 @llvm.spv.any.i32(i32 %p0)
ret i1 %hlsl.any
}
@@ -69,7 +69,7 @@ entry:
define noundef i1 @any_int16_t(i16 noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i16_0:]]
+ ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i16_0]]
%hlsl.any = call i1 @llvm.spv.any.i16(i16 %p0)
ret i1 %hlsl.any
}
@@ -77,7 +77,7 @@ entry:
define noundef i1 @any_double(double noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f64_0:]]
+ ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f64_0]]
%hlsl.any = call i1 @llvm.spv.any.f64(double %p0)
ret i1 %hlsl.any
}
@@ -86,7 +86,7 @@ entry:
define noundef i1 @any_float(float noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f32_0:]]
+ ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f32_0]]
%hlsl.any = call i1 @llvm.spv.any.f32(float %p0)
ret i1 %hlsl.any
}
@@ -95,7 +95,7 @@ entry:
define noundef i1 @any_half(half noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f16_0:]]
+ ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f16_0]]
%hlsl.any = call i1 @llvm.spv.any.f16(half %p0)
ret i1 %hlsl.any
}
@@ -103,8 +103,8 @@ entry:
define noundef i1 @any_bool4(<4 x i1> noundef %p0) {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#]] = OpAny %[[#vec4_bool:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_bool]]
+ ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#arg0]]
%hlsl.any = call i1 @llvm.spv.any.v4i1(<4 x i1> %p0)
ret i1 %hlsl.any
}
@@ -112,8 +112,8 @@ entry:
define noundef i1 @any_short4(<4 x i16> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#shortVecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i16:]]
- ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#shortVecNotEq:]]
+ ; CHECK: %[[#shortVecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i16]]
+ ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#shortVecNotEq]]
%hlsl.any = call i1 @llvm.spv.any.v4i16(<4 x i16> %p0)
ret i1 %hlsl.any
}
@@ -121,8 +121,8 @@ entry:
define noundef i1 @any_int4(<4 x i32> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#i32VecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i32:]]
- ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#i32VecNotEq:]]
+ ; CHECK: %[[#i32VecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i32]]
+ ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#i32VecNotEq]]
%hlsl.any = call i1 @llvm.spv.any.v4i32(<4 x i32> %p0)
ret i1 %hlsl.any
}
@@ -130,8 +130,8 @@ entry:
define noundef i1 @any_int64_t4(<4 x i64> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#i64VecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i64:]]
- ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#i64VecNotEq]]
+ ; CHECK: %[[#i64VecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i64]]
+ ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#i64VecNotEq]]
%hlsl.any = call i1 @llvm.spv.any.v4i64(<4 x i64> %p0)
ret i1 %hlsl.any
}
@@ -139,8 +139,8 @@ entry:
define noundef i1 @any_half4(<4 x half> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#f16VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f16:]]
- ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#f16VecNotEq:]]
+ ; CHECK: %[[#f16VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f16]]
+ ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#f16VecNotEq]]
%hlsl.any = call i1 @llvm.spv.any.v4f16(<4 x half> %p0)
ret i1 %hlsl.any
}
@@ -148,8 +148,8 @@ entry:
define noundef i1 @any_float4(<4 x float> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#f32VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f32:]]
- ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#f32VecNotEq:]]
+ ; CHECK: %[[#f32VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f32]]
+ ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#f32VecNotEq]]
%hlsl.any = call i1 @llvm.spv.any.v4f32(<4 x float> %p0)
ret i1 %hlsl.any
}
@@ -157,16 +157,16 @@ entry:
define noundef i1 @any_double4(<4 x double> noundef %p0) {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
- ; CHECK: %[[#f64VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f64:]]
- ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#f64VecNotEq:]]
+ ; CHECK: %[[#f64VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f64]]
+ ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#f64VecNotEq]]
%hlsl.any = call i1 @llvm.spv.any.v4f64(<4 x double> %p0)
ret i1 %hlsl.any
}
define noundef i1 @any_bool(i1 noundef %a) {
entry:
- ; CHECK: %[[#any_bool_arg:]] = OpFunctionParameter %[[#bool:]]
- ; CHECK: OpReturnValue %[[#any_bool_arg:]]
+ ; CHECK: %[[#any_bool_arg:]] = OpFunctionParameter %[[#bool]]
+ ; CHECK: OpReturnValue %[[#any_bool_arg]]
%hlsl.any = call i1 @llvm.spv.any.i1(i1 %a)
ret i1 %hlsl.any
}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll
new file mode 100644
index 00000000000000..63547820c18c77
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll
@@ -0,0 +1,56 @@
+; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Make sure SPIRV operation function calls for lerp are generated as FMix
+
+; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+
+define noundef half @lerp_half(half noundef %a, half noundef %b, half noundef %c) {
+entry:
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] FMix %[[#arg0]] %[[#arg1]] %[[#arg2]]
+ %hlsl.lerp = call half @llvm.spv.lerp.f16(half %a, half %b, half %c)
+ ret half %hlsl.lerp
+}
+
+
+define noundef float @lerp_float(float noundef %a, float noundef %b, float noundef %c) {
+entry:
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] FMix %[[#arg0]] %[[#arg1]] %[[#arg2]]
+ %hlsl.lerp = call float @llvm.spv.lerp.f32(float %a, float %b, float %c)
+ ret float %hlsl.lerp
+}
+
+define noundef <4 x half> @lerp_half4(<4 x half> noundef %a, <4 x half> noundef %b, <4 x half> noundef %c) {
+entry:
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] FMix %[[#arg0]] %[[#arg1]] %[[#arg2]]
+ %hlsl.lerp = call <4 x half> @llvm.spv.lerp.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
+ ret <4 x half> %hlsl.lerp
+}
+
+define noundef <4 x float> @lerp_float4(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c) {
+entry:
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]]
+ ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] FMix %[[#arg0]] %[[#arg1]] %[[#arg2]]
+ %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+ ret <4 x float> %hlsl.lerp
+}
+
+declare half @llvm.spv.lerp.f16(half, half, half)
+declare float @llvm.spv.lerp.f32(float, float, float)
+declare <4 x half> @llvm.spv.lerp.v4f16(<4 x half>, <4 x half>, <4 x half>)
+declare <4 x float> @llvm.spv.lerp.v4f32(<4 x float>, <4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll
index 95962c0fdc9695..34f3c610ca81da 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll
@@ -13,90 +13,90 @@
; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
; CHECK-DAG: %[[#vec4_float_64:]] = OpTypeVector %[[#float_64]] 4
; CHECK-DAG: %[[#const_f64_1:]] = OpConstant %[[#float_64]] 1
-; CHECK-DAG: %[[#const_f32_1:]] = OpConstant %[[#float_32:]] 1
-; CHECK-DAG: %[[#const_f16_1:]] = OpConstant %[[#float_16:]] 1
+; CHECK-DAG: %[[#const_f32_1:]] = OpConstant %[[#float_32]] 1
+; CHECK-DAG: %[[#const_f16_1:]] = OpConstant %[[#float_16]] 1
-; CHECK-DAG: %[[#vec2_const_ones_f16:]] = OpConstantComposite %[[#vec2_float_16:]] %[[#const_f16_1:]] %[[#const_f16_1:]]
-; CHECK-DAG: %[[#vec3_const_ones_f16:]] = OpConstantComposite %[[#vec3_float_16:]] %[[#const_f16_1:]] %[[#const_f16_1:]] %[[#const_f16_1:]]
-; CHECK-DAG: %[[#vec4_const_ones_f16:]] = OpConstantComposite %[[#vec4_float_16:]] %[[#const_f16_1:]] %[[#const_f16_1:]] %[[#const_f16_1:]] %[[#const_f16_1:]]
+; CHECK-DAG: %[[#vec2_const_ones_f16:]] = OpConstantComposite %[[#vec2_float_16]] %[[#const_f16_1]] %[[#const_f16_1]]
+; CHECK-DAG: %[[#vec3_const_ones_f16:]] = OpConstantComposite %[[#vec3_float_16]] %[[#const_f16_1]] %[[#const_f16_1]] %[[#const_f16_1]]
+; CHECK-DAG: %[[#vec4_const_ones_f16:]] = OpConstantComposite %[[#vec4_float_16]] %[[#const_f16_1]] %[[#const_f16_1]] %[[#const_f16_1]] %[[#const_f16_1]]
-; CHECK-DAG: %[[#vec2_const_ones_f32:]] = OpConstantComposite %[[#vec2_float_32:]] %[[#const_f32_1:]] %[[#const_f32_1:]]
-; CHECK-DAG: %[[#vec3_const_ones_f32:]] = OpConstantComposite %[[#vec3_float_32:]] %[[#const_f32_1:]] %[[#const_f32_1:]] %[[#const_f32_1:]]
-; CHECK-DAG: %[[#vec4_const_ones_f32:]] = OpConstantComposite %[[#vec4_float_32:]] %[[#const_f32_1:]] %[[#const_f32_1:]] %[[#const_f32_1:]] %[[#const_f32_1:]]
+; CHECK-DAG: %[[#vec2_const_ones_f32:]] = OpConstantComposite %[[#vec2_float_32]] %[[#const_f32_1]] %[[#const_f32_1]]
+; CHECK-DAG: %[[#vec3_const_ones_f32:]] = OpConstantComposite %[[#vec3_float_32]] %[[#const_f32_1]] %[[#const_f32_1]] %[[#const_f32_1]]
+; CHECK-DAG: %[[#vec4_const_ones_f32:]] = OpConstantComposite %[[#vec4_float_32]] %[[#const_f32_1]] %[[#const_f32_1]] %[[#const_f32_1]] %[[#const_f32_1]]
-; CHECK-DAG: %[[#vec2_const_ones_f64:]] = OpConstantComposite %[[#vec2_float_64:]] %[[#const_f64_1:]] %[[#const_f64_1:]]
-; CHECK-DAG: %[[#vec3_const_ones_f64:]] = OpConstantComposite %[[#vec3_float_64:]] %[[#const_f64_1:]] %[[#const_f64_1:]] %[[#const_f64_1:]]
-; CHECK-DAG: %[[#vec4_const_ones_f64:]] = OpConstantComposite %[[#vec4_float_64:]] %[[#const_f64_1:]] %[[#const_f64_1:]] %[[#const_f64_1:]] %[[#const_f64_1:]]
+; CHECK-DAG: %[[#vec2_const_ones_f64:]] = OpConstantComposite %[[#vec2_float_64]] %[[#const_f64_1]] %[[#const_f64_1]]
+; CHECK-DAG: %[[#vec3_const_ones_f64:]] = OpConstantComposite %[[#vec3_float_64]] %[[#const_f64_1]] %[[#const_f64_1]] %[[#const_f64_1]]
+; CHECK-DAG: %[[#vec4_const_ones_f64:]] = OpConstantComposite %[[#vec4_float_64]] %[[#const_f64_1]] %[[#const_f64_1]] %[[#const_f64_1]] %[[#const_f64_1]]
define spir_func noundef half @test_rcp_half(half noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_16:]]
- ; CHECK: OpFDiv %[[#float_16:]] %[[#const_f16_1:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_16]]
+ ; CHECK: OpFDiv %[[#float_16]] %[[#const_f16_1]] %[[#arg0]]
%hlsl.rcp = fdiv half 0xH3C00, %p0
ret half %hlsl.rcp
}
define spir_func noundef <2 x half> @test_rcp_half2(<2 x half> noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_16:]]
- ; CHECK: OpFDiv %[[#vec2_float_16:]] %[[#vec2_const_ones_f16:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_16]]
+ ; CHECK: OpFDiv %[[#vec2_float_16]] %[[#vec2_const_ones_f16]] %[[#arg0]]
%hlsl.rcp = fdiv <2 x half> <half 0xH3C00, half 0xH3C00>, %p0
ret <2 x half> %hlsl.rcp
}
define spir_func noundef <3 x half> @test_rcp_half3(<3 x half> noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16:]]
- ; CHECK: OpFDiv %[[#vec3_float_16:]] %[[#vec3_const_ones_f16:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]]
+ ; CHECK: OpFDiv %[[#vec3_float_16]] %[[#vec3_const_ones_f16]] %[[#arg0]]
%hlsl.rcp = fdiv <3 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00>, %p0
ret <3 x half> %hlsl.rcp
}
define spir_func noundef <4 x half> @test_rcp_half4(<4 x half> noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16:]]
- ; CHECK: OpFDiv %[[#vec4_float_16:]] %[[#vec4_const_ones_f16:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
+ ; CHECK: OpFDiv %[[#vec4_float_16]] %[[#vec4_const_ones_f16]] %[[#arg0]]
%hlsl.rcp = fdiv <4 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, %p0
ret <4 x half> %hlsl.rcp
}
define spir_func noundef float @test_rcp_float(float noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_32:]]
- ; CHECK: OpFDiv %[[#float_32:]] %[[#const_f32_1:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_32]]
+ ; CHECK: OpFDiv %[[#float_32]] %[[#const_f32_1]] %[[#arg0]]
%hlsl.rcp = fdiv float 1.000000e+00, %p0
ret float %hlsl.rcp
}
define spir_func noundef <2 x float> @test_rcp_float2(<2 x float> noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_32:]]
- ; CHECK: OpFDiv %[[#vec2_float_32:]] %[[#vec2_const_ones_f32:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_32]]
+ ; CHECK: OpFDiv %[[#vec2_float_32]] %[[#vec2_const_ones_f32]] %[[#arg0]]
%hlsl.rcp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %p0
ret <2 x float> %hlsl.rcp
}
define spir_func noundef <3 x float> @test_rcp_float3(<3 x float> noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32:]]
- ; CHECK: OpFDiv %[[#vec3_float_32:]] %[[#vec3_const_ones_f32:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]]
+ ; CHECK: OpFDiv %[[#vec3_float_32]] %[[#vec3_const_ones_f32]] %[[#arg0]]
%hlsl.rcp = fdiv <3 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %p0
ret <3 x float> %hlsl.rcp
}
define spir_func noundef <4 x float> @test_rcp_float4(<4 x float> noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32:]]
- ; CHECK: OpFDiv %[[#vec4_float_32:]] %[[#vec4_const_ones_f32:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
+ ; CHECK: OpFDiv %[[#vec4_float_32]] %[[#vec4_const_ones_f32]] %[[#arg0]]
%hlsl.rcp = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %p0
ret <4 x float> %hlsl.rcp
}
define spir_func noundef double @test_rcp_double(double noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_64:]]
- ; CHECK: OpFDiv %[[#float_64:]] %[[#const_f64_1:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_64]]
+ ; CHECK: OpFDiv %[[#float_64]] %[[#const_f64_1]] %[[#arg0]]
%hlsl.rcp = fdiv double 1.000000e+00, %p0
ret double %hlsl.rcp
}
@@ -104,7 +104,7 @@ entry:
define spir_func noundef <2 x double> @test_rcp_double2(<2 x double> noundef %p0) #0 {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_64:]]
- ; CHECK: OpFDiv %[[#vec2_float_64:]] %[[#vec2_const_ones_f64:]] %[[#arg0:]]
+ ; CHECK: OpFDiv %[[#vec2_float_64]] %[[#vec2_const_ones_f64]] %[[#arg0]]
%hlsl.rcp = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %p0
ret <2 x double> %hlsl.rcp
}
@@ -112,15 +112,15 @@ entry:
define spir_func noundef <3 x double> @test_rcp_double3(<3 x double> noundef %p0) #0 {
entry:
; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_64:]]
- ; CHECK: OpFDiv %[[#vec3_float_64:]] %[[#vec3_const_ones_f64:]] %[[#arg0:]]
+ ; CHECK: OpFDiv %[[#vec3_float_64]] %[[#vec3_const_ones_f64]] %[[#arg0]]
%hlsl.rcp = fdiv <3 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %p0
ret <3 x double> %hlsl.rcp
}
define spir_func noundef <4 x double> @test_rcp_double4(<4 x double> noundef %p0) #0 {
entry:
- ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_64:]]
- ; CHECK: OpFDiv %[[#vec4_float_64:]] %[[#vec4_const_ones_f64:]] %[[#arg0:]]
+ ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_64]]
+ ; CHECK: OpFDiv %[[#vec4_float_64]] %[[#vec4_const_ones_f64]] %[[#arg0]]
%hlsl.rcp = fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %p0
ret <4 x double> %hlsl.rcp
}
>From b6628c24ef017138b8d6eb288e94c141e7c846b0 Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail at gmail.com>
Date: Mon, 22 Apr 2024 18:41:36 +0200
Subject: [PATCH 6/7] [Clang] Fix crash on invalid size in user-defined
`static_assert` message (#89420)
This addresses two problems observed in #89407 wrt user-defined
`static_assert` messages:
1. In `Expr::EvaluateCharRangeAsString`, we were calling `getExtValue()`
instead of `getZExtValue()`, which would assert if a negative or very
large number was returned from `size()`.
2. If the value could not be converted to `std::size_t`, attempting to
diagnose that would crash because `ext_cce_narrowing` was missing two
`%select` cases.
This fixes #89407.
---
clang/docs/ReleaseNotes.rst | 2 +
.../clang/Basic/DiagnosticSemaKinds.td | 6 +-
clang/lib/AST/ExprConstant.cpp | 4 +-
clang/test/SemaCXX/static-assert-cxx26.cpp | 74 +++++++++++++++++++
4 files changed, 81 insertions(+), 5 deletions(-)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 009531bae8a9de..aea99680c79a0e 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -555,6 +555,8 @@ Bug Fixes to C++ Support
- Fix a crash caused by defined struct in a type alias template when the structure
has fields with dependent type. Fixes (#GH75221).
- Fix the Itanium mangling of lambdas defined in a member of a local class (#GH88906)
+- Fixed a crash when trying to evaluate a user-defined ``static_assert`` message whose ``size()``
+ function returns a large or negative value. Fixes (#GH89407).
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a95424862e63f4..63e951daec7477 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -87,9 +87,9 @@ def err_expr_not_cce : Error<
"call to 'size()'|call to 'data()'}0 is not a constant expression">;
def ext_cce_narrowing : ExtWarn<
"%select{case value|enumerator value|non-type template argument|"
- "array size|explicit specifier argument|noexcept specifier argument}0 "
- "%select{cannot be narrowed from type %2 to %3|"
- "evaluates to %2, which cannot be narrowed to type %3}1">,
+ "array size|explicit specifier argument|noexcept specifier argument|"
+ "call to 'size()'|call to 'data()'}0 %select{cannot be narrowed from "
+ "type %2 to %3|evaluates to %2, which cannot be narrowed to type %3}1">,
InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure;
def err_ice_not_integral : Error<
"%select{integer|integral}1 constant expression must have "
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 73ae8d8efb23a2..de3c2a63913e94 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -16853,13 +16853,13 @@ bool Expr::EvaluateCharRangeAsString(std::string &Result,
if (!::EvaluateInteger(SizeExpression, SizeValue, Info))
return false;
- int64_t Size = SizeValue.getExtValue();
+ uint64_t Size = SizeValue.getZExtValue();
if (!::EvaluatePointer(PtrExpression, String, Info))
return false;
QualType CharTy = PtrExpression->getType()->getPointeeType();
- for (int64_t I = 0; I < Size; ++I) {
+ for (uint64_t I = 0; I < Size; ++I) {
APValue Char;
if (!handleLValueToRValueConversion(Info, PtrExpression, CharTy, String,
Char))
diff --git a/clang/test/SemaCXX/static-assert-cxx26.cpp b/clang/test/SemaCXX/static-assert-cxx26.cpp
index f4ede74f9214a4..7d896d8b365b74 100644
--- a/clang/test/SemaCXX/static-assert-cxx26.cpp
+++ b/clang/test/SemaCXX/static-assert-cxx26.cpp
@@ -341,3 +341,77 @@ struct Callable {
} data;
};
static_assert(false, Callable{}); // expected-error {{static assertion failed: hello}}
+
+namespace GH89407 {
+struct A {
+ constexpr __SIZE_TYPE__ size() const { return -1; }
+ constexpr const char* data() const { return ""; }
+};
+
+struct B {
+ constexpr long long size() const { return 18446744073709551615U; }
+ constexpr const char* data() const { return ""; }
+};
+
+struct C {
+ constexpr __int128 size() const { return -1; }
+ constexpr const char* data() const { return ""; }
+};
+
+struct D {
+ constexpr unsigned __int128 size() const { return -1; }
+ constexpr const char* data() const { return ""; }
+};
+
+struct E {
+ constexpr __SIZE_TYPE__ size() const { return 18446744073709551615U; }
+ constexpr const char* data() const { return ""; }
+};
+
+static_assert(true, A{}); // expected-error {{the message in this static assertion is not a constant expression}}
+ // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}}
+ // expected-error@-1 {{the message in this static assertion is not a constant expression}}
+ // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}}
+ // expected-error@-1 {{the message in this static assertion is not a constant expression}}
+ // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}}
+ // expected-error@-1 {{the message in this static assertion is not a constant expression}}
+ // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+static_assert(true, E{}); // expected-error {{the message in this static assertion is not a constant expression}}
+ // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+
+static_assert(
+ false, // expected-error {{static assertion failed}}
+ A{} // expected-error {{the message in a static assertion must be produced by a constant expression}}
+ // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+
+static_assert(
+ false, // expected-error {{static assertion failed}}
+ B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}}
+ // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}}
+ // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+
+static_assert(
+ false, // expected-error {{static assertion failed}}
+ C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}}
+ // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}}
+ // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+
+static_assert(
+ false, // expected-error {{static assertion failed}}
+ D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}}
+ // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}}
+ // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+
+static_assert(
+ false, // expected-error {{static assertion failed}}
+ E{} // expected-error {{the message in a static assertion must be produced by a constant expression}}
+ // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+}
>From 5bfaf9046b7edcbf57b98e01f745842fe9795ead Mon Sep 17 00:00:00 2001
From: Mogball <jeff at modular.com>
Date: Mon, 22 Apr 2024 17:02:17 +0000
Subject: [PATCH 7/7] [mlir] Update comment about `propertiesAttr` (NFC)
The comment is misleading because `propertiesAttr` is not actually
ignored when the operation isn't unregistered.
---
mlir/include/mlir/IR/OperationSupport.h | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index 2c1c490aac49b8..2c6e8253b4327a 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -960,9 +960,12 @@ struct OperationState {
/// Regions that the op will hold.
SmallVector<std::unique_ptr<Region>, 1> regions;
- // If we're creating an unregistered operation, this Attribute is used to
- // build the properties. Otherwise it is ignored. For registered operations
- // see the `getOrAddProperties` method.
+ // This Attribute is used to opaquely construct the properties of the
+ // operation. If we're creating an unregistered operation, the Attribute is
+ // used as-is as the Properties storage of the operation. Otherwise, the
+ // operation properties are constructed opaquely using its
+ // `setPropertiesFromAttr` hook. Note that `getOrAddProperties` is the
+ // preferred method to construct properties from C++.
Attribute propertiesAttr;
private:
More information about the llvm-branch-commits
mailing list