[llvm-branch-commits] [clang] [compiler-rt] [llvm] [mlir] [mlir][test] Shard the Test Dialect (NFC) (PR #89628)

Jeff Niu via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Apr 22 10:03:05 PDT 2024


https://github.com/Mogball updated https://github.com/llvm/llvm-project/pull/89628

>From 6ad22c879aab88b6bb0531eeb3a6708a82f88cf6 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Mon, 22 Apr 2024 09:24:22 -0700
Subject: [PATCH 1/7] [compiler-rt][ctx_instr] Add `ctx_profile` component
 (#89304)

Add the component structure for contextual instrumented PGO and the bump allocator + test.

(Tracking Issue: #89287, RFC referenced there)
---
 compiler-rt/CMakeLists.txt                    |  2 +
 .../cmake/Modules/AllSupportedArchDefs.cmake  |  1 +
 compiler-rt/cmake/config-ix.cmake             | 11 +++
 compiler-rt/lib/CMakeLists.txt                |  4 ++
 compiler-rt/lib/ctx_profile/CMakeLists.txt    | 28 ++++++++
 .../lib/ctx_profile/CtxInstrProfiling.cpp     | 40 +++++++++++
 .../lib/ctx_profile/CtxInstrProfiling.h       | 55 +++++++++++++++
 .../lib/ctx_profile/tests/CMakeLists.txt      | 70 +++++++++++++++++++
 .../tests/CtxInstrProfilingTest.cpp           | 22 ++++++
 compiler-rt/lib/ctx_profile/tests/driver.cpp  | 14 ++++
 10 files changed, 247 insertions(+)
 create mode 100644 compiler-rt/lib/ctx_profile/CMakeLists.txt
 create mode 100644 compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
 create mode 100644 compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
 create mode 100644 compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
 create mode 100644 compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
 create mode 100644 compiler-rt/lib/ctx_profile/tests/driver.cpp

diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 8649507ce1c79b..6ce451e3cac2e3 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -50,6 +50,8 @@ option(COMPILER_RT_BUILD_LIBFUZZER "Build libFuzzer" ON)
 mark_as_advanced(COMPILER_RT_BUILD_LIBFUZZER)
 option(COMPILER_RT_BUILD_PROFILE "Build profile runtime" ON)
 mark_as_advanced(COMPILER_RT_BUILD_PROFILE)
+option(COMPILER_RT_BUILD_CTX_PROFILE "Build ctx profile runtime" ON)
+mark_as_advanced(COMPILER_RT_BUILD_CTX_PROFILE)
 option(COMPILER_RT_BUILD_MEMPROF "Build memory profiling runtime" ON)
 mark_as_advanced(COMPILER_RT_BUILD_MEMPROF)
 option(COMPILER_RT_BUILD_XRAY_NO_PREINIT "Build xray with no preinit patching" OFF)
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index 423171532c2028..2fe06273a814c7 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -66,6 +66,7 @@ set(ALL_MEMPROF_SUPPORTED_ARCH ${X86_64})
 set(ALL_PROFILE_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${PPC32} ${PPC64}
     ${MIPS32} ${MIPS64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
     ${RISCV32} ${RISCV64} ${LOONGARCH64})
+set(ALL_CTX_PROFILE_SUPPORTED_ARCH ${X86_64})
 set(ALL_TSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${PPC64} ${S390X}
     ${LOONGARCH64} ${RISCV64})
 set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index b281ac64f5d5c7..ba740af9e1d60f 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -632,6 +632,9 @@ if(APPLE)
   list_intersect(PROFILE_SUPPORTED_ARCH
     ALL_PROFILE_SUPPORTED_ARCH
     SANITIZER_COMMON_SUPPORTED_ARCH)
+  list_intersect(CTX_PROFILE_SUPPORTED_ARCH
+    ALL_CTX_PROFILE_SUPPORTED_ARCH
+    SANITIZER_COMMON_SUPPORTED_ARCH)
   list_intersect(TSAN_SUPPORTED_ARCH
     ALL_TSAN_SUPPORTED_ARCH
     SANITIZER_COMMON_SUPPORTED_ARCH)
@@ -678,6 +681,7 @@ else()
   filter_available_targets(HWASAN_SUPPORTED_ARCH ${ALL_HWASAN_SUPPORTED_ARCH})
   filter_available_targets(MEMPROF_SUPPORTED_ARCH ${ALL_MEMPROF_SUPPORTED_ARCH})
   filter_available_targets(PROFILE_SUPPORTED_ARCH ${ALL_PROFILE_SUPPORTED_ARCH})
+  filter_available_targets(CTX_PROFILE_SUPPORTED_ARCH ${ALL_CTX_PROFILE_SUPPORTED_ARCH})
   filter_available_targets(TSAN_SUPPORTED_ARCH ${ALL_TSAN_SUPPORTED_ARCH})
   filter_available_targets(UBSAN_SUPPORTED_ARCH ${ALL_UBSAN_SUPPORTED_ARCH})
   filter_available_targets(SAFESTACK_SUPPORTED_ARCH
@@ -803,6 +807,13 @@ else()
   set(COMPILER_RT_HAS_PROFILE FALSE)
 endif()
 
+if (COMPILER_RT_HAS_SANITIZER_COMMON AND CTX_PROFILE_SUPPORTED_ARCH AND
+    OS_NAME MATCHES "Linux")
+  set(COMPILER_RT_HAS_CTX_PROFILE TRUE)
+else()
+  set(COMPILER_RT_HAS_CTX_PROFILE FALSE)
+endif()
+
 if (COMPILER_RT_HAS_SANITIZER_COMMON AND TSAN_SUPPORTED_ARCH)
   if (OS_NAME MATCHES "Linux|Darwin|FreeBSD|NetBSD")
     set(COMPILER_RT_HAS_TSAN TRUE)
diff --git a/compiler-rt/lib/CMakeLists.txt b/compiler-rt/lib/CMakeLists.txt
index 43ba9a102c8487..f9e96563b88090 100644
--- a/compiler-rt/lib/CMakeLists.txt
+++ b/compiler-rt/lib/CMakeLists.txt
@@ -51,6 +51,10 @@ if(COMPILER_RT_BUILD_PROFILE AND COMPILER_RT_HAS_PROFILE)
   compiler_rt_build_runtime(profile)
 endif()
 
+if(COMPILER_RT_BUILD_CTX_PROFILE AND COMPILER_RT_HAS_CTX_PROFILE)
+  compiler_rt_build_runtime(ctx_profile)
+endif()
+
 if(COMPILER_RT_BUILD_XRAY)
   compiler_rt_build_runtime(xray)
 endif()
diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
new file mode 100644
index 00000000000000..621b7d30b76d41
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -0,0 +1,28 @@
+add_compiler_rt_component(ctx_profile)
+
+set(CTX_PROFILE_SOURCES
+  CtxInstrProfiling.cpp
+  )
+
+set(CTX_PROFILE_HEADERS
+  CtxInstrProfiling.h
+  )
+
+include_directories(..)
+include_directories(../../include)
+
+# We don't use the C++ Standard Library here, so avoid including it by mistake.
+append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
+
+add_compiler_rt_runtime(clang_rt.ctx_profile
+  STATIC
+  ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
+  OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
+  CFLAGS ${EXTRA_FLAGS}
+  SOURCES ${CTX_PROFILE_SOURCES}
+  ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}
+  PARENT_TARGET ctx_profile)
+
+if(COMPILER_RT_INCLUDE_TESTS)
+  add_subdirectory(tests)
+endif()
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
new file mode 100644
index 00000000000000..7620ce92f7ebde
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -0,0 +1,40 @@
+//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CtxInstrProfiling.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_dense_map.h"
+#include "sanitizer_common/sanitizer_mutex.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
+#include "sanitizer_common/sanitizer_thread_safety.h"
+
+#include <assert.h>
+
+using namespace __ctx_profile;
+
+// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
+// the dependency on the latter.
+Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
+  assert(!Prev || Prev->Next == nullptr); // Prev, if any, must be the tail.
+  void *Mem = __sanitizer::InternalAlloc(Size + sizeof(Arena)); // header+data
+  Arena *Created = new (Mem) Arena(Size);
+  if (Prev)
+    Prev->Next = Created;
+  return Created;
+}
+
+void Arena::freeArenaList(Arena *&A) {
+  assert(A);
+  for (Arena *Node = A, *Following = nullptr; Node; Node = Following) {
+    // Read the link before releasing the node that stores it.
+    Following = Node->Next;
+    __sanitizer::InternalFree(Node);
+  }
+  A = nullptr;
+}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
new file mode 100644
index 00000000000000..c1789c32a64c25
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -0,0 +1,55 @@
+/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO  ---------===*\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+
+#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
+#define CTX_PROFILE_CTXINSTRPROFILING_H_
+
+#include <sanitizer/common_interface_defs.h>
+
+namespace __ctx_profile {
+
+/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
+/// Allocation and de-allocation happen using sanitizer APIs. We make that
+/// explicit.
+class Arena final {
+public:
+  // Allocates a new Arena with Size usable bytes. If Prev is given it must be
+  // the last Arena of its list (its Next is null); the new Arena is linked
+  // after it. Only appending to the arena list is supported.
+  static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
+  static void freeArenaList(Arena *&A); // Frees the list and nulls A.
+
+  uint64_t size() const { return Size; }
+
+  // Allocate S bytes or return nullptr if we don't have that many available.
+  char *tryBumpAllocate(size_t S) {
+    if (S > Size - Pos) // Pos <= Size always holds, so this cannot wrap.
+      return nullptr;
+    Pos += S;
+    return start() + (Pos - S);
+  }
+
+  Arena *next() const { return Next; }
+
+  // The beginning of allocatable memory: the first byte past this object.
+  const char *start() const { return const_cast<Arena *>(this)->start(); }
+  const char *pos() const { return start() + Pos; }
+
+private:
+  explicit Arena(uint64_t Size) : Size(Size) {} // 64-bit: no truncation.
+  ~Arena() = delete;
+
+  char *start() { return reinterpret_cast<char *>(&this[1]); }
+
+  Arena *Next = nullptr;
+  uint64_t Pos = 0;
+  const uint64_t Size;
+};
+
+} // namespace __ctx_profile
+#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
diff --git a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
new file mode 100644
index 00000000000000..93b41b838445d1
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
@@ -0,0 +1,70 @@
+include(CheckCXXCompilerFlag)
+include(CompilerRTCompile)
+include(CompilerRTLink)
+
+set(CTX_PROFILE_UNITTEST_CFLAGS
+  ${COMPILER_RT_UNITTEST_CFLAGS}
+  ${COMPILER_RT_GTEST_CFLAGS}
+  ${COMPILER_RT_GMOCK_CFLAGS}
+  ${SANITIZER_TEST_CXX_CFLAGS}
+  -I${COMPILER_RT_SOURCE_DIR}/lib/
+  -DSANITIZER_COMMON_NO_REDEFINE_BUILTINS
+  -O2
+  -g
+  -fno-rtti
+  -Wno-pedantic
+  -fno-omit-frame-pointer)
+
+# Suppress warnings for gmock variadic macros for clang and gcc respectively.
+append_list_if(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG -Wno-gnu-zero-variadic-macro-arguments CTX_PROFILE_UNITTEST_CFLAGS)
+append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros CTX_PROFILE_UNITTEST_CFLAGS)
+
+file(GLOB CTX_PROFILE_HEADERS ../*.h) # feeds CTX_PROFILE_UNIT_TEST_HEADERS
+
+set(CTX_PROFILE_SOURCES
+  ../CtxInstrProfiling.cpp)
+
+set(CTX_PROFILE_UNITTESTS
+  CtxInstrProfilingTest.cpp
+  driver.cpp)
+
+include_directories(../../../include)
+
+set(CTX_PROFILE_UNIT_TEST_HEADERS
+  ${CTX_PROFILE_HEADERS})
+
+set(CTX_PROFILE_UNITTEST_LINK_FLAGS
+  ${COMPILER_RT_UNITTEST_LINK_FLAGS})
+
+list(APPEND CTX_PROFILE_UNITTEST_LINK_FLAGS -pthread)
+
+set(CTX_PROFILE_UNITTEST_LINK_LIBRARIES
+  ${COMPILER_RT_UNWINDER_LINK_LIBS}
+  ${SANITIZER_TEST_CXX_LIBRARIES})
+list(APPEND CTX_PROFILE_UNITTEST_LINK_LIBRARIES "dl")
+
+if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST CTX_PROFILE_SUPPORTED_ARCH)
+  # Profile unit tests are only run on the host machine.
+  set(arch ${COMPILER_RT_DEFAULT_TARGET_ARCH})
+
+  add_executable(CtxProfileUnitTests 
+    ${CTX_PROFILE_UNITTESTS}
+    ${COMPILER_RT_GTEST_SOURCE}
+    ${COMPILER_RT_GMOCK_SOURCE}
+    ${CTX_PROFILE_SOURCES}
+    $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+    $<TARGET_OBJECTS:RTSanitizerCommonCoverage.${arch}>
+    $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
+    $<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
+    $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.${arch}>)
+  set_target_compile_flags(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_CFLAGS})
+  set_target_link_flags(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_LINK_FLAGS})
+  target_link_libraries(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_LINK_LIBRARIES})
+
+  if (TARGET cxx-headers OR HAVE_LIBCXX)
+    add_dependencies(CtxProfileUnitTests cxx-headers)
+  endif()
+
+  set_target_properties(CtxProfileUnitTests PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
new file mode 100644
index 00000000000000..44f37d25763206
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -0,0 +1,22 @@
+#include "../CtxInstrProfiling.h"
+#include "gtest/gtest.h"
+
+using namespace __ctx_profile;
+
+TEST(ArenaTest, Basic) {
+  Arena *Head = Arena::allocateNewArena(1024);
+  EXPECT_EQ(Head->size(), 1024U);
+  EXPECT_EQ(Head->next(), nullptr);
+  // Two allocations that together use up the arena sit back to back.
+  char *First = Head->tryBumpAllocate(1020);
+  EXPECT_NE(First, nullptr);
+  char *Second = Head->tryBumpAllocate(4);
+  EXPECT_NE(Second, nullptr);
+  EXPECT_EQ(First + 1020, Second);
+  EXPECT_EQ(Head->tryBumpAllocate(1), nullptr);
+  Arena *Tail = Arena::allocateNewArena(2024, Head);
+  EXPECT_EQ(Head->next(), Tail);
+  EXPECT_EQ(Tail->next(), nullptr);
+  Arena::freeArenaList(Head);
+  EXPECT_EQ(Head, nullptr);
+}
diff --git a/compiler-rt/lib/ctx_profile/tests/driver.cpp b/compiler-rt/lib/ctx_profile/tests/driver.cpp
new file mode 100644
index 00000000000000..b402cec1126b33
--- /dev/null
+++ b/compiler-rt/lib/ctx_profile/tests/driver.cpp
@@ -0,0 +1,14 @@
+//===-- driver.cpp ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+
+int main(int argc, char **argv) { // Entry point of the unit test binary.
+  testing::InitGoogleTest(&argc, argv); // Hands the command line to gtest.
+  return RUN_ALL_TESTS();               // gtest's result is the exit status.
+}

>From 0ab0c1d982876662a45adb9bafaa3c2d3bdf1939 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 22 Apr 2024 12:31:57 -0400
Subject: [PATCH 2/7] [SLP]Introduce transformNodes() and transform loads +
 reverse to strided loads.

Introduced transformNodes() function to perform transformation of the
nodes (cost-based, instruction count based, etc.).
Implemented transformation of consecutive loads + reverse order to
strided loads with stride -1, if profitable.

Reviewers: RKSimon, preames, topperc

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/88530
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 43 +++++++++++++++++++
 .../RISCV/strided-loads-vectorized.ll         |  5 +--
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 685a5907d94fe7..6ac380a6ab6c6c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1126,6 +1126,9 @@ class BoUpSLP {
   void
   buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
 
+  /// Transforms graph nodes to target specific representations, if profitable.
+  void transformNodes();
+
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
     VectorizableTree.clear();
@@ -7813,6 +7816,43 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
   return std::make_pair(ScalarCost, VecCost);
 }
 
+void BoUpSLP::transformNodes() {
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    TreeEntry &Node = *TE;
+    // Load nodes are the only ones with an alternative representation here;
+    // every other opcode is left untouched.
+    if (Node.getOpcode() != Instruction::Load)
+      continue;
+
+    Type *ElemTy = Node.getMainOp()->getType();
+    auto *VecTy = FixedVectorType::get(ElemTy, Node.Scalars.size());
+    Align CommonAlignment = computeCommonAlignment<LoadInst>(Node.Scalars);
+    // Candidate: a consecutive load followed by a reverse shuffle, which can
+    // be expressed as a single strided load with stride -1 when the target
+    // reports strided accesses as legal for this type and alignment.
+    if (!isReverseOrder(Node.ReorderIndices) ||
+        !TTI->isLegalStridedLoadStore(VecTy, CommonAlignment))
+      continue;
+
+    SmallVector<int> Mask;
+    inversePermutation(Node.ReorderIndices, Mask);
+    auto *BaseLI = cast<LoadInst>(Node.Scalars.back());
+    InstructionCost LoadCost = TTI->getMemoryOpCost(
+        Instruction::Load, VecTy, BaseLI->getAlign(),
+        BaseLI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+    InstructionCost ReverseCost =
+        ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
+    InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
+        Instruction::Load, VecTy, BaseLI->getPointerOperand(),
+        /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
+    // Strided load is more profitable than consecutive load + reverse -
+    // transform the node to strided load.
+    if (StridedCost < LoadCost + ReverseCost)
+      Node.State = TreeEntry::StridedVectorize;
+  }
+}
+
 /// Merges shuffle masks and emits final shuffle instruction, if required. It
 /// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
 /// when the actual shuffle instruction is generated only if this is actually
@@ -15189,6 +15229,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   R.buildExternalUses();
 
   R.computeMinimumValueSizes();
+  R.transformNodes();
 
   InstructionCost Cost = R.getTreeCost();
 
@@ -15567,6 +15608,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
       R.buildExternalUses();
 
       R.computeMinimumValueSizes();
+      R.transformNodes();
       InstructionCost Cost = R.getTreeCost();
       CandidateFound = true;
       MinCost = std::min(MinCost, Cost);
@@ -16563,6 +16605,7 @@ class HorizontalReduction {
         V.buildExternalUses(LocalExternallyUsedValues);
 
         V.computeMinimumValueSizes();
+        V.transformNodes();
 
         // Estimate cost.
         InstructionCost TreeCost = V.getTreeCost(VL);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
index 03acc0009fb04c..44d320c75fedd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll
@@ -240,11 +240,10 @@ define void @test3(ptr %p, ptr noalias %s) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x float], ptr [[P:%.*]], i64 0, i64 0
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 30
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[S:%.*]], i64 0
-; CHECK-NEXT:    [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x float], ptr [[P]], i64 0, i64 23
 ; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX]], i64 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX48]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.experimental.vp.strided.load.v8f32.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
 ; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <8 x float> [[TMP2]], [[TMP0]]
 ; CHECK-NEXT:    store <8 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    ret void

>From 832d3a42c34eee2a6ca323ef97a1c6fe14c1f651 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 22 Apr 2024 17:18:43 +0100
Subject: [PATCH 3/7] [X86] gfni-funnel-shifts.ll - add vXi8
 variable/splat/constant test coverage

Once #89115 has landed, we can handle per-element rotates as well using (V)GF2P8MULB
---
 llvm/test/CodeGen/X86/gfni-funnel-shifts.ll | 2686 ++++++++++++++++++-
 1 file changed, 2684 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index a98983e44d3d0c..0c341dc63a9ecc 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -9,6 +9,486 @@
 ; 128 Bit Vector Funnel Shifts
 ;
 
+define <16 x i8> @var_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshl_v16i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT:    pxor %xmm3, %xmm3
+; GFNISSE-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; GFNISSE-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
+; GFNISSE-NEXT:    paddd %xmm6, %xmm2
+; GFNISSE-NEXT:    cvttps2dq %xmm2, %xmm2
+; GFNISSE-NEXT:    pslld $23, %xmm3
+; GFNISSE-NEXT:    paddd %xmm6, %xmm3
+; GFNISSE-NEXT:    cvttps2dq %xmm3, %xmm3
+; GFNISSE-NEXT:    packusdw %xmm2, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm7
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; GFNISSE-NEXT:    pmullw %xmm3, %xmm7
+; GFNISSE-NEXT:    psrlw $8, %xmm7
+; GFNISSE-NEXT:    pslld $23, %xmm4
+; GFNISSE-NEXT:    paddd %xmm6, %xmm4
+; GFNISSE-NEXT:    cvttps2dq %xmm4, %xmm2
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm5
+; GFNISSE-NEXT:    paddd %xmm6, %xmm5
+; GFNISSE-NEXT:    cvttps2dq %xmm5, %xmm3
+; GFNISSE-NEXT:    packusdw %xmm3, %xmm2
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT:    pmullw %xmm1, %xmm2
+; GFNISSE-NEXT:    psrlw $8, %xmm2
+; GFNISSE-NEXT:    packuswb %xmm7, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: var_fshl_v16i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackusdw %xmm2, %xmm4, %xmm2
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: var_fshl_v16i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; GFNIAVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNIAVX2-NEXT:    vpsllvd %ymm4, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; GFNIAVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; GFNIAVX2-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; GFNIAVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
+; GFNIAVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; GFNIAVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vzeroupper
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: var_fshl_v16i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %xmm1, %xmm1
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vzeroupper
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: var_fshl_v16i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GFNIAVX512BW-NEXT:    vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT:    vzeroupper
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt)
+  ret <16 x i8> %res
+}
+
+; Funnel shift right of <16 x i8> where each lane takes its own shift amount
+; from %amt (passed unmodified to @llvm.fshr.v16i8).
+define <16 x i8> @var_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshr_v16i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm2
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    pand %xmm5, %xmm0
+; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
+; GFNISSE-NEXT:    paddb %xmm0, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
+; GFNISSE-NEXT:    psrlw $4, %xmm6
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
+; GFNISSE-NEXT:    psrlw $2, %xmm6
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
+; GFNISSE-NEXT:    psrlw $1, %xmm6
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; GFNISSE-NEXT:    paddb %xmm4, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
+; GFNISSE-NEXT:    pandn %xmm5, %xmm3
+; GFNISSE-NEXT:    psllw $5, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
+; GFNISSE-NEXT:    paddb %xmm3, %xmm4
+; GFNISSE-NEXT:    paddb %xmm2, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
+; GFNISSE-NEXT:    psllw $4, %xmm5
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
+; GFNISSE-NEXT:    psllw $2, %xmm3
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
+; GFNISSE-NEXT:    paddb %xmm2, %xmm3
+; GFNISSE-NEXT:    paddb %xmm4, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
+; GFNISSE-NEXT:    por %xmm1, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: var_fshr_v16i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm6
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $2, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm2
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
+; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: var_fshr_v16i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX2-NEXT:    vpsllw $5, %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
+; GFNIAVX2-NEXT:    vpsrlw $4, %xmm1, %xmm6
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; GFNIAVX2-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
+; GFNIAVX2-NEXT:    vpsrlw $2, %xmm1, %xmm4
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX2-NEXT:    vpsrlw $1, %xmm1, %xmm4
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpsllw $5, %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
+; GFNIAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpsllw $4, %xmm0, %xmm4
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpsllw $2, %xmm0, %xmm2
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
+; GFNIAVX2-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
+; GFNIAVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: var_fshr_v16i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; GFNIAVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vzeroupper
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: var_fshr_v16i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GFNIAVX512BW-NEXT:    vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT:    vzeroupper
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt)
+  ret <16 x i8> %res
+}
+
+; Funnel shift left of <16 x i8> with a uniform, non-constant amount: lane 0
+; of %amt is broadcast to all lanes (zeroinitializer shuffle mask) before the
+; call to @llvm.fshl.v16i8.
+define <16 x i8> @splatvar_fshl_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshl_v16i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm3
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT:    psllw %xmm2, %xmm3
+; GFNISSE-NEXT:    psrlw $8, %xmm3
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT:    psllw %xmm2, %xmm1
+; GFNISSE-NEXT:    psrlw $8, %xmm1
+; GFNISSE-NEXT:    packuswb %xmm3, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX-LABEL: splatvar_fshl_v16i8:
+; GFNIAVX:       # %bb.0:
+; GFNIAVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
+; GFNIAVX-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX-NEXT:    retq
+  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %splat)
+  ret <16 x i8> %res
+}
+
+; Funnel shift right of <16 x i8> with a uniform, non-constant amount: lane 0
+; of %amt is broadcast to all lanes (zeroinitializer shuffle mask) before the
+; call to @llvm.fshr.v16i8.
+define <16 x i8> @splatvar_fshr_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshr_v16i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm4
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT:    psrlw %xmm2, %xmm4
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; GFNISSE-NEXT:    pand %xmm3, %xmm4
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT:    psrlw %xmm2, %xmm1
+; GFNISSE-NEXT:    pand %xmm1, %xmm3
+; GFNISSE-NEXT:    packuswb %xmm4, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: splatvar_fshr_v16i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: splatvar_fshr_v16i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; GFNIAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; GFNIAVX2-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX2-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: splatvar_fshr_v16i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; GFNIAVX512VL-NEXT:    vpand %xmm4, %xmm3, %xmm3
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: splatvar_fshr_v16i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX512BW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
+; GFNIAVX512BW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT:    vzeroupper
+; GFNIAVX512BW-NEXT:    retq
+  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %splat)
+  ret <16 x i8> %res
+}
+
+; Funnel shift left of <16 x i8> by a constant, per-lane amount vector
+; <0,1,...,7,8,7,...,1>; lane 8 uses an amount equal to the i8 element width.
+define <16 x i8> @constant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshl_v16i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNISSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT:    psrlw $8, %xmm2
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT:    psrlw $8, %xmm1
+; GFNISSE-NEXT:    packuswb %xmm2, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1OR2-LABEL: constant_fshl_v16i8:
+; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1OR2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1OR2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: constant_fshl_v16i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: constant_fshl_v16i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GFNIAVX512BW-NEXT:    vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT:    vzeroupper
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <16 x i8> %res
+}
+
+; Constant-amount funnel shift test for <16 x i8> with per-lane amounts
+; <0,1,...,7,8,7,...,1>.
+; NOTE(review): despite the "fshr" name, the body calls @llvm.fshl.v16i8 with
+; the same operands as constant_fshl_v16i8, and the expected assembly below
+; mirrors that function line for line. Presumably @llvm.fshr.v16i8 was
+; intended - confirm, then switch the call and regenerate the assertions.
+define <16 x i8> @constant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshr_v16i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNISSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; GFNISSE-NEXT:    psrlw $8, %xmm2
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNISSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; GFNISSE-NEXT:    psrlw $8, %xmm1
+; GFNISSE-NEXT:    packuswb %xmm2, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1OR2-LABEL: constant_fshr_v16i8:
+; GFNIAVX1OR2:       # %bb.0:
+; GFNIAVX1OR2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1OR2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1OR2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1OR2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX1OR2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: constant_fshr_v16i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: constant_fshr_v16i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; GFNIAVX512BW-NEXT:    vpsllw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
+; GFNIAVX512BW-NEXT:    vzeroupper
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <16 x i8> %res
+}
+
 define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v16i8:
 ; GFNISSE:       # %bb.0:
@@ -71,6 +551,788 @@ declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
 ; 256 Bit Vector Funnel Shifts
 ;
 
+define <32 x i8> @var_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshl_v32i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm6
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT:    pand %xmm8, %xmm4
+; GFNISSE-NEXT:    pxor %xmm7, %xmm7
+; GFNISSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm10
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15]
+; GFNISSE-NEXT:    pmovzxwd {{.*#+}} xmm11 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm10
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; GFNISSE-NEXT:    paddd %xmm4, %xmm10
+; GFNISSE-NEXT:    cvttps2dq %xmm10, %xmm10
+; GFNISSE-NEXT:    pslld $23, %xmm11
+; GFNISSE-NEXT:    paddd %xmm4, %xmm11
+; GFNISSE-NEXT:    cvttps2dq %xmm11, %xmm11
+; GFNISSE-NEXT:    packusdw %xmm10, %xmm11
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm10
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15]
+; GFNISSE-NEXT:    pmullw %xmm11, %xmm10
+; GFNISSE-NEXT:    psrlw $8, %xmm10
+; GFNISSE-NEXT:    pslld $23, %xmm0
+; GFNISSE-NEXT:    paddd %xmm4, %xmm0
+; GFNISSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm9
+; GFNISSE-NEXT:    paddd %xmm4, %xmm9
+; GFNISSE-NEXT:    cvttps2dq %xmm9, %xmm9
+; GFNISSE-NEXT:    packusdw %xmm9, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; GFNISSE-NEXT:    pmullw %xmm2, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    packuswb %xmm10, %xmm0
+; GFNISSE-NEXT:    pand %xmm8, %xmm5
+; GFNISSE-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; GFNISSE-NEXT:    pmovzxwd {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm5
+; GFNISSE-NEXT:    paddd %xmm4, %xmm5
+; GFNISSE-NEXT:    cvttps2dq %xmm5, %xmm5
+; GFNISSE-NEXT:    pslld $23, %xmm7
+; GFNISSE-NEXT:    paddd %xmm4, %xmm7
+; GFNISSE-NEXT:    cvttps2dq %xmm7, %xmm7
+; GFNISSE-NEXT:    packusdw %xmm5, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm5
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; GFNISSE-NEXT:    pmullw %xmm7, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    pslld $23, %xmm2
+; GFNISSE-NEXT:    paddd %xmm4, %xmm2
+; GFNISSE-NEXT:    cvttps2dq %xmm2, %xmm2
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm6
+; GFNISSE-NEXT:    paddd %xmm4, %xmm6
+; GFNISSE-NEXT:    cvttps2dq %xmm6, %xmm4
+; GFNISSE-NEXT:    packusdw %xmm4, %xmm2
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT:    pmullw %xmm3, %xmm2
+; GFNISSE-NEXT:    psrlw $8, %xmm2
+; GFNISSE-NEXT:    packuswb %xmm5, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: var_fshl_v32i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; GFNIAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm6[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm3, %xmm7
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT:    vpaddd %xmm3, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpaddd %xmm3, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpackusdw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm8
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm6, %xmm9, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpaddd %xmm3, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpackusdw %xmm4, %xmm9, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm4, %xmm7, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpaddd %xmm3, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpackusdw %xmm6, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm6, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpaddd %xmm3, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackusdw %xmm2, %xmm6, %xmm2
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: var_fshl_v32i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
+; GFNIAVX2-NEXT:    vpsllvd %ymm7, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
+; GFNIAVX2-NEXT:    vpsllvd %ymm6, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpackusdw %ymm5, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15]
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
+; GFNIAVX2-NEXT:    vpsllvd %ymm5, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11]
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
+; GFNIAVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: var_fshl_v32i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm5
+; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: var_fshl_v32i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GFNIAVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @var_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshr_v32i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm9
+; GFNISSE-NEXT:    psrlw $4, %xmm9
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNISSE-NEXT:    pand %xmm8, %xmm9
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    pand %xmm7, %xmm0
+; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm9, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm10
+; GFNISSE-NEXT:    psrlw $2, %xmm10
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNISSE-NEXT:    pand %xmm9, %xmm10
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm10, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm11
+; GFNISSE-NEXT:    psrlw $1, %xmm11
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNISSE-NEXT:    pand %xmm10, %xmm11
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm11, %xmm2
+; GFNISSE-NEXT:    paddb %xmm4, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm12
+; GFNISSE-NEXT:    psllw $4, %xmm12
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNISSE-NEXT:    pand %xmm11, %xmm12
+; GFNISSE-NEXT:    pandn %xmm7, %xmm6
+; GFNISSE-NEXT:    psllw $5, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm13
+; GFNISSE-NEXT:    psllw $2, %xmm13
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNISSE-NEXT:    pand %xmm12, %xmm13
+; GFNISSE-NEXT:    paddb %xmm6, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm13
+; GFNISSE-NEXT:    paddb %xmm4, %xmm13
+; GFNISSE-NEXT:    paddb %xmm6, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm4
+; GFNISSE-NEXT:    por %xmm2, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm2
+; GFNISSE-NEXT:    psrlw $4, %xmm2
+; GFNISSE-NEXT:    pand %xmm8, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    pand %xmm7, %xmm0
+; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm2
+; GFNISSE-NEXT:    psrlw $2, %xmm2
+; GFNISSE-NEXT:    pand %xmm9, %xmm2
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm2
+; GFNISSE-NEXT:    psrlw $1, %xmm2
+; GFNISSE-NEXT:    pand %xmm10, %xmm2
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
+; GFNISSE-NEXT:    paddb %xmm1, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
+; GFNISSE-NEXT:    psllw $4, %xmm2
+; GFNISSE-NEXT:    pand %xmm11, %xmm2
+; GFNISSE-NEXT:    pandn %xmm7, %xmm5
+; GFNISSE-NEXT:    psllw $5, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
+; GFNISSE-NEXT:    psllw $2, %xmm2
+; GFNISSE-NEXT:    pand %xmm12, %xmm2
+; GFNISSE-NEXT:    paddb %xmm5, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
+; GFNISSE-NEXT:    paddb %xmm1, %xmm2
+; GFNISSE-NEXT:    paddb %xmm5, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
+; GFNISSE-NEXT:    por %xmm3, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: var_fshr_v32i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $4, %xmm5, %xmm3
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm6
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm6, %xmm5, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $2, %xmm6, %xmm9
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm9, %xmm6, %xmm9
+; GFNIAVX1-NEXT:    vpsrlw $1, %xmm9, %xmm10
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX1-NEXT:    vpand %xmm6, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm10, %xmm9, %xmm8
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpsllw $4, %xmm9, %xmm10
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm11 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX1-NEXT:    vpand %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpxor %xmm3, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm10, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpsllw $2, %xmm9, %xmm10
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX1-NEXT:    vpand %xmm12, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm10, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm10
+; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm10, %xmm9, %xmm7
+; GFNIAVX1-NEXT:    vpor %xmm7, %xmm8, %xmm7
+; GFNIAVX1-NEXT:    vpsrlw $4, %xmm1, %xmm8
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm8, %xmm4
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm8
+; GFNIAVX1-NEXT:    vpblendvb %xmm8, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $2, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm8, %xmm8, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm11, %xmm4
+; GFNIAVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm3
+; GFNIAVX1-NEXT:    vpand %xmm3, %xmm12, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: var_fshr_v32i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrlw $2, %ymm1, %ymm4
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrlw $1, %ymm1, %ymm4
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpandn %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm4
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: var_fshr_v32i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm4, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm5, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: var_fshr_v32i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GFNIAVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @splatvar_fshl_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshl_v32i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; GFNISSE-NEXT:    psllw %xmm4, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNISSE-NEXT:    psllw %xmm4, %xmm2
+; GFNISSE-NEXT:    psrlw $8, %xmm2
+; GFNISSE-NEXT:    packuswb %xmm5, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    psllw %xmm4, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT:    psllw %xmm4, %xmm3
+; GFNISSE-NEXT:    psrlw $8, %xmm3
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: splatvar_fshl_v32i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsllw %xmm2, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpackuswb %xmm5, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpsllw %xmm2, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: splatvar_fshl_v32i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512-LABEL: splatvar_fshl_v32i8:
+; GFNIAVX512:       # %bb.0:
+; GFNIAVX512-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
+; GFNIAVX512-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; GFNIAVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    retq
+  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %splat)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @splatvar_fshr_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshr_v32i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm6
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; GFNISSE-NEXT:    psrlw %xmm4, %xmm6
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; GFNISSE-NEXT:    pand %xmm5, %xmm6
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNISSE-NEXT:    psrlw %xmm4, %xmm2
+; GFNISSE-NEXT:    pand %xmm5, %xmm2
+; GFNISSE-NEXT:    packuswb %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    psrlw %xmm4, %xmm0
+; GFNISSE-NEXT:    pand %xmm5, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT:    psrlw %xmm4, %xmm3
+; GFNISSE-NEXT:    pand %xmm3, %xmm5
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: splatvar_fshr_v32i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw %xmm2, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
+; GFNIAVX1-NEXT:    vpand %xmm6, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpackuswb %xmm5, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpsrlw %xmm2, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: splatvar_fshr_v32i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: splatvar_fshr_v32i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: splatvar_fshr_v32i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512BW-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
+; GFNIAVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512BW-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
+  %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer
+  %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %splat)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @constant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshl_v32i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT:    pmullw %xmm6, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT:    pmullw %xmm4, %xmm2
+; GFNISSE-NEXT:    psrlw $8, %xmm2
+; GFNISSE-NEXT:    packuswb %xmm5, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    pmullw %xmm6, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT:    pmullw %xmm3, %xmm4
+; GFNISSE-NEXT:    psrlw $8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: constant_fshl_v32i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: constant_fshl_v32i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: constant_fshl_v32i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: constant_fshl_v32i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GFNIAVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) ; Non-uniform constant amounts: each 16-byte half ramps 0..8 then back down to 1; the i8 8 entry wraps to 0 because fshl takes the amount modulo the 8-bit element width.
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @constant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshr_v32i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm5
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm6 = [1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT:    pmullw %xmm6, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT:    pmullw %xmm4, %xmm2
+; GFNISSE-NEXT:    psrlw $8, %xmm2
+; GFNISSE-NEXT:    packuswb %xmm5, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    pmullw %xmm6, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNISSE-NEXT:    pmullw %xmm3, %xmm4
+; GFNISSE-NEXT:    psrlw $8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm1
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: constant_fshr_v32i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = [1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm3, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: constant_fshr_v32i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: constant_fshr_v32i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: constant_fshr_v32i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; GFNIAVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; GFNIAVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) ; NOTE(review): this function is named constant_fshr but calls @llvm.fshl.v32i8, and its CHECK lines are identical to those of constant_fshl_v32i8, so the fshr lowering is not exercised here - looks like a copy-paste slip; confirm, then switch to @llvm.fshr.v32i8 and regenerate the CHECK lines.
+  ret <32 x i8> %res
+}
+
 define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v32i8:
 ; GFNISSE:       # %bb.0:
@@ -187,6 +1449,1428 @@ declare <32 x i8> @llvm.fshr.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)
 ; 512 Bit Vector Funnel Shifts
 ;
 
+define <64 x i8> @var_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshl_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
+; GFNISSE-NEXT:    pand %xmm9, %xmm0
+; GFNISSE-NEXT:    pxor %xmm10, %xmm10
+; GFNISSE-NEXT:    pmovzxbd {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; GFNISSE-NEXT:    pmovzxwd {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm0
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [1065353216,1065353216,1065353216,1065353216]
+; GFNISSE-NEXT:    paddd %xmm11, %xmm0
+; GFNISSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; GFNISSE-NEXT:    pslld $23, %xmm14
+; GFNISSE-NEXT:    paddd %xmm11, %xmm14
+; GFNISSE-NEXT:    cvttps2dq %xmm14, %xmm14
+; GFNISSE-NEXT:    packusdw %xmm0, %xmm14
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm15
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; GFNISSE-NEXT:    pmullw %xmm14, %xmm15
+; GFNISSE-NEXT:    psrlw $8, %xmm15
+; GFNISSE-NEXT:    pslld $23, %xmm12
+; GFNISSE-NEXT:    paddd %xmm11, %xmm12
+; GFNISSE-NEXT:    cvttps2dq %xmm12, %xmm0
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm13
+; GFNISSE-NEXT:    paddd %xmm11, %xmm13
+; GFNISSE-NEXT:    cvttps2dq %xmm13, %xmm12
+; GFNISSE-NEXT:    packusdw %xmm12, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; GFNISSE-NEXT:    pmullw %xmm4, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    packuswb %xmm15, %xmm0
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
+; GFNISSE-NEXT:    pand %xmm9, %xmm1
+; GFNISSE-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm12 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
+; GFNISSE-NEXT:    pmovzxwd {{.*#+}} xmm13 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm1
+; GFNISSE-NEXT:    paddd %xmm11, %xmm1
+; GFNISSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; GFNISSE-NEXT:    pslld $23, %xmm13
+; GFNISSE-NEXT:    paddd %xmm11, %xmm13
+; GFNISSE-NEXT:    cvttps2dq %xmm13, %xmm13
+; GFNISSE-NEXT:    packusdw %xmm1, %xmm13
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm14
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15]
+; GFNISSE-NEXT:    pmullw %xmm13, %xmm14
+; GFNISSE-NEXT:    psrlw $8, %xmm14
+; GFNISSE-NEXT:    pslld $23, %xmm4
+; GFNISSE-NEXT:    paddd %xmm11, %xmm4
+; GFNISSE-NEXT:    cvttps2dq %xmm4, %xmm1
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm12
+; GFNISSE-NEXT:    paddd %xmm11, %xmm12
+; GFNISSE-NEXT:    cvttps2dq %xmm12, %xmm4
+; GFNISSE-NEXT:    packusdw %xmm4, %xmm1
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; GFNISSE-NEXT:    pmullw %xmm5, %xmm1
+; GFNISSE-NEXT:    psrlw $8, %xmm1
+; GFNISSE-NEXT:    packuswb %xmm14, %xmm1
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4
+; GFNISSE-NEXT:    pand %xmm9, %xmm4
+; GFNISSE-NEXT:    pmovzxbd {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
+; GFNISSE-NEXT:    pmovzxwd {{.*#+}} xmm12 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm4
+; GFNISSE-NEXT:    paddd %xmm11, %xmm4
+; GFNISSE-NEXT:    cvttps2dq %xmm4, %xmm4
+; GFNISSE-NEXT:    pslld $23, %xmm12
+; GFNISSE-NEXT:    paddd %xmm11, %xmm12
+; GFNISSE-NEXT:    cvttps2dq %xmm12, %xmm12
+; GFNISSE-NEXT:    packusdw %xmm4, %xmm12
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm13
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15]
+; GFNISSE-NEXT:    pmullw %xmm12, %xmm13
+; GFNISSE-NEXT:    psrlw $8, %xmm13
+; GFNISSE-NEXT:    pslld $23, %xmm5
+; GFNISSE-NEXT:    paddd %xmm11, %xmm5
+; GFNISSE-NEXT:    cvttps2dq %xmm5, %xmm4
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm8
+; GFNISSE-NEXT:    paddd %xmm11, %xmm8
+; GFNISSE-NEXT:    cvttps2dq %xmm8, %xmm5
+; GFNISSE-NEXT:    packusdw %xmm5, %xmm4
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT:    pmullw %xmm6, %xmm4
+; GFNISSE-NEXT:    psrlw $8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm13, %xmm4
+; GFNISSE-NEXT:    pand {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15]
+; GFNISSE-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm9
+; GFNISSE-NEXT:    paddd %xmm11, %xmm9
+; GFNISSE-NEXT:    cvttps2dq %xmm9, %xmm8
+; GFNISSE-NEXT:    pslld $23, %xmm5
+; GFNISSE-NEXT:    paddd %xmm11, %xmm5
+; GFNISSE-NEXT:    cvttps2dq %xmm5, %xmm5
+; GFNISSE-NEXT:    packusdw %xmm8, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm8
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm3[8],xmm8[9],xmm3[9],xmm8[10],xmm3[10],xmm8[11],xmm3[11],xmm8[12],xmm3[12],xmm8[13],xmm3[13],xmm8[14],xmm3[14],xmm8[15],xmm3[15]
+; GFNISSE-NEXT:    pmullw %xmm5, %xmm8
+; GFNISSE-NEXT:    psrlw $8, %xmm8
+; GFNISSE-NEXT:    pslld $23, %xmm2
+; GFNISSE-NEXT:    paddd %xmm11, %xmm2
+; GFNISSE-NEXT:    cvttps2dq %xmm2, %xmm5
+; GFNISSE-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
+; GFNISSE-NEXT:    pslld $23, %xmm6
+; GFNISSE-NEXT:    paddd %xmm11, %xmm6
+; GFNISSE-NEXT:    cvttps2dq %xmm6, %xmm2
+; GFNISSE-NEXT:    packusdw %xmm2, %xmm5
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT:    pmullw %xmm7, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    packuswb %xmm8, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: var_fshl_v64i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX1-NEXT:    vandps %ymm7, %ymm4, %ymm8
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
+; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm10[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm4, %xmm11
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpackusdw %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm11
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm12
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm10, %xmm13, %xmm10
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpmovzxbd {{.*#+}} xmm13 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpackusdw %xmm9, %xmm13, %xmm9
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm9, %xmm11, %xmm9
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpackuswb %xmm10, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm10[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpmovzxwd {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpackusdw %xmm11, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm10, %xmm11, %xmm10
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpackusdw %xmm8, %xmm11, %xmm8
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm0, %xmm8, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm10, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vandps %ymm7, %ymm5, %ymm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm7[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpackusdw %xmm8, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm8
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm9
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm10, %xmm7
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpackusdw %xmm5, %xmm10, %xmm5
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm8, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm7 = xmm6[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpackusdw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm6, %xmm7, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; GFNIAVX1-NEXT:    vpslld $23, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; GFNIAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
+; GFNIAVX1-NEXT:    vpslld $23, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackusdw %xmm2, %xmm7, %xmm2
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: var_fshl_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[12],ymm7[12],ymm6[13],ymm7[13],ymm6[14],ymm7[14],ymm6[15],ymm7[15]
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT:    vpand %ymm4, %ymm9, %ymm4
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[12],ymm6[12],ymm10[13],ymm6[13],ymm10[14],ymm6[14],ymm10[15],ymm6[15]
+; GFNIAVX2-NEXT:    vpsllvd %ymm11, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[8],ymm7[8],ymm6[9],ymm7[9],ymm6[10],ymm7[10],ymm6[11],ymm7[11]
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[8],ymm6[8],ymm10[9],ymm6[9],ymm10[10],ymm6[10],ymm10[11],ymm6[11]
+; GFNIAVX2-NEXT:    vpsllvd %ymm10, %ymm7, %ymm7
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm7, %ymm7
+; GFNIAVX2-NEXT:    vpackusdw %ymm8, %ymm7, %ymm7
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm7, %ymm7
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15]
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15]
+; GFNIAVX2-NEXT:    vpsllvd %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11]
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; GFNIAVX2-NEXT:    vpsllvd %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm7, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; GFNIAVX2-NEXT:    vpand %ymm5, %ymm9, %ymm5
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm8 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15]
+; GFNIAVX2-NEXT:    vpsllvd %ymm8, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
+; GFNIAVX2-NEXT:    vpsllvd %ymm7, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpackusdw %ymm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[12],ymm1[12],ymm6[13],ymm1[13],ymm6[14],ymm1[14],ymm6[15],ymm1[15]
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
+; GFNIAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15]
+; GFNIAVX2-NEXT:    vpsllvd %ymm5, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11]
+; GFNIAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; GFNIAVX2-NEXT:    vpsllvd %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpackusdw %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: var_fshl_v64i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm5
+; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm5, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm7
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT:    vpandq %zmm8, %zmm2, %zmm2
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpxor %ymm3, %ymm8, %ymm9
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm9, %ymm9
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm5, %ymm7
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX512VL-NEXT:    vpand %ymm7, %ymm10, %ymm7
+; GFNIAVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm5, %ymm7
+; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm9, %ymm7, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm7
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm7, %ymm6
+; GFNIAVX512VL-NEXT:    vpxor %ymm2, %ymm8, %ymm7
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm10, %ymm6
+; GFNIAVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm7
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm7, %ymm6, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm6, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm7, %ymm7, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX512VL-NEXT:    vpand %ymm7, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpand %ymm7, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: var_fshl_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
+; GFNIAVX512BW-NEXT:    vpsllvw %zmm5, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
+; GFNIAVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @var_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
+; GFNISSE-LABEL: var_fshr_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm10
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm1
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm12
+; GFNISSE-NEXT:    psrlw $4, %xmm12
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm11, %xmm0
+; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm12, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm13
+; GFNISSE-NEXT:    psrlw $2, %xmm13
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm13
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm13, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm14
+; GFNISSE-NEXT:    psrlw $1, %xmm14
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm14
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm14, %xmm5
+; GFNISSE-NEXT:    paddb %xmm1, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm15
+; GFNISSE-NEXT:    psllw $4, %xmm15
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
+; GFNISSE-NEXT:    movdqa %xmm11, %xmm12
+; GFNISSE-NEXT:    pandn %xmm11, %xmm9
+; GFNISSE-NEXT:    psllw $5, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm15, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
+; GFNISSE-NEXT:    psllw $2, %xmm8
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT:    paddb %xmm9, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm8
+; GFNISSE-NEXT:    paddb %xmm1, %xmm8
+; GFNISSE-NEXT:    paddb %xmm9, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm1
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
+; GFNISSE-NEXT:    psrlw $4, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm11 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNISSE-NEXT:    pand %xmm11, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm12, %xmm0
+; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
+; GFNISSE-NEXT:    psrlw $2, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm13 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNISSE-NEXT:    pand %xmm13, %xmm8
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm8
+; GFNISSE-NEXT:    psrlw $1, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm14 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNISSE-NEXT:    pand %xmm14, %xmm8
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm6
+; GFNISSE-NEXT:    paddb %xmm2, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
+; GFNISSE-NEXT:    psllw $4, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm15 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNISSE-NEXT:    pand %xmm15, %xmm8
+; GFNISSE-NEXT:    pandn %xmm12, %xmm9
+; GFNISSE-NEXT:    psllw $5, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
+; GFNISSE-NEXT:    psllw $2, %xmm8
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm0 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNISSE-NEXT:    pand %xmm0, %xmm8
+; GFNISSE-NEXT:    paddb %xmm9, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm8
+; GFNISSE-NEXT:    paddb %xmm2, %xmm8
+; GFNISSE-NEXT:    paddb %xmm9, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm2
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm8
+; GFNISSE-NEXT:    psrlw $4, %xmm8
+; GFNISSE-NEXT:    pand %xmm11, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm12, %xmm0
+; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm8
+; GFNISSE-NEXT:    psrlw $2, %xmm8
+; GFNISSE-NEXT:    pand %xmm13, %xmm8
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm8
+; GFNISSE-NEXT:    psrlw $1, %xmm8
+; GFNISSE-NEXT:    pand %xmm14, %xmm8
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm7
+; GFNISSE-NEXT:    paddb %xmm3, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
+; GFNISSE-NEXT:    psllw $4, %xmm8
+; GFNISSE-NEXT:    pand %xmm15, %xmm8
+; GFNISSE-NEXT:    pandn %xmm12, %xmm9
+; GFNISSE-NEXT:    psllw $5, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
+; GFNISSE-NEXT:    psllw $2, %xmm8
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT:    paddb %xmm9, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm3
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
+; GFNISSE-NEXT:    paddb %xmm3, %xmm8
+; GFNISSE-NEXT:    paddb %xmm9, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm3
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    movdqa %xmm10, %xmm8
+; GFNISSE-NEXT:    psrlw $4, %xmm8
+; GFNISSE-NEXT:    pand %xmm11, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm12, %xmm0
+; GFNISSE-NEXT:    psllw $5, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm10
+; GFNISSE-NEXT:    movdqa %xmm10, %xmm8
+; GFNISSE-NEXT:    psrlw $2, %xmm8
+; GFNISSE-NEXT:    pand %xmm13, %xmm8
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm10
+; GFNISSE-NEXT:    movdqa %xmm10, %xmm8
+; GFNISSE-NEXT:    psrlw $1, %xmm8
+; GFNISSE-NEXT:    pand %xmm14, %xmm8
+; GFNISSE-NEXT:    paddb %xmm0, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm10
+; GFNISSE-NEXT:    paddb %xmm4, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
+; GFNISSE-NEXT:    psllw $4, %xmm8
+; GFNISSE-NEXT:    pand %xmm15, %xmm8
+; GFNISSE-NEXT:    pandn %xmm12, %xmm9
+; GFNISSE-NEXT:    psllw $5, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
+; GFNISSE-NEXT:    psllw $2, %xmm8
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT:    paddb %xmm9, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm8
+; GFNISSE-NEXT:    paddb %xmm4, %xmm8
+; GFNISSE-NEXT:    paddb %xmm9, %xmm9
+; GFNISSE-NEXT:    movdqa %xmm9, %xmm0
+; GFNISSE-NEXT:    pblendvb %xmm0, %xmm8, %xmm4
+; GFNISSE-NEXT:    por %xmm5, %xmm1
+; GFNISSE-NEXT:    por %xmm6, %xmm2
+; GFNISSE-NEXT:    por %xmm7, %xmm3
+; GFNISSE-NEXT:    por %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: var_fshr_v64i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
+; GFNIAVX1-NEXT:    vpsrlw $4, %xmm8, %xmm6
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX1-NEXT:    vpand %xmm7, %xmm6, %xmm9
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX1-NEXT:    vandps %ymm6, %ymm4, %ymm11
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm11, %xmm10
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm10, %xmm12
+; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm9, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpsrlw $2, %xmm8, %xmm9
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm9, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm9, %xmm8, %xmm9
+; GFNIAVX1-NEXT:    vpsrlw $1, %xmm9, %xmm13
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX1-NEXT:    vpand %xmm8, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vpblendvb %xmm12, %xmm13, %xmm9, %xmm12
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
+; GFNIAVX1-NEXT:    vpaddb %xmm9, %xmm9, %xmm13
+; GFNIAVX1-NEXT:    vpsllw $4, %xmm13, %xmm14
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX1-NEXT:    vpand %xmm9, %xmm14, %xmm14
+; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm10, %xmm10
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm10, %xmm15
+; GFNIAVX1-NEXT:    vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpsllw $2, %xmm13, %xmm14
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm10 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX1-NEXT:    vpand %xmm10, %xmm14, %xmm14
+; GFNIAVX1-NEXT:    vpaddb %xmm15, %xmm15, %xmm15
+; GFNIAVX1-NEXT:    vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpaddb %xmm13, %xmm13, %xmm14
+; GFNIAVX1-NEXT:    vpaddb %xmm15, %xmm15, %xmm15
+; GFNIAVX1-NEXT:    vpblendvb %xmm15, %xmm14, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpor %xmm12, %xmm13, %xmm12
+; GFNIAVX1-NEXT:    vpsrlw $4, %xmm2, %xmm13
+; GFNIAVX1-NEXT:    vpand %xmm7, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm11, %xmm14
+; GFNIAVX1-NEXT:    vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw $2, %xmm2, %xmm13
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpaddb %xmm14, %xmm14, %xmm14
+; GFNIAVX1-NEXT:    vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm13
+; GFNIAVX1-NEXT:    vpand %xmm8, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpaddb %xmm14, %xmm14, %xmm14
+; GFNIAVX1-NEXT:    vpblendvb %xmm14, %xmm13, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsllw $4, %xmm0, %xmm13
+; GFNIAVX1-NEXT:    vpand %xmm9, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsllw $2, %xmm0, %xmm13
+; GFNIAVX1-NEXT:    vpand %xmm10, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm13
+; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm13, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm11
+; GFNIAVX1-NEXT:    vpsrlw $4, %xmm11, %xmm2
+; GFNIAVX1-NEXT:    vpand %xmm7, %xmm2, %xmm12
+; GFNIAVX1-NEXT:    vandps %ymm6, %ymm5, %ymm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm13
+; GFNIAVX1-NEXT:    vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpsrlw $2, %xmm11, %xmm12
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vpaddb %xmm13, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vpsrlw $1, %xmm11, %xmm12
+; GFNIAVX1-NEXT:    vpand %xmm8, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vpaddb %xmm13, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpblendvb %xmm13, %xmm12, %xmm11, %xmm11
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm12
+; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vpsllw $4, %xmm12, %xmm13
+; GFNIAVX1-NEXT:    vpand %xmm9, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm13, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vpsllw $2, %xmm12, %xmm13
+; GFNIAVX1-NEXT:    vpand %xmm10, %xmm13, %xmm13
+; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm13, %xmm12, %xmm12
+; GFNIAVX1-NEXT:    vpaddb %xmm12, %xmm12, %xmm13
+; GFNIAVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpblendvb %xmm5, %xmm13, %xmm12, %xmm5
+; GFNIAVX1-NEXT:    vpor %xmm5, %xmm11, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $4, %xmm3, %xmm11
+; GFNIAVX1-NEXT:    vpand %xmm7, %xmm11, %xmm7
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm11
+; GFNIAVX1-NEXT:    vpblendvb %xmm11, %xmm7, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpsrlw $2, %xmm3, %xmm7
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm7, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm11, %xmm11, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpsrlw $1, %xmm3, %xmm4
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm8, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm7, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpblendvb %xmm7, %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsllw $4, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm9, %xmm4
+; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsllw $2, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vpand %xmm4, %xmm10, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: var_fshr_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpsrlw $4, %ymm2, %ymm6
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX2-NEXT:    vpand %ymm7, %ymm6, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm4, %ymm9
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm9, %ymm9
+; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $2, %ymm2, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX2-NEXT:    vpand %ymm10, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
+; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $1, %ymm2, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm11 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX2-NEXT:    vpand %ymm11, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpaddb %ymm9, %ymm9, %ymm9
+; GFNIAVX2-NEXT:    vpblendvb %ymm9, %ymm8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsllw $4, %ymm0, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX2-NEXT:    vpand %ymm9, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpandn %ymm6, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsllw $2, %ymm0, %ymm8
+; GFNIAVX2-NEXT:    vpbroadcastb {{.*#+}} ymm12 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX2-NEXT:    vpand %ymm12, %ymm8, %ymm8
+; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm8
+; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $4, %ymm3, %ymm2
+; GFNIAVX2-NEXT:    vpand %ymm7, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm5, %ymm4
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $2, %ymm2, %ymm3
+; GFNIAVX2-NEXT:    vpand %ymm3, %ymm10, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $1, %ymm2, %ymm3
+; GFNIAVX2-NEXT:    vpand %ymm3, %ymm11, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsllw $4, %ymm1, %ymm3
+; GFNIAVX2-NEXT:    vpand %ymm3, %ymm9, %ymm3
+; GFNIAVX2-NEXT:    vpandn %ymm6, %ymm5, %ymm4
+; GFNIAVX2-NEXT:    vpsllw $5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsllw $2, %ymm1, %ymm3
+; GFNIAVX2-NEXT:    vpand %ymm3, %ymm12, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: var_fshr_v64i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm3, %ymm6
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; GFNIAVX512VL-NEXT:    vpandq %zmm7, %zmm2, %zmm2
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm8
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm4, %ymm6
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm9 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm9, %ymm6
+; GFNIAVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm4, %ymm6
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm10 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm10, %ymm6
+; GFNIAVX512VL-NEXT:    vpaddb %ymm8, %ymm8, %ymm8
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm6
+; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm6, %ymm5
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $2, %ymm1, %ymm5
+; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm9, %ymm5
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vpsrlw $1, %ymm1, %ymm5
+; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm10, %ymm5
+; GFNIAVX512VL-NEXT:    vpaddb %ymm6, %ymm6, %ymm6
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpxor %ymm7, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; GFNIAVX512VL-NEXT:    vpand %ymm5, %ymm8, %ymm5
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm4, %ymm4, %ymm5
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
+; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpxor %ymm7, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm8, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
+; GFNIAVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: var_fshr_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm5 = zmm2[8],zmm4[8],zmm2[9],zmm4[9],zmm2[10],zmm4[10],zmm2[11],zmm4[11],zmm2[12],zmm4[12],zmm2[13],zmm4[13],zmm2[14],zmm4[14],zmm2[15],zmm4[15],zmm2[24],zmm4[24],zmm2[25],zmm4[25],zmm2[26],zmm4[26],zmm2[27],zmm4[27],zmm2[28],zmm4[28],zmm2[29],zmm4[29],zmm2[30],zmm4[30],zmm2[31],zmm4[31],zmm2[40],zmm4[40],zmm2[41],zmm4[41],zmm2[42],zmm4[42],zmm2[43],zmm4[43],zmm2[44],zmm4[44],zmm2[45],zmm4[45],zmm2[46],zmm4[46],zmm2[47],zmm4[47],zmm2[56],zmm4[56],zmm2[57],zmm4[57],zmm2[58],zmm4[58],zmm2[59],zmm4[59],zmm2[60],zmm4[60],zmm2[61],zmm4[61],zmm2[62],zmm4[62],zmm2[63],zmm4[63]
+; GFNIAVX512BW-NEXT:    vpsrlvw %zmm5, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT:    vpandq %zmm5, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm4[0],zmm2[1],zmm4[1],zmm2[2],zmm4[2],zmm2[3],zmm4[3],zmm2[4],zmm4[4],zmm2[5],zmm4[5],zmm2[6],zmm4[6],zmm2[7],zmm4[7],zmm2[16],zmm4[16],zmm2[17],zmm4[17],zmm2[18],zmm4[18],zmm2[19],zmm4[19],zmm2[20],zmm4[20],zmm2[21],zmm4[21],zmm2[22],zmm4[22],zmm2[23],zmm4[23],zmm2[32],zmm4[32],zmm2[33],zmm4[33],zmm2[34],zmm4[34],zmm2[35],zmm4[35],zmm2[36],zmm4[36],zmm2[37],zmm4[37],zmm2[38],zmm4[38],zmm2[39],zmm4[39],zmm2[48],zmm4[48],zmm2[49],zmm4[49],zmm2[50],zmm4[50],zmm2[51],zmm4[51],zmm2[52],zmm4[52],zmm2[53],zmm4[53],zmm2[54],zmm4[54],zmm2[55],zmm4[55]
+; GFNIAVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpandq %zmm5, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @splatvar_fshl_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshl_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm9
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
+; GFNISSE-NEXT:    psllw %xmm8, %xmm9
+; GFNISSE-NEXT:    psrlw $8, %xmm9
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm4
+; GFNISSE-NEXT:    psrlw $8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm9, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm6
+; GFNISSE-NEXT:    psrlw $8, %xmm6
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT:    psllw %xmm8, %xmm7
+; GFNISSE-NEXT:    psrlw $8, %xmm7
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm7
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: splatvar_fshl_v64i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm5, %xmm5
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT:    vpsllw %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: splatvar_fshl_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpsllw %xmm4, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpsllw %xmm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT:    vpsllw %xmm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT:    vpsllw %xmm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: splatvar_fshl_v64i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: splatvar_fshl_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512BW-NEXT:    vpsllw %xmm2, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT:    vpsllw %xmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
+  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %splat)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @splatvar_fshr_v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %amt) nounwind {
+; GFNISSE-LABEL: splatvar_fshr_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm10
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; GFNISSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm10
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; GFNISSE-NEXT:    pand %xmm8, %xmm10
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm4
+; GFNISSE-NEXT:    pand %xmm8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm5
+; GFNISSE-NEXT:    pand %xmm8, %xmm5
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm6
+; GFNISSE-NEXT:    pand %xmm8, %xmm6
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm0
+; GFNISSE-NEXT:    pand %xmm8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT:    psrlw %xmm9, %xmm7
+; GFNISSE-NEXT:    pand %xmm7, %xmm8
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm8, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: splatvar_fshr_v64i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; GFNIAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm5, %xmm8
+; GFNIAVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm8, %xmm8
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpackuswb %xmm8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm7, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackuswb %xmm7, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: splatvar_fshr_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; GFNIAVX2-NEXT:    vpsrlw %xmm4, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT:    vpsrlw %xmm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm5, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT:    vpsrlw %xmm4, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT:    vpsrlw %xmm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpand %ymm6, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: splatvar_fshr_v64i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
+; GFNIAVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm5, %ymm5
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm5, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: splatvar_fshr_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; GFNIAVX512BW-NEXT:    vpsrlw %xmm2, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm3
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
+  %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
+  %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %splat)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @constant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshl_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm10
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT:    pmullw %xmm9, %xmm10
+; GFNISSE-NEXT:    psrlw $8, %xmm10
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT:    pmullw %xmm8, %xmm4
+; GFNISSE-NEXT:    psrlw $8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    pmullw %xmm9, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; GFNISSE-NEXT:    pmullw %xmm8, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; GFNISSE-NEXT:    pmullw %xmm9, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT:    pmullw %xmm8, %xmm6
+; GFNISSE-NEXT:    psrlw $8, %xmm6
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; GFNISSE-NEXT:    pmullw %xmm9, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT:    pmullw %xmm7, %xmm8
+; GFNISSE-NEXT:    psrlw $8, %xmm8
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm8, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: constant_fshl_v64i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpackuswb %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: constant_fshl_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX2-NEXT:    # ymm5 = mem[0,1,0,1]
+; GFNIAVX2-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; GFNIAVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: constant_fshl_v64i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31]
+; GFNIAVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX512VL-NEXT:    # ymm5 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
+; GFNIAVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: constant_fshl_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @constant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
+; GFNISSE-LABEL: constant_fshr_v64i8:
+; GFNISSE:       # %bb.0:
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm10
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm9 = [1,128,64,32,16,8,4,2]
+; GFNISSE-NEXT:    pmullw %xmm9, %xmm10
+; GFNISSE-NEXT:    psrlw $8, %xmm10
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; GFNISSE-NEXT:    pmovzxbw {{.*#+}} xmm8 = [1,2,4,8,16,32,64,128]
+; GFNISSE-NEXT:    pmullw %xmm8, %xmm4
+; GFNISSE-NEXT:    psrlw $8, %xmm4
+; GFNISSE-NEXT:    packuswb %xmm10, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; GFNISSE-NEXT:    pmullw %xmm9, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; GFNISSE-NEXT:    pmullw %xmm8, %xmm5
+; GFNISSE-NEXT:    psrlw $8, %xmm5
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm5
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; GFNISSE-NEXT:    pmullw %xmm9, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; GFNISSE-NEXT:    pmullw %xmm8, %xmm6
+; GFNISSE-NEXT:    psrlw $8, %xmm6
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm6
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm0
+; GFNISSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; GFNISSE-NEXT:    pmullw %xmm9, %xmm0
+; GFNISSE-NEXT:    psrlw $8, %xmm0
+; GFNISSE-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; GFNISSE-NEXT:    pmullw %xmm7, %xmm8
+; GFNISSE-NEXT:    psrlw $8, %xmm8
+; GFNISSE-NEXT:    packuswb %xmm0, %xmm8
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm8, %xmm3
+; GFNISSE-NEXT:    retq
+;
+; GFNIAVX1-LABEL: constant_fshr_v64i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; GFNIAVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpackuswb %xmm6, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; GFNIAVX1-NEXT:    vpmullw %xmm7, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; GFNIAVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vpackuswb %xmm4, %xmm1, %xmm1
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: constant_fshr_v64i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX2-NEXT:    # ymm5 = mem[0,1,0,1]
+; GFNIAVX2-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
+; GFNIAVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; GFNIAVX2-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
+; GFNIAVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpackuswb %ymm4, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    retq
+;
+; GFNIAVX512VL-LABEL: constant_fshr_v64i8:
+; GFNIAVX512VL:       # %bb.0:
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31]
+; GFNIAVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
+; GFNIAVX512VL-NEXT:    # ymm5 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23]
+; GFNIAVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; GFNIAVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
+; GFNIAVX512VL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; GFNIAVX512VL-NEXT:    vpmullw %ymm5, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm4, %ymm4
+; GFNIAVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; GFNIAVX512VL-NEXT:    vpmullw %ymm3, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vpackuswb %ymm4, %ymm0, %ymm0
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    retq
+;
+; GFNIAVX512BW-LABEL: constant_fshr_v64i8:
+; GFNIAVX512BW:       # %bb.0:
+; GFNIAVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
+; GFNIAVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
+; GFNIAVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    retq
+  %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
+  ret <64 x i8> %res
+}
+
 define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; GFNISSE-LABEL: splatconstant_fshl_v64i8:
 ; GFNISSE:       # %bb.0:
@@ -372,5 +3056,3 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
   ret <64 x i8> %res
 }
 declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFNIAVX: {{.*}}

>From 8b2ba6a144e728ee4116e2804e9b5aed8824e726 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Mon, 22 Apr 2024 09:35:49 -0700
Subject: [PATCH 4/7] Revert "[compiler-rt][ctx_instr] Add `ctx_profile`
 component" (#89625)

Reverts llvm/llvm-project#89304

Some build bot failures - will fix and reland.

Example: https://lab.llvm.org/buildbot/#/builders/165/builds/52789
---
 compiler-rt/CMakeLists.txt                    |  2 -
 .../cmake/Modules/AllSupportedArchDefs.cmake  |  1 -
 compiler-rt/cmake/config-ix.cmake             | 11 ---
 compiler-rt/lib/CMakeLists.txt                |  4 --
 compiler-rt/lib/ctx_profile/CMakeLists.txt    | 28 --------
 .../lib/ctx_profile/CtxInstrProfiling.cpp     | 40 -----------
 .../lib/ctx_profile/CtxInstrProfiling.h       | 55 ---------------
 .../lib/ctx_profile/tests/CMakeLists.txt      | 70 -------------------
 .../tests/CtxInstrProfilingTest.cpp           | 22 ------
 compiler-rt/lib/ctx_profile/tests/driver.cpp  | 14 ----
 10 files changed, 247 deletions(-)
 delete mode 100644 compiler-rt/lib/ctx_profile/CMakeLists.txt
 delete mode 100644 compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
 delete mode 100644 compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
 delete mode 100644 compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
 delete mode 100644 compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
 delete mode 100644 compiler-rt/lib/ctx_profile/tests/driver.cpp

diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index 6ce451e3cac2e3..8649507ce1c79b 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -50,8 +50,6 @@ option(COMPILER_RT_BUILD_LIBFUZZER "Build libFuzzer" ON)
 mark_as_advanced(COMPILER_RT_BUILD_LIBFUZZER)
 option(COMPILER_RT_BUILD_PROFILE "Build profile runtime" ON)
 mark_as_advanced(COMPILER_RT_BUILD_PROFILE)
-option(COMPILER_RT_BUILD_CTX_PROFILE "Build ctx profile runtime" ON)
-mark_as_advanced(COMPILER_RT_BUILD_CTX_PROFILE)
 option(COMPILER_RT_BUILD_MEMPROF "Build memory profiling runtime" ON)
 mark_as_advanced(COMPILER_RT_BUILD_MEMPROF)
 option(COMPILER_RT_BUILD_XRAY_NO_PREINIT "Build xray with no preinit patching" OFF)
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index 2fe06273a814c7..423171532c2028 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -66,7 +66,6 @@ set(ALL_MEMPROF_SUPPORTED_ARCH ${X86_64})
 set(ALL_PROFILE_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${PPC32} ${PPC64}
     ${MIPS32} ${MIPS64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
     ${RISCV32} ${RISCV64} ${LOONGARCH64})
-set(ALL_CTX_PROFILE_SUPPORTED_ARCH ${X86_64})
 set(ALL_TSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${PPC64} ${S390X}
     ${LOONGARCH64} ${RISCV64})
 set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index ba740af9e1d60f..b281ac64f5d5c7 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -632,9 +632,6 @@ if(APPLE)
   list_intersect(PROFILE_SUPPORTED_ARCH
     ALL_PROFILE_SUPPORTED_ARCH
     SANITIZER_COMMON_SUPPORTED_ARCH)
-  list_intersect(CTX_PROFILE_SUPPORTED_ARCH
-    ALL_CTX_PROFILE_SUPPORTED_ARCH
-    SANITIZER_COMMON_SUPPORTED_ARCH)
   list_intersect(TSAN_SUPPORTED_ARCH
     ALL_TSAN_SUPPORTED_ARCH
     SANITIZER_COMMON_SUPPORTED_ARCH)
@@ -681,7 +678,6 @@ else()
   filter_available_targets(HWASAN_SUPPORTED_ARCH ${ALL_HWASAN_SUPPORTED_ARCH})
   filter_available_targets(MEMPROF_SUPPORTED_ARCH ${ALL_MEMPROF_SUPPORTED_ARCH})
   filter_available_targets(PROFILE_SUPPORTED_ARCH ${ALL_PROFILE_SUPPORTED_ARCH})
-  filter_available_targets(CTX_PROFILE_SUPPORTED_ARCH ${ALL_CTX_PROFILE_SUPPORTED_ARCH})
   filter_available_targets(TSAN_SUPPORTED_ARCH ${ALL_TSAN_SUPPORTED_ARCH})
   filter_available_targets(UBSAN_SUPPORTED_ARCH ${ALL_UBSAN_SUPPORTED_ARCH})
   filter_available_targets(SAFESTACK_SUPPORTED_ARCH
@@ -807,13 +803,6 @@ else()
   set(COMPILER_RT_HAS_PROFILE FALSE)
 endif()
 
-if (COMPILER_RT_HAS_SANITIZER_COMMON AND CTX_PROFILE_SUPPORTED_ARCH AND
-    OS_NAME MATCHES "Linux")
-  set(COMPILER_RT_HAS_CTX_PROFILE TRUE)
-else()
-  set(COMPILER_RT_HAS_CTX_PROFILE FALSE)
-endif()
-
 if (COMPILER_RT_HAS_SANITIZER_COMMON AND TSAN_SUPPORTED_ARCH)
   if (OS_NAME MATCHES "Linux|Darwin|FreeBSD|NetBSD")
     set(COMPILER_RT_HAS_TSAN TRUE)
diff --git a/compiler-rt/lib/CMakeLists.txt b/compiler-rt/lib/CMakeLists.txt
index f9e96563b88090..43ba9a102c8487 100644
--- a/compiler-rt/lib/CMakeLists.txt
+++ b/compiler-rt/lib/CMakeLists.txt
@@ -51,10 +51,6 @@ if(COMPILER_RT_BUILD_PROFILE AND COMPILER_RT_HAS_PROFILE)
   compiler_rt_build_runtime(profile)
 endif()
 
-if(COMPILER_RT_BUILD_CTX_PROFILE AND COMPILER_RT_HAS_CTX_PROFILE)
-  compiler_rt_build_runtime(ctx_profile)
-endif()
-
 if(COMPILER_RT_BUILD_XRAY)
   compiler_rt_build_runtime(xray)
 endif()
diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
deleted file mode 100644
index 621b7d30b76d41..00000000000000
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-add_compiler_rt_component(ctx_profile)
-
-set(CTX_PROFILE_SOURCES
-  CtxInstrProfiling.cpp
-  )
-
-set(CTX_PROFILE_HEADERS
-  CtxInstrProfiling.h
-  )
-
-include_directories(..)
-include_directories(../../include)
-
-# We don't use the C++ Standard Library here, so avoid including it by mistake.
-append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
-
-add_compiler_rt_runtime(clang_rt.ctx_profile
-  STATIC
-  ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
-  OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
-  CFLAGS ${EXTRA_FLAGS}
-  SOURCES ${CTX_PROFILE_SOURCES}
-  ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}
-  PARENT_TARGET ctx_profile)
-
-if(COMPILER_RT_INCLUDE_TESTS)
-  add_subdirectory(tests)
-endif()
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
deleted file mode 100644
index 7620ce92f7ebde..00000000000000
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "CtxInstrProfiling.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_dense_map.h"
-#include "sanitizer_common/sanitizer_mutex.h"
-#include "sanitizer_common/sanitizer_placement_new.h"
-#include "sanitizer_common/sanitizer_thread_safety.h"
-
-#include <assert.h>
-
-using namespace __ctx_profile;
-
-// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
-// the dependency on the latter.
-Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
-  assert(!Prev || Prev->Next == nullptr);
-  Arena *NewArena =
-      new (__sanitizer::InternalAlloc(Size + sizeof(Arena))) Arena(Size);
-  if (Prev)
-    Prev->Next = NewArena;
-  return NewArena;
-}
-
-void Arena::freeArenaList(Arena *&A) {
-  assert(A);
-  for (auto *I = A; I != nullptr;) {
-    auto *Current = I;
-    I = I->Next;
-    __sanitizer::InternalFree(Current);
-  }
-  A = nullptr;
-}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
deleted file mode 100644
index c1789c32a64c25..00000000000000
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO  ---------===*\
-|*
-|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-|* See https://llvm.org/LICENSE.txt for license information.
-|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-|*
-\*===----------------------------------------------------------------------===*/
-
-#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
-#define CTX_PROFILE_CTXINSTRPROFILING_H_
-
-#include <sanitizer/common_interface_defs.h>
-
-namespace __ctx_profile {
-
-/// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
-/// Allocation and de-allocation happen using sanitizer APIs. We make that
-/// explicit.
-class Arena final {
-public:
-  // When allocating a new Arena, optionally specify an existing one to append
-  // to, assumed to be the last in the Arena list. We only need to support
-  // appending to the arena list.
-  static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
-  static void freeArenaList(Arena *&A);
-
-  uint64_t size() const { return Size; }
-
-  // Allocate S bytes or return nullptr if we don't have that many available.
-  char *tryBumpAllocate(size_t S) {
-    if (Pos + S > Size)
-      return nullptr;
-    Pos += S;
-    return start() + (Pos - S);
-  }
-
-  Arena *next() const { return Next; }
-
-  // the beginning of allocatable memory.
-  const char *start() const { return const_cast<Arena *>(this)->start(); }
-  const char *pos() const { return start() + Pos; }
-
-private:
-  explicit Arena(uint32_t Size) : Size(Size) {}
-  ~Arena() = delete;
-
-  char *start() { return reinterpret_cast<char *>(&this[1]); }
-
-  Arena *Next = nullptr;
-  uint64_t Pos = 0;
-  const uint64_t Size;
-};
-
-} // namespace __ctx_profile
-#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
diff --git a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt b/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
deleted file mode 100644
index 93b41b838445d1..00000000000000
--- a/compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-include(CheckCXXCompilerFlag)
-include(CompilerRTCompile)
-include(CompilerRTLink)
-
-set(CTX_PROFILE_UNITTEST_CFLAGS
-  ${COMPILER_RT_UNITTEST_CFLAGS}
-  ${COMPILER_RT_GTEST_CFLAGS}
-  ${COMPILER_RT_GMOCK_CFLAGS}
-  ${SANITIZER_TEST_CXX_CFLAGS}
-  -I${COMPILER_RT_SOURCE_DIR}/lib/
-  -DSANITIZER_COMMON_NO_REDEFINE_BUILTINS
-  -O2
-  -g
-  -fno-rtti
-  -Wno-pedantic
-  -fno-omit-frame-pointer)
-
-# Suppress warnings for gmock variadic macros for clang and gcc respectively.
-append_list_if(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG -Wno-gnu-zero-variadic-macro-arguments CTX_PROFILE_UNITTEST_CFLAGS)
-append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros CTX_PROFILE_UNITTEST_CFLAGS)
-
-file(GLOB PROFILE_HEADERS ../*.h)
-
-set(CTX_PROFILE_SOURCES
-  ../CtxInstrProfiling.cpp)
-
-set(CTX_PROFILE_UNITTESTS
-  CtxInstrProfilingTest.cpp
-  driver.cpp)
-
-include_directories(../../../include)
-
-set(CTX_PROFILE_UNIT_TEST_HEADERS
-  ${CTX_PROFILE_HEADERS})
-
-set(CTX_PROFILE_UNITTEST_LINK_FLAGS
-  ${COMPILER_RT_UNITTEST_LINK_FLAGS})
-
-list(APPEND CTX_PROFILE_UNITTEST_LINK_FLAGS -pthread)
-
-set(CTX_PROFILE_UNITTEST_LINK_LIBRARIES
-  ${COMPILER_RT_UNWINDER_LINK_LIBS}
-  ${SANITIZER_TEST_CXX_LIBRARIES})
-list(APPEND CTX_PROFILE_UNITTEST_LINK_LIBRARIES "dl")
-
-if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST CTX_PROFILE_SUPPORTED_ARCH)
-  # Profile unit tests are only run on the host machine.
-  set(arch ${COMPILER_RT_DEFAULT_TARGET_ARCH})
-
-  add_executable(CtxProfileUnitTests 
-    ${CTX_PROFILE_UNITTESTS}
-    ${COMPILER_RT_GTEST_SOURCE}
-    ${COMPILER_RT_GMOCK_SOURCE}
-    ${CTX_PROFILE_SOURCES}
-    $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
-    $<TARGET_OBJECTS:RTSanitizerCommonCoverage.${arch}>
-    $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
-    $<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
-    $<TARGET_OBJECTS:RTSanitizerCommonSymbolizerInternal.${arch}>)
-  set_target_compile_flags(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_CFLAGS})
-  set_target_link_flags(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_LINK_FLAGS})
-  target_link_libraries(CtxProfileUnitTests ${CTX_PROFILE_UNITTEST_LINK_LIBRARIES})
-
-  if (TARGET cxx-headers OR HAVE_LIBCXX)
-    add_dependencies(CtxProfileUnitTests cxx-headers)
-  endif()
-
-  set_target_properties(CtxProfileUnitTests PROPERTIES
-    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif()
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
deleted file mode 100644
index 44f37d25763206..00000000000000
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "../CtxInstrProfiling.h"
-#include "gtest/gtest.h"
-
-using namespace __ctx_profile;
-
-TEST(ArenaTest, Basic) {
-  Arena *A = Arena::allocateNewArena(1024);
-  EXPECT_EQ(A->size(), 1024U);
-  EXPECT_EQ(A->next(), nullptr);
-
-  auto *M1 = A->tryBumpAllocate(1020);
-  EXPECT_NE(M1, nullptr);
-  auto *M2 = A->tryBumpAllocate(4);
-  EXPECT_NE(M2, nullptr);
-  EXPECT_EQ(M1 + 1020, M2);
-  EXPECT_EQ(A->tryBumpAllocate(1), nullptr);
-  Arena *A2 = Arena::allocateNewArena(2024, A);
-  EXPECT_EQ(A->next(), A2);
-  EXPECT_EQ(A2->next(), nullptr);
-  Arena::freeArenaList(A);
-  EXPECT_EQ(A, nullptr);
-}
diff --git a/compiler-rt/lib/ctx_profile/tests/driver.cpp b/compiler-rt/lib/ctx_profile/tests/driver.cpp
deleted file mode 100644
index b402cec1126b33..00000000000000
--- a/compiler-rt/lib/ctx_profile/tests/driver.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//===-- driver.cpp ----------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "gtest/gtest.h"
-
-int main(int argc, char **argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}

>From c4c54af569f7c17bc89ae73c3e5c5c4be0a586b9 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <1802579+farzonl at users.noreply.github.com>
Date: Mon, 22 Apr 2024 12:40:21 -0400
Subject: [PATCH 5/7] [SPIRV][HLSL] map lerp to Fmix (#88976)

- `clang/lib/CodeGen/CGBuiltin.cpp` - switch to using
`getLerpIntrinsic()` to abstract the backend intrinsic
- `clang/lib/CodeGen/CGHLSLRuntime.h` - add `getLerpIntrinsic()`
- `llvm/include/llvm/IR/IntrinsicsSPIRV.td` - add a SPIR-V intrinsic for
lerp
- `llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp` - add a mapping of
HLSL's lerp to GLSL's FMix.

resolves #88940
---
 clang/lib/CodeGen/CGBuiltin.cpp               |  4 +-
 clang/lib/CodeGen/CGHLSLRuntime.h             |  1 +
 .../CodeGenHLSL/builtins/lerp-builtin.hlsl    |  8 +-
 clang/test/CodeGenHLSL/builtins/lerp.hlsl     | 96 ++++++++++++-------
 llvm/include/llvm/IR/IntrinsicsSPIRV.td       |  2 +
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 26 +++++
 .../test/CodeGen/SPIRV/hlsl-intrinsics/all.ll | 76 +++++++--------
 .../test/CodeGen/SPIRV/hlsl-intrinsics/any.ll | 76 +++++++--------
 .../CodeGen/SPIRV/hlsl-intrinsics/lerp.ll     | 56 +++++++++++
 .../test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll | 66 ++++++-------
 10 files changed, 260 insertions(+), 151 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index afe2de5d00ac5d..7e5f2edfc732cc 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18267,8 +18267,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     if (!E->getArg(0)->getType()->hasFloatingRepresentation())
       llvm_unreachable("lerp operand must have a float representation");
     return Builder.CreateIntrinsic(
-        /*ReturnType=*/X->getType(), Intrinsic::dx_lerp,
-        ArrayRef<Value *>{X, Y, S}, nullptr, "dx.lerp");
+        /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getLerpIntrinsic(),
+        ArrayRef<Value *>{X, Y, S}, nullptr, "hlsl.lerp");
   }
   case Builtin::BI__builtin_hlsl_elementwise_frac: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 506b364f5b2ec7..0abe39dedcb96f 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -74,6 +74,7 @@ class CGHLSLRuntime {
 
   GENERATE_HLSL_INTRINSIC_FUNCTION(All, all)
   GENERATE_HLSL_INTRINSIC_FUNCTION(Any, any)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(Lerp, lerp)
   GENERATE_HLSL_INTRINSIC_FUNCTION(ThreadId, thread_id)
 
   //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
index 2fd5a19fc33521..cdc9abbd70e40b 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-builtin.hlsl
@@ -1,15 +1,15 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 // CHECK-LABEL: builtin_lerp_half_vector
-// CHECK: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
-// CHECK: ret <3 x half> %dx.lerp
+// CHECK: %hlsl.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// CHECK: ret <3 x half> %hlsl.lerp
 half3 builtin_lerp_half_vector (half3 p0) {
   return __builtin_hlsl_lerp ( p0, p0, p0 );
 }
 
 // CHECK-LABEL: builtin_lerp_floar_vector
-// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %dx.lerp
+// CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
 float2 builtin_lerp_floar_vector ( float2 p0) {
   return __builtin_hlsl_lerp ( p0, p0, p0 );
 }
diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
index 49cd04a10115ae..634b20be3a28d6 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl
@@ -1,69 +1,92 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
-// RUN:   --check-prefixes=CHECK,NATIVE_HALF
+// RUN:   --check-prefixes=CHECK,DXIL_CHECK,DXIL_NATIVE_HALF,NATIVE_HALF
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,DXIL_CHECK,NO_HALF,DXIL_NO_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
+// RUN:   --check-prefixes=CHECK,NATIVE_HALF,SPIR_NATIVE_HALF,SPIR_CHECK
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_NO_HALF,SPIR_CHECK
 
 
-// NATIVE_HALF: %dx.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
-// NATIVE_HALF: ret half %dx.lerp
-// NO_HALF: %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
-// NO_HALF: ret float %dx.lerp
+// DXIL_NATIVE_HALF: %hlsl.lerp = call half @llvm.dx.lerp.f16(half %0, half %1, half %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call half @llvm.spv.lerp.f16(half %0, half %1, half %2)
+// NATIVE_HALF: ret half %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
+// SPIR_NO_HALF: %hlsl.lerp = call float @llvm.spv.lerp.f32(float %0, float %1, float %2)
+// NO_HALF: ret float %hlsl.lerp
 half test_lerp_half(half p0) { return lerp(p0, p0, p0); }
 
-// NATIVE_HALF: %dx.lerp = call <2 x half> @llvm.dx.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
-// NATIVE_HALF: ret <2 x half> %dx.lerp
-// NO_HALF: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// NO_HALF: ret <2 x float> %dx.lerp
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <2 x half> @llvm.dx.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <2 x half> @llvm.spv.lerp.v2f16(<2 x half> %0, <2 x half> %1, <2 x half> %2)
+// NATIVE_HALF: ret <2 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// NO_HALF: ret <2 x float> %hlsl.lerp
 half2 test_lerp_half2(half2 p0) { return lerp(p0, p0, p0); }
 
-// NATIVE_HALF: %dx.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
-// NATIVE_HALF: ret <3 x half> %dx.lerp
-// NO_HALF: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// NO_HALF: ret <3 x float> %dx.lerp
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <3 x half> @llvm.dx.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <3 x half> @llvm.spv.lerp.v3f16(<3 x half> %0, <3 x half> %1, <3 x half> %2)
+// NATIVE_HALF: ret <3 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// NO_HALF: ret <3 x float> %hlsl.lerp
 half3 test_lerp_half3(half3 p0) { return lerp(p0, p0, p0); }
 
-// NATIVE_HALF: %dx.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
-// NATIVE_HALF: ret <4 x half> %dx.lerp
-// NO_HALF: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// NO_HALF: ret <4 x float> %dx.lerp
+// DXIL_NATIVE_HALF: %hlsl.lerp = call <4 x half> @llvm.dx.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
+// SPIR_NATIVE_HALF: %hlsl.lerp = call <4 x half> @llvm.spv.lerp.v4f16(<4 x half> %0, <4 x half> %1, <4 x half> %2)
+// NATIVE_HALF: ret <4 x half> %hlsl.lerp
+// DXIL_NO_HALF: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// SPIR_NO_HALF: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// NO_HALF: ret <4 x float> %hlsl.lerp
 half4 test_lerp_half4(half4 p0) { return lerp(p0, p0, p0); }
 
-// CHECK: %dx.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
-// CHECK: ret float %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call float @llvm.dx.lerp.f32(float %0, float %1, float %2)
+// SPIR_CHECK: %hlsl.lerp = call float @llvm.spv.lerp.f32(float %0, float %1, float %2)
+// CHECK: ret float %hlsl.lerp
 float test_lerp_float(float p0) { return lerp(p0, p0, p0); }
 
-// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
 float2 test_lerp_float2(float2 p0) { return lerp(p0, p0, p0); }
 
-// CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
-// CHECK: ret <3 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %hlsl.lerp
 float3 test_lerp_float3(float3 p0) { return lerp(p0, p0, p0); }
 
-// CHECK: %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
-// CHECK: ret <4 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+// CHECK: ret <4 x float> %hlsl.lerp
 float4 test_lerp_float4(float4 p0) { return lerp(p0, p0, p0); }
 
-// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
-// CHECK: ret <2 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %splat.splat, <2 x float> %1, <2 x float> %2)
+// CHECK: ret <2 x float> %hlsl.lerp
 float2 test_lerp_float2_splat(float p0, float2 p1) { return lerp(p0, p1, p1); }
 
-// CHECK: %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
-// CHECK: ret <3 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
+// SPIR_CHECK: %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %splat.splat, <3 x float> %1, <3 x float> %2)
+// CHECK: ret <3 x float> %hlsl.lerp
 float3 test_lerp_float3_splat(float p0, float3 p1) { return lerp(p0, p1, p1); }
 
-// CHECK:  %dx.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
-// CHECK:  ret <4 x float> %dx.lerp
+// DXIL_CHECK:  %hlsl.lerp = call <4 x float> @llvm.dx.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
+// SPIR_CHECK:  %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %splat.splat, <4 x float> %1, <4 x float> %2)
+// CHECK:  ret <4 x float> %hlsl.lerp
 float4 test_lerp_float4_splat(float p0, float4 p1) { return lerp(p0, p1, p1); }
 
 // CHECK: %conv = sitofp i32 %2 to float
 // CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0
 // CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
-// CHECK: %dx.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
-// CHECK: ret <2 x float> %dx.lerp
+// DXIL_CHECK: %hlsl.lerp = call <2 x float> @llvm.dx.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
+// SPIR_CHECK: %hlsl.lerp = call <2 x float> @llvm.spv.lerp.v2f32(<2 x float> %0, <2 x float> %1, <2 x float> %splat.splat)
+// CHECK: ret <2 x float> %hlsl.lerp
 float2 test_lerp_float2_int_splat(float2 p0, int p1) {
   return lerp(p0, p0, p1);
 }
@@ -71,8 +94,9 @@ float2 test_lerp_float2_int_splat(float2 p0, int p1) {
 // CHECK: %conv = sitofp i32 %2 to float
 // CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0
 // CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer
-// CHECK:  %dx.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
-// CHECK: ret <3 x float> %dx.lerp
+// DXIL_CHECK:  %hlsl.lerp = call <3 x float> @llvm.dx.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
+// SPIR_CHECK:  %hlsl.lerp = call <3 x float> @llvm.spv.lerp.v3f32(<3 x float> %0, <3 x float> %1, <3 x float> %splat.splat)
+// CHECK: ret <3 x float> %hlsl.lerp
 float3 test_lerp_float3_int_splat(float3 p0, int p1) {
   return lerp(p0, p0, p1);
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index b6618baceb5608..8660782d71d950 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -58,4 +58,6 @@ let TargetPrefix = "spv" in {
       Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>;
   def int_spv_all : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>;
   def int_spv_any : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_any_ty]>;
+  def int_spv_lerp : Intrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], 
+    [IntrNoMem, IntrWillReturn] >;
 }
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 72e5a7bcac9834..21a69fc3ad9b44 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -170,6 +170,9 @@ class SPIRVInstructionSelector : public InstructionSelector {
   bool selectFCmp(Register ResVReg, const SPIRVType *ResType,
                   MachineInstr &I) const;
 
+  bool selectFmix(Register ResVReg, const SPIRVType *ResType,
+                  MachineInstr &I) const;
+
   void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                    int OpIdx) const;
   void renderFImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
@@ -1242,6 +1245,27 @@ bool SPIRVInstructionSelector::selectAny(Register ResVReg,
   return selectAnyOrAll(ResVReg, ResType, I, SPIRV::OpAny);
 }
 
+bool SPIRVInstructionSelector::selectFmix(Register ResVReg,
+                                          const SPIRVType *ResType,
+                                          MachineInstr &I) const {
+
+  assert(I.getNumOperands() == 5);
+  assert(I.getOperand(2).isReg());
+  assert(I.getOperand(3).isReg());
+  assert(I.getOperand(4).isReg());
+  MachineBasicBlock &BB = *I.getParent();
+
+  return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
+      .addDef(ResVReg)
+      .addUse(GR.getSPIRVTypeID(ResType))
+      .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450))
+      .addImm(GL::FMix)
+      .addUse(I.getOperand(2).getReg())
+      .addUse(I.getOperand(3).getReg())
+      .addUse(I.getOperand(4).getReg())
+      .constrainAllUses(TII, TRI, RBI);
+}
+
 bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg,
                                                 const SPIRVType *ResType,
                                                 MachineInstr &I) const {
@@ -1902,6 +1926,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectAll(ResVReg, ResType, I);
   case Intrinsic::spv_any:
     return selectAny(ResVReg, ResType, I);
+  case Intrinsic::spv_lerp:
+    return selectFmix(ResVReg, ResType, I);
   case Intrinsic::spv_lifetime_start:
   case Intrinsic::spv_lifetime_end: {
     unsigned Op = IID == Intrinsic::spv_lifetime_start ? SPIRV::OpLifetimeStart
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll
index ef8d463cbd815e..8c5410aa54a433 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/all.ll
@@ -26,32 +26,32 @@
 ; CHECK-HLSL-DAG: %[[#const_i32_0:]] = OpConstant %[[#int_32]] 0
 ; CHECK-HLSL-DAG: %[[#const_i16_0:]] = OpConstant %[[#int_16]] 0
 ; CHECK-HLSL-DAG: %[[#const_f64_0:]] = OpConstant %[[#float_64]] 0
-; CHECK-HLSL-DAG: %[[#const_f32_0:]] = OpConstant %[[#float_32:]] 0
-; CHECK-HLSL-DAG: %[[#const_f16_0:]] = OpConstant %[[#float_16:]] 0
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantComposite %[[#vec4_16:]] %[[#const_i16_0:]] %[[#const_i16_0:]] %[[#const_i16_0:]] %[[#const_i16_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantComposite %[[#vec4_32:]] %[[#const_i32_0:]] %[[#const_i32_0:]] %[[#const_i32_0:]] %[[#const_i32_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantComposite %[[#vec4_64:]] %[[#const_i64_0:]] %[[#const_i64_0:]] %[[#const_i64_0:]] %[[#const_i64_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantComposite %[[#vec4_float_16:]] %[[#const_f16_0:]] %[[#const_f16_0:]] %[[#const_f16_0:]] %[[#const_f16_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantComposite %[[#vec4_float_32:]] %[[#const_f32_0:]] %[[#const_f32_0:]] %[[#const_f32_0:]] %[[#const_f32_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantComposite %[[#vec4_float_64:]] %[[#const_f64_0:]] %[[#const_f64_0:]] %[[#const_f64_0:]] %[[#const_f64_0:]]
+; CHECK-HLSL-DAG: %[[#const_f32_0:]] = OpConstant %[[#float_32]] 0
+; CHECK-HLSL-DAG: %[[#const_f16_0:]] = OpConstant %[[#float_16]] 0
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantComposite %[[#vec4_16]] %[[#const_i16_0]] %[[#const_i16_0]] %[[#const_i16_0]] %[[#const_i16_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantComposite %[[#vec4_32]] %[[#const_i32_0]] %[[#const_i32_0]] %[[#const_i32_0]] %[[#const_i32_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantComposite %[[#vec4_64]] %[[#const_i64_0]] %[[#const_i64_0]] %[[#const_i64_0]] %[[#const_i64_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantComposite %[[#vec4_float_16]] %[[#const_f16_0]] %[[#const_f16_0]] %[[#const_f16_0]] %[[#const_f16_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantComposite %[[#vec4_float_32]] %[[#const_f32_0]] %[[#const_f32_0]] %[[#const_f32_0]] %[[#const_f32_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantComposite %[[#vec4_float_64]] %[[#const_f64_0]] %[[#const_f64_0]] %[[#const_f64_0]] %[[#const_f64_0]]
 
 ; CHECK-OCL-DAG: %[[#const_i64_0:]] = OpConstantNull %[[#int_64]]
 ; CHECK-OCL-DAG: %[[#const_i32_0:]] = OpConstantNull %[[#int_32]]
 ; CHECK-OCL-DAG: %[[#const_i16_0:]] = OpConstantNull %[[#int_16]]
 ; CHECK-OCL-DAG: %[[#const_f64_0:]] = OpConstantNull %[[#float_64]] 
-; CHECK-OCL-DAG: %[[#const_f32_0:]] = OpConstantNull %[[#float_32:]]
-; CHECK-OCL-DAG: %[[#const_f16_0:]] = OpConstantNull %[[#float_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantNull %[[#vec4_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantNull %[[#vec4_32:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantNull %[[#vec4_64:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantNull %[[#vec4_float_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantNull %[[#vec4_float_32:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantNull %[[#vec4_float_64:]]
+; CHECK-OCL-DAG: %[[#const_f32_0:]] = OpConstantNull %[[#float_32]]
+; CHECK-OCL-DAG: %[[#const_f16_0:]] = OpConstantNull %[[#float_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantNull %[[#vec4_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantNull %[[#vec4_32]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantNull %[[#vec4_64]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantNull %[[#vec4_float_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantNull %[[#vec4_float_32]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantNull %[[#vec4_float_64]]
 
 define noundef i1 @all_int64_t(i64 noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i64_0:]]
+  ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i64_0]]
   %hlsl.all = call i1 @llvm.spv.all.i64(i64 %p0)
   ret i1 %hlsl.all
 }
@@ -60,7 +60,7 @@ entry:
 define noundef i1 @all_int(i32 noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i32_0:]]
+  ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i32_0]]
   %hlsl.all = call i1 @llvm.spv.all.i32(i32 %p0)
   ret i1 %hlsl.all
 }
@@ -69,7 +69,7 @@ entry:
 define noundef i1 @all_int16_t(i16 noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i16_0:]]
+  ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i16_0]]
   %hlsl.all = call i1 @llvm.spv.all.i16(i16 %p0)
   ret i1 %hlsl.all
 }
@@ -77,7 +77,7 @@ entry:
 define noundef i1 @all_double(double noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f64_0:]]
+  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f64_0]]
   %hlsl.all = call i1 @llvm.spv.all.f64(double %p0)
   ret i1 %hlsl.all
 }
@@ -86,7 +86,7 @@ entry:
 define noundef i1 @all_float(float noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f32_0:]]
+  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f32_0]]
   %hlsl.all = call i1 @llvm.spv.all.f32(float %p0)
   ret i1 %hlsl.all
 }
@@ -95,7 +95,7 @@ entry:
 define noundef i1 @all_half(half noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f16_0:]]
+  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f16_0]]
   %hlsl.all = call i1 @llvm.spv.all.f16(half %p0)
   ret i1 %hlsl.all
 }
@@ -103,8 +103,8 @@ entry:
 
 define noundef i1 @all_bool4(<4 x i1> noundef %p0) {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpAll %[[#vec4_bool:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_bool]]
+  ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#arg0]]
   %hlsl.all = call i1 @llvm.spv.all.v4i1(<4 x i1> %p0)
   ret i1 %hlsl.all
 }
@@ -112,8 +112,8 @@ entry:
 define noundef i1 @all_short4(<4 x i16> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#shortVecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i16:]]
-  ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#shortVecNotEq:]]
+  ; CHECK: %[[#shortVecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i16]]
+  ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#shortVecNotEq]]
   %hlsl.all = call i1 @llvm.spv.all.v4i16(<4 x i16> %p0)
   ret i1 %hlsl.all
 }
@@ -121,8 +121,8 @@ entry:
 define noundef i1 @all_int4(<4 x i32> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#i32VecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i32:]]
-  ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#i32VecNotEq:]]
+  ; CHECK: %[[#i32VecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i32]]
+  ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#i32VecNotEq]]
   %hlsl.all = call i1 @llvm.spv.all.v4i32(<4 x i32> %p0)
   ret i1 %hlsl.all
 }
@@ -130,8 +130,8 @@ entry:
 define noundef i1 @all_int64_t4(<4 x i64> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#i64VecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i64:]]
-  ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#i64VecNotEq]]
+  ; CHECK: %[[#i64VecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i64]]
+  ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#i64VecNotEq]]
   %hlsl.all = call i1 @llvm.spv.all.v4i64(<4 x i64> %p0)
   ret i1 %hlsl.all
 }
@@ -139,8 +139,8 @@ entry:
 define noundef i1 @all_half4(<4 x half> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#f16VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f16:]]
-  ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#f16VecNotEq:]]
+  ; CHECK: %[[#f16VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f16]]
+  ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#f16VecNotEq]]
   %hlsl.all = call i1 @llvm.spv.all.v4f16(<4 x half> %p0)
   ret i1 %hlsl.all
 }
@@ -148,8 +148,8 @@ entry:
 define noundef i1 @all_float4(<4 x float> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#f32VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f32:]]
-  ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#f32VecNotEq:]]
+  ; CHECK: %[[#f32VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f32]]
+  ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#f32VecNotEq]]
   %hlsl.all = call i1 @llvm.spv.all.v4f32(<4 x float> %p0)
   ret i1 %hlsl.all
 }
@@ -157,16 +157,16 @@ entry:
 define noundef i1 @all_double4(<4 x double> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#f64VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f64:]]
-  ; CHECK: %[[#]] = OpAll %[[#bool:]] %[[#f64VecNotEq:]]
+  ; CHECK: %[[#f64VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f64]]
+  ; CHECK: %[[#]] = OpAll %[[#bool]] %[[#f64VecNotEq]]
   %hlsl.all = call i1 @llvm.spv.all.v4f64(<4 x double> %p0)
   ret i1 %hlsl.all
 }
 
 define noundef i1 @all_bool(i1 noundef %a) {
 entry:
-  ; CHECK: %[[#all_bool_arg:]] = OpFunctionParameter %[[#bool:]]
-  ; CHECK: OpReturnValue %[[#all_bool_arg:]]
+  ; CHECK: %[[#all_bool_arg:]] = OpFunctionParameter %[[#bool]]
+  ; CHECK: OpReturnValue %[[#all_bool_arg]]
   %hlsl.all = call i1 @llvm.spv.all.i1(i1 %a)
   ret i1 %hlsl.all
 }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll
index b1dd388f5c6e36..7a74a335a659d4 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/any.ll
@@ -26,32 +26,32 @@
 ; CHECK-HLSL-DAG: %[[#const_i32_0:]] = OpConstant %[[#int_32]] 0
 ; CHECK-HLSL-DAG: %[[#const_i16_0:]] = OpConstant %[[#int_16]] 0
 ; CHECK-HLSL-DAG: %[[#const_f64_0:]] = OpConstant %[[#float_64]] 0
-; CHECK-HLSL-DAG: %[[#const_f32_0:]] = OpConstant %[[#float_32:]] 0
-; CHECK-HLSL-DAG: %[[#const_f16_0:]] = OpConstant %[[#float_16:]] 0
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantComposite %[[#vec4_16:]] %[[#const_i16_0:]] %[[#const_i16_0:]] %[[#const_i16_0:]] %[[#const_i16_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantComposite %[[#vec4_32:]] %[[#const_i32_0:]] %[[#const_i32_0:]] %[[#const_i32_0:]] %[[#const_i32_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantComposite %[[#vec4_64:]] %[[#const_i64_0:]] %[[#const_i64_0:]] %[[#const_i64_0:]] %[[#const_i64_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantComposite %[[#vec4_float_16:]] %[[#const_f16_0:]] %[[#const_f16_0:]] %[[#const_f16_0:]] %[[#const_f16_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantComposite %[[#vec4_float_32:]] %[[#const_f32_0:]] %[[#const_f32_0:]] %[[#const_f32_0:]] %[[#const_f32_0:]]
-; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantComposite %[[#vec4_float_64:]] %[[#const_f64_0:]] %[[#const_f64_0:]] %[[#const_f64_0:]] %[[#const_f64_0:]]
+; CHECK-HLSL-DAG: %[[#const_f32_0:]] = OpConstant %[[#float_32]] 0
+; CHECK-HLSL-DAG: %[[#const_f16_0:]] = OpConstant %[[#float_16]] 0
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantComposite %[[#vec4_16]] %[[#const_i16_0]] %[[#const_i16_0]] %[[#const_i16_0]] %[[#const_i16_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantComposite %[[#vec4_32]] %[[#const_i32_0]] %[[#const_i32_0]] %[[#const_i32_0]] %[[#const_i32_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantComposite %[[#vec4_64]] %[[#const_i64_0]] %[[#const_i64_0]] %[[#const_i64_0]] %[[#const_i64_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantComposite %[[#vec4_float_16]] %[[#const_f16_0]] %[[#const_f16_0]] %[[#const_f16_0]] %[[#const_f16_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantComposite %[[#vec4_float_32]] %[[#const_f32_0]] %[[#const_f32_0]] %[[#const_f32_0]] %[[#const_f32_0]]
+; CHECK-HLSL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantComposite %[[#vec4_float_64]] %[[#const_f64_0]] %[[#const_f64_0]] %[[#const_f64_0]] %[[#const_f64_0]]
 
 ; CHECK-OCL-DAG: %[[#const_i64_0:]] = OpConstantNull %[[#int_64]]
 ; CHECK-OCL-DAG: %[[#const_i32_0:]] = OpConstantNull %[[#int_32]]
 ; CHECK-OCL-DAG: %[[#const_i16_0:]] = OpConstantNull %[[#int_16]]
 ; CHECK-OCL-DAG: %[[#const_f64_0:]] = OpConstantNull %[[#float_64]] 
-; CHECK-OCL-DAG: %[[#const_f32_0:]] = OpConstantNull %[[#float_32:]]
-; CHECK-OCL-DAG: %[[#const_f16_0:]] = OpConstantNull %[[#float_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantNull %[[#vec4_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantNull %[[#vec4_32:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantNull %[[#vec4_64:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantNull %[[#vec4_float_16:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantNull %[[#vec4_float_32:]]
-; CHECK-OCL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantNull %[[#vec4_float_64:]]
+; CHECK-OCL-DAG: %[[#const_f32_0:]] = OpConstantNull %[[#float_32]]
+; CHECK-OCL-DAG: %[[#const_f16_0:]] = OpConstantNull %[[#float_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i16:]] = OpConstantNull %[[#vec4_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i32:]] = OpConstantNull %[[#vec4_32]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_i64:]] = OpConstantNull %[[#vec4_64]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f16:]] = OpConstantNull %[[#vec4_float_16]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f32:]] = OpConstantNull %[[#vec4_float_32]]
+; CHECK-OCL-DAG: %[[#vec4_const_zeros_f64:]] = OpConstantNull %[[#vec4_float_64]]
 
 define noundef i1 @any_int64_t(i64 noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i64_0:]]
+  ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i64_0]]
   %hlsl.any = call i1 @llvm.spv.any.i64(i64 %p0)
   ret i1 %hlsl.any
 }
@@ -60,7 +60,7 @@ entry:
 define noundef i1 @any_int(i32 noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i32_0:]]
+  ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i32_0]]
   %hlsl.any = call i1 @llvm.spv.any.i32(i32 %p0)
   ret i1 %hlsl.any
 }
@@ -69,7 +69,7 @@ entry:
 define noundef i1 @any_int16_t(i16 noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpINotEqual %[[#bool:]] %[[#arg0:]] %[[#const_i16_0:]]
+  ; CHECK: %[[#]] = OpINotEqual %[[#bool]] %[[#arg0]] %[[#const_i16_0]]
   %hlsl.any = call i1 @llvm.spv.any.i16(i16 %p0)
   ret i1 %hlsl.any
 }
@@ -77,7 +77,7 @@ entry:
 define noundef i1 @any_double(double noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f64_0:]]
+  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f64_0]]
   %hlsl.any = call i1 @llvm.spv.any.f64(double %p0)
   ret i1 %hlsl.any
 }
@@ -86,7 +86,7 @@ entry:
 define noundef i1 @any_float(float noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f32_0:]]
+  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f32_0]]
   %hlsl.any = call i1 @llvm.spv.any.f32(float %p0)
   ret i1 %hlsl.any
 }
@@ -95,7 +95,7 @@ entry:
 define noundef i1 @any_half(half noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool:]] %[[#arg0:]] %[[#const_f16_0:]]
+  ; CHECK: %[[#]] = OpFOrdNotEqual %[[#bool]] %[[#arg0]] %[[#const_f16_0]]
   %hlsl.any = call i1 @llvm.spv.any.f16(half %p0)
   ret i1 %hlsl.any
 }
@@ -103,8 +103,8 @@ entry:
 
 define noundef i1 @any_bool4(<4 x i1> noundef %p0) {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#]] = OpAny %[[#vec4_bool:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_bool]]
+  ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#arg0]]
   %hlsl.any = call i1 @llvm.spv.any.v4i1(<4 x i1> %p0)
   ret i1 %hlsl.any
 }
@@ -112,8 +112,8 @@ entry:
 define noundef i1 @any_short4(<4 x i16> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#shortVecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i16:]]
-  ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#shortVecNotEq:]]
+  ; CHECK: %[[#shortVecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i16]]
+  ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#shortVecNotEq]]
   %hlsl.any = call i1 @llvm.spv.any.v4i16(<4 x i16> %p0)
   ret i1 %hlsl.any
 }
@@ -121,8 +121,8 @@ entry:
 define noundef i1 @any_int4(<4 x i32> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#i32VecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i32:]]
-  ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#i32VecNotEq:]]
+  ; CHECK: %[[#i32VecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i32]]
+  ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#i32VecNotEq]]
   %hlsl.any = call i1 @llvm.spv.any.v4i32(<4 x i32> %p0)
   ret i1 %hlsl.any
 }
@@ -130,8 +130,8 @@ entry:
 define noundef i1 @any_int64_t4(<4 x i64> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#i64VecNotEq:]] = OpINotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_i64:]]
-  ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#i64VecNotEq]]
+  ; CHECK: %[[#i64VecNotEq:]] = OpINotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_i64]]
+  ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#i64VecNotEq]]
   %hlsl.any = call i1 @llvm.spv.any.v4i64(<4 x i64> %p0)
   ret i1 %hlsl.any
 }
@@ -139,8 +139,8 @@ entry:
 define noundef i1 @any_half4(<4 x half> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#f16VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f16:]]
-  ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#f16VecNotEq:]]
+  ; CHECK: %[[#f16VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f16]]
+  ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#f16VecNotEq]]
   %hlsl.any = call i1 @llvm.spv.any.v4f16(<4 x half> %p0)
   ret i1 %hlsl.any
 }
@@ -148,8 +148,8 @@ entry:
 define noundef i1 @any_float4(<4 x float> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#f32VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f32:]]
-  ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#f32VecNotEq:]]
+  ; CHECK: %[[#f32VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f32]]
+  ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#f32VecNotEq]]
   %hlsl.any = call i1 @llvm.spv.any.v4f32(<4 x float> %p0)
   ret i1 %hlsl.any
 }
@@ -157,16 +157,16 @@ entry:
 define noundef i1 @any_double4(<4 x double> noundef %p0) {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
-  ; CHECK: %[[#f64VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool:]] %[[#arg0:]] %[[#vec4_const_zeros_f64:]]
-  ; CHECK: %[[#]] = OpAny %[[#bool:]] %[[#f64VecNotEq:]]
+  ; CHECK: %[[#f64VecNotEq:]] = OpFOrdNotEqual %[[#vec4_bool]] %[[#arg0]] %[[#vec4_const_zeros_f64]]
+  ; CHECK: %[[#]] = OpAny %[[#bool]] %[[#f64VecNotEq]]
   %hlsl.any = call i1 @llvm.spv.any.v4f64(<4 x double> %p0)
   ret i1 %hlsl.any
 }
 
 define noundef i1 @any_bool(i1 noundef %a) {
 entry:
-  ; CHECK: %[[#any_bool_arg:]] = OpFunctionParameter %[[#bool:]]
-  ; CHECK: OpReturnValue %[[#any_bool_arg:]]
+  ; CHECK: %[[#any_bool_arg:]] = OpFunctionParameter %[[#bool]]
+  ; CHECK: OpReturnValue %[[#any_bool_arg]]
   %hlsl.any = call i1 @llvm.spv.any.i1(i1 %a)
   ret i1 %hlsl.any
 }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll
new file mode 100644
index 00000000000000..63547820c18c77
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/lerp.ll
@@ -0,0 +1,56 @@
+; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Make sure SPIRV operation function calls for lerp are generated as FMix
+
+; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4
+; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
+
+define noundef half @lerp_half(half noundef %a, half noundef %b, half noundef %c) {
+entry:
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] FMix %[[#arg0]] %[[#arg1]] %[[#arg2]]
+  %hlsl.lerp = call half @llvm.spv.lerp.f16(half %a, half %b, half %c)
+  ret half %hlsl.lerp
+}
+
+
+define noundef float @lerp_float(float noundef %a, float noundef %b, float noundef %c) {
+entry:
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] FMix %[[#arg0]] %[[#arg1]] %[[#arg2]]
+  %hlsl.lerp = call float @llvm.spv.lerp.f32(float %a, float %b, float %c)
+  ret float %hlsl.lerp
+}
+
+define noundef <4 x half> @lerp_half4(<4 x half> noundef %a, <4 x half> noundef %b, <4 x half> noundef %c) {
+entry:
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] FMix %[[#arg0]] %[[#arg1]] %[[#arg2]]
+  %hlsl.lerp = call <4 x half> @llvm.spv.lerp.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
+  ret <4 x half> %hlsl.lerp
+}
+
+define noundef <4 x float> @lerp_float4(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c) {
+entry:
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#]]
+  ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] FMix %[[#arg0]] %[[#arg1]] %[[#arg2]]
+  %hlsl.lerp = call <4 x float> @llvm.spv.lerp.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  ret <4 x float> %hlsl.lerp
+}
+
+declare half @llvm.spv.lerp.f16(half, half, half)
+declare float @llvm.spv.lerp.f32(float, float, float)
+declare <4 x half> @llvm.spv.lerp.v4f16(<4 x half>, <4 x half>, <4 x half>)
+declare <4 x float> @llvm.spv.lerp.v4f32(<4 x float>, <4 x float>, <4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll
index 95962c0fdc9695..34f3c610ca81da 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/rcp.ll
@@ -13,90 +13,90 @@
 ; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4
 ; CHECK-DAG: %[[#vec4_float_64:]] = OpTypeVector %[[#float_64]] 4
 ; CHECK-DAG: %[[#const_f64_1:]] = OpConstant %[[#float_64]] 1
-; CHECK-DAG: %[[#const_f32_1:]] = OpConstant %[[#float_32:]] 1
-; CHECK-DAG: %[[#const_f16_1:]] = OpConstant %[[#float_16:]] 1
+; CHECK-DAG: %[[#const_f32_1:]] = OpConstant %[[#float_32]] 1
+; CHECK-DAG: %[[#const_f16_1:]] = OpConstant %[[#float_16]] 1
 
-; CHECK-DAG: %[[#vec2_const_ones_f16:]] = OpConstantComposite %[[#vec2_float_16:]] %[[#const_f16_1:]] %[[#const_f16_1:]]
-; CHECK-DAG: %[[#vec3_const_ones_f16:]] = OpConstantComposite %[[#vec3_float_16:]] %[[#const_f16_1:]] %[[#const_f16_1:]] %[[#const_f16_1:]]
-; CHECK-DAG: %[[#vec4_const_ones_f16:]] = OpConstantComposite %[[#vec4_float_16:]] %[[#const_f16_1:]] %[[#const_f16_1:]] %[[#const_f16_1:]] %[[#const_f16_1:]]
+; CHECK-DAG: %[[#vec2_const_ones_f16:]] = OpConstantComposite %[[#vec2_float_16]] %[[#const_f16_1]] %[[#const_f16_1]]
+; CHECK-DAG: %[[#vec3_const_ones_f16:]] = OpConstantComposite %[[#vec3_float_16]] %[[#const_f16_1]] %[[#const_f16_1]] %[[#const_f16_1]]
+; CHECK-DAG: %[[#vec4_const_ones_f16:]] = OpConstantComposite %[[#vec4_float_16]] %[[#const_f16_1]] %[[#const_f16_1]] %[[#const_f16_1]] %[[#const_f16_1]]
 
-; CHECK-DAG: %[[#vec2_const_ones_f32:]] = OpConstantComposite %[[#vec2_float_32:]] %[[#const_f32_1:]] %[[#const_f32_1:]]
-; CHECK-DAG: %[[#vec3_const_ones_f32:]] = OpConstantComposite %[[#vec3_float_32:]] %[[#const_f32_1:]] %[[#const_f32_1:]] %[[#const_f32_1:]]
-; CHECK-DAG: %[[#vec4_const_ones_f32:]] = OpConstantComposite %[[#vec4_float_32:]] %[[#const_f32_1:]] %[[#const_f32_1:]] %[[#const_f32_1:]] %[[#const_f32_1:]]
+; CHECK-DAG: %[[#vec2_const_ones_f32:]] = OpConstantComposite %[[#vec2_float_32]] %[[#const_f32_1]] %[[#const_f32_1]]
+; CHECK-DAG: %[[#vec3_const_ones_f32:]] = OpConstantComposite %[[#vec3_float_32]] %[[#const_f32_1]] %[[#const_f32_1]] %[[#const_f32_1]]
+; CHECK-DAG: %[[#vec4_const_ones_f32:]] = OpConstantComposite %[[#vec4_float_32]] %[[#const_f32_1]] %[[#const_f32_1]] %[[#const_f32_1]] %[[#const_f32_1]]
 
-; CHECK-DAG: %[[#vec2_const_ones_f64:]] = OpConstantComposite %[[#vec2_float_64:]] %[[#const_f64_1:]] %[[#const_f64_1:]]
-; CHECK-DAG: %[[#vec3_const_ones_f64:]] = OpConstantComposite %[[#vec3_float_64:]] %[[#const_f64_1:]] %[[#const_f64_1:]] %[[#const_f64_1:]]
-; CHECK-DAG: %[[#vec4_const_ones_f64:]] = OpConstantComposite %[[#vec4_float_64:]] %[[#const_f64_1:]] %[[#const_f64_1:]] %[[#const_f64_1:]] %[[#const_f64_1:]]
+; CHECK-DAG: %[[#vec2_const_ones_f64:]] = OpConstantComposite %[[#vec2_float_64]] %[[#const_f64_1]] %[[#const_f64_1]]
+; CHECK-DAG: %[[#vec3_const_ones_f64:]] = OpConstantComposite %[[#vec3_float_64]] %[[#const_f64_1]] %[[#const_f64_1]] %[[#const_f64_1]]
+; CHECK-DAG: %[[#vec4_const_ones_f64:]] = OpConstantComposite %[[#vec4_float_64]] %[[#const_f64_1]] %[[#const_f64_1]] %[[#const_f64_1]] %[[#const_f64_1]]
 
 
 define spir_func noundef half @test_rcp_half(half noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_16:]]
-  ; CHECK: OpFDiv %[[#float_16:]] %[[#const_f16_1:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_16]]
+  ; CHECK: OpFDiv %[[#float_16]] %[[#const_f16_1]] %[[#arg0]]
   %hlsl.rcp = fdiv half 0xH3C00, %p0
   ret half %hlsl.rcp
 }
 
 define spir_func noundef <2 x half> @test_rcp_half2(<2 x half> noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_16:]]
-  ; CHECK: OpFDiv %[[#vec2_float_16:]] %[[#vec2_const_ones_f16:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_16]]
+  ; CHECK: OpFDiv %[[#vec2_float_16]] %[[#vec2_const_ones_f16]] %[[#arg0]]
   %hlsl.rcp = fdiv <2 x half> <half 0xH3C00, half 0xH3C00>, %p0
   ret <2 x half> %hlsl.rcp
 }
 
 define spir_func noundef <3 x half> @test_rcp_half3(<3 x half> noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16:]]
-  ; CHECK: OpFDiv %[[#vec3_float_16:]] %[[#vec3_const_ones_f16:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]]
+  ; CHECK: OpFDiv %[[#vec3_float_16]] %[[#vec3_const_ones_f16]] %[[#arg0]]
   %hlsl.rcp = fdiv <3 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00>, %p0
   ret <3 x half> %hlsl.rcp
 }
 
 define spir_func noundef <4 x half> @test_rcp_half4(<4 x half> noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16:]]
-  ; CHECK: OpFDiv %[[#vec4_float_16:]] %[[#vec4_const_ones_f16:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]]
+  ; CHECK: OpFDiv %[[#vec4_float_16]] %[[#vec4_const_ones_f16]] %[[#arg0]]
   %hlsl.rcp = fdiv <4 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, %p0
   ret <4 x half> %hlsl.rcp
 }
 
 define spir_func noundef float @test_rcp_float(float noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_32:]]
-  ; CHECK: OpFDiv %[[#float_32:]] %[[#const_f32_1:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_32]]
+  ; CHECK: OpFDiv %[[#float_32]] %[[#const_f32_1]] %[[#arg0]]
   %hlsl.rcp = fdiv float 1.000000e+00, %p0
   ret float %hlsl.rcp
 }
 
 define spir_func noundef <2 x float> @test_rcp_float2(<2 x float> noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_32:]]
-  ; CHECK: OpFDiv %[[#vec2_float_32:]] %[[#vec2_const_ones_f32:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_32]]
+  ; CHECK: OpFDiv %[[#vec2_float_32]] %[[#vec2_const_ones_f32]] %[[#arg0]]
   %hlsl.rcp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %p0
   ret <2 x float> %hlsl.rcp
 }
 
 define spir_func noundef <3 x float> @test_rcp_float3(<3 x float> noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32:]]
-  ; CHECK: OpFDiv %[[#vec3_float_32:]] %[[#vec3_const_ones_f32:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]]
+  ; CHECK: OpFDiv %[[#vec3_float_32]] %[[#vec3_const_ones_f32]] %[[#arg0]]
   %hlsl.rcp = fdiv <3 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %p0
   ret <3 x float> %hlsl.rcp
 }
 
 define spir_func noundef <4 x float> @test_rcp_float4(<4 x float> noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32:]]
-  ; CHECK: OpFDiv %[[#vec4_float_32:]] %[[#vec4_const_ones_f32:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]]
+  ; CHECK: OpFDiv %[[#vec4_float_32]] %[[#vec4_const_ones_f32]] %[[#arg0]]
   %hlsl.rcp = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %p0
   ret <4 x float> %hlsl.rcp
 }
 
 define spir_func noundef double @test_rcp_double(double noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_64:]]
-  ; CHECK: OpFDiv %[[#float_64:]] %[[#const_f64_1:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_64]]
+  ; CHECK: OpFDiv %[[#float_64]] %[[#const_f64_1]] %[[#arg0]]
   %hlsl.rcp = fdiv double 1.000000e+00, %p0
   ret double %hlsl.rcp
 }
@@ -104,7 +104,7 @@ entry:
 define spir_func noundef <2 x double> @test_rcp_double2(<2 x double> noundef %p0) #0 {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_64:]]
-  ; CHECK: OpFDiv %[[#vec2_float_64:]] %[[#vec2_const_ones_f64:]] %[[#arg0:]]
+  ; CHECK: OpFDiv %[[#vec2_float_64]] %[[#vec2_const_ones_f64]] %[[#arg0]]
   %hlsl.rcp = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %p0
   ret <2 x double> %hlsl.rcp
 }
@@ -112,15 +112,15 @@ entry:
 define spir_func noundef <3 x double> @test_rcp_double3(<3 x double> noundef %p0) #0 {
 entry:
   ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_64:]]
-  ; CHECK: OpFDiv %[[#vec3_float_64:]] %[[#vec3_const_ones_f64:]] %[[#arg0:]]
+  ; CHECK: OpFDiv %[[#vec3_float_64]] %[[#vec3_const_ones_f64]] %[[#arg0]]
   %hlsl.rcp = fdiv <3 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %p0
   ret <3 x double> %hlsl.rcp
 }
 
 define spir_func noundef <4 x double> @test_rcp_double4(<4 x double> noundef %p0) #0 {
 entry:
-  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_64:]]
-  ; CHECK: OpFDiv %[[#vec4_float_64:]] %[[#vec4_const_ones_f64:]] %[[#arg0:]]
+  ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_64]]
+  ; CHECK: OpFDiv %[[#vec4_float_64]] %[[#vec4_const_ones_f64]] %[[#arg0]]
   %hlsl.rcp = fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %p0
   ret <4 x double> %hlsl.rcp
 }

>From b6628c24ef017138b8d6eb288e94c141e7c846b0 Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail at gmail.com>
Date: Mon, 22 Apr 2024 18:41:36 +0200
Subject: [PATCH 6/7] [Clang] Fix crash on invalid size in user-defined
 `static_assert` message (#89420)

This addresses two problems observed in #89407 wrt user-defined
`static_assert` messages:

1. In `Expr::EvaluateCharRangeAsString`, we were calling `getExtValue()`
instead of `getZExtValue()`, which would assert if a negative or very
large number was returned from `size()`.
2. If the value could not be converted to `std::size_t`, attempting to
diagnose that would crash because `ext_cce_narrowing` was missing two
`%select` cases.

This fixes #89407.
---
 clang/docs/ReleaseNotes.rst                   |  2 +
 .../clang/Basic/DiagnosticSemaKinds.td        |  6 +-
 clang/lib/AST/ExprConstant.cpp                |  4 +-
 clang/test/SemaCXX/static-assert-cxx26.cpp    | 74 +++++++++++++++++++
 4 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 009531bae8a9de..aea99680c79a0e 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -555,6 +555,8 @@ Bug Fixes to C++ Support
 - Fix a crash caused by defined struct in a type alias template when the structure
   has fields with dependent type. Fixes (#GH75221).
 - Fix the Itanium mangling of lambdas defined in a member of a local class (#GH88906)
+- Fixed a crash when trying to evaluate a user-defined ``static_assert`` message whose ``size()``
+  function returns a large or negative value. Fixes (#GH89407).
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a95424862e63f4..63e951daec7477 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -87,9 +87,9 @@ def err_expr_not_cce : Error<
   "call to 'size()'|call to 'data()'}0 is not a constant expression">;
 def ext_cce_narrowing : ExtWarn<
   "%select{case value|enumerator value|non-type template argument|"
-  "array size|explicit specifier argument|noexcept specifier argument}0 "
-  "%select{cannot be narrowed from type %2 to %3|"
-  "evaluates to %2, which cannot be narrowed to type %3}1">,
+  "array size|explicit specifier argument|noexcept specifier argument|"
+  "call to 'size()'|call to 'data()'}0 %select{cannot be narrowed from "
+  "type %2 to %3|evaluates to %2, which cannot be narrowed to type %3}1">,
   InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure;
 def err_ice_not_integral : Error<
   "%select{integer|integral}1 constant expression must have "
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 73ae8d8efb23a2..de3c2a63913e94 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -16853,13 +16853,13 @@ bool Expr::EvaluateCharRangeAsString(std::string &Result,
   if (!::EvaluateInteger(SizeExpression, SizeValue, Info))
     return false;
 
-  int64_t Size = SizeValue.getExtValue();
+  uint64_t Size = SizeValue.getZExtValue();
 
   if (!::EvaluatePointer(PtrExpression, String, Info))
     return false;
 
   QualType CharTy = PtrExpression->getType()->getPointeeType();
-  for (int64_t I = 0; I < Size; ++I) {
+  for (uint64_t I = 0; I < Size; ++I) {
     APValue Char;
     if (!handleLValueToRValueConversion(Info, PtrExpression, CharTy, String,
                                         Char))
diff --git a/clang/test/SemaCXX/static-assert-cxx26.cpp b/clang/test/SemaCXX/static-assert-cxx26.cpp
index f4ede74f9214a4..7d896d8b365b74 100644
--- a/clang/test/SemaCXX/static-assert-cxx26.cpp
+++ b/clang/test/SemaCXX/static-assert-cxx26.cpp
@@ -341,3 +341,77 @@ struct Callable {
     } data;
 };
 static_assert(false, Callable{}); // expected-error {{static assertion failed: hello}}
+
+namespace GH89407 {
+struct A {
+  constexpr __SIZE_TYPE__ size() const { return -1; }
+  constexpr const char* data() const { return ""; }
+};
+
+struct B {
+  constexpr long long size() const { return 18446744073709551615U; }
+  constexpr const char* data() const { return ""; }
+};
+
+struct C {
+  constexpr __int128 size() const { return -1; }
+  constexpr const char* data() const { return ""; }
+};
+
+struct D {
+  constexpr unsigned __int128 size() const { return -1; }
+  constexpr const char* data() const { return ""; }
+};
+
+struct E {
+  constexpr __SIZE_TYPE__ size() const { return 18446744073709551615U; }
+  constexpr const char* data() const { return ""; }
+};
+
+static_assert(true, A{}); // expected-error {{the message in this static assertion is not a constant expression}}
+                          // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}}
+                          // expected-error@-1 {{the message in this static assertion is not a constant expression}}
+                          // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}}
+                          // expected-error@-1 {{the message in this static assertion is not a constant expression}}
+                          // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}}
+                          // expected-error@-1 {{the message in this static assertion is not a constant expression}}
+                          // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+static_assert(true, E{}); // expected-error {{the message in this static assertion is not a constant expression}}
+                          // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+
+static_assert(
+  false, // expected-error {{static assertion failed}}
+  A{} // expected-error {{the message in a static assertion must be produced by a constant expression}}
+      // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+
+static_assert(
+  false, // expected-error {{static assertion failed}}
+  B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}}
+      // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}}
+      // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+
+static_assert(
+  false, // expected-error {{static assertion failed}}
+  C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}}
+      // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}}
+      // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+
+static_assert(
+  false, // expected-error {{static assertion failed}}
+  D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}}
+      // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}}
+      // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+
+static_assert(
+  false, // expected-error {{static assertion failed}}
+  E{} // expected-error {{the message in a static assertion must be produced by a constant expression}}
+      // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}}
+);
+}

>From 75068fa9738688424a78db1c9c4e2606912593a0 Mon Sep 17 00:00:00 2001
From: Mogball <jeff at modular.com>
Date: Mon, 22 Apr 2024 17:02:17 +0000
Subject: [PATCH 7/7] [mlir] Update comment about `propertiesAttr` (NFC)

The comment is misleading because `propertiesAttr` is not actually
ignored when the operation is registered.

stack-info: PR: https://github.com/llvm/llvm-project/pull/89631, branch: users/Mogball/stack/1
---
 mlir/include/mlir/IR/OperationSupport.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h
index 2c1c490aac49b8..2c6e8253b4327a 100644
--- a/mlir/include/mlir/IR/OperationSupport.h
+++ b/mlir/include/mlir/IR/OperationSupport.h
@@ -960,9 +960,12 @@ struct OperationState {
   /// Regions that the op will hold.
   SmallVector<std::unique_ptr<Region>, 1> regions;
 
-  // If we're creating an unregistered operation, this Attribute is used to
-  // build the properties. Otherwise it is ignored. For registered operations
-  // see the `getOrAddProperties` method.
+  // This Attribute is used to opaquely construct the properties of the
+  // operation. If we're creating an unregistered operation, the Attribute is
+  // used as-is as the Properties storage of the operation. Otherwise, the
+  // operation properties are constructed opaquely using its
+  // `setPropertiesFromAttr` hook. Note that `getOrAddProperties` is the
+  // preferred method to construct properties from C++.
   Attribute propertiesAttr;
 
 private:



More information about the llvm-branch-commits mailing list