[llvm] [AArch64][ARM] Optimize more `tbl`/`tbx` calls into `shufflevector` (PR #169748)

via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 1 11:47:42 PST 2025


https://github.com/valadaptive updated https://github.com/llvm/llvm-project/pull/169748

From 852e4d3e58620828649c5209b12cde86615d6597 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Tue, 25 Nov 2025 19:00:27 -0500
Subject: [PATCH 1/8] [AArch64][ARM] Move ARM-specific InstCombine transforms
 to new module

---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  15 ++
 llvm/lib/Target/AArch64/CMakeLists.txt        |   1 +
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  16 ++
 llvm/lib/Target/ARM/CMakeLists.txt            |   1 +
 .../ARMCommonInstCombineIntrinsic.cpp         | 137 ++++++++++++++++++
 .../ARMCommon/ARMCommonInstCombineIntrinsic.h |  58 ++++++++
 llvm/lib/Target/ARMCommon/CMakeLists.txt      |   8 +
 llvm/lib/Target/CMakeLists.txt                |   5 +
 .../InstCombine/InstCombineCalls.cpp          | 104 -------------
 .../InstCombine/AArch64/aes-intrinsics.ll     |   2 +-
 .../ARM/2012-04-23-Neon-Intrinsics.ll         |   2 +-
 .../InstCombine/ARM/aes-intrinsics.ll         |   2 +-
 llvm/test/Transforms/InstCombine/ARM/tbl1.ll  |   2 +-
 13 files changed, 245 insertions(+), 108 deletions(-)
 create mode 100644 llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
 create mode 100644 llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
 create mode 100644 llvm/lib/Target/ARMCommon/CMakeLists.txt

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0bae00bafee3c..74be251668ed6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetTransformInfo.h"
+#include "../ARMCommon/ARMCommonInstCombineIntrinsic.h"
 #include "AArch64ExpandImm.h"
 #include "AArch64PerfectShuffle.h"
 #include "AArch64SMEAttributes.h"
@@ -2856,6 +2857,20 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_neon_fmaxnm:
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
+  case Intrinsic::aarch64_neon_tbl1:
+    if (Value *V = ARMCommon::simplifyNeonTbl1(II, IC.Builder))
+      return IC.replaceInstUsesWith(II, V);
+    break;
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull: {
+    bool Zext = IID == Intrinsic::aarch64_neon_umull;
+    return ARMCommon::simplifyNeonMultiply(II, IC, Zext);
+  }
+  case Intrinsic::aarch64_crypto_aesd:
+  case Intrinsic::aarch64_crypto_aese:
+  case Intrinsic::aarch64_sve_aesd:
+  case Intrinsic::aarch64_sve_aese:
+    return ARMCommon::simplifyAES(II, IC);
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return instCombineConvertFromSVBool(IC, II);
   case Intrinsic::aarch64_sve_dup:
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 285d646293eb7..d27a698ee9e4a 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -101,6 +101,7 @@ add_llvm_target(AArch64CodeGen
   AArch64Desc
   AArch64Info
   AArch64Utils
+  ARMCommon
   Analysis
   AsmPrinter
   CFGuard
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index fdb0ec40cb41f..bff93aa804112 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMTargetTransformInfo.h"
+#include "../ARMCommon/ARMCommonInstCombineIntrinsic.h"
 #include "ARMSubtarget.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/ADT/APInt.h"
@@ -182,6 +183,21 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     break;
   }
 
+  case Intrinsic::arm_neon_vtbl1:
+    if (Value *V = ARMCommon::simplifyNeonTbl1(II, IC.Builder))
+      return IC.replaceInstUsesWith(II, V);
+    break;
+
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    bool Zext = IID == Intrinsic::arm_neon_vmullu;
+    return ARMCommon::simplifyNeonMultiply(II, IC, Zext);
+  }
+
+  case Intrinsic::arm_neon_aesd:
+  case Intrinsic::arm_neon_aese:
+    return ARMCommon::simplifyAES(II, IC);
+
   case Intrinsic::arm_mve_pred_i2v: {
     Value *Arg = II.getArgOperand(0);
     Value *ArgArg;
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index eb3ad01a54fb2..9fc9bc134e5cc 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -73,6 +73,7 @@ add_llvm_target(ARMCodeGen
   Thumb2SizeReduction.cpp
 
   LINK_COMPONENTS
+  ARMCommon
   ARMDesc
   ARMInfo
   ARMUtils
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
new file mode 100644
index 0000000000000..9b8c1f4747dc1
--- /dev/null
+++ b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
@@ -0,0 +1,137 @@
+//===- ARMCommonInstCombineIntrinsic.cpp -
+//                  instCombineIntrinsic opts for both ARM and AArch64  ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#include "ARMCommonInstCombineIntrinsic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace llvm {
+namespace ARMCommon {
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
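+///
+/// Illustrative sketch (hypothetical IR values, for exposition only):
+///   %r = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %v,
+///          <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+/// becomes
+///   %r = shufflevector <8 x i8> %v, <8 x i8> zeroinitializer,
+///          <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>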
+Value *simplifyNeonTbl1(const IntrinsicInst &II,
+                        InstCombiner::BuilderTy &Builder) {
+  // Bail out if the mask is not a constant.
+  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+  if (!C)
+    return nullptr;
+
+  auto *VecTy = cast<FixedVectorType>(II.getType());
+  unsigned NumElts = VecTy->getNumElements();
+
+  // Only perform this transformation for <8 x i8> vector types.
+  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+    return nullptr;
+
+  int Indexes[8];
+
+  for (unsigned I = 0; I < NumElts; ++I) {
+    Constant *COp = C->getAggregateElement(I);
+
+    if (!COp || !isa<ConstantInt>(COp))
+      return nullptr;
+
+    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+    // Make sure the mask indices are in range.
+    if ((unsigned)Indexes[I] >= NumElts)
+      return nullptr;
+  }
+
+  auto *V1 = II.getArgOperand(0);
+  auto *V2 = Constant::getNullValue(V1->getType());
+  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
+}
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
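+///
+/// Illustrative sketch of case 2 (hypothetical IR values, for exposition):
+///   %r = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x,
+///          <4 x i16> splat (i16 1))
+/// becomes
+///   %r = sext <4 x i16> %x to <4 x i32>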
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool Zext) {
+  Value *Arg0 = II.getArgOperand(0);
+  Value *Arg1 = II.getArgOperand(1);
+
+  // Handle mul by zero first:
+  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+    return IC.replaceInstUsesWith(II, ConstantAggregateZero::get(II.getType()));
+  }
+
+  // Check for constant LHS & RHS - in this case we just simplify.
+  VectorType *NewVT = cast<VectorType>(II.getType());
+  if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
+    if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
+      Value *V0 = IC.Builder.CreateIntCast(CV0, NewVT, /*isSigned=*/!Zext);
+      Value *V1 = IC.Builder.CreateIntCast(CV1, NewVT, /*isSigned=*/!Zext);
+      return IC.replaceInstUsesWith(II, IC.Builder.CreateMul(V0, V1));
+    }
+
+    // Couldn't simplify - canonicalize constant to the RHS.
+    std::swap(Arg0, Arg1);
+  }
+
+  // Handle mul by one:
+  if (Constant *CV1 = dyn_cast<Constant>(Arg1))
+    if (ConstantInt *Splat =
+            dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
+      if (Splat->isOne())
+        return CastInst::CreateIntegerCast(Arg0, II.getType(),
+                                           /*isSigned=*/!Zext);
+
+  return nullptr;
+}
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
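+///
+/// Illustrative sketch (hypothetical IR values, for exposition):
+///   %t = xor <16 x i8> %data, %key
+///   %r = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %t,
+///          <16 x i8> zeroinitializer)
+/// becomes
+///   %r = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data,
+///          <16 x i8> %key)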
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC) {
+  Value *DataArg = II.getArgOperand(0);
+  Value *KeyArg = II.getArgOperand(1);
+
+  // Accept zero on either operand.
+  if (!match(KeyArg, m_ZeroInt()))
+    std::swap(KeyArg, DataArg);
+
+  // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+  Value *Data, *Key;
+  if (match(KeyArg, m_ZeroInt()) &&
+      match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+    IC.replaceOperand(II, 0, Data);
+    IC.replaceOperand(II, 1, Key);
+    return &II;
+  }
+
+  return nullptr;
+}
+
+} // namespace ARMCommon
+} // namespace llvm
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
new file mode 100644
index 0000000000000..967532b50c7c2
--- /dev/null
+++ b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
@@ -0,0 +1,58 @@
+//===- ARMCommonInstCombineIntrinsic.h -
+// instCombineIntrinsic opts for both ARM and AArch64 -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains optimizations for ARM and AArch64 intrinsics that
+/// are shared between both architectures. These functions can be called from:
+/// - ARM TTI's instCombineIntrinsic (for arm_neon_* intrinsics)
+/// - AArch64 TTI's instCombineIntrinsic (for aarch64_neon_* and aarch64_sve_*
+///   intrinsics)
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
+#define LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
+
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+namespace llvm {
+
+namespace ARMCommon {
+
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+Value *simplifyNeonTbl1(const IntrinsicInst &II,
+                        InstCombiner::BuilderTy &Builder);
+
+/// Simplify NEON multiply-long intrinsics (smull, umull).
+/// These intrinsics perform widening multiplies: they multiply two vectors of
+/// narrow integers and produce a vector of wider integers. This function
+/// performs algebraic simplifications:
+/// 1. Multiply by zero => zero vector
+/// 2. Multiply by one => zero/sign-extend the non-one operand
+/// 3. Both operands constant => regular multiply that can be constant-folded
+///    later
+Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
+                                  bool Zext);
+
+/// Simplify AES encryption/decryption intrinsics (AESE, AESD).
+///
+/// ARM's AES instructions (AESE/AESD) XOR the data and the key, provided as
+/// separate arguments, before performing the encryption/decryption operation.
+/// We can fold that "internal" XOR with a previous one.
+Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC);
+
+} // namespace ARMCommon
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
diff --git a/llvm/lib/Target/ARMCommon/CMakeLists.txt b/llvm/lib/Target/ARMCommon/CMakeLists.txt
new file mode 100644
index 0000000000000..1805a5df2f053
--- /dev/null
+++ b/llvm/lib/Target/ARMCommon/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_llvm_component_library(LLVMARMCommon
+  ARMCommonInstCombineIntrinsic.cpp
+
+  LINK_COMPONENTS
+  Core
+  Support
+  TransformUtils
+  )
diff --git a/llvm/lib/Target/CMakeLists.txt b/llvm/lib/Target/CMakeLists.txt
index bcc13f942bf96..e3528014a4be2 100644
--- a/llvm/lib/Target/CMakeLists.txt
+++ b/llvm/lib/Target/CMakeLists.txt
@@ -31,6 +31,11 @@ if (NOT BUILD_SHARED_LIBS AND NOT APPLE AND
   set(CMAKE_CXX_VISIBILITY_PRESET hidden)
 endif()
 
+# Add shared ARM/AArch64 utilities if either target is being built
+if("ARM" IN_LIST LLVM_TARGETS_TO_BUILD OR "AArch64" IN_LIST LLVM_TARGETS_TO_BUILD)
+  add_subdirectory(ARMCommon)
+endif()
+
 foreach(t ${LLVM_TARGETS_TO_BUILD})
   message(STATUS "Targeting ${t}")
   add_subdirectory(${t})
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8e4edefec42fd..8a54c0dde6be6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -737,44 +737,6 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   return nullptr;
 }
 
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                               InstCombiner::BuilderTy &Builder) {
-  // Bail out if the mask is not a constant.
-  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
-  if (!C)
-    return nullptr;
-
-  auto *VecTy = cast<FixedVectorType>(II.getType());
-  unsigned NumElts = VecTy->getNumElements();
-
-  // Only perform this transformation for <8 x i8> vector types.
-  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
-    return nullptr;
-
-  int Indexes[8];
-
-  for (unsigned I = 0; I < NumElts; ++I) {
-    Constant *COp = C->getAggregateElement(I);
-
-    if (!COp || !isa<ConstantInt>(COp))
-      return nullptr;
-
-    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
-
-    // Make sure the mask indices are in range.
-    if ((unsigned)Indexes[I] >= NumElts)
-      return nullptr;
-  }
-
-  auto *V1 = II.getArgOperand(0);
-  auto *V2 = Constant::getNullValue(V1->getType());
-  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
-}
-
 // Returns true iff the 2 intrinsics have the same operands, limiting the
 // comparison to the first NumOperands.
 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -3155,72 +3117,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin);
     return CallInst::Create(NewFn, CallArgs);
   }
-  case Intrinsic::arm_neon_vtbl1:
-  case Intrinsic::aarch64_neon_tbl1:
-    if (Value *V = simplifyNeonTbl1(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
-
-  case Intrinsic::arm_neon_vmulls:
-  case Intrinsic::arm_neon_vmullu:
-  case Intrinsic::aarch64_neon_smull:
-  case Intrinsic::aarch64_neon_umull: {
-    Value *Arg0 = II->getArgOperand(0);
-    Value *Arg1 = II->getArgOperand(1);
-
-    // Handle mul by zero first:
-    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
-      return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
-    }
-
-    // Check for constant LHS & RHS - in this case we just simplify.
-    bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
-                 IID == Intrinsic::aarch64_neon_umull);
-    VectorType *NewVT = cast<VectorType>(II->getType());
-    if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
-      if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
-        Value *V0 = Builder.CreateIntCast(CV0, NewVT, /*isSigned=*/!Zext);
-        Value *V1 = Builder.CreateIntCast(CV1, NewVT, /*isSigned=*/!Zext);
-        return replaceInstUsesWith(CI, Builder.CreateMul(V0, V1));
-      }
-
-      // Couldn't simplify - canonicalize constant to the RHS.
-      std::swap(Arg0, Arg1);
-    }
-
-    // Handle mul by one:
-    if (Constant *CV1 = dyn_cast<Constant>(Arg1))
-      if (ConstantInt *Splat =
-              dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
-        if (Splat->isOne())
-          return CastInst::CreateIntegerCast(Arg0, II->getType(),
-                                             /*isSigned=*/!Zext);
-
-    break;
-  }
-  case Intrinsic::arm_neon_aesd:
-  case Intrinsic::arm_neon_aese:
-  case Intrinsic::aarch64_crypto_aesd:
-  case Intrinsic::aarch64_crypto_aese:
-  case Intrinsic::aarch64_sve_aesd:
-  case Intrinsic::aarch64_sve_aese: {
-    Value *DataArg = II->getArgOperand(0);
-    Value *KeyArg  = II->getArgOperand(1);
-
-    // Accept zero on either operand.
-    if (!match(KeyArg, m_ZeroInt()))
-      std::swap(KeyArg, DataArg);
-
-    // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
-    Value *Data, *Key;
-    if (match(KeyArg, m_ZeroInt()) &&
-        match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
-      replaceOperand(*II, 0, Data);
-      replaceOperand(*II, 1, Key);
-      return II;
-    }
-    break;
-  }
   case Intrinsic::hexagon_V6_vandvrt:
   case Intrinsic::hexagon_V6_vandvrt_128B: {
     // Simplify Q -> V -> Q conversion.
diff --git a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
index 8c69d0721b738..fdc628bb59cb0 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt --mtriple=aarch64 -S -passes=instcombine < %s | FileCheck %s
 ; ARM64 AES intrinsic variants
 
 define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) {
diff --git a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
index 5fc5709ff8897..9ba4b418cb8e5 100644
--- a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
 
 define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
 ; CHECK-LABEL: define <4 x i32> @mulByZero(
diff --git a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
index 0056d872ff9e3..10175096035ec 100644
--- a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+; RUN: opt -mtriple=arm -S -passes=instcombine < %s | FileCheck %s
 ; ARM AES intrinsic variants
 
 define <16 x i8> @combineXorAeseZeroARM(<16 x i8> %data, <16 x i8> %key) {
diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
index fbec1a2bb7a07..ceeac8648ec51 100644
--- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=arm -passes=instcombine -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv8-arm-none-eabi"

From 234164f31e24d5194e498d9fbbb0cc6981d760d4 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Tue, 25 Nov 2025 19:06:37 -0500
Subject: [PATCH 2/8] [AArch64][ARM] !Zext -> IsSigned

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp   | 4 ++--
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp           | 4 ++--
 .../Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp   | 9 ++++-----
 .../lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h | 2 +-
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 74be251668ed6..4958b44836db8 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2863,8 +2863,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
     break;
   case Intrinsic::aarch64_neon_smull:
   case Intrinsic::aarch64_neon_umull: {
-    bool Zext = IID == Intrinsic::aarch64_neon_umull;
-    return ARMCommon::simplifyNeonMultiply(II, IC, Zext);
+    bool IsSigned = IID == Intrinsic::aarch64_neon_smull;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
   }
   case Intrinsic::aarch64_crypto_aesd:
   case Intrinsic::aarch64_crypto_aese:
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index bff93aa804112..63541872f74de 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -190,8 +190,8 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
 
   case Intrinsic::arm_neon_vmulls:
   case Intrinsic::arm_neon_vmullu: {
-    bool Zext = IID == Intrinsic::arm_neon_vmullu;
-    return ARMCommon::simplifyNeonMultiply(II, IC, Zext);
+    bool IsSigned = IID == Intrinsic::arm_neon_vmulls;
+    return ARMCommon::simplifyNeonMultiply(II, IC, IsSigned);
   }
 
   case Intrinsic::arm_neon_aesd:
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
index 9b8c1f4747dc1..9ce82e66334b7 100644
--- a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
@@ -75,7 +75,7 @@ Value *simplifyNeonTbl1(const IntrinsicInst &II,
 /// 3. Both operands constant => regular multiply that can be constant-folded
 ///    later
 Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
-                                  bool Zext) {
+                                  bool IsSigned) {
   Value *Arg0 = II.getArgOperand(0);
   Value *Arg1 = II.getArgOperand(1);
 
@@ -88,8 +88,8 @@ Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
   VectorType *NewVT = cast<VectorType>(II.getType());
   if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
     if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
-      Value *V0 = IC.Builder.CreateIntCast(CV0, NewVT, /*isSigned=*/!Zext);
-      Value *V1 = IC.Builder.CreateIntCast(CV1, NewVT, /*isSigned=*/!Zext);
+      Value *V0 = IC.Builder.CreateIntCast(CV0, NewVT, IsSigned);
+      Value *V1 = IC.Builder.CreateIntCast(CV1, NewVT, IsSigned);
       return IC.replaceInstUsesWith(II, IC.Builder.CreateMul(V0, V1));
     }
 
@@ -102,8 +102,7 @@ Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
     if (ConstantInt *Splat =
             dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
       if (Splat->isOne())
-        return CastInst::CreateIntegerCast(Arg0, II.getType(),
-                                           /*isSigned=*/!Zext);
+        return CastInst::CreateIntegerCast(Arg0, II.getType(), IsSigned);
 
   return nullptr;
 }
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
index 967532b50c7c2..f9d43e1a7becb 100644
--- a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
+++ b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
@@ -43,7 +43,7 @@ Value *simplifyNeonTbl1(const IntrinsicInst &II,
 /// 3. Both operands constant => regular multiply that can be constant-folded
 ///    later
 Instruction *simplifyNeonMultiply(IntrinsicInst &II, InstCombiner &IC,
-                                  bool Zext);
+                                  bool IsSigned);
 
 /// Simplify AES encryption/decryption intrinsics (AESE, AESD).
 ///

From e53963fb1221a442e4c7c2bda269f06a50b17c89 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Tue, 25 Nov 2025 19:43:43 -0500
Subject: [PATCH 3/8] [AArch64][ARM] Make simplifyNeonTbl1 behave like the
 other transforms

---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp      | 4 +---
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp              | 4 +---
 llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp | 6 +++---
 llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h   | 3 +--
 4 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4958b44836db8..8acad13c494d4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2858,9 +2858,7 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
   case Intrinsic::aarch64_neon_tbl1:
-    if (Value *V = ARMCommon::simplifyNeonTbl1(II, IC.Builder))
-      return IC.replaceInstUsesWith(II, V);
-    break;
+    return ARMCommon::simplifyNeonTbl1(II, IC);
   case Intrinsic::aarch64_neon_smull:
   case Intrinsic::aarch64_neon_umull: {
     bool IsSigned = IID == Intrinsic::aarch64_neon_smull;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 63541872f74de..3dd3ae65321f8 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -184,9 +184,7 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   }
 
   case Intrinsic::arm_neon_vtbl1:
-    if (Value *V = ARMCommon::simplifyNeonTbl1(II, IC.Builder))
-      return IC.replaceInstUsesWith(II, V);
-    break;
+    return ARMCommon::simplifyNeonTbl1(II, IC);
 
   case Intrinsic::arm_neon_vmulls:
   case Intrinsic::arm_neon_vmullu: {
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
index 9ce82e66334b7..6e5711a0272e3 100644
--- a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
@@ -32,8 +32,7 @@ namespace ARMCommon {
 /// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
 /// which case we could lower the shufflevector with rev64 instructions
 /// as it's actually a byte reverse.
-Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                        InstCombiner::BuilderTy &Builder) {
+Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
   // Bail out if the mask is not a constant.
   auto *C = dyn_cast<Constant>(II.getArgOperand(1));
   if (!C)
@@ -63,7 +62,8 @@ Value *simplifyNeonTbl1(const IntrinsicInst &II,
 
   auto *V1 = II.getArgOperand(0);
   auto *V2 = Constant::getNullValue(V1->getType());
-  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
+  Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
+  return IC.replaceInstUsesWith(II, Shuf);
 }
 
 /// Simplify NEON multiply-long intrinsics (smull, umull).
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
index f9d43e1a7becb..541fb6a57f558 100644
--- a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
+++ b/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
@@ -31,8 +31,7 @@ namespace ARMCommon {
 /// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
 /// which case we could lower the shufflevector with rev64 instructions
 /// as it's actually a byte reverse.
-Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                        InstCombiner::BuilderTy &Builder);
+Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);
 
 /// Simplify NEON multiply-long intrinsics (smull, umull).
 /// These intrinsics perform widening multiplies: they multiply two vectors of

From 40b6ac39a40afe9b487b1c75b64ff57671bee64a Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Fri, 28 Nov 2025 12:10:53 -0500
Subject: [PATCH 4/8] [AArch64][ARM] Move the transforms to Transforms/Utils

---
 .../Transforms/Utils}/ARMCommonInstCombineIntrinsic.h     | 6 +++---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp    | 2 +-
 llvm/lib/Target/AArch64/CMakeLists.txt                    | 1 -
 llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp            | 2 +-
 llvm/lib/Target/ARM/CMakeLists.txt                        | 1 -
 llvm/lib/Target/ARMCommon/CMakeLists.txt                  | 8 --------
 llvm/lib/Target/CMakeLists.txt                            | 5 -----
 .../Utils}/ARMCommonInstCombineIntrinsic.cpp              | 2 +-
 llvm/lib/Transforms/Utils/CMakeLists.txt                  | 1 +
 .../utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn | 1 +
 10 files changed, 8 insertions(+), 21 deletions(-)
 rename llvm/{lib/Target/ARMCommon => include/llvm/Transforms/Utils}/ARMCommonInstCombineIntrinsic.h (91%)
 delete mode 100644 llvm/lib/Target/ARMCommon/CMakeLists.txt
 rename llvm/lib/{Target/ARMCommon => Transforms/Utils}/ARMCommonInstCombineIntrinsic.cpp (98%)

diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
similarity index 91%
rename from llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
rename to llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
index 541fb6a57f558..9426dc8d16482 100644
--- a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.h
+++ b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
@@ -16,8 +16,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
-#define LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
+#ifndef LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
+#define LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
 
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Value.h"
@@ -54,4 +54,4 @@ Instruction *simplifyAES(IntrinsicInst &II, InstCombiner &IC);
 } // namespace ARMCommon
 } // namespace llvm
 
-#endif // LLVM_LIB_TARGET_ARMCOMMON_ARMCOMMONINSTCOMBINEINTRINSIC_H
+#endif // LLVM_TRANSFORMS_UTILS_ARMCOMMONINSTCOMBINEINTRINSIC_H
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8acad13c494d4..0d0c0970091d6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetTransformInfo.h"
-#include "../ARMCommon/ARMCommonInstCombineIntrinsic.h"
 #include "AArch64ExpandImm.h"
 #include "AArch64PerfectShuffle.h"
 #include "AArch64SMEAttributes.h"
@@ -26,6 +25,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index d27a698ee9e4a..285d646293eb7 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -101,7 +101,6 @@ add_llvm_target(AArch64CodeGen
   AArch64Desc
   AArch64Info
   AArch64Utils
-  ARMCommon
   Analysis
   AsmPrinter
   CFGuard
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 3dd3ae65321f8..c93b2fbf419fe 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMTargetTransformInfo.h"
-#include "../ARMCommon/ARMCommonInstCombineIntrinsic.h"
 #include "ARMSubtarget.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/ADT/APInt.h"
@@ -32,6 +31,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
index 9fc9bc134e5cc..eb3ad01a54fb2 100644
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -73,7 +73,6 @@ add_llvm_target(ARMCodeGen
   Thumb2SizeReduction.cpp
 
   LINK_COMPONENTS
-  ARMCommon
   ARMDesc
   ARMInfo
   ARMUtils
diff --git a/llvm/lib/Target/ARMCommon/CMakeLists.txt b/llvm/lib/Target/ARMCommon/CMakeLists.txt
deleted file mode 100644
index 1805a5df2f053..0000000000000
--- a/llvm/lib/Target/ARMCommon/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-add_llvm_component_library(LLVMARMCommon
-  ARMCommonInstCombineIntrinsic.cpp
-
-  LINK_COMPONENTS
-  Core
-  Support
-  TransformUtils
-  )
diff --git a/llvm/lib/Target/CMakeLists.txt b/llvm/lib/Target/CMakeLists.txt
index e3528014a4be2..bcc13f942bf96 100644
--- a/llvm/lib/Target/CMakeLists.txt
+++ b/llvm/lib/Target/CMakeLists.txt
@@ -31,11 +31,6 @@ if (NOT BUILD_SHARED_LIBS AND NOT APPLE AND
   set(CMAKE_CXX_VISIBILITY_PRESET hidden)
 endif()
 
-# Add shared ARM/AArch64 utilities if either target is being built
-if("ARM" IN_LIST LLVM_TARGETS_TO_BUILD OR "AArch64" IN_LIST LLVM_TARGETS_TO_BUILD)
-  add_subdirectory(ARMCommon)
-endif()
-
 foreach(t ${LLVM_TARGETS_TO_BUILD})
   message(STATUS "Targeting ${t}")
   add_subdirectory(${t})
diff --git a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
similarity index 98%
rename from llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
rename to llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
index 6e5711a0272e3..fe192596b6f2b 100644
--- a/llvm/lib/Target/ARMCommon/ARMCommonInstCombineIntrinsic.cpp
+++ b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
@@ -16,7 +16,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "ARMCommonInstCombineIntrinsic.h"
+#include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Value.h"
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index f367ca2fdf56b..a1c25fd9ccd2b 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_llvm_component_library(LLVMTransformUtils
   AddDiscriminators.cpp
   AMDGPUEmitPrintf.cpp
+  ARMCommonInstCombineIntrinsic.cpp
   ASanStackFrameLayout.cpp
   AssumeBundleBuilder.cpp
   BasicBlockUtils.cpp
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
index 186d2ef96c19b..dae641537b43c 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn
@@ -8,6 +8,7 @@ static_library("Utils") {
   ]
   sources = [
     "AMDGPUEmitPrintf.cpp",
+    "ARMCommonInstCombineIntrinsic.cpp",
     "ASanStackFrameLayout.cpp",
     "AddDiscriminators.cpp",
     "AssumeBundleBuilder.cpp",

From 82fec7b89654e1e3a2f3ad45fa234281c86af553 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Mon, 1 Dec 2025 14:35:16 -0500
Subject: [PATCH 5/8] [ARM] Remove redundant -mtriple=arm from RUN line

---
 llvm/test/Transforms/InstCombine/ARM/tbl1.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
index ceeac8648ec51..fbec1a2bb7a07 100644
--- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=arm -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "armv8-arm-none-eabi"

From c52757d85b9a534ebd5bc3c4e5c1902dfe4037e8 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Mon, 1 Dec 2025 14:37:46 -0500
Subject: [PATCH 6/8] [AArch64][ARM] Shorten file descriptions

---
 .../llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h      | 3 +--
 llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp    | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
index 9426dc8d16482..1efb647978423 100644
--- a/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
+++ b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
@@ -1,5 +1,4 @@
-//===- ARMCommonInstCombineIntrinsic.h -
-// instCombineIntrinsic opts for both ARM and AArch64 -----------*- C++ -*-===//
+//===- ARMCommonInstCombineIntrinsic.h - Shared ARM/AArch64 opts *- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
index fe192596b6f2b..4e4dea727b407 100644
--- a/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
+++ b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
@@ -1,5 +1,4 @@
-//===- ARMCommonInstCombineIntrinsic.cpp -
-//                  instCombineIntrinsic opts for both ARM and AArch64  ---===//
+//===- ARMCommonInstCombineIntrinsic.cpp - Shared ARM/AArch64 opts --------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From a6f444858e04471f263917ea55cb45fa157ad828 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Wed, 26 Nov 2025 18:26:52 -0500
Subject: [PATCH 7/8] [AArch64][ARM] Add new tests for tbl/tbx optimizations

---
 .../Transforms/InstCombine/AArch64/tbl.ll     | 272 ++++++++++++++++++
 .../Transforms/InstCombine/AArch64/tbl1.ll    |  65 -----
 llvm/test/Transforms/InstCombine/ARM/tbl.ll   | 218 ++++++++++++++
 llvm/test/Transforms/InstCombine/ARM/tbl1.ll  |  35 ---
 4 files changed, 490 insertions(+), 100 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/tbl.ll
 delete mode 100644 llvm/test/Transforms/InstCombine/AArch64/tbl1.ll
 create mode 100644 llvm/test/Transforms/InstCombine/ARM/tbl.ll
 delete mode 100644 llvm/test/Transforms/InstCombine/ARM/tbl1.ll

diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
new file mode 100644
index 0000000000000..405b4c13700a4
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
@@ -0,0 +1,272 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; We can turn a tbl/tbx intrinsic into a shufflevector instruction if the mask
+; is constant and references 2 or fewer operands.
+
+; Basic tbl1 with all in-bounds indices should optimize to shufflevector.
+define <16 x i8> @tbl1_basic(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_basic(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %tbl
+}
+
+; tbl2 with both operands the same should optimize (1 unique source).
+define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) {
+; CHECK-LABEL: @tbl2_duplicate_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 with alternating duplicate operands should optimize (2 unique sources).
+define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbl4_duplicate_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 where mask only references first two operands should optimize.
+define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: @tbl4_unused_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 where mask only references one operand should optimize.
+define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: @tbl4_single_operand_used(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %tbl
+}
+
+; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources).
+define <16 x i8> @tbl1_with_oob(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_with_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+  ret <16 x i8> %tbl
+}
+
+; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources).
+define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) {
+; CHECK-LABEL: @tbl2_duplicate_with_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbl
+}
+
+; tbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources).
+define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbl2_with_oob_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbl
+}
+
+; tbl1 with all OOB indices should optimize to zero vector.
+define <16 x i8> @tbl1_all_oob(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_all_oob(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> splat (i8 99))
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbl
+}
+
+; tbl3 referencing all 3 operands should NOT optimize.
+define <16 x i8> @tbl3_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: @tbl3_three_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 referencing 3 unique operands should NOT optimize.
+define <16 x i8> @tbl4_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: @tbl4_three_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 referencing all 4 unique operands should NOT optimize.
+define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: @tbl4_four_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+  ret <16 x i8> %tbl
+}
+
+; tbx1 with no OOB should optimize.
+define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) {
+; CHECK-LABEL: @tbx1_no_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %tbx
+}
+
+; tbx2 where fallback == second source operand should optimize (deduplicated).
+define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbx2_fallback_equals_second_source(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbx
+}
+
+; tbx1 with OOB where fallback == source should optimize (deduplicated).
+define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) {
+; CHECK-LABEL: @tbx1_oob_fallback_same_as_source(
+; CHECK-NEXT:    [[A:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A1:%.*]], <16 x i8> [[A1]], <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <16 x i8> [[A]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbx
+}
+
+; tbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources).
+define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbx2_with_oob_bail(
+; CHECK-NEXT:    [[TBX:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <16 x i8> [[TBX]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbx
+}
+
+; tbx1 with all OOB indices should optimize to fallback.
+define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) {
+; CHECK-LABEL: @tbx1_all_oob(
+; CHECK-NEXT:    [[FALLBACK:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK1:%.*]], <16 x i8> [[A:%.*]], <16 x i8> splat (i8 99))
+; CHECK-NEXT:    ret <16 x i8> [[FALLBACK]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbx
+}
+
+; tbx1 with OOB and mismatched fallback/source sizes should NOT optimize.
+define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) {
+; CHECK-LABEL: @tbx1_fallback_size_mismatch(
+; CHECK-NEXT:    [[TBX:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TBX]]
+;
+  %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; tbx1 with no OOB and mismatched fallback/source sizes should optimize.
+define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) {
+; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbx
+}
+
+; tbl1 with non-i8 element type should NOT optimize.
+define <8 x i16> @tbl1_8x16(<16 x i8> %vec) {
+; CHECK-LABEL: @tbl1_8x16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; CHECK-NEXT:    ret <8 x i16> [[TBL1]]
+;
+entry:
+  %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret <8 x i16> %tbl1
+}
+declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>)
+
+; tbl1 with non-8/16 element count should NOT optimize.
+define <12 x i8> @tbl1_16x8(<16 x i8> %vec) {
+; CHECK-LABEL: @tbl1_16x8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TBL1:%.*]] = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> [[VEC:%.*]], <12 x i8> <i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <12 x i8> [[TBL1]]
+;
+entry:
+  %tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> <i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <12 x i8> %tbl1
+}
+declare <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8>, <12 x i8>)
+
+; Non-constant mask should NOT optimize.
+define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) {
+; CHECK-LABEL: @tbl1_non_constant_mask(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[MASK:%.*]])
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %mask)
+  ret <16 x i8> %tbl
+}
+
+; Mask with some poison elements should optimize, with poison propagating to output.
+define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_poison_mask_elements(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %tbl
+}
+
+; Mask with all poison elements should optimize to poison.
+define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_all_poison_mask(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> poison)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison)
+  ret <16 x i8> %tbl
+}
+
+; "Real" declarations
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll
deleted file mode 100644
index 362cc0f6c4493..0000000000000
--- a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64"
-
-; Turning a table lookup intrinsic into a shuffle vector instruction
-; can be beneficial. If the mask used for the lookup is the constant
-; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64
-; instructions instead.
-
-define <8 x i8> @tbl1_8x8(<16 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC:%.*]], <16 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    ret <8 x i8> [[TMP0]]
-;
-entry:
-  %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <8 x i8> %tbl1
-}
-
-; Bail the optimization if a mask index is out of range.
-define <8 x i8> @tbl1_8x8_out_of_range(<16 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8_out_of_range(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TBL1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VEC:%.*]], <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; CHECK-NEXT:    ret <8 x i8> [[TBL1]]
-;
-entry:
-  %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <8 x i8> %tbl1
-}
-
-; Bail the optimization if the size of the return vector is not 8 elements.
-define <16 x i8> @tbl1_16x8(<16 x i8> %vec) {
-; CHECK-LABEL: @tbl1_16x8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TBL1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[VEC:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; CHECK-NEXT:    ret <16 x i8> [[TBL1]]
-;
-entry:
-  %tbl1 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %vec, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <16 x i8> %tbl1
-}
-
-; Bail the optimization if the elements of the return vector are not of type i8.
-define <8 x i16> @tbl1_8x16(<16 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x16(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; CHECK-NEXT:    ret <8 x i16> [[TBL1]]
-;
-entry:
-  %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-  ret <8 x i16> %tbl1
-}
-
-; The type <8 x i16> is not a valid return type for this intrinsic,
-; but we want to test that the optimization won't trigger for vector
-; elements of type different than i8.
-declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>)
-
-declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll
new file mode 100644
index 0000000000000..1e736ec62c87e
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll
@@ -0,0 +1,218 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-arm-none-eabi"
+
+; We can turn a vtbl/vtbx intrinsic into a shufflevector instruction if the mask
+; is constant and references 2 or fewer operands.
+
+; Basic vtbl1 with all in-bounds indices should optimize to shufflevector.
+define <8 x i8> @vtbl1_basic(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_basic(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl2 with both operands the same should be optimized (1 unique source).
+define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl2_duplicate_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 with alternating duplicate operands should optimize (2 unique sources).
+define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbl4_duplicate_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 where mask only references first two operands should optimize.
+define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-LABEL: @vtbl4_unused_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 where mask only references one operand should optimize.
+define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-LABEL: @vtbl4_single_operand_used(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources).
+define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_with_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources).
+define <8 x i8> @vtbl2_duplicate_with_oob(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl2_duplicate_with_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources).
+define <8 x i8> @vtbl2_with_oob_bail(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbl2_with_oob_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl1 with all OOB indices should optimize to zero vector.
+define <8 x i8> @vtbl1_all_oob(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_all_oob(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> splat (i8 99))
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl3 referencing all 3 operands should NOT optimize.
+define <8 x i8> @vtbl3_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK-LABEL: @vtbl3_three_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 0, i8 0>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 0, i8 0>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 referencing 3 unique operands should NOT optimize.
+define <8 x i8> @vtbl4_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK-LABEL: @vtbl4_three_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[A]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 referencing all 4 unique operands should NOT optimize.
+define <8 x i8> @vtbl4_four_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-LABEL: @vtbl4_four_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+  ret <8 x i8> %tbl
+}
+
+; vtbx1 with no OOB should optimize.
+define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) {
+; CHECK-LABEL: @vtbx1_no_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbx
+}
+
+; vtbx2 where fallback == second source operand should optimize (deduplicated).
+define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbx2_fallback_equals_second_source(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[B:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; vtbx1 with OOB where fallback == source should optimize (deduplicated).
+define <8 x i8> @vtbx1_oob_fallback_same_as_source(<8 x i8> %a) {
+; CHECK-LABEL: @vtbx1_oob_fallback_same_as_source(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> <i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; vtbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources).
+define <8 x i8> @vtbx2_with_oob_bail(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbx2_with_oob_bail(
+; CHECK-NEXT:    [[TBX:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TBX]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; vtbx1 with all OOB indices should optimize to fallback.
+define <8 x i8> @vtbx1_all_oob(<8 x i8> %fallback, <8 x i8> %a) {
+; CHECK-LABEL: @vtbx1_all_oob(
+; CHECK-NEXT:    [[FALLBACK:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[FALLBACK1:%.*]], <8 x i8> [[A:%.*]], <8 x i8> splat (i8 99))
+; CHECK-NEXT:    ret <8 x i8> [[FALLBACK]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; Non-constant mask should NOT optimize.
+define <8 x i8> @vtbl1_non_constant_mask(<8 x i8> %a, <8 x i8> %mask) {
+; CHECK-LABEL: @vtbl1_non_constant_mask(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> [[MASK:%.*]])
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %mask)
+  ret <8 x i8> %tbl
+}
+
+; Mask with some poison elements should optimize, with poison propagating to output.
+define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_poison_mask_elements(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7>)
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7>)
+  ret <8 x i8> %tbl
+}
+
+; Mask with all poison elements should optimize to poison.
+define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_all_poison_mask(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> poison)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison)
+  ret <8 x i8> %tbl
+}
+
+; Declarations
+declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
deleted file mode 100644
index fbec1a2bb7a07..0000000000000
--- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv8-arm-none-eabi"
-
-; Turning a table lookup intrinsic into a shuffle vector instruction
-; can be beneficial. If the mask used for the lookup is the constant
-; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64
-; instructions instead.
-
-define <8 x i8> @tbl1_8x8(<8 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    ret <8 x i8> [[TMP0]]
-;
-entry:
-  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <8 x i8> %vtbl1
-}
-
-; Bail the optimization if a mask index is out of range.
-define <8 x i8> @tbl1_8x8_out_of_range(<8 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8_out_of_range(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VTBL1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[VEC:%.*]], <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; CHECK-NEXT:    ret <8 x i8> [[VTBL1]]
-;
-entry:
-  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <8 x i8> %vtbl1
-}
-
-declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>)

>From f9b878645cc4df65d4051cdc1a69f980ef05c2b3 Mon Sep 17 00:00:00 2001
From: valadaptive <valadaptive at protonmail.com>
Date: Wed, 26 Nov 2025 18:28:24 -0500
Subject: [PATCH 8/8] [AArch64][ARM] Optimize more tbl/tbx calls into
 shufflevector

---
 .../Utils/ARMCommonInstCombineIntrinsic.h     |   9 +-
 .../AArch64/AArch64TargetTransformInfo.cpp    |  10 +-
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp |  11 +-
 .../Utils/ARMCommonInstCombineIntrinsic.cpp   | 119 +++++++++++++++---
 .../Transforms/InstCombine/AArch64/tbl.ll     |  33 +++--
 llvm/test/Transforms/InstCombine/ARM/tbl.ll   |  29 ++---
 6 files changed, 151 insertions(+), 60 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
index 1efb647978423..ebeddefcee916 100644
--- a/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
+++ b/llvm/include/llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h
@@ -26,11 +26,10 @@ namespace llvm {
 
 namespace ARMCommon {
 
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC);
+/// Convert `tbl`/`tbx` intrinsics to a shufflevector if the mask is constant
+/// and at most two source operands are actually referenced.
+Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
+                             bool IsExtension);
 
 /// Simplify NEON multiply-long intrinsics (smull, umull).
 /// These intrinsics perform widening multiplies: they multiply two vectors of
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0d0c0970091d6..56f9f2af0c7b0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2858,7 +2858,15 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
   case Intrinsic::aarch64_neon_fminnm:
     return instCombineMaxMinNM(IC, II);
   case Intrinsic::aarch64_neon_tbl1:
-    return ARMCommon::simplifyNeonTbl1(II, IC);
+  case Intrinsic::aarch64_neon_tbl2:
+  case Intrinsic::aarch64_neon_tbl3:
+  case Intrinsic::aarch64_neon_tbl4:
+    return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
+  case Intrinsic::aarch64_neon_tbx1:
+  case Intrinsic::aarch64_neon_tbx2:
+  case Intrinsic::aarch64_neon_tbx3:
+  case Intrinsic::aarch64_neon_tbx4:
+    return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
   case Intrinsic::aarch64_neon_smull:
   case Intrinsic::aarch64_neon_umull: {
     bool IsSigned = IID == Intrinsic::aarch64_neon_smull;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index c93b2fbf419fe..89327e9411507 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -184,7 +184,16 @@ ARMTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   }
 
   case Intrinsic::arm_neon_vtbl1:
-    return ARMCommon::simplifyNeonTbl1(II, IC);
+  case Intrinsic::arm_neon_vtbl2:
+  case Intrinsic::arm_neon_vtbl3:
+  case Intrinsic::arm_neon_vtbl4:
+    return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/false);
+
+  case Intrinsic::arm_neon_vtbx1:
+  case Intrinsic::arm_neon_vtbx2:
+  case Intrinsic::arm_neon_vtbx3:
+  case Intrinsic::arm_neon_vtbx4:
+    return ARMCommon::simplifyNeonTbl(II, IC, /*IsExtension=*/true);
 
   case Intrinsic::arm_neon_vmulls:
   case Intrinsic::arm_neon_vmullu: {
diff --git a/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
index 4e4dea727b407..716d1e3b7cf00 100644
--- a/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
+++ b/llvm/lib/Transforms/Utils/ARMCommonInstCombineIntrinsic.cpp
@@ -17,6 +17,7 @@
 
 #include "llvm/Transforms/Utils/ARMCommonInstCombineIntrinsic.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
@@ -27,41 +28,121 @@ using namespace llvm::PatternMatch;
 namespace llvm {
 namespace ARMCommon {
 
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-Instruction *simplifyNeonTbl1(IntrinsicInst &II, InstCombiner &IC) {
+/// Convert `tbl`/`tbx` intrinsics to a shufflevector if the mask is constant
+/// and at most two source operands are actually referenced.
+Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
+                             bool IsExtension) {
   // Bail out if the mask is not a constant.
-  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+  auto *C = dyn_cast<Constant>(II.getArgOperand(II.arg_size() - 1));
   if (!C)
     return nullptr;
 
-  auto *VecTy = cast<FixedVectorType>(II.getType());
-  unsigned NumElts = VecTy->getNumElements();
+  auto *RetTy = cast<FixedVectorType>(II.getType());
+  unsigned NumIndexes = RetTy->getNumElements();
 
-  // Only perform this transformation for <8 x i8> vector types.
-  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+  // Only perform this transformation for <8 x i8> and <16 x i8> vector types.
+  if (!(RetTy->getElementType()->isIntegerTy(8) &&
+        (NumIndexes == 8 || NumIndexes == 16)))
     return nullptr;
 
-  int Indexes[8];
+  // For tbx instructions, the first argument is the "fallback" vector, which
+  // has the same length as the mask and return type.
+  unsigned StartIndex = IsExtension ? 1 : 0;
+  auto *SourceTy =
+      cast<FixedVectorType>(II.getArgOperand(StartIndex)->getType());
+  // Note that the element count of each source vector does *not* need to be the
+  // same as the element count of the return type and mask! All source vectors
+  // must have the same element count as each other, though.
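+  // For example, llvm.aarch64.neon.tbl1.v8i8 reads from a <16 x i8> table but
+  // its mask and result are <8 x i8>.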
+  unsigned NumElementsPerSource = SourceTy->getNumElements();
+
+  // There are no tbl/tbx intrinsics for which the destination size exceeds the
+  // source size. However, our definitions of the intrinsics, at least in
+  // IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it
+  // *could* technically happen.
+  if (NumIndexes > NumElementsPerSource) {
+    return nullptr;
+  }
+
+  // The tbl/tbx intrinsics take several source operands followed by a mask
+  // operand.
+  unsigned NumSourceOperands = II.arg_size() - 1 - StartIndex;
+
+  // Map input operands to shuffle indices. This also helpfully deduplicates the
+  // input arguments, in case the same value is passed as an argument multiple
+  // times.
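+  // For example, vtbl2(%a, %a, mask) has a single unique source value, so both
+  // table operands share shuffle slot 0, leaving slot 1 free for the zero
+  // vector if any index is out of bounds.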
+  SmallDenseMap<Value *, unsigned, 2> ValueToShuffleSlot;
+  Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy),
+                               PoisonValue::get(SourceTy)};
 
-  for (unsigned I = 0; I < NumElts; ++I) {
+  int Indexes[16];
+  for (unsigned I = 0; I < NumIndexes; ++I) {
     Constant *COp = C->getAggregateElement(I);
 
-    if (!COp || !isa<ConstantInt>(COp))
+    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
       return nullptr;
 
-    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+    if (isa<UndefValue>(COp)) {
+      Indexes[I] = -1;
+      continue;
+    }
+
+    uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
+    // The index of the source operand that this mask element references
+    // (0 = first source operand, etc.).
+    unsigned SourceOperandIndex = Index / NumElementsPerSource;
+    // The index of the element within that source operand.
+    unsigned SourceOperandElementIndex = Index % NumElementsPerSource;
+
+    Value *SourceOperand;
+    if (SourceOperandIndex >= NumSourceOperands) {
+      // This index is out of bounds. Map it to an index into either the
+      // fallback vector (tbx) or a vector of zeroes (tbl).
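+      // For example, with a single <16 x i8> source, index 20 is out of
+      // bounds: tbl produces 0 for that lane, whereas tbx keeps lane I of the
+      // fallback vector.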
+      SourceOperandIndex = NumSourceOperands;
+      if (IsExtension) {
+        // For out-of-bounds indices in tbx, choose the `I`th element of the
+        // fallback.
+        SourceOperand = II.getArgOperand(0);
+        SourceOperandElementIndex = I;
+      } else {
+        // Otherwise, choose some element from the dummy vector of zeroes (we'll
+        // always choose the first).
+        SourceOperand = Constant::getNullValue(SourceTy);
+        SourceOperandElementIndex = 0;
+      }
+    } else {
+      SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex);
+    }
 
-    // Make sure the mask indices are in range.
-    if ((unsigned)Indexes[I] >= NumElts)
+    // The source operand may be the fallback vector, which may not have the
+    // same number of elements as the source vector. In that case, we *could*
+    // choose to extend its length with another shufflevector, but it's simpler
+    // to just bail instead.
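+    // (In practice this only happens for tbx with an out-of-bounds index when
+    // the fallback, e.g. <8 x i8>, is narrower than the <16 x i8> table
+    // registers.)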
+    if (cast<FixedVectorType>(SourceOperand->getType())->getNumElements() !=
+        NumElementsPerSource) {
       return nullptr;
+    }
+
+    // We now know the source operand referenced by this index. Make it a
+    // shufflevector operand, if it isn't already.
+    unsigned NumSlots = ValueToShuffleSlot.size();
+    // If this shuffle would reference more than two sources, it cannot be
+    // represented as a single shufflevector, so bail.
+    if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) {
+      return nullptr;
+    }
+    auto [It, Inserted] =
+        ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots);
+    if (Inserted) {
+      ShuffleOperands[It->getSecond()] = SourceOperand;
+    }
+
+    unsigned RemappedIndex =
+        (It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex;
+    Indexes[I] = RemappedIndex;
   }
 
-  auto *V1 = II.getArgOperand(0);
-  auto *V2 = Constant::getNullValue(V1->getType());
-  Value *Shuf = IC.Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
+  Value *Shuf = IC.Builder.CreateShuffleVector(
+      ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes));
   return IC.replaceInstUsesWith(II, Shuf);
 }
 
diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
index 405b4c13700a4..f747f44a7ab9f 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
@@ -10,7 +10,7 @@ target triple = "aarch64"
 ; Basic tbl1 with all in-bounds indices should optimize to shufflevector.
 define <16 x i8> @tbl1_basic(<16 x i8> %a) {
 ; CHECK-LABEL: @tbl1_basic(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
@@ -20,7 +20,7 @@ define <16 x i8> @tbl1_basic(<16 x i8> %a) {
 ; tbl2 with both operands the same should optimize (1 unique source).
 define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) {
 ; CHECK-LABEL: @tbl2_duplicate_operands(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
@@ -30,7 +30,7 @@ define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) {
 ; tbl4 with alternating duplicate operands should optimize (2 unique sources).
 define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: @tbl4_duplicate_operands(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
@@ -40,7 +40,7 @@ define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) {
 ; tbl4 where mask only references first two operands should optimize.
 define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: @tbl4_unused_operands(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
@@ -50,7 +50,7 @@ define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c,
 ; tbl4 where mask only references one operand should optimize.
 define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: @tbl4_single_operand_used(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
@@ -60,7 +60,7 @@ define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8>
 ; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources).
 define <16 x i8> @tbl1_with_oob(<16 x i8> %a) {
 ; CHECK-LABEL: @tbl1_with_oob(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
@@ -70,7 +70,7 @@ define <16 x i8> @tbl1_with_oob(<16 x i8> %a) {
 ; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources).
 define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) {
 ; CHECK-LABEL: @tbl2_duplicate_with_oob(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
@@ -90,8 +90,7 @@ define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) {
 ; tbl1 with all OOB indices should optimize to zero vector.
 define <16 x i8> @tbl1_all_oob(<16 x i8> %a) {
 ; CHECK-LABEL: @tbl1_all_oob(
-; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> splat (i8 99))
-; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
   ret <16 x i8> %tbl
@@ -130,7 +129,7 @@ define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %
 ; tbx1 with no OOB should optimize.
 define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) {
 ; CHECK-LABEL: @tbx1_no_oob(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
@@ -140,7 +139,7 @@ define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) {
 ; tbx2 where fallback == second source operand should optimize (deduplicated).
 define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: @tbx2_fallback_equals_second_source(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
@@ -150,7 +149,7 @@ define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b)
 ; tbx1 with OOB where fallback == source should optimize (deduplicated).
 define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) {
 ; CHECK-LABEL: @tbx1_oob_fallback_same_as_source(
-; CHECK-NEXT:    [[A:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A1:%.*]], <16 x i8> [[A1]], <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <16 x i8> [[A1:%.*]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <16 x i8> [[A]]
 ;
   %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
@@ -170,8 +169,7 @@ define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8
 ; tbx1 with all OOB indices should optimize to fallback.
 define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) {
 ; CHECK-LABEL: @tbx1_all_oob(
-; CHECK-NEXT:    [[FALLBACK:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK1:%.*]], <16 x i8> [[A:%.*]], <16 x i8> splat (i8 99))
-; CHECK-NEXT:    ret <16 x i8> [[FALLBACK]]
+; CHECK-NEXT:    ret <16 x i8> [[FALLBACK:%.*]]
 ;
   %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
   ret <16 x i8> %tbx
@@ -190,7 +188,7 @@ define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) {
 ; tbx1 with no OOB and mismatched fallback/source sizes should optimize.
 define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) {
 ; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
@@ -236,7 +234,7 @@ define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) {
 ; Mask with some poison elements should optimize, with poison propagating to output.
 define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) {
 ; CHECK-LABEL: @tbl1_poison_mask_elements(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
@@ -246,8 +244,7 @@ define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) {
 ; Mask with all poison elements should optimize to poison.
 define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) {
 ; CHECK-LABEL: @tbl1_all_poison_mask(
-; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> poison)
-; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+; CHECK-NEXT:    ret <16 x i8> poison
 ;
   %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison)
   ret <16 x i8> %tbl
diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll
index 1e736ec62c87e..1aad566675867 100644
--- a/llvm/test/Transforms/InstCombine/ARM/tbl.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll
@@ -20,7 +20,7 @@ define <8 x i8> @vtbl1_basic(<8 x i8> %a) {
 ; vtbl2 with both operands the same should be optimized (1 unique source).
 define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) {
 ; CHECK-LABEL: @vtbl2_duplicate_operands(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
@@ -30,7 +30,7 @@ define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) {
 ; vtbl4 with alternating duplicate operands should optimize (2 unique sources).
 define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @vtbl4_duplicate_operands(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 0, i32 1, i32 8, i32 9>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
@@ -40,7 +40,7 @@ define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) {
 ; vtbl4 where mask only references first two operands should optimize.
 define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; CHECK-LABEL: @vtbl4_unused_operands(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
@@ -50,7 +50,7 @@ define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8
 ; vtbl4 where mask only references one operand should optimize.
 define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; CHECK-LABEL: @vtbl4_single_operand_used(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
@@ -60,7 +60,7 @@ define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c
 ; vtbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources).
 define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) {
 ; CHECK-LABEL: @vtbl1_with_oob(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 8, i32 8, i32 8>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
@@ -70,7 +70,7 @@ define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) {
 ; vtbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources).
 define <8 x i8> @vtbl2_duplicate_with_oob(<8 x i8> %a) {
 ; CHECK-LABEL: @vtbl2_duplicate_with_oob(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 8, i32 8, i32 8, i32 8>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
@@ -90,8 +90,7 @@ define <8 x i8> @vtbl2_with_oob_bail(<8 x i8> %a, <8 x i8> %b) {
 ; vtbl1 with all OOB indices should optimize to zero vector.
 define <8 x i8> @vtbl1_all_oob(<8 x i8> %a) {
 ; CHECK-LABEL: @vtbl1_all_oob(
-; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> splat (i8 99))
-; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+; CHECK-NEXT:    ret <8 x i8> zeroinitializer
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
   ret <8 x i8> %tbl
@@ -130,7 +129,7 @@ define <8 x i8> @vtbl4_four_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c,
 ; vtbx1 with no OOB should optimize.
 define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) {
 ; CHECK-LABEL: @vtbx1_no_oob(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
@@ -140,7 +139,7 @@ define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) {
 ; vtbx2 where fallback == second source operand should optimize (deduplicated).
 define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: @vtbx2_fallback_equals_second_source(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[B:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
@@ -150,7 +149,7 @@ define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) {
 ; vtbx1 with OOB where fallback == source should optimize (deduplicated).
 define <8 x i8> @vtbx1_oob_fallback_same_as_source(<8 x i8> %a) {
 ; CHECK-LABEL: @vtbx1_oob_fallback_same_as_source(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A:%.*]], <8 x i8> [[A]], <8 x i8> <i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99>)
@@ -170,8 +169,7 @@ define <8 x i8> @vtbx2_with_oob_bail(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %
 ; vtbx1 with all OOB indices should optimize to fallback.
 define <8 x i8> @vtbx1_all_oob(<8 x i8> %fallback, <8 x i8> %a) {
 ; CHECK-LABEL: @vtbx1_all_oob(
-; CHECK-NEXT:    [[FALLBACK:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[FALLBACK1:%.*]], <8 x i8> [[A:%.*]], <8 x i8> splat (i8 99))
-; CHECK-NEXT:    ret <8 x i8> [[FALLBACK]]
+; CHECK-NEXT:    ret <8 x i8> [[FALLBACK:%.*]]
 ;
   %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
   ret <8 x i8> %tbx
@@ -190,7 +188,7 @@ define <8 x i8> @vtbl1_non_constant_mask(<8 x i8> %a, <8 x i8> %mask) {
 ; Mask with some poison elements should optimize, with poison propagating to output.
 define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) {
 ; CHECK-LABEL: @vtbl1_poison_mask_elements(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7>)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7>)
@@ -200,8 +198,7 @@ define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) {
 ; Mask with all poison elements should optimize to poison.
 define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) {
 ; CHECK-LABEL: @vtbl1_all_poison_mask(
-; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> poison)
-; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+; CHECK-NEXT:    ret <8 x i8> poison
 ;
   %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison)
   ret <8 x i8> %tbl


