[llvm-branch-commits] [clang] [llvm] [ConstantTime] Native ct.select support for ARM32 and Thumb (PR #166707)

Julius Alexandre via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri May 22 18:10:10 PDT 2026


https://github.com/wizardengineer updated https://github.com/llvm/llvm-project/pull/166707

>From ac82f346a3fe1b766055947a528555a39629a64b Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 10:56:34 -0500
Subject: [PATCH 1/4] [ConstantTime][Clang] Add __builtin_ct_select for
 constant-time selection

---
 clang/docs/LanguageExtensions.rst             |  44 ++
 clang/include/clang/Basic/Builtins.td         |   8 +
 clang/lib/CodeGen/CGBuiltin.cpp               |  13 +
 clang/lib/Sema/SemaChecking.cpp               |  64 ++
 .../test/Sema/builtin-ct-select-edge-cases.c  | 373 ++++++++++
 clang/test/Sema/builtin-ct-select.c           | 683 ++++++++++++++++++
 6 files changed, 1185 insertions(+)
 create mode 100644 clang/test/Sema/builtin-ct-select-edge-cases.c
 create mode 100644 clang/test/Sema/builtin-ct-select.c

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 03cb02deb5e7f..6f5cd5f95cdb0 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -7332,3 +7332,47 @@ Clang fails to reject some code that should be rejected. e.g.,
   // own initializer rather than rejecting the code with an undeclared identifier
   // diagnostic.
   auto x = x;
+
+.. _langext-__builtin_ct_select:
+
+``__builtin_ct_select``
+-----------------------
+
+``__builtin_ct_select`` performs a constant-time conditional selection between
+two values. Unlike the ternary operator ``?:``, this builtin is designed to
+execute in constant time regardless of the condition value, making it suitable
+for cryptographic and security-sensitive code where timing side-channels must
+be avoided.
+
+**Syntax**:
+
+.. code-block:: c++
+
+  __builtin_ct_select(condition, true_value, false_value)
+
+**Examples**:
+
+.. code-block:: c++
+
+  // Select between two integers
+  int result = __builtin_ct_select(secret_bit, value_a, value_b);
+
+  // Select between two pointers
+  int *ptr = __builtin_ct_select(condition, ptr_a, ptr_b);
+
+  // Select between two floating-point values
+  double d = __builtin_ct_select(flag, 1.0, 2.0);
+
+**Description**:
+
+The first argument is an integer condition that is converted to a boolean
+(non-zero is true, zero is false). The second and third arguments must have
+the same scalar or vector type. The builtin returns the second argument if
+the condition is true, otherwise the third argument.
+
+The operation is guaranteed to be lowered to constant-time machine code that
+does not branch on the condition value, preventing timing-based side-channel
+attacks.
+
+Query for this feature with ``__has_builtin(__builtin_ct_select)``.
+
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 40ec94ab75046..389754a37f7e3 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -5810,3 +5810,11 @@ def CountedByRef : Builtin {
   let Attributes = [NoThrow, CustomTypeChecking];
   let Prototype = "int(...)";
 }
+
+// Constant-time select builtin. All three arguments are evaluated at run time.
+def CtSelect : Builtin {
+  let Spellings = ["__builtin_ct_select"];
+  let Attributes = [NoThrow, Const, ConstIgnoringExceptions,
+                    CustomTypeChecking];
+  let Prototype = "void(...)";
+}
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index cac1628e68721..f69390b4ace57 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6668,6 +6668,19 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     auto Str = CGM.GetAddrOfConstantCString(Name, "");
     return RValue::get(Str.getPointer());
   }
+  case Builtin::BI__builtin_ct_select: {
+    auto *Cond = EmitScalarExpr(E->getArg(0));
+    auto *A = EmitScalarExpr(E->getArg(1));
+    auto *B = EmitScalarExpr(E->getArg(2));
+    // llvm.ct.select takes an i1 condition; compare wider integers against 0.
+    if (Cond->getType()->getIntegerBitWidth() != 1)
+      Cond = Builder.CreateICmpNE(
+          Cond, llvm::ConstantInt::get(Cond->getType(), 0), "cond.bool");
+    // The intrinsic is overloaded on the type of the selected operands.
+    llvm::Function *Fn =
+        CGM.getIntrinsic(llvm::Intrinsic::ct_select, {A->getType()});
+    return RValue::get(Builder.CreateCall(Fn, {Cond, A, B}));
+  }
   }
 
   // If this is an alias for a lib function (e.g. __builtin_sin), emit
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index cc834bbee23c4..e5a15c84de8d3 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3928,6 +3928,70 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     if (BuiltinCountedByRef(TheCall))
       return ExprError();
     break;
+
+  case Builtin::BI__builtin_ct_select: {
+    if (TheCall->getNumArgs() != 3) {
+      // Exactly three arguments are required: a condition and two operands.
+      if (TheCall->getNumArgs() < 3) {
+        return Diag(TheCall->getEndLoc(),
+                    diag::err_typecheck_call_too_few_args_at_least)
+               << 0 << 3 << TheCall->getNumArgs() << 0
+               << TheCall->getCallee()->getSourceRange();
+      } else {
+        return Diag(TheCall->getEndLoc(),
+                    diag::err_typecheck_call_too_many_args)
+               << 0 << 3 << TheCall->getNumArgs() << 0
+               << TheCall->getCallee()->getSourceRange();
+      }
+    }
+    auto *Cond = TheCall->getArg(0);
+    auto *A = TheCall->getArg(1);
+    auto *B = TheCall->getArg(2);
+
+    QualType CondTy = Cond->getType();
+    if (!CondTy->isIntegerType()) {
+      return Diag(Cond->getBeginLoc(), diag::err_typecheck_cond_expect_scalar)
+             << CondTy << Cond->getSourceRange();
+    }
+
+    ExprResult ARes = DefaultFunctionArrayLvalueConversion(A);
+    ExprResult BRes = DefaultFunctionArrayLvalueConversion(B);
+    if (ARes.isInvalid() || BRes.isInvalid())
+      return ExprError();
+
+    A = ARes.get();
+    B = BRes.get();
+    TheCall->setArg(1, A);
+    TheCall->setArg(2, B);
+
+    QualType ATy = A->getType();
+    QualType BTy = B->getType();
+
+    // Each operand must be of scalar or vector type.
+    if ((!ATy->isScalarType() && !ATy->isVectorType()) ||
+        (!BTy->isScalarType() && !BTy->isVectorType())) {
+      return Diag(A->getBeginLoc(),
+                  diag::err_typecheck_cond_incompatible_operands)
+             << ATy << BTy << A->getSourceRange() << B->getSourceRange();
+    }
+
+    // Both operands must have the same type; no implicit conversion is applied.
+    if (!Context.hasSameType(ATy, BTy)) {
+      // Any mismatch, arithmetic or not, is diagnosed as incompatible operands.
+      return Diag(A->getBeginLoc(),
+                  diag::err_typecheck_cond_incompatible_operands)
+             << ATy << BTy << A->getSourceRange() << B->getSourceRange();
+    }
+
+    QualType ResultTy = ATy;
+    ExprResult CondRes = PerformContextuallyConvertToBool(Cond);
+    if (CondRes.isInvalid())
+      return ExprError();
+
+    TheCall->setArg(0, CondRes.get());
+    TheCall->setType(ResultTy);
+    return TheCall;
+  }
   }
 
   if (getLangOpts().HLSL && HLSL().CheckBuiltinFunctionCall(BuiltinID, TheCall))
diff --git a/clang/test/Sema/builtin-ct-select-edge-cases.c b/clang/test/Sema/builtin-ct-select-edge-cases.c
new file mode 100644
index 0000000000000..167b19bf20663
--- /dev/null
+++ b/clang/test/Sema/builtin-ct-select-edge-cases.c
@@ -0,0 +1,373 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter
+
+// Test with various condition expressions
+int test_conditional_expressions(int x, int y, int a, int b) {
+  // Logical expressions
+  int result1 = __builtin_ct_select(x && y, a, b);
+  int result2 = __builtin_ct_select(x || y, a, b);
+  int result3 = __builtin_ct_select(!x, a, b);
+  
+  // Comparison expressions
+  int result4 = __builtin_ct_select(x == y, a, b);
+  int result5 = __builtin_ct_select(x != y, a, b);
+  int result6 = __builtin_ct_select(x < y, a, b);
+  int result7 = __builtin_ct_select(x > y, a, b);
+  int result8 = __builtin_ct_select(x <= y, a, b);
+  int result9 = __builtin_ct_select(x >= y, a, b);
+  
+  // Bitwise expressions
+  int result10 = __builtin_ct_select(x & y, a, b);
+  int result11 = __builtin_ct_select(x | y, a, b);
+  int result12 = __builtin_ct_select(x ^ y, a, b);
+  int result13 = __builtin_ct_select(~x, a, b);
+  
+  // Arithmetic expressions
+  int result14 = __builtin_ct_select(x + y, a, b);
+  int result15 = __builtin_ct_select(x - y, a, b);
+  int result16 = __builtin_ct_select(x * y, a, b);
+  int result17 = __builtin_ct_select(x / y, a, b);
+  int result18 = __builtin_ct_select(x % y, a, b);
+  
+  return result1 + result2 + result3 + result4 + result5 + result6 + result7 + result8 + result9 + result10 + result11 + result12 + result13 + result14 + result15 + result16 + result17 + result18;
+}
+
+// Test with extreme values
+int test_extreme_values(int cond) {
+  // Maximum and minimum values
+  int max_int = __builtin_ct_select(cond, __INT_MAX__, -__INT_MAX__ - 1);
+  
+  // Very large numbers
+  long long max_ll = __builtin_ct_select(cond, __LONG_LONG_MAX__, -__LONG_LONG_MAX__ - 1);
+  
+  // Floating point extremes
+  float max_float = __builtin_ct_select(cond, __FLT_MAX__, -__FLT_MAX__);
+  double max_double = __builtin_ct_select(cond, __DBL_MAX__, -__DBL_MAX__);
+  
+  return max_int;
+}
+
+// Test with zero and negative zero
+int test_zero_values(int cond) {
+  // Integer zeros
+  int zero_int = __builtin_ct_select(cond, 0, -0);
+  
+  // Floating point zeros
+  float zero_float = __builtin_ct_select(cond, 0.0f, -0.0f);
+  double zero_double = __builtin_ct_select(cond, 0.0, -0.0);
+  
+  return zero_int;
+}
+
+// Test with infinity and NaN
+int test_special_float_values(int cond) {
+  // Infinity
+  float inf_float = __builtin_ct_select(cond, __builtin_inff(), -__builtin_inff());
+  double inf_double = __builtin_ct_select(cond, __builtin_inf(), -__builtin_inf());
+  
+  // NaN
+  float nan_float = __builtin_ct_select(cond, __builtin_nanf(""), __builtin_nanf(""));
+  double nan_double = __builtin_ct_select(cond, __builtin_nan(""), __builtin_nan(""));
+  
+  return 0;
+}
+
+// Test with complex pointer scenarios
+int test_pointer_edge_cases(int cond) {
+  int arr[10];
+  int *ptr1 = arr;
+  int *ptr2 = arr + 5;
+  
+  // Array pointers
+  int *result1 = __builtin_ct_select(cond, ptr1, ptr2);
+  
+  // Pointer arithmetic
+  int *result2 = __builtin_ct_select(cond, arr + 1, arr + 2);
+  
+  // NULL vs non-NULL
+  int *result3 = __builtin_ct_select(cond, ptr1, (int*)0);
+  
+  // Different pointer types (should fail)
+  float *fptr = (float*)0;
+  int *result4 = __builtin_ct_select(cond, ptr1, fptr); // expected-error {{incompatible operand types ('int *' and 'float *')}}
+  
+  return *result1;
+}
+
+// Test with function pointers
+int func1(int x) { return x; }
+int func2(int x) { return x * 2; }
+float func3(float x) { return x; }
+
+int test_function_pointers(int cond, int x) {
+  // Same signature function pointer 
+  int (*fptr)(int) = __builtin_ct_select(cond, &func1, &func2);
+  
+  // Different signature function pointers (should fail)
+  int (*bad_fptr)(int) = __builtin_ct_select(cond, &func1, &func3); // expected-error {{incompatible operand types ('int (*)(int)' and 'float (*)(float)')}}
+  
+  return fptr(x);
+}
+
+// Test with void pointers
+void *test_void_pointers(int cond, void *a, void *b) {
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test with const/volatile qualifiers
+int test_qualifiers(int cond) {
+  const int ca = 10;
+  const int cb = 20;
+  volatile int va = 30;
+  volatile int vb = 40;
+  const volatile int cva = 50;
+  const volatile int cvb = 60;
+  
+  // const to const
+  const int result1 = __builtin_ct_select(cond, ca, cb);
+  
+  // volatile to volatile
+  volatile int result2 = __builtin_ct_select(cond, va, vb);
+  
+  // const volatile to const volatile
+  const volatile int result3 = __builtin_ct_select(cond, cva, cvb);
+  
+  return result1 + result2 + result3;
+}
+
+// Test with arrays (accepted: both operands decay to 'int *')
+int test_arrays(int cond) {
+  int arr1[5] = {1, 2, 3, 4, 5};
+  int arr2[5] = {6, 7, 8, 9, 10};
+  
+  // Accepted: arr1 and arr2 decay to 'int *', so the operand types match.
+  int *result = __builtin_ct_select(cond, arr1, arr2); 
+  
+  return result[0];
+}
+
+// Test with structures (should fail)
+struct Point {
+  int x, y;
+};
+
+struct Point test_structs(int cond) {
+  struct Point p1 = {1, 2};
+  struct Point p2 = {3, 4};
+  
+  return __builtin_ct_select(cond, p1, p2); // expected-error {{incompatible operand types ('struct Point' and 'struct Point')}}
+}
+
+// Test with unions (should fail)
+union Data {
+  int i;
+  float f;
+};
+
+union Data test_unions(int cond) {
+  union Data d1 = {.i = 10};
+  union Data d2 = {.i = 20};
+  
+  return __builtin_ct_select(cond, d1, d2); // expected-error {{incompatible operand types ('union Data' and 'union Data')}}
+}
+
+// Test with bit fields (should work as they're integers)
+struct BitField {
+  int a : 4;
+  int b : 4;
+};
+
+int test_bit_fields(int cond) {
+  struct BitField bf1 = {1, 2};
+  struct BitField bf2 = {3, 4};
+  
+  // Individual bit fields should work
+  int result1 = __builtin_ct_select(cond, bf1.a, bf2.a);
+  int result2 = __builtin_ct_select(cond, bf1.b, bf2.b);
+  
+  return result1 + result2;
+}
+
+// Test with designated initializers
+int test_designated_init(int cond) {
+  int arr1[3] = {[0] = 1, [1] = 2, [2] = 3};
+  int arr2[3] = {[0] = 4, [1] = 5, [2] = 6};
+  
+  // Access specific elements
+  int result1 = __builtin_ct_select(cond, arr1[0], arr2[0]);
+  int result2 = __builtin_ct_select(cond, arr1[1], arr2[1]);
+  
+  return result1 + result2;
+}
+
+// Test with complex expressions in arguments
+int complex_expr(int x) { return x * x; }
+
+int test_complex_arguments(int cond, int x, int y) {
+  // Function calls as arguments
+  int result1 = __builtin_ct_select(cond, complex_expr(x), complex_expr(y));
+  
+  // Ternary operator as arguments
+  int result2 = __builtin_ct_select(cond, x > 0 ? x : -x, y > 0 ? y : -y);
+  
+  // Compound literals
+  int result3 = __builtin_ct_select(cond, (int){x}, (int){y});
+  
+  return result1 + result2 + result3;
+}
+
+// Test with preprocessor macros
+#define MACRO_A 42
+#define MACRO_B 24
+#define MACRO_COND(x) (x > 0)
+
+int test_macros(int x) {
+  int result1 = __builtin_ct_select(MACRO_COND(x), MACRO_A, MACRO_B);
+  
+  // Nested macros
+  #define NESTED_SELECT(c, a, b) __builtin_ct_select(c, a, b)
+  int result2 = NESTED_SELECT(x, 10, 20);
+  
+  return result1 + result2;
+}
+
+// Test with string literals (accepted: both decay to 'char *')
+const char *test_strings(int cond) {
+  return __builtin_ct_select(cond, "hello", "world"); 
+}
+
+// Test with variable length arrays (VLA)
+int test_vla(int cond, int n) {
+  int vla1[n];
+  int vla2[n];
+  
+  // Individual elements should work
+  vla1[0] = 1;
+  vla2[0] = 2;
+  int result = __builtin_ct_select(cond, vla1[0], vla2[0]); 
+  
+  return result;
+}
+
+// Test with typedef
+typedef int MyInt;
+typedef float MyFloat;
+
+MyInt test_typedef(int cond, MyInt a, MyInt b) {
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test with different typedef types (should fail)
+MyInt test_different_typedef(int cond, MyInt a, MyFloat b) {
+  return __builtin_ct_select(cond, a, b); // expected-error {{incompatible operand types ('MyInt' (aka 'int') and 'MyFloat' (aka 'float'))}}
+}
+
+// Test with side effects (should be evaluated)
+int side_effect_counter = 0;
+int side_effect_func(int x) {
+  side_effect_counter++;
+  return x;
+}
+
+int test_side_effects(int cond) {
+  // Both arguments should be evaluated
+  int result = __builtin_ct_select(cond, side_effect_func(10), side_effect_func(20));
+  return result;
+}
+
+// Test with goto labels (context where expressions are used)
+int test_goto_context(int cond, int a, int b) {
+  int result = __builtin_ct_select(cond, a, b);
+  
+  if (result > 0) {
+    goto positive;
+  } else {
+    goto negative;
+  }
+  
+positive:
+  return result;
+  
+negative:
+  return -result;
+}
+
+// Test with switch statements
+int test_switch_context(int cond, int a, int b) {
+  int result = __builtin_ct_select(cond, a, b);
+  
+  switch (result) {
+    case 0:
+      return 0;
+    case 1:
+      return 1;
+    default:
+      return -1;
+  }
+}
+
+// Test with loops
+int test_loop_context(int cond, int a, int b) {
+  int result = __builtin_ct_select(cond, a, b);
+  int sum = 0;
+  
+  for (int i = 0; i < result; i++) {
+    sum += i;
+  }
+  
+  return sum;
+}
+
+// Test with recursive functions
+int factorial(int n) {
+  if (n <= 1) return 1;
+  return n * factorial(n - 1);
+}
+
+int test_recursive(int cond, int n) {
+  int result = __builtin_ct_select(cond, n, n + 1);
+  return factorial(result);
+}
+
+// Test with inline functions
+static inline int inline_func(int x) {
+  return x * 2;
+}
+
+int test_inline(int cond, int a, int b) {
+  return __builtin_ct_select(cond, inline_func(a), inline_func(b));
+}
+
+// Test with static variables
+int test_static_vars(int cond) {
+  static int static_a = 10;
+  static int static_b = 20;
+  
+  return __builtin_ct_select(cond, static_a, static_b);
+}
+
+// Test with extern variables
+extern int extern_a;
+extern int extern_b;
+
+int test_extern_vars(int cond) {
+  return __builtin_ct_select(cond, extern_a, extern_b);
+}
+
+// Test with register variables
+int test_register_vars(int cond) {
+  register int reg_a = 30;
+  register int reg_b = 40;
+  
+  return __builtin_ct_select(cond, reg_a, reg_b);
+}
+
+// Test with thread-local variables (C11)
+#if __STDC_VERSION__ >= 201112L
+_Thread_local int tls_a = 50;
+_Thread_local int tls_b = 60;
+
+int test_tls_vars(int cond) {
+  return __builtin_ct_select(cond, tls_a, tls_b);
+}
+#endif
diff --git a/clang/test/Sema/builtin-ct-select.c b/clang/test/Sema/builtin-ct-select.c
new file mode 100644
index 0000000000000..7f2d9291299d6
--- /dev/null
+++ b/clang/test/Sema/builtin-ct-select.c
@@ -0,0 +1,683 @@
+// RUN: %clang_cc1 -emit-llvm -o - %s | FileCheck %s
+
+// Test integer types
+int test_int(int cond, int a, int b) {
+  // CHECK-LABEL: define {{.*}} @test_int
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+long long test_long(int cond, long long a, long long b) {
+  // CHECK-LABEL: define {{.*}} @test_long
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}})
+  // CHECK: ret i64 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+short test_short(int cond, short a, short b) {
+  // CHECK-LABEL: define {{.*}} @test_short
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i16 @llvm.ct.select.i16(i1 [[COND]], i16 %{{.*}}, i16 %{{.*}})
+  // CHECK: ret i16 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+unsigned char test_uchar(int cond, unsigned char a, unsigned char b) {
+  // CHECK-LABEL: define {{.*}} @test_uchar
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i8 @llvm.ct.select.i8(i1 [[COND]], i8 %{{.*}}, i8 %{{.*}})
+  // CHECK: ret i8 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+long long test_longlong(int cond, long long a, long long b) {
+  // CHECK-LABEL: define {{.*}} @test_longlong
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}})
+  // CHECK: ret i64 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test floating point types
+float test_float(int cond, float a, float b) {
+  // CHECK-LABEL: define {{.*}} @test_float
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}})
+  // CHECK: ret float [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+double test_double(int cond, double a, double b) {
+  // CHECK-LABEL: define {{.*}} @test_double
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}})
+  // CHECK: ret double [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test pointer types
+int *test_pointer(int cond, int *a, int *b) {
+  // CHECK-LABEL: define {{.*}} @test_pointer
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[COND]], ptr %{{.*}}, ptr %{{.*}})
+  // CHECK: ret ptr [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test with different condition types
+int test_char_cond(char cond, int a, int b) {
+  // CHECK-LABEL: define {{.*}} @test_char_cond
+  // CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+int test_long_cond(long long cond, int a, int b) {
+  // CHECK-LABEL: define {{.*}} @test_long_cond
+  // CHECK: [[COND:%.*]] = icmp ne i64 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test with boolean condition
+int test_bool_cond(_Bool cond, int a, int b) {
+  // CHECK-LABEL: define {{.*}} @test_bool_cond
+  // CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test with constants
+int test_constant_cond(void) {
+  // CHECK-LABEL: define {{.*}} @test_constant_cond
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 true, i32 42, i32 24)
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(1, 42, 24);
+}
+
+int test_zero_cond(void) {
+  // CHECK-LABEL: define {{.*}} @test_zero_cond
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 false, i32 42, i32 24)
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(0, 42, 24);
+}
+
+// Test type promotion
+int test_promotion(int cond, short a, short b) {
+  // CHECK-LABEL: define {{.*}} @test_promotion
+  // CHECK-DAG: [[A_EXT:%.*]] = sext i16 %{{.*}} to i32
+  // CHECK-DAG: [[B_EXT:%.*]] = sext i16 %{{.*}} to i32
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 [[A_EXT]], i32 [[B_EXT]])
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(cond, (int)a, (int)b);
+}
+
+// Test mixed signedness
+unsigned int test_mixed_signedness(int cond, int a, unsigned int b) {
+  // CHECK-LABEL: define {{.*}} @test_mixed_signedness
+  // CHECK-DAG: [[A_EXT:%.*]] = sext i32 %{{.*}} to i64
+  // CHECK-DAG: [[B_EXT:%.*]] = zext i32 %{{.*}} to i64
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 [[A_EXT]], i64 [[B_EXT]])
+  // CHECK: [[RESULT_TRUNC:%.*]] = trunc i64 [[RESULT]] to i32
+  // CHECK: ret i32 [[RESULT_TRUNC]]
+  return __builtin_ct_select(cond, (long long)a, (long long)b);
+}
+
+// Test complex expression
+int test_complex_expr_alt(int x, int y) {
+  // CHECK-LABEL: define {{.*}} @test_complex_expr_alt
+  // CHECK-DAG: [[CMP:%.*]] = icmp sgt i32 %{{.*}}, 0
+  // CHECK-DAG: [[ADD:%.*]] = add nsw i32 %{{.*}}, %{{.*}}
+  // CHECK-DAG: [[SUB:%.*]] = sub nsw i32 %{{.*}}, %{{.*}}
+  // Separate the final sequence to ensure proper ordering
+  // CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP]], i32 [[ADD]], i32 [[SUB]])
+  // CHECK-NEXT: ret i32 [[RESULT]]
+  return __builtin_ct_select(x > 0, x + y, x - y);
+}
+
+// Test nested calls
+int test_nested_structured(int cond1, int cond2, int a, int b, int c) {
+  // CHECK-LABEL: define {{.*}} @test_nested_structured
+  // Phase 1: Conditions (order doesn't matter)
+  // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0
+  
+  // Phase 2: Inner select (must happen before outer)
+  // CHECK: [[INNER:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}})
+  
+  // Phase 3: Outer select (must use inner result)
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 [[INNER]], i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(cond1, __builtin_ct_select(cond2, a, b), c);
+}
+
+// Test with function calls
+int helper(int x) { return x * 2; }
+int test_function_calls(int cond, int x, int y) {
+  // CHECK-LABEL: define {{.*}} @test_function_calls
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[CALL1:%.*]] = call i32 @helper(i32 noundef %{{.*}})
+  // CHECK-DAG: [[CALL2:%.*]] = call i32 @helper(i32 noundef %{{.*}})
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 [[CALL1]], i32 [[CALL2]])
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(cond, helper(x), helper(y));
+}
+
+// Test using ct_select as condition for another ct_select
+int test_intrinsic_condition(int cond1, int cond2, int a, int b, int c, int d) {
+  // CHECK-LABEL: define {{.*}} @test_intrinsic_condition
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[INNER_COND:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 [[INNER_COND]], 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(__builtin_ct_select(cond1, cond2, a), b, c);
+}
+
+// Test using comparison result of ct_select as condition
+int test_comparison_condition(int cond, int a, int b, int c, int d) {
+  // CHECK-LABEL: define {{.*}} @test_comparison_condition
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: [[CMP:%.*]] = icmp sgt i32 [[FIRST_SELECT]], %{{.*}}
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(__builtin_ct_select(cond, a, b) > c, d, a);
+}
+
+// Test using ct_select result in arithmetic as condition
+int test_arithmetic_condition(int cond, int a, int b, int c, int d) {
+  // CHECK-LABEL: define {{.*}} @test_arithmetic_condition
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: [[ADD:%.*]] = add nsw i32 [[FIRST_SELECT]], %{{.*}}
+  // CHECK: [[FINAL_COND:%.*]] = icmp ne i32 [[ADD]], 0
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(__builtin_ct_select(cond, a, b) + c, d, a);
+}
+
+// Test chained ct_select as conditions
+int test_chained_conditions(int cond1, int cond2, int cond3, int a, int b, int c, int d, int e) {
+  // CHECK-LABEL: define {{.*}} @test_chained_conditions
+  // CHECK: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[FIRST:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[SECOND:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  int first_select = __builtin_ct_select(cond1, a, b);
+  int second_select = __builtin_ct_select(cond2, first_select, c);
+  return __builtin_ct_select(second_select, d, e);
+}
+
+// Test using ct_select with pointer condition
+//int test_pointer_condition(int *ptr1, int *ptr2, int a, int b, int c) {
+  // NO-CHECK-LABEL: define {{.*}} @test_pointer_condition
+  // NO-CHECK: [[PTR_COND:%.*]] = icmp ne ptr %{{.*}}, null
+  // NO-CHECK: [[PTR_SELECT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[PTR_COND]], ptr %{{.*}}, ptr %{{.*}})
+  // NO-CHECK: [[FINAL_COND:%.*]] = icmp ne ptr [[PTR_SELECT]], null
+  // NO-CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}})
+  // NO-CHECK: ret i32 [[RESULT]]
+//  return __builtin_ct_select(__builtin_ct_select(ptr1, ptr1, ptr2), a, b);
+//}
+
+
+// Test using ct_select result in logical operations as condition
+int test_logical_condition(int cond1, int cond2, int a, int b, int c, int d) {
+  // CHECK-LABEL: define {{.*}} @test_logical_condition
+  // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[FIRST_SELECT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK-DAG: [[SELECT_BOOL:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(__builtin_ct_select(cond1, a, b) && cond2, c, d);
+}
+
+// Test multiple levels of ct_select as conditions
+int test_deep_condition_nesting(int cond1, int cond2, int cond3, int a, int b, int c, int d, int e, int f) {
+  // CHECK-LABEL: define {{.*}} @test_deep_condition_nesting
+  // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[INNER1:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND2]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK-DAG: [[INNER1_COND:%.*]] = icmp ne i32 [[INNER1]], 0
+  // CHECK-DAG: [[INNER2:%.*]] = call i32 @llvm.ct.select.i32(i1 [[INNER1_COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK-DAG: [[OUTER:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND1]], i32 [[INNER2]], i32 %{{.*}})
+  // CHECK-DAG: [[FINAL_COND:%.*]] = icmp ne i32 [[OUTER]], 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[FINAL_COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(__builtin_ct_select(cond1, __builtin_ct_select(__builtin_ct_select(cond2, a, b), c, d), e), f, a);
+}
+
+// Test ct_select with complex condition expressions
+int test_complex_condition_expr(int x, int y, int z, int a, int b) {
+  // CHECK-LABEL: define {{.*}} @test_complex_condition_expr
+  // CHECK: [[CMP1:%.*]] = icmp sgt i32 %{{.*}}, %{{.*}}
+  // CHECK: [[SELECT1:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP1]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: [[CMP2:%.*]] = icmp slt i32 [[SELECT1]], %{{.*}}
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[CMP2]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  return __builtin_ct_select(__builtin_ct_select(x > y, x, y) < z, a, b);
+}
+
+// Test vector types - 128-bit vectors
+typedef int __attribute__((vector_size(16))) int4;
+typedef float __attribute__((vector_size(16))) float4;
+typedef short __attribute__((vector_size(16))) short8;
+typedef char __attribute__((vector_size(16))) char16;
+
+int4 test_vector_int4(int cond, int4 a, int4 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_int4
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: ret <4 x i32> [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+float4 test_vector_float4(int cond, float4 a, float4 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_float4
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: ret <4 x float> [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+short8 test_vector_short8(int cond, short8 a, short8 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_short8
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <8 x i16> @llvm.ct.select.v8i16(i1 [[COND]], <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK: ret <8 x i16> [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+char16 test_vector_char16(int cond, char16 a, char16 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_char16
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <16 x i8> @llvm.ct.select.v16i8(i1 [[COND]], <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK: ret <16 x i8> [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test 256-bit vectors
+typedef int __attribute__((vector_size(32))) int8;
+typedef float __attribute__((vector_size(32))) float8;
+typedef double __attribute__((vector_size(32))) double4;
+
+int8 test_vector_int8(int cond, int8 a, int8 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_int8
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call <8 x i32> @llvm.ct.select.v8i32(i1 [[COND]], <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return __builtin_ct_select(cond, a, b);
+}
+
+float8 test_vector_float8(int cond, float8 a, float8 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_float8
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call <8 x float> @llvm.ct.select.v8f32(i1 [[COND]], <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return __builtin_ct_select(cond, a, b);
+}
+
+double4 test_vector_double4(int cond, double4 a, double4 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_double4
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call <4 x double> @llvm.ct.select.v4f64(i1 [[COND]], <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test 512-bit vectors
+typedef int __attribute__((vector_size(64))) int16;
+typedef float __attribute__((vector_size(64))) float16;
+
+int16 test_vector_int16(int cond, int16 a, int16 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_int16
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <16 x i32> @llvm.ct.select.v16i32(i1 [[COND]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  return __builtin_ct_select(cond, a, b);
+}
+
+float16 test_vector_float16(int cond, float16 a, float16 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_float16
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <16 x float> @llvm.ct.select.v16f32(i1 [[COND]], <16 x float> %{{.*}}, <16 x float> %{{.*}})
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test vector operations with different condition types
+int4 test_vector_char_cond(char cond, int4 a, int4 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_char_cond
+  // CHECK: [[COND:%.*]] = icmp ne i8 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: ret <4 x i32> [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+float4 test_vector_long_cond(long long cond, float4 a, float4 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_long_cond
+  // CHECK: [[COND:%.*]] = icmp ne i64 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: ret <4 x float> [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test vector constants
+int4 test_vector_constant_cond(void) {
+  // CHECK-LABEL: define {{.*}} @test_vector_constant_cond
+  // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: ret <4 x i32> [[RESULT]]
+  int4 a = {1, 2, 3, 4};
+  int4 b = {5, 6, 7, 8};
+  return __builtin_ct_select(1, a, b);
+}
+
+float4 test_vector_zero_cond(void) {
+  // CHECK-LABEL: define {{.*}} @test_vector_zero_cond
+  // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 false, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: ret <4 x float> [[RESULT]]
+  float4 a = {1.0f, 2.0f, 3.0f, 4.0f};
+  float4 b = {5.0f, 6.0f, 7.0f, 8.0f};
+  return __builtin_ct_select(0, a, b);
+}
+
+// Test nested vector selections
+int4 test_vector_nested(int cond1, int cond2, int4 a, int4 b, int4 c) {
+  // CHECK-LABEL: define {{.*}} @test_vector_nested
+  // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[INNER:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND2]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND1]], <4 x i32> [[INNER]], <4 x i32> %{{.*}})
+  // CHECK: ret <4 x i32> [[RESULT]]
+  return __builtin_ct_select(cond1, __builtin_ct_select(cond2, a, b), c);
+}
+
+// Test vector selection with complex expressions
+float4 test_vector_complex_expr(int x, int y, float4 a, float4 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_complex_expr
+  // CHECK: [[CMP:%.*]] = icmp sgt i32 %{{.*}}, %{{.*}}
+  // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[CMP]], <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: ret <4 x float> [[RESULT]]
+  return __builtin_ct_select(x > y, a, b);
+}
+
+// Test vector with different element sizes
+typedef long long __attribute__((vector_size(16))) long2;
+typedef double __attribute__((vector_size(16))) double2;
+
+long2 test_vector_long2(int cond, long2 a, long2 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_long2
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <2 x i64> @llvm.ct.select.v2i64(i1 [[COND]], <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  // CHECK: ret <2 x i64> [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+double2 test_vector_double2(int cond, double2 a, double2 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_double2
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <2 x double> @llvm.ct.select.v2f64(i1 [[COND]], <2 x double> %{{.*}}, <2 x double> %{{.*}})
+  // CHECK: ret <2 x double> [[RESULT]]
+  return __builtin_ct_select(cond, a, b);
+}
+
+// Test mixed vector operations
+int4 test_vector_from_scalar_condition(int4 vec_cond, int4 a, int4 b) {
+  // CHECK-LABEL: define {{.*}} @test_vector_from_scalar_condition
+  // Extract first element and use as condition
+  int scalar_cond = vec_cond[0];
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <4 x i32> @llvm.ct.select.v4i32(i1 [[COND]], <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: ret <4 x i32> [[RESULT]]
+  return __builtin_ct_select(scalar_cond, a, b);
+}
+
+// Test vector chaining
+float4 test_vector_chaining(int cond1, int cond2, int cond3, float4 a, float4 b, float4 c, float4 d) {
+  // CHECK-LABEL: define {{.*}} @test_vector_chaining
+  // CHECK-DAG: [[COND1:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[COND2:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[COND3:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[FIRST:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND1]], <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK-DAG: [[SECOND:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND2]], <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK-DAG: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND3]], <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: ret <4 x float> [[RESULT]]
+  float4 first = __builtin_ct_select(cond1, a, b);
+  float4 second = __builtin_ct_select(cond2, first, c);
+  return __builtin_ct_select(cond3, second, d);
+}
+
+// Test special floating point values - NaN
+float test_nan_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_nan_operands
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float  %{{.*}}, float 1.000000e+00)
+  // CHECK: ret float [[RESULT]]
+  float nan_val = __builtin_nanf("");
+  return __builtin_ct_select(cond, nan_val, 1.0f);
+}
+
+double test_nan_double_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_nan_double_operands
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double 2.000000e+00)
+  // CHECK: ret double [[RESULT]]
+  double nan_val = __builtin_nan("");
+  return __builtin_ct_select(cond, nan_val, 2.0);
+}
+
+// Test infinity values
+float test_infinity_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_infinity_operands
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}})
+  // CHECK: ret float [[RESULT]]
+  float pos_inf = __builtin_inff();
+  float neg_inf = -__builtin_inff();
+  return __builtin_ct_select(cond, pos_inf, neg_inf);
+}
+
+double test_infinity_double_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_infinity_double_operands
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}})
+  // CHECK: ret double [[RESULT]]
+  double pos_inf = __builtin_inf();
+  double neg_inf = -__builtin_inf();
+  return __builtin_ct_select(cond, pos_inf, neg_inf);
+}
+
+// Test subnormal/denormal values
+float test_subnormal_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_subnormal_operands
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call float @llvm.ct.select.f32(i1 [[COND]], float %{{.*}}, float %{{.*}})
+  // CHECK: ret float [[RESULT]]
+  // Very small subnormal values
+  float subnormal1 = 1e-40f;
+  float subnormal2 = 1e-45f;
+  return __builtin_ct_select(cond, subnormal1, subnormal2);
+}
+
+// Test integer overflow boundaries
+int test_integer_overflow_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_integer_overflow_operands
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  int max_int = __INT_MAX__;
+  int min_int = (-__INT_MAX__ - 1);
+  return __builtin_ct_select(cond, max_int, min_int);
+}
+
+long long test_longlong_overflow_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_longlong_overflow_operands
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i64 @llvm.ct.select.i64(i1 [[COND]], i64 %{{.*}}, i64 %{{.*}})
+  // CHECK: ret i64 [[RESULT]]
+  long long max_ll = __LONG_LONG_MAX__;
+  long long min_ll = (-__LONG_LONG_MAX__ - 1);
+  return __builtin_ct_select(cond, max_ll, min_ll);
+}
+
+// Test unsigned overflow boundaries
+unsigned int test_unsigned_overflow_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_unsigned_overflow_operands
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  unsigned int max_uint = 4294967295;
+  unsigned int min_uint = 0;
+  return __builtin_ct_select(cond, max_uint, min_uint);
+}
+
+// Test selecting between a null pointer and a valid pointer (no dereference)
+int* test_null_pointer_operands(int cond, int* valid_ptr) {
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call ptr @llvm.ct.select.p0(i1 [[COND]], ptr %{{.*}}, ptr %{{.*}})
+  // CHECK: ret ptr [[RESULT]]
+  int* null_ptr = (int*)0;
+  return __builtin_ct_select(cond, null_ptr, valid_ptr);
+}
+
+// Test volatile operations
+volatile int global_volatile = 42;
+int test_volatile_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_volatile_operands
+  // CHECK-DAG: [[VOLATILE_LOAD:%.*]] = load volatile i32, ptr {{.*}}
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 100)
+  // CHECK: ret i32 [[RESULT]]
+  volatile int vol_val = global_volatile;
+  return __builtin_ct_select(cond, vol_val, 100);
+}
+
+// Test uninitialized variable behavior (should still work with ct_select)
+int test_uninitialized_operands(int cond, int initialized) {
+  // CHECK-LABEL: define {{.*}} @test_uninitialized_operands
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  int uninitialized; // Intentionally uninitialized
+  return __builtin_ct_select(cond, uninitialized, initialized);
+}
+
+// Test zero division avoidance patterns
+int test_division_by_zero_avoidance(int cond, int dividend, int divisor) {
+  // CHECK-LABEL: define {{.*}} @test_division_by_zero_avoidance
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[DIV_RESULT:%.*]] = sdiv i32 %{{.*}}, %{{.*}}
+  // CHECK-DAG: [[SAFE_DIVISOR:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 1)
+  // First get a safe divisor (never zero)
+  int safe_divisor = __builtin_ct_select(divisor != 0, divisor, 1);
+  // Then perform division with guaranteed non-zero divisor
+  return dividend / safe_divisor;
+}
+
+// Test array bounds checking patterns
+int test_array_bounds_protection(int cond, int index, int* array) {
+  // CHECK-LABEL: define {{.*}} @test_array_bounds_protection
+  // CHECK-DAG: [[SAFE_INDEX:%.*]] = call i32 @llvm.ct.select.i32(i1 {{.*}}, i32 %{{.*}}, i32 0)
+  // Use ct_select to ensure safe array indexing
+  int safe_index = __builtin_ct_select(index >= 0 && index < 10, index, 0);
+  return array[safe_index];
+}
+
+// Test ct_select on the results of shifts by 31 (largest in-range amount)
+unsigned int test_bit_manipulation_edge_cases(int cond, unsigned int value) {
+  // CHECK-LABEL: define {{.*}} @test_bit_manipulation_edge_cases
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[SHIFT_LEFT:%.*]] = shl i32 %{{.*}}, 31
+  // CHECK-DAG: [[SHIFT_RIGHT:%.*]] = lshr i32 %{{.*}}, 31
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  // Shifting a 32-bit unsigned value by 31 is well defined (no UB involved)
+  unsigned int left_shift = value << 31;   // Moves bit 0 up to bit 31
+  unsigned int right_shift = value >> 31;  // Extracts the top bit (bit 31)
+  return __builtin_ct_select(cond, left_shift, right_shift);
+}
+
+// Test signed integer wraparound
+int test_signed_wraparound(int cond, int a, int b) {
+  // CHECK-LABEL: define {{.*}} @test_signed_wraparound
+  // CHECK-DAG: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK-DAG: [[ADD:%.*]] = add nsw i32 %{{.*}}, %{{.*}}
+  // CHECK-DAG: [[SUB:%.*]] = sub nsw i32 %{{.*}}, %{{.*}}
+  // CHECK-DAG: [[RESULT:%.*]] = call i32 @llvm.ct.select.i32(i1 [[COND]], i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret i32 [[RESULT]]
+  int sum = a + b;      // Could overflow
+  int diff = a - b;     // Could underflow
+  return __builtin_ct_select(cond, sum, diff);
+}
+
+// Test vector NaN handling
+float4 test_vector_nan_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_vector_nan_operands
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: ret <4 x float> [[RESULT]]
+  float nan_val = __builtin_nanf("");
+  float4 nan_vec = {nan_val, nan_val, nan_val, nan_val};
+  float4 normal_vec = {1.0f, 2.0f, 3.0f, 4.0f};
+  return __builtin_ct_select(cond, nan_vec, normal_vec);
+}
+
+// Test vector infinity handling
+float4 test_vector_infinity_operands(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_vector_infinity_operands
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.ct.select.v4f32(i1 [[COND]], <4 x float> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: ret <4 x float> [[RESULT]]
+  float pos_inf = __builtin_inff();
+  float neg_inf = -__builtin_inff();
+  float4 inf_vec = {pos_inf, neg_inf, pos_inf, neg_inf};
+  float4 zero_vec = {0.0f, 0.0f, 0.0f, 0.0f};
+  return __builtin_ct_select(cond, inf_vec, zero_vec);
+}
+
+// Test mixed special values
+double test_mixed_special_values(int cond) {
+  // CHECK-LABEL: define {{.*}} @test_mixed_special_values
+  // CHECK: [[COND:%.*]] = icmp ne i32 %{{.*}}, 0
+  // CHECK: [[RESULT:%.*]] = call double @llvm.ct.select.f64(i1 [[COND]], double %{{.*}}, double %{{.*}})
+  // CHECK: ret double [[RESULT]]
+  double nan_val = __builtin_nan("");
+  double inf_val = __builtin_inf();
+  return __builtin_ct_select(cond, nan_val, inf_val);
+}
+
+// Test constant-time memory access pattern
+int test_constant_time_memory_access(int secret_index, int* data_array) {
+  // CHECK-LABEL: define {{.*}} @test_constant_time_memory_access
+  // This pattern ensures constant-time memory access regardless of secret_index value
+  int result = 0;
+  // Use ct_select to accumulate values without revealing the secret index
+  for (int i = 0; i < 8; i++) {
+    int is_target = (i == secret_index);
+    int current_value = data_array[i];
+    int selected_value = __builtin_ct_select(is_target, current_value, 0);
+    result += selected_value;
+  }
+  return result;
+}
+
+// Test timing-attack resistant comparison
+int test_timing_resistant_comparison(const char* secret, const char* guess) {
+  // CHECK-LABEL: define {{.*}} @test_timing_resistant_comparison
+  // Uses bitwise &/| so no short-circuit branch depends on the secret bytes
+  int match = 1;
+  for (int i = 0; i < 32; i++) {
+    int chars_equal = (secret[i] == guess[i]);
+    int both_null = (secret[i] == 0) & (guess[i] == 0);
+    int still_matching = __builtin_ct_select(chars_equal | both_null, match, 0);
+    match = __builtin_ct_select(both_null, match, still_matching);
+  }
+  return match;
+}

>From dc7ae68740548e19ee74616eb64c10ac43657ee3 Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 17:10:05 -0500
Subject: [PATCH 2/4] [LLVM][ARM] Add native ct.select support for ARM32 and
 Thumb

This patch implements architecture-specific lowering for ct.select on ARM
(both ARM32 and Thumb modes) using conditional move instructions and
bitwise operations for constant-time selection.

Implementation details:
- Uses pseudo-instructions that are expanded Post-RA to bitwise operations
- Post-RA expansion in ARMBaseInstrInfo for BUNDLE pseudo-instructions
- Handles scalar integer types, floating-point, and half-precision types
- Handles vector types with NEON when available
- Support for both ARM and Thumb instruction sets (Thumb1 and Thumb2)
- Special handling for Thumb1 which lacks conditional execution
- Comprehensive test coverage including half-precision and vectors

The implementation includes:
- ISelLowering: Custom lowering to CTSELECT pseudo-instructions
- ISelDAGToDAG: Selection of appropriate pseudo-instructions
- BaseInstrInfo: Post-RA expansion of BUNDLE to bitwise instruction sequences
- InstrInfo.td: Pseudo-instruction definitions for different types
- TargetMachine: Registration of Post-RA expansion pass
- Proper handling of condition codes and register allocation constraints
---
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp |  337 +++-
 llvm/lib/Target/ARM/ARMBaseInstrInfo.h   |    6 +
 llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp  |   86 +
 llvm/lib/Target/ARM/ARMISelLowering.cpp  |  373 ++++-
 llvm/lib/Target/ARM/ARMISelLowering.h    |    4 +
 llvm/lib/Target/ARM/ARMInstrInfo.td      |  187 +++
 llvm/lib/Target/ARM/ARMTargetMachine.cpp |    8 +-
 llvm/test/CodeGen/ARM/ctselect-half.ll   |  867 ++++++++++
 llvm/test/CodeGen/ARM/ctselect-vector.ll | 1839 ++++++++++++++++++++++
 llvm/test/CodeGen/ARM/ctselect.ll        |  549 +++++++
 10 files changed, 4187 insertions(+), 69 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/ctselect-half.ll
 create mode 100644 llvm/test/CodeGen/ARM/ctselect-vector.ll
 create mode 100644 llvm/test/CodeGen/ARM/ctselect.ll

diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 3b1b8673e56a0..b1e3a5065b06f 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1524,18 +1524,351 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
   BB->erase(MI);
 }
 
+// Post-RA: expands a vector ctselect pseudo into a bundled bitwise select.
+bool ARMBaseInstrInfo::expandCtSelectVector(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  Register DestReg = MI.getOperand(0).getReg();
+  Register MaskReg = MI.getOperand(1).getReg();
+
+  // Default to the D-register opcodes; replaced by Q-register ones below.
+  unsigned AndOp = ARM::VANDd;
+  unsigned BicOp = ARM::VBICd;
+  unsigned OrrOp = ARM::VORRd;
+  unsigned BroadcastOp = ARM::VDUP32d;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(DestReg);
+
+  if (ARM::QPRRegClass.hasSubClassEq(RC)) {
+    AndOp = ARM::VANDq;
+    BicOp = ARM::VBICq;
+    OrrOp = ARM::VORRq;
+    BroadcastOp = ARM::VDUP32q;
+  }
+
+  unsigned RsbOp = Subtarget.isThumb2() ? ARM::t2RSBri : ARM::RSBri;
+
+  // Any vector pseudo has: ((outs $dst, $tmp_mask, $bcast_mask), (ins $src1,
+  // $src2, $cond))
+  Register VectorMaskReg = MI.getOperand(2).getReg();
+  Register Src1Reg = MI.getOperand(3).getReg();
+  Register Src2Reg = MI.getOperand(4).getReg();
+  Register CondReg = MI.getOperand(5).getReg();
+
+  // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask)
+
+  // 1. mask = 0 - cond. NOTE(review): assumes cond is exactly 0 or 1.
+  // When cond = 0: mask = 0x00000000.
+  // When cond = 1: mask = 0xFFFFFFFF.
+
+  MachineInstr *FirstNewMI = BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg)
+                                 .addReg(CondReg)
+                                 .addImm(0)
+                                 .add(predOps(ARMCC::AL))
+                                 .add(condCodeOp())
+                                 .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 2. A = src1 & mask
+  // The scalar mask is first broadcast into the vector mask register.
+  BuildMI(*MBB, MI, DL, get(BroadcastOp), VectorMaskReg)
+      .addReg(MaskReg)
+      .add(predOps(ARMCC::AL))
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  BuildMI(*MBB, MI, DL, get(AndOp), DestReg)
+      .addReg(Src1Reg)
+      .addReg(VectorMaskReg)
+      .add(predOps(ARMCC::AL))
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 3. B = src2 & ~mask (B overwrites the broadcast mask register)
+  BuildMI(*MBB, MI, DL, get(BicOp), VectorMaskReg)
+      .addReg(Src2Reg)
+      .addReg(VectorMaskReg)
+      .add(predOps(ARMCC::AL))
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 4. result = A | B
+  auto LastNewMI = BuildMI(*MBB, MI, DL, get(OrrOp), DestReg)
+                       .addReg(DestReg)
+                       .addReg(VectorMaskReg)
+                       .add(predOps(ARMCC::AL))
+                       .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  auto BundleStart = FirstNewMI->getIterator();
+  auto BundleEnd = LastNewMI->getIterator();
+
+  // Bundle everything from the RSB through the final ORR (inclusive).
+  finalizeBundle(*MBB, BundleStart, std::next(BundleEnd));
+
+  MI.eraseFromParent();
+  return true;
+}
+
+// Post-RA: expands a ctselect pseudo on thumb1 into a bundled xor/and select.
+bool ARMBaseInstrInfo::expandCtSelectThumb(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Pseudos in thumb1 mode have: (outs $dst, $tmp_mask), (ins $src1, $src2,
+  // $cond). The register class here is always tGPR.
+  Register DestReg = MI.getOperand(0).getReg();
+  Register MaskReg = MI.getOperand(1).getReg();
+  Register Src1Reg = MI.getOperand(2).getReg();
+  Register Src2Reg = MI.getOperand(3).getReg();
+  Register CondReg = MI.getOperand(4).getReg();
+
+  // Register info is only needed to size the shift amount below.
+  MachineFunction *MF = MBB->getParent();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  unsigned RegSize = TRI->getRegSizeInBits(MaskReg, MRI);
+  unsigned ShiftAmount = RegSize - 1;
+
+  // 1. mask = bit 0 of cond replicated across the register (copy, LSL, ASR)
+  MachineInstr *FirstNewMI = BuildMI(*MBB, MI, DL, get(ARM::tMOVr), MaskReg)
+                                 .addReg(CondReg)
+                                 .add(predOps(ARMCC::AL))
+                                 .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Use LSL then ASR (rather than RSB) to build the mask. NOTE(review): both
+  // shifts still carry a dead CPSR def here, so CPSR is clobbered. tLSLri:
+  // (outs tGPR:$Rd, s_cc_out:$s), (ins tGPR:$Rm, imm0_31:$imm5, pred:$p)
+  BuildMI(*MBB, MI, DL, get(ARM::tLSLri), MaskReg)
+      .addReg(ARM::CPSR, RegState::Define | RegState::Dead) // s_cc_out:$s
+      .addReg(MaskReg)                                      // $Rm
+      .addImm(ShiftAmount)                                  // imm0_31:$imm5
+      .add(predOps(ARMCC::AL))                              // pred:$p
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // tASRri: (outs tGPR:$Rd, s_cc_out:$s), (ins tGPR:$Rm, imm_sr:$imm5, pred:$p)
+  BuildMI(*MBB, MI, DL, get(ARM::tASRri), MaskReg)
+      .addReg(ARM::CPSR, RegState::Define | RegState::Dead) // s_cc_out:$s
+      .addReg(MaskReg)                                      // $Rm
+      .addImm(ShiftAmount)                                  // imm_sr:$imm5
+      .add(predOps(ARMCC::AL))                              // pred:$p
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 2. xor_diff = src1 ^ src2 (src1 is copied into dst first)
+  BuildMI(*MBB, MI, DL, get(ARM::tMOVr), DestReg)
+      .addReg(Src1Reg)
+      .add(predOps(ARMCC::AL))
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // tEOR has tied operands: (outs tGPR:$Rdn, s_cc_out:$s), (ins tGPR:$Rn,
+  // tGPR:$Rm, pred:$p) with constraint "$Rn = $Rdn"
+  BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg)
+      .addReg(ARM::CPSR, RegState::Define | RegState::Dead) // s_cc_out:$s
+      .addReg(DestReg)                                      // tied input $Rn
+      .addReg(Src2Reg)                                      // $Rm
+      .add(predOps(ARMCC::AL))                              // pred:$p
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 3. masked_xor = xor_diff & mask
+  // tAND has tied operands: (outs tGPR:$Rdn, s_cc_out:$s), (ins tGPR:$Rn,
+  // tGPR:$Rm, pred:$p) with constraint "$Rn = $Rdn"
+  BuildMI(*MBB, MI, DL, get(ARM::tAND), DestReg)
+      .addReg(ARM::CPSR, RegState::Define | RegState::Dead) // s_cc_out:$s
+      .addReg(DestReg)                                      // tied input $Rn
+      .addReg(MaskReg, RegState::Kill)                      // $Rm
+      .add(predOps(ARMCC::AL))                              // pred:$p
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 4. result = src2 ^ masked_xor (src1 if mask is all-ones, else src2)
+  // tEOR has tied operands: (outs tGPR:$Rdn, s_cc_out:$s), (ins tGPR:$Rn,
+  // tGPR:$Rm, pred:$p) with constraint "$Rn = $Rdn"
+  auto LastMI =
+      BuildMI(*MBB, MI, DL, get(ARM::tEOR), DestReg)
+          .addReg(ARM::CPSR, RegState::Define | RegState::Dead) // s_cc_out:$s
+          .addReg(DestReg)         // tied input $Rn
+          .addReg(Src2Reg)         // $Rm
+          .add(predOps(ARMCC::AL)) // pred:$p
+          .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Bundle everything from the first tMOVr through the final tEOR.
+  auto BundleStart = FirstNewMI->getIterator();
+  finalizeBundle(*MBB, BundleStart, std::next(LastMI->getIterator()));
+
+  MI.eraseFromParent();
+  return true;
+}
+
+// Post-RA: expands a scalar int/float ctselect pseudo into a bundled select.
+bool ARMBaseInstrInfo::expandCtSelect(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  Register DestReg = MI.getOperand(0).getReg();
+  Register MaskReg = MI.getOperand(1).getReg();
+  Register DestRegSavedRef = DestReg;
+  Register Src1Reg, Src2Reg, CondReg;
+
+  // ARM-mode opcodes by default; replaced by the Thumb2 opcodes below.
+  unsigned RsbOp = ARM::RSBri;
+  unsigned AndOp = ARM::ANDrr;
+  unsigned BicOp = ARM::BICrr;
+  unsigned OrrOp = ARM::ORRrr;
+
+  if (Subtarget.isThumb2()) {
+    RsbOp = ARM::t2RSBri;
+    AndOp = ARM::t2ANDrr;
+    BicOp = ARM::t2BICrr;
+    OrrOp = ARM::t2ORRrr;
+  }
+
+  unsigned Opcode = MI.getOpcode();
+  bool IsFloat = Opcode == ARM::CT_SELECTf32 || Opcode == ARM::CT_SELECTf16 ||
+                 Opcode == ARM::CT_SELECTbf16;
+  MachineInstr *FirstNewMI = nullptr;
+  if (IsFloat) {
+    // Each float pseudo has: (outs $dst, $tmp_mask, $scratch1, $scratch2), (ins
+    // $src1, $src2, $cond). Two GPR scratch registers are used so that the
+    // bitwise ops can be done on the bits of the float operands.
+    Register GPRScratch1 = MI.getOperand(2).getReg();
+    Register GPRScratch2 = MI.getOperand(3).getReg();
+
+    // choice a from __builtin_ct_select(cond, a, b)
+    Src1Reg = MI.getOperand(4).getReg();
+    // choice b from __builtin_ct_select(cond, a, b)
+    Src2Reg = MI.getOperand(5).getReg();
+    // cond from __builtin_ct_select(cond, a, b)
+    CondReg = MI.getOperand(6).getReg();
+
+    // Move fp src1 to GPR scratch1 so we can do our bitwise ops
+    FirstNewMI = BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch1)
+                     .addReg(Src1Reg)
+                     .add(predOps(ARMCC::AL))
+                     .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+    // Move fp src2 to GPR scratch2
+    BuildMI(*MBB, MI, DL, get(ARM::VMOVRS), GPRScratch2)
+        .addReg(Src2Reg)
+        .add(predOps(ARMCC::AL))
+        .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+    Src1Reg = GPRScratch1;
+    Src2Reg = GPRScratch2;
+    // Reuse GPRScratch1 for dest after we are done working with src1.
+    DestReg = GPRScratch1;
+  } else {
+    // Any non-float, non-vector pseudo has: (outs $dst, $tmp_mask), (ins $src1,
+    // $src2, $cond))
+    Src1Reg = MI.getOperand(2).getReg();
+    Src2Reg = MI.getOperand(3).getReg();
+    CondReg = MI.getOperand(4).getReg();
+  }
+
+  // The following sequence of steps yields: (src1 & mask) | (src2 & ~mask)
+
+  // 1. mask = 0 - cond. NOTE(review): assumes cond is exactly 0 or 1.
+  // When cond = 0: mask = 0x00000000.
+  // When cond = 1: mask = 0xFFFFFFFF.
+  auto TmpNewMI = BuildMI(*MBB, MI, DL, get(RsbOp), MaskReg)
+                      .addReg(CondReg)
+                      .addImm(0)
+                      .add(predOps(ARMCC::AL))
+                      .add(condCodeOp())
+                      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // If no float move was emitted above, the RSB starts the bundle.
+  if (!FirstNewMI)
+    FirstNewMI = TmpNewMI;
+
+  // 2. A = src1 & mask
+  BuildMI(*MBB, MI, DL, get(AndOp), DestReg)
+      .addReg(Src1Reg)
+      .addReg(MaskReg)
+      .add(predOps(ARMCC::AL))
+      .add(condCodeOp())
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 3. B = src2 & ~mask (B overwrites the mask register)
+  BuildMI(*MBB, MI, DL, get(BicOp), MaskReg)
+      .addReg(Src2Reg)
+      .addReg(MaskReg)
+      .add(predOps(ARMCC::AL))
+      .add(condCodeOp())
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // 4. result = A | B
+  auto LastNewMI = BuildMI(*MBB, MI, DL, get(OrrOp), DestReg)
+                       .addReg(DestReg)
+                       .addReg(MaskReg)
+                       .add(predOps(ARMCC::AL))
+                       .add(condCodeOp())
+                       .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  if (IsFloat) {
+    // Move the result from the GPR scratch back to the original dest register.
+    LastNewMI = BuildMI(*MBB, MI, DL, get(ARM::VMOVSR), DestRegSavedRef)
+                    .addReg(DestReg)
+                    .add(predOps(ARMCC::AL))
+                    .setMIFlag(MachineInstr::MIFlag::NoMerge);
+  }
+
+  auto BundleStart = FirstNewMI->getIterator();
+  auto BundleEnd = LastNewMI->getIterator();
+
+  // Bundle everything from the first new instruction through the last one.
+  finalizeBundle(*MBB, BundleStart, std::next(BundleEnd));
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
-  if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
+  auto opcode = MI.getOpcode();
+
+  if (opcode == TargetOpcode::LOAD_STACK_GUARD) {
     expandLoadStackGuard(MI);
     MI.getParent()->erase(MI);
     return true;
   }
 
-  if (MI.getOpcode() == ARM::MEMCPY) {
+  if (opcode == ARM::MEMCPY) {
     expandMEMCPY(MI);
     return true;
   }
 
+  // f64 ct.select: tGPR expansion on Thumb1-only, vector expansion otherwise.
+  if (opcode == ARM::CT_SELECTf64) {
+    if (Subtarget.isThumb1Only()) {
+      LLVM_DEBUG(dbgs() << "Opcode (thumb1 subtarget) " << opcode
+                        << " expanding: " << MI);
+      return expandCtSelectThumb(MI);
+    }
+    LLVM_DEBUG(dbgs() << "Opcode (vector) " << opcode << " expanding: " << MI);
+    return expandCtSelectVector(MI);
+  }
+
+  // Vector ct.select pseudos all share the vector expansion.
+  if (opcode == ARM::CT_SELECTv8i8 || opcode == ARM::CT_SELECTv4i16 ||
+      opcode == ARM::CT_SELECTv2i32 || opcode == ARM::CT_SELECTv1i64 ||
+      opcode == ARM::CT_SELECTv2f32 || opcode == ARM::CT_SELECTv4f16 ||
+      opcode == ARM::CT_SELECTv4bf16 || opcode == ARM::CT_SELECTv16i8 ||
+      opcode == ARM::CT_SELECTv8i16 || opcode == ARM::CT_SELECTv4i32 ||
+      opcode == ARM::CT_SELECTv2i64 || opcode == ARM::CT_SELECTv4f32 ||
+      opcode == ARM::CT_SELECTv2f64 || opcode == ARM::CT_SELECTv8f16 ||
+      opcode == ARM::CT_SELECTv8bf16) {
+    LLVM_DEBUG(dbgs() << "Opcode (vector) " << opcode << " expanding: " << MI);
+    return expandCtSelectVector(MI);
+  }
+
+  // Scalar int/f16/bf16/f32 ct.select: Thumb1-only has its own expansion.
+  if (opcode == ARM::CT_SELECTint || opcode == ARM::CT_SELECTf16 ||
+      opcode == ARM::CT_SELECTbf16 || opcode == ARM::CT_SELECTf32) {
+    if (Subtarget.isThumb1Only()) {
+      LLVM_DEBUG(dbgs() << "Opcode (thumb1 subtarget) " << opcode
+                        << " expanding: " << MI);
+      return expandCtSelectThumb(MI);
+    }
+    LLVM_DEBUG(dbgs() << "Opcode " << opcode << " expanding: " << MI);
+    return expandCtSelect(MI);
+  }
+
   // This hook gets to expand COPY instructions before they become
   // copyPhysReg() calls.  Look for VMOVS instructions that can legally be
   // widened to VMOVD.  We prefer the VMOVD when possible because it may be
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 94595ab2b338b..969726240e67a 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -225,6 +225,12 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo {
       Register VReg, unsigned SubReg = 0,
       MachineInstr::MIFlag Flags = MachineInstr::NoFlags) const override;
 
+  // Expanders for the ARM::CT_SELECT* pseudos: Vector handles f64 (outside
+  // Thumb1-only) and the vector opcodes, Thumb handles Thumb1-only subtargets.
+  bool expandCtSelectVector(MachineInstr &MI) const;
+  bool expandCtSelectThumb(MachineInstr &MI) const;
+  bool expandCtSelect(MachineInstr &MI) const;
+
   bool expandPostRAPseudo(MachineInstr &MI) const override;
 
   bool shouldSink(const MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 61b679d55fb47..0919d4eead3ab 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -4264,6 +4264,92 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     // Other cases are autogenerated.
     break;
   }
+  case ARMISD::CT_SELECT: {
+    // Map the result type to its CT_SELECT pseudo; integers use CT_SELECTint.
+    const EVT VT = N->getValueType(0);
+    unsigned PseudoOpcode = ARM::CT_SELECTint;
+    switch (VT.isSimple() ? VT.getSimpleVT().SimpleTy : MVT::Other) {
+    case MVT::f16:
+      PseudoOpcode = ARM::CT_SELECTf16;
+      break;
+    case MVT::bf16:
+      PseudoOpcode = ARM::CT_SELECTbf16;
+      break;
+    case MVT::f32:
+      PseudoOpcode = ARM::CT_SELECTf32;
+      break;
+    case MVT::f64:
+      PseudoOpcode = ARM::CT_SELECTf64;
+      break;
+    case MVT::v8i8:
+      PseudoOpcode = ARM::CT_SELECTv8i8;
+      break;
+    case MVT::v4i16:
+      PseudoOpcode = ARM::CT_SELECTv4i16;
+      break;
+    case MVT::v2i32:
+      PseudoOpcode = ARM::CT_SELECTv2i32;
+      break;
+    case MVT::v1i64:
+      PseudoOpcode = ARM::CT_SELECTv1i64;
+      break;
+    case MVT::v2f32:
+      PseudoOpcode = ARM::CT_SELECTv2f32;
+      break;
+    case MVT::v4f16:
+      PseudoOpcode = ARM::CT_SELECTv4f16;
+      break;
+    case MVT::v4bf16:
+      PseudoOpcode = ARM::CT_SELECTv4bf16;
+      break;
+    case MVT::v16i8:
+      PseudoOpcode = ARM::CT_SELECTv16i8;
+      break;
+    case MVT::v8i16:
+      PseudoOpcode = ARM::CT_SELECTv8i16;
+      break;
+    case MVT::v4i32:
+      PseudoOpcode = ARM::CT_SELECTv4i32;
+      break;
+    case MVT::v2i64:
+      PseudoOpcode = ARM::CT_SELECTv2i64;
+      break;
+    case MVT::v4f32:
+      PseudoOpcode = ARM::CT_SELECTv4f32;
+      break;
+    case MVT::v2f64:
+      PseudoOpcode = ARM::CT_SELECTv2f64;
+      break;
+    case MVT::v8f16:
+      PseudoOpcode = ARM::CT_SELECTv8f16;
+      break;
+    case MVT::v8bf16:
+      PseudoOpcode = ARM::CT_SELECTv8bf16;
+      break;
+    default:
+      break;
+    }
+
+    // f16/bf16/f32 take GPR scratches; f64 and vectors take a broadcast mask.
+    const bool IsFloat = PseudoOpcode == ARM::CT_SELECTf16 ||
+                         PseudoOpcode == ARM::CT_SELECTbf16 ||
+                         PseudoOpcode == ARM::CT_SELECTf32;
+    const bool IsVector = !IsFloat && PseudoOpcode != ARM::CT_SELECTint;
+
+    SmallVector<EVT, 4> VTs;
+    VTs.push_back(VT);       // $dst
+    VTs.push_back(MVT::i32); // $tmp_mask (always GPR)
+    if (IsVector) {
+      VTs.push_back(VT); // $bcast_mask (same type as dst for vectors)
+    } else if (IsFloat) {
+      VTs.push_back(MVT::i32); // $scratch1 (GPR)
+      VTs.push_back(MVT::i32); // $scratch2 (GPR)
+    }
+    // src1, src2, cond
+    SDValue Ops[] = {N->getOperand(0), N->getOperand(1), N->getOperand(2)};
+    ReplaceNode(N, CurDAG->getMachineNode(PseudoOpcode, SDLoc(N), VTs, Ops));
+    return;
+  }
   case ARMISD::VZIP: {
     EVT VT = N->getValueType(0);
     // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 635a5f4e6ca94..3dde0f3188979 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -204,6 +204,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
   setOperationAction(ISD::SELECT,            VT, Expand);
   setOperationAction(ISD::SELECT_CC,         VT, Expand);
   setOperationAction(ISD::VSELECT,           VT, Expand);
+  setOperationAction(ISD::CT_SELECT, VT, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
   if (VT.isInteger()) {
     setOperationAction(ISD::SHL, VT, Custom);
@@ -306,6 +307,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::CTPOP, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::CT_SELECT, VT, Custom);
 
     // Vector reductions
     setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
@@ -357,6 +359,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::MSTORE, VT, Legal);
     setOperationAction(ISD::SELECT, VT, Expand);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::CT_SELECT, VT, Custom);
 
     // Pre and Post inc are supported on loads and stores
     for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -410,6 +413,28 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
   setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
   setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
 
+  if (Subtarget->hasFullFP16()) {
+    setOperationAction(ISD::CT_SELECT, MVT::v4f16, Custom);
+    setOperationAction(ISD::CT_SELECT, MVT::v8f16, Custom);
+  }
+
+  if (Subtarget->hasBF16()) {
+    setOperationAction(ISD::CT_SELECT, MVT::v4bf16, Custom);
+    setOperationAction(ISD::CT_SELECT, MVT::v8bf16, Custom);
+  }
+
+  // small exotic vectors get scalarised for ctselect
+  setOperationAction(ISD::CT_SELECT, MVT::v1i8, Expand);
+  setOperationAction(ISD::CT_SELECT, MVT::v1i16, Expand);
+  setOperationAction(ISD::CT_SELECT, MVT::v1i32, Expand);
+  setOperationAction(ISD::CT_SELECT, MVT::v1f32, Expand);
+  setOperationAction(ISD::CT_SELECT, MVT::v2i8, Expand);
+
+  setOperationAction(ISD::CT_SELECT, MVT::v2i16, Promote);
+  setOperationPromotedToType(ISD::CT_SELECT, MVT::v2i16, MVT::v4i16);
+  setOperationAction(ISD::CT_SELECT, MVT::v4i8, Promote);
+  setOperationPromotedToType(ISD::CT_SELECT, MVT::v4i8, MVT::v8i8);
+
   // We 'support' these types up to bitcast/load/store level, regardless of
   // MVE integer-only / float support. Only doing FP data processing on the FP
   // vector types is inhibited at integer-only level.
@@ -421,6 +446,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
     setOperationAction(ISD::VSELECT, VT, Legal);
+    setOperationAction(ISD::CT_SELECT, VT, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
   }
   setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
@@ -476,6 +502,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::VSELECT, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::CT_SELECT, VT, Custom);
 
     if (!HasMVEFP) {
       setOperationAction(ISD::SINT_TO_FP, VT, Expand);
@@ -520,6 +547,74 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
 
   const Triple &TT = TM.getTargetTriple();
 
+  if (TT.isOSBinFormatMachO()) {
+    // Uses VFP for Thumb libfuncs if available.
+    if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
+        Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
+      // clang-format off
+      static const struct {
+        const RTLIB::Libcall Op;
+        const RTLIB::LibcallImpl Impl;
+      } LibraryCalls[] = {
+        // Single-precision floating-point arithmetic.
+        { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
+        { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
+        { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
+        { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
+
+        // Double-precision floating-point arithmetic.
+        { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
+        { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
+        { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
+        { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
+
+        // Single-precision comparisons.
+        { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
+        { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
+        { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
+        { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
+        { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
+        { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
+        { RTLIB::UO_F32,  RTLIB::impl___unordsf2vfp },
+
+        // Double-precision comparisons.
+        { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
+        { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
+        { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
+        { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
+        { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
+        { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
+        { RTLIB::UO_F64,  RTLIB::impl___unorddf2vfp },
+
+        // Floating-point to integer conversions.
+        // i64 conversions are done via library routines even when generating VFP
+        // instructions, so use the same ones.
+        { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
+        { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
+        { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
+        { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
+
+        // Conversions between floating types.
+        { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
+        { RTLIB::FPEXT_F32_F64,   RTLIB::impl___extendsfdf2vfp },
+
+        // Integer to floating-point conversions.
+        // i64 conversions are done via library routines even when generating VFP
+        // instructions, so use the same ones.
+        // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+        // e.g., __floatunsidf vs. __floatunssidfvfp.
+        { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
+        { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
+        { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
+        { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
+      };
+      // clang-format on
+
+      for (const auto &LC : LibraryCalls)
+        setLibcallImpl(LC.Op, LC.Impl);
+    }
+  }
+
   if (Subtarget->isThumb1Only())
     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
   else
@@ -546,24 +641,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                       ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
         setOperationAction(Op, MVT::f64, Legal);
-
-      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
     }
   }
 
   if (Subtarget->hasFullFP16()) {
-    for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
-                    ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
-      setOperationAction(Op, MVT::f16, Legal);
-
     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
 
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
-    setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Legal);
-    setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Legal);
   }
 
   if (Subtarget->hasBF16()) {
@@ -873,14 +960,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND,   MVT::f32, Custom);
   }
 
-  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
-  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
-
   if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
@@ -888,16 +974,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       setOperationAction(ISD::FP_ROUND,  MVT::f16, Custom);
       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
     }
-  } else {
-    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
   }
 
   if (!Subtarget->hasFP16()) {
     setOperationAction(ISD::FP_EXTEND,  MVT::f32, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
-  } else {
-    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
-    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal);
   }
 
   computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -1208,10 +1289,27 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  setOperationAction(ISD::CT_SELECT, MVT::i8, Promote);
+  setOperationAction(ISD::CT_SELECT, MVT::i16, Promote);
+  setOperationPromotedToType(ISD::CT_SELECT, MVT::i16, MVT::i32);
+
+  setOperationAction(ISD::CT_SELECT, MVT::i32, Custom);
+  setOperationAction(ISD::CT_SELECT, MVT::i64, Expand);
+  setOperationAction(ISD::CT_SELECT, MVT::f32, Custom);
+  setOperationAction(ISD::CT_SELECT, MVT::f64, Custom);
+
+  // Handle f16 and bf16 without falling back to select from ctselect.
+  setTargetDAGCombine({ISD::CT_SELECT});
+
   if (Subtarget->hasFullFP16()) {
     setOperationAction(ISD::SETCC,     MVT::f16, Expand);
     setOperationAction(ISD::SELECT,    MVT::f16, Custom);
     setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
+    setOperationAction(ISD::CT_SELECT, MVT::f16, Custom);
+  }
+
+  if (Subtarget->hasBF16()) {
+    setOperationAction(ISD::CT_SELECT, MVT::bf16, Custom);
   }
 
   setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
@@ -1229,8 +1327,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
-  setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
-  setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f64, Custom);
+  setOperationAction(ISD::FSINCOS,   MVT::f32, Custom);
   setOperationAction(ISD::FREM, MVT::f64, LibCall);
   setOperationAction(ISD::FREM, MVT::f32, LibCall);
   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
@@ -1252,16 +1350,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
-      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, Expand);
-      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, Expand);
+      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall);
+      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall);
     }
 
     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
     if (!Subtarget->hasFP16()) {
       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
-      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Expand);
-      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Expand);
+      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall);
+      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall);
     }
 
     // Strict floating-point comparisons need custom lowering.
@@ -1294,6 +1392,18 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
       setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
     }
+
+    if (Subtarget->hasFP64()) {
+      setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+      setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+      setOperationAction(ISD::FROUND, MVT::f64, Legal);
+      setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+      setOperationAction(ISD::FRINT, MVT::f64, Legal);
+      setOperationAction(ISD::FROUNDEVEN, MVT::f64, Legal);
+      setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+      setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+    }
   }
 
   // FP16 often need to be promoted to call lib functions
@@ -1438,8 +1548,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
 
   setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
-
-  IsStrictFPEnabled = true;
 }
 
 bool ARMTargetLowering::useSoftFloat() const {
@@ -2234,44 +2342,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
     if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
       Chain = DAG.getStackArgumentTokenFactor(Chain);
-      if (ByValTempChain) {
-        // In case of large byval copies, re-using the stackframe for tail-calls
-        // can lead to overwriting incoming arguments on the stack. Force
-        // loading these stack arguments before the copy to avoid that.
-        SmallVector<SDValue, 8> IncomingLoad;
-        for (unsigned I = 0; I < OutVals.size(); ++I) {
-          if (Outs[I].Flags.isByVal())
-            continue;
-
-          SDValue OutVal = OutVals[I];
-          LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
-          if (!OutLN)
-            continue;
-
-          FrameIndexSDNode *FIN =
-              dyn_cast_or_null<FrameIndexSDNode>(OutLN->getBasePtr());
-          if (!FIN)
-            continue;
-
-          if (!MFI.isFixedObjectIndex(FIN->getIndex()))
-            continue;
-
-          for (const CCValAssign &VA : ArgLocs) {
-            if (VA.isMemLoc())
-              IncomingLoad.push_back(OutVal.getValue(1));
-          }
-        }
-
-        // Update the chain to force loads for potentially clobbered argument
-        // loads to happen before the byval copy.
-        if (!IncomingLoad.empty()) {
-          IncomingLoad.push_back(Chain);
-          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
-        }
-
+      if (ByValTempChain)
         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
                             ByValTempChain);
-      }
       AfterFormalArgLoads = true;
     }
 
@@ -4935,6 +5008,20 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
                          SelectTrue, SelectFalse, ISD::SETNE);
 }
 
+SDValue ARMTargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
+  // ISD::CT_SELECT carries (cond, true, false); the ARMISD::CT_SELECT node
+  // built here carries (true, false, cond) with cond reduced to its low bit.
+  const SDLoc dl(Op);
+  const EVT ResultVT = Op.getValueType();
+  SDValue Condition = Op.getOperand(0);
+  SDValue IfTrue = Op.getOperand(1);
+  SDValue IfFalse = Op.getOperand(2);
+
+  SDValue LowBit = DAG.getNode(ISD::AND, dl, MVT::i32, Condition,
+                               DAG.getConstant(1, dl, MVT::i32));
+  return DAG.getNode(ARMISD::CT_SELECT, dl, ResultVT, IfTrue, IfFalse, LowBit);
+}
+
 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                  bool &swpCmpOps, bool &swpVselOps) {
   // Start by selecting the GE condition code for opcodes that return true for
@@ -9658,6 +9745,76 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
   return DAG.getMergeValues({Result, OutFlag}, DL);
 }
 
+SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+  // Lower FSINCOS to the __sincos_stret libcall (alternative iOS entry point).
+  // Under the APCS ABI the sin/cos pair comes back through an sret stack slot.
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT);
+  RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC);
+  if (SincosStret == RTLIB::Unsupported)
+    return SDValue(); // No stret implementation registered for this type.
+
+  assert(Subtarget->isTargetDarwin());
+
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+  auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  // Pair of floats / doubles used to pass the result.
+  Type *RetTy = StructType::get(ArgTy, ArgTy);
+  auto &DL = DAG.getDataLayout();
+
+  ArgListTy Args;
+  bool ShouldUseSRet = getTM().isAPCS_ABI();
+  SDValue SRet;
+  if (ShouldUseSRet) {
+    // Create stack object for sret.
+    const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
+    const Align StackAlign = DL.getPrefTypeAlign(RetTy);
+    int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
+    SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
+
+    ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext()));
+    Entry.IsSExt = false;
+    Entry.IsZExt = false;
+    Entry.IsSRet = true;
+    Args.push_back(Entry);
+    RetTy = Type::getVoidTy(*DAG.getContext()); // Result is read from SRet.
+  }
+
+  Args.emplace_back(Arg, ArgTy);
+
+  StringRef LibcallName = getLibcallImplName(SincosStret);
+  CallingConv::ID CC = getLibcallImplCallingConv(SincosStret);
+  SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL));
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl)
+      .setChain(DAG.getEntryNode())
+      .setCallee(CC, RetTy, Callee, std::move(Args))
+      .setDiscardResult(ShouldUseSRet);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  if (!ShouldUseSRet)
+    return CallResult.first; // The call returns the pair directly.
+
+  SDValue LoadSin =
+      DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo());
+
+  // Address of cos field: one ArgVT store size past the sin field.
+  SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
+                            DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl));
+  SDValue LoadCos =
+      DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
+
+  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
+                     LoadSin.getValue(0), LoadCos.getValue(0));
+}
+
 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                   bool Signed,
                                                   SDValue &Chain) const {
@@ -10456,6 +10613,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
   case ISD::SELECT:        return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
+  case ISD::CT_SELECT:
+    return LowerCTSELECT(Op, DAG);
   case ISD::BRCOND:        return LowerBRCOND(Op, DAG);
   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
@@ -10573,8 +10732,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VECREDUCE_SMAX:
     return LowerVecReduceMinMax(Op, DAG, Subtarget);
   case ISD::ATOMIC_LOAD:
-  case ISD::ATOMIC_STORE:
-    return LowerAtomicLoadStore(Op, DAG);
+  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
+  case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
   case ISD::SDIVREM:
   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
@@ -10708,6 +10867,36 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::FP_TO_UINT_SAT:
     Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
     break;
+  case ISD::CT_SELECT: {
+    EVT VT = N->getValueType(0);
+
+    // Handle f16/bf16 type promotion while preserving ctselect
+    if (VT == MVT::f16 || VT == MVT::bf16) {
+      SDLoc DL(N);
+      SDValue TrueVal = N->getOperand(1);
+      SDValue FalseVal = N->getOperand(2);
+      // Bitcast to i16, then promote to i32
+      SDValue TrueInt = DAG.getBitcast(MVT::i16, TrueVal);
+      SDValue FalseInt = DAG.getBitcast(MVT::i16, FalseVal);
+      TrueInt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueInt);
+      FalseInt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseInt);
+
+      // Normalize the condition to an i32 0/1; it may not be i32 yet here.
+      SDValue One = DAG.getConstant(1, DL, MVT::i32);
+      SDValue Cond = DAG.getAnyExtOrTrunc(N->getOperand(0), DL, MVT::i32);
+      SDValue CondNorm = DAG.getNode(ISD::AND, DL, MVT::i32, Cond, One);
+
+      // Create i32 ctselect that will go through normal lowering, then narrow
+      // it back: the replacement must have the same type as N's result.
+      SDValue Sel = DAG.getNode(ISD::CT_SELECT, DL, MVT::i32, CondNorm, TrueInt,
+                                FalseInt);
+      Res = DAG.getBitcast(VT, DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Sel));
+    } else {
+      // For other types, use existing lowering
+      Res = LowerCTSELECT(SDValue(N, 0), DAG);
+    }
+    break;
+  }
   }
   if (Res.getNode())
     Results.push_back(Res);
@@ -13334,6 +13523,42 @@ static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
                      DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
 }
 
+static SDValue PerformCTSELECTCombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const ARMSubtarget *Subtarget) {
+  // Before legalization, rewrite a CT_SELECT on half-precision data as a
+  // CT_SELECT on the same-width integer type, bitcasting the value operands in
+  // and the result back out:
+  //   f16,   bf16   -> i16 (later promoted to an i32 ctselect)
+  //   v4f16, v4bf16 -> v4i16
+  //   v8f16, v8bf16 -> v8i16
+  // Any other type is left untouched by returning an empty SDValue.
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  // Types may not be simple this early, and getSimpleVT() asserts on
+  // extended types, so compare against the handled types directly.
+  const EVT VT = N->getValueType(0);
+  EVT IntVT;
+  if (VT == MVT::f16 || VT == MVT::bf16)
+    IntVT = MVT::i16;
+  else if (VT == MVT::v4f16 || VT == MVT::v4bf16)
+    IntVT = MVT::v4i16;
+  else if (VT == MVT::v8f16 || VT == MVT::v8bf16)
+    IntVT = MVT::v8i16;
+  else
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue TrueInt = DAG.getBitcast(IntVT, N->getOperand(1));
+  SDValue FalseInt = DAG.getBitcast(IntVT, N->getOperand(2));
+  SDValue Result =
+      DAG.getNode(ISD::CT_SELECT, DL, IntVT, Cond, TrueInt, FalseInt);
+  return DAG.getBitcast(VT, Result);
+}
+
 static SDValue PerformVSELECTCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
@@ -19019,6 +19266,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SELECT_CC:
   case ISD::SELECT:     return PerformSELECTCombine(N, DCI, Subtarget);
   case ISD::VSELECT:    return PerformVSELECTCombine(N, DCI, Subtarget);
+  case ISD::CT_SELECT:
+    return PerformCTSELECTCombine(N, DCI, Subtarget);
   case ISD::SETCC:      return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
   case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
   case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 10f5442d7429b..1ab3bdb2b16d1 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -51,6 +51,7 @@ class TargetMachine;
 class TargetRegisterInfo;
 class VectorType;
 
+
   namespace ARM {
   /// Possible values of current rounding mode, which is specified in bits
   /// 23:22 of FPSCR.
@@ -575,6 +576,7 @@ class VectorType;
     SDValue LowerALUO(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
@@ -591,6 +593,7 @@ class VectorType;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *ST) const;
     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
     void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
@@ -725,6 +728,7 @@ class VectorType;
                                            MachineBasicBlock *MBB) const;
     MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const;
+
     void addMVEVectorTypes(bool HasMVEFP);
     void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action);
     void setAllExpand(MVT VT);
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index 4f80746fd8bc5..79d2674f6ef1e 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -40,6 +40,13 @@ def SDT_ARMCMov : SDTypeProfile<1, 4, [
   SDTCisVT<4, FlagsVT>,    // in flags
 ]>;
 
+def SDT_ARMCtSelect : SDTypeProfile<1, 3, [
+  /* any */                // result
+  SDTCisSameAs<1, 0>,      // value on true
+  SDTCisSameAs<2, 0>,      // value on false
+  SDTCisVT<3, i32>         // cond
+]>;
+
 def SDT_ARMBrcond : SDTypeProfile<0, 3, [
   SDTCisVT<0, OtherVT>,    // target basic block
   SDTCisVT<1, CondCodeVT>, // condition code
@@ -226,6 +233,9 @@ def ARMintretglue    : SDNode<"ARMISD::INTRET_GLUE", SDT_ARMcall,
 // ARM conditional move instructions.
 def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov>;
 
+// ARM constant-time selection. Operands: true value, false value, i32 cond.
+def ARMct_select : SDNode<"ARMISD::CT_SELECT", SDT_ARMCtSelect>;
+
 // Signed saturation
 def ARMssat   : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>;
 
@@ -6689,6 +6699,183 @@ def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPRPair:$addr_temp_out),
 
 def : Pat<(atomic_fence (timm), 0), (MEMBARRIER)>;
 
+//===----------------------------------------------------------------------===//
+// Constant-time selection pseudoinstructions.
+//===----------------------------------------------------------------------===//
+// These pseudos are lowered by a machine pass, as applicable by subtarget, so
+// that backend optimizations (node merging etc.) cannot turn a constant-time
+// selection into machine code that does not run in constant time.
+let isNotDuplicable = 1, isPseudo = 1, hasNoSchedulingInfo = 1 in {
+  // Outputs are @earlyclobber so they are never allocated on top of an input.
+  // Scalar integers: i1, i8, i16, i32, i64
+  def CT_SELECTint : ARMPseudoInst<(outs GPR:$dst, GPR:$tmp_mask),
+                                  (ins GPR:$src1, GPR:$src2, GPR:$cond), 4,
+                                  NoItinerary, []> {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_mask";
+  }
+
+  def CT_SELECTf16
+      : ARMPseudoInst<
+            (outs HPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2),
+            (ins HPR:$src1, HPR:$src2, GPR:$cond), 4, NoItinerary, []> {
+    let Constraints =
+        "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber "
+        "$scratch1,@earlyclobber $scratch2";
+  }
+
+  def CT_SELECTbf16
+      : ARMPseudoInst<
+            (outs HPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2),
+            (ins HPR:$src1, HPR:$src2, GPR:$cond), 4, NoItinerary, []> {
+    let Constraints =
+        "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber "
+        "$scratch1,@earlyclobber $scratch2";
+  }
+
+  def CT_SELECTf32
+      : ARMPseudoInst<
+            (outs SPR:$dst, GPR:$tmp_mask, GPR:$scratch1, GPR:$scratch2),
+            (ins SPR:$src1, SPR:$src2, GPR:$cond), 4, NoItinerary, []> {
+    let Constraints =
+        "@earlyclobber $dst,@earlyclobber $tmp_mask,@earlyclobber "
+        "$scratch1,@earlyclobber $scratch2";
+  }
+
+  let Predicates = [HasDPVFP] in {
+    def CT_SELECTf64
+        : ARMPseudoInst<(outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask),
+                        (ins DPR:$src1, DPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+  }
+
+  let Predicates = [HasNEON] in {
+    // 64-bit vectors (DPR)
+    def CT_SELECTv8i8
+        : ARMPseudoInst<(outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask),
+                        (ins DPR:$src1, DPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv4i16
+        : ARMPseudoInst<(outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask),
+                        (ins DPR:$src1, DPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv2i32
+        : ARMPseudoInst<(outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask),
+                        (ins DPR:$src1, DPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv1i64
+        : ARMPseudoInst<(outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask),
+                        (ins DPR:$src1, DPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv2f32
+        : ARMPseudoInst<(outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask),
+                        (ins DPR:$src1, DPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv4f16
+        : ARMPseudoInst<(outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask),
+                        (ins DPR:$src1, DPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv4bf16
+        : ARMPseudoInst<(outs DPR:$dst, GPR:$tmp_mask, DPR:$bcast_mask),
+                        (ins DPR:$src1, DPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    // 128-bit vectors (QPR)
+    def CT_SELECTv16i8
+        : ARMPseudoInst<(outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask),
+                        (ins QPR:$src1, QPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv8i16
+        : ARMPseudoInst<(outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask),
+                        (ins QPR:$src1, QPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv4i32
+        : ARMPseudoInst<(outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask),
+                        (ins QPR:$src1, QPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv2i64
+        : ARMPseudoInst<(outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask),
+                        (ins QPR:$src1, QPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv4f32
+        : ARMPseudoInst<(outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask),
+                        (ins QPR:$src1, QPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv2f64
+        : ARMPseudoInst<(outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask),
+                        (ins QPR:$src1, QPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv8f16
+        : ARMPseudoInst<(outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask),
+                        (ins QPR:$src1, QPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+
+    def CT_SELECTv8bf16
+        : ARMPseudoInst<(outs QPR:$dst, GPR:$tmp_mask, QPR:$bcast_mask),
+                        (ins QPR:$src1, QPR:$src2, GPR:$cond), 4,
+                        NoItinerary, []> {
+      let Constraints = "@earlyclobber $dst,@earlyclobber "
+                        "$tmp_mask,@earlyclobber $bcast_mask";
+    }
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // KCFI check pseudo-instruction.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 0de6f3d16eff4..a9b542d6110c1 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -527,13 +527,11 @@ void ARMPassConfig::addPreSched2() {
 void ARMPassConfig::addPreEmitPass() {
   addPass(createThumb2SizeReductionPass());
 
-  // Unpack bundles for:
+  // Always unpack bundles for:
   // - Thumb2: Constant island pass requires unbundled instructions
   // - KCFI: KCFI_CHECK pseudo instructions need to be unbundled for AsmPrinter
-  addPass(createUnpackMachineBundlesLegacy([](const MachineFunction &MF) {
-    return MF.getSubtarget<ARMSubtarget>().isThumb2() ||
-           MF.getFunction().getParent()->getModuleFlag("kcfi");
-  }));
+  // - CT_SELECT: Post-RA expansion creates bundles that must be unpacked
+  addPass(createUnpackMachineBundlesLegacy(nullptr));
 
   // Don't optimize barriers or block placement at -O0.
   if (getOptLevel() != CodeGenOptLevel::None) {
diff --git a/llvm/test/CodeGen/ARM/ctselect-half.ll b/llvm/test/CodeGen/ARM/ctselect-half.ll
new file mode 100644
index 0000000000000..bfc60f1fad42d
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ctselect-half.ll
@@ -0,0 +1,867 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s
+; RUN: llc < %s -mtriple=armv8.6a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=BFLOAT-F16-NATIVE %s
+; RUN: llc < %s -mtriple=armv8.2a-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=F16-NATIVE %s
+; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s
+; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s
+
+define half @ct_half(i1 %cond, half %a, half %b) {
+; CT-LABEL: ct_half:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    sub r3, r0, #1
+; CT-NEXT:    rsb r0, r0, #0
+; CT-NEXT:    and r2, r2, r3
+; CT-NEXT:    and r0, r1, r0
+; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    bx lr
+;
+; BFLOAT-F16-NATIVE-LABEL: ct_half:
+; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
+; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
+; BFLOAT-F16-NATIVE-NEXT:    sub r3, r0, #1
+; BFLOAT-F16-NATIVE-NEXT:    rsb r0, r0, #0
+; BFLOAT-F16-NATIVE-NEXT:    and r2, r2, r3
+; BFLOAT-F16-NATIVE-NEXT:    and r0, r1, r0
+; BFLOAT-F16-NATIVE-NEXT:    orr r0, r0, r2
+; BFLOAT-F16-NATIVE-NEXT:    bx lr
+;
+; F16-NATIVE-LABEL: ct_half:
+; F16-NATIVE:       @ %bb.0: @ %entry
+; F16-NATIVE-NEXT:    and r0, r0, #1
+; F16-NATIVE-NEXT:    sub r3, r0, #1
+; F16-NATIVE-NEXT:    rsb r0, r0, #0
+; F16-NATIVE-NEXT:    and r2, r2, r3
+; F16-NATIVE-NEXT:    and r0, r1, r0
+; F16-NATIVE-NEXT:    orr r0, r0, r2
+; F16-NATIVE-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_half:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    subs r4, r3, #1
+; THUMB1-NEXT:    ands r4, r2
+; THUMB1-NEXT:    rsbs r0, r3, #0
+; THUMB1-NEXT:    ands r0, r1
+; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_half:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    subs r3, r0, #1
+; THUMB2-NEXT:    rsbs r0, r0, #0
+; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    ands r0, r1
+; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    bx lr
+entry:
+  %sel = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b)
+  ret half %sel
+}
+
+define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) {
+; CT-LABEL: ct_bf16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    sub r3, r0, #1
+; CT-NEXT:    rsb r0, r0, #0
+; CT-NEXT:    and r2, r2, r3
+; CT-NEXT:    and r0, r1, r0
+; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    bx lr
+;
+; BFLOAT-F16-NATIVE-LABEL: ct_bf16:
+; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
+; BFLOAT-F16-NATIVE-NEXT:    .pad #4
+; BFLOAT-F16-NATIVE-NEXT:    sub sp, sp, #4
+; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
+; BFLOAT-F16-NATIVE-NEXT:    rsb r12, r0, #0
+; BFLOAT-F16-NATIVE-NEXT:    and r3, r1, r12
+; BFLOAT-F16-NATIVE-NEXT:    bic r12, r2, r12
+; BFLOAT-F16-NATIVE-NEXT:    orr r3, r3, r12
+; BFLOAT-F16-NATIVE-NEXT:    strh r3, [sp, #2]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r0, [sp, #2]
+; BFLOAT-F16-NATIVE-NEXT:    add sp, sp, #4
+; BFLOAT-F16-NATIVE-NEXT:    bx lr
+;
+; F16-NATIVE-LABEL: ct_bf16:
+; F16-NATIVE:       @ %bb.0: @ %entry
+; F16-NATIVE-NEXT:    and r0, r0, #1
+; F16-NATIVE-NEXT:    sub r3, r0, #1
+; F16-NATIVE-NEXT:    rsb r0, r0, #0
+; F16-NATIVE-NEXT:    and r2, r2, r3
+; F16-NATIVE-NEXT:    and r0, r1, r0
+; F16-NATIVE-NEXT:    orr r0, r0, r2
+; F16-NATIVE-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_bf16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    subs r4, r3, #1
+; THUMB1-NEXT:    ands r4, r2
+; THUMB1-NEXT:    rsbs r0, r3, #0
+; THUMB1-NEXT:    ands r0, r1
+; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_bf16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    subs r3, r0, #1
+; THUMB2-NEXT:    rsbs r0, r0, #0
+; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    ands r0, r1
+; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    bx lr
+entry:
+  %sel = call bfloat @llvm.ct.select.bf16(i1 %cond, bfloat %a, bfloat %b)
+  ret bfloat %sel
+}
+
+define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) {
+; CT-LABEL: ct_v4f16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    .save {r4, r5, r6, lr}
+; CT-NEXT:    push {r4, r5, r6, lr}
+; CT-NEXT:    ldrh lr, [sp, #36]
+; CT-NEXT:    vdup.16 d16, r0
+; CT-NEXT:    ldrh r12, [sp, #28]
+; CT-NEXT:    pkhbt r2, r2, r3, lsl #16
+; CT-NEXT:    ldrh r1, [sp, #32]
+; CT-NEXT:    vshl.i16 d16, d16, #15
+; CT-NEXT:    ldrh r6, [sp, #24]
+; CT-NEXT:    ldrh r4, [sp, #20]
+; CT-NEXT:    orr r0, r1, lr, lsl #16
+; CT-NEXT:    orr r1, r6, r12, lsl #16
+; CT-NEXT:    ldrh r5, [sp, #16]
+; CT-NEXT:    vshr.s16 d16, d16, #15
+; CT-NEXT:    vmov d17, r1, r0
+; CT-NEXT:    orr r0, r5, r4, lsl #16
+; CT-NEXT:    vmov d18, r2, r0
+; CT-NEXT:    veor d18, d18, d17
+; CT-NEXT:    vand d16, d18, d16
+; CT-NEXT:    veor d16, d17, d16
+; CT-NEXT:    vmov.u16 r0, d16[0]
+; CT-NEXT:    vmov.u16 r1, d16[1]
+; CT-NEXT:    vmov.u16 r2, d16[2]
+; CT-NEXT:    vmov.u16 r3, d16[3]
+; CT-NEXT:    pop {r4, r5, r6, pc}
+;
+; BFLOAT-F16-NATIVE-LABEL: ct_v4f16:
+; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
+; BFLOAT-F16-NATIVE-NEXT:    .save {r4, r5, r6, lr}
+; BFLOAT-F16-NATIVE-NEXT:    push {r4, r5, r6, lr}
+; BFLOAT-F16-NATIVE-NEXT:    ldrh lr, [sp, #36]
+; BFLOAT-F16-NATIVE-NEXT:    vdup.16 d16, r0
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r12, [sp, #28]
+; BFLOAT-F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r1, [sp, #32]
+; BFLOAT-F16-NATIVE-NEXT:    vshl.i16 d16, d16, #15
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r6, [sp, #24]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r4, [sp, #20]
+; BFLOAT-F16-NATIVE-NEXT:    orr r0, r1, lr, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    orr r1, r6, r12, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #16]
+; BFLOAT-F16-NATIVE-NEXT:    vshr.s16 d16, d16, #15
+; BFLOAT-F16-NATIVE-NEXT:    vmov d17, r1, r0
+; BFLOAT-F16-NATIVE-NEXT:    orr r0, r5, r4, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    vmov d18, r2, r0
+; BFLOAT-F16-NATIVE-NEXT:    veor d18, d18, d17
+; BFLOAT-F16-NATIVE-NEXT:    vand d16, d18, d16
+; BFLOAT-F16-NATIVE-NEXT:    veor d16, d17, d16
+; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r0, d16[0]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r1, d16[1]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r2, d16[2]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r3, d16[3]
+; BFLOAT-F16-NATIVE-NEXT:    pop {r4, r5, r6, pc}
+;
+; F16-NATIVE-LABEL: ct_v4f16:
+; F16-NATIVE:       @ %bb.0: @ %entry
+; F16-NATIVE-NEXT:    .save {r4, r5, r6, lr}
+; F16-NATIVE-NEXT:    push {r4, r5, r6, lr}
+; F16-NATIVE-NEXT:    ldrh lr, [sp, #36]
+; F16-NATIVE-NEXT:    vdup.16 d16, r0
+; F16-NATIVE-NEXT:    ldrh r12, [sp, #28]
+; F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
+; F16-NATIVE-NEXT:    ldrh r1, [sp, #32]
+; F16-NATIVE-NEXT:    vshl.i16 d16, d16, #15
+; F16-NATIVE-NEXT:    ldrh r6, [sp, #24]
+; F16-NATIVE-NEXT:    ldrh r4, [sp, #20]
+; F16-NATIVE-NEXT:    orr r0, r1, lr, lsl #16
+; F16-NATIVE-NEXT:    orr r1, r6, r12, lsl #16
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #16]
+; F16-NATIVE-NEXT:    vshr.s16 d16, d16, #15
+; F16-NATIVE-NEXT:    vmov d17, r1, r0
+; F16-NATIVE-NEXT:    orr r0, r5, r4, lsl #16
+; F16-NATIVE-NEXT:    vmov d18, r2, r0
+; F16-NATIVE-NEXT:    veor d18, d18, d17
+; F16-NATIVE-NEXT:    vand d16, d18, d16
+; F16-NATIVE-NEXT:    veor d16, d17, d16
+; F16-NATIVE-NEXT:    vmov.u16 r0, d16[0]
+; F16-NATIVE-NEXT:    vmov.u16 r1, d16[1]
+; F16-NATIVE-NEXT:    vmov.u16 r2, d16[2]
+; F16-NATIVE-NEXT:    vmov.u16 r3, d16[3]
+; F16-NATIVE-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB1-LABEL: ct_v4f16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #24]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r5, r1, #0
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r1, [sp, #28]
+; THUMB1-NEXT:    ands r1, r4
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    ldr r3, [sp, #32]
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    ldr r2, [sp, #16]
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    ldr r3, [sp, #20]
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v4f16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #16]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #20]
+; THUMB2-NEXT:    and.w r2, r3, lr
+; THUMB2-NEXT:    ldrh.w r3, [sp, #8]
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldrh.w r2, [sp, #24]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r2, r3
+; THUMB2-NEXT:    ldrh.w r3, [sp, #28]
+; THUMB2-NEXT:    and.w r12, r12, r3
+; THUMB2-NEXT:    ldrh.w r3, [sp, #12]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    orr.w r3, r3, r12
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <4 x half> @llvm.ct.select.v4f16(i1 %cond, <4 x half> %a, <4 x half> %b)
+  ret <4 x half> %sel
+}
+
+define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
+; CT-LABEL: ct_v4bf16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    .save {r4, r5, r6, lr}
+; CT-NEXT:    push {r4, r5, r6, lr}
+; CT-NEXT:    ldrh lr, [sp, #36]
+; CT-NEXT:    vdup.16 d16, r0
+; CT-NEXT:    ldrh r12, [sp, #28]
+; CT-NEXT:    pkhbt r2, r2, r3, lsl #16
+; CT-NEXT:    ldrh r1, [sp, #32]
+; CT-NEXT:    vshl.i16 d16, d16, #15
+; CT-NEXT:    ldrh r6, [sp, #24]
+; CT-NEXT:    ldrh r4, [sp, #20]
+; CT-NEXT:    orr r0, r1, lr, lsl #16
+; CT-NEXT:    orr r1, r6, r12, lsl #16
+; CT-NEXT:    ldrh r5, [sp, #16]
+; CT-NEXT:    vshr.s16 d16, d16, #15
+; CT-NEXT:    vmov d17, r1, r0
+; CT-NEXT:    orr r0, r5, r4, lsl #16
+; CT-NEXT:    vmov d18, r2, r0
+; CT-NEXT:    veor d18, d18, d17
+; CT-NEXT:    vand d16, d18, d16
+; CT-NEXT:    veor d16, d17, d16
+; CT-NEXT:    vmov.u16 r0, d16[0]
+; CT-NEXT:    vmov.u16 r1, d16[1]
+; CT-NEXT:    vmov.u16 r2, d16[2]
+; CT-NEXT:    vmov.u16 r3, d16[3]
+; CT-NEXT:    pop {r4, r5, r6, pc}
+;
+; BFLOAT-F16-NATIVE-LABEL: ct_v4bf16:
+; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
+; BFLOAT-F16-NATIVE-NEXT:    vldr d16, [sp]
+; BFLOAT-F16-NATIVE-NEXT:    vmov d17, r2, r3
+; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
+; BFLOAT-F16-NATIVE-NEXT:    rsb r1, r0, #0
+; BFLOAT-F16-NATIVE-NEXT:    vdup.32 d19, r1
+; BFLOAT-F16-NATIVE-NEXT:    vand d18, d17, d19
+; BFLOAT-F16-NATIVE-NEXT:    vbic d19, d16, d19
+; BFLOAT-F16-NATIVE-NEXT:    vorr d18, d18, d19
+; BFLOAT-F16-NATIVE-NEXT:    vmov r0, r1, d18
+; BFLOAT-F16-NATIVE-NEXT:    bx lr
+;
+; F16-NATIVE-LABEL: ct_v4bf16:
+; F16-NATIVE:       @ %bb.0: @ %entry
+; F16-NATIVE-NEXT:    .save {r4, r5, r6, lr}
+; F16-NATIVE-NEXT:    push {r4, r5, r6, lr}
+; F16-NATIVE-NEXT:    ldrh lr, [sp, #36]
+; F16-NATIVE-NEXT:    vdup.16 d16, r0
+; F16-NATIVE-NEXT:    ldrh r12, [sp, #28]
+; F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
+; F16-NATIVE-NEXT:    ldrh r1, [sp, #32]
+; F16-NATIVE-NEXT:    vshl.i16 d16, d16, #15
+; F16-NATIVE-NEXT:    ldrh r6, [sp, #24]
+; F16-NATIVE-NEXT:    ldrh r4, [sp, #20]
+; F16-NATIVE-NEXT:    orr r0, r1, lr, lsl #16
+; F16-NATIVE-NEXT:    orr r1, r6, r12, lsl #16
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #16]
+; F16-NATIVE-NEXT:    vshr.s16 d16, d16, #15
+; F16-NATIVE-NEXT:    vmov d17, r1, r0
+; F16-NATIVE-NEXT:    orr r0, r5, r4, lsl #16
+; F16-NATIVE-NEXT:    vmov d18, r2, r0
+; F16-NATIVE-NEXT:    veor d18, d18, d17
+; F16-NATIVE-NEXT:    vand d16, d18, d16
+; F16-NATIVE-NEXT:    veor d16, d17, d16
+; F16-NATIVE-NEXT:    vmov.u16 r0, d16[0]
+; F16-NATIVE-NEXT:    vmov.u16 r1, d16[1]
+; F16-NATIVE-NEXT:    vmov.u16 r2, d16[2]
+; F16-NATIVE-NEXT:    vmov.u16 r3, d16[3]
+; F16-NATIVE-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB1-LABEL: ct_v4bf16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #24]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r5, r1, #0
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r1, [sp, #28]
+; THUMB1-NEXT:    ands r1, r4
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    ldr r3, [sp, #32]
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    ldr r2, [sp, #16]
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    ldr r3, [sp, #20]
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v4bf16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #16]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #20]
+; THUMB2-NEXT:    and.w r2, r3, lr
+; THUMB2-NEXT:    ldrh.w r3, [sp, #8]
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldrh.w r2, [sp, #24]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r2, r3
+; THUMB2-NEXT:    ldrh.w r3, [sp, #28]
+; THUMB2-NEXT:    and.w r12, r12, r3
+; THUMB2-NEXT:    ldrh.w r3, [sp, #12]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    orr.w r3, r3, r12
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <4 x bfloat> @llvm.ct.select.v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
+  ret <4 x bfloat> %sel
+}
+
+define <8 x half> @ct_v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) {
+; CT-LABEL: ct_v8f16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    .save {r4, r5, r6, r7, r11, lr}
+; CT-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; CT-NEXT:    ldrh r5, [sp, #36]
+; CT-NEXT:    pkhbt r2, r2, r3, lsl #16
+; CT-NEXT:    ldrh r7, [sp, #32]
+; CT-NEXT:    vdup.8 d18, r1
+; CT-NEXT:    ldrh r1, [sp, #52]
+; CT-NEXT:    vmov.32 d16[0], r2
+; CT-NEXT:    orr r3, r7, r5, lsl #16
+; CT-NEXT:    ldrh r2, [sp, #48]
+; CT-NEXT:    ldrh r5, [sp, #68]
+; CT-NEXT:    vmovl.u8 q9, d18
+; CT-NEXT:    orr r1, r2, r1, lsl #16
+; CT-NEXT:    vmov.32 d17[0], r3
+; CT-NEXT:    ldrh r3, [sp, #64]
+; CT-NEXT:    ldrh r2, [sp, #28]
+; CT-NEXT:    vmov.32 d20[0], r1
+; CT-NEXT:    ldrh r1, [sp, #24]
+; CT-NEXT:    orr r3, r3, r5, lsl #16
+; CT-NEXT:    ldrh r5, [sp, #76]
+; CT-NEXT:    vshl.i16 q9, q9, #15
+; CT-NEXT:    vmov.32 d21[0], r3
+; CT-NEXT:    orr r1, r1, r2, lsl #16
+; CT-NEXT:    ldrh r3, [sp, #72]
+; CT-NEXT:    ldrh r4, [sp, #60]
+; CT-NEXT:    vmov.32 d16[1], r1
+; CT-NEXT:    orr r1, r3, r5, lsl #16
+; CT-NEXT:    ldrh r6, [sp, #56]
+; CT-NEXT:    ldrh r12, [sp, #44]
+; CT-NEXT:    vshr.s16 q9, q9, #15
+; CT-NEXT:    vmov.32 d21[1], r1
+; CT-NEXT:    orr r1, r6, r4, lsl #16
+; CT-NEXT:    ldrh lr, [sp, #40]
+; CT-NEXT:    vmov.32 d20[1], r1
+; CT-NEXT:    orr r1, lr, r12, lsl #16
+; CT-NEXT:    vmov.32 d17[1], r1
+; CT-NEXT:    veor q8, q8, q10
+; CT-NEXT:    vand q8, q8, q9
+; CT-NEXT:    veor q8, q10, q8
+; CT-NEXT:    vst1.64 {d16, d17}, [r0:128]
+; CT-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+;
+; BFLOAT-F16-NATIVE-LABEL: ct_v8f16:
+; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
+; BFLOAT-F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r11, lr}
+; BFLOAT-F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #36]
+; BFLOAT-F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r7, [sp, #32]
+; BFLOAT-F16-NATIVE-NEXT:    vdup.8 d18, r1
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r1, [sp, #52]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d16[0], r2
+; BFLOAT-F16-NATIVE-NEXT:    orr r3, r7, r5, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r2, [sp, #48]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #68]
+; BFLOAT-F16-NATIVE-NEXT:    vmovl.u8 q9, d18
+; BFLOAT-F16-NATIVE-NEXT:    orr r1, r2, r1, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d17[0], r3
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r3, [sp, #64]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r2, [sp, #28]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d20[0], r1
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r1, [sp, #24]
+; BFLOAT-F16-NATIVE-NEXT:    orr r3, r3, r5, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #76]
+; BFLOAT-F16-NATIVE-NEXT:    vshl.i16 q9, q9, #15
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d21[0], r3
+; BFLOAT-F16-NATIVE-NEXT:    orr r1, r1, r2, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r3, [sp, #72]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r4, [sp, #60]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d16[1], r1
+; BFLOAT-F16-NATIVE-NEXT:    orr r1, r3, r5, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r6, [sp, #56]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r12, [sp, #44]
+; BFLOAT-F16-NATIVE-NEXT:    vshr.s16 q9, q9, #15
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d21[1], r1
+; BFLOAT-F16-NATIVE-NEXT:    orr r1, r6, r4, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh lr, [sp, #40]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d20[1], r1
+; BFLOAT-F16-NATIVE-NEXT:    orr r1, lr, r12, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d17[1], r1
+; BFLOAT-F16-NATIVE-NEXT:    veor q8, q8, q10
+; BFLOAT-F16-NATIVE-NEXT:    vand q8, q8, q9
+; BFLOAT-F16-NATIVE-NEXT:    veor q8, q10, q8
+; BFLOAT-F16-NATIVE-NEXT:    vst1.64 {d16, d17}, [r0:128]
+; BFLOAT-F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+;
+; F16-NATIVE-LABEL: ct_v8f16:
+; F16-NATIVE:       @ %bb.0: @ %entry
+; F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r11, lr}
+; F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #36]
+; F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
+; F16-NATIVE-NEXT:    ldrh r7, [sp, #32]
+; F16-NATIVE-NEXT:    vdup.8 d18, r1
+; F16-NATIVE-NEXT:    ldrh r1, [sp, #52]
+; F16-NATIVE-NEXT:    vmov.32 d16[0], r2
+; F16-NATIVE-NEXT:    orr r3, r7, r5, lsl #16
+; F16-NATIVE-NEXT:    ldrh r2, [sp, #48]
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #68]
+; F16-NATIVE-NEXT:    vmovl.u8 q9, d18
+; F16-NATIVE-NEXT:    orr r1, r2, r1, lsl #16
+; F16-NATIVE-NEXT:    vmov.32 d17[0], r3
+; F16-NATIVE-NEXT:    ldrh r3, [sp, #64]
+; F16-NATIVE-NEXT:    ldrh r2, [sp, #28]
+; F16-NATIVE-NEXT:    vmov.32 d20[0], r1
+; F16-NATIVE-NEXT:    ldrh r1, [sp, #24]
+; F16-NATIVE-NEXT:    orr r3, r3, r5, lsl #16
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #76]
+; F16-NATIVE-NEXT:    vshl.i16 q9, q9, #15
+; F16-NATIVE-NEXT:    vmov.32 d21[0], r3
+; F16-NATIVE-NEXT:    orr r1, r1, r2, lsl #16
+; F16-NATIVE-NEXT:    ldrh r3, [sp, #72]
+; F16-NATIVE-NEXT:    ldrh r4, [sp, #60]
+; F16-NATIVE-NEXT:    vmov.32 d16[1], r1
+; F16-NATIVE-NEXT:    orr r1, r3, r5, lsl #16
+; F16-NATIVE-NEXT:    ldrh r6, [sp, #56]
+; F16-NATIVE-NEXT:    ldrh r12, [sp, #44]
+; F16-NATIVE-NEXT:    vshr.s16 q9, q9, #15
+; F16-NATIVE-NEXT:    vmov.32 d21[1], r1
+; F16-NATIVE-NEXT:    orr r1, r6, r4, lsl #16
+; F16-NATIVE-NEXT:    ldrh lr, [sp, #40]
+; F16-NATIVE-NEXT:    vmov.32 d20[1], r1
+; F16-NATIVE-NEXT:    orr r1, lr, r12, lsl #16
+; F16-NATIVE-NEXT:    vmov.32 d17[1], r1
+; F16-NATIVE-NEXT:    veor q8, q8, q10
+; F16-NATIVE-NEXT:    vand q8, q8, q9
+; F16-NATIVE-NEXT:    veor q8, q10, q8
+; F16-NATIVE-NEXT:    vst1.64 {d16, d17}, [r0:128]
+; F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+;
+; THUMB1-LABEL: ct_v8f16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r1
+; THUMB1-NEXT:    subs r1, r4, #1
+; THUMB1-NEXT:    ldr r5, [sp, #68]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    rsbs r4, r4, #0
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #14]
+; THUMB1-NEXT:    ldr r5, [sp, #64]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #32]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #12]
+; THUMB1-NEXT:    ldr r5, [sp, #60]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #10]
+; THUMB1-NEXT:    ldr r5, [sp, #56]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #24]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #8]
+; THUMB1-NEXT:    ldr r5, [sp, #52]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #20]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #6]
+; THUMB1-NEXT:    ldr r5, [sp, #48]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #16]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #4]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    orrs r3, r5
+; THUMB1-NEXT:    strh r3, [r0, #2]
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ands r3, r1
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    strh r2, [r0]
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v8f16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and lr, r1, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #68]
+; THUMB2-NEXT:    sub.w r12, lr, #1
+; THUMB2-NEXT:    ldrh.w r4, [sp, #36]
+; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r3, r1
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #32]
+; THUMB2-NEXT:    strh r5, [r0, #14]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #64]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #28]
+; THUMB2-NEXT:    strh r5, [r0, #12]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #60]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #24]
+; THUMB2-NEXT:    strh r5, [r0, #10]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #56]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
+; THUMB2-NEXT:    strh r5, [r0, #8]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #52]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #16]
+; THUMB2-NEXT:    strh r5, [r0, #6]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #48]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r1, r2
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    strh r5, [r0, #4]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #44]
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    strh r3, [r0, #2]
+; THUMB2-NEXT:    ldrh.w r3, [sp, #40]
+; THUMB2-NEXT:    and.w r3, r3, r12
+; THUMB2-NEXT:    orrs r1, r3
+; THUMB2-NEXT:    strh r1, [r0]
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %sel = call <8 x half> @llvm.ct.select.v8f16(i1 %cond, <8 x half> %a, <8 x half> %b)
+  ret <8 x half> %sel
+}
+
+define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
+; CT-LABEL: ct_v8bf16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    .save {r4, r5, r6, r7, r11, lr}
+; CT-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; CT-NEXT:    ldrh r5, [sp, #36]
+; CT-NEXT:    pkhbt r2, r2, r3, lsl #16
+; CT-NEXT:    ldrh r7, [sp, #32]
+; CT-NEXT:    vdup.8 d18, r1
+; CT-NEXT:    ldrh r1, [sp, #52]
+; CT-NEXT:    vmov.32 d16[0], r2
+; CT-NEXT:    orr r3, r7, r5, lsl #16
+; CT-NEXT:    ldrh r2, [sp, #48]
+; CT-NEXT:    ldrh r5, [sp, #68]
+; CT-NEXT:    vmovl.u8 q9, d18
+; CT-NEXT:    orr r1, r2, r1, lsl #16
+; CT-NEXT:    vmov.32 d17[0], r3
+; CT-NEXT:    ldrh r3, [sp, #64]
+; CT-NEXT:    ldrh r2, [sp, #28]
+; CT-NEXT:    vmov.32 d20[0], r1
+; CT-NEXT:    ldrh r1, [sp, #24]
+; CT-NEXT:    orr r3, r3, r5, lsl #16
+; CT-NEXT:    ldrh r5, [sp, #76]
+; CT-NEXT:    vshl.i16 q9, q9, #15
+; CT-NEXT:    vmov.32 d21[0], r3
+; CT-NEXT:    orr r1, r1, r2, lsl #16
+; CT-NEXT:    ldrh r3, [sp, #72]
+; CT-NEXT:    ldrh r4, [sp, #60]
+; CT-NEXT:    vmov.32 d16[1], r1
+; CT-NEXT:    orr r1, r3, r5, lsl #16
+; CT-NEXT:    ldrh r6, [sp, #56]
+; CT-NEXT:    ldrh r12, [sp, #44]
+; CT-NEXT:    vshr.s16 q9, q9, #15
+; CT-NEXT:    vmov.32 d21[1], r1
+; CT-NEXT:    orr r1, r6, r4, lsl #16
+; CT-NEXT:    ldrh lr, [sp, #40]
+; CT-NEXT:    vmov.32 d20[1], r1
+; CT-NEXT:    orr r1, lr, r12, lsl #16
+; CT-NEXT:    vmov.32 d17[1], r1
+; CT-NEXT:    veor q8, q8, q10
+; CT-NEXT:    vand q8, q8, q9
+; CT-NEXT:    veor q8, q10, q8
+; CT-NEXT:    vst1.64 {d16, d17}, [r0:128]
+; CT-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+;
+; BFLOAT-F16-NATIVE-LABEL: ct_v8bf16:
+; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
+; BFLOAT-F16-NATIVE-NEXT:    vldr d17, [sp]
+; BFLOAT-F16-NATIVE-NEXT:    add r1, sp, #8
+; BFLOAT-F16-NATIVE-NEXT:    vmov d16, r2, r3
+; BFLOAT-F16-NATIVE-NEXT:    vld1.64 {d18, d19}, [r1]
+; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
+; BFLOAT-F16-NATIVE-NEXT:    rsb r1, r0, #0
+; BFLOAT-F16-NATIVE-NEXT:    vdup.32 q11, r1
+; BFLOAT-F16-NATIVE-NEXT:    vand q10, q8, q11
+; BFLOAT-F16-NATIVE-NEXT:    vbic q11, q9, q11
+; BFLOAT-F16-NATIVE-NEXT:    vorr q10, q10, q11
+; BFLOAT-F16-NATIVE-NEXT:    vmov r0, r1, d20
+; BFLOAT-F16-NATIVE-NEXT:    vmov r2, r3, d21
+; BFLOAT-F16-NATIVE-NEXT:    bx lr
+;
+; F16-NATIVE-LABEL: ct_v8bf16:
+; F16-NATIVE:       @ %bb.0: @ %entry
+; F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r11, lr}
+; F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #36]
+; F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
+; F16-NATIVE-NEXT:    ldrh r7, [sp, #32]
+; F16-NATIVE-NEXT:    vdup.8 d18, r1
+; F16-NATIVE-NEXT:    ldrh r1, [sp, #52]
+; F16-NATIVE-NEXT:    vmov.32 d16[0], r2
+; F16-NATIVE-NEXT:    orr r3, r7, r5, lsl #16
+; F16-NATIVE-NEXT:    ldrh r2, [sp, #48]
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #68]
+; F16-NATIVE-NEXT:    vmovl.u8 q9, d18
+; F16-NATIVE-NEXT:    orr r1, r2, r1, lsl #16
+; F16-NATIVE-NEXT:    vmov.32 d17[0], r3
+; F16-NATIVE-NEXT:    ldrh r3, [sp, #64]
+; F16-NATIVE-NEXT:    ldrh r2, [sp, #28]
+; F16-NATIVE-NEXT:    vmov.32 d20[0], r1
+; F16-NATIVE-NEXT:    ldrh r1, [sp, #24]
+; F16-NATIVE-NEXT:    orr r3, r3, r5, lsl #16
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #76]
+; F16-NATIVE-NEXT:    vshl.i16 q9, q9, #15
+; F16-NATIVE-NEXT:    vmov.32 d21[0], r3
+; F16-NATIVE-NEXT:    orr r1, r1, r2, lsl #16
+; F16-NATIVE-NEXT:    ldrh r3, [sp, #72]
+; F16-NATIVE-NEXT:    ldrh r4, [sp, #60]
+; F16-NATIVE-NEXT:    vmov.32 d16[1], r1
+; F16-NATIVE-NEXT:    orr r1, r3, r5, lsl #16
+; F16-NATIVE-NEXT:    ldrh r6, [sp, #56]
+; F16-NATIVE-NEXT:    ldrh r12, [sp, #44]
+; F16-NATIVE-NEXT:    vshr.s16 q9, q9, #15
+; F16-NATIVE-NEXT:    vmov.32 d21[1], r1
+; F16-NATIVE-NEXT:    orr r1, r6, r4, lsl #16
+; F16-NATIVE-NEXT:    ldrh lr, [sp, #40]
+; F16-NATIVE-NEXT:    vmov.32 d20[1], r1
+; F16-NATIVE-NEXT:    orr r1, lr, r12, lsl #16
+; F16-NATIVE-NEXT:    vmov.32 d17[1], r1
+; F16-NATIVE-NEXT:    veor q8, q8, q10
+; F16-NATIVE-NEXT:    vand q8, q8, q9
+; F16-NATIVE-NEXT:    veor q8, q10, q8
+; F16-NATIVE-NEXT:    vst1.64 {d16, d17}, [r0:128]
+; F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+;
+; THUMB1-LABEL: ct_v8bf16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r1
+; THUMB1-NEXT:    subs r1, r4, #1
+; THUMB1-NEXT:    ldr r5, [sp, #68]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    rsbs r4, r4, #0
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #14]
+; THUMB1-NEXT:    ldr r5, [sp, #64]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #32]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #12]
+; THUMB1-NEXT:    ldr r5, [sp, #60]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #10]
+; THUMB1-NEXT:    ldr r5, [sp, #56]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #24]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #8]
+; THUMB1-NEXT:    ldr r5, [sp, #52]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #20]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #6]
+; THUMB1-NEXT:    ldr r5, [sp, #48]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #16]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #4]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    orrs r3, r5
+; THUMB1-NEXT:    strh r3, [r0, #2]
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ands r3, r1
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    strh r2, [r0]
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v8bf16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and lr, r1, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #68]
+; THUMB2-NEXT:    sub.w r12, lr, #1
+; THUMB2-NEXT:    ldrh.w r4, [sp, #36]
+; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r3, r1
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #32]
+; THUMB2-NEXT:    strh r5, [r0, #14]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #64]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #28]
+; THUMB2-NEXT:    strh r5, [r0, #12]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #60]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #24]
+; THUMB2-NEXT:    strh r5, [r0, #10]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #56]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
+; THUMB2-NEXT:    strh r5, [r0, #8]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #52]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #16]
+; THUMB2-NEXT:    strh r5, [r0, #6]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #48]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r1, r2
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    strh r5, [r0, #4]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #44]
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    strh r3, [r0, #2]
+; THUMB2-NEXT:    ldrh.w r3, [sp, #40]
+; THUMB2-NEXT:    and.w r3, r3, r12
+; THUMB2-NEXT:    orrs r1, r3
+; THUMB2-NEXT:    strh r1, [r0]
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %sel = call <8 x bfloat> @llvm.ct.select.v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <8 x bfloat> %sel
+}
diff --git a/llvm/test/CodeGen/ARM/ctselect-vector.ll b/llvm/test/CodeGen/ARM/ctselect-vector.ll
new file mode 100644
index 0000000000000..3a03ebccb05ac
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ctselect-vector.ll
@@ -0,0 +1,1839 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s
+; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s
+; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s
+; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s
+
+define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) {
+; CT-LABEL: ct_v8i8:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 d19, r1
+; CT-NEXT:    vand d18, d17, d19
+; CT-NEXT:    vbic d19, d16, d19
+; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    vmov r0, r1, d18
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v8i8:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and lr, r1, #1
+; DEFAULT-NEXT:    ldrb r1, [sp, #68]
+; DEFAULT-NEXT:    sub r12, lr, #1
+; DEFAULT-NEXT:    ldrb r4, [sp, #36]
+; DEFAULT-NEXT:    and r5, r1, r12
+; DEFAULT-NEXT:    rsb r1, lr, #0
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r3, r3, r1
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #32]
+; DEFAULT-NEXT:    strb r5, [r0, #7]
+; DEFAULT-NEXT:    ldrb r5, [sp, #64]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #28]
+; DEFAULT-NEXT:    strb r5, [r0, #6]
+; DEFAULT-NEXT:    ldrb r5, [sp, #60]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #24]
+; DEFAULT-NEXT:    strb r5, [r0, #5]
+; DEFAULT-NEXT:    ldrb r5, [sp, #56]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #20]
+; DEFAULT-NEXT:    strb r5, [r0, #4]
+; DEFAULT-NEXT:    ldrb r5, [sp, #52]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #16]
+; DEFAULT-NEXT:    strb r5, [r0, #3]
+; DEFAULT-NEXT:    ldrb r5, [sp, #48]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r1, r2, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    strb r5, [r0, #2]
+; DEFAULT-NEXT:    ldrb r5, [sp, #44]
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r3, r3, r5
+; DEFAULT-NEXT:    strb r3, [r0, #1]
+; DEFAULT-NEXT:    ldrb r3, [sp, #40]
+; DEFAULT-NEXT:    and r3, r3, r12
+; DEFAULT-NEXT:    orr r1, r1, r3
+; DEFAULT-NEXT:    strb r1, [r0]
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
+;
+; THUMB1-LABEL: ct_v8i8:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r1
+; THUMB1-NEXT:    subs r1, r4, #1
+; THUMB1-NEXT:    ldr r5, [sp, #68]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    rsbs r4, r4, #0
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #7]
+; THUMB1-NEXT:    ldr r5, [sp, #64]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #32]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #6]
+; THUMB1-NEXT:    ldr r5, [sp, #60]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #5]
+; THUMB1-NEXT:    ldr r5, [sp, #56]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #24]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #4]
+; THUMB1-NEXT:    ldr r5, [sp, #52]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #20]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #3]
+; THUMB1-NEXT:    ldr r5, [sp, #48]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #16]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #2]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    orrs r3, r5
+; THUMB1-NEXT:    strb r3, [r0, #1]
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ands r3, r1
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    strb r2, [r0]
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v8i8:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and lr, r1, #1
+; THUMB2-NEXT:    ldrb.w r1, [sp, #68]
+; THUMB2-NEXT:    sub.w r12, lr, #1
+; THUMB2-NEXT:    ldrb.w r4, [sp, #36]
+; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r3, r1
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #32]
+; THUMB2-NEXT:    strb r5, [r0, #7]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #64]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #28]
+; THUMB2-NEXT:    strb r5, [r0, #6]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #60]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #24]
+; THUMB2-NEXT:    strb r5, [r0, #5]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #56]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #20]
+; THUMB2-NEXT:    strb r5, [r0, #4]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #52]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #16]
+; THUMB2-NEXT:    strb r5, [r0, #3]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #48]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r1, r2
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    strb r5, [r0, #2]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #44]
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    strb r3, [r0, #1]
+; THUMB2-NEXT:    ldrb.w r3, [sp, #40]
+; THUMB2-NEXT:    and.w r3, r3, r12
+; THUMB2-NEXT:    orrs r1, r3
+; THUMB2-NEXT:    strb r1, [r0]
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b)
+  ret <8 x i8> %sel
+}
+
+define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) {
+; CT-LABEL: ct_v4i16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 d19, r1
+; CT-NEXT:    vand d18, d17, d19
+; CT-NEXT:    vbic d19, d16, d19
+; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    vmov r0, r1, d18
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v4i16:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldrh r1, [sp, #16]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    rsb lr, r0, #0
+; DEFAULT-NEXT:    and r0, r2, lr
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r0, r0, r1
+; DEFAULT-NEXT:    ldrh r1, [sp, #20]
+; DEFAULT-NEXT:    and r2, r3, lr
+; DEFAULT-NEXT:    ldrh r3, [sp, #8]
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    ldrh r2, [sp, #24]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r2, r3, r2
+; DEFAULT-NEXT:    ldrh r3, [sp, #28]
+; DEFAULT-NEXT:    and r12, r3, r12
+; DEFAULT-NEXT:    ldrh r3, [sp, #12]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    orr r3, r3, r12
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v4i16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #24]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r5, r1, #0
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r1, [sp, #28]
+; THUMB1-NEXT:    ands r1, r4
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    ldr r3, [sp, #32]
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    ldr r2, [sp, #16]
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    ldr r3, [sp, #20]
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v4i16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #16]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #20]
+; THUMB2-NEXT:    and.w r2, r3, lr
+; THUMB2-NEXT:    ldrh.w r3, [sp, #8]
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldrh.w r2, [sp, #24]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r2, r3
+; THUMB2-NEXT:    ldrh.w r3, [sp, #28]
+; THUMB2-NEXT:    and.w r12, r12, r3
+; THUMB2-NEXT:    ldrh.w r3, [sp, #12]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    orr.w r3, r3, r12
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b)
+  ret <4 x i16> %sel
+}
+
+define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) {
+; CT-LABEL: ct_v2i32:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 d19, r1
+; CT-NEXT:    vand d18, d17, d19
+; CT-NEXT:    vbic d19, d16, d19
+; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    vmov r0, r1, d18
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v2i32:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #8]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    and lr, r1, r12
+; DEFAULT-NEXT:    rsb r1, r0, #0
+; DEFAULT-NEXT:    and r0, r2, r1
+; DEFAULT-NEXT:    ldr r2, [sp, #12]
+; DEFAULT-NEXT:    and r1, r3, r1
+; DEFAULT-NEXT:    orr r0, r0, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r1, r1, r2
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v2i32:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #8]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r1, r1, #0
+; THUMB1-NEXT:    ands r2, r1
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r2, [sp, #12]
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    ands r1, r3
+; THUMB1-NEXT:    orrs r1, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v2i32:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #8]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    and.w lr, r1, r12
+; THUMB2-NEXT:    rsbs r1, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, r1
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    ands r1, r3
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b)
+  ret <2 x i32> %sel
+}
+
+define <1 x i64> @ct_v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) {
+; CT-LABEL: ct_v1i64:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 d19, r1
+; CT-NEXT:    vand d18, d17, d19
+; CT-NEXT:    vbic d19, d16, d19
+; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    vmov r0, r1, d18
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v1i64:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #8]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    and lr, r1, r12
+; DEFAULT-NEXT:    rsb r1, r0, #0
+; DEFAULT-NEXT:    and r0, r2, r1
+; DEFAULT-NEXT:    ldr r2, [sp, #12]
+; DEFAULT-NEXT:    and r1, r3, r1
+; DEFAULT-NEXT:    orr r0, r0, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r1, r1, r2
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v1i64:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #8]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r1, r1, #0
+; THUMB1-NEXT:    ands r2, r1
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r2, [sp, #12]
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    ands r1, r3
+; THUMB1-NEXT:    orrs r1, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v1i64:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #8]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    and.w lr, r1, r12
+; THUMB2-NEXT:    rsbs r1, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, r1
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    ands r1, r3
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b)
+  ret <1 x i64> %sel
+}
+
+define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) {
+; CT-LABEL: ct_v2f32:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 d19, r1
+; CT-NEXT:    vand d18, d17, d19
+; CT-NEXT:    vbic d19, d16, d19
+; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    vmov r0, r1, d18
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v2f32:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #8]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    and lr, r1, r12
+; DEFAULT-NEXT:    rsb r1, r0, #0
+; DEFAULT-NEXT:    and r0, r2, r1
+; DEFAULT-NEXT:    ldr r2, [sp, #12]
+; DEFAULT-NEXT:    and r1, r3, r1
+; DEFAULT-NEXT:    orr r0, r0, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r1, r1, r2
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v2f32:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #8]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r1, r1, #0
+; THUMB1-NEXT:    ands r2, r1
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r2, [sp, #12]
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    ands r1, r3
+; THUMB1-NEXT:    orrs r1, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v2f32:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #8]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    and.w lr, r1, r12
+; THUMB2-NEXT:    rsbs r1, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, r1
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    ands r1, r3
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b)
+  ret <2 x float> %sel
+}
+
+define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
+; CT-LABEL: ct_v16i8:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    add r1, sp, #8
+; CT-NEXT:    vmov d16, r2, r3
+; CT-NEXT:    vld1.64 {d18, d19}, [r1]
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 q11, r1
+; CT-NEXT:    vand q10, q8, q11
+; CT-NEXT:    vbic q11, q9, q11
+; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    vmov r0, r1, d20
+; CT-NEXT:    vmov r2, r3, d21
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v16i8:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and lr, r1, #1
+; DEFAULT-NEXT:    ldrb r1, [sp, #132]
+; DEFAULT-NEXT:    sub r12, lr, #1
+; DEFAULT-NEXT:    ldrb r4, [sp, #68]
+; DEFAULT-NEXT:    and r5, r1, r12
+; DEFAULT-NEXT:    rsb r1, lr, #0
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r3, r3, r1
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #64]
+; DEFAULT-NEXT:    strb r5, [r0, #15]
+; DEFAULT-NEXT:    ldrb r5, [sp, #128]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #60]
+; DEFAULT-NEXT:    strb r5, [r0, #14]
+; DEFAULT-NEXT:    ldrb r5, [sp, #124]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #56]
+; DEFAULT-NEXT:    strb r5, [r0, #13]
+; DEFAULT-NEXT:    ldrb r5, [sp, #120]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #52]
+; DEFAULT-NEXT:    strb r5, [r0, #12]
+; DEFAULT-NEXT:    ldrb r5, [sp, #116]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #48]
+; DEFAULT-NEXT:    strb r5, [r0, #11]
+; DEFAULT-NEXT:    ldrb r5, [sp, #112]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #44]
+; DEFAULT-NEXT:    strb r5, [r0, #10]
+; DEFAULT-NEXT:    ldrb r5, [sp, #108]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #40]
+; DEFAULT-NEXT:    strb r5, [r0, #9]
+; DEFAULT-NEXT:    ldrb r5, [sp, #104]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #36]
+; DEFAULT-NEXT:    strb r5, [r0, #8]
+; DEFAULT-NEXT:    ldrb r5, [sp, #100]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #32]
+; DEFAULT-NEXT:    strb r5, [r0, #7]
+; DEFAULT-NEXT:    ldrb r5, [sp, #96]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #28]
+; DEFAULT-NEXT:    strb r5, [r0, #6]
+; DEFAULT-NEXT:    ldrb r5, [sp, #92]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #24]
+; DEFAULT-NEXT:    strb r5, [r0, #5]
+; DEFAULT-NEXT:    ldrb r5, [sp, #88]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #20]
+; DEFAULT-NEXT:    strb r5, [r0, #4]
+; DEFAULT-NEXT:    ldrb r5, [sp, #84]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrb r4, [sp, #16]
+; DEFAULT-NEXT:    strb r5, [r0, #3]
+; DEFAULT-NEXT:    ldrb r5, [sp, #80]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r1, r2, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    strb r5, [r0, #2]
+; DEFAULT-NEXT:    ldrb r5, [sp, #76]
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r3, r3, r5
+; DEFAULT-NEXT:    strb r3, [r0, #1]
+; DEFAULT-NEXT:    ldrb r3, [sp, #72]
+; DEFAULT-NEXT:    and r3, r3, r12
+; DEFAULT-NEXT:    orr r1, r1, r3
+; DEFAULT-NEXT:    strb r1, [r0]
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
+;
+; THUMB1-LABEL: ct_v16i8:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r1
+; THUMB1-NEXT:    subs r1, r4, #1
+; THUMB1-NEXT:    ldr r5, [sp, #132]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    rsbs r4, r4, #0
+; THUMB1-NEXT:    ldr r6, [sp, #68]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #15]
+; THUMB1-NEXT:    ldr r5, [sp, #128]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #64]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #14]
+; THUMB1-NEXT:    ldr r5, [sp, #124]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #60]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #13]
+; THUMB1-NEXT:    ldr r5, [sp, #120]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #56]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #12]
+; THUMB1-NEXT:    ldr r5, [sp, #116]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #52]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #11]
+; THUMB1-NEXT:    ldr r5, [sp, #112]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #48]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #10]
+; THUMB1-NEXT:    ldr r5, [sp, #108]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #44]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #9]
+; THUMB1-NEXT:    ldr r5, [sp, #104]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #40]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #8]
+; THUMB1-NEXT:    ldr r5, [sp, #100]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #7]
+; THUMB1-NEXT:    ldr r5, [sp, #96]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #32]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #6]
+; THUMB1-NEXT:    ldr r5, [sp, #92]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #5]
+; THUMB1-NEXT:    ldr r5, [sp, #88]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #24]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #4]
+; THUMB1-NEXT:    ldr r5, [sp, #84]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #20]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #3]
+; THUMB1-NEXT:    ldr r5, [sp, #80]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #16]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strb r6, [r0, #2]
+; THUMB1-NEXT:    ldr r5, [sp, #76]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    orrs r3, r5
+; THUMB1-NEXT:    strb r3, [r0, #1]
+; THUMB1-NEXT:    ldr r3, [sp, #72]
+; THUMB1-NEXT:    ands r3, r1
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    strb r2, [r0]
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v16i8:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and lr, r1, #1
+; THUMB2-NEXT:    ldrb.w r1, [sp, #132]
+; THUMB2-NEXT:    sub.w r12, lr, #1
+; THUMB2-NEXT:    ldrb.w r4, [sp, #68]
+; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r3, r1
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #64]
+; THUMB2-NEXT:    strb r5, [r0, #15]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #128]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #60]
+; THUMB2-NEXT:    strb r5, [r0, #14]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #124]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #56]
+; THUMB2-NEXT:    strb r5, [r0, #13]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #120]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #52]
+; THUMB2-NEXT:    strb r5, [r0, #12]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #116]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #48]
+; THUMB2-NEXT:    strb r5, [r0, #11]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #112]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #44]
+; THUMB2-NEXT:    strb r5, [r0, #10]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #108]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #40]
+; THUMB2-NEXT:    strb r5, [r0, #9]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #104]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #36]
+; THUMB2-NEXT:    strb r5, [r0, #8]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #100]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #32]
+; THUMB2-NEXT:    strb r5, [r0, #7]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #96]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #28]
+; THUMB2-NEXT:    strb r5, [r0, #6]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #92]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #24]
+; THUMB2-NEXT:    strb r5, [r0, #5]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #88]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #20]
+; THUMB2-NEXT:    strb r5, [r0, #4]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #84]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #16]
+; THUMB2-NEXT:    strb r5, [r0, #3]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #80]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r1, r2
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    strb r5, [r0, #2]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #76]
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    strb r3, [r0, #1]
+; THUMB2-NEXT:    ldrb.w r3, [sp, #72]
+; THUMB2-NEXT:    and.w r3, r3, r12
+; THUMB2-NEXT:    orrs r1, r3
+; THUMB2-NEXT:    strb r1, [r0]
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %sel = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b)
+  ret <16 x i8> %sel
+}
+
+; Constant-time select between two <8 x i16> vectors, keyed on a single i1.
+; The CT checks expect one full-width NEON select: the condition bit is negated
+; into an all-ones/all-zeros word, splatted with vdup.32, and the inputs are
+; combined with vand/vbic/vorr. The DEFAULT, THUMB1 and THUMB2 checks contain no
+; vector instructions: each 16-bit lane is built in core registers by masking
+; one input with -cond and the other with (cond - 1), OR-ing the two, and
+; storing the lane with strh at an offset from r0.
+define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
+; CT-LABEL: ct_v8i16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    add r1, sp, #8
+; CT-NEXT:    vmov d16, r2, r3
+; CT-NEXT:    vld1.64 {d18, d19}, [r1]
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 q11, r1
+; CT-NEXT:    vand q10, q8, q11
+; CT-NEXT:    vbic q11, q9, q11
+; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    vmov r0, r1, d20
+; CT-NEXT:    vmov r2, r3, d21
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v8i16:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and lr, r1, #1
+; DEFAULT-NEXT:    ldrh r1, [sp, #68]
+; DEFAULT-NEXT:    sub r12, lr, #1
+; DEFAULT-NEXT:    ldrh r4, [sp, #36]
+; DEFAULT-NEXT:    and r5, r1, r12
+; DEFAULT-NEXT:    rsb r1, lr, #0
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r3, r3, r1
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrh r4, [sp, #32]
+; DEFAULT-NEXT:    strh r5, [r0, #14]
+; DEFAULT-NEXT:    ldrh r5, [sp, #64]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrh r4, [sp, #28]
+; DEFAULT-NEXT:    strh r5, [r0, #12]
+; DEFAULT-NEXT:    ldrh r5, [sp, #60]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrh r4, [sp, #24]
+; DEFAULT-NEXT:    strh r5, [r0, #10]
+; DEFAULT-NEXT:    ldrh r5, [sp, #56]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrh r4, [sp, #20]
+; DEFAULT-NEXT:    strh r5, [r0, #8]
+; DEFAULT-NEXT:    ldrh r5, [sp, #52]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    ldrh r4, [sp, #16]
+; DEFAULT-NEXT:    strh r5, [r0, #6]
+; DEFAULT-NEXT:    ldrh r5, [sp, #48]
+; DEFAULT-NEXT:    and r4, r4, r1
+; DEFAULT-NEXT:    and r1, r2, r1
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r5, r4, r5
+; DEFAULT-NEXT:    strh r5, [r0, #4]
+; DEFAULT-NEXT:    ldrh r5, [sp, #44]
+; DEFAULT-NEXT:    and r5, r5, r12
+; DEFAULT-NEXT:    orr r3, r3, r5
+; DEFAULT-NEXT:    strh r3, [r0, #2]
+; DEFAULT-NEXT:    ldrh r3, [sp, #40]
+; DEFAULT-NEXT:    and r3, r3, r12
+; DEFAULT-NEXT:    orr r1, r1, r3
+; DEFAULT-NEXT:    strh r1, [r0]
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
+;
+; THUMB1-LABEL: ct_v8i16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r1
+; THUMB1-NEXT:    subs r1, r4, #1
+; THUMB1-NEXT:    ldr r5, [sp, #68]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    rsbs r4, r4, #0
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #14]
+; THUMB1-NEXT:    ldr r5, [sp, #64]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #32]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #12]
+; THUMB1-NEXT:    ldr r5, [sp, #60]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #10]
+; THUMB1-NEXT:    ldr r5, [sp, #56]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #24]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #8]
+; THUMB1-NEXT:    ldr r5, [sp, #52]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #20]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #6]
+; THUMB1-NEXT:    ldr r5, [sp, #48]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ldr r6, [sp, #16]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    strh r6, [r0, #4]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ands r5, r1
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    orrs r3, r5
+; THUMB1-NEXT:    strh r3, [r0, #2]
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ands r3, r1
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    strh r2, [r0]
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v8i16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and lr, r1, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #68]
+; THUMB2-NEXT:    sub.w r12, lr, #1
+; THUMB2-NEXT:    ldrh.w r4, [sp, #36]
+; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r3, r1
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #32]
+; THUMB2-NEXT:    strh r5, [r0, #14]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #64]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #28]
+; THUMB2-NEXT:    strh r5, [r0, #12]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #60]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #24]
+; THUMB2-NEXT:    strh r5, [r0, #10]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #56]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
+; THUMB2-NEXT:    strh r5, [r0, #8]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #52]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    ldrh.w r4, [sp, #16]
+; THUMB2-NEXT:    strh r5, [r0, #6]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #48]
+; THUMB2-NEXT:    ands r4, r1
+; THUMB2-NEXT:    ands r1, r2
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r5, r4
+; THUMB2-NEXT:    strh r5, [r0, #4]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #44]
+; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    strh r3, [r0, #2]
+; THUMB2-NEXT:    ldrh.w r3, [sp, #40]
+; THUMB2-NEXT:    and.w r3, r3, r12
+; THUMB2-NEXT:    orrs r1, r3
+; THUMB2-NEXT:    strh r1, [r0]
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %sel = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %sel
+}
+
+; Constant-time select between two <4 x i32> vectors, keyed on a single i1.
+; The CT checks expect one full-width NEON select: the condition bit is negated
+; into an all-ones/all-zeros word, splatted with vdup.32, and the inputs are
+; combined with vand/vbic/vorr. The DEFAULT, THUMB1 and THUMB2 checks contain no
+; vector instructions: four 32-bit words are each formed in core registers by
+; masking one input with -cond and the other with (cond - 1) and OR-ing them,
+; with the results left in r0-r3.
+define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; CT-LABEL: ct_v4i32:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    add r1, sp, #8
+; CT-NEXT:    vmov d16, r2, r3
+; CT-NEXT:    vld1.64 {d18, d19}, [r1]
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 q11, r1
+; CT-NEXT:    vand q10, q8, q11
+; CT-NEXT:    vbic q11, q9, q11
+; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    vmov r0, r1, d20
+; CT-NEXT:    vmov r2, r3, d21
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v4i32:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #16]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    rsb lr, r0, #0
+; DEFAULT-NEXT:    and r0, r2, lr
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r0, r0, r1
+; DEFAULT-NEXT:    ldr r1, [sp, #20]
+; DEFAULT-NEXT:    and r2, r3, lr
+; DEFAULT-NEXT:    ldr r3, [sp, #8]
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    ldr r2, [sp, #24]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r2, r3, r2
+; DEFAULT-NEXT:    ldr r3, [sp, #28]
+; DEFAULT-NEXT:    and r12, r3, r12
+; DEFAULT-NEXT:    ldr r3, [sp, #12]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    orr r3, r3, r12
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v4i32:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #24]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r5, r1, #0
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r1, [sp, #28]
+; THUMB1-NEXT:    ands r1, r4
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    ldr r3, [sp, #32]
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    ldr r2, [sp, #16]
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    ldr r3, [sp, #20]
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v4i32:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #16]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldr r1, [sp, #20]
+; THUMB2-NEXT:    and.w r2, r3, lr
+; THUMB2-NEXT:    ldr r3, [sp, #8]
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldr r2, [sp, #24]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r2, r3
+; THUMB2-NEXT:    ldr r3, [sp, #28]
+; THUMB2-NEXT:    and.w r12, r12, r3
+; THUMB2-NEXT:    ldr r3, [sp, #12]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    orr.w r3, r3, r12
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %sel
+}
+
+; Constant-time select between two <2 x i64> vectors, keyed on a single i1.
+; The CT checks expect one full-width NEON select: the condition bit is negated
+; into an all-ones/all-zeros word, splatted with vdup.32, and the inputs are
+; combined with vand/vbic/vorr. The DEFAULT, THUMB1 and THUMB2 checks contain no
+; vector instructions: the 128-bit value is handled as four 32-bit words, each
+; formed in core registers by masking one input with -cond and the other with
+; (cond - 1) and OR-ing them, with the results left in r0-r3.
+define <2 x i64> @ct_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+; CT-LABEL: ct_v2i64:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    add r1, sp, #8
+; CT-NEXT:    vmov d16, r2, r3
+; CT-NEXT:    vld1.64 {d18, d19}, [r1]
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 q11, r1
+; CT-NEXT:    vand q10, q8, q11
+; CT-NEXT:    vbic q11, q9, q11
+; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    vmov r0, r1, d20
+; CT-NEXT:    vmov r2, r3, d21
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v2i64:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #16]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    rsb lr, r0, #0
+; DEFAULT-NEXT:    and r0, r2, lr
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r0, r0, r1
+; DEFAULT-NEXT:    ldr r1, [sp, #20]
+; DEFAULT-NEXT:    and r2, r3, lr
+; DEFAULT-NEXT:    ldr r3, [sp, #8]
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    ldr r2, [sp, #24]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r2, r3, r2
+; DEFAULT-NEXT:    ldr r3, [sp, #28]
+; DEFAULT-NEXT:    and r12, r3, r12
+; DEFAULT-NEXT:    ldr r3, [sp, #12]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    orr r3, r3, r12
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v2i64:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #24]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r5, r1, #0
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r1, [sp, #28]
+; THUMB1-NEXT:    ands r1, r4
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    ldr r3, [sp, #32]
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    ldr r2, [sp, #16]
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    ldr r3, [sp, #20]
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v2i64:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #16]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldr r1, [sp, #20]
+; THUMB2-NEXT:    and.w r2, r3, lr
+; THUMB2-NEXT:    ldr r3, [sp, #8]
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldr r2, [sp, #24]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r2, r3
+; THUMB2-NEXT:    ldr r3, [sp, #28]
+; THUMB2-NEXT:    and.w r12, r12, r3
+; THUMB2-NEXT:    ldr r3, [sp, #12]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    orr.w r3, r3, r12
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %sel
+}
+
+; Constant-time select between two <4 x float> vectors, keyed on a single i1.
+; The CT checks expect one full-width NEON select done with bitwise
+; instructions only: the condition bit is negated into an all-ones/all-zeros
+; word, splatted with vdup.32, and the inputs are combined with vand/vbic/vorr.
+; The DEFAULT, THUMB1 and THUMB2 checks contain no vector or floating-point
+; instructions: four 32-bit words are each formed in core registers by masking
+; one input with -cond and the other with (cond - 1) and OR-ing them, with the
+; results left in r0-r3.
+define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; CT-LABEL: ct_v4f32:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    add r1, sp, #8
+; CT-NEXT:    vmov d16, r2, r3
+; CT-NEXT:    vld1.64 {d18, d19}, [r1]
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 q11, r1
+; CT-NEXT:    vand q10, q8, q11
+; CT-NEXT:    vbic q11, q9, q11
+; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    vmov r0, r1, d20
+; CT-NEXT:    vmov r2, r3, d21
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v4f32:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #16]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    rsb lr, r0, #0
+; DEFAULT-NEXT:    and r0, r2, lr
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r0, r0, r1
+; DEFAULT-NEXT:    ldr r1, [sp, #20]
+; DEFAULT-NEXT:    and r2, r3, lr
+; DEFAULT-NEXT:    ldr r3, [sp, #8]
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    ldr r2, [sp, #24]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r2, r3, r2
+; DEFAULT-NEXT:    ldr r3, [sp, #28]
+; DEFAULT-NEXT:    and r12, r3, r12
+; DEFAULT-NEXT:    ldr r3, [sp, #12]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    orr r3, r3, r12
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v4f32:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #24]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r5, r1, #0
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r1, [sp, #28]
+; THUMB1-NEXT:    ands r1, r4
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    ldr r3, [sp, #32]
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    ldr r2, [sp, #16]
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    ldr r3, [sp, #20]
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v4f32:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #16]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldr r1, [sp, #20]
+; THUMB2-NEXT:    and.w r2, r3, lr
+; THUMB2-NEXT:    ldr r3, [sp, #8]
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldr r2, [sp, #24]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r2, r3
+; THUMB2-NEXT:    ldr r3, [sp, #28]
+; THUMB2-NEXT:    and.w r12, r12, r3
+; THUMB2-NEXT:    ldr r3, [sp, #12]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    orr.w r3, r3, r12
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+  ret <4 x float> %sel
+}
+
+; Constant-time select between two <2 x double> vectors, keyed on a single i1.
+; The CT checks expect one full-width NEON select done with bitwise
+; instructions only: the condition bit is negated into an all-ones/all-zeros
+; word, splatted with vdup.32, and the inputs are combined with vand/vbic/vorr.
+; The DEFAULT, THUMB1 and THUMB2 checks contain no vector or floating-point
+; instructions: the 128-bit value is handled as four 32-bit words, each formed
+; in core registers by masking one input with -cond and the other with
+; (cond - 1) and OR-ing them, with the results left in r0-r3.
+define <2 x double> @ct_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
+; CT-LABEL: ct_v2f64:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    add r1, sp, #8
+; CT-NEXT:    vmov d16, r2, r3
+; CT-NEXT:    vld1.64 {d18, d19}, [r1]
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 q11, r1
+; CT-NEXT:    vand q10, q8, q11
+; CT-NEXT:    vbic q11, q9, q11
+; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    vmov r0, r1, d20
+; CT-NEXT:    vmov r2, r3, d21
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v2f64:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #16]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    rsb lr, r0, #0
+; DEFAULT-NEXT:    and r0, r2, lr
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r0, r0, r1
+; DEFAULT-NEXT:    ldr r1, [sp, #20]
+; DEFAULT-NEXT:    and r2, r3, lr
+; DEFAULT-NEXT:    ldr r3, [sp, #8]
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    ldr r2, [sp, #24]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r2, r3, r2
+; DEFAULT-NEXT:    ldr r3, [sp, #28]
+; DEFAULT-NEXT:    and r12, r3, r12
+; DEFAULT-NEXT:    ldr r3, [sp, #12]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    orr r3, r3, r12
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v2f64:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #24]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r5, r1, #0
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r1, [sp, #28]
+; THUMB1-NEXT:    ands r1, r4
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    ldr r3, [sp, #32]
+; THUMB1-NEXT:    ands r3, r4
+; THUMB1-NEXT:    ldr r2, [sp, #16]
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    ldr r6, [sp, #36]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    ldr r3, [sp, #20]
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v2f64:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #16]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldr r1, [sp, #20]
+; THUMB2-NEXT:    and.w r2, r3, lr
+; THUMB2-NEXT:    ldr r3, [sp, #8]
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldr r2, [sp, #24]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r2, r3
+; THUMB2-NEXT:    ldr r3, [sp, #28]
+; THUMB2-NEXT:    and.w r12, r12, r3
+; THUMB2-NEXT:    ldr r3, [sp, #12]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    orr.w r3, r3, r12
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b)
+  ret <2 x double> %sel
+}
+
+;
+; Tiny (sub-64-bit) vector type edge cases follow. These should be scalarised.
+; NOTE(review): the CT checks for v2i8 and v4i8 show a NEON vbsl, not scalar code.
+;
+; Constant-time select between two <1 x i8> vectors, keyed on a single i1.
+; All four check prefixes expect purely scalar code: the condition bit is
+; isolated, one input is masked with -cond and the other with (cond - 1), and
+; the two are OR-ed into r0. The intrinsic is called with the overload suffix
+; that matches its <1 x i8> operand type (.v1i8).
+define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) {
+; CT-LABEL: ct_v1i8:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    sub r3, r0, #1
+; CT-NEXT:    rsb r0, r0, #0
+; CT-NEXT:    and r2, r2, r3
+; CT-NEXT:    and r0, r1, r0
+; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v1i8:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    sub r3, r0, #1
+; DEFAULT-NEXT:    rsb r0, r0, #0
+; DEFAULT-NEXT:    and r2, r2, r3
+; DEFAULT-NEXT:    and r0, r1, r0
+; DEFAULT-NEXT:    orr r0, r0, r2
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_v1i8:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    subs r4, r3, #1
+; THUMB1-NEXT:    ands r4, r2
+; THUMB1-NEXT:    rsbs r0, r3, #0
+; THUMB1-NEXT:    ands r0, r1
+; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v1i8:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    subs r3, r0, #1
+; THUMB2-NEXT:    rsbs r0, r0, #0
+; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    ands r0, r1
+; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    bx lr
+entry:
+  %sel = call <1 x i8> @llvm.ct.select.v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b)
+  ret <1 x i8> %sel
+}
+
+; Constant-time select between two <2 x i8> vectors, keyed on a single i1.
+; The CT checks expect a D-register select: the condition is splatted with
+; vdup.32, turned into a per-lane all-ones/all-zeros mask with
+; vshl.i32 #31 / vshr.s32 #31, and applied with vbsl. The DEFAULT, THUMB1 and
+; THUMB2 checks expect two scalar mask selects (-cond and cond - 1 masks,
+; and/orr) with the results left in r0 and r1. The intrinsic is called with the
+; overload suffix that matches its <2 x i8> operand type (.v2i8).
+define <2 x i8> @ct_v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b) {
+; CT-LABEL: ct_v2i8:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vdup.32 d16, r0
+; CT-NEXT:    vmov d18, r2, r3
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    vshl.i32 d16, d16, #31
+; CT-NEXT:    vshr.s32 d16, d16, #31
+; CT-NEXT:    vbsl d16, d18, d17
+; CT-NEXT:    vmov r0, r1, d16
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v2i8:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    and lr, r3, r12
+; DEFAULT-NEXT:    rsb r3, r0, #0
+; DEFAULT-NEXT:    and r0, r1, r3
+; DEFAULT-NEXT:    ldrb r1, [sp, #8]
+; DEFAULT-NEXT:    and r2, r2, r3
+; DEFAULT-NEXT:    orr r0, r0, lr
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v2i8:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    mov r4, r1
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r0, r1, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    rsbs r1, r1, #0
+; THUMB1-NEXT:    ands r4, r1
+; THUMB1-NEXT:    orrs r4, r3
+; THUMB1-NEXT:    ldr r3, [sp, #8]
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    ands r1, r2
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    mov r0, r4
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v2i8:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    and.w lr, r3, r12
+; THUMB2-NEXT:    rsbs r3, r0, #0
+; THUMB2-NEXT:    and.w r0, r1, r3
+; THUMB2-NEXT:    ldrb.w r1, [sp, #8]
+; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <2 x i8> @llvm.ct.select.v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b)
+  ret <2 x i8> %sel
+}
+
+; Constant-time select between two <4 x i8> vectors, keyed on a single i1.
+; The CT checks expect a D-register select: the condition is splatted with
+; vdup.16, turned into a per-lane all-ones/all-zeros mask with
+; vshl.i16 #15 / vshr.s16 #15, and applied with vbsl. The DEFAULT, THUMB1 and
+; THUMB2 checks expect four scalar mask selects (-cond and cond - 1 masks,
+; and/orr) with the results left in r0-r3. The intrinsic is called with the
+; overload suffix that matches its <4 x i8> operand type (.v4i8).
+define <4 x i8> @ct_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) {
+; CT-LABEL: ct_v4i8:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vdup.16 d16, r0
+; CT-NEXT:    vmov d18, r2, r3
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    vshl.i16 d16, d16, #15
+; CT-NEXT:    vshr.s16 d16, d16, #15
+; CT-NEXT:    vbsl d16, d18, d17
+; CT-NEXT:    vmov r0, r1, d16
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v4i8:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and lr, r0, #1
+; DEFAULT-NEXT:    ldrb r0, [sp, #12]
+; DEFAULT-NEXT:    sub r12, lr, #1
+; DEFAULT-NEXT:    rsb lr, lr, #0
+; DEFAULT-NEXT:    and r0, r0, r12
+; DEFAULT-NEXT:    and r1, r1, lr
+; DEFAULT-NEXT:    orr r0, r1, r0
+; DEFAULT-NEXT:    ldrb r1, [sp, #16]
+; DEFAULT-NEXT:    and r2, r2, lr
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    ldrb r2, [sp, #20]
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r2, r3, r2
+; DEFAULT-NEXT:    ldrb r3, [sp, #24]
+; DEFAULT-NEXT:    and r12, r3, r12
+; DEFAULT-NEXT:    ldrb r3, [sp, #8]
+; DEFAULT-NEXT:    and r3, r3, lr
+; DEFAULT-NEXT:    orr r3, r3, r12
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v4i8:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r6, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    movs r5, #1
+; THUMB1-NEXT:    ands r5, r0
+; THUMB1-NEXT:    subs r4, r5, #1
+; THUMB1-NEXT:    ldr r0, [sp, #20]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r5, r5, #0
+; THUMB1-NEXT:    ands r1, r5
+; THUMB1-NEXT:    orrs r0, r1
+; THUMB1-NEXT:    ldr r1, [sp, #24]
+; THUMB1-NEXT:    ands r1, r4
+; THUMB1-NEXT:    ands r2, r5
+; THUMB1-NEXT:    orrs r1, r2
+; THUMB1-NEXT:    ldr r2, [sp, #28]
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r2, r3
+; THUMB1-NEXT:    ldr r6, [sp, #32]
+; THUMB1-NEXT:    ands r6, r4
+; THUMB1-NEXT:    ldr r3, [sp, #16]
+; THUMB1-NEXT:    ands r3, r5
+; THUMB1-NEXT:    orrs r3, r6
+; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+;
+; THUMB2-LABEL: ct_v4i8:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and lr, r0, #1
+; THUMB2-NEXT:    ldrb.w r0, [sp, #12]
+; THUMB2-NEXT:    sub.w r12, lr, #1
+; THUMB2-NEXT:    rsb.w lr, lr, #0
+; THUMB2-NEXT:    and.w r0, r0, r12
+; THUMB2-NEXT:    and.w r1, r1, lr
+; THUMB2-NEXT:    orrs r0, r1
+; THUMB2-NEXT:    ldrb.w r1, [sp, #16]
+; THUMB2-NEXT:    and.w r2, r2, lr
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldrb.w r2, [sp, #20]
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r2, r3
+; THUMB2-NEXT:    ldrb.w r3, [sp, #24]
+; THUMB2-NEXT:    and.w r12, r12, r3
+; THUMB2-NEXT:    ldrb.w r3, [sp, #8]
+; THUMB2-NEXT:    and.w r3, r3, lr
+; THUMB2-NEXT:    orr.w r3, r3, r12
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <4 x i8> @llvm.ct.select.v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b)
+  ret <4 x i8> %sel
+}
+
+define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) {
+; CT-LABEL: ct_v1i16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    sub r3, r0, #1
+; CT-NEXT:    rsb r0, r0, #0
+; CT-NEXT:    and r2, r2, r3
+; CT-NEXT:    and r0, r1, r0
+; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v1i16:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    sub r3, r0, #1
+; DEFAULT-NEXT:    rsb r0, r0, #0
+; DEFAULT-NEXT:    and r2, r2, r3
+; DEFAULT-NEXT:    and r0, r1, r0
+; DEFAULT-NEXT:    orr r0, r0, r2
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_v1i16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    subs r4, r3, #1
+; THUMB1-NEXT:    ands r4, r2
+; THUMB1-NEXT:    rsbs r0, r3, #0
+; THUMB1-NEXT:    ands r0, r1
+; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v1i16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    subs r3, r0, #1
+; THUMB2-NEXT:    rsbs r0, r0, #0
+; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    ands r0, r1
+; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    bx lr
+entry:
+  %sel = call <1 x i16> @llvm.ct.select.i16(i1 %cond, <1 x i16> %a, <1 x i16> %b)
+  ret <1 x i16> %sel
+}
+
+define <2 x i16> @ct_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) {
+; CT-LABEL: ct_v2i16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vdup.32 d16, r0
+; CT-NEXT:    vmov d18, r2, r3
+; CT-NEXT:    vldr d17, [sp]
+; CT-NEXT:    vshl.i32 d16, d16, #31
+; CT-NEXT:    vshr.s32 d16, d16, #31
+; CT-NEXT:    vbsl d16, d18, d17
+; CT-NEXT:    vmov r0, r1, d16
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v2i16:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    and lr, r3, r12
+; DEFAULT-NEXT:    rsb r3, r0, #0
+; DEFAULT-NEXT:    and r0, r1, r3
+; DEFAULT-NEXT:    ldrh r1, [sp, #8]
+; DEFAULT-NEXT:    and r2, r2, r3
+; DEFAULT-NEXT:    orr r0, r0, lr
+; DEFAULT-NEXT:    and r1, r1, r12
+; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_v2i16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    mov r4, r1
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r0, r1, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    rsbs r1, r1, #0
+; THUMB1-NEXT:    ands r4, r1
+; THUMB1-NEXT:    orrs r4, r3
+; THUMB1-NEXT:    ldr r3, [sp, #8]
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    ands r1, r2
+; THUMB1-NEXT:    orrs r1, r3
+; THUMB1-NEXT:    mov r0, r4
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v2i16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    and.w lr, r3, r12
+; THUMB2-NEXT:    rsbs r3, r0, #0
+; THUMB2-NEXT:    and.w r0, r1, r3
+; THUMB2-NEXT:    ldrh.w r1, [sp, #8]
+; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    and.w r1, r1, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    pop {r7, pc}
+entry:
+  %sel = call <2 x i16> @llvm.ct.select.i32(i1 %cond, <2 x i16> %a, <2 x i16> %b)
+  ret <2 x i16> %sel
+}
+
+define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) {
+; CT-LABEL: ct_v1i32:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    sub r3, r0, #1
+; CT-NEXT:    rsb r0, r0, #0
+; CT-NEXT:    and r2, r2, r3
+; CT-NEXT:    and r0, r1, r0
+; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v1i32:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    sub r3, r0, #1
+; DEFAULT-NEXT:    rsb r0, r0, #0
+; DEFAULT-NEXT:    and r2, r2, r3
+; DEFAULT-NEXT:    and r0, r1, r0
+; DEFAULT-NEXT:    orr r0, r0, r2
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_v1i32:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    subs r4, r3, #1
+; THUMB1-NEXT:    ands r4, r2
+; THUMB1-NEXT:    rsbs r0, r3, #0
+; THUMB1-NEXT:    ands r0, r1
+; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v1i32:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    subs r3, r0, #1
+; THUMB2-NEXT:    rsbs r0, r0, #0
+; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    ands r0, r1
+; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    bx lr
+entry:
+  %sel = call <1 x i32> @llvm.ct.select.i32(i1 %cond, <1 x i32> %a, <1 x i32> %b)
+  ret <1 x i32> %sel
+}
+
+define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) {
+; CT-LABEL: ct_v1f32:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    sub r3, r0, #1
+; CT-NEXT:    rsb r0, r0, #0
+; CT-NEXT:    and r2, r2, r3
+; CT-NEXT:    and r0, r1, r0
+; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_v1f32:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    sub r3, r0, #1
+; DEFAULT-NEXT:    rsb r0, r0, #0
+; DEFAULT-NEXT:    and r2, r2, r3
+; DEFAULT-NEXT:    and r0, r1, r0
+; DEFAULT-NEXT:    orr r0, r0, r2
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_v1f32:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    subs r4, r3, #1
+; THUMB1-NEXT:    ands r4, r2
+; THUMB1-NEXT:    rsbs r0, r3, #0
+; THUMB1-NEXT:    ands r0, r1
+; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_v1f32:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    subs r3, r0, #1
+; THUMB2-NEXT:    rsbs r0, r0, #0
+; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    ands r0, r1
+; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    bx lr
+entry:
+  %sel = call <1 x float> @llvm.ct.select.f32(i1 %cond, <1 x float> %a, <1 x float> %b)
+  ret <1 x float> %sel
+}
diff --git a/llvm/test/CodeGen/ARM/ctselect.ll b/llvm/test/CodeGen/ARM/ctselect.ll
new file mode 100644
index 0000000000000..055e8733cb65c
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/ctselect.ll
@@ -0,0 +1,549 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -verify-machineinstrs | FileCheck --check-prefixes=CT %s
+; RUN: llc < %s -mtriple=armv6 -verify-machineinstrs | FileCheck --check-prefix=DEFAULT %s
+; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB1 %s
+; RUN: llc < %s -mtriple=thumbv7m-none-eabi -verify-machineinstrs | FileCheck --check-prefix=THUMB2 %s
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabihf -mcpu=cortex-a9 -verify-machineinstrs | FileCheck --check-prefix=CORTEXA9 %s
+; RUN: llc < %s -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 -verify-machineinstrs | FileCheck --check-prefix=CORTEX-NOTHUMB %s
+
+define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) {
+; CT-LABEL: ct_i1:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    rsb r12, r3, #0
+; CT-NEXT:    and r0, r1, r12
+; CT-NEXT:    bic r12, r2, r12
+; CT-NEXT:    orr r0, r0, r12
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_i1:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    rsb r12, r3, #0
+; DEFAULT-NEXT:    and r0, r1, r12
+; DEFAULT-NEXT:    bic r12, r2, r12
+; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_i1:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    mov r4, r3
+; THUMB1-NEXT:    lsls r4, r4, #31
+; THUMB1-NEXT:    asrs r4, r4, #31
+; THUMB1-NEXT:    mov r0, r1
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_i1:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
+; THUMB2-NEXT:    bx lr
+;
+; CORTEXA9-LABEL: ct_i1:
+; CORTEXA9:       @ %bb.0: @ %entry
+; CORTEXA9-NEXT:    and r3, r0, #1
+; CORTEXA9-NEXT:    rsb.w r12, r3, #0
+; CORTEXA9-NEXT:    and.w r0, r1, r12
+; CORTEXA9-NEXT:    bic.w r12, r2, r12
+; CORTEXA9-NEXT:    orr.w r0, r0, r12
+; CORTEXA9-NEXT:    bx lr
+;
+; CORTEX-NOTHUMB-LABEL: ct_i1:
+; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT:    and r3, r0, #1
+; CORTEX-NOTHUMB-NEXT:    rsb r12, r3, #0
+; CORTEX-NOTHUMB-NEXT:    and r0, r1, r12
+; CORTEX-NOTHUMB-NEXT:    bic r12, r2, r12
+; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r12
+; CORTEX-NOTHUMB-NEXT:    bx lr
+entry:
+  %sel = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+  ret i1 %sel
+}
+
+define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) {
+; CT-LABEL: ct_int8:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    rsb r12, r3, #0
+; CT-NEXT:    and r0, r1, r12
+; CT-NEXT:    bic r12, r2, r12
+; CT-NEXT:    orr r0, r0, r12
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_int8:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    rsb r12, r3, #0
+; DEFAULT-NEXT:    and r0, r1, r12
+; DEFAULT-NEXT:    bic r12, r2, r12
+; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_int8:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    mov r4, r3
+; THUMB1-NEXT:    lsls r4, r4, #31
+; THUMB1-NEXT:    asrs r4, r4, #31
+; THUMB1-NEXT:    mov r0, r1
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_int8:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
+; THUMB2-NEXT:    bx lr
+;
+; CORTEXA9-LABEL: ct_int8:
+; CORTEXA9:       @ %bb.0: @ %entry
+; CORTEXA9-NEXT:    and r3, r0, #1
+; CORTEXA9-NEXT:    rsb.w r12, r3, #0
+; CORTEXA9-NEXT:    and.w r0, r1, r12
+; CORTEXA9-NEXT:    bic.w r12, r2, r12
+; CORTEXA9-NEXT:    orr.w r0, r0, r12
+; CORTEXA9-NEXT:    bx lr
+;
+; CORTEX-NOTHUMB-LABEL: ct_int8:
+; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT:    and r3, r0, #1
+; CORTEX-NOTHUMB-NEXT:    rsb r12, r3, #0
+; CORTEX-NOTHUMB-NEXT:    and r0, r1, r12
+; CORTEX-NOTHUMB-NEXT:    bic r12, r2, r12
+; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r12
+; CORTEX-NOTHUMB-NEXT:    bx lr
+entry:
+  %sel = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+  ret i8 %sel
+}
+
+define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) {
+; CT-LABEL: ct_int16:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    rsb r12, r3, #0
+; CT-NEXT:    and r0, r1, r12
+; CT-NEXT:    bic r12, r2, r12
+; CT-NEXT:    orr r0, r0, r12
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_int16:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    rsb r12, r3, #0
+; DEFAULT-NEXT:    and r0, r1, r12
+; DEFAULT-NEXT:    bic r12, r2, r12
+; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_int16:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    mov r4, r3
+; THUMB1-NEXT:    lsls r4, r4, #31
+; THUMB1-NEXT:    asrs r4, r4, #31
+; THUMB1-NEXT:    mov r0, r1
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_int16:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
+; THUMB2-NEXT:    bx lr
+;
+; CORTEXA9-LABEL: ct_int16:
+; CORTEXA9:       @ %bb.0: @ %entry
+; CORTEXA9-NEXT:    and r3, r0, #1
+; CORTEXA9-NEXT:    rsb.w r12, r3, #0
+; CORTEXA9-NEXT:    and.w r0, r1, r12
+; CORTEXA9-NEXT:    bic.w r12, r2, r12
+; CORTEXA9-NEXT:    orr.w r0, r0, r12
+; CORTEXA9-NEXT:    bx lr
+;
+; CORTEX-NOTHUMB-LABEL: ct_int16:
+; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT:    and r3, r0, #1
+; CORTEX-NOTHUMB-NEXT:    rsb r12, r3, #0
+; CORTEX-NOTHUMB-NEXT:    and r0, r1, r12
+; CORTEX-NOTHUMB-NEXT:    bic r12, r2, r12
+; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r12
+; CORTEX-NOTHUMB-NEXT:    bx lr
+entry:
+  %sel = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+  ret i16 %sel
+}
+
+define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) {
+; CT-LABEL: ct_int32:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    rsb r12, r3, #0
+; CT-NEXT:    and r0, r1, r12
+; CT-NEXT:    bic r12, r2, r12
+; CT-NEXT:    orr r0, r0, r12
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_int32:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    rsb r12, r3, #0
+; DEFAULT-NEXT:    and r0, r1, r12
+; DEFAULT-NEXT:    bic r12, r2, r12
+; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_int32:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    mov r4, r3
+; THUMB1-NEXT:    lsls r4, r4, #31
+; THUMB1-NEXT:    asrs r4, r4, #31
+; THUMB1-NEXT:    mov r0, r1
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_int32:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
+; THUMB2-NEXT:    bx lr
+;
+; CORTEXA9-LABEL: ct_int32:
+; CORTEXA9:       @ %bb.0: @ %entry
+; CORTEXA9-NEXT:    and r3, r0, #1
+; CORTEXA9-NEXT:    rsb.w r12, r3, #0
+; CORTEXA9-NEXT:    and.w r0, r1, r12
+; CORTEXA9-NEXT:    bic.w r12, r2, r12
+; CORTEXA9-NEXT:    orr.w r0, r0, r12
+; CORTEXA9-NEXT:    bx lr
+;
+; CORTEX-NOTHUMB-LABEL: ct_int32:
+; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT:    and r3, r0, #1
+; CORTEX-NOTHUMB-NEXT:    rsb r12, r3, #0
+; CORTEX-NOTHUMB-NEXT:    and r0, r1, r12
+; CORTEX-NOTHUMB-NEXT:    bic r12, r2, r12
+; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r12
+; CORTEX-NOTHUMB-NEXT:    bx lr
+entry:
+  %sel = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %sel
+}
+
+define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) {
+; CT-LABEL: ct_int64:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    .save {r4, lr}
+; CT-NEXT:    push {r4, lr}
+; CT-NEXT:    ldr r1, [sp, #8]
+; CT-NEXT:    and lr, r0, #1
+; CT-NEXT:    ldr r12, [sp, #12]
+; CT-NEXT:    rsb r4, lr, #0
+; CT-NEXT:    and r0, r2, r4
+; CT-NEXT:    bic r4, r1, r4
+; CT-NEXT:    orr r0, r0, r4
+; CT-NEXT:    rsb r2, lr, #0
+; CT-NEXT:    and r1, r3, r2
+; CT-NEXT:    bic r2, r12, r2
+; CT-NEXT:    orr r1, r1, r2
+; CT-NEXT:    pop {r4, pc}
+;
+; DEFAULT-LABEL: ct_int64:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #8]
+; DEFAULT-NEXT:    rsb lr, r12, #0
+; DEFAULT-NEXT:    and r0, r2, lr
+; DEFAULT-NEXT:    bic lr, r1, lr
+; DEFAULT-NEXT:    orr r0, r0, lr
+; DEFAULT-NEXT:    ldr r2, [sp, #12]
+; DEFAULT-NEXT:    rsb lr, r12, #0
+; DEFAULT-NEXT:    and r1, r3, lr
+; DEFAULT-NEXT:    bic lr, r2, lr
+; DEFAULT-NEXT:    orr r1, r1, lr
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_int64:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, r5, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r7, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #16]
+; THUMB1-NEXT:    mov r5, r4
+; THUMB1-NEXT:    lsls r5, r5, #31
+; THUMB1-NEXT:    asrs r5, r5, #31
+; THUMB1-NEXT:    mov r0, r2
+; THUMB1-NEXT:    eors r0, r1
+; THUMB1-NEXT:    ands r0, r5
+; THUMB1-NEXT:    eors r0, r1
+; THUMB1-NEXT:    ldr r2, [sp, #20]
+; THUMB1-NEXT:    mov r5, r4
+; THUMB1-NEXT:    lsls r5, r5, #31
+; THUMB1-NEXT:    asrs r5, r5, #31
+; THUMB1-NEXT:    mov r1, r3
+; THUMB1-NEXT:    eors r1, r2
+; THUMB1-NEXT:    ands r1, r5
+; THUMB1-NEXT:    eors r1, r2
+; THUMB1-NEXT:    pop {r4, r5, r7, pc}
+;
+; THUMB2-LABEL: ct_int64:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #8]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    pop {r7, pc}
+;
+; CORTEXA9-LABEL: ct_int64:
+; CORTEXA9:       @ %bb.0: @ %entry
+; CORTEXA9-NEXT:    .save {r4, lr}
+; CORTEXA9-NEXT:    push {r4, lr}
+; CORTEXA9-NEXT:    ldrd r1, r12, [sp, #8]
+; CORTEXA9-NEXT:    and lr, r0, #1
+; CORTEXA9-NEXT:    rsb.w r4, lr, #0
+; CORTEXA9-NEXT:    and.w r0, r2, r4
+; CORTEXA9-NEXT:    bic.w r4, r1, r4
+; CORTEXA9-NEXT:    orrs r0, r4
+; CORTEXA9-NEXT:    rsb.w r2, lr, #0
+; CORTEXA9-NEXT:    and.w r1, r3, r2
+; CORTEXA9-NEXT:    bic.w r2, r12, r2
+; CORTEXA9-NEXT:    orr.w r1, r1, r2
+; CORTEXA9-NEXT:    pop {r4, pc}
+;
+; CORTEX-NOTHUMB-LABEL: ct_int64:
+; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT:    .save {r4, lr}
+; CORTEX-NOTHUMB-NEXT:    push {r4, lr}
+; CORTEX-NOTHUMB-NEXT:    ldr r12, [sp, #12]
+; CORTEX-NOTHUMB-NEXT:    and lr, r0, #1
+; CORTEX-NOTHUMB-NEXT:    ldr r1, [sp, #8]
+; CORTEX-NOTHUMB-NEXT:    rsb r4, lr, #0
+; CORTEX-NOTHUMB-NEXT:    and r0, r2, r4
+; CORTEX-NOTHUMB-NEXT:    bic r4, r1, r4
+; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r4
+; CORTEX-NOTHUMB-NEXT:    rsb r2, lr, #0
+; CORTEX-NOTHUMB-NEXT:    and r1, r3, r2
+; CORTEX-NOTHUMB-NEXT:    bic r2, r12, r2
+; CORTEX-NOTHUMB-NEXT:    orr r1, r1, r2
+; CORTEX-NOTHUMB-NEXT:    pop {r4, pc}
+entry:
+  %sel = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
+  ret i64 %sel
+}
+
+define float @ct_float(i1 %cond, float %a, float %b) {
+; CT-LABEL: ct_float:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    vmov s0, r2
+; CT-NEXT:    vmov s2, r1
+; CT-NEXT:    vmov r2, s2
+; CT-NEXT:    vmov r3, s0
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    and r2, r2, r1
+; CT-NEXT:    bic r1, r3, r1
+; CT-NEXT:    orr r2, r2, r1
+; CT-NEXT:    vmov s4, r2
+; CT-NEXT:    vmov r0, s4
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_float:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    rsb r12, r3, #0
+; DEFAULT-NEXT:    and r0, r1, r12
+; DEFAULT-NEXT:    bic r12, r2, r12
+; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    bx lr
+;
+; THUMB1-LABEL: ct_float:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r3, #1
+; THUMB1-NEXT:    ands r3, r0
+; THUMB1-NEXT:    mov r4, r3
+; THUMB1-NEXT:    lsls r4, r4, #31
+; THUMB1-NEXT:    asrs r4, r4, #31
+; THUMB1-NEXT:    mov r0, r1
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_float:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
+; THUMB2-NEXT:    bx lr
+;
+; CORTEXA9-LABEL: ct_float:
+; CORTEXA9:       @ %bb.0: @ %entry
+; CORTEXA9-NEXT:    and r0, r0, #1
+; CORTEXA9-NEXT:    vmov r2, s0
+; CORTEXA9-NEXT:    vmov r3, s1
+; CORTEXA9-NEXT:    rsbs r1, r0, #0
+; CORTEXA9-NEXT:    ands r2, r1
+; CORTEXA9-NEXT:    bic.w r1, r3, r1
+; CORTEXA9-NEXT:    orrs r2, r1
+; CORTEXA9-NEXT:    vmov s2, r2
+; CORTEXA9-NEXT:    vmov.f32 s0, s2
+; CORTEXA9-NEXT:    bx lr
+;
+; CORTEX-NOTHUMB-LABEL: ct_float:
+; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT:    and r0, r0, #1
+; CORTEX-NOTHUMB-NEXT:    vmov r2, s0
+; CORTEX-NOTHUMB-NEXT:    vmov r3, s1
+; CORTEX-NOTHUMB-NEXT:    rsb r1, r0, #0
+; CORTEX-NOTHUMB-NEXT:    and r2, r2, r1
+; CORTEX-NOTHUMB-NEXT:    bic r1, r3, r1
+; CORTEX-NOTHUMB-NEXT:    orr r2, r2, r1
+; CORTEX-NOTHUMB-NEXT:    vmov s2, r2
+; CORTEX-NOTHUMB-NEXT:    vmov.f32 s0, s2
+; CORTEX-NOTHUMB-NEXT:    bx lr
+entry:
+  %sel = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+  ret float %sel
+}
+
+define double @ct_f64(i1 %cond, double %a, double %b) {
+; CT-LABEL: ct_f64:
+; CT:       @ %bb.0: @ %entry
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    rsb r1, r0, #0
+; CT-NEXT:    vdup.32 d19, r1
+; CT-NEXT:    vand d18, d17, d19
+; CT-NEXT:    vbic d19, d16, d19
+; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    vmov r0, r1, d18
+; CT-NEXT:    bx lr
+;
+; DEFAULT-LABEL: ct_f64:
+; DEFAULT:       @ %bb.0: @ %entry
+; DEFAULT-NEXT:    push {r11, lr}
+; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #8]
+; DEFAULT-NEXT:    sub r12, r0, #1
+; DEFAULT-NEXT:    and lr, r1, r12
+; DEFAULT-NEXT:    rsb r1, r0, #0
+; DEFAULT-NEXT:    and r0, r2, r1
+; DEFAULT-NEXT:    ldr r2, [sp, #12]
+; DEFAULT-NEXT:    and r1, r3, r1
+; DEFAULT-NEXT:    orr r0, r0, lr
+; DEFAULT-NEXT:    and r2, r2, r12
+; DEFAULT-NEXT:    orr r1, r1, r2
+; DEFAULT-NEXT:    pop {r11, pc}
+;
+; THUMB1-LABEL: ct_f64:
+; THUMB1:       @ %bb.0: @ %entry
+; THUMB1-NEXT:    .save {r4, lr}
+; THUMB1-NEXT:    push {r4, lr}
+; THUMB1-NEXT:    movs r1, #1
+; THUMB1-NEXT:    ands r1, r0
+; THUMB1-NEXT:    subs r4, r1, #1
+; THUMB1-NEXT:    ldr r0, [sp, #8]
+; THUMB1-NEXT:    ands r0, r4
+; THUMB1-NEXT:    rsbs r1, r1, #0
+; THUMB1-NEXT:    ands r2, r1
+; THUMB1-NEXT:    orrs r0, r2
+; THUMB1-NEXT:    ldr r2, [sp, #12]
+; THUMB1-NEXT:    ands r2, r4
+; THUMB1-NEXT:    ands r1, r3
+; THUMB1-NEXT:    orrs r1, r2
+; THUMB1-NEXT:    pop {r4, pc}
+;
+; THUMB2-LABEL: ct_f64:
+; THUMB2:       @ %bb.0: @ %entry
+; THUMB2-NEXT:    .save {r7, lr}
+; THUMB2-NEXT:    push {r7, lr}
+; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #8]
+; THUMB2-NEXT:    sub.w r12, r0, #1
+; THUMB2-NEXT:    and.w lr, r1, r12
+; THUMB2-NEXT:    rsbs r1, r0, #0
+; THUMB2-NEXT:    and.w r0, r2, r1
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    ands r1, r3
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    and.w r2, r2, r12
+; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    pop {r7, pc}
+;
+; CORTEXA9-LABEL: ct_f64:
+; CORTEXA9:       @ %bb.0: @ %entry
+; CORTEXA9-NEXT:    and r0, r0, #1
+; CORTEXA9-NEXT:    rsbs r1, r0, #0
+; CORTEXA9-NEXT:    vdup.32 d17, r1
+; CORTEXA9-NEXT:    vand d16, d0, d17
+; CORTEXA9-NEXT:    vbic d17, d1, d17
+; CORTEXA9-NEXT:    vorr d16, d16, d17
+; CORTEXA9-NEXT:    vmov.f64 d0, d16
+; CORTEXA9-NEXT:    bx lr
+;
+; CORTEX-NOTHUMB-LABEL: ct_f64:
+; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
+; CORTEX-NOTHUMB-NEXT:    and r0, r0, #1
+; CORTEX-NOTHUMB-NEXT:    rsb r1, r0, #0
+; CORTEX-NOTHUMB-NEXT:    vdup.32 d17, r1
+; CORTEX-NOTHUMB-NEXT:    vand d16, d0, d17
+; CORTEX-NOTHUMB-NEXT:    vbic d17, d1, d17
+; CORTEX-NOTHUMB-NEXT:    vorr d16, d16, d17
+; CORTEX-NOTHUMB-NEXT:    vmov.f64 d0, d16
+; CORTEX-NOTHUMB-NEXT:    bx lr
+entry:
+  %sel = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+  ret double %sel
+}

>From d8bee539c4046bdc086d54b18bbb8a8b44d06f73 Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Fri, 22 May 2026 12:00:21 -0400
Subject: [PATCH 3/4] [ARM] Restore STRICT FP setup, byval pre-load fix, bundle
 predicate

A prior rebase conflict resolution accidentally dropped several
unrelated pieces of upstream ARM code:

- ARMISelLowering: the STRICT_FP_ROUND/FMINNUM/FMAXNUM/FP_EXTEND
  setOperationAction calls (from d08b0f7240aa "Disable strict node
  mutation"), the unconditional STRICT_FP_TO_SINT/UINT i32 setup,
  the Expand action for STRICT_FP16_TO_FP/STRICT_FP_TO_FP16 (which
  had been changed to LibCall), the byval pre-load fix in LowerCall
  (from a01a921004c1), and the IsStrictFPEnabled = true line at the
  end of the constructor.
- ARMTargetMachine: the Thumb2/KCFI predicate passed to
  createUnpackMachineBundlesLegacy, which had been broadened to
  nullptr (unconditional).

These changes caused CHECK-line mismatches in fp16-fullfp16.ll,
fp-intrinsics-vector.ll, and byval_struct_copy_tailcall.ll. None of
the affected tests use llvm.ct.select.

Restores the upstream code verbatim; ct.select work is unaffected.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp  | 83 +++++++++++++++++++-----
 llvm/lib/Target/ARM/ARMISelLowering.h    |  1 -
 llvm/lib/Target/ARM/ARMTargetMachine.cpp |  8 ++-
 3 files changed, 73 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 3dde0f3188979..376836de8007c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -641,16 +641,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                       ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
         setOperationAction(Op, MVT::f64, Legal);
+
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
     }
   }
 
   if (Subtarget->hasFullFP16()) {
+    for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
+                    ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
+      setOperationAction(Op, MVT::f16, Legal);
+
     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
 
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+    setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Legal);
+    setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Legal);
   }
 
   if (Subtarget->hasBF16()) {
@@ -960,13 +968,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND,   MVT::f32, Custom);
   }
 
+  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+
   if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
@@ -974,11 +983,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       setOperationAction(ISD::FP_ROUND,  MVT::f16, Custom);
       setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
     }
+  } else {
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
   }
 
   if (!Subtarget->hasFP16()) {
     setOperationAction(ISD::FP_EXTEND,  MVT::f32, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+  } else {
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal);
   }
 
   computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -1327,8 +1341,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
-  setOperationAction(ISD::FSINCOS,   MVT::f64, Custom);
-  setOperationAction(ISD::FSINCOS,   MVT::f32, Custom);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   setOperationAction(ISD::FREM, MVT::f64, LibCall);
   setOperationAction(ISD::FREM, MVT::f32, LibCall);
   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
@@ -1350,16 +1364,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
-      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall);
-      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall);
+      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, Expand);
+      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, Expand);
     }
 
     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
     if (!Subtarget->hasFP16()) {
       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
-      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall);
-      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall);
+      setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, Expand);
+      setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, Expand);
     }
 
     // Strict floating-point comparisons need custom lowering.
@@ -1548,6 +1562,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
       Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
 
   setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
+
+  IsStrictFPEnabled = true;
 }
 
 bool ARMTargetLowering::useSoftFloat() const {
@@ -2342,9 +2358,44 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
     if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
       Chain = DAG.getStackArgumentTokenFactor(Chain);
-      if (ByValTempChain)
+      if (ByValTempChain) {
+        // In case of large byval copies, re-using the stackframe for tail-calls
+        // can lead to overwriting incoming arguments on the stack. Force
+        // loading these stack arguments before the copy to avoid that.
+        SmallVector<SDValue, 8> IncomingLoad;
+        for (unsigned I = 0; I < OutVals.size(); ++I) {
+          if (Outs[I].Flags.isByVal())
+            continue;
+
+          SDValue OutVal = OutVals[I];
+          LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
+          if (!OutLN)
+            continue;
+
+          FrameIndexSDNode *FIN =
+              dyn_cast_or_null<FrameIndexSDNode>(OutLN->getBasePtr());
+          if (!FIN)
+            continue;
+
+          if (!MFI.isFixedObjectIndex(FIN->getIndex()))
+            continue;
+
+          for (const CCValAssign &VA : ArgLocs) {
+            if (VA.isMemLoc())
+              IncomingLoad.push_back(OutVal.getValue(1));
+          }
+        }
+
+        // Update the chain to force loads for potentially clobbered argument
+        // loads to happen before the byval copy.
+        if (!IncomingLoad.empty()) {
+          IncomingLoad.push_back(Chain);
+          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
+        }
+
         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
                             ByValTempChain);
+      }
       AfterFormalArgLoads = true;
     }
 
@@ -9811,8 +9862,8 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
       DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo());
 
   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
-  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
-                     LoadSin.getValue(0), LoadCos.getValue(0));
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0),
+                     LoadCos.getValue(0));
 }
 
 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
@@ -10732,8 +10783,10 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VECREDUCE_SMAX:
     return LowerVecReduceMinMax(Op, DAG, Subtarget);
   case ISD::ATOMIC_LOAD:
-  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
-  case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
+  case ISD::ATOMIC_STORE:
+    return LowerAtomicLoadStore(Op, DAG);
+  case ISD::FSINCOS:
+    return LowerFSINCOS(Op, DAG);
   case ISD::SDIVREM:
   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
@@ -10889,8 +10942,8 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
       SDValue CondNorm = DAG.getNode(ISD::AND, DL, MVT::i32, Cond, One);
 
       // Create i32 ctselect that will go through normal lowering
-      Res =
-          DAG.getNode(ISD::CT_SELECT, DL, MVT::i32, CondNorm, TrueInt, FalseInt);
+      Res = DAG.getNode(ISD::CT_SELECT, DL, MVT::i32, CondNorm, TrueInt,
+                        FalseInt);
     } else {
       // For other types, use existing lowering
       Res = LowerCTSELECT(SDValue(N, 0), DAG);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 1ab3bdb2b16d1..52c76ccfe196a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -51,7 +51,6 @@ class TargetMachine;
 class TargetRegisterInfo;
 class VectorType;
 
-
   namespace ARM {
   /// Possible values of current rounding mode, which is specified in bits
   /// 23:22 of FPSCR.
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index a9b542d6110c1..0de6f3d16eff4 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -527,11 +527,13 @@ void ARMPassConfig::addPreSched2() {
 void ARMPassConfig::addPreEmitPass() {
   addPass(createThumb2SizeReductionPass());
 
-  // Always unpack bundles for:
+  // Unpack bundles for:
   // - Thumb2: Constant island pass requires unbundled instructions
   // - KCFI: KCFI_CHECK pseudo instructions need to be unbundled for AsmPrinter
-  // - CT_SELECT: Post-RA expansion creates bundles that must be unpacked
-  addPass(createUnpackMachineBundlesLegacy(nullptr));
+  addPass(createUnpackMachineBundlesLegacy([](const MachineFunction &MF) {
+    return MF.getSubtarget<ARMSubtarget>().isThumb2() ||
+           MF.getFunction().getParent()->getModuleFlag("kcfi");
+  }));
 
   // Don't optimize barriers or block placement at -O0.
   if (getOptLevel() != CodeGenOptLevel::None) {

>From 2fa51827e3205bc85db6e170ba373a5398ba3ad9 Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Fri, 22 May 2026 12:12:54 -0400
Subject: [PATCH 4/4] [ARM] Regen ctselect tests for new core legalization

The core PR moved CT_SELECT lowering into the generic legalizer
(memory-blend for FP types without a legal same-size integer,
scalar-mask+splat for vectors). ARM tests' CHECK lines need to
reflect the new codegen.

No functional change in ARM target lowering itself; only the test
expectations are updated to match the generated output.
---
 llvm/test/CodeGen/ARM/ctselect-half.ll   | 1120 +++++------
 llvm/test/CodeGen/ARM/ctselect-vector.ll | 2259 +++++++++-------------
 llvm/test/CodeGen/ARM/ctselect.ll        |  233 +--
 3 files changed, 1503 insertions(+), 2109 deletions(-)

diff --git a/llvm/test/CodeGen/ARM/ctselect-half.ll b/llvm/test/CodeGen/ARM/ctselect-half.ll
index bfc60f1fad42d..f83a95e62f07f 100644
--- a/llvm/test/CodeGen/ARM/ctselect-half.ll
+++ b/llvm/test/CodeGen/ARM/ctselect-half.ll
@@ -8,32 +8,20 @@
 define half @ct_half(i1 %cond, half %a, half %b) {
 ; CT-LABEL: ct_half:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    sub r3, r0, #1
-; CT-NEXT:    rsb r0, r0, #0
-; CT-NEXT:    and r2, r2, r3
-; CT-NEXT:    and r0, r1, r0
-; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; BFLOAT-F16-NATIVE-LABEL: ct_half:
 ; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
-; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
-; BFLOAT-F16-NATIVE-NEXT:    sub r3, r0, #1
-; BFLOAT-F16-NATIVE-NEXT:    rsb r0, r0, #0
-; BFLOAT-F16-NATIVE-NEXT:    and r2, r2, r3
-; BFLOAT-F16-NATIVE-NEXT:    and r0, r1, r0
-; BFLOAT-F16-NATIVE-NEXT:    orr r0, r0, r2
+; BFLOAT-F16-NATIVE-NEXT:    and r3, r0, #1
+; BFLOAT-F16-NATIVE-NEXT:    BUNDLE
 ; BFLOAT-F16-NATIVE-NEXT:    bx lr
 ;
 ; F16-NATIVE-LABEL: ct_half:
 ; F16-NATIVE:       @ %bb.0: @ %entry
-; F16-NATIVE-NEXT:    and r0, r0, #1
-; F16-NATIVE-NEXT:    sub r3, r0, #1
-; F16-NATIVE-NEXT:    rsb r0, r0, #0
-; F16-NATIVE-NEXT:    and r2, r2, r3
-; F16-NATIVE-NEXT:    and r0, r1, r0
-; F16-NATIVE-NEXT:    orr r0, r0, r2
+; F16-NATIVE-NEXT:    and r3, r0, #1
+; F16-NATIVE-NEXT:    BUNDLE
 ; F16-NATIVE-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_half:
@@ -42,21 +30,16 @@ define half @ct_half(i1 %cond, half %a, half %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    subs r4, r3, #1
-; THUMB1-NEXT:    ands r4, r2
-; THUMB1-NEXT:    rsbs r0, r3, #0
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_half:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    subs r3, r0, #1
-; THUMB2-NEXT:    rsbs r0, r0, #0
-; THUMB2-NEXT:    ands r2, r3
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
 ; THUMB2-NEXT:    bx lr
 entry:
   %sel = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b)
@@ -66,12 +49,8 @@ entry:
 define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; CT-LABEL: ct_bf16:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    sub r3, r0, #1
-; CT-NEXT:    rsb r0, r0, #0
-; CT-NEXT:    and r2, r2, r3
-; CT-NEXT:    and r0, r1, r0
-; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; BFLOAT-F16-NATIVE-LABEL: ct_bf16:
@@ -79,10 +58,7 @@ define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; BFLOAT-F16-NATIVE-NEXT:    .pad #4
 ; BFLOAT-F16-NATIVE-NEXT:    sub sp, sp, #4
 ; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
-; BFLOAT-F16-NATIVE-NEXT:    rsb r12, r0, #0
-; BFLOAT-F16-NATIVE-NEXT:    and r3, r1, r12
-; BFLOAT-F16-NATIVE-NEXT:    bic r12, r2, r12
-; BFLOAT-F16-NATIVE-NEXT:    orr r3, r3, r12
+; BFLOAT-F16-NATIVE-NEXT:    BUNDLE
 ; BFLOAT-F16-NATIVE-NEXT:    strh r3, [sp, #2]
 ; BFLOAT-F16-NATIVE-NEXT:    ldrh r0, [sp, #2]
 ; BFLOAT-F16-NATIVE-NEXT:    add sp, sp, #4
@@ -90,12 +66,8 @@ define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ;
 ; F16-NATIVE-LABEL: ct_bf16:
 ; F16-NATIVE:       @ %bb.0: @ %entry
-; F16-NATIVE-NEXT:    and r0, r0, #1
-; F16-NATIVE-NEXT:    sub r3, r0, #1
-; F16-NATIVE-NEXT:    rsb r0, r0, #0
-; F16-NATIVE-NEXT:    and r2, r2, r3
-; F16-NATIVE-NEXT:    and r0, r1, r0
-; F16-NATIVE-NEXT:    orr r0, r0, r2
+; F16-NATIVE-NEXT:    and r3, r0, #1
+; F16-NATIVE-NEXT:    BUNDLE
 ; F16-NATIVE-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_bf16:
@@ -104,21 +76,16 @@ define bfloat @ct_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    subs r4, r3, #1
-; THUMB1-NEXT:    ands r4, r2
-; THUMB1-NEXT:    rsbs r0, r3, #0
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_bf16:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    subs r3, r0, #1
-; THUMB2-NEXT:    rsbs r0, r0, #0
-; THUMB2-NEXT:    ands r2, r3
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
 ; THUMB2-NEXT:    bx lr
 entry:
   %sel = call bfloat @llvm.ct.select.bf16(i1 %cond, bfloat %a, bfloat %b)
@@ -130,140 +97,123 @@ define <4 x half> @ct_v4f16(i1 %cond, <4 x half> %a, <4 x half> %b) {
 ; CT:       @ %bb.0: @ %entry
 ; CT-NEXT:    .save {r4, r5, r6, lr}
 ; CT-NEXT:    push {r4, r5, r6, lr}
-; CT-NEXT:    ldrh lr, [sp, #36]
-; CT-NEXT:    vdup.16 d16, r0
-; CT-NEXT:    ldrh r12, [sp, #28]
+; CT-NEXT:    ldrh r1, [sp, #20]
 ; CT-NEXT:    pkhbt r2, r2, r3, lsl #16
-; CT-NEXT:    ldrh r1, [sp, #32]
-; CT-NEXT:    vshl.i16 d16, d16, #15
+; CT-NEXT:    ldrh r12, [sp, #36]
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    ldrh lr, [sp, #28]
 ; CT-NEXT:    ldrh r6, [sp, #24]
-; CT-NEXT:    ldrh r4, [sp, #20]
-; CT-NEXT:    orr r0, r1, lr, lsl #16
-; CT-NEXT:    orr r1, r6, r12, lsl #16
-; CT-NEXT:    ldrh r5, [sp, #16]
-; CT-NEXT:    vshr.s16 d16, d16, #15
-; CT-NEXT:    vmov d17, r1, r0
-; CT-NEXT:    orr r0, r5, r4, lsl #16
-; CT-NEXT:    vmov d18, r2, r0
-; CT-NEXT:    veor d18, d18, d17
-; CT-NEXT:    vand d16, d18, d16
-; CT-NEXT:    veor d16, d17, d16
-; CT-NEXT:    vmov.u16 r0, d16[0]
-; CT-NEXT:    vmov.u16 r1, d16[1]
-; CT-NEXT:    vmov.u16 r2, d16[2]
-; CT-NEXT:    vmov.u16 r3, d16[3]
+; CT-NEXT:    ldrh r4, [sp, #16]
+; CT-NEXT:    ldrh r5, [sp, #32]
+; CT-NEXT:    orr r6, r6, lr, lsl #16
+; CT-NEXT:    orr r1, r4, r1, lsl #16
+; CT-NEXT:    orr r3, r5, r12, lsl #16
+; CT-NEXT:    vmov d17, r2, r1
+; CT-NEXT:    vmov d16, r6, r3
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    vmov.u16 r0, d18[0]
+; CT-NEXT:    vmov.u16 r1, d18[1]
+; CT-NEXT:    vmov.u16 r2, d18[2]
+; CT-NEXT:    vmov.u16 r3, d18[3]
 ; CT-NEXT:    pop {r4, r5, r6, pc}
 ;
 ; BFLOAT-F16-NATIVE-LABEL: ct_v4f16:
 ; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
 ; BFLOAT-F16-NATIVE-NEXT:    .save {r4, r5, r6, lr}
 ; BFLOAT-F16-NATIVE-NEXT:    push {r4, r5, r6, lr}
-; BFLOAT-F16-NATIVE-NEXT:    ldrh lr, [sp, #36]
-; BFLOAT-F16-NATIVE-NEXT:    vdup.16 d16, r0
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r12, [sp, #28]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r1, [sp, #20]
 ; BFLOAT-F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r1, [sp, #32]
-; BFLOAT-F16-NATIVE-NEXT:    vshl.i16 d16, d16, #15
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r12, [sp, #36]
+; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
+; BFLOAT-F16-NATIVE-NEXT:    ldrh lr, [sp, #28]
 ; BFLOAT-F16-NATIVE-NEXT:    ldrh r6, [sp, #24]
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r4, [sp, #20]
-; BFLOAT-F16-NATIVE-NEXT:    orr r0, r1, lr, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    orr r1, r6, r12, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #16]
-; BFLOAT-F16-NATIVE-NEXT:    vshr.s16 d16, d16, #15
-; BFLOAT-F16-NATIVE-NEXT:    vmov d17, r1, r0
-; BFLOAT-F16-NATIVE-NEXT:    orr r0, r5, r4, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    vmov d18, r2, r0
-; BFLOAT-F16-NATIVE-NEXT:    veor d18, d18, d17
-; BFLOAT-F16-NATIVE-NEXT:    vand d16, d18, d16
-; BFLOAT-F16-NATIVE-NEXT:    veor d16, d17, d16
-; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r0, d16[0]
-; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r1, d16[1]
-; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r2, d16[2]
-; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r3, d16[3]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r4, [sp, #16]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #32]
+; BFLOAT-F16-NATIVE-NEXT:    orr r6, r6, lr, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    orr r1, r4, r1, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    orr r3, r5, r12, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    vmov d17, r2, r1
+; BFLOAT-F16-NATIVE-NEXT:    vmov d16, r6, r3
+; BFLOAT-F16-NATIVE-NEXT:    BUNDLE
+; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r0, d18[0]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r1, d18[1]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r2, d18[2]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.u16 r3, d18[3]
 ; BFLOAT-F16-NATIVE-NEXT:    pop {r4, r5, r6, pc}
 ;
 ; F16-NATIVE-LABEL: ct_v4f16:
 ; F16-NATIVE:       @ %bb.0: @ %entry
 ; F16-NATIVE-NEXT:    .save {r4, r5, r6, lr}
 ; F16-NATIVE-NEXT:    push {r4, r5, r6, lr}
-; F16-NATIVE-NEXT:    ldrh lr, [sp, #36]
-; F16-NATIVE-NEXT:    vdup.16 d16, r0
-; F16-NATIVE-NEXT:    ldrh r12, [sp, #28]
+; F16-NATIVE-NEXT:    ldrh r1, [sp, #20]
 ; F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
-; F16-NATIVE-NEXT:    ldrh r1, [sp, #32]
-; F16-NATIVE-NEXT:    vshl.i16 d16, d16, #15
+; F16-NATIVE-NEXT:    ldrh r12, [sp, #36]
+; F16-NATIVE-NEXT:    and r0, r0, #1
+; F16-NATIVE-NEXT:    ldrh lr, [sp, #28]
 ; F16-NATIVE-NEXT:    ldrh r6, [sp, #24]
-; F16-NATIVE-NEXT:    ldrh r4, [sp, #20]
-; F16-NATIVE-NEXT:    orr r0, r1, lr, lsl #16
-; F16-NATIVE-NEXT:    orr r1, r6, r12, lsl #16
-; F16-NATIVE-NEXT:    ldrh r5, [sp, #16]
-; F16-NATIVE-NEXT:    vshr.s16 d16, d16, #15
-; F16-NATIVE-NEXT:    vmov d17, r1, r0
-; F16-NATIVE-NEXT:    orr r0, r5, r4, lsl #16
-; F16-NATIVE-NEXT:    vmov d18, r2, r0
-; F16-NATIVE-NEXT:    veor d18, d18, d17
-; F16-NATIVE-NEXT:    vand d16, d18, d16
-; F16-NATIVE-NEXT:    veor d16, d17, d16
-; F16-NATIVE-NEXT:    vmov.u16 r0, d16[0]
-; F16-NATIVE-NEXT:    vmov.u16 r1, d16[1]
-; F16-NATIVE-NEXT:    vmov.u16 r2, d16[2]
-; F16-NATIVE-NEXT:    vmov.u16 r3, d16[3]
+; F16-NATIVE-NEXT:    ldrh r4, [sp, #16]
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #32]
+; F16-NATIVE-NEXT:    orr r6, r6, lr, lsl #16
+; F16-NATIVE-NEXT:    orr r1, r4, r1, lsl #16
+; F16-NATIVE-NEXT:    orr r3, r5, r12, lsl #16
+; F16-NATIVE-NEXT:    vmov d17, r2, r1
+; F16-NATIVE-NEXT:    vmov d16, r6, r3
+; F16-NATIVE-NEXT:    BUNDLE
+; F16-NATIVE-NEXT:    vmov.u16 r0, d18[0]
+; F16-NATIVE-NEXT:    vmov.u16 r1, d18[1]
+; F16-NATIVE-NEXT:    vmov.u16 r2, d18[2]
+; F16-NATIVE-NEXT:    vmov.u16 r3, d18[3]
 ; F16-NATIVE-NEXT:    pop {r4, r5, r6, pc}
 ;
 ; THUMB1-LABEL: ct_v4f16:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #24]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r5, r1, #0
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r1, [sp, #28]
-; THUMB1-NEXT:    ands r1, r4
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    ldr r3, [sp, #32]
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    ldr r2, [sp, #16]
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    ldr r3, [sp, #20]
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r3, r6
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v4f16:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    .save {r7, lr}
-; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #16]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
 ; THUMB2-NEXT:    and.w r0, r2, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r0, r1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #20]
-; THUMB2-NEXT:    and.w r2, r3, lr
-; THUMB2-NEXT:    ldrh.w r3, [sp, #8]
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
-; THUMB2-NEXT:    ldrh.w r2, [sp, #24]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r2, r3
-; THUMB2-NEXT:    ldrh.w r3, [sp, #28]
-; THUMB2-NEXT:    and.w r12, r12, r3
-; THUMB2-NEXT:    ldrh.w r3, [sp, #12]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    orr.w r3, r3, r12
-; THUMB2-NEXT:    pop {r7, pc}
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    ldrh.w r2, [sp, #28]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldrh.w r3, [sp, #16]
+; THUMB2-NEXT:    ldrh.w lr, [sp, #32]
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r2, r3, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    orrs r2, r4
+; THUMB2-NEXT:    ldrh.w lr, [sp, #36]
+; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <4 x half> @llvm.ct.select.v4f16(i1 %cond, <4 x half> %a, <4 x half> %b)
   ret <4 x half> %sel
@@ -274,28 +224,24 @@ define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
 ; CT:       @ %bb.0: @ %entry
 ; CT-NEXT:    .save {r4, r5, r6, lr}
 ; CT-NEXT:    push {r4, r5, r6, lr}
-; CT-NEXT:    ldrh lr, [sp, #36]
-; CT-NEXT:    vdup.16 d16, r0
-; CT-NEXT:    ldrh r12, [sp, #28]
+; CT-NEXT:    ldrh r1, [sp, #20]
 ; CT-NEXT:    pkhbt r2, r2, r3, lsl #16
-; CT-NEXT:    ldrh r1, [sp, #32]
-; CT-NEXT:    vshl.i16 d16, d16, #15
+; CT-NEXT:    ldrh r12, [sp, #36]
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    ldrh lr, [sp, #28]
 ; CT-NEXT:    ldrh r6, [sp, #24]
-; CT-NEXT:    ldrh r4, [sp, #20]
-; CT-NEXT:    orr r0, r1, lr, lsl #16
-; CT-NEXT:    orr r1, r6, r12, lsl #16
-; CT-NEXT:    ldrh r5, [sp, #16]
-; CT-NEXT:    vshr.s16 d16, d16, #15
-; CT-NEXT:    vmov d17, r1, r0
-; CT-NEXT:    orr r0, r5, r4, lsl #16
-; CT-NEXT:    vmov d18, r2, r0
-; CT-NEXT:    veor d18, d18, d17
-; CT-NEXT:    vand d16, d18, d16
-; CT-NEXT:    veor d16, d17, d16
-; CT-NEXT:    vmov.u16 r0, d16[0]
-; CT-NEXT:    vmov.u16 r1, d16[1]
-; CT-NEXT:    vmov.u16 r2, d16[2]
-; CT-NEXT:    vmov.u16 r3, d16[3]
+; CT-NEXT:    ldrh r4, [sp, #16]
+; CT-NEXT:    ldrh r5, [sp, #32]
+; CT-NEXT:    orr r6, r6, lr, lsl #16
+; CT-NEXT:    orr r1, r4, r1, lsl #16
+; CT-NEXT:    orr r3, r5, r12, lsl #16
+; CT-NEXT:    vmov d17, r2, r1
+; CT-NEXT:    vmov d16, r6, r3
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    vmov.u16 r0, d18[0]
+; CT-NEXT:    vmov.u16 r1, d18[1]
+; CT-NEXT:    vmov.u16 r2, d18[2]
+; CT-NEXT:    vmov.u16 r3, d18[3]
 ; CT-NEXT:    pop {r4, r5, r6, pc}
 ;
 ; BFLOAT-F16-NATIVE-LABEL: ct_v4bf16:
@@ -303,11 +249,7 @@ define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
 ; BFLOAT-F16-NATIVE-NEXT:    vldr d16, [sp]
 ; BFLOAT-F16-NATIVE-NEXT:    vmov d17, r2, r3
 ; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
-; BFLOAT-F16-NATIVE-NEXT:    rsb r1, r0, #0
-; BFLOAT-F16-NATIVE-NEXT:    vdup.32 d19, r1
-; BFLOAT-F16-NATIVE-NEXT:    vand d18, d17, d19
-; BFLOAT-F16-NATIVE-NEXT:    vbic d19, d16, d19
-; BFLOAT-F16-NATIVE-NEXT:    vorr d18, d18, d19
+; BFLOAT-F16-NATIVE-NEXT:    BUNDLE
 ; BFLOAT-F16-NATIVE-NEXT:    vmov r0, r1, d18
 ; BFLOAT-F16-NATIVE-NEXT:    bx lr
 ;
@@ -315,84 +257,75 @@ define <4 x bfloat> @ct_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) {
 ; F16-NATIVE:       @ %bb.0: @ %entry
 ; F16-NATIVE-NEXT:    .save {r4, r5, r6, lr}
 ; F16-NATIVE-NEXT:    push {r4, r5, r6, lr}
-; F16-NATIVE-NEXT:    ldrh lr, [sp, #36]
-; F16-NATIVE-NEXT:    vdup.16 d16, r0
-; F16-NATIVE-NEXT:    ldrh r12, [sp, #28]
+; F16-NATIVE-NEXT:    ldrh r1, [sp, #20]
 ; F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
-; F16-NATIVE-NEXT:    ldrh r1, [sp, #32]
-; F16-NATIVE-NEXT:    vshl.i16 d16, d16, #15
+; F16-NATIVE-NEXT:    ldrh r12, [sp, #36]
+; F16-NATIVE-NEXT:    and r0, r0, #1
+; F16-NATIVE-NEXT:    ldrh lr, [sp, #28]
 ; F16-NATIVE-NEXT:    ldrh r6, [sp, #24]
-; F16-NATIVE-NEXT:    ldrh r4, [sp, #20]
-; F16-NATIVE-NEXT:    orr r0, r1, lr, lsl #16
-; F16-NATIVE-NEXT:    orr r1, r6, r12, lsl #16
-; F16-NATIVE-NEXT:    ldrh r5, [sp, #16]
-; F16-NATIVE-NEXT:    vshr.s16 d16, d16, #15
-; F16-NATIVE-NEXT:    vmov d17, r1, r0
-; F16-NATIVE-NEXT:    orr r0, r5, r4, lsl #16
-; F16-NATIVE-NEXT:    vmov d18, r2, r0
-; F16-NATIVE-NEXT:    veor d18, d18, d17
-; F16-NATIVE-NEXT:    vand d16, d18, d16
-; F16-NATIVE-NEXT:    veor d16, d17, d16
-; F16-NATIVE-NEXT:    vmov.u16 r0, d16[0]
-; F16-NATIVE-NEXT:    vmov.u16 r1, d16[1]
-; F16-NATIVE-NEXT:    vmov.u16 r2, d16[2]
-; F16-NATIVE-NEXT:    vmov.u16 r3, d16[3]
+; F16-NATIVE-NEXT:    ldrh r4, [sp, #16]
+; F16-NATIVE-NEXT:    ldrh r5, [sp, #32]
+; F16-NATIVE-NEXT:    orr r6, r6, lr, lsl #16
+; F16-NATIVE-NEXT:    orr r1, r4, r1, lsl #16
+; F16-NATIVE-NEXT:    orr r3, r5, r12, lsl #16
+; F16-NATIVE-NEXT:    vmov d17, r2, r1
+; F16-NATIVE-NEXT:    vmov d16, r6, r3
+; F16-NATIVE-NEXT:    BUNDLE
+; F16-NATIVE-NEXT:    vmov.u16 r0, d18[0]
+; F16-NATIVE-NEXT:    vmov.u16 r1, d18[1]
+; F16-NATIVE-NEXT:    vmov.u16 r2, d18[2]
+; F16-NATIVE-NEXT:    vmov.u16 r3, d18[3]
 ; F16-NATIVE-NEXT:    pop {r4, r5, r6, pc}
 ;
 ; THUMB1-LABEL: ct_v4bf16:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #24]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r5, r1, #0
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r1, [sp, #28]
-; THUMB1-NEXT:    ands r1, r4
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    ldr r3, [sp, #32]
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    ldr r2, [sp, #16]
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    ldr r3, [sp, #20]
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r3, r6
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v4bf16:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    .save {r7, lr}
-; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #16]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
 ; THUMB2-NEXT:    and.w r0, r2, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r0, r1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #20]
-; THUMB2-NEXT:    and.w r2, r3, lr
-; THUMB2-NEXT:    ldrh.w r3, [sp, #8]
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
-; THUMB2-NEXT:    ldrh.w r2, [sp, #24]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r2, r3
-; THUMB2-NEXT:    ldrh.w r3, [sp, #28]
-; THUMB2-NEXT:    and.w r12, r12, r3
-; THUMB2-NEXT:    ldrh.w r3, [sp, #12]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    orr.w r3, r3, r12
-; THUMB2-NEXT:    pop {r7, pc}
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    ldrh.w r2, [sp, #28]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldrh.w r3, [sp, #16]
+; THUMB2-NEXT:    ldrh.w lr, [sp, #32]
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r2, r3, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    orrs r2, r4
+; THUMB2-NEXT:    ldrh.w lr, [sp, #36]
+; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <4 x bfloat> @llvm.ct.select.v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b)
   ret <4 x bfloat> %sel
@@ -401,245 +334,221 @@ entry:
 define <8 x half> @ct_v8f16(i1 %cond, <8 x half> %a, <8 x half> %b) {
 ; CT-LABEL: ct_v8f16:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    .save {r4, r5, r6, r7, r11, lr}
-; CT-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; CT-NEXT:    ldrh r5, [sp, #36]
+; CT-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CT-NEXT:    push {r4, r5, r6, r7, r8, lr}
+; CT-NEXT:    ldrh r12, [sp, #36]
 ; CT-NEXT:    pkhbt r2, r2, r3, lsl #16
 ; CT-NEXT:    ldrh r7, [sp, #32]
-; CT-NEXT:    vdup.8 d18, r1
-; CT-NEXT:    ldrh r1, [sp, #52]
+; CT-NEXT:    and r1, r1, #1
+; CT-NEXT:    ldrh r3, [sp, #52]
 ; CT-NEXT:    vmov.32 d16[0], r2
-; CT-NEXT:    orr r3, r7, r5, lsl #16
 ; CT-NEXT:    ldrh r2, [sp, #48]
+; CT-NEXT:    orr r7, r7, r12, lsl #16
 ; CT-NEXT:    ldrh r5, [sp, #68]
-; CT-NEXT:    vmovl.u8 q9, d18
-; CT-NEXT:    orr r1, r2, r1, lsl #16
-; CT-NEXT:    vmov.32 d17[0], r3
-; CT-NEXT:    ldrh r3, [sp, #64]
-; CT-NEXT:    ldrh r2, [sp, #28]
-; CT-NEXT:    vmov.32 d20[0], r1
-; CT-NEXT:    ldrh r1, [sp, #24]
-; CT-NEXT:    orr r3, r3, r5, lsl #16
+; CT-NEXT:    orr r2, r2, r3, lsl #16
+; CT-NEXT:    vmov.32 d17[0], r7
+; CT-NEXT:    ldrh r7, [sp, #64]
+; CT-NEXT:    ldrh r3, [sp, #28]
+; CT-NEXT:    vmov.32 d18[0], r2
+; CT-NEXT:    ldrh r2, [sp, #24]
+; CT-NEXT:    orr r7, r7, r5, lsl #16
 ; CT-NEXT:    ldrh r5, [sp, #76]
-; CT-NEXT:    vshl.i16 q9, q9, #15
-; CT-NEXT:    vmov.32 d21[0], r3
-; CT-NEXT:    orr r1, r1, r2, lsl #16
-; CT-NEXT:    ldrh r3, [sp, #72]
-; CT-NEXT:    ldrh r4, [sp, #60]
-; CT-NEXT:    vmov.32 d16[1], r1
-; CT-NEXT:    orr r1, r3, r5, lsl #16
-; CT-NEXT:    ldrh r6, [sp, #56]
-; CT-NEXT:    ldrh r12, [sp, #44]
-; CT-NEXT:    vshr.s16 q9, q9, #15
-; CT-NEXT:    vmov.32 d21[1], r1
-; CT-NEXT:    orr r1, r6, r4, lsl #16
-; CT-NEXT:    ldrh lr, [sp, #40]
-; CT-NEXT:    vmov.32 d20[1], r1
-; CT-NEXT:    orr r1, lr, r12, lsl #16
-; CT-NEXT:    vmov.32 d17[1], r1
-; CT-NEXT:    veor q8, q8, q10
-; CT-NEXT:    vand q8, q8, q9
-; CT-NEXT:    veor q8, q10, q8
-; CT-NEXT:    vst1.64 {d16, d17}, [r0:128]
-; CT-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+; CT-NEXT:    vmov.32 d19[0], r7
+; CT-NEXT:    orr r2, r2, r3, lsl #16
+; CT-NEXT:    ldrh r7, [sp, #72]
+; CT-NEXT:    ldrh lr, [sp, #60]
+; CT-NEXT:    vmov.32 d16[1], r2
+; CT-NEXT:    orr r2, r7, r5, lsl #16
+; CT-NEXT:    ldrh r4, [sp, #56]
+; CT-NEXT:    ldrh r8, [sp, #44]
+; CT-NEXT:    vmov.32 d19[1], r2
+; CT-NEXT:    orr r2, r4, lr, lsl #16
+; CT-NEXT:    ldrh r6, [sp, #40]
+; CT-NEXT:    vmov.32 d18[1], r2
+; CT-NEXT:    orr r2, r6, r8, lsl #16
+; CT-NEXT:    vmov.32 d17[1], r2
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    vst1.64 {d20, d21}, [r0:128]
+; CT-NEXT:    pop {r4, r5, r6, r7, r8, pc}
 ;
 ; BFLOAT-F16-NATIVE-LABEL: ct_v8f16:
 ; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
-; BFLOAT-F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r11, lr}
-; BFLOAT-F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #36]
+; BFLOAT-F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; BFLOAT-F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r8, lr}
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r12, [sp, #36]
 ; BFLOAT-F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
 ; BFLOAT-F16-NATIVE-NEXT:    ldrh r7, [sp, #32]
-; BFLOAT-F16-NATIVE-NEXT:    vdup.8 d18, r1
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r1, [sp, #52]
+; BFLOAT-F16-NATIVE-NEXT:    and r1, r1, #1
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r3, [sp, #52]
 ; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d16[0], r2
-; BFLOAT-F16-NATIVE-NEXT:    orr r3, r7, r5, lsl #16
 ; BFLOAT-F16-NATIVE-NEXT:    ldrh r2, [sp, #48]
+; BFLOAT-F16-NATIVE-NEXT:    orr r7, r7, r12, lsl #16
 ; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #68]
-; BFLOAT-F16-NATIVE-NEXT:    vmovl.u8 q9, d18
-; BFLOAT-F16-NATIVE-NEXT:    orr r1, r2, r1, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d17[0], r3
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r3, [sp, #64]
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r2, [sp, #28]
-; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d20[0], r1
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r1, [sp, #24]
-; BFLOAT-F16-NATIVE-NEXT:    orr r3, r3, r5, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    orr r2, r2, r3, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d17[0], r7
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r7, [sp, #64]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r3, [sp, #28]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d18[0], r2
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r2, [sp, #24]
+; BFLOAT-F16-NATIVE-NEXT:    orr r7, r7, r5, lsl #16
 ; BFLOAT-F16-NATIVE-NEXT:    ldrh r5, [sp, #76]
-; BFLOAT-F16-NATIVE-NEXT:    vshl.i16 q9, q9, #15
-; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d21[0], r3
-; BFLOAT-F16-NATIVE-NEXT:    orr r1, r1, r2, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r3, [sp, #72]
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r4, [sp, #60]
-; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d16[1], r1
-; BFLOAT-F16-NATIVE-NEXT:    orr r1, r3, r5, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r6, [sp, #56]
-; BFLOAT-F16-NATIVE-NEXT:    ldrh r12, [sp, #44]
-; BFLOAT-F16-NATIVE-NEXT:    vshr.s16 q9, q9, #15
-; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d21[1], r1
-; BFLOAT-F16-NATIVE-NEXT:    orr r1, r6, r4, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    ldrh lr, [sp, #40]
-; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d20[1], r1
-; BFLOAT-F16-NATIVE-NEXT:    orr r1, lr, r12, lsl #16
-; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d17[1], r1
-; BFLOAT-F16-NATIVE-NEXT:    veor q8, q8, q10
-; BFLOAT-F16-NATIVE-NEXT:    vand q8, q8, q9
-; BFLOAT-F16-NATIVE-NEXT:    veor q8, q10, q8
-; BFLOAT-F16-NATIVE-NEXT:    vst1.64 {d16, d17}, [r0:128]
-; BFLOAT-F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d19[0], r7
+; BFLOAT-F16-NATIVE-NEXT:    orr r2, r2, r3, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r7, [sp, #72]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh lr, [sp, #60]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d16[1], r2
+; BFLOAT-F16-NATIVE-NEXT:    orr r2, r7, r5, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r4, [sp, #56]
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r8, [sp, #44]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d19[1], r2
+; BFLOAT-F16-NATIVE-NEXT:    orr r2, r4, lr, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    ldrh r6, [sp, #40]
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d18[1], r2
+; BFLOAT-F16-NATIVE-NEXT:    orr r2, r6, r8, lsl #16
+; BFLOAT-F16-NATIVE-NEXT:    vmov.32 d17[1], r2
+; BFLOAT-F16-NATIVE-NEXT:    BUNDLE
+; BFLOAT-F16-NATIVE-NEXT:    vst1.64 {d20, d21}, [r0:128]
+; BFLOAT-F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r8, pc}
 ;
 ; F16-NATIVE-LABEL: ct_v8f16:
 ; F16-NATIVE:       @ %bb.0: @ %entry
-; F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r11, lr}
-; F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; F16-NATIVE-NEXT:    ldrh r5, [sp, #36]
+; F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r8, lr}
+; F16-NATIVE-NEXT:    ldrh r12, [sp, #36]
 ; F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
 ; F16-NATIVE-NEXT:    ldrh r7, [sp, #32]
-; F16-NATIVE-NEXT:    vdup.8 d18, r1
-; F16-NATIVE-NEXT:    ldrh r1, [sp, #52]
+; F16-NATIVE-NEXT:    and r1, r1, #1
+; F16-NATIVE-NEXT:    ldrh r3, [sp, #52]
 ; F16-NATIVE-NEXT:    vmov.32 d16[0], r2
-; F16-NATIVE-NEXT:    orr r3, r7, r5, lsl #16
 ; F16-NATIVE-NEXT:    ldrh r2, [sp, #48]
+; F16-NATIVE-NEXT:    orr r7, r7, r12, lsl #16
 ; F16-NATIVE-NEXT:    ldrh r5, [sp, #68]
-; F16-NATIVE-NEXT:    vmovl.u8 q9, d18
-; F16-NATIVE-NEXT:    orr r1, r2, r1, lsl #16
-; F16-NATIVE-NEXT:    vmov.32 d17[0], r3
-; F16-NATIVE-NEXT:    ldrh r3, [sp, #64]
-; F16-NATIVE-NEXT:    ldrh r2, [sp, #28]
-; F16-NATIVE-NEXT:    vmov.32 d20[0], r1
-; F16-NATIVE-NEXT:    ldrh r1, [sp, #24]
-; F16-NATIVE-NEXT:    orr r3, r3, r5, lsl #16
+; F16-NATIVE-NEXT:    orr r2, r2, r3, lsl #16
+; F16-NATIVE-NEXT:    vmov.32 d17[0], r7
+; F16-NATIVE-NEXT:    ldrh r7, [sp, #64]
+; F16-NATIVE-NEXT:    ldrh r3, [sp, #28]
+; F16-NATIVE-NEXT:    vmov.32 d18[0], r2
+; F16-NATIVE-NEXT:    ldrh r2, [sp, #24]
+; F16-NATIVE-NEXT:    orr r7, r7, r5, lsl #16
 ; F16-NATIVE-NEXT:    ldrh r5, [sp, #76]
-; F16-NATIVE-NEXT:    vshl.i16 q9, q9, #15
-; F16-NATIVE-NEXT:    vmov.32 d21[0], r3
-; F16-NATIVE-NEXT:    orr r1, r1, r2, lsl #16
-; F16-NATIVE-NEXT:    ldrh r3, [sp, #72]
-; F16-NATIVE-NEXT:    ldrh r4, [sp, #60]
-; F16-NATIVE-NEXT:    vmov.32 d16[1], r1
-; F16-NATIVE-NEXT:    orr r1, r3, r5, lsl #16
-; F16-NATIVE-NEXT:    ldrh r6, [sp, #56]
-; F16-NATIVE-NEXT:    ldrh r12, [sp, #44]
-; F16-NATIVE-NEXT:    vshr.s16 q9, q9, #15
-; F16-NATIVE-NEXT:    vmov.32 d21[1], r1
-; F16-NATIVE-NEXT:    orr r1, r6, r4, lsl #16
-; F16-NATIVE-NEXT:    ldrh lr, [sp, #40]
-; F16-NATIVE-NEXT:    vmov.32 d20[1], r1
-; F16-NATIVE-NEXT:    orr r1, lr, r12, lsl #16
-; F16-NATIVE-NEXT:    vmov.32 d17[1], r1
-; F16-NATIVE-NEXT:    veor q8, q8, q10
-; F16-NATIVE-NEXT:    vand q8, q8, q9
-; F16-NATIVE-NEXT:    veor q8, q10, q8
-; F16-NATIVE-NEXT:    vst1.64 {d16, d17}, [r0:128]
-; F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+; F16-NATIVE-NEXT:    vmov.32 d19[0], r7
+; F16-NATIVE-NEXT:    orr r2, r2, r3, lsl #16
+; F16-NATIVE-NEXT:    ldrh r7, [sp, #72]
+; F16-NATIVE-NEXT:    ldrh lr, [sp, #60]
+; F16-NATIVE-NEXT:    vmov.32 d16[1], r2
+; F16-NATIVE-NEXT:    orr r2, r7, r5, lsl #16
+; F16-NATIVE-NEXT:    ldrh r4, [sp, #56]
+; F16-NATIVE-NEXT:    ldrh r8, [sp, #44]
+; F16-NATIVE-NEXT:    vmov.32 d19[1], r2
+; F16-NATIVE-NEXT:    orr r2, r4, lr, lsl #16
+; F16-NATIVE-NEXT:    ldrh r6, [sp, #40]
+; F16-NATIVE-NEXT:    vmov.32 d18[1], r2
+; F16-NATIVE-NEXT:    orr r2, r6, r8, lsl #16
+; F16-NATIVE-NEXT:    vmov.32 d17[1], r2
+; F16-NATIVE-NEXT:    BUNDLE
+; F16-NATIVE-NEXT:    vst1.64 {d20, d21}, [r0:128]
+; F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r8, pc}
 ;
 ; THUMB1-LABEL: ct_v8f16:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
 ; THUMB1-NEXT:    movs r4, #1
 ; THUMB1-NEXT:    ands r4, r1
-; THUMB1-NEXT:    subs r1, r4, #1
-; THUMB1-NEXT:    ldr r5, [sp, #68]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    rsbs r4, r4, #0
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #76]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #14]
-; THUMB1-NEXT:    ldr r5, [sp, #64]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #32]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #72]
+; THUMB1-NEXT:    ldr r5, [sp, #40]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #12]
-; THUMB1-NEXT:    ldr r5, [sp, #60]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #28]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #68]
+; THUMB1-NEXT:    ldr r5, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #10]
-; THUMB1-NEXT:    ldr r5, [sp, #56]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #24]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #64]
+; THUMB1-NEXT:    ldr r5, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #8]
-; THUMB1-NEXT:    ldr r5, [sp, #52]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #20]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #60]
+; THUMB1-NEXT:    ldr r5, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #6]
-; THUMB1-NEXT:    ldr r5, [sp, #48]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #16]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #56]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #4]
-; THUMB1-NEXT:    ldr r5, [sp, #44]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    orrs r3, r5
-; THUMB1-NEXT:    strh r3, [r0, #2]
-; THUMB1-NEXT:    ldr r3, [sp, #40]
-; THUMB1-NEXT:    ands r3, r1
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    strh r2, [r0]
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    ldr r1, [sp, #52]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strh r5, [r0, #2]
+; THUMB1-NEXT:    ldr r1, [sp, #48]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strh r3, [r0]
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v8f16:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r4, r5, r7, lr}
 ; THUMB2-NEXT:    push {r4, r5, r7, lr}
 ; THUMB2-NEXT:    and lr, r1, #1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #68]
-; THUMB2-NEXT:    sub.w r12, lr, #1
-; THUMB2-NEXT:    ldrh.w r4, [sp, #36]
-; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    ldrh.w r12, [sp, #68]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #36]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r4, r1, r5
+; THUMB2-NEXT:    bic.w r5, r12, r5
+; THUMB2-NEXT:    orrs r4, r5
+; THUMB2-NEXT:    strh r4, [r0, #14]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #64]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #32]
 ; THUMB2-NEXT:    rsb.w r1, lr, #0
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r3, r1
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #32]
-; THUMB2-NEXT:    strh r5, [r0, #14]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #64]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #28]
-; THUMB2-NEXT:    strh r5, [r0, #12]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #60]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #24]
-; THUMB2-NEXT:    strh r5, [r0, #10]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #56]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
-; THUMB2-NEXT:    strh r5, [r0, #8]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #52]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #16]
-; THUMB2-NEXT:    strh r5, [r0, #6]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #48]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r1, r2
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #12]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #60]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #28]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #10]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #56]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #24]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #8]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #52]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #20]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #6]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #48]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #16]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #4]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #44]
+; THUMB2-NEXT:    rsb.w r4, lr, #0
+; THUMB2-NEXT:    and.w r5, r3, r4
+; THUMB2-NEXT:    bic.w r4, r1, r4
 ; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    strh r5, [r0, #4]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #44]
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    strh r5, [r0, #2]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #40]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r3, r2, r5
+; THUMB2-NEXT:    bic.w r5, r1, r5
 ; THUMB2-NEXT:    orrs r3, r5
-; THUMB2-NEXT:    strh r3, [r0, #2]
-; THUMB2-NEXT:    ldrh.w r3, [sp, #40]
-; THUMB2-NEXT:    and.w r3, r3, r12
-; THUMB2-NEXT:    orrs r1, r3
-; THUMB2-NEXT:    strh r1, [r0]
+; THUMB2-NEXT:    strh r3, [r0]
 ; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <8 x half> @llvm.ct.select.v8f16(i1 %cond, <8 x half> %a, <8 x half> %b)
@@ -649,47 +558,42 @@ entry:
 define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
 ; CT-LABEL: ct_v8bf16:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    .save {r4, r5, r6, r7, r11, lr}
-; CT-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; CT-NEXT:    ldrh r5, [sp, #36]
+; CT-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CT-NEXT:    push {r4, r5, r6, r7, r8, lr}
+; CT-NEXT:    ldrh r12, [sp, #36]
 ; CT-NEXT:    pkhbt r2, r2, r3, lsl #16
 ; CT-NEXT:    ldrh r7, [sp, #32]
-; CT-NEXT:    vdup.8 d18, r1
-; CT-NEXT:    ldrh r1, [sp, #52]
+; CT-NEXT:    and r1, r1, #1
+; CT-NEXT:    ldrh r3, [sp, #52]
 ; CT-NEXT:    vmov.32 d16[0], r2
-; CT-NEXT:    orr r3, r7, r5, lsl #16
 ; CT-NEXT:    ldrh r2, [sp, #48]
+; CT-NEXT:    orr r7, r7, r12, lsl #16
 ; CT-NEXT:    ldrh r5, [sp, #68]
-; CT-NEXT:    vmovl.u8 q9, d18
-; CT-NEXT:    orr r1, r2, r1, lsl #16
-; CT-NEXT:    vmov.32 d17[0], r3
-; CT-NEXT:    ldrh r3, [sp, #64]
-; CT-NEXT:    ldrh r2, [sp, #28]
-; CT-NEXT:    vmov.32 d20[0], r1
-; CT-NEXT:    ldrh r1, [sp, #24]
-; CT-NEXT:    orr r3, r3, r5, lsl #16
+; CT-NEXT:    orr r2, r2, r3, lsl #16
+; CT-NEXT:    vmov.32 d17[0], r7
+; CT-NEXT:    ldrh r7, [sp, #64]
+; CT-NEXT:    ldrh r3, [sp, #28]
+; CT-NEXT:    vmov.32 d18[0], r2
+; CT-NEXT:    ldrh r2, [sp, #24]
+; CT-NEXT:    orr r7, r7, r5, lsl #16
 ; CT-NEXT:    ldrh r5, [sp, #76]
-; CT-NEXT:    vshl.i16 q9, q9, #15
-; CT-NEXT:    vmov.32 d21[0], r3
-; CT-NEXT:    orr r1, r1, r2, lsl #16
-; CT-NEXT:    ldrh r3, [sp, #72]
-; CT-NEXT:    ldrh r4, [sp, #60]
-; CT-NEXT:    vmov.32 d16[1], r1
-; CT-NEXT:    orr r1, r3, r5, lsl #16
-; CT-NEXT:    ldrh r6, [sp, #56]
-; CT-NEXT:    ldrh r12, [sp, #44]
-; CT-NEXT:    vshr.s16 q9, q9, #15
-; CT-NEXT:    vmov.32 d21[1], r1
-; CT-NEXT:    orr r1, r6, r4, lsl #16
-; CT-NEXT:    ldrh lr, [sp, #40]
-; CT-NEXT:    vmov.32 d20[1], r1
-; CT-NEXT:    orr r1, lr, r12, lsl #16
-; CT-NEXT:    vmov.32 d17[1], r1
-; CT-NEXT:    veor q8, q8, q10
-; CT-NEXT:    vand q8, q8, q9
-; CT-NEXT:    veor q8, q10, q8
-; CT-NEXT:    vst1.64 {d16, d17}, [r0:128]
-; CT-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+; CT-NEXT:    vmov.32 d19[0], r7
+; CT-NEXT:    orr r2, r2, r3, lsl #16
+; CT-NEXT:    ldrh r7, [sp, #72]
+; CT-NEXT:    ldrh lr, [sp, #60]
+; CT-NEXT:    vmov.32 d16[1], r2
+; CT-NEXT:    orr r2, r7, r5, lsl #16
+; CT-NEXT:    ldrh r4, [sp, #56]
+; CT-NEXT:    ldrh r8, [sp, #44]
+; CT-NEXT:    vmov.32 d19[1], r2
+; CT-NEXT:    orr r2, r4, lr, lsl #16
+; CT-NEXT:    ldrh r6, [sp, #40]
+; CT-NEXT:    vmov.32 d18[1], r2
+; CT-NEXT:    orr r2, r6, r8, lsl #16
+; CT-NEXT:    vmov.32 d17[1], r2
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    vst1.64 {d20, d21}, [r0:128]
+; CT-NEXT:    pop {r4, r5, r6, r7, r8, pc}
 ;
 ; BFLOAT-F16-NATIVE-LABEL: ct_v8bf16:
 ; BFLOAT-F16-NATIVE:       @ %bb.0: @ %entry
@@ -698,168 +602,150 @@ define <8 x bfloat> @ct_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) {
 ; BFLOAT-F16-NATIVE-NEXT:    vmov d16, r2, r3
 ; BFLOAT-F16-NATIVE-NEXT:    vld1.64 {d18, d19}, [r1]
 ; BFLOAT-F16-NATIVE-NEXT:    and r0, r0, #1
-; BFLOAT-F16-NATIVE-NEXT:    rsb r1, r0, #0
-; BFLOAT-F16-NATIVE-NEXT:    vdup.32 q11, r1
-; BFLOAT-F16-NATIVE-NEXT:    vand q10, q8, q11
-; BFLOAT-F16-NATIVE-NEXT:    vbic q11, q9, q11
-; BFLOAT-F16-NATIVE-NEXT:    vorr q10, q10, q11
+; BFLOAT-F16-NATIVE-NEXT:    BUNDLE
 ; BFLOAT-F16-NATIVE-NEXT:    vmov r0, r1, d20
 ; BFLOAT-F16-NATIVE-NEXT:    vmov r2, r3, d21
 ; BFLOAT-F16-NATIVE-NEXT:    bx lr
 ;
 ; F16-NATIVE-LABEL: ct_v8bf16:
 ; F16-NATIVE:       @ %bb.0: @ %entry
-; F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r11, lr}
-; F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r11, lr}
-; F16-NATIVE-NEXT:    ldrh r5, [sp, #36]
+; F16-NATIVE-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; F16-NATIVE-NEXT:    push {r4, r5, r6, r7, r8, lr}
+; F16-NATIVE-NEXT:    ldrh r12, [sp, #36]
 ; F16-NATIVE-NEXT:    pkhbt r2, r2, r3, lsl #16
 ; F16-NATIVE-NEXT:    ldrh r7, [sp, #32]
-; F16-NATIVE-NEXT:    vdup.8 d18, r1
-; F16-NATIVE-NEXT:    ldrh r1, [sp, #52]
+; F16-NATIVE-NEXT:    and r1, r1, #1
+; F16-NATIVE-NEXT:    ldrh r3, [sp, #52]
 ; F16-NATIVE-NEXT:    vmov.32 d16[0], r2
-; F16-NATIVE-NEXT:    orr r3, r7, r5, lsl #16
 ; F16-NATIVE-NEXT:    ldrh r2, [sp, #48]
+; F16-NATIVE-NEXT:    orr r7, r7, r12, lsl #16
 ; F16-NATIVE-NEXT:    ldrh r5, [sp, #68]
-; F16-NATIVE-NEXT:    vmovl.u8 q9, d18
-; F16-NATIVE-NEXT:    orr r1, r2, r1, lsl #16
-; F16-NATIVE-NEXT:    vmov.32 d17[0], r3
-; F16-NATIVE-NEXT:    ldrh r3, [sp, #64]
-; F16-NATIVE-NEXT:    ldrh r2, [sp, #28]
-; F16-NATIVE-NEXT:    vmov.32 d20[0], r1
-; F16-NATIVE-NEXT:    ldrh r1, [sp, #24]
-; F16-NATIVE-NEXT:    orr r3, r3, r5, lsl #16
+; F16-NATIVE-NEXT:    orr r2, r2, r3, lsl #16
+; F16-NATIVE-NEXT:    vmov.32 d17[0], r7
+; F16-NATIVE-NEXT:    ldrh r7, [sp, #64]
+; F16-NATIVE-NEXT:    ldrh r3, [sp, #28]
+; F16-NATIVE-NEXT:    vmov.32 d18[0], r2
+; F16-NATIVE-NEXT:    ldrh r2, [sp, #24]
+; F16-NATIVE-NEXT:    orr r7, r7, r5, lsl #16
 ; F16-NATIVE-NEXT:    ldrh r5, [sp, #76]
-; F16-NATIVE-NEXT:    vshl.i16 q9, q9, #15
-; F16-NATIVE-NEXT:    vmov.32 d21[0], r3
-; F16-NATIVE-NEXT:    orr r1, r1, r2, lsl #16
-; F16-NATIVE-NEXT:    ldrh r3, [sp, #72]
-; F16-NATIVE-NEXT:    ldrh r4, [sp, #60]
-; F16-NATIVE-NEXT:    vmov.32 d16[1], r1
-; F16-NATIVE-NEXT:    orr r1, r3, r5, lsl #16
-; F16-NATIVE-NEXT:    ldrh r6, [sp, #56]
-; F16-NATIVE-NEXT:    ldrh r12, [sp, #44]
-; F16-NATIVE-NEXT:    vshr.s16 q9, q9, #15
-; F16-NATIVE-NEXT:    vmov.32 d21[1], r1
-; F16-NATIVE-NEXT:    orr r1, r6, r4, lsl #16
-; F16-NATIVE-NEXT:    ldrh lr, [sp, #40]
-; F16-NATIVE-NEXT:    vmov.32 d20[1], r1
-; F16-NATIVE-NEXT:    orr r1, lr, r12, lsl #16
-; F16-NATIVE-NEXT:    vmov.32 d17[1], r1
-; F16-NATIVE-NEXT:    veor q8, q8, q10
-; F16-NATIVE-NEXT:    vand q8, q8, q9
-; F16-NATIVE-NEXT:    veor q8, q10, q8
-; F16-NATIVE-NEXT:    vst1.64 {d16, d17}, [r0:128]
-; F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+; F16-NATIVE-NEXT:    vmov.32 d19[0], r7
+; F16-NATIVE-NEXT:    orr r2, r2, r3, lsl #16
+; F16-NATIVE-NEXT:    ldrh r7, [sp, #72]
+; F16-NATIVE-NEXT:    ldrh lr, [sp, #60]
+; F16-NATIVE-NEXT:    vmov.32 d16[1], r2
+; F16-NATIVE-NEXT:    orr r2, r7, r5, lsl #16
+; F16-NATIVE-NEXT:    ldrh r4, [sp, #56]
+; F16-NATIVE-NEXT:    ldrh r8, [sp, #44]
+; F16-NATIVE-NEXT:    vmov.32 d19[1], r2
+; F16-NATIVE-NEXT:    orr r2, r4, lr, lsl #16
+; F16-NATIVE-NEXT:    ldrh r6, [sp, #40]
+; F16-NATIVE-NEXT:    vmov.32 d18[1], r2
+; F16-NATIVE-NEXT:    orr r2, r6, r8, lsl #16
+; F16-NATIVE-NEXT:    vmov.32 d17[1], r2
+; F16-NATIVE-NEXT:    BUNDLE
+; F16-NATIVE-NEXT:    vst1.64 {d20, d21}, [r0:128]
+; F16-NATIVE-NEXT:    pop {r4, r5, r6, r7, r8, pc}
 ;
 ; THUMB1-LABEL: ct_v8bf16:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
 ; THUMB1-NEXT:    movs r4, #1
 ; THUMB1-NEXT:    ands r4, r1
-; THUMB1-NEXT:    subs r1, r4, #1
-; THUMB1-NEXT:    ldr r5, [sp, #68]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    rsbs r4, r4, #0
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #76]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #14]
-; THUMB1-NEXT:    ldr r5, [sp, #64]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #32]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #72]
+; THUMB1-NEXT:    ldr r5, [sp, #40]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #12]
-; THUMB1-NEXT:    ldr r5, [sp, #60]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #28]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #68]
+; THUMB1-NEXT:    ldr r5, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #10]
-; THUMB1-NEXT:    ldr r5, [sp, #56]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #24]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #64]
+; THUMB1-NEXT:    ldr r5, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #8]
-; THUMB1-NEXT:    ldr r5, [sp, #52]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #20]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #60]
+; THUMB1-NEXT:    ldr r5, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #6]
-; THUMB1-NEXT:    ldr r5, [sp, #48]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #16]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #56]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #4]
-; THUMB1-NEXT:    ldr r5, [sp, #44]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    orrs r3, r5
-; THUMB1-NEXT:    strh r3, [r0, #2]
-; THUMB1-NEXT:    ldr r3, [sp, #40]
-; THUMB1-NEXT:    ands r3, r1
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    strh r2, [r0]
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    ldr r1, [sp, #52]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strh r5, [r0, #2]
+; THUMB1-NEXT:    ldr r1, [sp, #48]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strh r3, [r0]
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v8bf16:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r4, r5, r7, lr}
 ; THUMB2-NEXT:    push {r4, r5, r7, lr}
 ; THUMB2-NEXT:    and lr, r1, #1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #68]
-; THUMB2-NEXT:    sub.w r12, lr, #1
-; THUMB2-NEXT:    ldrh.w r4, [sp, #36]
-; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    ldrh.w r12, [sp, #68]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #36]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r4, r1, r5
+; THUMB2-NEXT:    bic.w r5, r12, r5
+; THUMB2-NEXT:    orrs r4, r5
+; THUMB2-NEXT:    strh r4, [r0, #14]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #64]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #32]
 ; THUMB2-NEXT:    rsb.w r1, lr, #0
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r3, r1
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #32]
-; THUMB2-NEXT:    strh r5, [r0, #14]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #64]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #28]
-; THUMB2-NEXT:    strh r5, [r0, #12]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #60]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #24]
-; THUMB2-NEXT:    strh r5, [r0, #10]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #56]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
-; THUMB2-NEXT:    strh r5, [r0, #8]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #52]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #16]
-; THUMB2-NEXT:    strh r5, [r0, #6]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #48]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r1, r2
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #12]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #60]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #28]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #10]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #56]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #24]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #8]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #52]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #20]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #6]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #48]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #16]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #4]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #44]
+; THUMB2-NEXT:    rsb.w r4, lr, #0
+; THUMB2-NEXT:    and.w r5, r3, r4
+; THUMB2-NEXT:    bic.w r4, r1, r4
 ; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    strh r5, [r0, #4]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #44]
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    strh r5, [r0, #2]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #40]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r3, r2, r5
+; THUMB2-NEXT:    bic.w r5, r1, r5
 ; THUMB2-NEXT:    orrs r3, r5
-; THUMB2-NEXT:    strh r3, [r0, #2]
-; THUMB2-NEXT:    ldrh.w r3, [sp, #40]
-; THUMB2-NEXT:    and.w r3, r3, r12
-; THUMB2-NEXT:    orrs r1, r3
-; THUMB2-NEXT:    strh r1, [r0]
+; THUMB2-NEXT:    strh r3, [r0]
 ; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <8 x bfloat> @llvm.ct.select.v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b)
diff --git a/llvm/test/CodeGen/ARM/ctselect-vector.ll b/llvm/test/CodeGen/ARM/ctselect-vector.ll
index 3a03ebccb05ac..d720c8990ab69 100644
--- a/llvm/test/CodeGen/ARM/ctselect-vector.ll
+++ b/llvm/test/CodeGen/ARM/ctselect-vector.ll
@@ -10,11 +10,7 @@ define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) {
 ; CT-NEXT:    vldr d16, [sp]
 ; CT-NEXT:    vmov d17, r2, r3
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 d19, r1
-; CT-NEXT:    vand d18, d17, d19
-; CT-NEXT:    vbic d19, d16, d19
-; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
@@ -22,165 +18,138 @@ define <8 x i8> @ct_v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b) {
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r4, r5, r11, lr}
 ; DEFAULT-NEXT:    and lr, r1, #1
-; DEFAULT-NEXT:    ldrb r1, [sp, #68]
-; DEFAULT-NEXT:    sub r12, lr, #1
-; DEFAULT-NEXT:    ldrb r4, [sp, #36]
-; DEFAULT-NEXT:    and r5, r1, r12
-; DEFAULT-NEXT:    rsb r1, lr, #0
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r3, r3, r1
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #32]
-; DEFAULT-NEXT:    strb r5, [r0, #7]
-; DEFAULT-NEXT:    ldrb r5, [sp, #64]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #28]
-; DEFAULT-NEXT:    strb r5, [r0, #6]
-; DEFAULT-NEXT:    ldrb r5, [sp, #60]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #24]
-; DEFAULT-NEXT:    strb r5, [r0, #5]
-; DEFAULT-NEXT:    ldrb r5, [sp, #56]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #20]
-; DEFAULT-NEXT:    strb r5, [r0, #4]
-; DEFAULT-NEXT:    ldrb r5, [sp, #52]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #16]
-; DEFAULT-NEXT:    strb r5, [r0, #3]
-; DEFAULT-NEXT:    ldrb r5, [sp, #48]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r1, r2, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    strb r5, [r0, #2]
-; DEFAULT-NEXT:    ldrb r5, [sp, #44]
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r3, r3, r5
-; DEFAULT-NEXT:    strb r3, [r0, #1]
-; DEFAULT-NEXT:    ldrb r3, [sp, #40]
-; DEFAULT-NEXT:    and r3, r3, r12
-; DEFAULT-NEXT:    orr r1, r1, r3
-; DEFAULT-NEXT:    strb r1, [r0]
+; DEFAULT-NEXT:    ldrb r12, [sp, #68]
+; DEFAULT-NEXT:    ldrb r1, [sp, #36]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #7]
+; DEFAULT-NEXT:    ldrb r12, [sp, #64]
+; DEFAULT-NEXT:    ldrb r5, [sp, #32]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #6]
+; DEFAULT-NEXT:    ldrb r12, [sp, #60]
+; DEFAULT-NEXT:    ldrb r5, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #5]
+; DEFAULT-NEXT:    ldrb r12, [sp, #56]
+; DEFAULT-NEXT:    ldrb r5, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #4]
+; DEFAULT-NEXT:    ldrb r12, [sp, #52]
+; DEFAULT-NEXT:    ldrb r5, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #3]
+; DEFAULT-NEXT:    ldrb r12, [sp, #48]
+; DEFAULT-NEXT:    ldrb r5, [sp, #16]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #2]
+; DEFAULT-NEXT:    ldrb r1, [sp, #44]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r5, [r0, #1]
+; DEFAULT-NEXT:    ldrb r1, [sp, #40]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r3, [r0]
 ; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v8i8:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
 ; THUMB1-NEXT:    movs r4, #1
 ; THUMB1-NEXT:    ands r4, r1
-; THUMB1-NEXT:    subs r1, r4, #1
-; THUMB1-NEXT:    ldr r5, [sp, #68]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    rsbs r4, r4, #0
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #76]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #7]
-; THUMB1-NEXT:    ldr r5, [sp, #64]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #32]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #72]
+; THUMB1-NEXT:    ldr r5, [sp, #40]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #6]
-; THUMB1-NEXT:    ldr r5, [sp, #60]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #28]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #68]
+; THUMB1-NEXT:    ldr r5, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #5]
-; THUMB1-NEXT:    ldr r5, [sp, #56]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #24]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #64]
+; THUMB1-NEXT:    ldr r5, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #4]
-; THUMB1-NEXT:    ldr r5, [sp, #52]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #20]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #60]
+; THUMB1-NEXT:    ldr r5, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #3]
-; THUMB1-NEXT:    ldr r5, [sp, #48]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #16]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #56]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #2]
-; THUMB1-NEXT:    ldr r5, [sp, #44]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    orrs r3, r5
-; THUMB1-NEXT:    strb r3, [r0, #1]
-; THUMB1-NEXT:    ldr r3, [sp, #40]
-; THUMB1-NEXT:    ands r3, r1
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    strb r2, [r0]
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    ldr r1, [sp, #52]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strb r5, [r0, #1]
+; THUMB1-NEXT:    ldr r1, [sp, #48]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strb r3, [r0]
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v8i8:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r4, r5, r7, lr}
 ; THUMB2-NEXT:    push {r4, r5, r7, lr}
 ; THUMB2-NEXT:    and lr, r1, #1
-; THUMB2-NEXT:    ldrb.w r1, [sp, #68]
-; THUMB2-NEXT:    sub.w r12, lr, #1
-; THUMB2-NEXT:    ldrb.w r4, [sp, #36]
-; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    ldrb.w r12, [sp, #68]
+; THUMB2-NEXT:    ldrb.w r1, [sp, #36]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r4, r1, r5
+; THUMB2-NEXT:    bic.w r5, r12, r5
+; THUMB2-NEXT:    orrs r4, r5
+; THUMB2-NEXT:    strb r4, [r0, #7]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #64]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #32]
 ; THUMB2-NEXT:    rsb.w r1, lr, #0
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r3, r1
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #32]
-; THUMB2-NEXT:    strb r5, [r0, #7]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #64]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #28]
-; THUMB2-NEXT:    strb r5, [r0, #6]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #60]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #24]
-; THUMB2-NEXT:    strb r5, [r0, #5]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #56]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #20]
-; THUMB2-NEXT:    strb r5, [r0, #4]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #52]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #16]
-; THUMB2-NEXT:    strb r5, [r0, #3]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #48]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r1, r2
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #6]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #60]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #28]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #5]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #56]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #24]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #4]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #52]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #20]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #3]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #48]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #16]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #2]
+; THUMB2-NEXT:    ldrb.w r1, [sp, #44]
+; THUMB2-NEXT:    rsb.w r4, lr, #0
+; THUMB2-NEXT:    and.w r5, r3, r4
+; THUMB2-NEXT:    bic.w r4, r1, r4
 ; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    strb r5, [r0, #2]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #44]
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    strb r5, [r0, #1]
+; THUMB2-NEXT:    ldrb.w r1, [sp, #40]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r3, r2, r5
+; THUMB2-NEXT:    bic.w r5, r1, r5
 ; THUMB2-NEXT:    orrs r3, r5
-; THUMB2-NEXT:    strb r3, [r0, #1]
-; THUMB2-NEXT:    ldrb.w r3, [sp, #40]
-; THUMB2-NEXT:    and.w r3, r3, r12
-; THUMB2-NEXT:    orrs r1, r3
-; THUMB2-NEXT:    strb r1, [r0]
+; THUMB2-NEXT:    strb r3, [r0]
 ; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <8 x i8> @llvm.ct.select.v8i8(i1 %cond, <8 x i8> %a, <8 x i8> %b)
@@ -193,94 +162,75 @@ define <4 x i16> @ct_v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b) {
 ; CT-NEXT:    vldr d16, [sp]
 ; CT-NEXT:    vmov d17, r2, r3
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 d19, r1
-; CT-NEXT:    vand d18, d17, d19
-; CT-NEXT:    vbic d19, d16, d19
-; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v4i16:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    ldrh r1, [sp, #16]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    rsb lr, r0, #0
-; DEFAULT-NEXT:    and r0, r2, lr
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r0, r0, r1
-; DEFAULT-NEXT:    ldrh r1, [sp, #20]
-; DEFAULT-NEXT:    and r2, r3, lr
-; DEFAULT-NEXT:    ldrh r3, [sp, #8]
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r1, r2, r1
-; DEFAULT-NEXT:    ldrh r2, [sp, #24]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r2, r3, r2
-; DEFAULT-NEXT:    ldrh r3, [sp, #28]
-; DEFAULT-NEXT:    and r12, r3, r12
-; DEFAULT-NEXT:    ldrh r3, [sp, #12]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    orr r3, r3, r12
-; DEFAULT-NEXT:    pop {r11, pc}
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    ldrh r1, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldrh r2, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldrh r3, [sp, #16]
+; DEFAULT-NEXT:    ldrh lr, [sp, #32]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldrh lr, [sp, #36]
+; DEFAULT-NEXT:    ldrh r4, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v4i16:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #24]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r5, r1, #0
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r1, [sp, #28]
-; THUMB1-NEXT:    ands r1, r4
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    ldr r3, [sp, #32]
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    ldr r2, [sp, #16]
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    ldr r3, [sp, #20]
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r3, r6
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v4i16:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    .save {r7, lr}
-; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #16]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldrh.w r1, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
 ; THUMB2-NEXT:    and.w r0, r2, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r0, r1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #20]
-; THUMB2-NEXT:    and.w r2, r3, lr
-; THUMB2-NEXT:    ldrh.w r3, [sp, #8]
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
-; THUMB2-NEXT:    ldrh.w r2, [sp, #24]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r2, r3
-; THUMB2-NEXT:    ldrh.w r3, [sp, #28]
-; THUMB2-NEXT:    and.w r12, r12, r3
-; THUMB2-NEXT:    ldrh.w r3, [sp, #12]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    orr.w r3, r3, r12
-; THUMB2-NEXT:    pop {r7, pc}
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    ldrh.w r2, [sp, #28]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldrh.w r3, [sp, #16]
+; THUMB2-NEXT:    ldrh.w lr, [sp, #32]
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r2, r3, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    orrs r2, r4
+; THUMB2-NEXT:    ldrh.w lr, [sp, #36]
+; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <4 x i16> @llvm.ct.select.v4i16(i1 %cond, <4 x i16> %a, <4 x i16> %b)
   ret <4 x i16> %sel
@@ -292,63 +242,47 @@ define <2 x i32> @ct_v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b) {
 ; CT-NEXT:    vldr d16, [sp]
 ; CT-NEXT:    vmov d17, r2, r3
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 d19, r1
-; CT-NEXT:    vand d18, d17, d19
-; CT-NEXT:    vbic d19, d16, d19
-; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v2i32:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    and r12, r0, #1
 ; DEFAULT-NEXT:    ldr r1, [sp, #8]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    and lr, r1, r12
-; DEFAULT-NEXT:    rsb r1, r0, #0
-; DEFAULT-NEXT:    and r0, r2, r1
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    ldr r2, [sp, #12]
-; DEFAULT-NEXT:    and r1, r3, r1
-; DEFAULT-NEXT:    orr r0, r0, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r1, r1, r2
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    pop {r11, pc}
 ;
 ; THUMB1-LABEL: ct_v2i32:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, lr}
-; THUMB1-NEXT:    push {r4, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #8]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r1, r1, #0
-; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r2, [sp, #12]
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    ands r1, r3
-; THUMB1-NEXT:    orrs r1, r2
-; THUMB1-NEXT:    pop {r4, pc}
+; THUMB1-NEXT:    .save {r4, r5, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r7, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #16]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #20]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v2i32:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r7, lr}
 ; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    and r12, r0, #1
 ; THUMB2-NEXT:    ldr r1, [sp, #8]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    and.w lr, r1, r12
-; THUMB2-NEXT:    rsbs r1, r0, #0
-; THUMB2-NEXT:    and.w r0, r2, r1
-; THUMB2-NEXT:    ldr r2, [sp, #12]
-; THUMB2-NEXT:    ands r1, r3
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    bic.w lr, r1, lr
 ; THUMB2-NEXT:    orr.w r0, r0, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
 ; THUMB2-NEXT:    pop {r7, pc}
 entry:
   %sel = call <2 x i32> @llvm.ct.select.v2i32(i1 %cond, <2 x i32> %a, <2 x i32> %b)
@@ -361,63 +295,47 @@ define <1 x i64> @ct_v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b) {
 ; CT-NEXT:    vldr d16, [sp]
 ; CT-NEXT:    vmov d17, r2, r3
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 d19, r1
-; CT-NEXT:    vand d18, d17, d19
-; CT-NEXT:    vbic d19, d16, d19
-; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v1i64:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    and r12, r0, #1
 ; DEFAULT-NEXT:    ldr r1, [sp, #8]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    and lr, r1, r12
-; DEFAULT-NEXT:    rsb r1, r0, #0
-; DEFAULT-NEXT:    and r0, r2, r1
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    ldr r2, [sp, #12]
-; DEFAULT-NEXT:    and r1, r3, r1
-; DEFAULT-NEXT:    orr r0, r0, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r1, r1, r2
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    pop {r11, pc}
 ;
 ; THUMB1-LABEL: ct_v1i64:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, lr}
-; THUMB1-NEXT:    push {r4, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #8]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r1, r1, #0
-; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r2, [sp, #12]
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    ands r1, r3
-; THUMB1-NEXT:    orrs r1, r2
-; THUMB1-NEXT:    pop {r4, pc}
+; THUMB1-NEXT:    .save {r4, r5, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r7, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #16]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #20]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v1i64:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r7, lr}
 ; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    and r12, r0, #1
 ; THUMB2-NEXT:    ldr r1, [sp, #8]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    and.w lr, r1, r12
-; THUMB2-NEXT:    rsbs r1, r0, #0
-; THUMB2-NEXT:    and.w r0, r2, r1
-; THUMB2-NEXT:    ldr r2, [sp, #12]
-; THUMB2-NEXT:    ands r1, r3
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    bic.w lr, r1, lr
 ; THUMB2-NEXT:    orr.w r0, r0, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
 ; THUMB2-NEXT:    pop {r7, pc}
 entry:
   %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cond, <1 x i64> %a, <1 x i64> %b)
@@ -430,63 +348,47 @@ define <2 x float> @ct_v2f32(i1 %cond, <2 x float> %a, <2 x float> %b) {
 ; CT-NEXT:    vldr d16, [sp]
 ; CT-NEXT:    vmov d17, r2, r3
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 d19, r1
-; CT-NEXT:    vand d18, d17, d19
-; CT-NEXT:    vbic d19, d16, d19
-; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v2f32:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    and r12, r0, #1
 ; DEFAULT-NEXT:    ldr r1, [sp, #8]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    and lr, r1, r12
-; DEFAULT-NEXT:    rsb r1, r0, #0
-; DEFAULT-NEXT:    and r0, r2, r1
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    ldr r2, [sp, #12]
-; DEFAULT-NEXT:    and r1, r3, r1
-; DEFAULT-NEXT:    orr r0, r0, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r1, r1, r2
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    pop {r11, pc}
 ;
 ; THUMB1-LABEL: ct_v2f32:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, lr}
-; THUMB1-NEXT:    push {r4, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #8]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r1, r1, #0
-; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r2, [sp, #12]
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    ands r1, r3
-; THUMB1-NEXT:    orrs r1, r2
-; THUMB1-NEXT:    pop {r4, pc}
+; THUMB1-NEXT:    .save {r4, r5, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r7, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #16]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #20]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v2f32:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r7, lr}
 ; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    and r12, r0, #1
 ; THUMB2-NEXT:    ldr r1, [sp, #8]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    and.w lr, r1, r12
-; THUMB2-NEXT:    rsbs r1, r0, #0
-; THUMB2-NEXT:    and.w r0, r2, r1
-; THUMB2-NEXT:    ldr r2, [sp, #12]
-; THUMB2-NEXT:    ands r1, r3
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    bic.w lr, r1, lr
 ; THUMB2-NEXT:    orr.w r0, r0, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
 ; THUMB2-NEXT:    pop {r7, pc}
 entry:
   %sel = call <2 x float> @llvm.ct.select.v2f32(i1 %cond, <2 x float> %a, <2 x float> %b)
@@ -501,11 +403,7 @@ define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
 ; CT-NEXT:    vmov d16, r2, r3
 ; CT-NEXT:    vld1.64 {d18, d19}, [r1]
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 q11, r1
-; CT-NEXT:    vand q10, q8, q11
-; CT-NEXT:    vbic q11, q9, q11
-; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d20
 ; CT-NEXT:    vmov r2, r3, d21
 ; CT-NEXT:    bx lr
@@ -514,309 +412,258 @@ define <16 x i8> @ct_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r4, r5, r11, lr}
 ; DEFAULT-NEXT:    and lr, r1, #1
-; DEFAULT-NEXT:    ldrb r1, [sp, #132]
-; DEFAULT-NEXT:    sub r12, lr, #1
-; DEFAULT-NEXT:    ldrb r4, [sp, #68]
-; DEFAULT-NEXT:    and r5, r1, r12
-; DEFAULT-NEXT:    rsb r1, lr, #0
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r3, r3, r1
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #64]
-; DEFAULT-NEXT:    strb r5, [r0, #15]
-; DEFAULT-NEXT:    ldrb r5, [sp, #128]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #60]
-; DEFAULT-NEXT:    strb r5, [r0, #14]
-; DEFAULT-NEXT:    ldrb r5, [sp, #124]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #56]
-; DEFAULT-NEXT:    strb r5, [r0, #13]
-; DEFAULT-NEXT:    ldrb r5, [sp, #120]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #52]
-; DEFAULT-NEXT:    strb r5, [r0, #12]
-; DEFAULT-NEXT:    ldrb r5, [sp, #116]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #48]
-; DEFAULT-NEXT:    strb r5, [r0, #11]
-; DEFAULT-NEXT:    ldrb r5, [sp, #112]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #44]
-; DEFAULT-NEXT:    strb r5, [r0, #10]
-; DEFAULT-NEXT:    ldrb r5, [sp, #108]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #40]
-; DEFAULT-NEXT:    strb r5, [r0, #9]
-; DEFAULT-NEXT:    ldrb r5, [sp, #104]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #36]
-; DEFAULT-NEXT:    strb r5, [r0, #8]
-; DEFAULT-NEXT:    ldrb r5, [sp, #100]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #32]
-; DEFAULT-NEXT:    strb r5, [r0, #7]
-; DEFAULT-NEXT:    ldrb r5, [sp, #96]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #28]
-; DEFAULT-NEXT:    strb r5, [r0, #6]
-; DEFAULT-NEXT:    ldrb r5, [sp, #92]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #24]
-; DEFAULT-NEXT:    strb r5, [r0, #5]
-; DEFAULT-NEXT:    ldrb r5, [sp, #88]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #20]
-; DEFAULT-NEXT:    strb r5, [r0, #4]
-; DEFAULT-NEXT:    ldrb r5, [sp, #84]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrb r4, [sp, #16]
-; DEFAULT-NEXT:    strb r5, [r0, #3]
-; DEFAULT-NEXT:    ldrb r5, [sp, #80]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r1, r2, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    strb r5, [r0, #2]
-; DEFAULT-NEXT:    ldrb r5, [sp, #76]
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r3, r3, r5
-; DEFAULT-NEXT:    strb r3, [r0, #1]
-; DEFAULT-NEXT:    ldrb r3, [sp, #72]
-; DEFAULT-NEXT:    and r3, r3, r12
-; DEFAULT-NEXT:    orr r1, r1, r3
-; DEFAULT-NEXT:    strb r1, [r0]
+; DEFAULT-NEXT:    ldrb r12, [sp, #132]
+; DEFAULT-NEXT:    ldrb r1, [sp, #68]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #15]
+; DEFAULT-NEXT:    ldrb r12, [sp, #128]
+; DEFAULT-NEXT:    ldrb r5, [sp, #64]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #14]
+; DEFAULT-NEXT:    ldrb r12, [sp, #124]
+; DEFAULT-NEXT:    ldrb r5, [sp, #60]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #13]
+; DEFAULT-NEXT:    ldrb r12, [sp, #120]
+; DEFAULT-NEXT:    ldrb r5, [sp, #56]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #12]
+; DEFAULT-NEXT:    ldrb r12, [sp, #116]
+; DEFAULT-NEXT:    ldrb r5, [sp, #52]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #11]
+; DEFAULT-NEXT:    ldrb r12, [sp, #112]
+; DEFAULT-NEXT:    ldrb r5, [sp, #48]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #10]
+; DEFAULT-NEXT:    ldrb r12, [sp, #108]
+; DEFAULT-NEXT:    ldrb r5, [sp, #44]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #9]
+; DEFAULT-NEXT:    ldrb r12, [sp, #104]
+; DEFAULT-NEXT:    ldrb r5, [sp, #40]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #8]
+; DEFAULT-NEXT:    ldrb r12, [sp, #100]
+; DEFAULT-NEXT:    ldrb r5, [sp, #36]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #7]
+; DEFAULT-NEXT:    ldrb r12, [sp, #96]
+; DEFAULT-NEXT:    ldrb r5, [sp, #32]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #6]
+; DEFAULT-NEXT:    ldrb r12, [sp, #92]
+; DEFAULT-NEXT:    ldrb r5, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #5]
+; DEFAULT-NEXT:    ldrb r12, [sp, #88]
+; DEFAULT-NEXT:    ldrb r5, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #4]
+; DEFAULT-NEXT:    ldrb r12, [sp, #84]
+; DEFAULT-NEXT:    ldrb r5, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #3]
+; DEFAULT-NEXT:    ldrb r12, [sp, #80]
+; DEFAULT-NEXT:    ldrb r5, [sp, #16]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r4, [r0, #2]
+; DEFAULT-NEXT:    ldrb r1, [sp, #76]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r5, [r0, #1]
+; DEFAULT-NEXT:    ldrb r1, [sp, #72]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strb r3, [r0]
 ; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v16i8:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
 ; THUMB1-NEXT:    movs r4, #1
 ; THUMB1-NEXT:    ands r4, r1
-; THUMB1-NEXT:    subs r1, r4, #1
-; THUMB1-NEXT:    ldr r5, [sp, #132]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    rsbs r4, r4, #0
-; THUMB1-NEXT:    ldr r6, [sp, #68]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #140]
+; THUMB1-NEXT:    ldr r5, [sp, #76]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #15]
-; THUMB1-NEXT:    ldr r5, [sp, #128]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #64]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #136]
+; THUMB1-NEXT:    ldr r5, [sp, #72]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #14]
-; THUMB1-NEXT:    ldr r5, [sp, #124]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #60]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #132]
+; THUMB1-NEXT:    ldr r5, [sp, #68]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #13]
-; THUMB1-NEXT:    ldr r5, [sp, #120]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #56]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #128]
+; THUMB1-NEXT:    ldr r5, [sp, #64]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #12]
-; THUMB1-NEXT:    ldr r5, [sp, #116]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #52]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #124]
+; THUMB1-NEXT:    ldr r5, [sp, #60]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #11]
-; THUMB1-NEXT:    ldr r5, [sp, #112]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #48]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #120]
+; THUMB1-NEXT:    ldr r5, [sp, #56]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #10]
-; THUMB1-NEXT:    ldr r5, [sp, #108]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #44]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #116]
+; THUMB1-NEXT:    ldr r5, [sp, #52]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #9]
-; THUMB1-NEXT:    ldr r5, [sp, #104]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #40]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #112]
+; THUMB1-NEXT:    ldr r5, [sp, #48]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #8]
-; THUMB1-NEXT:    ldr r5, [sp, #100]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #108]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #7]
-; THUMB1-NEXT:    ldr r5, [sp, #96]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #32]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #104]
+; THUMB1-NEXT:    ldr r5, [sp, #40]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #6]
-; THUMB1-NEXT:    ldr r5, [sp, #92]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #28]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #100]
+; THUMB1-NEXT:    ldr r5, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #5]
-; THUMB1-NEXT:    ldr r5, [sp, #88]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #24]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #96]
+; THUMB1-NEXT:    ldr r5, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #4]
-; THUMB1-NEXT:    ldr r5, [sp, #84]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #20]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #92]
+; THUMB1-NEXT:    ldr r5, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #3]
-; THUMB1-NEXT:    ldr r5, [sp, #80]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #16]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #88]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strb r6, [r0, #2]
-; THUMB1-NEXT:    ldr r5, [sp, #76]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    orrs r3, r5
-; THUMB1-NEXT:    strb r3, [r0, #1]
-; THUMB1-NEXT:    ldr r3, [sp, #72]
-; THUMB1-NEXT:    ands r3, r1
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    strb r2, [r0]
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    ldr r1, [sp, #84]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strb r5, [r0, #1]
+; THUMB1-NEXT:    ldr r1, [sp, #80]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strb r3, [r0]
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v16i8:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r4, r5, r7, lr}
 ; THUMB2-NEXT:    push {r4, r5, r7, lr}
 ; THUMB2-NEXT:    and lr, r1, #1
-; THUMB2-NEXT:    ldrb.w r1, [sp, #132]
-; THUMB2-NEXT:    sub.w r12, lr, #1
-; THUMB2-NEXT:    ldrb.w r4, [sp, #68]
-; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    ldrb.w r12, [sp, #132]
+; THUMB2-NEXT:    ldrb.w r1, [sp, #68]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r4, r1, r5
+; THUMB2-NEXT:    bic.w r5, r12, r5
+; THUMB2-NEXT:    orrs r4, r5
+; THUMB2-NEXT:    strb r4, [r0, #15]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #128]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #64]
 ; THUMB2-NEXT:    rsb.w r1, lr, #0
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r3, r1
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #64]
-; THUMB2-NEXT:    strb r5, [r0, #15]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #128]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #60]
-; THUMB2-NEXT:    strb r5, [r0, #14]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #124]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #56]
-; THUMB2-NEXT:    strb r5, [r0, #13]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #120]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #52]
-; THUMB2-NEXT:    strb r5, [r0, #12]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #116]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #48]
-; THUMB2-NEXT:    strb r5, [r0, #11]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #112]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #44]
-; THUMB2-NEXT:    strb r5, [r0, #10]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #108]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #40]
-; THUMB2-NEXT:    strb r5, [r0, #9]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #104]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #36]
-; THUMB2-NEXT:    strb r5, [r0, #8]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #100]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #32]
-; THUMB2-NEXT:    strb r5, [r0, #7]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #96]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #28]
-; THUMB2-NEXT:    strb r5, [r0, #6]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #92]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #24]
-; THUMB2-NEXT:    strb r5, [r0, #5]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #88]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #20]
-; THUMB2-NEXT:    strb r5, [r0, #4]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #84]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrb.w r4, [sp, #16]
-; THUMB2-NEXT:    strb r5, [r0, #3]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #80]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r1, r2
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #14]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #124]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #60]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #13]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #120]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #56]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #12]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #116]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #52]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #11]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #112]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #48]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #10]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #108]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #44]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #9]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #104]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #40]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #8]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #100]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #36]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #7]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #96]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #32]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #6]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #92]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #28]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #5]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #88]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #24]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #4]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #84]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #20]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #3]
+; THUMB2-NEXT:    ldrb.w r12, [sp, #80]
+; THUMB2-NEXT:    ldrb.w r5, [sp, #16]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strb r4, [r0, #2]
+; THUMB2-NEXT:    ldrb.w r1, [sp, #76]
+; THUMB2-NEXT:    rsb.w r4, lr, #0
+; THUMB2-NEXT:    and.w r5, r3, r4
+; THUMB2-NEXT:    bic.w r4, r1, r4
 ; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    strb r5, [r0, #2]
-; THUMB2-NEXT:    ldrb.w r5, [sp, #76]
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    strb r5, [r0, #1]
+; THUMB2-NEXT:    ldrb.w r1, [sp, #72]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r3, r2, r5
+; THUMB2-NEXT:    bic.w r5, r1, r5
 ; THUMB2-NEXT:    orrs r3, r5
-; THUMB2-NEXT:    strb r3, [r0, #1]
-; THUMB2-NEXT:    ldrb.w r3, [sp, #72]
-; THUMB2-NEXT:    and.w r3, r3, r12
-; THUMB2-NEXT:    orrs r1, r3
-; THUMB2-NEXT:    strb r1, [r0]
+; THUMB2-NEXT:    strb r3, [r0]
 ; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <16 x i8> @llvm.ct.select.v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b)
@@ -831,11 +678,7 @@ define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
 ; CT-NEXT:    vmov d16, r2, r3
 ; CT-NEXT:    vld1.64 {d18, d19}, [r1]
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 q11, r1
-; CT-NEXT:    vand q10, q8, q11
-; CT-NEXT:    vbic q11, q9, q11
-; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d20
 ; CT-NEXT:    vmov r2, r3, d21
 ; CT-NEXT:    bx lr
@@ -844,165 +687,138 @@ define <8 x i16> @ct_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r4, r5, r11, lr}
 ; DEFAULT-NEXT:    and lr, r1, #1
-; DEFAULT-NEXT:    ldrh r1, [sp, #68]
-; DEFAULT-NEXT:    sub r12, lr, #1
-; DEFAULT-NEXT:    ldrh r4, [sp, #36]
-; DEFAULT-NEXT:    and r5, r1, r12
-; DEFAULT-NEXT:    rsb r1, lr, #0
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r3, r3, r1
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrh r4, [sp, #32]
-; DEFAULT-NEXT:    strh r5, [r0, #14]
-; DEFAULT-NEXT:    ldrh r5, [sp, #64]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrh r4, [sp, #28]
-; DEFAULT-NEXT:    strh r5, [r0, #12]
-; DEFAULT-NEXT:    ldrh r5, [sp, #60]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrh r4, [sp, #24]
-; DEFAULT-NEXT:    strh r5, [r0, #10]
-; DEFAULT-NEXT:    ldrh r5, [sp, #56]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrh r4, [sp, #20]
-; DEFAULT-NEXT:    strh r5, [r0, #8]
-; DEFAULT-NEXT:    ldrh r5, [sp, #52]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    ldrh r4, [sp, #16]
-; DEFAULT-NEXT:    strh r5, [r0, #6]
-; DEFAULT-NEXT:    ldrh r5, [sp, #48]
-; DEFAULT-NEXT:    and r4, r4, r1
-; DEFAULT-NEXT:    and r1, r2, r1
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r5, r4, r5
-; DEFAULT-NEXT:    strh r5, [r0, #4]
-; DEFAULT-NEXT:    ldrh r5, [sp, #44]
-; DEFAULT-NEXT:    and r5, r5, r12
-; DEFAULT-NEXT:    orr r3, r3, r5
-; DEFAULT-NEXT:    strh r3, [r0, #2]
-; DEFAULT-NEXT:    ldrh r3, [sp, #40]
-; DEFAULT-NEXT:    and r3, r3, r12
-; DEFAULT-NEXT:    orr r1, r1, r3
-; DEFAULT-NEXT:    strh r1, [r0]
+; DEFAULT-NEXT:    ldrh r12, [sp, #68]
+; DEFAULT-NEXT:    ldrh r1, [sp, #36]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strh r4, [r0, #14]
+; DEFAULT-NEXT:    ldrh r12, [sp, #64]
+; DEFAULT-NEXT:    ldrh r5, [sp, #32]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strh r4, [r0, #12]
+; DEFAULT-NEXT:    ldrh r12, [sp, #60]
+; DEFAULT-NEXT:    ldrh r5, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strh r4, [r0, #10]
+; DEFAULT-NEXT:    ldrh r12, [sp, #56]
+; DEFAULT-NEXT:    ldrh r5, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strh r4, [r0, #8]
+; DEFAULT-NEXT:    ldrh r12, [sp, #52]
+; DEFAULT-NEXT:    ldrh r5, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strh r4, [r0, #6]
+; DEFAULT-NEXT:    ldrh r12, [sp, #48]
+; DEFAULT-NEXT:    ldrh r5, [sp, #16]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strh r4, [r0, #4]
+; DEFAULT-NEXT:    ldrh r1, [sp, #44]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strh r5, [r0, #2]
+; DEFAULT-NEXT:    ldrh r1, [sp, #40]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    strh r3, [r0]
 ; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v8i16:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
 ; THUMB1-NEXT:    movs r4, #1
 ; THUMB1-NEXT:    ands r4, r1
-; THUMB1-NEXT:    subs r1, r4, #1
-; THUMB1-NEXT:    ldr r5, [sp, #68]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    rsbs r4, r4, #0
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #76]
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #14]
-; THUMB1-NEXT:    ldr r5, [sp, #64]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #32]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #72]
+; THUMB1-NEXT:    ldr r5, [sp, #40]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #12]
-; THUMB1-NEXT:    ldr r5, [sp, #60]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #28]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #68]
+; THUMB1-NEXT:    ldr r5, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #10]
-; THUMB1-NEXT:    ldr r5, [sp, #56]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #24]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #64]
+; THUMB1-NEXT:    ldr r5, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #8]
-; THUMB1-NEXT:    ldr r5, [sp, #52]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #20]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #60]
+; THUMB1-NEXT:    ldr r5, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #6]
-; THUMB1-NEXT:    ldr r5, [sp, #48]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ldr r6, [sp, #16]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    orrs r6, r5
+; THUMB1-NEXT:    ldr r1, [sp, #56]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    strh r6, [r0, #4]
-; THUMB1-NEXT:    ldr r5, [sp, #44]
-; THUMB1-NEXT:    ands r5, r1
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    orrs r3, r5
-; THUMB1-NEXT:    strh r3, [r0, #2]
-; THUMB1-NEXT:    ldr r3, [sp, #40]
-; THUMB1-NEXT:    ands r3, r1
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    strh r2, [r0]
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    ldr r1, [sp, #52]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strh r5, [r0, #2]
+; THUMB1-NEXT:    ldr r1, [sp, #48]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    strh r3, [r0]
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v8i16:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r4, r5, r7, lr}
 ; THUMB2-NEXT:    push {r4, r5, r7, lr}
 ; THUMB2-NEXT:    and lr, r1, #1
-; THUMB2-NEXT:    ldrh.w r1, [sp, #68]
-; THUMB2-NEXT:    sub.w r12, lr, #1
-; THUMB2-NEXT:    ldrh.w r4, [sp, #36]
-; THUMB2-NEXT:    and.w r5, r1, r12
+; THUMB2-NEXT:    ldrh.w r12, [sp, #68]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #36]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r4, r1, r5
+; THUMB2-NEXT:    bic.w r5, r12, r5
+; THUMB2-NEXT:    orrs r4, r5
+; THUMB2-NEXT:    strh r4, [r0, #14]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #64]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #32]
 ; THUMB2-NEXT:    rsb.w r1, lr, #0
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r3, r1
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #32]
-; THUMB2-NEXT:    strh r5, [r0, #14]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #64]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #28]
-; THUMB2-NEXT:    strh r5, [r0, #12]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #60]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #24]
-; THUMB2-NEXT:    strh r5, [r0, #10]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #56]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #20]
-; THUMB2-NEXT:    strh r5, [r0, #8]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #52]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    and.w r5, r5, r12
-; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    ldrh.w r4, [sp, #16]
-; THUMB2-NEXT:    strh r5, [r0, #6]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #48]
-; THUMB2-NEXT:    ands r4, r1
-; THUMB2-NEXT:    ands r1, r2
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #12]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #60]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #28]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #10]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #56]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #24]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #8]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #52]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #20]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #6]
+; THUMB2-NEXT:    ldrh.w r12, [sp, #48]
+; THUMB2-NEXT:    ldrh.w r5, [sp, #16]
+; THUMB2-NEXT:    rsb.w r1, lr, #0
+; THUMB2-NEXT:    and.w r4, r5, r1
+; THUMB2-NEXT:    bic.w r1, r12, r1
+; THUMB2-NEXT:    orrs r4, r1
+; THUMB2-NEXT:    strh r4, [r0, #4]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #44]
+; THUMB2-NEXT:    rsb.w r4, lr, #0
+; THUMB2-NEXT:    and.w r5, r3, r4
+; THUMB2-NEXT:    bic.w r4, r1, r4
 ; THUMB2-NEXT:    orrs r5, r4
-; THUMB2-NEXT:    strh r5, [r0, #4]
-; THUMB2-NEXT:    ldrh.w r5, [sp, #44]
-; THUMB2-NEXT:    and.w r5, r5, r12
+; THUMB2-NEXT:    strh r5, [r0, #2]
+; THUMB2-NEXT:    ldrh.w r1, [sp, #40]
+; THUMB2-NEXT:    rsb.w r5, lr, #0
+; THUMB2-NEXT:    and.w r3, r2, r5
+; THUMB2-NEXT:    bic.w r5, r1, r5
 ; THUMB2-NEXT:    orrs r3, r5
-; THUMB2-NEXT:    strh r3, [r0, #2]
-; THUMB2-NEXT:    ldrh.w r3, [sp, #40]
-; THUMB2-NEXT:    and.w r3, r3, r12
-; THUMB2-NEXT:    orrs r1, r3
-; THUMB2-NEXT:    strh r1, [r0]
+; THUMB2-NEXT:    strh r3, [r0]
 ; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <8 x i16> @llvm.ct.select.v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b)
@@ -1017,95 +833,76 @@ define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
 ; CT-NEXT:    vmov d16, r2, r3
 ; CT-NEXT:    vld1.64 {d18, d19}, [r1]
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 q11, r1
-; CT-NEXT:    vand q10, q8, q11
-; CT-NEXT:    vbic q11, q9, q11
-; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d20
 ; CT-NEXT:    vmov r2, r3, d21
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v4i32:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    ldr r1, [sp, #16]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    rsb lr, r0, #0
-; DEFAULT-NEXT:    and r0, r2, lr
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r0, r0, r1
-; DEFAULT-NEXT:    ldr r1, [sp, #20]
-; DEFAULT-NEXT:    and r2, r3, lr
-; DEFAULT-NEXT:    ldr r3, [sp, #8]
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r1, r2, r1
-; DEFAULT-NEXT:    ldr r2, [sp, #24]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r2, r3, r2
-; DEFAULT-NEXT:    ldr r3, [sp, #28]
-; DEFAULT-NEXT:    and r12, r3, r12
-; DEFAULT-NEXT:    ldr r3, [sp, #12]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    orr r3, r3, r12
-; DEFAULT-NEXT:    pop {r11, pc}
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr r2, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr r3, [sp, #16]
+; DEFAULT-NEXT:    ldr lr, [sp, #32]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr lr, [sp, #36]
+; DEFAULT-NEXT:    ldr r4, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v4i32:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #24]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r5, r1, #0
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r1, [sp, #28]
-; THUMB1-NEXT:    ands r1, r4
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    ldr r3, [sp, #32]
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    ldr r2, [sp, #16]
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    ldr r3, [sp, #20]
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r3, r6
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v4i32:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    .save {r7, lr}
-; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    ldr r1, [sp, #16]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
 ; THUMB2-NEXT:    and.w r0, r2, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r0, r1
-; THUMB2-NEXT:    ldr r1, [sp, #20]
-; THUMB2-NEXT:    and.w r2, r3, lr
-; THUMB2-NEXT:    ldr r3, [sp, #8]
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
-; THUMB2-NEXT:    ldr r2, [sp, #24]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r2, r3
-; THUMB2-NEXT:    ldr r3, [sp, #28]
-; THUMB2-NEXT:    and.w r12, r12, r3
-; THUMB2-NEXT:    ldr r3, [sp, #12]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    orr.w r3, r3, r12
-; THUMB2-NEXT:    pop {r7, pc}
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    ldr r2, [sp, #28]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldr r3, [sp, #16]
+; THUMB2-NEXT:    ldr.w lr, [sp, #32]
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r2, r3, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    orrs r2, r4
+; THUMB2-NEXT:    ldr.w lr, [sp, #36]
+; THUMB2-NEXT:    ldr r4, [sp, #20]
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
   ret <4 x i32> %sel
@@ -1119,95 +916,76 @@ define <2 x i64> @ct_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
 ; CT-NEXT:    vmov d16, r2, r3
 ; CT-NEXT:    vld1.64 {d18, d19}, [r1]
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 q11, r1
-; CT-NEXT:    vand q10, q8, q11
-; CT-NEXT:    vbic q11, q9, q11
-; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d20
 ; CT-NEXT:    vmov r2, r3, d21
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v2i64:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    ldr r1, [sp, #16]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    rsb lr, r0, #0
-; DEFAULT-NEXT:    and r0, r2, lr
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r0, r0, r1
-; DEFAULT-NEXT:    ldr r1, [sp, #20]
-; DEFAULT-NEXT:    and r2, r3, lr
-; DEFAULT-NEXT:    ldr r3, [sp, #8]
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r1, r2, r1
-; DEFAULT-NEXT:    ldr r2, [sp, #24]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r2, r3, r2
-; DEFAULT-NEXT:    ldr r3, [sp, #28]
-; DEFAULT-NEXT:    and r12, r3, r12
-; DEFAULT-NEXT:    ldr r3, [sp, #12]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    orr r3, r3, r12
-; DEFAULT-NEXT:    pop {r11, pc}
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr r2, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr r3, [sp, #16]
+; DEFAULT-NEXT:    ldr lr, [sp, #32]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr lr, [sp, #36]
+; DEFAULT-NEXT:    ldr r4, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v2i64:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #24]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r5, r1, #0
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r1, [sp, #28]
-; THUMB1-NEXT:    ands r1, r4
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    ldr r3, [sp, #32]
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    ldr r2, [sp, #16]
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    ldr r3, [sp, #20]
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r3, r6
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v2i64:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    .save {r7, lr}
-; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    ldr r1, [sp, #16]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
 ; THUMB2-NEXT:    and.w r0, r2, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r0, r1
-; THUMB2-NEXT:    ldr r1, [sp, #20]
-; THUMB2-NEXT:    and.w r2, r3, lr
-; THUMB2-NEXT:    ldr r3, [sp, #8]
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
-; THUMB2-NEXT:    ldr r2, [sp, #24]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r2, r3
-; THUMB2-NEXT:    ldr r3, [sp, #28]
-; THUMB2-NEXT:    and.w r12, r12, r3
-; THUMB2-NEXT:    ldr r3, [sp, #12]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    orr.w r3, r3, r12
-; THUMB2-NEXT:    pop {r7, pc}
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    ldr r2, [sp, #28]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldr r3, [sp, #16]
+; THUMB2-NEXT:    ldr.w lr, [sp, #32]
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r2, r3, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    orrs r2, r4
+; THUMB2-NEXT:    ldr.w lr, [sp, #36]
+; THUMB2-NEXT:    ldr r4, [sp, #20]
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b)
   ret <2 x i64> %sel
@@ -1221,95 +999,76 @@ define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
 ; CT-NEXT:    vmov d16, r2, r3
 ; CT-NEXT:    vld1.64 {d18, d19}, [r1]
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 q11, r1
-; CT-NEXT:    vand q10, q8, q11
-; CT-NEXT:    vbic q11, q9, q11
-; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d20
 ; CT-NEXT:    vmov r2, r3, d21
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v4f32:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    ldr r1, [sp, #16]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    rsb lr, r0, #0
-; DEFAULT-NEXT:    and r0, r2, lr
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r0, r0, r1
-; DEFAULT-NEXT:    ldr r1, [sp, #20]
-; DEFAULT-NEXT:    and r2, r3, lr
-; DEFAULT-NEXT:    ldr r3, [sp, #8]
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r1, r2, r1
-; DEFAULT-NEXT:    ldr r2, [sp, #24]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r2, r3, r2
-; DEFAULT-NEXT:    ldr r3, [sp, #28]
-; DEFAULT-NEXT:    and r12, r3, r12
-; DEFAULT-NEXT:    ldr r3, [sp, #12]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    orr r3, r3, r12
-; DEFAULT-NEXT:    pop {r11, pc}
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr r2, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr r3, [sp, #16]
+; DEFAULT-NEXT:    ldr lr, [sp, #32]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr lr, [sp, #36]
+; DEFAULT-NEXT:    ldr r4, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v4f32:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #24]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r5, r1, #0
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r1, [sp, #28]
-; THUMB1-NEXT:    ands r1, r4
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    ldr r3, [sp, #32]
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    ldr r2, [sp, #16]
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    ldr r3, [sp, #20]
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r3, r6
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v4f32:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    .save {r7, lr}
-; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    ldr r1, [sp, #16]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
 ; THUMB2-NEXT:    and.w r0, r2, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r0, r1
-; THUMB2-NEXT:    ldr r1, [sp, #20]
-; THUMB2-NEXT:    and.w r2, r3, lr
-; THUMB2-NEXT:    ldr r3, [sp, #8]
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
-; THUMB2-NEXT:    ldr r2, [sp, #24]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r2, r3
-; THUMB2-NEXT:    ldr r3, [sp, #28]
-; THUMB2-NEXT:    and.w r12, r12, r3
-; THUMB2-NEXT:    ldr r3, [sp, #12]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    orr.w r3, r3, r12
-; THUMB2-NEXT:    pop {r7, pc}
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    ldr r2, [sp, #28]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldr r3, [sp, #16]
+; THUMB2-NEXT:    ldr.w lr, [sp, #32]
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r2, r3, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    orrs r2, r4
+; THUMB2-NEXT:    ldr.w lr, [sp, #36]
+; THUMB2-NEXT:    ldr r4, [sp, #20]
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
   ret <4 x float> %sel
@@ -1323,95 +1082,76 @@ define <2 x double> @ct_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
 ; CT-NEXT:    vmov d16, r2, r3
 ; CT-NEXT:    vld1.64 {d18, d19}, [r1]
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 q11, r1
-; CT-NEXT:    vand q10, q8, q11
-; CT-NEXT:    vbic q11, q9, q11
-; CT-NEXT:    vorr q10, q10, q11
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d20
 ; CT-NEXT:    vmov r2, r3, d21
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v2f64:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    ldr r1, [sp, #16]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    rsb lr, r0, #0
-; DEFAULT-NEXT:    and r0, r2, lr
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r0, r0, r1
-; DEFAULT-NEXT:    ldr r1, [sp, #20]
-; DEFAULT-NEXT:    and r2, r3, lr
-; DEFAULT-NEXT:    ldr r3, [sp, #8]
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r1, r2, r1
-; DEFAULT-NEXT:    ldr r2, [sp, #24]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r2, r3, r2
-; DEFAULT-NEXT:    ldr r3, [sp, #28]
-; DEFAULT-NEXT:    and r12, r3, r12
-; DEFAULT-NEXT:    ldr r3, [sp, #12]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    orr r3, r3, r12
-; DEFAULT-NEXT:    pop {r11, pc}
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    ldr r1, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr r2, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr r3, [sp, #16]
+; DEFAULT-NEXT:    ldr lr, [sp, #32]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldr lr, [sp, #36]
+; DEFAULT-NEXT:    ldr r4, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v2f64:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #24]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r5, r1, #0
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r1, [sp, #28]
-; THUMB1-NEXT:    ands r1, r4
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    ldr r3, [sp, #32]
-; THUMB1-NEXT:    ands r3, r4
-; THUMB1-NEXT:    ldr r2, [sp, #16]
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    ldr r6, [sp, #36]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    ldr r3, [sp, #20]
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r3, r6
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    .pad #4
+; THUMB1-NEXT:    sub sp, #4
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #36]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #40]
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #44]
+; THUMB1-NEXT:    ldr r6, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    add sp, #4
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v2f64:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    .save {r7, lr}
-; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    ldr r1, [sp, #16]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    rsb.w lr, r0, #0
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldr r1, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
 ; THUMB2-NEXT:    and.w r0, r2, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r0, r1
-; THUMB2-NEXT:    ldr r1, [sp, #20]
-; THUMB2-NEXT:    and.w r2, r3, lr
-; THUMB2-NEXT:    ldr r3, [sp, #8]
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
-; THUMB2-NEXT:    ldr r2, [sp, #24]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r2, r3
-; THUMB2-NEXT:    ldr r3, [sp, #28]
-; THUMB2-NEXT:    and.w r12, r12, r3
-; THUMB2-NEXT:    ldr r3, [sp, #12]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    orr.w r3, r3, r12
-; THUMB2-NEXT:    pop {r7, pc}
+; THUMB2-NEXT:    bic.w lr, r1, lr
+; THUMB2-NEXT:    orr.w r0, r0, lr
+; THUMB2-NEXT:    ldr r2, [sp, #28]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldr r3, [sp, #16]
+; THUMB2-NEXT:    ldr.w lr, [sp, #32]
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r2, r3, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    orrs r2, r4
+; THUMB2-NEXT:    ldr.w lr, [sp, #36]
+; THUMB2-NEXT:    ldr r4, [sp, #20]
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b)
   ret <2 x double> %sel
@@ -1423,22 +1163,14 @@ entry:
 define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) {
 ; CT-LABEL: ct_v1i8:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    sub r3, r0, #1
-; CT-NEXT:    rsb r0, r0, #0
-; CT-NEXT:    and r2, r2, r3
-; CT-NEXT:    and r0, r1, r0
-; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v1i8:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    sub r3, r0, #1
-; DEFAULT-NEXT:    rsb r0, r0, #0
-; DEFAULT-NEXT:    and r2, r2, r3
-; DEFAULT-NEXT:    and r0, r1, r0
-; DEFAULT-NEXT:    orr r0, r0, r2
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_v1i8:
@@ -1447,21 +1179,16 @@ define <1 x i8> @ct_v1i8(i1 %cond, <1 x i8> %a, <1 x i8> %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    subs r4, r3, #1
-; THUMB1-NEXT:    ands r4, r2
-; THUMB1-NEXT:    rsbs r0, r3, #0
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_v1i8:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    subs r3, r0, #1
-; THUMB2-NEXT:    rsbs r0, r0, #0
-; THUMB2-NEXT:    ands r2, r3
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
 ; THUMB2-NEXT:    bx lr
 entry:
   %sel = call <1 x i8> @llvm.ct.select.i8(i1 %cond, <1 x i8> %a, <1 x i8> %b)
@@ -1471,63 +1198,47 @@ entry:
 define <2 x i8> @ct_v2i8(i1 %cond, <2 x i8> %a, <2 x i8> %b) {
 ; CT-LABEL: ct_v2i8:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    vdup.32 d16, r0
-; CT-NEXT:    vmov d18, r2, r3
-; CT-NEXT:    vldr d17, [sp]
-; CT-NEXT:    vshl.i32 d16, d16, #31
-; CT-NEXT:    vshr.s32 d16, d16, #31
-; CT-NEXT:    vbsl d16, d18, d17
-; CT-NEXT:    vmov r0, r1, d16
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v2i8:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    and lr, r3, r12
-; DEFAULT-NEXT:    rsb r3, r0, #0
-; DEFAULT-NEXT:    and r0, r1, r3
-; DEFAULT-NEXT:    ldrb r1, [sp, #8]
-; DEFAULT-NEXT:    and r2, r2, r3
-; DEFAULT-NEXT:    orr r0, r0, lr
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldrb r3, [sp, #8]
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    pop {r11, pc}
 ;
 ; THUMB1-LABEL: ct_v2i8:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, lr}
-; THUMB1-NEXT:    push {r4, lr}
-; THUMB1-NEXT:    mov r4, r1
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r0, r1, #1
-; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    rsbs r1, r1, #0
-; THUMB1-NEXT:    ands r4, r1
-; THUMB1-NEXT:    orrs r4, r3
-; THUMB1-NEXT:    ldr r3, [sp, #8]
-; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    ands r1, r2
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    mov r0, r4
-; THUMB1-NEXT:    pop {r4, pc}
+; THUMB1-NEXT:    .save {r4, r5, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r7, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #16]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v2i8:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r7, lr}
 ; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    and.w lr, r3, r12
-; THUMB2-NEXT:    rsbs r3, r0, #0
-; THUMB2-NEXT:    and.w r0, r1, r3
-; THUMB2-NEXT:    ldrb.w r1, [sp, #8]
-; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r0, r1, lr
+; THUMB2-NEXT:    bic.w lr, r3, lr
 ; THUMB2-NEXT:    orr.w r0, r0, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldrb.w r3, [sp, #8]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r2, lr
+; THUMB2-NEXT:    bic.w lr, r3, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
 ; THUMB2-NEXT:    pop {r7, pc}
 entry:
   %sel = call <2 x i8> @llvm.ct.select.i16(i1 %cond, <2 x i8> %a, <2 x i8> %b)
@@ -1537,92 +1248,72 @@ entry:
 define <4 x i8> @ct_v4i8(i1 %cond, <4 x i8> %a, <4 x i8> %b) {
 ; CT-LABEL: ct_v4i8:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    vdup.16 d16, r0
-; CT-NEXT:    vmov d18, r2, r3
-; CT-NEXT:    vldr d17, [sp]
-; CT-NEXT:    vshl.i16 d16, d16, #15
-; CT-NEXT:    vshr.s16 d16, d16, #15
-; CT-NEXT:    vbsl d16, d18, d17
-; CT-NEXT:    vmov r0, r1, d16
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v4i8:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and lr, r0, #1
-; DEFAULT-NEXT:    ldrb r0, [sp, #12]
-; DEFAULT-NEXT:    sub r12, lr, #1
-; DEFAULT-NEXT:    rsb lr, lr, #0
-; DEFAULT-NEXT:    and r0, r0, r12
-; DEFAULT-NEXT:    and r1, r1, lr
-; DEFAULT-NEXT:    orr r0, r1, r0
-; DEFAULT-NEXT:    ldrb r1, [sp, #16]
-; DEFAULT-NEXT:    and r2, r2, lr
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r1, r2, r1
-; DEFAULT-NEXT:    ldrb r2, [sp, #20]
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r2, r3, r2
-; DEFAULT-NEXT:    ldrb r3, [sp, #24]
-; DEFAULT-NEXT:    and r12, r3, r12
-; DEFAULT-NEXT:    ldrb r3, [sp, #8]
-; DEFAULT-NEXT:    and r3, r3, lr
-; DEFAULT-NEXT:    orr r3, r3, r12
-; DEFAULT-NEXT:    pop {r11, pc}
+; DEFAULT-NEXT:    push {r4, r5, r11, lr}
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    ldrb lr, [sp, #20]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldrb r4, [sp, #24]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldrb r4, [sp, #28]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldrb lr, [sp, #32]
+; DEFAULT-NEXT:    ldrb r4, [sp, #16]
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    pop {r4, r5, r11, pc}
 ;
 ; THUMB1-LABEL: ct_v4i8:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, r5, r6, lr}
-; THUMB1-NEXT:    push {r4, r5, r6, lr}
-; THUMB1-NEXT:    movs r5, #1
-; THUMB1-NEXT:    ands r5, r0
-; THUMB1-NEXT:    subs r4, r5, #1
-; THUMB1-NEXT:    ldr r0, [sp, #20]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r5, r5, #0
-; THUMB1-NEXT:    ands r1, r5
-; THUMB1-NEXT:    orrs r0, r1
-; THUMB1-NEXT:    ldr r1, [sp, #24]
-; THUMB1-NEXT:    ands r1, r4
-; THUMB1-NEXT:    ands r2, r5
-; THUMB1-NEXT:    orrs r1, r2
-; THUMB1-NEXT:    ldr r2, [sp, #28]
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r2, r3
-; THUMB1-NEXT:    ldr r6, [sp, #32]
-; THUMB1-NEXT:    ands r6, r4
-; THUMB1-NEXT:    ldr r3, [sp, #16]
-; THUMB1-NEXT:    ands r3, r5
-; THUMB1-NEXT:    orrs r3, r6
-; THUMB1-NEXT:    pop {r4, r5, r6, pc}
+; THUMB1-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r5, [sp, #24]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #28]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #32]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r5, [sp, #36]
+; THUMB1-NEXT:    ldr r6, [sp, #20]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v4i8:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    .save {r7, lr}
-; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and lr, r0, #1
-; THUMB2-NEXT:    ldrb.w r0, [sp, #12]
-; THUMB2-NEXT:    sub.w r12, lr, #1
-; THUMB2-NEXT:    rsb.w lr, lr, #0
-; THUMB2-NEXT:    and.w r0, r0, r12
-; THUMB2-NEXT:    and.w r1, r1, lr
-; THUMB2-NEXT:    orrs r0, r1
-; THUMB2-NEXT:    ldrb.w r1, [sp, #16]
-; THUMB2-NEXT:    and.w r2, r2, lr
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
-; THUMB2-NEXT:    ldrb.w r2, [sp, #20]
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r2, r3
-; THUMB2-NEXT:    ldrb.w r3, [sp, #24]
-; THUMB2-NEXT:    and.w r12, r12, r3
-; THUMB2-NEXT:    ldrb.w r3, [sp, #8]
-; THUMB2-NEXT:    and.w r3, r3, lr
-; THUMB2-NEXT:    orr.w r3, r3, r12
-; THUMB2-NEXT:    pop {r7, pc}
+; THUMB2-NEXT:    .save {r4, r5, r7, lr}
+; THUMB2-NEXT:    push {r4, r5, r7, lr}
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    ldrb.w lr, [sp, #20]
+; THUMB2-NEXT:    rsb.w r4, r12, #0
+; THUMB2-NEXT:    and.w r0, r1, r4
+; THUMB2-NEXT:    bic.w r4, lr, r4
+; THUMB2-NEXT:    orrs r0, r4
+; THUMB2-NEXT:    ldrb.w r4, [sp, #24]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r2, lr
+; THUMB2-NEXT:    bic.w lr, r4, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
+; THUMB2-NEXT:    ldrb.w r4, [sp, #28]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r2, r3, lr
+; THUMB2-NEXT:    bic.w lr, r4, lr
+; THUMB2-NEXT:    orr.w r2, r2, lr
+; THUMB2-NEXT:    ldrb.w lr, [sp, #32]
+; THUMB2-NEXT:    ldrb.w r4, [sp, #16]
+; THUMB2-NEXT:    rsb.w r5, r12, #0
+; THUMB2-NEXT:    and.w r3, r4, r5
+; THUMB2-NEXT:    bic.w r5, lr, r5
+; THUMB2-NEXT:    orrs r3, r5
+; THUMB2-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %sel = call <4 x i8> @llvm.ct.select.i32(i1 %cond, <4 x i8> %a, <4 x i8> %b)
   ret <4 x i8> %sel
@@ -1631,22 +1322,14 @@ entry:
 define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) {
 ; CT-LABEL: ct_v1i16:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    sub r3, r0, #1
-; CT-NEXT:    rsb r0, r0, #0
-; CT-NEXT:    and r2, r2, r3
-; CT-NEXT:    and r0, r1, r0
-; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v1i16:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    sub r3, r0, #1
-; DEFAULT-NEXT:    rsb r0, r0, #0
-; DEFAULT-NEXT:    and r2, r2, r3
-; DEFAULT-NEXT:    and r0, r1, r0
-; DEFAULT-NEXT:    orr r0, r0, r2
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_v1i16:
@@ -1655,21 +1338,16 @@ define <1 x i16> @ct_v1i16(i1 %cond, <1 x i16> %a, <1 x i16> %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    subs r4, r3, #1
-; THUMB1-NEXT:    ands r4, r2
-; THUMB1-NEXT:    rsbs r0, r3, #0
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_v1i16:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    subs r3, r0, #1
-; THUMB2-NEXT:    rsbs r0, r0, #0
-; THUMB2-NEXT:    ands r2, r3
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
 ; THUMB2-NEXT:    bx lr
 entry:
   %sel = call <1 x i16> @llvm.ct.select.i16(i1 %cond, <1 x i16> %a, <1 x i16> %b)
@@ -1679,63 +1357,47 @@ entry:
 define <2 x i16> @ct_v2i16(i1 %cond, <2 x i16> %a, <2 x i16> %b) {
 ; CT-LABEL: ct_v2i16:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    vdup.32 d16, r0
-; CT-NEXT:    vmov d18, r2, r3
-; CT-NEXT:    vldr d17, [sp]
-; CT-NEXT:    vshl.i32 d16, d16, #31
-; CT-NEXT:    vshr.s32 d16, d16, #31
-; CT-NEXT:    vbsl d16, d18, d17
-; CT-NEXT:    vmov r0, r1, d16
+; CT-NEXT:    vldr d16, [sp]
+; CT-NEXT:    vmov d17, r2, r3
+; CT-NEXT:    and r0, r0, #1
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v2i16:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    and lr, r3, r12
-; DEFAULT-NEXT:    rsb r3, r0, #0
-; DEFAULT-NEXT:    and r0, r1, r3
-; DEFAULT-NEXT:    ldrh r1, [sp, #8]
-; DEFAULT-NEXT:    and r2, r2, r3
-; DEFAULT-NEXT:    orr r0, r0, lr
-; DEFAULT-NEXT:    and r1, r1, r12
-; DEFAULT-NEXT:    orr r1, r2, r1
+; DEFAULT-NEXT:    and r12, r0, #1
+; DEFAULT-NEXT:    BUNDLE
+; DEFAULT-NEXT:    ldrh r3, [sp, #8]
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    pop {r11, pc}
 ;
 ; THUMB1-LABEL: ct_v2i16:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, lr}
-; THUMB1-NEXT:    push {r4, lr}
-; THUMB1-NEXT:    mov r4, r1
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r0, r1, #1
-; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    rsbs r1, r1, #0
-; THUMB1-NEXT:    ands r4, r1
-; THUMB1-NEXT:    orrs r4, r3
-; THUMB1-NEXT:    ldr r3, [sp, #8]
-; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    ands r1, r2
-; THUMB1-NEXT:    orrs r1, r3
-; THUMB1-NEXT:    mov r0, r4
-; THUMB1-NEXT:    pop {r4, pc}
+; THUMB1-NEXT:    .save {r4, r5, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r7, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r3, [sp, #16]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; THUMB2-LABEL: ct_v2i16:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r7, lr}
 ; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    and.w lr, r3, r12
-; THUMB2-NEXT:    rsbs r3, r0, #0
-; THUMB2-NEXT:    and.w r0, r1, r3
-; THUMB2-NEXT:    ldrh.w r1, [sp, #8]
-; THUMB2-NEXT:    ands r2, r3
+; THUMB2-NEXT:    and r12, r0, #1
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r0, r1, lr
+; THUMB2-NEXT:    bic.w lr, r3, lr
 ; THUMB2-NEXT:    orr.w r0, r0, lr
-; THUMB2-NEXT:    and.w r1, r1, r12
-; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldrh.w r3, [sp, #8]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r2, lr
+; THUMB2-NEXT:    bic.w lr, r3, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
 ; THUMB2-NEXT:    pop {r7, pc}
 entry:
   %sel = call <2 x i16> @llvm.ct.select.i32(i1 %cond, <2 x i16> %a, <2 x i16> %b)
@@ -1745,22 +1407,14 @@ entry:
 define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) {
 ; CT-LABEL: ct_v1i32:
 ; CT:       @ %bb.0: @ %entry
-; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    sub r3, r0, #1
-; CT-NEXT:    rsb r0, r0, #0
-; CT-NEXT:    and r2, r2, r3
-; CT-NEXT:    and r0, r1, r0
-; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    and r3, r0, #1
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v1i32:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    sub r3, r0, #1
-; DEFAULT-NEXT:    rsb r0, r0, #0
-; DEFAULT-NEXT:    and r2, r2, r3
-; DEFAULT-NEXT:    and r0, r1, r0
-; DEFAULT-NEXT:    orr r0, r0, r2
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_v1i32:
@@ -1769,21 +1423,16 @@ define <1 x i32> @ct_v1i32(i1 %cond, <1 x i32> %a, <1 x i32> %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    subs r4, r3, #1
-; THUMB1-NEXT:    ands r4, r2
-; THUMB1-NEXT:    rsbs r0, r3, #0
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_v1i32:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    subs r3, r0, #1
-; THUMB2-NEXT:    rsbs r0, r0, #0
-; THUMB2-NEXT:    ands r2, r3
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
 ; THUMB2-NEXT:    bx lr
 entry:
   %sel = call <1 x i32> @llvm.ct.select.i32(i1 %cond, <1 x i32> %a, <1 x i32> %b)
@@ -1794,21 +1443,16 @@ define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) {
 ; CT-LABEL: ct_v1f32:
 ; CT:       @ %bb.0: @ %entry
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    sub r3, r0, #1
-; CT-NEXT:    rsb r0, r0, #0
-; CT-NEXT:    and r2, r2, r3
-; CT-NEXT:    and r0, r1, r0
-; CT-NEXT:    orr r0, r0, r2
+; CT-NEXT:    vmov s0, r2
+; CT-NEXT:    vmov s2, r1
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    vmov r0, s4
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_v1f32:
 ; DEFAULT:       @ %bb.0: @ %entry
-; DEFAULT-NEXT:    and r0, r0, #1
-; DEFAULT-NEXT:    sub r3, r0, #1
-; DEFAULT-NEXT:    rsb r0, r0, #0
-; DEFAULT-NEXT:    and r2, r2, r3
-; DEFAULT-NEXT:    and r0, r1, r0
-; DEFAULT-NEXT:    orr r0, r0, r2
+; DEFAULT-NEXT:    and r3, r0, #1
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_v1f32:
@@ -1817,21 +1461,16 @@ define <1 x float> @ct_v1f32(i1 %cond, <1 x float> %a, <1 x float> %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    subs r4, r3, #1
-; THUMB1-NEXT:    ands r4, r2
-; THUMB1-NEXT:    rsbs r0, r3, #0
-; THUMB1-NEXT:    ands r0, r1
-; THUMB1-NEXT:    orrs r0, r4
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_v1f32:
 ; THUMB2:       @ %bb.0: @ %entry
-; THUMB2-NEXT:    and r0, r0, #1
-; THUMB2-NEXT:    subs r3, r0, #1
-; THUMB2-NEXT:    rsbs r0, r0, #0
-; THUMB2-NEXT:    ands r2, r3
-; THUMB2-NEXT:    ands r0, r1
-; THUMB2-NEXT:    orrs r0, r2
+; THUMB2-NEXT:    and r3, r0, #1
+; THUMB2-NEXT:    rsb.w r12, r3, #0
+; THUMB2-NEXT:    and.w r0, r1, r12
+; THUMB2-NEXT:    bic.w r12, r2, r12
+; THUMB2-NEXT:    orr.w r0, r0, r12
 ; THUMB2-NEXT:    bx lr
 entry:
   %sel = call <1 x float> @llvm.ct.select.f32(i1 %cond, <1 x float> %a, <1 x float> %b)
diff --git a/llvm/test/CodeGen/ARM/ctselect.ll b/llvm/test/CodeGen/ARM/ctselect.ll
index 055e8733cb65c..b6d5be4e77257 100644
--- a/llvm/test/CodeGen/ARM/ctselect.ll
+++ b/llvm/test/CodeGen/ARM/ctselect.ll
@@ -10,19 +10,13 @@ define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) {
 ; CT-LABEL: ct_i1:
 ; CT:       @ %bb.0: @ %entry
 ; CT-NEXT:    and r3, r0, #1
-; CT-NEXT:    rsb r12, r3, #0
-; CT-NEXT:    and r0, r1, r12
-; CT-NEXT:    bic r12, r2, r12
-; CT-NEXT:    orr r0, r0, r12
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_i1:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    and r3, r0, #1
-; DEFAULT-NEXT:    rsb r12, r3, #0
-; DEFAULT-NEXT:    and r0, r1, r12
-; DEFAULT-NEXT:    bic r12, r2, r12
-; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_i1:
@@ -31,13 +25,7 @@ define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    mov r4, r3
-; THUMB1-NEXT:    lsls r4, r4, #31
-; THUMB1-NEXT:    asrs r4, r4, #31
-; THUMB1-NEXT:    mov r0, r1
-; THUMB1-NEXT:    eors r0, r2
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_i1:
@@ -61,10 +49,7 @@ define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) {
 ; CORTEX-NOTHUMB-LABEL: ct_i1:
 ; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
 ; CORTEX-NOTHUMB-NEXT:    and r3, r0, #1
-; CORTEX-NOTHUMB-NEXT:    rsb r12, r3, #0
-; CORTEX-NOTHUMB-NEXT:    and r0, r1, r12
-; CORTEX-NOTHUMB-NEXT:    bic r12, r2, r12
-; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r12
+; CORTEX-NOTHUMB-NEXT:    BUNDLE
 ; CORTEX-NOTHUMB-NEXT:    bx lr
 entry:
   %sel = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
@@ -75,19 +60,13 @@ define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) {
 ; CT-LABEL: ct_int8:
 ; CT:       @ %bb.0: @ %entry
 ; CT-NEXT:    and r3, r0, #1
-; CT-NEXT:    rsb r12, r3, #0
-; CT-NEXT:    and r0, r1, r12
-; CT-NEXT:    bic r12, r2, r12
-; CT-NEXT:    orr r0, r0, r12
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_int8:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    and r3, r0, #1
-; DEFAULT-NEXT:    rsb r12, r3, #0
-; DEFAULT-NEXT:    and r0, r1, r12
-; DEFAULT-NEXT:    bic r12, r2, r12
-; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_int8:
@@ -96,13 +75,7 @@ define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    mov r4, r3
-; THUMB1-NEXT:    lsls r4, r4, #31
-; THUMB1-NEXT:    asrs r4, r4, #31
-; THUMB1-NEXT:    mov r0, r1
-; THUMB1-NEXT:    eors r0, r2
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_int8:
@@ -126,10 +99,7 @@ define i8 @ct_int8(i1 %cond, i8 %a, i8 %b) {
 ; CORTEX-NOTHUMB-LABEL: ct_int8:
 ; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
 ; CORTEX-NOTHUMB-NEXT:    and r3, r0, #1
-; CORTEX-NOTHUMB-NEXT:    rsb r12, r3, #0
-; CORTEX-NOTHUMB-NEXT:    and r0, r1, r12
-; CORTEX-NOTHUMB-NEXT:    bic r12, r2, r12
-; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r12
+; CORTEX-NOTHUMB-NEXT:    BUNDLE
 ; CORTEX-NOTHUMB-NEXT:    bx lr
 entry:
   %sel = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
@@ -140,19 +110,13 @@ define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) {
 ; CT-LABEL: ct_int16:
 ; CT:       @ %bb.0: @ %entry
 ; CT-NEXT:    and r3, r0, #1
-; CT-NEXT:    rsb r12, r3, #0
-; CT-NEXT:    and r0, r1, r12
-; CT-NEXT:    bic r12, r2, r12
-; CT-NEXT:    orr r0, r0, r12
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_int16:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    and r3, r0, #1
-; DEFAULT-NEXT:    rsb r12, r3, #0
-; DEFAULT-NEXT:    and r0, r1, r12
-; DEFAULT-NEXT:    bic r12, r2, r12
-; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_int16:
@@ -161,13 +125,7 @@ define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    mov r4, r3
-; THUMB1-NEXT:    lsls r4, r4, #31
-; THUMB1-NEXT:    asrs r4, r4, #31
-; THUMB1-NEXT:    mov r0, r1
-; THUMB1-NEXT:    eors r0, r2
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_int16:
@@ -191,10 +149,7 @@ define i16 @ct_int16(i1 %cond, i16 %a, i16 %b) {
 ; CORTEX-NOTHUMB-LABEL: ct_int16:
 ; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
 ; CORTEX-NOTHUMB-NEXT:    and r3, r0, #1
-; CORTEX-NOTHUMB-NEXT:    rsb r12, r3, #0
-; CORTEX-NOTHUMB-NEXT:    and r0, r1, r12
-; CORTEX-NOTHUMB-NEXT:    bic r12, r2, r12
-; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r12
+; CORTEX-NOTHUMB-NEXT:    BUNDLE
 ; CORTEX-NOTHUMB-NEXT:    bx lr
 entry:
   %sel = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
@@ -205,19 +160,13 @@ define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) {
 ; CT-LABEL: ct_int32:
 ; CT:       @ %bb.0: @ %entry
 ; CT-NEXT:    and r3, r0, #1
-; CT-NEXT:    rsb r12, r3, #0
-; CT-NEXT:    and r0, r1, r12
-; CT-NEXT:    bic r12, r2, r12
-; CT-NEXT:    orr r0, r0, r12
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_int32:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    and r3, r0, #1
-; DEFAULT-NEXT:    rsb r12, r3, #0
-; DEFAULT-NEXT:    and r0, r1, r12
-; DEFAULT-NEXT:    bic r12, r2, r12
-; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_int32:
@@ -226,13 +175,7 @@ define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    mov r4, r3
-; THUMB1-NEXT:    lsls r4, r4, #31
-; THUMB1-NEXT:    asrs r4, r4, #31
-; THUMB1-NEXT:    mov r0, r1
-; THUMB1-NEXT:    eors r0, r2
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_int32:
@@ -256,10 +199,7 @@ define i32 @ct_int32(i1 %cond, i32 %a, i32 %b) {
 ; CORTEX-NOTHUMB-LABEL: ct_int32:
 ; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
 ; CORTEX-NOTHUMB-NEXT:    and r3, r0, #1
-; CORTEX-NOTHUMB-NEXT:    rsb r12, r3, #0
-; CORTEX-NOTHUMB-NEXT:    and r0, r1, r12
-; CORTEX-NOTHUMB-NEXT:    bic r12, r2, r12
-; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r12
+; CORTEX-NOTHUMB-NEXT:    BUNDLE
 ; CORTEX-NOTHUMB-NEXT:    bx lr
 entry:
   %sel = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -274,14 +214,8 @@ define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) {
 ; CT-NEXT:    ldr r1, [sp, #8]
 ; CT-NEXT:    and lr, r0, #1
 ; CT-NEXT:    ldr r12, [sp, #12]
-; CT-NEXT:    rsb r4, lr, #0
-; CT-NEXT:    and r0, r2, r4
-; CT-NEXT:    bic r4, r1, r4
-; CT-NEXT:    orr r0, r0, r4
-; CT-NEXT:    rsb r2, lr, #0
-; CT-NEXT:    and r1, r3, r2
-; CT-NEXT:    bic r2, r12, r2
-; CT-NEXT:    orr r1, r1, r2
+; CT-NEXT:    BUNDLE
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    pop {r4, pc}
 ;
 ; DEFAULT-LABEL: ct_int64:
@@ -289,15 +223,9 @@ define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) {
 ; DEFAULT-NEXT:    push {r11, lr}
 ; DEFAULT-NEXT:    and r12, r0, #1
 ; DEFAULT-NEXT:    ldr r1, [sp, #8]
-; DEFAULT-NEXT:    rsb lr, r12, #0
-; DEFAULT-NEXT:    and r0, r2, lr
-; DEFAULT-NEXT:    bic lr, r1, lr
-; DEFAULT-NEXT:    orr r0, r0, lr
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    ldr r2, [sp, #12]
-; DEFAULT-NEXT:    rsb lr, r12, #0
-; DEFAULT-NEXT:    and r1, r3, lr
-; DEFAULT-NEXT:    bic lr, r2, lr
-; DEFAULT-NEXT:    orr r1, r1, lr
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    pop {r11, pc}
 ;
 ; THUMB1-LABEL: ct_int64:
@@ -307,21 +235,9 @@ define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) {
 ; THUMB1-NEXT:    movs r4, #1
 ; THUMB1-NEXT:    ands r4, r0
 ; THUMB1-NEXT:    ldr r1, [sp, #16]
-; THUMB1-NEXT:    mov r5, r4
-; THUMB1-NEXT:    lsls r5, r5, #31
-; THUMB1-NEXT:    asrs r5, r5, #31
-; THUMB1-NEXT:    mov r0, r2
-; THUMB1-NEXT:    eors r0, r1
-; THUMB1-NEXT:    ands r0, r5
-; THUMB1-NEXT:    eors r0, r1
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    ldr r2, [sp, #20]
-; THUMB1-NEXT:    mov r5, r4
-; THUMB1-NEXT:    lsls r5, r5, #31
-; THUMB1-NEXT:    asrs r5, r5, #31
-; THUMB1-NEXT:    mov r1, r3
-; THUMB1-NEXT:    eors r1, r2
-; THUMB1-NEXT:    ands r1, r5
-; THUMB1-NEXT:    eors r1, r2
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; THUMB2-LABEL: ct_int64:
@@ -364,14 +280,8 @@ define i64 @ct_int64(i1 %cond, i64 %a, i64 %b) {
 ; CORTEX-NOTHUMB-NEXT:    ldr r12, [sp, #12]
 ; CORTEX-NOTHUMB-NEXT:    and lr, r0, #1
 ; CORTEX-NOTHUMB-NEXT:    ldr r1, [sp, #8]
-; CORTEX-NOTHUMB-NEXT:    rsb r4, lr, #0
-; CORTEX-NOTHUMB-NEXT:    and r0, r2, r4
-; CORTEX-NOTHUMB-NEXT:    bic r4, r1, r4
-; CORTEX-NOTHUMB-NEXT:    orr r0, r0, r4
-; CORTEX-NOTHUMB-NEXT:    rsb r2, lr, #0
-; CORTEX-NOTHUMB-NEXT:    and r1, r3, r2
-; CORTEX-NOTHUMB-NEXT:    bic r2, r12, r2
-; CORTEX-NOTHUMB-NEXT:    orr r1, r1, r2
+; CORTEX-NOTHUMB-NEXT:    BUNDLE
+; CORTEX-NOTHUMB-NEXT:    BUNDLE
 ; CORTEX-NOTHUMB-NEXT:    pop {r4, pc}
 entry:
   %sel = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
@@ -384,23 +294,14 @@ define float @ct_float(i1 %cond, float %a, float %b) {
 ; CT-NEXT:    and r0, r0, #1
 ; CT-NEXT:    vmov s0, r2
 ; CT-NEXT:    vmov s2, r1
-; CT-NEXT:    vmov r2, s2
-; CT-NEXT:    vmov r3, s0
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    and r2, r2, r1
-; CT-NEXT:    bic r1, r3, r1
-; CT-NEXT:    orr r2, r2, r1
-; CT-NEXT:    vmov s4, r2
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, s4
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_float:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    and r3, r0, #1
-; DEFAULT-NEXT:    rsb r12, r3, #0
-; DEFAULT-NEXT:    and r0, r1, r12
-; DEFAULT-NEXT:    bic r12, r2, r12
-; DEFAULT-NEXT:    orr r0, r0, r12
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    bx lr
 ;
 ; THUMB1-LABEL: ct_float:
@@ -409,13 +310,7 @@ define float @ct_float(i1 %cond, float %a, float %b) {
 ; THUMB1-NEXT:    push {r4, lr}
 ; THUMB1-NEXT:    movs r3, #1
 ; THUMB1-NEXT:    ands r3, r0
-; THUMB1-NEXT:    mov r4, r3
-; THUMB1-NEXT:    lsls r4, r4, #31
-; THUMB1-NEXT:    asrs r4, r4, #31
-; THUMB1-NEXT:    mov r0, r1
-; THUMB1-NEXT:    eors r0, r2
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    eors r0, r2
+; THUMB1-NEXT:    BUNDLE
 ; THUMB1-NEXT:    pop {r4, pc}
 ;
 ; THUMB2-LABEL: ct_float:
@@ -443,13 +338,7 @@ define float @ct_float(i1 %cond, float %a, float %b) {
 ; CORTEX-NOTHUMB-LABEL: ct_float:
 ; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
 ; CORTEX-NOTHUMB-NEXT:    and r0, r0, #1
-; CORTEX-NOTHUMB-NEXT:    vmov r2, s0
-; CORTEX-NOTHUMB-NEXT:    vmov r3, s1
-; CORTEX-NOTHUMB-NEXT:    rsb r1, r0, #0
-; CORTEX-NOTHUMB-NEXT:    and r2, r2, r1
-; CORTEX-NOTHUMB-NEXT:    bic r1, r3, r1
-; CORTEX-NOTHUMB-NEXT:    orr r2, r2, r1
-; CORTEX-NOTHUMB-NEXT:    vmov s2, r2
+; CORTEX-NOTHUMB-NEXT:    BUNDLE
 ; CORTEX-NOTHUMB-NEXT:    vmov.f32 s0, s2
 ; CORTEX-NOTHUMB-NEXT:    bx lr
 entry:
@@ -463,63 +352,47 @@ define double @ct_f64(i1 %cond, double %a, double %b) {
 ; CT-NEXT:    vldr d16, [sp]
 ; CT-NEXT:    vmov d17, r2, r3
 ; CT-NEXT:    and r0, r0, #1
-; CT-NEXT:    rsb r1, r0, #0
-; CT-NEXT:    vdup.32 d19, r1
-; CT-NEXT:    vand d18, d17, d19
-; CT-NEXT:    vbic d19, d16, d19
-; CT-NEXT:    vorr d18, d18, d19
+; CT-NEXT:    BUNDLE
 ; CT-NEXT:    vmov r0, r1, d18
 ; CT-NEXT:    bx lr
 ;
 ; DEFAULT-LABEL: ct_f64:
 ; DEFAULT:       @ %bb.0: @ %entry
 ; DEFAULT-NEXT:    push {r11, lr}
-; DEFAULT-NEXT:    and r0, r0, #1
+; DEFAULT-NEXT:    and r12, r0, #1
 ; DEFAULT-NEXT:    ldr r1, [sp, #8]
-; DEFAULT-NEXT:    sub r12, r0, #1
-; DEFAULT-NEXT:    and lr, r1, r12
-; DEFAULT-NEXT:    rsb r1, r0, #0
-; DEFAULT-NEXT:    and r0, r2, r1
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    ldr r2, [sp, #12]
-; DEFAULT-NEXT:    and r1, r3, r1
-; DEFAULT-NEXT:    orr r0, r0, lr
-; DEFAULT-NEXT:    and r2, r2, r12
-; DEFAULT-NEXT:    orr r1, r1, r2
+; DEFAULT-NEXT:    BUNDLE
 ; DEFAULT-NEXT:    pop {r11, pc}
 ;
 ; THUMB1-LABEL: ct_f64:
 ; THUMB1:       @ %bb.0: @ %entry
-; THUMB1-NEXT:    .save {r4, lr}
-; THUMB1-NEXT:    push {r4, lr}
-; THUMB1-NEXT:    movs r1, #1
-; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    subs r4, r1, #1
-; THUMB1-NEXT:    ldr r0, [sp, #8]
-; THUMB1-NEXT:    ands r0, r4
-; THUMB1-NEXT:    rsbs r1, r1, #0
-; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    orrs r0, r2
-; THUMB1-NEXT:    ldr r2, [sp, #12]
-; THUMB1-NEXT:    ands r2, r4
-; THUMB1-NEXT:    ands r1, r3
-; THUMB1-NEXT:    orrs r1, r2
-; THUMB1-NEXT:    pop {r4, pc}
+; THUMB1-NEXT:    .save {r4, r5, r7, lr}
+; THUMB1-NEXT:    push {r4, r5, r7, lr}
+; THUMB1-NEXT:    movs r4, #1
+; THUMB1-NEXT:    ands r4, r0
+; THUMB1-NEXT:    ldr r1, [sp, #16]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    ldr r2, [sp, #20]
+; THUMB1-NEXT:    BUNDLE
+; THUMB1-NEXT:    pop {r4, r5, r7, pc}
 ;
 ; THUMB2-LABEL: ct_f64:
 ; THUMB2:       @ %bb.0: @ %entry
 ; THUMB2-NEXT:    .save {r7, lr}
 ; THUMB2-NEXT:    push {r7, lr}
-; THUMB2-NEXT:    and r0, r0, #1
+; THUMB2-NEXT:    and r12, r0, #1
 ; THUMB2-NEXT:    ldr r1, [sp, #8]
-; THUMB2-NEXT:    sub.w r12, r0, #1
-; THUMB2-NEXT:    and.w lr, r1, r12
-; THUMB2-NEXT:    rsbs r1, r0, #0
-; THUMB2-NEXT:    and.w r0, r2, r1
-; THUMB2-NEXT:    ldr r2, [sp, #12]
-; THUMB2-NEXT:    ands r1, r3
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r0, r2, lr
+; THUMB2-NEXT:    bic.w lr, r1, lr
 ; THUMB2-NEXT:    orr.w r0, r0, lr
-; THUMB2-NEXT:    and.w r2, r2, r12
-; THUMB2-NEXT:    orrs r1, r2
+; THUMB2-NEXT:    ldr r2, [sp, #12]
+; THUMB2-NEXT:    rsb.w lr, r12, #0
+; THUMB2-NEXT:    and.w r1, r3, lr
+; THUMB2-NEXT:    bic.w lr, r2, lr
+; THUMB2-NEXT:    orr.w r1, r1, lr
 ; THUMB2-NEXT:    pop {r7, pc}
 ;
 ; CORTEXA9-LABEL: ct_f64:
@@ -536,11 +409,7 @@ define double @ct_f64(i1 %cond, double %a, double %b) {
 ; CORTEX-NOTHUMB-LABEL: ct_f64:
 ; CORTEX-NOTHUMB:       @ %bb.0: @ %entry
 ; CORTEX-NOTHUMB-NEXT:    and r0, r0, #1
-; CORTEX-NOTHUMB-NEXT:    rsb r1, r0, #0
-; CORTEX-NOTHUMB-NEXT:    vdup.32 d17, r1
-; CORTEX-NOTHUMB-NEXT:    vand d16, d0, d17
-; CORTEX-NOTHUMB-NEXT:    vbic d17, d1, d17
-; CORTEX-NOTHUMB-NEXT:    vorr d16, d16, d17
+; CORTEX-NOTHUMB-NEXT:    BUNDLE
 ; CORTEX-NOTHUMB-NEXT:    vmov.f64 d0, d16
 ; CORTEX-NOTHUMB-NEXT:    bx lr
 entry:



More information about the llvm-branch-commits mailing list