[clang] 5bb3480 - [NFC] Migrate tests to use autoupdate for CHECK lines.

Paul Walker via cfe-commits <cfe-commits at lists.llvm.org>
Tue Oct 22 06:05:00 PDT 2024


Author: Paul Walker
Date: 2024-10-22T12:55:15Z
New Revision: 5bb34803a48598c3fa6e480de1814d5fe2d0f652

URL: https://github.com/llvm/llvm-project/commit/5bb34803a48598c3fa6e480de1814d5fe2d0f652
DIFF: https://github.com/llvm/llvm-project/commit/5bb34803a48598c3fa6e480de1814d5fe2d0f652.diff

LOG: [NFC] Migrate tests to use autoupdate for CHECK lines.
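Note: CHECK lines like the ones below are normally regenerated with the autoupdate scripts in llvm/utils rather than edited by hand. A typical invocation (a sketch only, assuming freshly built clang and opt binaries are available on PATH; the exact commands used for this commit are not recorded here) looks like:

    # Clang codegen tests (C sources with // CHECK lines)
    llvm/utils/update_cc_test_checks.py clang/test/CodeGen/aarch64-sve-vls-bitwise-ops.c

    # LLVM IR tests (; CHECK lines)
    llvm/utils/update_test_checks.py llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll

Each modified test listed below is regenerated the same way, which is why variable names such as [[CASTSCALABLESVE]] change to the script-generated [[CAST_SCALABLE]] form.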

Added: 
    

Modified: 
    clang/test/CodeGen/aarch64-sve-vls-bitwise-ops.c
    clang/test/CodeGen/arm-bf16-convert-intrinsics.c
    clang/test/CodeGen/variadic-nvptx.c
    llvm/test/Analysis/CostModel/SystemZ/divrem-pow2.ll
    llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
    llvm/test/Instrumentation/MemorySanitizer/reduce.ll
    llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
    llvm/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll
    llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
    llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
    llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
    llvm/test/Transforms/InstCombine/pow-0.ll
    llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll

Removed: 
    


################################################################################
diff --git a/clang/test/CodeGen/aarch64-sve-vls-bitwise-ops.c b/clang/test/CodeGen/aarch64-sve-vls-bitwise-ops.c
index f6c9e13190774f..74b543b67bfba4 100644
--- a/clang/test/CodeGen/aarch64-sve-vls-bitwise-ops.c
+++ b/clang/test/CodeGen/aarch64-sve-vls-bitwise-ops.c
@@ -34,8 +34,8 @@ typedef svbool_t fixed_bool_t __attribute__((arm_sve_vector_bits(N)));
 // CHECK-NEXT:    [[B_COERCE:%.*]] = bitcast <vscale x 16 x i1> [[TMP1:%.*]] to <vscale x 2 x i8>
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> [[B_COERCE]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <8 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[AND]], i64 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[CASTSCALABLESVE]] to <vscale x 16 x i1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[AND]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1>
 // CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
 //
 fixed_bool_t and_bool(fixed_bool_t a, fixed_bool_t b) {
@@ -47,8 +47,8 @@ fixed_bool_t and_bool(fixed_bool_t a, fixed_bool_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <64 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[AND]], i64 0)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[AND]], i64 0)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[CAST_SCALABLE]]
 //
 fixed_int8_t and_i8(fixed_int8_t a, fixed_int8_t b) {
   return a & b;
@@ -59,8 +59,8 @@ fixed_int8_t and_i8(fixed_int8_t a, fixed_int8_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <32 x i16> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[AND]], i64 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[AND]], i64 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[CAST_SCALABLE]]
 //
 fixed_int16_t and_i16(fixed_int16_t a, fixed_int16_t b) {
   return a & b;
@@ -71,8 +71,8 @@ fixed_int16_t and_i16(fixed_int16_t a, fixed_int16_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <16 x i32> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[AND]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[AND]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_int32_t and_i32(fixed_int32_t a, fixed_int32_t b) {
   return a & b;
@@ -83,8 +83,8 @@ fixed_int32_t and_i32(fixed_int32_t a, fixed_int32_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <8 x i64> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[AND]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[AND]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[CAST_SCALABLE]]
 //
 fixed_int64_t and_i64(fixed_int64_t a, fixed_int64_t b) {
   return a & b;
@@ -95,8 +95,8 @@ fixed_int64_t and_i64(fixed_int64_t a, fixed_int64_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <64 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[AND]], i64 0)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[AND]], i64 0)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[CAST_SCALABLE]]
 //
 fixed_uint8_t and_u8(fixed_uint8_t a, fixed_uint8_t b) {
   return a & b;
@@ -107,8 +107,8 @@ fixed_uint8_t and_u8(fixed_uint8_t a, fixed_uint8_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <32 x i16> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[AND]], i64 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[AND]], i64 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[CAST_SCALABLE]]
 //
 fixed_uint16_t and_u16(fixed_uint16_t a, fixed_uint16_t b) {
   return a & b;
@@ -119,8 +119,8 @@ fixed_uint16_t and_u16(fixed_uint16_t a, fixed_uint16_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <16 x i32> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[AND]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[AND]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_uint32_t and_u32(fixed_uint32_t a, fixed_uint32_t b) {
   return a & b;
@@ -131,8 +131,8 @@ fixed_uint32_t and_u32(fixed_uint32_t a, fixed_uint32_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[AND:%.*]] = and <8 x i64> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[AND]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[AND]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[CAST_SCALABLE]]
 //
 fixed_uint64_t and_u64(fixed_uint64_t a, fixed_uint64_t b) {
   return a & b;
@@ -147,8 +147,8 @@ fixed_uint64_t and_u64(fixed_uint64_t a, fixed_uint64_t b) {
 // CHECK-NEXT:    [[B_COERCE:%.*]] = bitcast <vscale x 16 x i1> [[TMP1:%.*]] to <vscale x 2 x i8>
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> [[B_COERCE]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <8 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[OR]], i64 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[CASTSCALABLESVE]] to <vscale x 16 x i1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[OR]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1>
 // CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
 //
 fixed_bool_t or_bool(fixed_bool_t a, fixed_bool_t b) {
@@ -160,8 +160,8 @@ fixed_bool_t or_bool(fixed_bool_t a, fixed_bool_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <64 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[OR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[OR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[CAST_SCALABLE]]
 //
 fixed_int8_t or_i8(fixed_int8_t a, fixed_int8_t b) {
   return a | b;
@@ -172,8 +172,8 @@ fixed_int8_t or_i8(fixed_int8_t a, fixed_int8_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <32 x i16> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[OR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[OR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[CAST_SCALABLE]]
 //
 fixed_int16_t or_i16(fixed_int16_t a, fixed_int16_t b) {
   return a | b;
@@ -184,8 +184,8 @@ fixed_int16_t or_i16(fixed_int16_t a, fixed_int16_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <16 x i32> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[OR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[OR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_int32_t or_i32(fixed_int32_t a, fixed_int32_t b) {
   return a | b;
@@ -196,8 +196,8 @@ fixed_int32_t or_i32(fixed_int32_t a, fixed_int32_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <8 x i64> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[OR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[OR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[CAST_SCALABLE]]
 //
 fixed_int64_t or_i64(fixed_int64_t a, fixed_int64_t b) {
   return a | b;
@@ -208,8 +208,8 @@ fixed_int64_t or_i64(fixed_int64_t a, fixed_int64_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <64 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[OR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[OR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[CAST_SCALABLE]]
 //
 fixed_uint8_t or_u8(fixed_uint8_t a, fixed_uint8_t b) {
   return a | b;
@@ -220,8 +220,8 @@ fixed_uint8_t or_u8(fixed_uint8_t a, fixed_uint8_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <32 x i16> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[OR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[OR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[CAST_SCALABLE]]
 //
 fixed_uint16_t or_u16(fixed_uint16_t a, fixed_uint16_t b) {
   return a | b;
@@ -232,8 +232,8 @@ fixed_uint16_t or_u16(fixed_uint16_t a, fixed_uint16_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <16 x i32> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[OR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[OR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_uint32_t or_u32(fixed_uint32_t a, fixed_uint32_t b) {
   return a | b;
@@ -244,8 +244,8 @@ fixed_uint32_t or_u32(fixed_uint32_t a, fixed_uint32_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[OR:%.*]] = or <8 x i64> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[OR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[OR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[CAST_SCALABLE]]
 //
 fixed_uint64_t or_u64(fixed_uint64_t a, fixed_uint64_t b) {
   return a | b;
@@ -260,8 +260,8 @@ fixed_uint64_t or_u64(fixed_uint64_t a, fixed_uint64_t b) {
 // CHECK-NEXT:    [[B_COERCE:%.*]] = bitcast <vscale x 16 x i1> [[TMP1:%.*]] to <vscale x 2 x i8>
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> [[B_COERCE]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <8 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[XOR]], i64 0)
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[CASTSCALABLESVE]] to <vscale x 16 x i1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[XOR]], i64 0)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1>
 // CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
 //
 fixed_bool_t xor_bool(fixed_bool_t a, fixed_bool_t b) {
@@ -273,8 +273,8 @@ fixed_bool_t xor_bool(fixed_bool_t a, fixed_bool_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <64 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[XOR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[XOR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[CAST_SCALABLE]]
 //
 fixed_int8_t xor_i8(fixed_int8_t a, fixed_int8_t b) {
   return a ^ b;
@@ -285,8 +285,8 @@ fixed_int8_t xor_i8(fixed_int8_t a, fixed_int8_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <32 x i16> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[XOR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[XOR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[CAST_SCALABLE]]
 //
 fixed_int16_t xor_i16(fixed_int16_t a, fixed_int16_t b) {
   return a ^ b;
@@ -297,8 +297,8 @@ fixed_int16_t xor_i16(fixed_int16_t a, fixed_int16_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <16 x i32> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[XOR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[XOR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_int32_t xor_i32(fixed_int32_t a, fixed_int32_t b) {
   return a ^ b;
@@ -309,8 +309,8 @@ fixed_int32_t xor_i32(fixed_int32_t a, fixed_int32_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <8 x i64> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[XOR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[XOR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[CAST_SCALABLE]]
 //
 fixed_int64_t xor_i64(fixed_int64_t a, fixed_int64_t b) {
   return a ^ b;
@@ -321,8 +321,8 @@ fixed_int64_t xor_i64(fixed_int64_t a, fixed_int64_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <64 x i8> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[XOR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[XOR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[CAST_SCALABLE]]
 //
 fixed_uint8_t xor_u8(fixed_uint8_t a, fixed_uint8_t b) {
   return a ^ b;
@@ -333,8 +333,8 @@ fixed_uint8_t xor_u8(fixed_uint8_t a, fixed_uint8_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <32 x i16> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[XOR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[XOR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[CAST_SCALABLE]]
 //
 fixed_uint16_t xor_u16(fixed_uint16_t a, fixed_uint16_t b) {
   return a ^ b;
@@ -345,8 +345,8 @@ fixed_uint16_t xor_u16(fixed_uint16_t a, fixed_uint16_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <16 x i32> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[XOR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[XOR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_uint32_t xor_u32(fixed_uint32_t a, fixed_uint32_t b) {
   return a ^ b;
@@ -357,8 +357,8 @@ fixed_uint32_t xor_u32(fixed_uint32_t a, fixed_uint32_t b) {
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[A_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[B:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[B_COERCE:%.*]], i64 0)
 // CHECK-NEXT:    [[XOR:%.*]] = xor <8 x i64> [[A]], [[B]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[XOR]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[XOR]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[CAST_SCALABLE]]
 //
 fixed_uint64_t xor_u64(fixed_uint64_t a, fixed_uint64_t b) {
   return a ^ b;
@@ -370,9 +370,9 @@ fixed_uint64_t xor_u64(fixed_uint64_t a, fixed_uint64_t b) {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A_COERCE:%.*]] = bitcast <vscale x 16 x i1> [[TMP0:%.*]] to <vscale x 2 x i8>
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> [[A_COERCE]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <8 x i8> [[A]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[NEG]], i64 0)
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CASTSCALABLESVE]] to <vscale x 16 x i1>
+// CHECK-NEXT:    [[NOT:%.*]] = xor <8 x i8> [[A]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[NOT]], i64 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CAST_SCALABLE]] to <vscale x 16 x i1>
 // CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP1]]
 //
 fixed_bool_t neg_bool(fixed_bool_t a) {
@@ -382,9 +382,9 @@ fixed_bool_t neg_bool(fixed_bool_t a) {
 // CHECK-LABEL: @neg_i8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[A_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <64 x i8> [[A]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[NEG]], i64 0)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[NOT:%.*]] = xor <64 x i8> [[A]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[NOT]], i64 0)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[CAST_SCALABLE]]
 //
 fixed_int8_t neg_i8(fixed_int8_t a) {
   return ~a;
@@ -393,9 +393,9 @@ fixed_int8_t neg_i8(fixed_int8_t a) {
 // CHECK-LABEL: @neg_i16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[A_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <32 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[NEG]], i64 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[NOT:%.*]] = xor <32 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[NOT]], i64 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[CAST_SCALABLE]]
 //
 fixed_int16_t neg_i16(fixed_int16_t a) {
   return ~a;
@@ -404,9 +404,9 @@ fixed_int16_t neg_i16(fixed_int16_t a) {
 // CHECK-LABEL: @neg_i32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[A_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <16 x i32> [[A]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[NEG]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[NOT:%.*]] = xor <16 x i32> [[A]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[NOT]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_int32_t neg_i32(fixed_int32_t a) {
   return ~a;
@@ -415,9 +415,9 @@ fixed_int32_t neg_i32(fixed_int32_t a) {
 // CHECK-LABEL: @neg_i64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[A_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <8 x i64> [[A]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[NEG]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[NOT:%.*]] = xor <8 x i64> [[A]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[NOT]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[CAST_SCALABLE]]
 //
 fixed_int64_t neg_i64(fixed_int64_t a) {
   return ~a;
@@ -426,9 +426,9 @@ fixed_int64_t neg_i64(fixed_int64_t a) {
 // CHECK-LABEL: @neg_u8(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A:%.*]] = call <64 x i8> @llvm.vector.extract.v64i8.nxv16i8(<vscale x 16 x i8> [[A_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <64 x i8> [[A]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[NEG]], i64 0)
-// CHECK-NEXT:    ret <vscale x 16 x i8> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[NOT:%.*]] = xor <64 x i8> [[A]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v64i8(<vscale x 16 x i8> undef, <64 x i8> [[NOT]], i64 0)
+// CHECK-NEXT:    ret <vscale x 16 x i8> [[CAST_SCALABLE]]
 //
 fixed_uint8_t neg_u8(fixed_uint8_t a) {
   return ~a;
@@ -437,9 +437,9 @@ fixed_uint8_t neg_u8(fixed_uint8_t a) {
 // CHECK-LABEL: @neg_u16(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A:%.*]] = call <32 x i16> @llvm.vector.extract.v32i16.nxv8i16(<vscale x 8 x i16> [[A_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <32 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[NEG]], i64 0)
-// CHECK-NEXT:    ret <vscale x 8 x i16> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[NOT:%.*]] = xor <32 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v32i16(<vscale x 8 x i16> undef, <32 x i16> [[NOT]], i64 0)
+// CHECK-NEXT:    ret <vscale x 8 x i16> [[CAST_SCALABLE]]
 //
 fixed_uint16_t neg_u16(fixed_uint16_t a) {
   return ~a;
@@ -448,9 +448,9 @@ fixed_uint16_t neg_u16(fixed_uint16_t a) {
 // CHECK-LABEL: @neg_u32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[A_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <16 x i32> [[A]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[NEG]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[NOT:%.*]] = xor <16 x i32> [[A]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[NOT]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_uint32_t neg_u32(fixed_uint32_t a) {
   return ~a;
@@ -459,9 +459,9 @@ fixed_uint32_t neg_u32(fixed_uint32_t a) {
 // CHECK-LABEL: @neg_u64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[A:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64(<vscale x 2 x i64> [[A_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    [[NEG:%.*]] = xor <8 x i64> [[A]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[NEG]], i64 0)
-// CHECK-NEXT:    ret <vscale x 2 x i64> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[NOT:%.*]] = xor <8 x i64> [[A]], <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v8i64(<vscale x 2 x i64> undef, <8 x i64> [[NOT]], i64 0)
+// CHECK-NEXT:    ret <vscale x 2 x i64> [[CAST_SCALABLE]]
 //
 fixed_uint64_t neg_u64(fixed_uint64_t a) {
   return ~a;

diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index 0f2c5b2546fa35..9477ebdb8285af 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -24,51 +24,51 @@
 
 // CHECK-A64-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[__REINT_150_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT:    [[__REINT1_150_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-A64-NEXT:    store <4 x bfloat> [[A:%.*]], ptr [[__REINT_150_I]], align 8
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I]], align 8
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A64-NEXT:    store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_150_I]], align 16
-// CHECK-A64-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I]], align 16
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP5]]
+// CHECK-A64-NEXT:    [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A64-NEXT:    [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-A64-NEXT:    store <4 x bfloat> [[A:%.*]], ptr [[__REINT_836_I]], align 8
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A64-NEXT:    store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 16
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 16
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[__REINT_150_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT:    [[__REINT1_150_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-HARDFP-NEXT:    store <4 x bfloat> [[A:%.*]], ptr [[__REINT_150_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A32-HARDFP-NEXT:    store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_150_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP5]]
+// CHECK-A32-HARDFP-NEXT:    [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-HARDFP-NEXT:    [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-HARDFP-NEXT:    store <4 x bfloat> [[A:%.*]], ptr [[__REINT_836_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A32-HARDFP-NEXT:    store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
-// CHECK-A32-SOFTFP-NEXT:    [[__P0_150_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__REINT_150_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__REINT1_150_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__P0_836_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__REINT_836_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__REINT1_836_I:%.*]] = alloca <4 x i32>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[A:%.*]] = alloca <4 x bfloat>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
 // CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[A1:%.*]] = load <4 x bfloat>, ptr [[A]], align 8
 // CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP2]], ptr [[__P0_150_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__P0_1501_I:%.*]] = load <4 x bfloat>, ptr [[__P0_150_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[__P0_1501_I]], ptr [[__REINT_150_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_150_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP7]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_150_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[__REINT1_150_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP9]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP0]], ptr [[__P0_836_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__P0_8361_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[__P0_8361_I]], ptr [[__REINT_836_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_836_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_836_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[__REINT1_836_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP4]]
 //
 float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
   return vcvt_f32_bf16(a);
@@ -76,39 +76,39 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
 
 // CHECK-A64-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT:    [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-A64-NEXT:    [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A64-NEXT:    [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 16
 // CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_150_I_I]], align 8
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A64-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 16
-// CHECK-A64-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 16
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP5]]
+// CHECK-A64-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A64-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 16
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 16
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT:    [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-HARDFP-NEXT:    [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-HARDFP-NEXT:    [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-HARDFP-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_150_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A32-HARDFP-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP5]]
+// CHECK-A32-HARDFP-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A32-HARDFP-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
 // CHECK-A32-SOFTFP-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__P0_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__P0_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
@@ -118,30 +118,30 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
 // CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8
 // CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP2]], ptr [[__P0_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP5]], ptr [[__P0_I2]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP1]], ptr [[__P0_I2]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP8]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP10]], ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP12]], ptr [[__P0_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__P0_1501_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[__P0_1501_I_I]], ptr [[__REINT_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP15:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP17:%.*]] = sext <4 x i16> [[TMP15]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP17]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP19:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP19]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP2]], ptr [[COERCE2_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP4]], ptr [[__P0_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__P0_8361_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[__P0_8361_I_I]], ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP8]]
 //
 float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
   return vcvtq_low_f32_bf16(a);
@@ -149,39 +149,39 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
 
 // CHECK-A64-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A64-NEXT:  entry:
-// CHECK-A64-NEXT:    [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT:    [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 16
+// CHECK-A64-NEXT:    [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A64-NEXT:    [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 16
 // CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_150_I_I]], align 8
-// CHECK-A64-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
-// CHECK-A64-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A64-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A64-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 16
-// CHECK-A64-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 16
-// CHECK-A64-NEXT:    ret <4 x float> [[TMP5]]
+// CHECK-A64-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A64-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 16
+// CHECK-A64-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 16
+// CHECK-A64-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A32-HARDFP-NEXT:  entry:
-// CHECK-A32-HARDFP-NEXT:    [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT:    [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-HARDFP-NEXT:    [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-HARDFP-NEXT:    [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
 // CHECK-A32-HARDFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-HARDFP-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_150_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A32-HARDFP-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP5]]
+// CHECK-A32-HARDFP-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-HARDFP-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A32-HARDFP-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-HARDFP-NEXT:    ret <4 x float> [[TMP3]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT:  entry:
 // CHECK-A32-SOFTFP-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__P0_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__REINT_150_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__REINT1_150_I_I:%.*]] = alloca <4 x i32>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__P0_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__REINT_836_I_I:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__REINT1_836_I_I:%.*]] = alloca <4 x i32>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
 // CHECK-A32-SOFTFP-NEXT:    [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
@@ -191,30 +191,30 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
 // CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8
 // CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP2]], ptr [[__P0_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP5]], ptr [[__P0_I2]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP1]], ptr [[__P0_I2]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP8]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP10]], ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP12]], ptr [[__P0_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[__P0_1501_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[__P0_1501_I_I]], ptr [[__REINT_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP15:%.*]] = load <4 x i16>, ptr [[__REINT_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT:    [[TMP17:%.*]] = sext <4 x i16> [[TMP15]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP17]], <i32 16, i32 16, i32 16, i32 16>
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP19:%.*]] = load <4 x float>, ptr [[__REINT1_150_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP19]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP2]], ptr [[COERCE2_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP4]], ptr [[__P0_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[__P0_8361_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[__P0_8361_I_I]], ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT:    [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_836_I_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x float> [[TMP8]]
 //
 float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
   return vcvtq_high_f32_bf16(a);
@@ -223,9 +223,9 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
 // CHECK-A64-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A64-NEXT:  entry:
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_V2_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]] to <16 x i8>
-// CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
+// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
+// CHECK-A64-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A64-NEXT:    ret <4 x bfloat> [[SHUFFLE_I]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
@@ -245,16 +245,16 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
 // CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
 // CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP3]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP5]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP7]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = load <4 x bfloat>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP9]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT:    ret <2 x i32> [[TMP11]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP2]], ptr [[COERCE_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP3]], ptr [[RETVAL_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP4]], ptr [[COERCE]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x bfloat>, ptr [[COERCE]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP5]], ptr [[RETVAL]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-A32-SOFTFP-NEXT:    ret <2 x i32> [[TMP6]]
 //
 bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
   return vcvt_bf16_f32(a);
@@ -263,9 +263,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
 // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A64-NEXT:  entry:
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_V2_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]] to <16 x i8>
-// CHECK-A64-NEXT:    ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]]
+// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
+// CHECK-A64-NEXT:    [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
+// CHECK-A64-NEXT:    ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
@@ -291,29 +291,29 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
 // CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
 // CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
 // CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP3]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP2]], ptr [[COERCE_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> zeroinitializer, ptr [[COERCE1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[COERCE1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP5]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP7]], ptr [[__P0_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE1_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP3]], ptr [[COERCE2_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <2 x i32>, ptr [[COERCE2_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP4]], ptr [[__P0_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P01_I:%.*]] = load <4 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP9]], ptr [[__P1_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP5]], ptr [[__P1_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[__P01_I]], <4 x bfloat> [[__P12_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr [[RETVAL_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP13]], ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP15:%.*]] = load <8 x bfloat>, ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[TMP15]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP17:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP17]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP19:%.*]] = load <8 x bfloat>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[TMP19]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP21]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr [[RETVAL_I4]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP6]], ptr [[COERCE4_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = load <8 x bfloat>, ptr [[COERCE4_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[TMP7]], ptr [[RETVAL_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP8]], ptr [[COERCE]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = load <8 x bfloat>, ptr [[COERCE]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[TMP9]], ptr [[RETVAL]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP10]]
 //
 bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
   return vcvtq_low_bf16_f32(a);
@@ -323,9 +323,9 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
 // CHECK-A64-NEXT:  entry:
 // CHECK-A64-NEXT:    [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
 // CHECK-A64-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT:    [[VCVTQ_HIGH_BF16_V2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
-// CHECK-A64-NEXT:    [[VCVTQ_HIGH_BF16_V3_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_V2_I]] to <16 x i8>
-// CHECK-A64-NEXT:    ret <8 x bfloat> [[VCVTQ_HIGH_BF16_V2_I]]
+// CHECK-A64-NEXT:    [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
+// CHECK-A64-NEXT:    [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
+// CHECK-A64-NEXT:    ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A32-HARDFP-NEXT:  entry:
@@ -358,45 +358,45 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
 // CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[INACTIVE_COERCE:%.*]], ptr [[INACTIVE]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[INACTIVE1:%.*]] = load <8 x bfloat>, ptr [[INACTIVE]], align 8
 // CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[INACTIVE1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP2]], ptr [[__P0_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
+// CHECK-A32-SOFTFP-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
 // CHECK-A32-SOFTFP-NEXT:    [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP5]], ptr [[RETVAL_I8]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = load <2 x i32>, ptr [[RETVAL_I8]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP7]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP2]], ptr [[RETVAL_I8]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP3:%.*]] = load <2 x i32>, ptr [[RETVAL_I8]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP3]], ptr [[COERCE_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP4:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[__P01_I]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP11]], ptr [[__P0_I4]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr [[COERCE2_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP5]], ptr [[__P0_I4]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P01_I7:%.*]] = load <8 x bfloat>, ptr [[__P0_I4]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I7]], <8 x bfloat> [[__P01_I7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I3]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP14:%.*]] = load <2 x i32>, ptr [[RETVAL_I3]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP14]], ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP16:%.*]] = load <4 x bfloat>, ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP9]], ptr [[COERCE5_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP18:%.*]] = load <2 x i32>, ptr [[COERCE5_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP16]], ptr [[COERCE6_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP20:%.*]] = load <2 x i32>, ptr [[COERCE6_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP18]], ptr [[__P0_I12]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[RETVAL_I3]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP6]], ptr [[COERCE4_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP7:%.*]] = load <4 x bfloat>, ptr [[COERCE4_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP4]], ptr [[COERCE5_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP8:%.*]] = load <2 x i32>, ptr [[COERCE5_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x bfloat> [[TMP7]], ptr [[COERCE6_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP9:%.*]] = load <2 x i32>, ptr [[COERCE6_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP8]], ptr [[__P0_I12]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P01_I16:%.*]] = load <4 x bfloat>, ptr [[__P0_I12]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP20]], ptr [[__P1_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <2 x i32> [[TMP9]], ptr [[__P1_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8
 // CHECK-A32-SOFTFP-NEXT:    [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[__P01_I16]], <4 x bfloat> [[__P12_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[SHUFFLE_I17]], ptr [[RETVAL_I11]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP24:%.*]] = load <4 x i32>, ptr [[RETVAL_I11]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP24]], ptr [[COERCE8_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP26:%.*]] = load <8 x bfloat>, ptr [[COERCE8_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[TMP26]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP28:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP28]], ptr [[COERCE2]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP30:%.*]] = load <8 x bfloat>, ptr [[COERCE2]], align 8
-// CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[TMP30]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT:    [[TMP32:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP32]]
+// CHECK-A32-SOFTFP-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr [[RETVAL_I11]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP10]], ptr [[COERCE8_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP11:%.*]] = load <8 x bfloat>, ptr [[COERCE8_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[TMP11]], ptr [[RETVAL_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <4 x i32> [[TMP12]], ptr [[COERCE2]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP13:%.*]] = load <8 x bfloat>, ptr [[COERCE2]], align 8
+// CHECK-A32-SOFTFP-NEXT:    store <8 x bfloat> [[TMP13]], ptr [[RETVAL]], align 8
+// CHECK-A32-SOFTFP-NEXT:    [[TMP14:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
+// CHECK-A32-SOFTFP-NEXT:    ret <4 x i32> [[TMP14]]
 //
 bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
   return vcvtq_high_bf16_f32(inactive, a);

diff  --git a/clang/test/CodeGen/variadic-nvptx.c b/clang/test/CodeGen/variadic-nvptx.c
index dd7cba552580fd..4e4fc5ecdef65e 100644
--- a/clang/test/CodeGen/variadic-nvptx.c
+++ b/clang/test/CodeGen/variadic-nvptx.c
@@ -30,7 +30,7 @@ extern void varargs_simple(int, ...);
 // CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[F]], align 4
 // CHECK-NEXT:    [[CONV2:%.*]] = fpext float [[TMP4]] to double
 // CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr [[D]], align 8
-// CHECK-NEXT:    call void (i32, ...) @varargs_simple(i32 noundef 0, i32 noundef [[CONV]], i32 noundef [[CONV1]], i32 noundef [[TMP2]], i64 noundef [[TMP3]], double noundef [[CONV2]], double noundef [[TMP5]])
+// CHECK-NEXT:    call void (i32, ...) @varargs_simple(i32 noundef 0, i32 noundef [[CONV]], i32 noundef [[CONV1]], i32 noundef [[TMP2]], i64 noundef [[TMP3]], double noundef [[CONV2]], double noundef [[TMP5]]) #[[ATTR3:[0-9]+]]
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 @__const.foo.a, i64 12, i1 false)
 // CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4
@@ -38,10 +38,10 @@ extern void varargs_simple(int, ...);
 // CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 4
 // CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[A]], i32 0, i32 2
 // CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4
-// CHECK-NEXT:    call void (i32, ...) @varargs_simple(i32 noundef 0, i32 [[TMP7]], i8 [[TMP9]], i32 [[TMP11]])
+// CHECK-NEXT:    call void (i32, ...) @varargs_simple(i32 noundef 0, i32 [[TMP7]], i8 [[TMP9]], i32 [[TMP11]]) #[[ATTR3]]
 // CHECK-NEXT:    store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, ptr [[V]], align 16
 // CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr [[V]], align 16
-// CHECK-NEXT:    call void (i32, ...) @varargs_simple(i32 noundef 0, <4 x i32> noundef [[TMP12]])
+// CHECK-NEXT:    call void (i32, ...) @varargs_simple(i32 noundef 0, <4 x i32> noundef [[TMP12]]) #[[ATTR3]]
 // CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 0
 // CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP13]], align 1
 // CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 1
@@ -54,7 +54,7 @@ extern void varargs_simple(int, ...);
 // CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[TMP21]], align 1
 // CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0]], ptr [[T]], i32 0, i32 1
 // CHECK-NEXT:    [[TMP24:%.*]] = load i8, ptr [[TMP23]], align 1
-// CHECK-NEXT:    call void (i32, ...) @varargs_simple(i32 noundef 0, i8 [[TMP14]], i8 [[TMP16]], i8 [[TMP18]], i8 [[TMP20]], i32 noundef 0, i8 [[TMP22]], i8 [[TMP24]])
+// CHECK-NEXT:    call void (i32, ...) @varargs_simple(i32 noundef 0, i8 [[TMP14]], i8 [[TMP16]], i8 [[TMP18]], i8 [[TMP20]], i32 noundef 0, i8 [[TMP22]], i8 [[TMP24]]) #[[ATTR3]]
 // CHECK-NEXT:    ret void
 //
 void foo() {
@@ -85,7 +85,7 @@ extern void varargs_complex(S, S, ...);
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 8
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[S]], ptr align 8 @__const.bar.s, i64 16, i1 false)
-// CHECK-NEXT:    call void (ptr, ptr, ...) @varargs_complex(ptr noundef byval([[STRUCT_S]]) align 8 [[S]], ptr noundef byval([[STRUCT_S]]) align 8 [[S]], i32 noundef 1, i64 noundef 1, double noundef 1.000000e+00)
+// CHECK-NEXT:    call void (ptr, ptr, ...) @varargs_complex(ptr noundef byval([[STRUCT_S]]) align 8 [[S]], ptr noundef byval([[STRUCT_S]]) align 8 [[S]], i32 noundef 1, i64 noundef 1, double noundef 1.000000e+00) #[[ATTR3]]
 // CHECK-NEXT:    ret void
 //
 void bar() {
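
The clang CodeGen diffs above consist of regenerated value names and call-site
attribute-group references, i.e. the kind of churn produced by re-running the
check updater rather than by hand-editing CHECK lines. The commit does not
record the exact command, so the following Python sketch is illustrative only:
the checkout root, the build/bin location and the --llvm-bin flag spelling are
assumptions to confirm against llvm/utils/update_cc_test_checks.py --help.

#!/usr/bin/env python3
# Hedged sketch: regenerate autoupdated CHECK lines for a clang CodeGen test.
# All paths below are assumptions about a local tree, not taken from the commit.
import subprocess
from pathlib import Path

ROOT = Path(".").resolve()          # assumed llvm-project checkout root
LLVM_BIN = ROOT / "build" / "bin"   # assumed directory holding the built clang

subprocess.run(
    [
        str(ROOT / "llvm" / "utils" / "update_cc_test_checks.py"),
        "--llvm-bin", str(LLVM_BIN),  # tells the script where to find clang
        str(ROOT / "clang" / "test" / "CodeGen" / "variadic-nvptx.c"),
    ],
    check=True,  # fail loudly if the updater errors out
)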

diff  --git a/llvm/test/Analysis/CostModel/SystemZ/divrem-pow2.ll b/llvm/test/Analysis/CostModel/SystemZ/divrem-pow2.ll
index b30245fe9d305e..b43a46c882805f 100644
--- a/llvm/test/Analysis/CostModel/SystemZ/divrem-pow2.ll
+++ b/llvm/test/Analysis/CostModel/SystemZ/divrem-pow2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 \
 ; RUN:  | FileCheck %s -check-prefix=COST
 
@@ -9,375 +10,555 @@
 ; Scalar sdiv
 
 define i64 @fun0(i64 %a) {
+; COST-LABEL: 'fun0'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i64 %a, 2
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
   %r = sdiv i64 %a, 2
   ret i64 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, 2
 }
 
 define i64 @fun1(i64 %a) {
+; COST-LABEL: 'fun1'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i64 %a, -4
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
   %r = sdiv i64 %a, -4
   ret i64 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, -4
 }
 
 define i32 @fun2(i32 %a) {
+; COST-LABEL: 'fun2'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i32 %a, 8
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
   %r = sdiv i32 %a, 8
   ret i32 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, 8
 }
 
 define i32 @fun3(i32 %a) {
+; COST-LABEL: 'fun3'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i32 %a, -16
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
   %r = sdiv i32 %a, -16
   ret i32 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, -16
 }
 
 define i16 @fun4(i16 %a) {
+; COST-LABEL: 'fun4'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i16 %a, 32
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r
+;
   %r = sdiv i16 %a, 32
   ret i16 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, 32
 }
 
 define i16 @fun5(i16 %a) {
+; COST-LABEL: 'fun5'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i16 %a, -64
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r
+;
   %r = sdiv i16 %a, -64
   ret i16 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, -64
 }
 
 define i8 @fun6(i8 %a) {
+; COST-LABEL: 'fun6'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i8 %a, 64
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r
+;
   %r = sdiv i8 %a, 64
   ret i8 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, 64
 }
 
 define i8 @fun7(i8 %a) {
+; COST-LABEL: 'fun7'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv i8 %a, -128
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r
+;
   %r = sdiv i8 %a, -128
   ret i8 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, -128
 }
 
 ; Vector sdiv
 
 define <2 x i64> @fun8(<2 x i64> %a) {
+; COST-LABEL: 'fun8'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r
+;
   %r = sdiv <2 x i64> %a, <i64 2, i64 2>
   ret <2 x i64> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 2, i64 2>
 }
 
 define <2 x i64> @fun9(<2 x i64> %a) {
+; COST-LABEL: 'fun9'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r
+;
   %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
   ret <2 x i64> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
 }
 
 define <4 x i32> @fun10(<4 x i32> %a) {
+; COST-LABEL: 'fun10'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r
+;
   %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
   ret <4 x i32> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
 }
 
 define <4 x i32> @fun11(<4 x i32> %a) {
+; COST-LABEL: 'fun11'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r
+;
   %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
   ret <4 x i32> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 -16
 }
 
 define <2 x i32> @fun12(<2 x i32> %a) {
+; COST-LABEL: 'fun12'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <2 x i32> %a, <i32 -16, i32 -16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r
+;
   %r = sdiv <2 x i32> %a, <i32 -16, i32 -16>
   ret <2 x i32> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i32> %a, <i32 -16
 }
 
 define <8 x i16> @fun13(<8 x i16> %a) {
+; COST-LABEL: 'fun13'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r
+;
   %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
   ret <8 x i16> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 32
 }
 
 define <8 x i16> @fun14(<8 x i16> %a) {
+; COST-LABEL: 'fun14'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r
+;
   %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
   ret <8 x i16> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 -64
 }
 
 define <4 x i16> @fun15(<4 x i16> %a) {
+; COST-LABEL: 'fun15'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %r
+;
   %r = sdiv <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
   ret <4 x i16> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i16> %a, <i16 32
 }
 
 define <16 x i8> @fun16(<16 x i8> %a) {
+; COST-LABEL: 'fun16'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r
+;
   %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
   ret <16 x i8> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 64
 }
 
 define <16 x i8> @fun17(<16 x i8> %a) {
+; COST-LABEL: 'fun17'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r
+;
   %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
   ret <16 x i8> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 -128
 }
 
 define <8 x i8> @fun18(<8 x i8> %a) {
+; COST-LABEL: 'fun18'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = sdiv <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %r
+;
   %r = sdiv <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
   ret <8 x i8> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i8> %a, <i8 -128
 }
 
 ; Scalar udiv
 
 define i64 @fun19(i64 %a) {
+; COST-LABEL: 'fun19'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv i64 %a, 2
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
   %r = udiv i64 %a, 2
   ret i64 %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i64 %a, 2
 }
 
 define i32 @fun20(i32 %a) {
+; COST-LABEL: 'fun20'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv i32 %a, 8
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
   %r = udiv i32 %a, 8
   ret i32 %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i32 %a, 8
 }
 
 define i16 @fun21(i16 %a) {
+; COST-LABEL: 'fun21'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv i16 %a, 32
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r
+;
   %r = udiv i16 %a, 32
   ret i16 %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i16 %a, 32
 }
 
 define i8 @fun22(i8 %a) {
+; COST-LABEL: 'fun22'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv i8 %a, -128
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r
+;
   %r = udiv i8 %a, 128
   ret i8 %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i8 %a, -128
 }
 
 ; Vector udiv
 
 define <2 x i64> @fun23(<2 x i64> %a) {
+; COST-LABEL: 'fun23'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <2 x i64> %a, <i64 2, i64 2>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r
+;
   %r = udiv <2 x i64> %a, <i64 2, i64 2>
   ret <2 x i64> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <2 x i64> %a, <i64 2
 }
 
 define <4 x i32> @fun24(<4 x i32> %a) {
+; COST-LABEL: 'fun24'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r
+;
   %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
   ret <4 x i32> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <4 x i32> %a, <i32 8
 }
 
 define <2 x i32> @fun25(<2 x i32> %a) {
+; COST-LABEL: 'fun25'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <2 x i32> %a, <i32 8, i32 8>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r
+;
   %r = udiv <2 x i32> %a, <i32 8, i32 8>
   ret <2 x i32> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <2 x i32> %a, <i32 8
 }
 
 define <8 x i16> @fun26(<8 x i16> %a) {
+; COST-LABEL: 'fun26'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r
+;
   %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
   ret <8 x i16> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <8 x i16> %a, <i16 32
 }
 
 define <4 x i16> @fun27(<4 x i16> %a) {
+; COST-LABEL: 'fun27'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %r
+;
   %r = udiv <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
   ret <4 x i16> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <4 x i16> %a, <i16 32
 }
 
 define <16 x i8> @fun28(<16 x i8> %a) {
+; COST-LABEL: 'fun28'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r
+;
   %r = udiv <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
   ret <16 x i8> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <16 x i8> %a, <i8 -128
 }
 
 define <8 x i8> @fun29(<8 x i8> %a) {
+; COST-LABEL: 'fun29'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = udiv <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %r
+;
   %r = udiv <8 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
   ret <8 x i8> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <8 x i8> %a, <i8 -128
 }
 
 ; Scalar srem
 
 define i64 @fun30(i64 %a) {
+; COST-LABEL: 'fun30'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem i64 %a, 2
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
   %r = srem i64 %a, 2
   ret i64 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i64 %a, 2
 }
 
 define i64 @fun31(i64 %a) {
+; COST-LABEL: 'fun31'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem i64 %a, -4
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
   %r = srem i64 %a, -4
   ret i64 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i64 %a, -4
 }
 
 define i32 @fun32(i32 %a) {
+; COST-LABEL: 'fun32'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem i32 %a, 8
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
   %r = srem i32 %a, 8
   ret i32 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i32 %a, 8
 }
 
 define i32 @fun33(i32 %a) {
+; COST-LABEL: 'fun33'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem i32 %a, -16
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
   %r = srem i32 %a, -16
   ret i32 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i32 %a, -16
 }
 
 define i16 @fun34(i16 %a) {
+; COST-LABEL: 'fun34'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem i16 %a, 32
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r
+;
   %r = srem i16 %a, 32
   ret i16 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i16 %a, 32
 }
 
 define i16 @fun35(i16 %a) {
+; COST-LABEL: 'fun35'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem i16 %a, -64
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r
+;
   %r = srem i16 %a, -64
   ret i16 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i16 %a, -64
 }
 
 define i8 @fun36(i8 %a) {
+; COST-LABEL: 'fun36'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem i8 %a, 64
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r
+;
   %r = srem i8 %a, 64
   ret i8 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i8 %a, 64
 }
 
 define i8 @fun37(i8 %a) {
+; COST-LABEL: 'fun37'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem i8 %a, -128
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r
+;
   %r = srem i8 %a, -128
   ret i8 %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i8 %a, -128
 }
 
 ; Vector srem
 
 define <2 x i64> @fun38(<2 x i64> %a) {
+; COST-LABEL: 'fun38'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <2 x i64> %a, <i64 2, i64 2>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r
+;
   %r = srem <2 x i64> %a, <i64 2, i64 2>
   ret <2 x i64> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i64> %a, <i64 2, i64 2>
 }
 
 define <2 x i64> @fun39(<2 x i64> %a) {
+; COST-LABEL: 'fun39'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <2 x i64> %a, <i64 -4, i64 -4>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r
+;
   %r = srem <2 x i64> %a, <i64 -4, i64 -4>
   ret <2 x i64> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i64> %a, <i64 -4, i64 -4>
 }
 
 define <4 x i32> @fun40(<4 x i32> %a) {
+; COST-LABEL: 'fun40'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r
+;
   %r = srem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
   ret <4 x i32> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
 }
 
 define <4 x i32> @fun41(<4 x i32> %a) {
+; COST-LABEL: 'fun41'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r
+;
   %r = srem <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
   ret <4 x i32> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i32> %a, <i32 -16
 }
 
 define <2 x i32> @fun42(<2 x i32> %a) {
+; COST-LABEL: 'fun42'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <2 x i32> %a, <i32 -16, i32 -16>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r
+;
   %r = srem <2 x i32> %a, <i32 -16, i32 -16>
   ret <2 x i32> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i32> %a, <i32 -16
 }
 
 define <8 x i16> @fun43(<8 x i16> %a) {
+; COST-LABEL: 'fun43'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r
+;
   %r = srem <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
   ret <8 x i16> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i16> %a, <i16 32
 }
 
 define <8 x i16> @fun44(<8 x i16> %a) {
+; COST-LABEL: 'fun44'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r
+;
   %r = srem <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
   ret <8 x i16> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i16> %a, <i16 -64
 }
 
 define <4 x i16> @fun45(<4 x i16> %a) {
+; COST-LABEL: 'fun45'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %r
+;
   %r = srem <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
   ret <4 x i16> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i16> %a, <i16 32
 }
 
 define <16 x i8> @fun46(<16 x i8> %a) {
+; COST-LABEL: 'fun46'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r
+;
   %r = srem <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
   ret <16 x i8> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <16 x i8> %a, <i8 64
 }
 
 define <16 x i8> @fun47(<16 x i8> %a) {
+; COST-LABEL: 'fun47'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r
+;
   %r = srem <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
   ret <16 x i8> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <16 x i8> %a, <i8 -128
 }
 
 define <8 x i8> @fun48(<8 x i8> %a) {
+; COST-LABEL: 'fun48'
+; COST-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %r = srem <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %r
+;
   %r = srem <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
   ret <8 x i8> %r
-; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i8> %a, <i8 -128
 }
 
 ; Scalar urem
 
 define i64 @fun49(i64 %a) {
+; COST-LABEL: 'fun49'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem i64 %a, 2
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %r
+;
   %r = urem i64 %a, 2
   ret i64 %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i64 %a, 2
 }
 
 define i32 @fun50(i32 %a) {
+; COST-LABEL: 'fun50'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem i32 %a, 8
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %r
+;
   %r = urem i32 %a, 8
   ret i32 %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i32 %a, 8
 }
 
 define i16 @fun51(i16 %a) {
+; COST-LABEL: 'fun51'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem i16 %a, 32
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i16 %r
+;
   %r = urem i16 %a, 32
   ret i16 %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i16 %a, 32
 }
 
 define i8 @fun52(i8 %a) {
+; COST-LABEL: 'fun52'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem i8 %a, -128
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i8 %r
+;
   %r = urem i8 %a, 128
   ret i8 %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i8 %a, -128
 }
 
 ; Vector urem
 
 define <2 x i64> @fun53(<2 x i64> %a) {
+; COST-LABEL: 'fun53'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem <2 x i64> %a, <i64 2, i64 2>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %r
+;
   %r = urem <2 x i64> %a, <i64 2, i64 2>
   ret <2 x i64> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <2 x i64> %a, <i64 2
 }
 
 define <4 x i32> @fun54(<4 x i32> %a) {
+; COST-LABEL: 'fun54'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %r
+;
   %r = urem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
   ret <4 x i32> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <4 x i32> %a, <i32 8
 }
 
 define <2 x i32> @fun55(<2 x i32> %a) {
+; COST-LABEL: 'fun55'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem <2 x i32> %a, <i32 8, i32 8>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %r
+;
   %r = urem <2 x i32> %a, <i32 8, i32 8>
   ret <2 x i32> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <2 x i32> %a, <i32 8
 }
 
 define <8 x i16> @fun56(<8 x i16> %a) {
+; COST-LABEL: 'fun56'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %r
+;
   %r = urem <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
   ret <8 x i16> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <8 x i16> %a, <i16 32
 }
 
 define <4 x i16> @fun57(<4 x i16> %a) {
+; COST-LABEL: 'fun57'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %r
+;
   %r = urem <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
   ret <4 x i16> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <4 x i16> %a, <i16 32
 }
 
 define <16 x i8> @fun58(<16 x i8> %a) {
+; COST-LABEL: 'fun58'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %r
+;
   %r = urem <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
   ret <16 x i8> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <16 x i8> %a, <i8 -128
 }
 
 define <8 x i8> @fun59(<8 x i8> %a) {
+; COST-LABEL: 'fun59'
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %r = urem <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; COST-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %r
+;
   %r = urem <8 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
   ret <8 x i8> %r
-; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <8 x i8> %a, <i8 -128
 }
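
The cost-model test above now carries a NOTE line naming
utils/update_analyze_test_checks.py, and the CodeGenPrepare test that follows
gets the matching header for utils/update_test_checks.py. A hedged sketch of
how such checks are typically regenerated is below; the paths and the
--opt-binary flag spelling are assumptions to verify against each script's
--help rather than details taken from this commit.

#!/usr/bin/env python3
# Hedged sketch: re-run the LLVM check-update scripts for the two opt-driven
# tests in this patch. All paths are assumptions about a local build tree.
import subprocess
from pathlib import Path

ROOT = Path(".").resolve()            # assumed llvm-project checkout root
OPT = ROOT / "build" / "bin" / "opt"  # assumed path to the built opt binary

# print<cost-model> output is handled by update_analyze_test_checks.py.
subprocess.run(
    [
        str(ROOT / "llvm" / "utils" / "update_analyze_test_checks.py"),
        "--opt-binary", str(OPT),
        str(ROOT / "llvm" / "test" / "Analysis" / "CostModel" / "SystemZ" / "divrem-pow2.ll"),
    ],
    check=True,
)

# Ordinary opt transform tests are handled by update_test_checks.py.
subprocess.run(
    [
        str(ROOT / "llvm" / "utils" / "update_test_checks.py"),
        "--opt-binary", str(OPT),
        str(ROOT / "llvm" / "test" / "CodeGen" / "AArch64" / "arm64-codegen-prepare-extload.ll"),
    ],
    check=True,
)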

diff  --git a/llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll b/llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
index e9235c04048046..840ed6a3c40d68 100644
--- a/llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
@@ -1,16 +1,24 @@
-; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS
-; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S -stress-cgp-ext-ld-promotion | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS
-; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S -disable-cgp-ext-ld-promotion | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefix=DISABLE
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S | FileCheck -enable-var-scope %s --check-prefix=OPTALL --check-prefixes=OPT,NONSTRESS
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S -stress-cgp-ext-ld-promotion | FileCheck -enable-var-scope %s --check-prefixes=OPTALL,OPT,STRESS
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' < %s -mtriple=aarch64-apple-ios -S -disable-cgp-ext-ld-promotion | FileCheck -enable-var-scope %s --check-prefixes=OPTALL,DISABLE
 
 ; CodeGenPrepare should move the zext into the block with the load
 ; so that SelectionDAG can select it with the load.
-;
-; OPTALL-LABEL: @foo
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-; OPTALL-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; OPTALL: store i32 [[ZEXT]], ptr %q
-; OPTALL: ret
 define void @foo(ptr %p, ptr %q) {
+; OPTALL-LABEL: define void @foo(
+; OPTALL-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; OPTALL-NEXT:  [[ENTRY:.*:]]
+; OPTALL-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; OPTALL-NEXT:    [[S:%.*]] = zext i8 [[T]] to i32
+; OPTALL-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; OPTALL-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; OPTALL:       [[TRUE]]:
+; OPTALL-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; OPTALL-NEXT:    ret void
+; OPTALL:       [[FALSE]]:
+; OPTALL-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %a = icmp slt i8 %t, 20
@@ -25,16 +33,36 @@ false:
 
 ; Check that we manage to form a zextload when an operation with only one
 ; argument to explicitly extend is in the way.
-; OPTALL-LABEL: @promoteOneArg
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-; OPT-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT]], 2
 ; Make sure the operation is not promoted when the promotion pass is disabled.
-; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], 2
-; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
-; OPTALL: store i32 [[RES]], ptr %q
-; OPTALL: ret
 define void @promoteOneArg(ptr %p, ptr %q) {
+; OPT-LABEL: define void @promoteOneArg(
+; OPT-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; OPT-NEXT:  [[ENTRY:.*:]]
+; OPT-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; OPT-NEXT:    [[PROMOTED:%.*]] = zext i8 [[T]] to i32
+; OPT-NEXT:    [[ADD:%.*]] = add nuw i32 [[PROMOTED]], 2
+; OPT-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; OPT-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; OPT:       [[TRUE]]:
+; OPT-NEXT:    store i32 [[ADD]], ptr [[Q]], align 4
+; OPT-NEXT:    ret void
+; OPT:       [[FALSE]]:
+; OPT-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteOneArg(
+; DISABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ADD:%.*]] = add nuw i8 [[T]], 2
+; DISABLE-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; DISABLE-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; DISABLE:       [[TRUE]]:
+; DISABLE-NEXT:    [[S:%.*]] = zext i8 [[ADD]] to i32
+; DISABLE-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; DISABLE-NEXT:    ret void
+; DISABLE:       [[FALSE]]:
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %add = add nuw i8 %t, 2
@@ -51,15 +79,35 @@ false:
 ; Check that we manage to form a sextload when an operation with only one
 ; argument to explicitly extend is in the way.
 ; Version with sext.
-; OPTALL-LABEL: @promoteOneArgSExt
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
-; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXT]], 2
-; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], 2
-; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
-; OPTALL: store i32 [[RES]], ptr %q
-; OPTALL: ret
 define void @promoteOneArgSExt(ptr %p, ptr %q) {
+; OPT-LABEL: define void @promoteOneArgSExt(
+; OPT-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; OPT-NEXT:  [[ENTRY:.*:]]
+; OPT-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; OPT-NEXT:    [[PROMOTED:%.*]] = sext i8 [[T]] to i32
+; OPT-NEXT:    [[ADD:%.*]] = add nsw i32 [[PROMOTED]], 2
+; OPT-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; OPT-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; OPT:       [[TRUE]]:
+; OPT-NEXT:    store i32 [[ADD]], ptr [[Q]], align 4
+; OPT-NEXT:    ret void
+; OPT:       [[FALSE]]:
+; OPT-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteOneArgSExt(
+; DISABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i8 [[T]], 2
+; DISABLE-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; DISABLE-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; DISABLE:       [[TRUE]]:
+; DISABLE-NEXT:    [[S:%.*]] = sext i8 [[ADD]] to i32
+; DISABLE-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; DISABLE-NEXT:    ret void
+; DISABLE:       [[FALSE]]:
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %add = add nsw i8 %t, 2
@@ -82,23 +130,50 @@ false:
 ; #2 may not be merged with the load because %t is used in a comparison.
 ; Since two extensions may be emitted in the end instead of one before the
 ; transformation, the regular heuristic does not apply the optimization.
-;
-; OPTALL-LABEL: @promoteTwoArgZext
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
-; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
-;
-; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
-; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
-;
-; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
-; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
-;
-; OPTALL: store i32 [[RES]], ptr %q
-; OPTALL: ret
 define void @promoteTwoArgZext(ptr %p, ptr %q, i8 %b) {
+; NONSTRESS-LABEL: define void @promoteTwoArgZext(
+; NONSTRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nuw i8 [[T]], [[B]]
+; NONSTRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; NONSTRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; NONSTRESS:       [[TRUE]]:
+; NONSTRESS-NEXT:    [[S:%.*]] = zext i8 [[ADD]] to i32
+; NONSTRESS-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; NONSTRESS-NEXT:    ret void
+; NONSTRESS:       [[FALSE]]:
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @promoteTwoArgZext(
+; STRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED:%.*]] = zext i8 [[T]] to i32
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = zext i8 [[B]] to i32
+; STRESS-NEXT:    [[ADD:%.*]] = add nuw i32 [[PROMOTED]], [[PROMOTED1]]
+; STRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; STRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; STRESS:       [[TRUE]]:
+; STRESS-NEXT:    store i32 [[ADD]], ptr [[Q]], align 4
+; STRESS-NEXT:    ret void
+; STRESS:       [[FALSE]]:
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteTwoArgZext(
+; DISABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ADD:%.*]] = add nuw i8 [[T]], [[B]]
+; DISABLE-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; DISABLE-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; DISABLE:       [[TRUE]]:
+; DISABLE-NEXT:    [[S:%.*]] = zext i8 [[ADD]] to i32
+; DISABLE-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; DISABLE-NEXT:    ret void
+; DISABLE:       [[FALSE]]:
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %add = add nuw i8 %t, %b
@@ -115,21 +190,50 @@ false:
 ; Check that we manage to form a sextload when an operation with two
 ; arguments to explicitly extend is in the way.
 ; Version with sext.
-; OPTALL-LABEL: @promoteTwoArgSExt
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[SEXTLD:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
-; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i8 %b to i32
-; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXTLD]], [[SEXTB]]
-;
-; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
-; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
-;
-; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
-; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
-; OPTALL: store i32 [[RES]], ptr %q
-; OPTALL: ret
 define void @promoteTwoArgSExt(ptr %p, ptr %q, i8 %b) {
+; NONSTRESS-LABEL: define void @promoteTwoArgSExt(
+; NONSTRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nsw i8 [[T]], [[B]]
+; NONSTRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; NONSTRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; NONSTRESS:       [[TRUE]]:
+; NONSTRESS-NEXT:    [[S:%.*]] = sext i8 [[ADD]] to i32
+; NONSTRESS-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; NONSTRESS-NEXT:    ret void
+; NONSTRESS:       [[FALSE]]:
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @promoteTwoArgSExt(
+; STRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED:%.*]] = sext i8 [[T]] to i32
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = sext i8 [[B]] to i32
+; STRESS-NEXT:    [[ADD:%.*]] = add nsw i32 [[PROMOTED]], [[PROMOTED1]]
+; STRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; STRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; STRESS:       [[TRUE]]:
+; STRESS-NEXT:    store i32 [[ADD]], ptr [[Q]], align 4
+; STRESS-NEXT:    ret void
+; STRESS:       [[FALSE]]:
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteTwoArgSExt(
+; DISABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i8 [[T]], [[B]]
+; DISABLE-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; DISABLE-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; DISABLE:       [[TRUE]]:
+; DISABLE-NEXT:    [[S:%.*]] = sext i8 [[ADD]] to i32
+; DISABLE-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; DISABLE-NEXT:    ret void
+; DISABLE:       [[FALSE]]:
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %add = add nsw i8 %t, %b
@@ -145,26 +249,54 @@ false:
 
 ; Check that we do not form a zextload if we need to introduce more than
 ; one additional extension.
-; OPTALL-LABEL: @promoteThreeArgZext
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
-; STRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
-; STRESS-NEXT: [[ZEXTC:%[a-zA-Z_0-9-]+]] = zext i8 %c to i32
-; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[TMP]], [[ZEXTC]]
-;
-; NONSTRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
-; NONSTRESS-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[TMP]], %c
-; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
-;
-; DISABLE: add nuw i8
-; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8
-; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
-;
-; OPTALL: store i32 [[RES]], ptr %q
-; OPTALL: ret
 define void @promoteThreeArgZext(ptr %p, ptr %q, i8 %b, i8 %c) {
+; NONSTRESS-LABEL: define void @promoteThreeArgZext(
+; NONSTRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]], i8 [[C:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[TMP:%.*]] = add nuw i8 [[T]], [[B]]
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nuw i8 [[TMP]], [[C]]
+; NONSTRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; NONSTRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; NONSTRESS:       [[TRUE]]:
+; NONSTRESS-NEXT:    [[S:%.*]] = zext i8 [[ADD]] to i32
+; NONSTRESS-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; NONSTRESS-NEXT:    ret void
+; NONSTRESS:       [[FALSE]]:
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @promoteThreeArgZext(
+; STRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]], i8 [[C:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[T]] to i32
+; STRESS-NEXT:    [[PROMOTED3:%.*]] = zext i8 [[B]] to i32
+; STRESS-NEXT:    [[TMP:%.*]] = add nuw i32 [[PROMOTED2]], [[PROMOTED3]]
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = zext i8 [[C]] to i32
+; STRESS-NEXT:    [[ADD:%.*]] = add nuw i32 [[TMP]], [[PROMOTED1]]
+; STRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; STRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; STRESS:       [[TRUE]]:
+; STRESS-NEXT:    store i32 [[ADD]], ptr [[Q]], align 4
+; STRESS-NEXT:    ret void
+; STRESS:       [[FALSE]]:
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteThreeArgZext(
+; DISABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i8 [[B:%.*]], i8 [[C:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[TMP:%.*]] = add nuw i8 [[T]], [[B]]
+; DISABLE-NEXT:    [[ADD:%.*]] = add nuw i8 [[TMP]], [[C]]
+; DISABLE-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; DISABLE-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; DISABLE:       [[TRUE]]:
+; DISABLE-NEXT:    [[S:%.*]] = zext i8 [[ADD]] to i32
+; DISABLE-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; DISABLE-NEXT:    ret void
+; DISABLE:       [[FALSE]]:
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %tmp = add nuw i8 %t, %b
@@ -181,24 +313,52 @@ false:
 
 ; Check that we manage to form a zextload after promoting and merging
 ; two extensions.
-; OPTALL-LABEL: @promoteMergeExtArgZExt
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i16 %b to i32
-; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
-;
-; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
-; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
-; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
-;
-; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
-; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
-; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
-;
-; OPTALL: store i32 [[RES]], ptr %q
-; OPTALL: ret
 define void @promoteMergeExtArgZExt(ptr %p, ptr %q, i16 %b) {
+; NONSTRESS-LABEL: define void @promoteMergeExtArgZExt(
+; NONSTRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i16 [[B:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[EXT:%.*]] = zext i8 [[T]] to i16
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nuw i16 [[EXT]], [[B]]
+; NONSTRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; NONSTRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; NONSTRESS:       [[TRUE]]:
+; NONSTRESS-NEXT:    [[S:%.*]] = zext i16 [[ADD]] to i32
+; NONSTRESS-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; NONSTRESS-NEXT:    ret void
+; NONSTRESS:       [[FALSE]]:
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @promoteMergeExtArgZExt(
+; STRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i16 [[B:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[T]] to i32
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = zext i16 [[B]] to i32
+; STRESS-NEXT:    [[ADD:%.*]] = add nuw i32 [[PROMOTED2]], [[PROMOTED1]]
+; STRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; STRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; STRESS:       [[TRUE]]:
+; STRESS-NEXT:    store i32 [[ADD]], ptr [[Q]], align 4
+; STRESS-NEXT:    ret void
+; STRESS:       [[FALSE]]:
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteMergeExtArgZExt(
+; DISABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i16 [[B:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[EXT:%.*]] = zext i8 [[T]] to i16
+; DISABLE-NEXT:    [[ADD:%.*]] = add nuw i16 [[EXT]], [[B]]
+; DISABLE-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; DISABLE-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; DISABLE:       [[TRUE]]:
+; DISABLE-NEXT:    [[S:%.*]] = zext i16 [[ADD]] to i32
+; DISABLE-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; DISABLE-NEXT:    ret void
+; DISABLE:       [[FALSE]]:
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %ext = zext i8 %t to i16
@@ -216,23 +376,52 @@ false:
 ; Check that we manage to form a sextload after promoting and merging
 ; two extensions.
 ; Version with sext.
-; OPTALL-LABEL: @promoteMergeExtArgSExt
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = sext i16 %b to i32
-; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXTLD]], [[ZEXTB]]
-;
-; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
-; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
-; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
-;
-; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
-; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
-; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
-; OPTALL: store i32 [[RES]], ptr %q
-; OPTALL: ret
 define void @promoteMergeExtArgSExt(ptr %p, ptr %q, i16 %b) {
+; NONSTRESS-LABEL: define void @promoteMergeExtArgSExt(
+; NONSTRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i16 [[B:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[EXT:%.*]] = zext i8 [[T]] to i16
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nsw i16 [[EXT]], [[B]]
+; NONSTRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; NONSTRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; NONSTRESS:       [[TRUE]]:
+; NONSTRESS-NEXT:    [[S:%.*]] = sext i16 [[ADD]] to i32
+; NONSTRESS-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; NONSTRESS-NEXT:    ret void
+; NONSTRESS:       [[FALSE]]:
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @promoteMergeExtArgSExt(
+; STRESS-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i16 [[B:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[T]] to i32
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = sext i16 [[B]] to i32
+; STRESS-NEXT:    [[ADD:%.*]] = add nsw i32 [[PROMOTED2]], [[PROMOTED1]]
+; STRESS-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; STRESS-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; STRESS:       [[TRUE]]:
+; STRESS-NEXT:    store i32 [[ADD]], ptr [[Q]], align 4
+; STRESS-NEXT:    ret void
+; STRESS:       [[FALSE]]:
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteMergeExtArgSExt(
+; DISABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i16 [[B:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[EXT:%.*]] = zext i8 [[T]] to i16
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i16 [[EXT]], [[B]]
+; DISABLE-NEXT:    [[A:%.*]] = icmp slt i8 [[T]], 20
+; DISABLE-NEXT:    br i1 [[A]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; DISABLE:       [[TRUE]]:
+; DISABLE-NEXT:    [[S:%.*]] = sext i16 [[ADD]] to i32
+; DISABLE-NEXT:    store i32 [[S]], ptr [[Q]], align 4
+; DISABLE-NEXT:    ret void
+; DISABLE:       [[FALSE]]:
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %ext = zext i8 %t to i16
@@ -275,30 +464,38 @@ false:
 ;       a zext of %ld.
 ; Currently we do not try to reuse existing extensions, so in the end we have
 ; 3 identical zext of %ld. The extensions will be CSE'ed by SDag.
-;
-; OPTALL-LABEL: @severalPromotions
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %addr1
-; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32, ptr %addr2
-; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64
-; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_3]]
-; OPT-NEXT: [[ZEXTLD1_4:%[a-zA-Z_0-9-]+]] = zext i8 %a to i64
-; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXTLD1_4]], [[ZEXTLD1_2]]
-; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
-; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_1]]
-;
-; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32
-; DISABLE: [[RES:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADD]] to i64
-; DISABLE: [[ADDZA:%[a-zA-Z_0-9-]+]] = add nsw i32
-; DISABLE: [[RESZA:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADDZA]] to i64
-; DISABLE: [[ADDB:%[a-zA-Z_0-9-]+]] = add nsw i32
-; DISABLE: [[RESB:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADDB]] to i64
-;
-; OPTALL: call void @dummy(i64 [[RES]], i64 [[RESZA]], i64 [[RESB]])
-; OPTALL: ret
 define void @severalPromotions(ptr %addr1, ptr %addr2, i8 %a, i32 %b) {
+; OPT-LABEL: define void @severalPromotions(
+; OPT-SAME: ptr [[ADDR1:%.*]], ptr [[ADDR2:%.*]], i8 [[A:%.*]], i32 [[B:%.*]]) {
+; OPT-NEXT:    [[LD:%.*]] = load i8, ptr [[ADDR1]], align 1
+; OPT-NEXT:    [[PROMOTED9:%.*]] = zext i8 [[LD]] to i64
+; OPT-NEXT:    [[PROMOTED6:%.*]] = zext i8 [[LD]] to i64
+; OPT-NEXT:    [[LD2:%.*]] = load i32, ptr [[ADDR2]], align 4
+; OPT-NEXT:    [[PROMOTED:%.*]] = sext i32 [[LD2]] to i64
+; OPT-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[LD]] to i64
+; OPT-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED]], [[PROMOTED2]]
+; OPT-NEXT:    [[PROMOTED5:%.*]] = zext i8 [[A]] to i64
+; OPT-NEXT:    [[ADDZA:%.*]] = add nsw i64 [[PROMOTED5]], [[PROMOTED6]]
+; OPT-NEXT:    [[PROMOTED7:%.*]] = sext i32 [[B]] to i64
+; OPT-NEXT:    [[ADDB:%.*]] = add nsw i64 [[PROMOTED7]], [[PROMOTED9]]
+; OPT-NEXT:    call void @dummy(i64 [[ADD]], i64 [[ADDZA]], i64 [[ADDB]])
+; OPT-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @severalPromotions(
+; DISABLE-SAME: ptr [[ADDR1:%.*]], ptr [[ADDR2:%.*]], i8 [[A:%.*]], i32 [[B:%.*]]) {
+; DISABLE-NEXT:    [[LD:%.*]] = load i8, ptr [[ADDR1]], align 1
+; DISABLE-NEXT:    [[ZEXTLD:%.*]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT:    [[LD2:%.*]] = load i32, ptr [[ADDR2]], align 4
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i32 [[LD2]], [[ZEXTLD]]
+; DISABLE-NEXT:    [[SEXTADD:%.*]] = sext i32 [[ADD]] to i64
+; DISABLE-NEXT:    [[ZEXTA:%.*]] = zext i8 [[A]] to i32
+; DISABLE-NEXT:    [[ADDZA:%.*]] = add nsw i32 [[ZEXTA]], [[ZEXTLD]]
+; DISABLE-NEXT:    [[SEXTADDZA:%.*]] = sext i32 [[ADDZA]] to i64
+; DISABLE-NEXT:    [[ADDB:%.*]] = add nsw i32 [[B]], [[ZEXTLD]]
+; DISABLE-NEXT:    [[SEXTADDB:%.*]] = sext i32 [[ADDB]] to i64
+; DISABLE-NEXT:    call void @dummy(i64 [[SEXTADD]], i64 [[SEXTADDZA]], i64 [[SEXTADDB]])
+; DISABLE-NEXT:    ret void
+;
   %ld = load i8, ptr %addr1
   %zextld = zext i8 %ld to i32
   %ld2 = load i32, ptr %addr2
@@ -317,11 +514,13 @@ declare void @dummy(i64, i64, i64)
 
 ; Make sure we do not try to promote vector types since the type promotion
 ; helper does not support them for now.
-; OPTALL-LABEL: @vectorPromotion
-; OPTALL: [[SHL:%[a-zA-Z_0-9-]+]] = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
-; OPTALL: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext <2 x i32> [[SHL]] to <2 x i64>
-; OPTALL: ret
 define void @vectorPromotion() {
+; OPTALL-LABEL: define void @vectorPromotion() {
+; OPTALL-NEXT:  [[ENTRY:.*:]]
+; OPTALL-NEXT:    [[A:%.*]] = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+; OPTALL-NEXT:    [[B:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+; OPTALL-NEXT:    ret void
+;
 entry:
   %a = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
   %b = zext <2 x i32> %a to <2 x i64>
@@ -334,21 +533,27 @@ entry:
 ; Make sure we support promotion of operands that produces a Value as opposed
 ; to an instruction.
 ; This used to cause a crash.
-; OPTALL-LABEL: @promotionOfArgEndsUpInValue
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i16, ptr %addr
-;
-; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i16 [[LD]] to i32
-; OPT-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp ne ptr getelementptr inbounds ([2 x i32], ptr @c, i64 0, i64 1), @a
-; OPT-NEXT: [[SEXT2:%[a-zA-Z_0-9-]+]] = zext i1 [[CMP]] to i32
-; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw nsw i32 [[SEXT]], [[SEXT2]]
-;
-; DISABLE-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp ne ptr getelementptr inbounds ([2 x i32], ptr @c, i64 0, i64 1), @a
-; DISABLE-NEXT: [[EXT:%[a-zA-Z_0-9-]+]] = zext i1 [[CMP]] to i16
-; DISABLE-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw nsw i16 [[LD]], [[EXT]]
-; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
-;
-; OPTALL-NEXT: ret i32 [[RES]]
 define i32 @promotionOfArgEndsUpInValue(ptr %addr) {
+; OPT-LABEL: define i32 @promotionOfArgEndsUpInValue(
+; OPT-SAME: ptr [[ADDR:%.*]]) {
+; OPT-NEXT:  [[ENTRY:.*:]]
+; OPT-NEXT:    [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2
+; OPT-NEXT:    [[PROMOTED:%.*]] = sext i16 [[VAL]] to i32
+; OPT-NEXT:    [[CMP:%.*]] = icmp ne ptr getelementptr inbounds ([2 x i32], ptr @c, i64 0, i64 1), @a
+; OPT-NEXT:    [[PROMOTED2:%.*]] = zext i1 [[CMP]] to i32
+; OPT-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[PROMOTED]], [[PROMOTED2]]
+; OPT-NEXT:    ret i32 [[ADD]]
+;
+; DISABLE-LABEL: define i32 @promotionOfArgEndsUpInValue(
+; DISABLE-SAME: ptr [[ADDR:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2
+; DISABLE-NEXT:    [[CMP:%.*]] = icmp ne ptr getelementptr inbounds ([2 x i32], ptr @c, i64 0, i64 1), @a
+; DISABLE-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i16
+; DISABLE-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[VAL]], [[EXT]]
+; DISABLE-NEXT:    [[CONV3:%.*]] = sext i16 [[ADD]] to i32
+; DISABLE-NEXT:    ret i32 [[CONV3]]
+;
 entry:
   %val = load i16, ptr %addr
   %cmp = icmp ne ptr getelementptr inbounds ([2 x i32], ptr @c, i64 0, i64 1), @a
@@ -359,25 +564,31 @@ entry:
 }
 
 ; Check that we see that one zext can be derived from the other for free.
-; OPTALL-LABEL: @promoteTwoArgZextWithSourceExtendedTwice
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; OPT-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
-; OPT-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], 12
-; OPT-NEXT: store i32 [[RES32]], ptr %addr
-; OPT-NEXT: store i64 [[RES64]], ptr %q
-;
-; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
-; DISABLE-NEXT: [[RES2_32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], 12
-; DISABLE-NEXT: store i32 [[RES32]], ptr %addr
-; DISABLE-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES2_32]] to i64
-; DISABLE-NEXT: store i64 [[ZEXT64]], ptr %q
-;
-; OPTALL-NEXT: ret void
 define void @promoteTwoArgZextWithSourceExtendedTwice(ptr %p, ptr %q, i32 %b, ptr %addr) {
+; OPT-LABEL: define void @promoteTwoArgZextWithSourceExtendedTwice(
+; OPT-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]]) {
+; OPT-NEXT:  [[ENTRY:.*:]]
+; OPT-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; OPT-NEXT:    [[PROMOTED1:%.*]] = zext i8 [[T]] to i64
+; OPT-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; OPT-NEXT:    [[ADD:%.*]] = add nuw i32 [[ZEXTT]], [[B]]
+; OPT-NEXT:    [[ADD2:%.*]] = add nuw i64 [[PROMOTED1]], 12
+; OPT-NEXT:    store i32 [[ADD]], ptr [[ADDR]], align 4
+; OPT-NEXT:    store i64 [[ADD2]], ptr [[Q]], align 8
+; OPT-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteTwoArgZextWithSourceExtendedTwice(
+; DISABLE-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; DISABLE-NEXT:    [[ADD:%.*]] = add nuw i32 [[ZEXTT]], [[B]]
+; DISABLE-NEXT:    [[ADD2:%.*]] = add nuw i32 [[ZEXTT]], 12
+; DISABLE-NEXT:    store i32 [[ADD]], ptr [[ADDR]], align 4
+; DISABLE-NEXT:    [[S:%.*]] = zext i32 [[ADD2]] to i64
+; DISABLE-NEXT:    store i64 [[S]], ptr [[Q]], align 8
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %zextt = zext i8 %t to i32
@@ -393,26 +604,41 @@ entry:
 ; The input has one free zext and one free sext. If we had promoted
 ; all the way through the load, we would end up with a free zext and a
 ; non-free sext (of %b).
-; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
-; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
-; STRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
-;
-; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, ptr %addr, i64 [[IDX64]]
-; OPTALL-NEXT: store i32 [[RES32]], ptr [[GEP]]
-; OPTALL-NEXT: ret void
 define void @doNotPromoteFreeSExtFromAddrMode(ptr %p, i32 %b, ptr %addr) {
+; NONSTRESS-LABEL: define void @doNotPromoteFreeSExtFromAddrMode(
+; NONSTRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; NONSTRESS-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; NONSTRESS-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i32, ptr [[ADDR]], i64 [[IDX64]]
+; NONSTRESS-NEXT:    store i32 [[ADD]], ptr [[STADDR]], align 4
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @doNotPromoteFreeSExtFromAddrMode(
+; STRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED3:%.*]] = zext i8 [[T]] to i64
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = sext i32 [[B]] to i64
+; STRESS-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED3]], [[PROMOTED2]]
+; STRESS-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i32
+; STRESS-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i32, ptr [[ADDR]], i64 [[ADD]]
+; STRESS-NEXT:    store i32 [[PROMOTED]], ptr [[STADDR]], align 4
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @doNotPromoteFreeSExtFromAddrMode(
+; DISABLE-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; DISABLE-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; DISABLE-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i32, ptr [[ADDR]], i64 [[IDX64]]
+; DISABLE-NEXT:    store i32 [[ADD]], ptr [[STADDR]], align 4
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %zextt = zext i8 %t to i32
@@ -427,25 +653,40 @@ entry:
 ; The input has one free zext and one free sext. If we had promoted
 ; all the way through the load, we would end up with a free zext and a
 ; non-free sext (of %b).
-; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode64
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
-; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
-;
-; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i64, ptr %addr, i64 [[IDX64]]
-; OPTALL-NEXT: store i64 %stuff, ptr [[GEP]]
-; OPTALL-NEXT: ret void
 define void @doNotPromoteFreeSExtFromAddrMode64(ptr %p, i32 %b, ptr %addr, i64 %stuff) {
+; NONSTRESS-LABEL: define void @doNotPromoteFreeSExtFromAddrMode64(
+; NONSTRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]], i64 [[STUFF:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; NONSTRESS-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; NONSTRESS-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i64, ptr [[ADDR]], i64 [[IDX64]]
+; NONSTRESS-NEXT:    store i64 [[STUFF]], ptr [[STADDR]], align 8
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @doNotPromoteFreeSExtFromAddrMode64(
+; STRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]], i64 [[STUFF:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[T]] to i64
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = sext i32 [[B]] to i64
+; STRESS-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], [[PROMOTED1]]
+; STRESS-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i64, ptr [[ADDR]], i64 [[ADD]]
+; STRESS-NEXT:    store i64 [[STUFF]], ptr [[STADDR]], align 8
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @doNotPromoteFreeSExtFromAddrMode64(
+; DISABLE-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]], i64 [[STUFF:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; DISABLE-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; DISABLE-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i64, ptr [[ADDR]], i64 [[IDX64]]
+; DISABLE-NEXT:    store i64 [[STUFF]], ptr [[STADDR]], align 8
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %zextt = zext i8 %t to i32
@@ -460,25 +701,40 @@ entry:
 ; The input has one free zext and one free sext. If we had promoted
 ; all the way through the load, we would end up with a free zext and a
 ; non-free sext (of %b).
-; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode128
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
-; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
-;
-; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i128, ptr %addr, i64 [[IDX64]]
-; OPTALL-NEXT: store i128 %stuff, ptr [[GEP]]
-; OPTALL-NEXT: ret void
 define void @doNotPromoteFreeSExtFromAddrMode128(ptr %p, i32 %b, ptr %addr, i128 %stuff) {
+; NONSTRESS-LABEL: define void @doNotPromoteFreeSExtFromAddrMode128(
+; NONSTRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]], i128 [[STUFF:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; NONSTRESS-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; NONSTRESS-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i128, ptr [[ADDR]], i64 [[IDX64]]
+; NONSTRESS-NEXT:    store i128 [[STUFF]], ptr [[STADDR]], align 16
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @doNotPromoteFreeSExtFromAddrMode128(
+; STRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]], i128 [[STUFF:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[T]] to i64
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = sext i32 [[B]] to i64
+; STRESS-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], [[PROMOTED1]]
+; STRESS-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i128, ptr [[ADDR]], i64 [[ADD]]
+; STRESS-NEXT:    store i128 [[STUFF]], ptr [[STADDR]], align 16
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @doNotPromoteFreeSExtFromAddrMode128(
+; DISABLE-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]], i128 [[STUFF:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; DISABLE-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; DISABLE-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i128, ptr [[ADDR]], i64 [[IDX64]]
+; DISABLE-NEXT:    store i128 [[STUFF]], ptr [[STADDR]], align 16
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %zextt = zext i8 %t to i32
@@ -494,21 +750,29 @@ entry:
 ; The input has one free zext and one free sext. If we had promoted
 ; all the way through the load, we would end up with a free zext and a
 ; non-free sext (of %b).
-; OPTALL-LABEL: @promoteSExtFromAddrMode256
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
-; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
-;
-; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i256, ptr %addr, i64 [[IDX64]]
-; OPTALL-NEXT: store i256 %stuff, ptr [[GEP]]
-; OPTALL-NEXT: ret void
 define void @promoteSExtFromAddrMode256(ptr %p, i32 %b, ptr %addr, i256 %stuff) {
+; OPT-LABEL: define void @promoteSExtFromAddrMode256(
+; OPT-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]], i256 [[STUFF:%.*]]) {
+; OPT-NEXT:  [[ENTRY:.*:]]
+; OPT-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; OPT-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[T]] to i64
+; OPT-NEXT:    [[PROMOTED1:%.*]] = sext i32 [[B]] to i64
+; OPT-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], [[PROMOTED1]]
+; OPT-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i256, ptr [[ADDR]], i64 [[ADD]]
+; OPT-NEXT:    store i256 [[STUFF]], ptr [[STADDR]], align 16
+; OPT-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @promoteSExtFromAddrMode256(
+; DISABLE-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]], i256 [[STUFF:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; DISABLE-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; DISABLE-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i256, ptr [[ADDR]], i64 [[IDX64]]
+; DISABLE-NEXT:    store i256 [[STUFF]], ptr [[STADDR]], align 16
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %zextt = zext i8 %t to i32
@@ -527,27 +791,43 @@ entry:
 ; so the promotion happens because the cost did not change and may
 ; expose more opportunities.
 ; This would need to be fixed at some point.
-; OPTALL-LABEL: @doNotPromoteFreeZExtFromAddrMode
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
 ;
 ; This transformation should really happen only for stress mode.
-; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
-; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
-; STRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
-;
-; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
-; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
-;
-; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
-; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
-;
-; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, ptr %addr, i64 [[IDX64]]
-; OPTALL-NEXT: store i32 [[RES32]], ptr [[GEP]]
-; OPTALL-NEXT: ret void
 define void @doNotPromoteFreeZExtFromAddrMode(ptr %p, i32 %b, ptr %addr) {
+; NONSTRESS-LABEL: define void @doNotPromoteFreeZExtFromAddrMode(
+; NONSTRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nuw i32 [[ZEXTT]], [[B]]
+; NONSTRESS-NEXT:    [[IDX64:%.*]] = zext i32 [[ADD]] to i64
+; NONSTRESS-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i32, ptr [[ADDR]], i64 [[IDX64]]
+; NONSTRESS-NEXT:    store i32 [[ADD]], ptr [[STADDR]], align 4
+; NONSTRESS-NEXT:    ret void
+;
+; STRESS-LABEL: define void @doNotPromoteFreeZExtFromAddrMode(
+; STRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED3:%.*]] = zext i8 [[T]] to i64
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = zext i32 [[B]] to i64
+; STRESS-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED3]], [[PROMOTED2]]
+; STRESS-NEXT:    [[PROMOTED:%.*]] = trunc i64 [[ADD]] to i32
+; STRESS-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i32, ptr [[ADDR]], i64 [[ADD]]
+; STRESS-NEXT:    store i32 [[PROMOTED]], ptr [[STADDR]], align 4
+; STRESS-NEXT:    ret void
+;
+; DISABLE-LABEL: define void @doNotPromoteFreeZExtFromAddrMode(
+; DISABLE-SAME: ptr [[P:%.*]], i32 [[B:%.*]], ptr [[ADDR:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; DISABLE-NEXT:    [[ADD:%.*]] = add nuw i32 [[ZEXTT]], [[B]]
+; DISABLE-NEXT:    [[IDX64:%.*]] = zext i32 [[ADD]] to i64
+; DISABLE-NEXT:    [[STADDR:%.*]] = getelementptr inbounds i32, ptr [[ADDR]], i64 [[IDX64]]
+; DISABLE-NEXT:    store i32 [[ADD]], ptr [[STADDR]], align 4
+; DISABLE-NEXT:    ret void
+;
 entry:
   %t = load i8, ptr %p
   %zextt = zext i8 %t to i32
@@ -558,24 +838,37 @@ entry:
   ret void
 }
 
-; OPTALL-LABEL: @doNotPromoteFreeSExtFromShift
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
-;
-; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
-; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
-;
-; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
-; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
-;
-; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
-; OPTALL-NEXT: ret i64 %staddr
 define i64 @doNotPromoteFreeSExtFromShift(ptr %p, i32 %b) {
+; NONSTRESS-LABEL: define i64 @doNotPromoteFreeSExtFromShift(
+; NONSTRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; NONSTRESS-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; NONSTRESS-NEXT:    [[STADDR:%.*]] = shl i64 [[IDX64]], 12
+; NONSTRESS-NEXT:    ret i64 [[STADDR]]
+;
+; STRESS-LABEL: define i64 @doNotPromoteFreeSExtFromShift(
+; STRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[T]] to i64
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = sext i32 [[B]] to i64
+; STRESS-NEXT:    [[ADD:%.*]] = add nsw i64 [[PROMOTED2]], [[PROMOTED1]]
+; STRESS-NEXT:    [[STADDR:%.*]] = shl i64 [[ADD]], 12
+; STRESS-NEXT:    ret i64 [[STADDR]]
+;
+; DISABLE-LABEL: define i64 @doNotPromoteFreeSExtFromShift(
+; DISABLE-SAME: ptr [[P:%.*]], i32 [[B:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; DISABLE-NEXT:    [[ADD:%.*]] = add nsw i32 [[ZEXTT]], [[B]]
+; DISABLE-NEXT:    [[IDX64:%.*]] = sext i32 [[ADD]] to i64
+; DISABLE-NEXT:    [[STADDR:%.*]] = shl i64 [[IDX64]], 12
+; DISABLE-NEXT:    ret i64 [[STADDR]]
+;
 entry:
   %t = load i8, ptr %p
   %zextt = zext i8 %t to i32
@@ -586,25 +879,39 @@ entry:
 }
 
 ; Same comment as doNotPromoteFreeZExtFromAddrMode.
-; OPTALL-LABEL: @doNotPromoteFreeZExtFromShift
-; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, ptr %p
 ;
 ; This transformation should really happen only for stress mode.
-; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
-; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
-;
-; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
-; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
-;
-; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
-; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
-; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
-;
-; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
-; OPTALL-NEXT: ret i64 %staddr
 define i64 @doNotPromoteFreeZExtFromShift(ptr %p, i32 %b) {
+; NONSTRESS-LABEL: define i64 @doNotPromoteFreeZExtFromShift(
+; NONSTRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]]) {
+; NONSTRESS-NEXT:  [[ENTRY:.*:]]
+; NONSTRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; NONSTRESS-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; NONSTRESS-NEXT:    [[ADD:%.*]] = add nuw i32 [[ZEXTT]], [[B]]
+; NONSTRESS-NEXT:    [[IDX64:%.*]] = zext i32 [[ADD]] to i64
+; NONSTRESS-NEXT:    [[STADDR:%.*]] = shl i64 [[IDX64]], 12
+; NONSTRESS-NEXT:    ret i64 [[STADDR]]
+;
+; STRESS-LABEL: define i64 @doNotPromoteFreeZExtFromShift(
+; STRESS-SAME: ptr [[P:%.*]], i32 [[B:%.*]]) {
+; STRESS-NEXT:  [[ENTRY:.*:]]
+; STRESS-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; STRESS-NEXT:    [[PROMOTED2:%.*]] = zext i8 [[T]] to i64
+; STRESS-NEXT:    [[PROMOTED1:%.*]] = zext i32 [[B]] to i64
+; STRESS-NEXT:    [[ADD:%.*]] = add nuw i64 [[PROMOTED2]], [[PROMOTED1]]
+; STRESS-NEXT:    [[STADDR:%.*]] = shl i64 [[ADD]], 12
+; STRESS-NEXT:    ret i64 [[STADDR]]
+;
+; DISABLE-LABEL: define i64 @doNotPromoteFreeZExtFromShift(
+; DISABLE-SAME: ptr [[P:%.*]], i32 [[B:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[T:%.*]] = load i8, ptr [[P]], align 1
+; DISABLE-NEXT:    [[ZEXTT:%.*]] = zext i8 [[T]] to i32
+; DISABLE-NEXT:    [[ADD:%.*]] = add nuw i32 [[ZEXTT]], [[B]]
+; DISABLE-NEXT:    [[IDX64:%.*]] = zext i32 [[ADD]] to i64
+; DISABLE-NEXT:    [[STADDR:%.*]] = shl i64 [[IDX64]], 12
+; DISABLE-NEXT:    ret i64 [[STADDR]]
+;
 entry:
   %t = load i8, ptr %p
   %zextt = zext i8 %t to i32
@@ -621,23 +928,32 @@ entry:
 ; non-free. So technically, we trade one non-free sext for two non-free
 ; sexts.
 ; This would need to be fixed at some point.
-; OPTALL-LABEL: @doNotPromoteBecauseOfPairedLoad
-; OPTALL: [[LD0:%[a-zA-Z_0-9-]+]] = load i32, ptr %p
-; OPTALL: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, ptr %p, i64 1
-; OPTALL: [[LD1:%[a-zA-Z_0-9-]+]] = load i32, ptr [[GEP]]
 ;
 ; This transformation should really happen only for stress mode.
-; OPT-NEXT: [[SEXTLD1:%[a-zA-Z_0-9-]+]] = sext i32 [[LD1]] to i64
-; OPT-NEXT: [[SEXTCST:%[a-zA-Z_0-9-]+]] = sext i32 %cst to i64
-; OPT-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD1]], [[SEXTCST]]
-;
-; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[LD1]], %cst
-; DISABLE-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = sext i32 [[RES]] to i64
-;
-; OPTALL-NEXT: [[ZEXTLD0:%[a-zA-Z_0-9-]+]] = zext i32 [[LD0]] to i64
-; OPTALL-NEXT: [[FINAL:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTRES]], [[ZEXTLD0]]
-; OPTALL-NEXT: ret i64 [[FINAL]]
 define i64 @doNotPromoteBecauseOfPairedLoad(ptr %p, i32 %cst) {
+; OPT-LABEL: define i64 @doNotPromoteBecauseOfPairedLoad(
+; OPT-SAME: ptr [[P:%.*]], i32 [[CST:%.*]]) {
+; OPT-NEXT:    [[LD0:%.*]] = load i32, ptr [[P]], align 4
+; OPT-NEXT:    [[IDXLD1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
+; OPT-NEXT:    [[LD1:%.*]] = load i32, ptr [[IDXLD1]], align 4
+; OPT-NEXT:    [[PROMOTED:%.*]] = sext i32 [[LD1]] to i64
+; OPT-NEXT:    [[PROMOTED1:%.*]] = sext i32 [[CST]] to i64
+; OPT-NEXT:    [[RES:%.*]] = add nsw i64 [[PROMOTED]], [[PROMOTED1]]
+; OPT-NEXT:    [[ZEXTLD0:%.*]] = zext i32 [[LD0]] to i64
+; OPT-NEXT:    [[FINAL:%.*]] = add i64 [[RES]], [[ZEXTLD0]]
+; OPT-NEXT:    ret i64 [[FINAL]]
+;
+; DISABLE-LABEL: define i64 @doNotPromoteBecauseOfPairedLoad(
+; DISABLE-SAME: ptr [[P:%.*]], i32 [[CST:%.*]]) {
+; DISABLE-NEXT:    [[LD0:%.*]] = load i32, ptr [[P]], align 4
+; DISABLE-NEXT:    [[IDXLD1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
+; DISABLE-NEXT:    [[LD1:%.*]] = load i32, ptr [[IDXLD1]], align 4
+; DISABLE-NEXT:    [[RES:%.*]] = add nsw i32 [[LD1]], [[CST]]
+; DISABLE-NEXT:    [[SEXTRES:%.*]] = sext i32 [[RES]] to i64
+; DISABLE-NEXT:    [[ZEXTLD0:%.*]] = zext i32 [[LD0]] to i64
+; DISABLE-NEXT:    [[FINAL:%.*]] = add i64 [[SEXTRES]], [[ZEXTLD0]]
+; DISABLE-NEXT:    ret i64 [[FINAL]]
+;
   %ld0 = load i32, ptr %p
   %idxLd1 = getelementptr inbounds i32, ptr %p, i64 1
   %ld1 = load i32, ptr %idxLd1
@@ -649,15 +965,32 @@ define i64 @doNotPromoteBecauseOfPairedLoad(ptr %p, i32 %cst) {
 }
 
 define i64 @promoteZextShl(i1 %c, ptr %P) {
+; OPT-LABEL: define i64 @promoteZextShl(
+; OPT-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) {
+; OPT-NEXT:  [[ENTRY:.*:]]
+; OPT-NEXT:    [[LD:%.*]] = load i16, ptr [[P]], align 2
+; OPT-NEXT:    [[PROMOTED1:%.*]] = zext i16 [[LD]] to i64
+; OPT-NEXT:    br i1 [[C]], label %[[END:.*]], label %[[IF_THEN:.*]]
+; OPT:       [[IF_THEN]]:
+; OPT-NEXT:    [[SHL2:%.*]] = shl nsw i64 [[PROMOTED1]], 1
+; OPT-NEXT:    ret i64 [[SHL2]]
+; OPT:       [[END]]:
+; OPT-NEXT:    ret i64 0
+;
+; DISABLE-LABEL: define i64 @promoteZextShl(
+; DISABLE-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) {
+; DISABLE-NEXT:  [[ENTRY:.*:]]
+; DISABLE-NEXT:    [[LD:%.*]] = load i16, ptr [[P]], align 2
+; DISABLE-NEXT:    [[Z:%.*]] = zext i16 [[LD]] to i32
+; DISABLE-NEXT:    br i1 [[C]], label %[[END:.*]], label %[[IF_THEN:.*]]
+; DISABLE:       [[IF_THEN]]:
+; DISABLE-NEXT:    [[SHL2:%.*]] = shl nsw i32 [[Z]], 1
+; DISABLE-NEXT:    [[R:%.*]] = sext i32 [[SHL2]] to i64
+; DISABLE-NEXT:    ret i64 [[R]]
+; DISABLE:       [[END]]:
+; DISABLE-NEXT:    ret i64 0
+;
 entry:
-; OPTALL-LABEL: promoteZextShl
-; OPTALL: entry:
-; OPT: %[[LD:.*]] = load i16, ptr %P
-; OPT: %[[EXT:.*]] = zext i16 %[[LD]] to i64
-; OPT: if.then:
-; OPT: shl nsw i64 %[[EXT]], 1
-; DISABLE: if.then:
-; DISABLE: %r = sext i32 %shl2 to i64
   %ld = load i16, ptr %P
   br i1 %c, label %end, label %if.then
 if.then:
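
The two MemorySanitizer tests that follow also switch to fully autogenerated CHECK lines. The key property the new assertions spell out is how MSan propagates shadow through integer vector reductions: for reduce.add, the result is poisoned if any input lane is, which the instrumentation expresses as an or-reduction over the operand's shadow. A stand-alone sketch of that rule (hand-written for illustration; the function name is made up and the TLS bookkeeping of the real instrumentation is omitted):

  declare i32 @llvm.vector.reduce.add.v3i32(<3 x i32>)
  declare i32 @llvm.vector.reduce.or.v3i32(<3 x i32>)

  define i32 @shadow_of_reduce_add(<3 x i32> %o, <3 x i32> %o_shadow) {
    ; shadow of reduce.add(%o): any poisoned lane poisons the whole sum
    ; (in the real output this value is stored to @__msan_retval_tls)
    %s = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> %o_shadow)
    ; the original computation is left untouched
    %r = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %o)
    ret i32 %r
  }

For reduce.and and reduce.or the regenerated CHECK lines below show a refined rule: a bit that is already forced by a known lane value (a known 0 for and, a known 1 for or) masks the shadow of that bit in the result.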

diff --git a/llvm/test/Instrumentation/MemorySanitizer/reduce.ll b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll
index 2a79b5e72bca5e..669ccf97f74c3e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/reduce.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll
@@ -1,4 +1,4 @@
-
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -8,60 +8,84 @@ declare i32 @llvm.vector.reduce.add(<3 x i32>)
 declare i32 @llvm.vector.reduce.and(<3 x i32>)
 declare i32 @llvm.vector.reduce.or(<3 x i32>)
 
-; CHECK-LABEL: @reduce_add
 define i32 @reduce_add() sanitize_memory {
-; CHECK: [[P:%.*]] = inttoptr i64 0 to ptr
+; CHECK-LABEL: define i32 @reduce_add(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[P:%.*]] = inttoptr i64 0 to ptr
+; CHECK-NEXT:    [[O:%.*]] = load <3 x i32>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <3 x i32>, ptr [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[O]])
+; CHECK-NEXT:    store i32 [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP6]], ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[R]]
+;
   %p = inttoptr i64 0 to ptr
-; CHECK: [[O:%.*]] = load <3 x i32>, ptr [[P]]
   %o = load <3 x i32>, ptr %p
-; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, ptr
-; CHECK: [[O_ORIGIN:%.*]] = load i32, ptr
-; CHECK: [[R_SHADOW:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> [[O]])
   %r = call i32 @llvm.vector.reduce.add(<3 x i32> %o)
-; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
-; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
-; CHECK: ret i32 [[R]]
   ret i32 %r
 }
 
-; CHECK-LABEL: @reduce_and
 define i32 @reduce_and() sanitize_memory {
-; CHECK: [[P:%.*]] = inttoptr i64 0 to ptr
+; CHECK-LABEL: define i32 @reduce_and(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[P:%.*]] = inttoptr i64 0 to ptr
+; CHECK-NEXT:    [[O:%.*]] = load <3 x i32>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <3 x i32>, ptr [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = or <3 x i32> [[O]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP10:%.*]] = and i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O]])
+; CHECK-NEXT:    store i32 [[TMP10]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP6]], ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[R]]
+;
   %p = inttoptr i64 0 to ptr
-; CHECK: [[O:%.*]] = load <3 x i32>, ptr [[P]]
   %o = load <3 x i32>, ptr %p
-; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, ptr
-; CHECK: [[O_ORIGIN:%.*]] = load i32, ptr
-; CHECK: [[O_SHADOW_1:%.*]] = or <3 x i32> [[O]], [[O_SHADOW]]
-; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
-; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
-; CHECK: [[R_SHADOW:%.*]] = and i32 [[O_SHADOW_2]], [[O_SHADOW_3]]
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O]])
   %r = call i32 @llvm.vector.reduce.and(<3 x i32> %o)
-; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
-; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
-; CHECK: ret i32 [[R]]
   ret i32 %r
 }
 
-; CHECK-LABEL: @reduce_or
 define i32 @reduce_or() sanitize_memory {
-; CHECK: [[P:%.*]] = inttoptr i64 0 to ptr
+; CHECK-LABEL: define i32 @reduce_or(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[P:%.*]] = inttoptr i64 0 to ptr
+; CHECK-NEXT:    [[O:%.*]] = load <3 x i32>, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <3 x i32>, ptr [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <3 x i32> [[O]], <i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <3 x i32> [[TMP7]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP11:%.*]] = and i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O]])
+; CHECK-NEXT:    store i32 [[TMP11]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store i32 [[TMP6]], ptr @__msan_retval_origin_tls, align 4
+; CHECK-NEXT:    ret i32 [[R]]
+;
   %p = inttoptr i64 0 to ptr
-; CHECK: [[O:%.*]] = load <3 x i32>, ptr [[P]]
   %o = load <3 x i32>, ptr %p
-; CHECK: [[O_SHADOW:%.*]] = load <3 x i32>, ptr
-; CHECK: [[O_ORIGIN:%.*]] = load i32, ptr
-; CHECK: [[NOT_O:%.*]] = xor <3 x i32> [[O]], <i32 -1, i32 -1, i32 -1>
-; CHECK: [[O_SHADOW_1:%.*]] = or <3 x i32> [[NOT_O]], [[O_SHADOW]]
-; CHECK: [[O_SHADOW_2:%.*]] = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> [[O_SHADOW_1]]
-; CHECK: [[O_SHADOW_3:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O_SHADOW]])
-; CHECK: [[R_SHADOW:%.*]] = and i32 [[O_SHADOW_2]], [[O_SHADOW_3]]
-; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[O]])
   %r = call i32 @llvm.vector.reduce.or(<3 x i32> %o)
-; CHECK: store i32 [[R_SHADOW]], {{.*}} @__msan_retval_tls
-; CHECK: store i32 [[O_ORIGIN]], {{.*}} @__msan_retval_origin_tls
-; CHECK: ret i32 [[R]]
   ret i32 %r
 }

diff  --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index c0d738145f28e0..0e2c0e3d859415 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s
 ; REQUIRES: x86-registered-target
 
@@ -10,57 +11,90 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
 declare <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64>, <1 x i64>) nounwind readnone
 
 define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory {
+; CHECK-LABEL: define <4 x i32> @Test_sse2_pmadd_wd(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i16> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[C]]
+;
 entry:
   %c = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b) nounwind
   ret <4 x i32> %c
 }
 
-; CHECK-LABEL: @Test_sse2_pmadd_wd(
-; CHECK: or <8 x i16>
-; CHECK: bitcast <8 x i16> {{.*}} to <4 x i32>
-; CHECK: icmp ne <4 x i32> {{.*}}, zeroinitializer
-; CHECK: sext <4 x i1> {{.*}} to <4 x i32>
-; CHECK: ret <4 x i32>
 
 
 define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
+; CHECK-LABEL: define <1 x i64> @Test_ssse3_pmadd_ub_sw(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <4 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT:    [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
+; CHECK-NEXT:    store <1 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x i64> [[C]]
+;
 entry:
   %c = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> %a, <1 x i64> %b) nounwind
   ret <1 x i64> %c
 }
 
-; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw(
-; CHECK: or <1 x i64>
-; CHECK: bitcast <1 x i64> {{.*}} to <4 x i16>
-; CHECK: icmp ne <4 x i16> {{.*}}, zeroinitializer
-; CHECK: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK: bitcast <4 x i16> {{.*}} to <1 x i64>
-; CHECK: ret <1 x i64>
 
 
 define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @Test_x86_sse2_psad_bw(
+; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = lshr <2 x i64> [[TMP6]], <i64 48, i64 48>
+; CHECK-NEXT:    [[C:%.*]] = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> [[A]], <16 x i8> [[B]])
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[C]]
+;
   %c = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a, <16 x i8> %b)
   ret <2 x i64> %c
 }
 
-; CHECK-LABEL: @Test_x86_sse2_psad_bw(
-; CHECK: or <16 x i8> {{.*}}, {{.*}}
-; CHECK: bitcast <16 x i8> {{.*}} to <2 x i64>
-; CHECK: icmp ne <2 x i64> {{.*}}, zeroinitializer
-; CHECK: sext <2 x i1> {{.*}} to <2 x i64>
-; CHECK: lshr <2 x i64> {{.*}}, <i64 48, i64 48>
-; CHECK: ret <2 x i64>
 
 
 define <1 x i64> @Test_x86_mmx_psad_bw(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
+; CHECK-LABEL: define <1 x i64> @Test_x86_mmx_psad_bw(
+; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i1 [[TMP4]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP5]], 48
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i64 [[TMP6]] to <1 x i64>
+; CHECK-NEXT:    [[C:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]]
+; CHECK-NEXT:    store <1 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <1 x i64> [[C]]
+;
 entry:
   %c = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> %a, <1 x i64> %b) nounwind
   ret <1 x i64> %c
 }
 
-; CHECK-LABEL: @Test_x86_mmx_psad_bw(
-; CHECK: or <1 x i64>
-; CHECK: icmp ne i64
-; CHECK: sext i1 {{.*}} to i64
-; CHECK: lshr i64 {{.*}}, 48
-; CHECK: ret <1 x i64>

diff  --git a/llvm/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll b/llvm/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll
index 56491c5efa4f4e..63f466a5024c82 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll
@@ -1,65 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt --mtriple=aarch64-unknown-linux -S -passes=instcombine < %s | FileCheck %s
 ; ARM64 neon intrinsic variants - <rdar://problem/12349617>
 
 define <4 x i32> @mulByZeroARM64(<4 x i16> %x) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @mulByZeroARM64(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <4 x i32> zeroinitializer
+;
 entry:
   %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
   ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> zeroinitializer
 }
 
 define <4 x i32> @mulByOneARM64(<4 x i16> %x) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @mulByOneARM64(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A:%.*]] = sext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[A]]
+;
 entry:
   %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
   ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> %a
 }
 
 define <4 x i32> @constantMulARM64() nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @constantMulARM64(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+;
 entry:
   %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
   ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 }
 
 define <4 x i32> @constantMulSARM64() nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @constantMulSARM64(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+;
 entry:
   %b = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
   ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
 }
 
 define <4 x i32> @constantMulUARM64() nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @constantMulUARM64(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+;
 entry:
   %b = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
   ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
 }
 
 define <4 x i32> @complex1ARM64(<4 x i16> %x) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @complex1ARM64(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A:%.*]] = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> [[X]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    ret <4 x i32> [[A]]
+;
 entry:
   %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
   %b = add <4 x i32> zeroinitializer, %a
   ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
-; CHECK-NEXT: ret <4 x i32> %a
 }
 
 define <4 x i32> @complex2ARM64(<4 x i32> %x) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @complex2ARM64(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[B:%.*]] = add <4 x i32> [[X]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    ret <4 x i32> [[B]]
+;
 entry:
   %a = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
   %b = add <4 x i32> %x, %a
   ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6>
-; CHECK-NEXT: ret <4 x i32> %b
 }
 
 declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
@@ -67,4 +89,3 @@ declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind
 
 ; CHECK: attributes #0 = { nounwind ssp memory(none) }
 ; CHECK: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
-; CHECK: attributes [[NUW]] = { nounwind }

diff  --git a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
index 48abdee3401c30..122f0ce31f6290 100644
--- a/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/aes-intrinsics.ll
@@ -1,39 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 ; ARM64 AES intrinsic variants
 
 define <16 x i8> @combineXorAeseZeroARM64(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: @combineXorAeseZeroARM64(
-; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
-; CHECK-NEXT:    ret <16 x i8> %data.aes
+; CHECK-LABEL: define <16 x i8> @combineXorAeseZeroARM64(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT:    [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> [[DATA]], <16 x i8> [[KEY]])
+; CHECK-NEXT:    ret <16 x i8> [[DATA_AES]]
+;
   %data.xor = xor <16 x i8> %data, %key
   %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data.xor, <16 x i8> zeroinitializer)
   ret <16 x i8> %data.aes
 }
 
 define <16 x i8> @combineXorAeseNonZeroARM64(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: @combineXorAeseNonZeroARM64(
-; CHECK-NEXT:    %data.xor = xor <16 x i8> %data, %key
-; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-; CHECK-NEXT:    ret <16 x i8> %data.aes
+; CHECK-LABEL: define <16 x i8> @combineXorAeseNonZeroARM64(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT:    [[DATA_XOR:%.*]] = xor <16 x i8> [[DATA]], [[KEY]]
+; CHECK-NEXT:    [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> [[DATA_XOR]], <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    ret <16 x i8> [[DATA_AES]]
+;
   %data.xor = xor <16 x i8> %data, %key
   %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   ret <16 x i8> %data.aes
 }
 
 define <16 x i8> @combineXorAesdZeroARM64(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: @combineXorAesdZeroARM64(
-; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
-; CHECK-NEXT:    ret <16 x i8> %data.aes
+; CHECK-LABEL: define <16 x i8> @combineXorAesdZeroARM64(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT:    [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> [[DATA]], <16 x i8> [[KEY]])
+; CHECK-NEXT:    ret <16 x i8> [[DATA_AES]]
+;
   %data.xor = xor <16 x i8> %data, %key
   %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data.xor, <16 x i8> zeroinitializer)
   ret <16 x i8> %data.aes
 }
 
 define <16 x i8> @combineXorAesdNonZeroARM64(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: @combineXorAesdNonZeroARM64(
-; CHECK-NEXT:    %data.xor = xor <16 x i8> %data, %key
-; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-; CHECK-NEXT:    ret <16 x i8> %data.aes
+; CHECK-LABEL: define <16 x i8> @combineXorAesdNonZeroARM64(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT:    [[DATA_XOR:%.*]] = xor <16 x i8> [[DATA]], [[KEY]]
+; CHECK-NEXT:    [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> [[DATA_XOR]], <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    ret <16 x i8> [[DATA_AES]]
+;
   %data.xor = xor <16 x i8> %data, %key
   %data.aes = tail call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   ret <16 x i8> %data.aes

diff  --git a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
index 81813aa5c87893..5fbbc0ed57750a 100644
--- a/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll
@@ -1,64 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 
 define <4 x i32> @mulByZero(<4 x i16> %x) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @mulByZero(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <4 x i32> zeroinitializer
+;
 entry:
   %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> zeroinitializer) nounwind
   ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> zeroinitializer
 }
 
 define <4 x i32> @mulByOne(<4 x i16> %x) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @mulByOne(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A:%.*]] = sext <4 x i16> [[X]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[A]]
+;
 entry:
   %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %x, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
   ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: %a = sext <4 x i16> %x to <4 x i32>
-; CHECK-NEXT: ret <4 x i32> %a
 }
 
 define <4 x i32> @constantMul() nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @constantMul(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
+;
 entry:
   %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
   ret <4 x i32> %a
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 6, i32 6, i32 6, i32 6>
 }
 
 define <4 x i32> @constantMulS() nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @constantMulS(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+;
 entry:
   %b = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
   ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
 }
 
 define <4 x i32> @constantMulU() nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @constantMulU(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
+;
 entry:
   %b = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) nounwind
   ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: ret <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
 }
 
 define <4 x i32> @complex1(<4 x i16> %x) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @complex1(
+; CHECK-SAME: <4 x i16> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A:%.*]] = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> [[X]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    ret <4 x i32> [[A]]
+;
 entry:
   %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) nounwind
   %b = add <4 x i32> zeroinitializer, %a
   ret <4 x i32> %b
-; CHECK: entry:
-; CHECK-NEXT: %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 2, i16 2, i16 2, i16 2>, <4 x i16> %x) [[NUW:#[0-9]+]]
-; CHECK-NEXT: ret <4 x i32> %a
 }
 
 define <4 x i32> @complex2(<4 x i32> %x) nounwind readnone ssp {
+; CHECK-LABEL: define <4 x i32> @complex2(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[B:%.*]] = add <4 x i32> [[X]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT:    ret <4 x i32> [[B]]
+;
 entry:
   %a = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> <i16 3, i16 3, i16 3, i16 3>, <4 x i16> <i16 2, i16 2, i16 2, i16 2>) nounwind
   %b = add <4 x i32> %x, %a
-  ret <4 x i32> %b  
-; CHECK: entry:
-; CHECK-NEXT: %b = add <4 x i32> %x, <i32 6, i32 6, i32 6, i32 6>
-; CHECK-NEXT: ret <4 x i32> %b
+  ret <4 x i32> %b
 }
 
 declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone

diff  --git a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
index 0765ca809a66e9..5369e9a94c32ed 100644
--- a/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/ARM/aes-intrinsics.ll
@@ -1,39 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instcombine < %s | FileCheck %s
 ; ARM AES intrinsic variants
 
 define <16 x i8> @combineXorAeseZeroARM(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: @combineXorAeseZeroARM(
-; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data, <16 x i8> %key)
-; CHECK-NEXT:    ret <16 x i8> %data.aes
+; CHECK-LABEL: define <16 x i8> @combineXorAeseZeroARM(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT:    [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> [[DATA]], <16 x i8> [[KEY]])
+; CHECK-NEXT:    ret <16 x i8> [[DATA_AES]]
+;
   %data.xor = xor <16 x i8> %data, %key
   %data.aes = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data.xor, <16 x i8> zeroinitializer)
   ret <16 x i8> %data.aes
 }
 
 define <16 x i8> @combineXorAeseNonZeroARM(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: @combineXorAeseNonZeroARM(
-; CHECK-NEXT:    %data.xor = xor <16 x i8> %data, %key
-; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-; CHECK-NEXT:    ret <16 x i8> %data.aes
+; CHECK-LABEL: define <16 x i8> @combineXorAeseNonZeroARM(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT:    [[DATA_XOR:%.*]] = xor <16 x i8> [[DATA]], [[KEY]]
+; CHECK-NEXT:    [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> [[DATA_XOR]], <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    ret <16 x i8> [[DATA_AES]]
+;
   %data.xor = xor <16 x i8> %data, %key
   %data.aes = tail call <16 x i8> @llvm.arm.neon.aese(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   ret <16 x i8> %data.aes
 }
 
 define <16 x i8> @combineXorAesdZeroARM(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: @combineXorAesdZeroARM(
-; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data, <16 x i8> %key)
-; CHECK-NEXT:    ret <16 x i8> %data.aes
+; CHECK-LABEL: define <16 x i8> @combineXorAesdZeroARM(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT:    [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> [[DATA]], <16 x i8> [[KEY]])
+; CHECK-NEXT:    ret <16 x i8> [[DATA_AES]]
+;
   %data.xor = xor <16 x i8> %data, %key
   %data.aes = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data.xor, <16 x i8> zeroinitializer)
   ret <16 x i8> %data.aes
 }
 
 define <16 x i8> @combineXorAesdNonZeroARM(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: @combineXorAesdNonZeroARM(
-; CHECK-NEXT:    %data.xor = xor <16 x i8> %data, %key
-; CHECK-NEXT:    %data.aes = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-; CHECK-NEXT:    ret <16 x i8> %data.aes
+; CHECK-LABEL: define <16 x i8> @combineXorAesdNonZeroARM(
+; CHECK-SAME: <16 x i8> [[DATA:%.*]], <16 x i8> [[KEY:%.*]]) {
+; CHECK-NEXT:    [[DATA_XOR:%.*]] = xor <16 x i8> [[DATA]], [[KEY]]
+; CHECK-NEXT:    [[DATA_AES:%.*]] = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> [[DATA_XOR]], <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+; CHECK-NEXT:    ret <16 x i8> [[DATA_AES]]
+;
   %data.xor = xor <16 x i8> %data, %key
   %data.aes = tail call <16 x i8> @llvm.arm.neon.aesd(<16 x i8> %data.xor, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   ret <16 x i8> %data.aes

diff  --git a/llvm/test/Transforms/InstCombine/pow-0.ll b/llvm/test/Transforms/InstCombine/pow-0.ll
index 14dff3c5dc43b3..01c03df8c6b545 100644
--- a/llvm/test/Transforms/InstCombine/pow-0.ll
+++ b/llvm/test/Transforms/InstCombine/pow-0.ll
@@ -1,57 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
-; CHECK-LABEL: @zero(
-; CHECK-NEXT:  ret double 1.000000e+00
 define double @zero(double %value) {
+; CHECK-LABEL: define double @zero(
+; CHECK-SAME: double [[VALUE:%.*]]) {
+; CHECK-NEXT:    ret double 1.000000e+00
+;
   %res = call double @llvm.pow.f64(double %value, double 0.000000e+00)
   ret double %res
 }
 
-; CHECK-LABEL: @minus_zero(
-; CHECK-NEXT:  ret double 1.000000e+00
 define double @minus_zero(double %value) {
+; CHECK-LABEL: define double @minus_zero(
+; CHECK-SAME: double [[VALUE:%.*]]) {
+; CHECK-NEXT:    ret double 1.000000e+00
+;
   %res = call double @llvm.pow.f64(double %value, double -0.000000e+00)
   ret double %res
 }
 
-; CHECK-LABEL: @fast_zero(
-; CHECK-NEXT:  ret double 1.000000e+00
 define double @fast_zero(double %value) {
+; CHECK-LABEL: define double @fast_zero(
+; CHECK-SAME: double [[VALUE:%.*]]) {
+; CHECK-NEXT:    ret double 1.000000e+00
+;
   %res = call fast double @llvm.pow.f64(double %value, double 0.000000e+00)
   ret double %res
 }
 
-; CHECK-LABEL: @fast_minus_zero(
-; CHECK-NEXT:  ret double 1.000000e+00
 define double @fast_minus_zero(double %value) {
+; CHECK-LABEL: define double @fast_minus_zero(
+; CHECK-SAME: double [[VALUE:%.*]]) {
+; CHECK-NEXT:    ret double 1.000000e+00
+;
   %res = call fast double @llvm.pow.f64(double %value, double -0.000000e+00)
   ret double %res
 }
 
-; CHECK-LABEL: @vec_zero(
-; CHECK-NEXT:  ret <2 x double> <double 1.000000e+00, double 1.000000e+00>
 define <2 x double> @vec_zero(<2 x double> %value) {
+; CHECK-LABEL: define <2 x double> @vec_zero(
+; CHECK-SAME: <2 x double> [[VALUE:%.*]]) {
+; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 1.000000e+00>
+;
   %res = call <2 x double> @llvm.pow.v2f64(<2 x double> %value, <2 x double> <double 0.000000e+00, double 0.000000e+00>)
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: @vec_minus_zero(
-; CHECK-NEXT:  ret <2 x double> <double 1.000000e+00, double 1.000000e+00>
 define <2 x double> @vec_minus_zero(<2 x double> %value) {
+; CHECK-LABEL: define <2 x double> @vec_minus_zero(
+; CHECK-SAME: <2 x double> [[VALUE:%.*]]) {
+; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 1.000000e+00>
+;
   %res = call <2 x double> @llvm.pow.v2f64(<2 x double> %value, <2 x double> <double -0.000000e+00, double -0.000000e+00>)
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: @vec_fast_zero(
-; CHECK-NEXT:  ret <2 x double> <double 1.000000e+00, double 1.000000e+00>
 define <2 x double> @vec_fast_zero(<2 x double> %value) {
+; CHECK-LABEL: define <2 x double> @vec_fast_zero(
+; CHECK-SAME: <2 x double> [[VALUE:%.*]]) {
+; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 1.000000e+00>
+;
   %res = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %value, <2 x double> <double 0.000000e+00, double 0.000000e+00>)
   ret <2 x double> %res
 }
 
-; CHECK-LABEL: @vec_fast_minus_zero(
-; CHECK-NEXT:  ret <2 x double> <double 1.000000e+00, double 1.000000e+00>
 define <2 x double> @vec_fast_minus_zero(<2 x double> %value) {
+; CHECK-LABEL: define <2 x double> @vec_fast_minus_zero(
+; CHECK-SAME: <2 x double> [[VALUE:%.*]]) {
+; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 1.000000e+00>
+;
   %res = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %value, <2 x double> <double -0.000000e+00, double -0.000000e+00>)
   ret <2 x double> %res
 }

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
index 6572dda70d42bb..cec9aa42d46935 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/blend-costs.ll
@@ -422,6 +422,35 @@ exit:
 }
 
 define void @test_blend_feeding_replicated_store_3(ptr noalias %src.1, ptr noalias %src.2, ptr noalias %dst, i32 %x, i64 %N, i1 %c.2) {
+; CHECK-LABEL: define void @test_blend_feeding_replicated_store_3(
+; CHECK-SAME: ptr noalias [[SRC_1:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]], i64 [[N:%.*]], i1 [[C_2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[SRC_1]], align 1
+; CHECK-NEXT:    [[EXT:%.*]] = zext i8 [[L_1]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[X]], [[EXT]]
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[MUL]], 255
+; CHECK-NEXT:    [[L_2:%.*]] = load i8, ptr [[SRC_2]], align 1
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i8 [[L_2]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[THEN:.*]], label %[[ELSE_1:.*]]
+; CHECK:       [[ELSE_1]]:
+; CHECK-NEXT:    br i1 [[C_2]], label %[[LOOP_LATCH]], label %[[ELSE_2:.*]]
+; CHECK:       [[ELSE_2]]:
+; CHECK-NEXT:    [[TRUNC_DIV:%.*]] = trunc i32 [[DIV]] to i8
+; CHECK-NEXT:    br label %[[THEN]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[P:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ [[TRUNC_DIV]], %[[ELSE_2]] ]
+; CHECK-NEXT:    store i8 [[P]], ptr [[DST]], align 1
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop.header
 


        

