[llvm] [DAGCombiner] Optimize more vector element extractions. (PR #80520)

Harald van Dijk via llvm-commits llvm-commits at lists.llvm.org
Sat Feb 3 17:36:44 PST 2024


https://github.com/hvdijk updated https://github.com/llvm/llvm-project/pull/80520

From d7338b460e91cdc1f04cec19ff8fa47bb6218f29 Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald at gigawatt.nl>
Date: Sun, 4 Feb 2024 01:36:31 +0000
Subject: [PATCH] [DAGCombiner] Optimize more vector element extractions.

Extracting an element from a vector whose type is not legal, extracting a
floating-point constant element, and extracting a non-zero constant element
are all worth optimizing; existing tests show better codegen on multiple
platforms.
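
As a rough illustration of the pattern this affects (a hypothetical sketch,
not taken from the patch's tests; the function name and types are made up):
a build_vector of a type that is not legal for the target, with more than
one use, from which a constant lane is extracted. The old code required a
legal vector type and only accepted a zero constant element; with this
change a non-zero integer or floating-point constant lane can be folded as
well.

  ; Hypothetical sketch only.
  define half @extract_const_lane(half %x, ptr %p) {
    ; <3 x half> is not a legal vector type on many targets, and %v has more
    ; than one use, so the zero-constant-only rule previously blocked folding.
    %v = insertelement <3 x half> <half 2.0, half 4.0, half 6.0>, half %x, i32 2
    store <3 x half> %v, ptr %p
    ; Lane 0 is the non-zero FP constant 2.0; with this change the extract
    ; can fold to that constant during DAG combine.
    %e = extractelement <3 x half> %v, i32 0
    ret half %e
  }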
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |    7 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |  171 +--
 llvm/test/CodeGen/ARM/arm-half-promote.ll     |  155 +--
 .../CodeGen/Hexagon/autohvx/hfnosplat_cp.ll   |   17 -
 llvm/test/CodeGen/Mips/cconv/vector.ll        |  675 ++++++----
 llvm/test/CodeGen/X86/nontemporal-4.ll        | 1176 +++--------------
 .../CodeGen/X86/vector-shuffle-combining.ll   |   44 +-
 7 files changed, 651 insertions(+), 1594 deletions(-)
 delete mode 100644 llvm/test/CodeGen/Hexagon/autohvx/hfnosplat_cp.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b17724cd07209..b68510df181e5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22240,9 +22240,8 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   }
 
   // extract_vector_elt (build_vector x, y), 1 -> y
-  if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
-       VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
-      TLI.isTypeLegal(VecVT)) {
+  if ((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
+      VecOp.getOpcode() == ISD::SPLAT_VECTOR) {
     assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
             VecVT.isFixedLengthVector()) &&
            "BUILD_VECTOR used for scalable vectors");
@@ -22252,7 +22251,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     EVT InEltVT = Elt.getValueType();
 
     if (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT) ||
-        isNullConstant(Elt)) {
+        isIntOrFPConstant(Elt)) {
       // Sometimes build_vector's scalar input types do not match result type.
       if (ScalarVT == InEltVT)
         return Elt;
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 274621307f540..74d377bb2bb2b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -190,10 +190,6 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
 ; CI-LABEL: v_test_canonicalize_build_vector_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_build_vector_v2f16:
@@ -2301,12 +2297,6 @@ define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v3f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v3f16:
@@ -2341,14 +2331,6 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v4f16:
@@ -2611,9 +2593,7 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
 ; CI-LABEL: v_test_canonicalize_reg_k_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 2.0
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_k_v2f16:
@@ -2647,8 +2627,7 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
 ; CI-LABEL: v_test_canonicalize_k_reg_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; CI-NEXT:    v_mov_b32_e32 v1, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, 2.0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2878,18 +2857,6 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v6f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v6f16:
@@ -2933,22 +2900,6 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v8f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v8f16:
@@ -3001,30 +2952,6 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v12f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v12f16:
@@ -3087,38 +3014,6 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v16f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v16f16:
@@ -3216,71 +3111,7 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; CI-NEXT:    v_cvt_f16_f32_e32 v30, v30
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; CI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; CI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
-; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v32f16:
diff --git a/llvm/test/CodeGen/ARM/arm-half-promote.ll b/llvm/test/CodeGen/ARM/arm-half-promote.ll
index e1ab75b2ac7f1..d6a8a9b9538f1 100644
--- a/llvm/test/CodeGen/ARM/arm-half-promote.ll
+++ b/llvm/test/CodeGen/ARM/arm-half-promote.ll
@@ -2,113 +2,78 @@
 
 define arm_aapcs_vfpcc { <8 x half>, <8 x half> } @f1() {
 ; CHECK-LABEL: _f1
-; CHECK:      vpush   {d8, d9, d10, d11}
-; CHECK-NEXT: vmov.i32        q8, #0x0
-; CHECK-NEXT: vmov.u16        r0, d16[0]
-; CHECK-NEXT: vmov    d4, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[1]
-; CHECK-NEXT: vmov    d8, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[2]
-; CHECK-NEXT: vmov    d5, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[3]
-; CHECK-NEXT: vmov    d9, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[0]
-; CHECK-NEXT: vmov    d6, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[1]
-; CHECK-NEXT: vmov    d10, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[2]
-; CHECK-NEXT: vmov    d7, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[3]
-; CHECK-NEXT: vmov    d11, r0, r0
-; CHECK:      vmov.f32        s0, s8
-; CHECK:      vmov.f32        s1, s16
-; CHECK:      vmov.f32        s2, s10
-; CHECK:      vmov.f32        s3, s18
-; CHECK:      vmov.f32        s4, s12
-; CHECK:      vmov.f32        s5, s20
-; CHECK:      vmov.f32        s6, s14
-; CHECK:      vmov.f32        s7, s22
-; CHECK:      vmov.f32        s9, s16
-; CHECK:      vmov.f32        s11, s18
-; CHECK:      vmov.f32        s13, s20
-; CHECK:      vmov.f32        s15, s22
-; CHECK:      vpop    {d8, d9, d10, d11}
+; CHECK: vpush   {d8}
+; CHECK-NEXT: vmov.f64        d8, #5.000000e-01
+; CHECK-NEXT: vmov.i32        d8, #0x0
+; CHECK-NEXT: vmov.i32        d0, #0x0
+; CHECK-NEXT: vmov.i32        d1, #0x0
+; CHECK-NEXT: vmov.i32        d2, #0x0
+; CHECK-NEXT: vmov.i32        d3, #0x0
+; CHECK-NEXT: vmov.i32        d4, #0x0
+; CHECK-NEXT: vmov.i32        d5, #0x0
+; CHECK-NEXT: vmov.i32        d6, #0x0
+; CHECK-NEXT: vmov.i32        d7, #0x0
+; CHECK-NEXT: vmov.f32        s1, s16
+; CHECK-NEXT: vmov.f32        s3, s16
+; CHECK-NEXT: vmov.f32        s5, s16
+; CHECK-NEXT: vmov.f32        s7, s16
+; CHECK-NEXT: vmov.f32        s9, s16
+; CHECK-NEXT: vmov.f32        s11, s16
+; CHECK-NEXT: vmov.f32        s13, s16
+; CHECK-NEXT: vmov.f32        s15, s16
+; CHECK-NEXT: vpop    {d8}
 ; CHECK-NEXT: bx      lr
-
   ret { <8 x half>, <8 x half> } zeroinitializer
 }
 
 define swiftcc { <8 x half>, <8 x half> } @f2() {
 ; CHECK-LABEL: _f2
-; CHECK:      vpush   {d8, d9, d10, d11}
-; CHECK-NEXT: vmov.i32        q8, #0x0
-; CHECK-NEXT: vmov.u16        r0, d16[0]
-; CHECK-NEXT: vmov    d4, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[1]
-; CHECK-NEXT: vmov    d8, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[2]
-; CHECK-NEXT: vmov    d5, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[3]
-; CHECK-NEXT: vmov    d9, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[0]
-; CHECK-NEXT: vmov    d6, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[1]
-; CHECK-NEXT: vmov    d10, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[2]
-; CHECK-NEXT: vmov    d7, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[3]
-; CHECK-NEXT: vmov    d11, r0, r0
-; CHECK:      vmov.f32        s0, s8
-; CHECK:      vmov.f32        s1, s16
-; CHECK:      vmov.f32        s2, s10
-; CHECK:      vmov.f32        s3, s18
-; CHECK:      vmov.f32        s4, s12
-; CHECK:      vmov.f32        s5, s20
-; CHECK:      vmov.f32        s6, s14
-; CHECK:      vmov.f32        s7, s22
-; CHECK:      vmov.f32        s9, s16
-; CHECK:      vmov.f32        s11, s18
-; CHECK:      vmov.f32        s13, s20
-; CHECK:      vmov.f32        s15, s22
-; CHECK-NEXT: vpop    {d8, d9, d10, d11}
+; CHECK: vpush   {d8}
+; CHECK-NEXT: vmov.f64        d8, #5.000000e-01
+; CHECK-NEXT: vmov.i32        d8, #0x0
+; CHECK-NEXT: vmov.i32        d0, #0x0
+; CHECK-NEXT: vmov.i32        d1, #0x0
+; CHECK-NEXT: vmov.i32        d2, #0x0
+; CHECK-NEXT: vmov.i32        d3, #0x0
+; CHECK-NEXT: vmov.i32        d4, #0x0
+; CHECK-NEXT: vmov.i32        d5, #0x0
+; CHECK-NEXT: vmov.i32        d6, #0x0
+; CHECK-NEXT: vmov.i32        d7, #0x0
+; CHECK-NEXT: vmov.f32        s1, s16
+; CHECK-NEXT: vmov.f32        s3, s16
+; CHECK-NEXT: vmov.f32        s5, s16
+; CHECK-NEXT: vmov.f32        s7, s16
+; CHECK-NEXT: vmov.f32        s9, s16
+; CHECK-NEXT: vmov.f32        s11, s16
+; CHECK-NEXT: vmov.f32        s13, s16
+; CHECK-NEXT: vmov.f32        s15, s16
+; CHECK-NEXT: vpop    {d8}
 ; CHECK-NEXT: bx      lr
-
   ret { <8 x half>, <8 x half> } zeroinitializer
 }
 
 define fastcc { <8 x half>, <8 x half> } @f3() {
 ; CHECK-LABEL: _f3
-; CHECK:      vpush   {d8, d9, d10, d11}
-; CHECK-NEXT: vmov.i32        q8, #0x0
-; CHECK-NEXT: vmov.u16        r0, d16[0]
-; CHECK-NEXT: vmov    d4, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[1]
-; CHECK-NEXT: vmov    d8, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[2]
-; CHECK-NEXT: vmov    d5, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d16[3]
-; CHECK-NEXT: vmov    d9, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[0]
-; CHECK-NEXT: vmov    d6, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[1]
-; CHECK-NEXT: vmov    d10, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[2]
-; CHECK-NEXT: vmov    d7, r0, r0
-; CHECK-NEXT: vmov.u16        r0, d17[3]
-; CHECK-NEXT: vmov    d11, r0, r0
-; CHECK:      vmov.f32        s0, s8
-; CHECK:      vmov.f32        s1, s16
-; CHECK:      vmov.f32        s2, s10
-; CHECK:      vmov.f32        s3, s18
-; CHECK:      vmov.f32        s4, s12
-; CHECK:      vmov.f32        s5, s20
-; CHECK:      vmov.f32        s6, s14
-; CHECK:      vmov.f32        s7, s22
-; CHECK:      vmov.f32        s9, s16
-; CHECK:      vmov.f32        s11, s18
-; CHECK:      vmov.f32        s13, s20
-; CHECK:      vmov.f32        s15, s22
-; CHECK-NEXT: vpop    {d8, d9, d10, d11}
+; CHECK: vpush   {d8}
+; CHECK-NEXT: vmov.f64        d8, #5.000000e-01
+; CHECK-NEXT: vmov.i32        d8, #0x0
+; CHECK-NEXT: vmov.i32        d0, #0x0
+; CHECK-NEXT: vmov.i32        d1, #0x0
+; CHECK-NEXT: vmov.i32        d2, #0x0
+; CHECK-NEXT: vmov.i32        d3, #0x0
+; CHECK-NEXT: vmov.i32        d4, #0x0
+; CHECK-NEXT: vmov.i32        d5, #0x0
+; CHECK-NEXT: vmov.i32        d6, #0x0
+; CHECK-NEXT: vmov.i32        d7, #0x0
+; CHECK-NEXT: vmov.f32        s1, s16
+; CHECK-NEXT: vmov.f32        s3, s16
+; CHECK-NEXT: vmov.f32        s5, s16
+; CHECK-NEXT: vmov.f32        s7, s16
+; CHECK-NEXT: vmov.f32        s9, s16
+; CHECK-NEXT: vmov.f32        s11, s16
+; CHECK-NEXT: vmov.f32        s13, s16
+; CHECK-NEXT: vmov.f32        s15, s16
+; CHECK-NEXT: vpop    {d8}
 ; CHECK-NEXT: bx      lr
 
   ret { <8 x half>, <8 x half> } zeroinitializer
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/hfnosplat_cp.ll b/llvm/test/CodeGen/Hexagon/autohvx/hfnosplat_cp.ll
deleted file mode 100644
index 4c5c96e61b78c..0000000000000
--- a/llvm/test/CodeGen/Hexagon/autohvx/hfnosplat_cp.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc -mtriple=hexagon < %s | FileCheck %s
-
-; Check that the vsplat instruction is generated
-; CHECK: .word 1097875824
-; CHECK: .word 1048133241
-; CHECK: .word 0
-
-target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
-target triple = "hexagon"
-; Function Attrs: nofree norecurse nounwind writeonly
-define dso_local i32 @foo(ptr nocapture %a) local_unnamed_addr #0 {
-vector.body:
-  store <40 x half> <half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH4170, half 0xH3E79, half 0xH3E79, half 0xH3E79, half 0xH3E79, half 0xH3E79, half 0xH3E79, half 0xH3E79, half 0xH3E79, half 0xH3E79>, ptr %a, align 2
-  ret i32 0
-}
-
-attributes #0 = { nofree norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv69" "target-features"="+hvx-length128b,+hvxv69,+v69,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/cconv/vector.ll b/llvm/test/CodeGen/Mips/cconv/vector.ll
index 28a7dc046139b..a56005ead73ef 100644
--- a/llvm/test/CodeGen/Mips/cconv/vector.ll
+++ b/llvm/test/CodeGen/Mips/cconv/vector.ll
@@ -3855,77 +3855,81 @@ define void @calli8_16() {
 ; MIPS64EB-NEXT:    jr $ra
 ; MIPS64EB-NEXT:    nop
 ;
-; MIPS32R5-LABEL: calli8_16:
-; MIPS32R5:       # %bb.0: # %entry
-; MIPS32R5-NEXT:    addiu $sp, $sp, -40
-; MIPS32R5-NEXT:    .cfi_def_cfa_offset 40
-; MIPS32R5-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
-; MIPS32R5-NEXT:    .cfi_offset 31, -4
-; MIPS32R5-NEXT:    lui $1, %hi($CPI30_0)
-; MIPS32R5-NEXT:    addiu $1, $1, %lo($CPI30_0)
-; MIPS32R5-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5-NEXT:    copy_s.w $4, $w0[0]
-; MIPS32R5-NEXT:    copy_s.w $5, $w0[1]
-; MIPS32R5-NEXT:    copy_s.w $6, $w0[2]
-; MIPS32R5-NEXT:    copy_s.w $7, $w0[3]
-; MIPS32R5-NEXT:    lui $1, %hi($CPI30_1)
-; MIPS32R5-NEXT:    addiu $1, $1, %lo($CPI30_1)
-; MIPS32R5-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5-NEXT:    copy_s.w $8, $w0[3]
-; MIPS32R5-NEXT:    sw $8, 28($sp)
-; MIPS32R5-NEXT:    sw $3, 24($sp)
-; MIPS32R5-NEXT:    sw $2, 20($sp)
-; MIPS32R5-NEXT:    sw $1, 16($sp)
-; MIPS32R5-NEXT:    jal i8_16
-; MIPS32R5-NEXT:    nop
-; MIPS32R5-NEXT:    lui $1, %hi(gv16i8)
-; MIPS32R5-NEXT:    insert.w $w0[0], $2
-; MIPS32R5-NEXT:    insert.w $w0[1], $3
-; MIPS32R5-NEXT:    addiu $1, $1, %lo(gv16i8)
-; MIPS32R5-NEXT:    insert.w $w0[2], $4
-; MIPS32R5-NEXT:    insert.w $w0[3], $5
-; MIPS32R5-NEXT:    st.w $w0, 0($1)
-; MIPS32R5-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
-; MIPS32R5-NEXT:    addiu $sp, $sp, 40
-; MIPS32R5-NEXT:    jr $ra
-; MIPS32R5-NEXT:    nop
+; MIPS32R5EB-LABEL: calli8_16:
+; MIPS32R5EB:       # %bb.0: # %entry
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -40
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 40
+; MIPS32R5EB-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
+; MIPS32R5EB-NEXT:    lui $1, 3080
+; MIPS32R5EB-NEXT:    ori $1, $1, 2314
+; MIPS32R5EB-NEXT:    lui $2, 1801
+; MIPS32R5EB-NEXT:    sw $1, 28($sp)
+; MIPS32R5EB-NEXT:    ori $1, $2, 1801
+; MIPS32R5EB-NEXT:    sw $1, 24($sp)
+; MIPS32R5EB-NEXT:    sw $1, 20($sp)
+; MIPS32R5EB-NEXT:    sw $1, 16($sp)
+; MIPS32R5EB-NEXT:    lui $1, 1543
+; MIPS32R5EB-NEXT:    ori $4, $1, 1543
+; MIPS32R5EB-NEXT:    ori $7, $1, 2314
+; MIPS32R5EB-NEXT:    move $5, $4
+; MIPS32R5EB-NEXT:    move $6, $4
+; MIPS32R5EB-NEXT:    jal i8_16
+; MIPS32R5EB-NEXT:    nop
+; MIPS32R5EB-NEXT:    insert.w $w0[0], $2
+; MIPS32R5EB-NEXT:    insert.w $w0[1], $3
+; MIPS32R5EB-NEXT:    insert.w $w0[2], $4
+; MIPS32R5EB-NEXT:    lui $1, %hi(gv16i8)
+; MIPS32R5EB-NEXT:    insert.w $w0[3], $5
+; MIPS32R5EB-NEXT:    addiu $1, $1, %lo(gv16i8)
+; MIPS32R5EB-NEXT:    st.w $w0, 0($1)
+; MIPS32R5EB-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 40
+; MIPS32R5EB-NEXT:    jr $ra
+; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: calli8_16:
-; MIPS64R5:       # %bb.0: # %entry
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT:    .cfi_offset 31, -8
-; MIPS64R5-NEXT:    .cfi_offset 28, -16
-; MIPS64R5-NEXT:    lui $1, %hi(%neg(%gp_rel(calli8_16)))
-; MIPS64R5-NEXT:    daddu $1, $1, $25
-; MIPS64R5-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16)))
-; MIPS64R5-NEXT:    ld $1, %got_page(.LCPI30_0)($gp)
-; MIPS64R5-NEXT:    daddiu $1, $1, %got_ofst(.LCPI30_0)
-; MIPS64R5-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5-NEXT:    copy_s.d $4, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $5, $w0[1]
-; MIPS64R5-NEXT:    ld $1, %got_page(.LCPI30_1)($gp)
-; MIPS64R5-NEXT:    daddiu $1, $1, %got_ofst(.LCPI30_1)
-; MIPS64R5-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5-NEXT:    copy_s.d $6, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $7, $w0[1]
-; MIPS64R5-NEXT:    ld $25, %call16(i8_16)($gp)
-; MIPS64R5-NEXT:    jalr $25
-; MIPS64R5-NEXT:    nop
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $3
-; MIPS64R5-NEXT:    ld $1, %got_disp(gv16i8)($gp)
-; MIPS64R5-NEXT:    st.d $w0, 0($1)
-; MIPS64R5-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 16
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: calli8_16:
+; MIPS64R5EB:       # %bb.0: # %entry
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
+; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
+; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(calli8_16)))
+; MIPS64R5EB-NEXT:    daddu $1, $1, $25
+; MIPS64R5EB-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16)))
+; MIPS64R5EB-NEXT:    lui $1, 1801
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 1801
+; MIPS64R5EB-NEXT:    dsll $1, $1, 16
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 1801
+; MIPS64R5EB-NEXT:    lui $2, 1543
+; MIPS64R5EB-NEXT:    dsll $1, $1, 16
+; MIPS64R5EB-NEXT:    daddiu $2, $2, 1543
+; MIPS64R5EB-NEXT:    dsll $2, $2, 16
+; MIPS64R5EB-NEXT:    daddiu $2, $2, 1543
+; MIPS64R5EB-NEXT:    dsll $2, $2, 16
+; MIPS64R5EB-NEXT:    daddiu $4, $2, 1543
+; MIPS64R5EB-NEXT:    daddiu $5, $2, 2314
+; MIPS64R5EB-NEXT:    daddiu $6, $1, 1801
+; MIPS64R5EB-NEXT:    lui $1, 225
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 8417
+; MIPS64R5EB-NEXT:    dsll $1, $1, 16
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 8577
+; MIPS64R5EB-NEXT:    dsll $1, $1, 19
+; MIPS64R5EB-NEXT:    daddiu $7, $1, 2314
+; MIPS64R5EB-NEXT:    ld $25, %call16(i8_16)($gp)
+; MIPS64R5EB-NEXT:    jalr $25
+; MIPS64R5EB-NEXT:    nop
+; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv16i8)($gp)
+; MIPS64R5EB-NEXT:    insert.d $w0[0], $2
+; MIPS64R5EB-NEXT:    insert.d $w0[1], $3
+; MIPS64R5EB-NEXT:    st.d $w0, 0($1)
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
 ;
 ; MIPS32EL-LABEL: calli8_16:
 ; MIPS32EL:       # %bb.0: # %entry
@@ -4005,6 +4009,87 @@ define void @calli8_16() {
 ; MIPS64EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    nop
+;
+; MIPS32R5EL-LABEL: calli8_16:
+; MIPS32R5EL:       # %bb.0: # %entry
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, -40
+; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 40
+; MIPS32R5EL-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
+; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
+; MIPS32R5EL-NEXT:    lui $1, 2569
+; MIPS32R5EL-NEXT:    ori $2, $1, 2060
+; MIPS32R5EL-NEXT:    lui $3, 2311
+; MIPS32R5EL-NEXT:    sw $2, 28($sp)
+; MIPS32R5EL-NEXT:    ori $2, $3, 2311
+; MIPS32R5EL-NEXT:    sw $2, 24($sp)
+; MIPS32R5EL-NEXT:    sw $2, 20($sp)
+; MIPS32R5EL-NEXT:    sw $2, 16($sp)
+; MIPS32R5EL-NEXT:    lui $2, 1798
+; MIPS32R5EL-NEXT:    ori $4, $2, 1798
+; MIPS32R5EL-NEXT:    ori $7, $1, 1798
+; MIPS32R5EL-NEXT:    move $5, $4
+; MIPS32R5EL-NEXT:    move $6, $4
+; MIPS32R5EL-NEXT:    jal i8_16
+; MIPS32R5EL-NEXT:    nop
+; MIPS32R5EL-NEXT:    insert.w $w0[0], $2
+; MIPS32R5EL-NEXT:    insert.w $w0[1], $3
+; MIPS32R5EL-NEXT:    insert.w $w0[2], $4
+; MIPS32R5EL-NEXT:    lui $1, %hi(gv16i8)
+; MIPS32R5EL-NEXT:    insert.w $w0[3], $5
+; MIPS32R5EL-NEXT:    addiu $1, $1, %lo(gv16i8)
+; MIPS32R5EL-NEXT:    st.w $w0, 0($1)
+; MIPS32R5EL-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, 40
+; MIPS32R5EL-NEXT:    jr $ra
+; MIPS32R5EL-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: calli8_16:
+; MIPS64R5EL:       # %bb.0: # %entry
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
+; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
+; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(calli8_16)))
+; MIPS64R5EL-NEXT:    daddu $1, $1, $25
+; MIPS64R5EL-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calli8_16)))
+; MIPS64R5EL-NEXT:    lui $1, 1285
+; MIPS64R5EL-NEXT:    daddiu $1, $1, -31869
+; MIPS64R5EL-NEXT:    dsll $1, $1, 16
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 899
+; MIPS64R5EL-NEXT:    lui $2, 2311
+; MIPS64R5EL-NEXT:    daddiu $2, $2, 2311
+; MIPS64R5EL-NEXT:    dsll $2, $2, 16
+; MIPS64R5EL-NEXT:    daddiu $2, $2, 2311
+; MIPS64R5EL-NEXT:    dsll $2, $2, 16
+; MIPS64R5EL-NEXT:    dsll $1, $1, 17
+; MIPS64R5EL-NEXT:    lui $3, 899
+; MIPS64R5EL-NEXT:    daddiu $3, $3, 899
+; MIPS64R5EL-NEXT:    dsll $3, $3, 16
+; MIPS64R5EL-NEXT:    daddiu $3, $3, 899
+; MIPS64R5EL-NEXT:    dsll $3, $3, 17
+; MIPS64R5EL-NEXT:    daddiu $4, $3, 1798
+; MIPS64R5EL-NEXT:    daddiu $5, $1, 1798
+; MIPS64R5EL-NEXT:    daddiu $6, $2, 2311
+; MIPS64R5EL-NEXT:    lui $1, 642
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 16899
+; MIPS64R5EL-NEXT:    dsll $1, $1, 18
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 2311
+; MIPS64R5EL-NEXT:    dsll $1, $1, 16
+; MIPS64R5EL-NEXT:    daddiu $7, $1, 2311
+; MIPS64R5EL-NEXT:    ld $25, %call16(i8_16)($gp)
+; MIPS64R5EL-NEXT:    jalr $25
+; MIPS64R5EL-NEXT:    nop
+; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv16i8)($gp)
+; MIPS64R5EL-NEXT:    insert.d $w0[0], $2
+; MIPS64R5EL-NEXT:    insert.d $w0[1], $3
+; MIPS64R5EL-NEXT:    st.d $w0, 0($1)
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 entry:
   %0 = call <16 x i8> @i8_16(<16 x i8> <i8 6, i8 7,i8 6, i8 7,i8 6, i8 7,i8 6, i8 7,i8 6, i8 7,i8 6, i8 7, i8 6, i8 7, i8 9, i8 10>, <16 x i8> <i8 7, i8 9,i8 7, i8 9,i8 7, i8 9,i8 7, i8 9,i8 7, i8 9,i8 7, i8 9,i8 12, i8 8, i8 9, i8 10>)
   store <16 x i8> %0, ptr @gv16i8
@@ -4510,36 +4595,26 @@ define void @calli16_8() {
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 40
 ; MIPS32R5EB-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
+; MIPS32R5EB-NEXT:    lui $1, 9
+; MIPS32R5EB-NEXT:    ori $5, $1, 10
+; MIPS32R5EB-NEXT:    sw $5, 28($sp)
+; MIPS32R5EB-NEXT:    lui $1, 12
+; MIPS32R5EB-NEXT:    ori $1, $1, 8
+; MIPS32R5EB-NEXT:    sw $1, 24($sp)
+; MIPS32R5EB-NEXT:    sw $5, 20($sp)
 ; MIPS32R5EB-NEXT:    lui $1, 6
-; MIPS32R5EB-NEXT:    ori $1, $1, 7
-; MIPS32R5EB-NEXT:    lui $2, 9
-; MIPS32R5EB-NEXT:    ori $2, $2, 10
-; MIPS32R5EB-NEXT:    fill.w $w0, $2
-; MIPS32R5EB-NEXT:    insert.w $w0[1], $1
-; MIPS32R5EB-NEXT:    splati.d $w0, $w0[0]
-; MIPS32R5EB-NEXT:    copy_s.w $4, $w0[0]
-; MIPS32R5EB-NEXT:    copy_s.w $5, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $6, $w0[2]
-; MIPS32R5EB-NEXT:    copy_s.w $7, $w0[3]
-; MIPS32R5EB-NEXT:    lui $1, %hi($CPI33_0)
-; MIPS32R5EB-NEXT:    addiu $1, $1, %lo($CPI33_0)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5EB-NEXT:    copy_s.w $8, $w0[3]
-; MIPS32R5EB-NEXT:    sw $8, 28($sp)
-; MIPS32R5EB-NEXT:    sw $3, 24($sp)
-; MIPS32R5EB-NEXT:    sw $2, 20($sp)
-; MIPS32R5EB-NEXT:    sw $1, 16($sp)
+; MIPS32R5EB-NEXT:    ori $4, $1, 7
+; MIPS32R5EB-NEXT:    sw $4, 16($sp)
+; MIPS32R5EB-NEXT:    move $6, $4
+; MIPS32R5EB-NEXT:    move $7, $5
 ; MIPS32R5EB-NEXT:    jal i16_8
 ; MIPS32R5EB-NEXT:    nop
-; MIPS32R5EB-NEXT:    lui $1, %hi(gv8i16)
-; MIPS32R5EB-NEXT:    addiu $1, $1, %lo(gv8i16)
 ; MIPS32R5EB-NEXT:    insert.w $w0[0], $2
 ; MIPS32R5EB-NEXT:    insert.w $w0[1], $3
 ; MIPS32R5EB-NEXT:    insert.w $w0[2], $4
+; MIPS32R5EB-NEXT:    lui $1, %hi(gv8i16)
 ; MIPS32R5EB-NEXT:    insert.w $w0[3], $5
+; MIPS32R5EB-NEXT:    addiu $1, $1, %lo(gv8i16)
 ; MIPS32R5EB-NEXT:    st.w $w0, 0($1)
 ; MIPS32R5EB-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
 ; MIPS32R5EB-NEXT:    addiu $sp, $sp, 40
@@ -4557,20 +4632,21 @@ define void @calli16_8() {
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(calli16_8)))
 ; MIPS64R5EB-NEXT:    daddu $1, $1, $25
 ; MIPS64R5EB-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8)))
-; MIPS64R5EB-NEXT:    lui $1, 9
-; MIPS64R5EB-NEXT:    ori $1, $1, 10
-; MIPS64R5EB-NEXT:    lui $2, 6
-; MIPS64R5EB-NEXT:    ori $2, $2, 7
-; MIPS64R5EB-NEXT:    dinsu $1, $2, 32, 32
-; MIPS64R5EB-NEXT:    fill.d $w0, $1
-; MIPS64R5EB-NEXT:    copy_s.d $4, $w0[0]
-; MIPS64R5EB-NEXT:    copy_s.d $5, $w0[1]
-; MIPS64R5EB-NEXT:    ld $1, %got_page(.LCPI33_0)($gp)
-; MIPS64R5EB-NEXT:    daddiu $1, $1, %got_ofst(.LCPI33_0)
-; MIPS64R5EB-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5EB-NEXT:    copy_s.d $6, $w0[0]
-; MIPS64R5EB-NEXT:    copy_s.d $7, $w0[1]
+; MIPS64R5EB-NEXT:    lui $1, 6
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 7
+; MIPS64R5EB-NEXT:    dsll $1, $1, 16
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 9
+; MIPS64R5EB-NEXT:    dsll $1, $1, 16
+; MIPS64R5EB-NEXT:    daddiu $4, $1, 10
+; MIPS64R5EB-NEXT:    lui $1, 2
+; MIPS64R5EB-NEXT:    daddiu $1, $1, -32767
+; MIPS64R5EB-NEXT:    dsll $1, $1, 19
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 9
+; MIPS64R5EB-NEXT:    dsll $1, $1, 16
+; MIPS64R5EB-NEXT:    daddiu $7, $1, 10
 ; MIPS64R5EB-NEXT:    ld $25, %call16(i16_8)($gp)
+; MIPS64R5EB-NEXT:    move $5, $4
+; MIPS64R5EB-NEXT:    move $6, $4
 ; MIPS64R5EB-NEXT:    jalr $25
 ; MIPS64R5EB-NEXT:    nop
 ; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv8i16)($gp)
@@ -4658,35 +4734,25 @@ define void @calli16_8() {
 ; MIPS32R5EL-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
 ; MIPS32R5EL-NEXT:    lui $1, 10
-; MIPS32R5EL-NEXT:    ori $1, $1, 9
-; MIPS32R5EL-NEXT:    lui $2, 7
-; MIPS32R5EL-NEXT:    ori $2, $2, 6
-; MIPS32R5EL-NEXT:    fill.w $w0, $2
-; MIPS32R5EL-NEXT:    insert.w $w0[1], $1
-; MIPS32R5EL-NEXT:    splati.d $w0, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $4, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $5, $w0[1]
-; MIPS32R5EL-NEXT:    copy_s.w $6, $w0[2]
-; MIPS32R5EL-NEXT:    copy_s.w $7, $w0[3]
-; MIPS32R5EL-NEXT:    lui $1, %hi($CPI33_0)
-; MIPS32R5EL-NEXT:    addiu $1, $1, %lo($CPI33_0)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5EL-NEXT:    copy_s.w $8, $w0[3]
-; MIPS32R5EL-NEXT:    sw $8, 28($sp)
-; MIPS32R5EL-NEXT:    sw $3, 24($sp)
-; MIPS32R5EL-NEXT:    sw $2, 20($sp)
-; MIPS32R5EL-NEXT:    sw $1, 16($sp)
+; MIPS32R5EL-NEXT:    ori $5, $1, 9
+; MIPS32R5EL-NEXT:    sw $5, 28($sp)
+; MIPS32R5EL-NEXT:    lui $1, 8
+; MIPS32R5EL-NEXT:    ori $1, $1, 12
+; MIPS32R5EL-NEXT:    sw $1, 24($sp)
+; MIPS32R5EL-NEXT:    sw $5, 20($sp)
+; MIPS32R5EL-NEXT:    lui $1, 7
+; MIPS32R5EL-NEXT:    ori $4, $1, 6
+; MIPS32R5EL-NEXT:    sw $4, 16($sp)
+; MIPS32R5EL-NEXT:    move $6, $4
+; MIPS32R5EL-NEXT:    move $7, $5
 ; MIPS32R5EL-NEXT:    jal i16_8
 ; MIPS32R5EL-NEXT:    nop
-; MIPS32R5EL-NEXT:    lui $1, %hi(gv8i16)
-; MIPS32R5EL-NEXT:    addiu $1, $1, %lo(gv8i16)
 ; MIPS32R5EL-NEXT:    insert.w $w0[0], $2
 ; MIPS32R5EL-NEXT:    insert.w $w0[1], $3
 ; MIPS32R5EL-NEXT:    insert.w $w0[2], $4
+; MIPS32R5EL-NEXT:    lui $1, %hi(gv8i16)
 ; MIPS32R5EL-NEXT:    insert.w $w0[3], $5
+; MIPS32R5EL-NEXT:    addiu $1, $1, %lo(gv8i16)
 ; MIPS32R5EL-NEXT:    st.w $w0, 0($1)
 ; MIPS32R5EL-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
 ; MIPS32R5EL-NEXT:    addiu $sp, $sp, 40
@@ -4704,20 +4770,21 @@ define void @calli16_8() {
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(calli16_8)))
 ; MIPS64R5EL-NEXT:    daddu $1, $1, $25
 ; MIPS64R5EL-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calli16_8)))
-; MIPS64R5EL-NEXT:    lui $1, 7
-; MIPS64R5EL-NEXT:    ori $1, $1, 6
-; MIPS64R5EL-NEXT:    lui $2, 10
-; MIPS64R5EL-NEXT:    ori $2, $2, 9
-; MIPS64R5EL-NEXT:    dinsu $1, $2, 32, 32
-; MIPS64R5EL-NEXT:    fill.d $w0, $1
-; MIPS64R5EL-NEXT:    copy_s.d $4, $w0[0]
-; MIPS64R5EL-NEXT:    copy_s.d $5, $w0[1]
-; MIPS64R5EL-NEXT:    ld $1, %got_page(.LCPI33_0)($gp)
-; MIPS64R5EL-NEXT:    daddiu $1, $1, %got_ofst(.LCPI33_0)
-; MIPS64R5EL-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5EL-NEXT:    copy_s.d $6, $w0[0]
-; MIPS64R5EL-NEXT:    copy_s.d $7, $w0[1]
+; MIPS64R5EL-NEXT:    lui $1, 10
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 9
+; MIPS64R5EL-NEXT:    dsll $1, $1, 16
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 7
+; MIPS64R5EL-NEXT:    dsll $1, $1, 16
+; MIPS64R5EL-NEXT:    daddiu $4, $1, 6
+; MIPS64R5EL-NEXT:    lui $1, 1
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 16385
+; MIPS64R5EL-NEXT:    dsll $1, $1, 16
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 8193
+; MIPS64R5EL-NEXT:    dsll $1, $1, 19
+; MIPS64R5EL-NEXT:    daddiu $7, $1, 12
 ; MIPS64R5EL-NEXT:    ld $25, %call16(i16_8)($gp)
+; MIPS64R5EL-NEXT:    move $5, $4
+; MIPS64R5EL-NEXT:    move $6, $4
 ; MIPS64R5EL-NEXT:    jalr $25
 ; MIPS64R5EL-NEXT:    nop
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv8i16)($gp)
@@ -4989,39 +5056,38 @@ define void @calli32_4() {
 ; MIPS32R5-NEXT:    jr $ra
 ; MIPS32R5-NEXT:    nop
 ;
-; MIPS64R5-LABEL: calli32_4:
-; MIPS64R5:       # %bb.0: # %entry
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT:    .cfi_offset 31, -8
-; MIPS64R5-NEXT:    .cfi_offset 28, -16
-; MIPS64R5-NEXT:    lui $1, %hi(%neg(%gp_rel(calli32_4)))
-; MIPS64R5-NEXT:    daddu $1, $1, $25
-; MIPS64R5-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4)))
-; MIPS64R5-NEXT:    ld $1, %got_page(.LCPI35_0)($gp)
-; MIPS64R5-NEXT:    daddiu $1, $1, %got_ofst(.LCPI35_0)
-; MIPS64R5-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5-NEXT:    copy_s.d $4, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $5, $w0[1]
-; MIPS64R5-NEXT:    ld $1, %got_page(.LCPI35_1)($gp)
-; MIPS64R5-NEXT:    daddiu $1, $1, %got_ofst(.LCPI35_1)
-; MIPS64R5-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5-NEXT:    copy_s.d $6, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $7, $w0[1]
-; MIPS64R5-NEXT:    ld $25, %call16(i32_4)($gp)
-; MIPS64R5-NEXT:    jalr $25
-; MIPS64R5-NEXT:    nop
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $3
-; MIPS64R5-NEXT:    ld $1, %got_disp(gv4i32)($gp)
-; MIPS64R5-NEXT:    st.d $w0, 0($1)
-; MIPS64R5-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 16
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: calli32_4:
+; MIPS64R5EB:       # %bb.0: # %entry
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
+; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
+; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(calli32_4)))
+; MIPS64R5EB-NEXT:    daddu $1, $1, $25
+; MIPS64R5EB-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4)))
+; MIPS64R5EB-NEXT:    daddiu $1, $zero, 3
+; MIPS64R5EB-NEXT:    dsll $2, $1, 33
+; MIPS64R5EB-NEXT:    daddiu $4, $2, 7
+; MIPS64R5EB-NEXT:    dsll $1, $1, 34
+; MIPS64R5EB-NEXT:    daddiu $6, $1, 8
+; MIPS64R5EB-NEXT:    daddiu $1, $zero, 9
+; MIPS64R5EB-NEXT:    dsll $1, $1, 32
+; MIPS64R5EB-NEXT:    daddiu $5, $1, 10
+; MIPS64R5EB-NEXT:    ld $25, %call16(i32_4)($gp)
+; MIPS64R5EB-NEXT:    move $7, $5
+; MIPS64R5EB-NEXT:    jalr $25
+; MIPS64R5EB-NEXT:    nop
+; MIPS64R5EB-NEXT:    insert.d $w0[0], $2
+; MIPS64R5EB-NEXT:    insert.d $w0[1], $3
+; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv4i32)($gp)
+; MIPS64R5EB-NEXT:    st.d $w0, 0($1)
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
 ;
 ; MIPS64EL-LABEL: calli32_4:
 ; MIPS64EL:       # %bb.0: # %entry
@@ -5055,6 +5121,40 @@ define void @calli32_4() {
 ; MIPS64EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: calli32_4:
+; MIPS64R5EL:       # %bb.0: # %entry
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
+; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
+; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(calli32_4)))
+; MIPS64R5EL-NEXT:    daddu $1, $1, $25
+; MIPS64R5EL-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calli32_4)))
+; MIPS64R5EL-NEXT:    daddiu $1, $zero, 7
+; MIPS64R5EL-NEXT:    dsll $1, $1, 32
+; MIPS64R5EL-NEXT:    daddiu $4, $1, 6
+; MIPS64R5EL-NEXT:    daddiu $1, $zero, 1
+; MIPS64R5EL-NEXT:    dsll $1, $1, 35
+; MIPS64R5EL-NEXT:    daddiu $6, $1, 12
+; MIPS64R5EL-NEXT:    daddiu $1, $zero, 5
+; MIPS64R5EL-NEXT:    dsll $1, $1, 33
+; MIPS64R5EL-NEXT:    daddiu $5, $1, 9
+; MIPS64R5EL-NEXT:    ld $25, %call16(i32_4)($gp)
+; MIPS64R5EL-NEXT:    move $7, $5
+; MIPS64R5EL-NEXT:    jalr $25
+; MIPS64R5EL-NEXT:    nop
+; MIPS64R5EL-NEXT:    insert.d $w0[0], $2
+; MIPS64R5EL-NEXT:    insert.d $w0[1], $3
+; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv4i32)($gp)
+; MIPS64R5EL-NEXT:    st.d $w0, 0($1)
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 entry:
   %0 = call <4 x i32> @i32_4(<4 x i32> <i32 6, i32 7, i32 9, i32 10>, <4 x i32> <i32 12, i32 8, i32 9, i32 10>)
   store <4 x i32> %0, ptr @gv4i32
@@ -5124,30 +5224,24 @@ define void @calli64_2() {
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 40
 ; MIPS32R5EB-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EB-NEXT:    lui $1, %hi($CPI36_0)
-; MIPS32R5EB-NEXT:    addiu $1, $1, %lo($CPI36_0)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5EB-NEXT:    copy_s.w $5, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $7, $w0[3]
-; MIPS32R5EB-NEXT:    lui $1, %hi($CPI36_1)
-; MIPS32R5EB-NEXT:    addiu $1, $1, %lo($CPI36_1)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[1]
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[3]
-; MIPS32R5EB-NEXT:    sw $2, 28($sp)
+; MIPS32R5EB-NEXT:    addiu $1, $zero, 8
+; MIPS32R5EB-NEXT:    sw $1, 28($sp)
+; MIPS32R5EB-NEXT:    addiu $1, $zero, 12
 ; MIPS32R5EB-NEXT:    sw $1, 20($sp)
 ; MIPS32R5EB-NEXT:    sw $zero, 24($sp)
 ; MIPS32R5EB-NEXT:    sw $zero, 16($sp)
 ; MIPS32R5EB-NEXT:    addiu $4, $zero, 0
+; MIPS32R5EB-NEXT:    addiu $5, $zero, 6
 ; MIPS32R5EB-NEXT:    addiu $6, $zero, 0
+; MIPS32R5EB-NEXT:    addiu $7, $zero, 7
 ; MIPS32R5EB-NEXT:    jal i64_2
 ; MIPS32R5EB-NEXT:    nop
-; MIPS32R5EB-NEXT:    lui $1, %hi(gv2i64)
 ; MIPS32R5EB-NEXT:    insert.w $w0[0], $2
 ; MIPS32R5EB-NEXT:    insert.w $w0[1], $3
-; MIPS32R5EB-NEXT:    addiu $1, $1, %lo(gv2i64)
 ; MIPS32R5EB-NEXT:    insert.w $w0[2], $4
+; MIPS32R5EB-NEXT:    lui $1, %hi(gv2i64)
 ; MIPS32R5EB-NEXT:    insert.w $w0[3], $5
+; MIPS32R5EB-NEXT:    addiu $1, $1, %lo(gv2i64)
 ; MIPS32R5EB-NEXT:    st.w $w0, 0($1)
 ; MIPS32R5EB-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
 ; MIPS32R5EB-NEXT:    addiu $sp, $sp, 40
@@ -5217,30 +5311,24 @@ define void @calli64_2() {
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 40
 ; MIPS32R5EL-NEXT:    sw $ra, 36($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EL-NEXT:    lui $1, %hi($CPI36_0)
-; MIPS32R5EL-NEXT:    addiu $1, $1, %lo($CPI36_0)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5EL-NEXT:    copy_s.w $4, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $6, $w0[2]
-; MIPS32R5EL-NEXT:    lui $1, %hi($CPI36_1)
-; MIPS32R5EL-NEXT:    addiu $1, $1, %lo($CPI36_1)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[0]
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[2]
-; MIPS32R5EL-NEXT:    sw $2, 24($sp)
+; MIPS32R5EL-NEXT:    addiu $1, $zero, 8
+; MIPS32R5EL-NEXT:    sw $1, 24($sp)
+; MIPS32R5EL-NEXT:    addiu $1, $zero, 12
 ; MIPS32R5EL-NEXT:    sw $1, 16($sp)
 ; MIPS32R5EL-NEXT:    sw $zero, 28($sp)
 ; MIPS32R5EL-NEXT:    sw $zero, 20($sp)
+; MIPS32R5EL-NEXT:    addiu $4, $zero, 6
 ; MIPS32R5EL-NEXT:    addiu $5, $zero, 0
+; MIPS32R5EL-NEXT:    addiu $6, $zero, 7
 ; MIPS32R5EL-NEXT:    addiu $7, $zero, 0
 ; MIPS32R5EL-NEXT:    jal i64_2
 ; MIPS32R5EL-NEXT:    nop
-; MIPS32R5EL-NEXT:    lui $1, %hi(gv2i64)
 ; MIPS32R5EL-NEXT:    insert.w $w0[0], $2
 ; MIPS32R5EL-NEXT:    insert.w $w0[1], $3
-; MIPS32R5EL-NEXT:    addiu $1, $1, %lo(gv2i64)
 ; MIPS32R5EL-NEXT:    insert.w $w0[2], $4
+; MIPS32R5EL-NEXT:    lui $1, %hi(gv2i64)
 ; MIPS32R5EL-NEXT:    insert.w $w0[3], $5
+; MIPS32R5EL-NEXT:    addiu $1, $1, %lo(gv2i64)
 ; MIPS32R5EL-NEXT:    st.w $w0, 0($1)
 ; MIPS32R5EL-NEXT:    lw $ra, 36($sp) # 4-byte Folded Reload
 ; MIPS32R5EL-NEXT:    addiu $sp, $sp, 40
@@ -5496,27 +5584,21 @@ define void @callfloat_4() {
 ; MIPS32R5-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5-NEXT:    and $sp, $sp, $1
-; MIPS32R5-NEXT:    lui $1, %hi($CPI38_0)
-; MIPS32R5-NEXT:    addiu $1, $1, %lo($CPI38_0)
-; MIPS32R5-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5-NEXT:    copy_s.w $7, $w0[1]
-; MIPS32R5-NEXT:    copy_s.w $1, $w0[2]
-; MIPS32R5-NEXT:    copy_s.w $2, $w0[3]
-; MIPS32R5-NEXT:    lui $3, %hi($CPI38_1)
-; MIPS32R5-NEXT:    addiu $3, $3, %lo($CPI38_1)
-; MIPS32R5-NEXT:    ld.w $w0, 0($3)
-; MIPS32R5-NEXT:    copy_s.w $3, $w0[0]
-; MIPS32R5-NEXT:    copy_s.w $4, $w0[1]
-; MIPS32R5-NEXT:    copy_s.w $5, $w0[2]
-; MIPS32R5-NEXT:    copy_s.w $6, $w0[3]
-; MIPS32R5-NEXT:    sw $6, 36($sp)
-; MIPS32R5-NEXT:    sw $5, 32($sp)
-; MIPS32R5-NEXT:    sw $4, 28($sp)
-; MIPS32R5-NEXT:    sw $3, 24($sp)
-; MIPS32R5-NEXT:    sw $2, 20($sp)
+; MIPS32R5-NEXT:    lui $1, 16704
+; MIPS32R5-NEXT:    lui $2, 16736
+; MIPS32R5-NEXT:    lui $3, 16752
+; MIPS32R5-NEXT:    lui $4, 16768
+; MIPS32R5-NEXT:    sw $4, 36($sp)
+; MIPS32R5-NEXT:    sw $3, 32($sp)
+; MIPS32R5-NEXT:    sw $2, 28($sp)
+; MIPS32R5-NEXT:    sw $1, 24($sp)
+; MIPS32R5-NEXT:    lui $1, 16512
+; MIPS32R5-NEXT:    sw $1, 20($sp)
+; MIPS32R5-NEXT:    lui $1, 16384
 ; MIPS32R5-NEXT:    sw $1, 16($sp)
 ; MIPS32R5-NEXT:    addiu $4, $sp, 48
 ; MIPS32R5-NEXT:    addiu $6, $zero, 0
+; MIPS32R5-NEXT:    lui $7, 49024
 ; MIPS32R5-NEXT:    jal float4_extern
 ; MIPS32R5-NEXT:    nop
 ; MIPS32R5-NEXT:    lui $1, %hi(gv4f32)
@@ -5530,39 +5612,43 @@ define void @callfloat_4() {
 ; MIPS32R5-NEXT:    jr $ra
 ; MIPS32R5-NEXT:    nop
 ;
-; MIPS64R5-LABEL: callfloat_4:
-; MIPS64R5:       # %bb.0: # %entry
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -16
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R5-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
-; MIPS64R5-NEXT:    .cfi_offset 31, -8
-; MIPS64R5-NEXT:    .cfi_offset 28, -16
-; MIPS64R5-NEXT:    lui $1, %hi(%neg(%gp_rel(callfloat_4)))
-; MIPS64R5-NEXT:    daddu $1, $1, $25
-; MIPS64R5-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4)))
-; MIPS64R5-NEXT:    ld $1, %got_page(.LCPI38_0)($gp)
-; MIPS64R5-NEXT:    daddiu $1, $1, %got_ofst(.LCPI38_0)
-; MIPS64R5-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5-NEXT:    copy_s.d $4, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $5, $w0[1]
-; MIPS64R5-NEXT:    ld $1, %got_page(.LCPI38_1)($gp)
-; MIPS64R5-NEXT:    daddiu $1, $1, %got_ofst(.LCPI38_1)
-; MIPS64R5-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5-NEXT:    copy_s.d $6, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $7, $w0[1]
-; MIPS64R5-NEXT:    ld $25, %call16(float4_extern)($gp)
-; MIPS64R5-NEXT:    jalr $25
-; MIPS64R5-NEXT:    nop
-; MIPS64R5-NEXT:    insert.d $w0[0], $2
-; MIPS64R5-NEXT:    insert.d $w0[1], $3
-; MIPS64R5-NEXT:    ld $1, %got_disp(gv4f32)($gp)
-; MIPS64R5-NEXT:    st.d $w0, 0($1)
-; MIPS64R5-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 16
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: callfloat_4:
+; MIPS64R5EB:       # %bb.0: # %entry
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EB-NEXT:    .cfi_offset 31, -8
+; MIPS64R5EB-NEXT:    .cfi_offset 28, -16
+; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(callfloat_4)))
+; MIPS64R5EB-NEXT:    daddu $1, $1, $25
+; MIPS64R5EB-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4)))
+; MIPS64R5EB-NEXT:    daddiu $1, $zero, 1
+; MIPS64R5EB-NEXT:    dsll $1, $1, 39
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 129
+; MIPS64R5EB-NEXT:    daddiu $2, $zero, 261
+; MIPS64R5EB-NEXT:    dsll $2, $2, 33
+; MIPS64R5EB-NEXT:    daddiu $3, $zero, 383
+; MIPS64R5EB-NEXT:    dsll $4, $3, 23
+; MIPS64R5EB-NEXT:    dsll $5, $1, 23
+; MIPS64R5EB-NEXT:    daddiu $1, $2, 523
+; MIPS64R5EB-NEXT:    dsll $6, $1, 21
+; MIPS64R5EB-NEXT:    daddiu $1, $zero, 1047
+; MIPS64R5EB-NEXT:    dsll $1, $1, 29
+; MIPS64R5EB-NEXT:    daddiu $1, $1, 131
+; MIPS64R5EB-NEXT:    dsll $7, $1, 23
+; MIPS64R5EB-NEXT:    ld $25, %call16(float4_extern)($gp)
+; MIPS64R5EB-NEXT:    jalr $25
+; MIPS64R5EB-NEXT:    nop
+; MIPS64R5EB-NEXT:    insert.d $w0[0], $2
+; MIPS64R5EB-NEXT:    insert.d $w0[1], $3
+; MIPS64R5EB-NEXT:    ld $1, %got_disp(gv4f32)($gp)
+; MIPS64R5EB-NEXT:    st.d $w0, 0($1)
+; MIPS64R5EB-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
 ;
 ; MIPS64EL-LABEL: callfloat_4:
 ; MIPS64EL:       # %bb.0: # %entry
@@ -5600,6 +5686,44 @@ define void @callfloat_4() {
 ; MIPS64EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: callfloat_4:
+; MIPS64R5EL:       # %bb.0: # %entry
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    sd $gp, 0($sp) # 8-byte Folded Spill
+; MIPS64R5EL-NEXT:    .cfi_offset 31, -8
+; MIPS64R5EL-NEXT:    .cfi_offset 28, -16
+; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(callfloat_4)))
+; MIPS64R5EL-NEXT:    daddu $1, $1, $25
+; MIPS64R5EL-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(callfloat_4)))
+; MIPS64R5EL-NEXT:    daddiu $1, $zero, 129
+; MIPS64R5EL-NEXT:    dsll $1, $1, 25
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 1
+; MIPS64R5EL-NEXT:    daddiu $2, $zero, 523
+; MIPS64R5EL-NEXT:    dsll $2, $2, 31
+; MIPS64R5EL-NEXT:    daddiu $3, $zero, 383
+; MIPS64R5EL-NEXT:    dsll $4, $3, 55
+; MIPS64R5EL-NEXT:    dsll $5, $1, 30
+; MIPS64R5EL-NEXT:    daddiu $1, $2, 261
+; MIPS64R5EL-NEXT:    dsll $6, $1, 22
+; MIPS64R5EL-NEXT:    daddiu $1, $zero, 131
+; MIPS64R5EL-NEXT:    dsll $1, $1, 35
+; MIPS64R5EL-NEXT:    daddiu $1, $1, 1047
+; MIPS64R5EL-NEXT:    dsll $7, $1, 20
+; MIPS64R5EL-NEXT:    ld $25, %call16(float4_extern)($gp)
+; MIPS64R5EL-NEXT:    jalr $25
+; MIPS64R5EL-NEXT:    nop
+; MIPS64R5EL-NEXT:    insert.d $w0[0], $2
+; MIPS64R5EL-NEXT:    insert.d $w0[1], $3
+; MIPS64R5EL-NEXT:    ld $1, %got_disp(gv4f32)($gp)
+; MIPS64R5EL-NEXT:    st.d $w0, 0($1)
+; MIPS64R5EL-NEXT:    ld $gp, 0($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
 entry:
   %0 = call <4 x float> @float4_extern(<4 x float> <float 0.0, float -1.0, float 2.0, float 4.0>, <4 x float> <float 12.0, float 14.0, float 15.0, float 16.0>)
   store <4 x float> %0, ptr @gv4f32
@@ -5688,17 +5812,11 @@ define void @calldouble_2() {
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EB-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EB-NEXT:    and $sp, $sp, $1
-; MIPS32R5EB-NEXT:    lui $1, %hi($CPI39_0)
-; MIPS32R5EB-NEXT:    addiu $1, $1, %lo($CPI39_0)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5EB-NEXT:    copy_s.w $1, $w0[2]
-; MIPS32R5EB-NEXT:    lui $2, %hi($CPI39_1)
-; MIPS32R5EB-NEXT:    addiu $2, $2, %lo($CPI39_1)
-; MIPS32R5EB-NEXT:    ld.w $w0, 0($2)
-; MIPS32R5EB-NEXT:    copy_s.w $2, $w0[0]
-; MIPS32R5EB-NEXT:    copy_s.w $3, $w0[2]
-; MIPS32R5EB-NEXT:    sw $3, 32($sp)
-; MIPS32R5EB-NEXT:    sw $2, 24($sp)
+; MIPS32R5EB-NEXT:    lui $1, 16424
+; MIPS32R5EB-NEXT:    lui $2, 16428
+; MIPS32R5EB-NEXT:    sw $2, 32($sp)
+; MIPS32R5EB-NEXT:    sw $1, 24($sp)
+; MIPS32R5EB-NEXT:    lui $1, 49136
 ; MIPS32R5EB-NEXT:    sw $1, 16($sp)
 ; MIPS32R5EB-NEXT:    sw $zero, 36($sp)
 ; MIPS32R5EB-NEXT:    sw $zero, 28($sp)
@@ -5730,15 +5848,12 @@ define void @calldouble_2() {
 ; MIPS64R5-NEXT:    lui $1, %hi(%neg(%gp_rel(calldouble_2)))
 ; MIPS64R5-NEXT:    daddu $1, $1, $25
 ; MIPS64R5-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(calldouble_2)))
-; MIPS64R5-NEXT:    ld $1, %got_page(.LCPI39_0)($gp)
-; MIPS64R5-NEXT:    daddiu $1, $1, %got_ofst(.LCPI39_0)
-; MIPS64R5-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5-NEXT:    copy_s.d $5, $w0[1]
-; MIPS64R5-NEXT:    ld $1, %got_page(.LCPI39_1)($gp)
-; MIPS64R5-NEXT:    daddiu $1, $1, %got_ofst(.LCPI39_1)
-; MIPS64R5-NEXT:    ld.d $w0, 0($1)
-; MIPS64R5-NEXT:    copy_s.d $6, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $7, $w0[1]
+; MIPS64R5-NEXT:    daddiu $1, $zero, 3071
+; MIPS64R5-NEXT:    dsll $5, $1, 52
+; MIPS64R5-NEXT:    daddiu $1, $zero, 2053
+; MIPS64R5-NEXT:    dsll $6, $1, 51
+; MIPS64R5-NEXT:    daddiu $1, $zero, 4107
+; MIPS64R5-NEXT:    dsll $7, $1, 50
 ; MIPS64R5-NEXT:    ld $25, %call16(double2_extern)($gp)
 ; MIPS64R5-NEXT:    daddiu $4, $zero, 0
 ; MIPS64R5-NEXT:    jalr $25
@@ -5804,17 +5919,11 @@ define void @calldouble_2() {
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_register 30
 ; MIPS32R5EL-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5EL-NEXT:    and $sp, $sp, $1
-; MIPS32R5EL-NEXT:    lui $1, %hi($CPI39_0)
-; MIPS32R5EL-NEXT:    addiu $1, $1, %lo($CPI39_0)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($1)
-; MIPS32R5EL-NEXT:    copy_s.w $1, $w0[3]
-; MIPS32R5EL-NEXT:    lui $2, %hi($CPI39_1)
-; MIPS32R5EL-NEXT:    addiu $2, $2, %lo($CPI39_1)
-; MIPS32R5EL-NEXT:    ld.w $w0, 0($2)
-; MIPS32R5EL-NEXT:    copy_s.w $2, $w0[1]
-; MIPS32R5EL-NEXT:    copy_s.w $3, $w0[3]
-; MIPS32R5EL-NEXT:    sw $3, 36($sp)
-; MIPS32R5EL-NEXT:    sw $2, 28($sp)
+; MIPS32R5EL-NEXT:    lui $1, 16424
+; MIPS32R5EL-NEXT:    lui $2, 16428
+; MIPS32R5EL-NEXT:    sw $2, 36($sp)
+; MIPS32R5EL-NEXT:    sw $1, 28($sp)
+; MIPS32R5EL-NEXT:    lui $1, 49136
 ; MIPS32R5EL-NEXT:    sw $1, 20($sp)
 ; MIPS32R5EL-NEXT:    sw $zero, 32($sp)
 ; MIPS32R5EL-NEXT:    sw $zero, 24($sp)
diff --git a/llvm/test/CodeGen/X86/nontemporal-4.ll b/llvm/test/CodeGen/X86/nontemporal-4.ll
index c1eff891a9487..743d4cf0927da 100644
--- a/llvm/test/CodeGen/X86/nontemporal-4.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-4.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=CHECK,SSE,SSE4A
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4a | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512
@@ -24,221 +24,61 @@ define void @test_constant_v2f64_align1(ptr %dst) nounwind {
 }
 
 define void @test_constant_v4f32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v4f32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $4647714816524288000, %rax # imm = 0x4080000040400000
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $4611686019492741120, %rax # imm = 0x400000003F800000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v4f32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [2.0000004731118679E+0,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v4f32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $4647714816524288000, %rax # imm = 0x4080000040400000
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $4611686019492741120, %rax # imm = 0x400000003F800000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v4f32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $4647714816524288000, %rax # imm = 0x4080000040400000
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $4611686019492741120, %rax # imm = 0x400000003F800000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v4f32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $4647714816524288000, %rax # imm = 0x4080000040400000
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $4611686019492741120, %rax # imm = 0x400000003F800000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v4f32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $4647714816524288000, %rax # imm = 0x4080000040400000
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $4611686019492741120, %rax # imm = 0x400000003F800000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    retq
   store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v2i64_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v2i64_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movl $1, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v2i64_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [4.9406564584124654E-324,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    xorl %eax, %eax
-; SSE4A-NEXT:    movntiq %rax, (%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v2i64_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movl $1, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v2i64_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movl $1, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v2i64_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movl $1, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v2i64_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    retq
   store <2 x i64> <i64 0, i64 1>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v4i32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v4i32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $12884901890, %rax # imm = 0x300000002
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v4i32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [2.1219957909652723E-314,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v4i32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $12884901890, %rax # imm = 0x300000002
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v4i32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $12884901890, %rax # imm = 0x300000002
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v4i32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $12884901890, %rax # imm = 0x300000002
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v4i32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $12884901890, %rax # imm = 0x300000002
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    retq
   store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v8i16_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v8i16_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $1970350607106052, %rax # imm = 0x7000600050004
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $844433520132096, %rax # imm = 0x3000200010000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v8i16_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [4.1720559249406128E-309,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v8i16_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $1970350607106052, %rax # imm = 0x7000600050004
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $844433520132096, %rax # imm = 0x3000200010000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v8i16_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $1970350607106052, %rax # imm = 0x7000600050004
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $844433520132096, %rax # imm = 0x3000200010000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v8i16_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $1970350607106052, %rax # imm = 0x7000600050004
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $844433520132096, %rax # imm = 0x3000200010000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v8i16_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $1970350607106052, %rax # imm = 0x7000600050004
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $844433520132096, %rax # imm = 0x3000200010000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    retq
   store <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v16i8_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v16i8_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $1084818905618843912, %rax # imm = 0xF0E0D0C0B0A0908
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $506097522914230528, %rax # imm = 0x706050403020100
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v16i8_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [7.9499288951273625E-275,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v16i8_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $1084818905618843912, %rax # imm = 0xF0E0D0C0B0A0908
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $506097522914230528, %rax # imm = 0x706050403020100
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v16i8_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $1084818905618843912, %rax # imm = 0xF0E0D0C0B0A0908
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $506097522914230528, %rax # imm = 0x706050403020100
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v16i8_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $1084818905618843912, %rax # imm = 0xF0E0D0C0B0A0908
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $506097522914230528, %rax # imm = 0x706050403020100
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v16i8_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $1084818905618843912, %rax # imm = 0xF0E0D0C0B0A0908
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $506097522914230528, %rax # imm = 0x706050403020100
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    retq
   store <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
@@ -262,321 +102,81 @@ define void @test_constant_v4f64_align1(ptr %dst) nounwind {
 }
 
 define void @test_constant_v8f32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v8f32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v8f32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-5.1200036668777466E+2,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v8f32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v8f32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v8f32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v8f32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <8 x float> <float 0.0, float -0.0, float -1.0, float -2.0, float -3.0, float -4.0, float -5.0, float -6.0>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v4i64_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v4i64_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq $-1, %rax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movq $-3, %rax
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movq $-2, %rax
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v4i64_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    xorl %eax, %eax
-; SSE4A-NEXT:    movntiq %rax, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v4i64_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq $-1, %rax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movq $-3, %rax
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movq $-2, %rax
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v4i64_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq $-1, %rax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movq $-3, %rax
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movq $-2, %rax
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v4i64_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movq $-1, %rax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movq $-3, %rax
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movq $-2, %rax
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v4i64_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq $-1, %rax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movq $-3, %rax
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movq $-2, %rax
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    retq
   store <4 x i64> <i64 0, i64 -1, i64 -2, i64 -3>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v8i32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v8i32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v8i32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v8i32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v8i32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v8i32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v8i32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <8 x i32> <i32 0, i32 -1, i32 -2, i32 -3, i32 -4, i32 -5, i32 -6, i32 -7>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v16i16_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v16i16_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v16i16_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v16i16_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v16i16_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v16i16_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v16i16_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <16 x i16> <i16 0, i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 -8, i16 -9, i16 -10, i16 -11, i16 -12, i16 -13, i16 -14, i16 -15>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v32i8_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v32i8_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v32i8_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-3.826728214441238E+279,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-1.6485712323024388E+202,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v32i8_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v32i8_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v32i8_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v32i8_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <32 x i8> <i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15, i8 -16, i8 -17, i8 -18, i8 -19, i8 -20, i8 -21, i8 -22, i8 -23, i8 -24, i8 -25, i8 -26, i8 -27, i8 -28, i8 -29, i8 -30, i8 -31>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
@@ -779,521 +379,121 @@ define void @test_constant_v8f64_align1(ptr %dst) nounwind {
 }
 
 define void @test_constant_v16f32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v16f32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movabsq $-4530621221895667712, %rax # imm = 0xC1200000C1100000
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movabsq $-4539628421153554432, %rax # imm = 0xC1000000C0E00000
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movabsq $-4512606823381991424, %rax # imm = 0xC1600000C1500000
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movabsq $-4521614022638829568, %rax # imm = 0xC1400000C1300000
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v16f32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-5.1200036668777466E+2,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-1.3107209417724609E+5,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-2.0971535092773438E+6,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v16f32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movabsq $-4530621221895667712, %rax # imm = 0xC1200000C1100000
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movabsq $-4539628421153554432, %rax # imm = 0xC1000000C0E00000
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movabsq $-4512606823381991424, %rax # imm = 0xC1600000C1500000
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movabsq $-4521614022638829568, %rax # imm = 0xC1400000C1300000
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v16f32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movabsq $-4530621221895667712, %rax # imm = 0xC1200000C1100000
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movabsq $-4539628421153554432, %rax # imm = 0xC1000000C0E00000
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movabsq $-4512606823381991424, %rax # imm = 0xC1600000C1500000
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movabsq $-4521614022638829568, %rax # imm = 0xC1400000C1300000
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v16f32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movabsq $-4530621221895667712, %rax # imm = 0xC1200000C1100000
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movabsq $-4539628421153554432, %rax # imm = 0xC1000000C0E00000
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movabsq $-4512606823381991424, %rax # imm = 0xC1600000C1500000
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movabsq $-4521614022638829568, %rax # imm = 0xC1400000C1300000
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v16f32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $-4611686015214551040, %rax # imm = 0xC0000000BF800000
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movabsq $-4557642819667230720, %rax # imm = 0xC0C00000C0A00000
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movabsq $-4575657218183004160, %rax # imm = 0xC0800000C0400000
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movabsq $-4530621221895667712, %rax # imm = 0xC1200000C1100000
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movabsq $-4539628421153554432, %rax # imm = 0xC1000000C0E00000
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movabsq $-4512606823381991424, %rax # imm = 0xC1600000C1500000
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movabsq $-4521614022638829568, %rax # imm = 0xC1400000C1300000
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <16 x float> <float 0.0, float -0.0, float -1.0, float -2.0, float -3.0, float -4.0, float -5.0, float -6.0, float -7.0, float -8.0, float -9.0, float -10.0, float -11.0, float -12.0, float -13.0, float -14.0>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v8i64_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v8i64_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq $-1, %rax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movq $-3, %rax
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movq $-2, %rax
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movq $-5, %rax
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movq $-4, %rax
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movq $-7, %rax
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movq $-6, %rax
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v8i64_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    xorl %eax, %eax
-; SSE4A-NEXT:    movntiq %rax, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v8i64_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq $-1, %rax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movq $-3, %rax
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movq $-2, %rax
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movq $-5, %rax
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movq $-4, %rax
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movq $-7, %rax
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movq $-6, %rax
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v8i64_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq $-1, %rax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movq $-3, %rax
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movq $-2, %rax
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movq $-5, %rax
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movq $-4, %rax
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movq $-7, %rax
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movq $-6, %rax
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v8i64_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movq $-1, %rax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movq $-3, %rax
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movq $-2, %rax
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movq $-5, %rax
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movq $-4, %rax
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movq $-7, %rax
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movq $-6, %rax
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v8i64_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq $-1, %rax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movq $-3, %rax
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movq $-2, %rax
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movq $-5, %rax
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movq $-4, %rax
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movq $-7, %rax
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movq $-6, %rax
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    retq
   store <8 x i64> <i64 0, i64 -1, i64 -2, i64 -3, i64 -4, i64 -5, i64 -6, i64 -7>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v16i32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v16i32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movabsq $-42949672970, %rax # imm = 0xFFFFFFF5FFFFFFF6
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movabsq $-34359738376, %rax # imm = 0xFFFFFFF7FFFFFFF8
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movabsq $-60129542158, %rax # imm = 0xFFFFFFF1FFFFFFF2
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movabsq $-51539607564, %rax # imm = 0xFFFFFFF3FFFFFFF4
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v16i32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v16i32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movabsq $-42949672970, %rax # imm = 0xFFFFFFF5FFFFFFF6
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movabsq $-34359738376, %rax # imm = 0xFFFFFFF7FFFFFFF8
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movabsq $-60129542158, %rax # imm = 0xFFFFFFF1FFFFFFF2
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movabsq $-51539607564, %rax # imm = 0xFFFFFFF3FFFFFFF4
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v16i32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movabsq $-42949672970, %rax # imm = 0xFFFFFFF5FFFFFFF6
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movabsq $-34359738376, %rax # imm = 0xFFFFFFF7FFFFFFF8
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movabsq $-60129542158, %rax # imm = 0xFFFFFFF1FFFFFFF2
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movabsq $-51539607564, %rax # imm = 0xFFFFFFF3FFFFFFF4
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v16i32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movabsq $-42949672970, %rax # imm = 0xFFFFFFF5FFFFFFF6
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movabsq $-34359738376, %rax # imm = 0xFFFFFFF7FFFFFFF8
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movabsq $-60129542158, %rax # imm = 0xFFFFFFF1FFFFFFF2
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movabsq $-51539607564, %rax # imm = 0xFFFFFFF3FFFFFFF4
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v16i32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $-8589934594, %rax # imm = 0xFFFFFFFDFFFFFFFE
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movabsq $-25769803782, %rax # imm = 0xFFFFFFF9FFFFFFFA
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movabsq $-17179869188, %rax # imm = 0xFFFFFFFBFFFFFFFC
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movabsq $-42949672970, %rax # imm = 0xFFFFFFF5FFFFFFF6
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movabsq $-34359738376, %rax # imm = 0xFFFFFFF7FFFFFFF8
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movabsq $-60129542158, %rax # imm = 0xFFFFFFF1FFFFFFF2
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movabsq $-51539607564, %rax # imm = 0xFFFFFFF3FFFFFFF4
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <16 x i32> <i32 0, i32 -1, i32 -2, i32 -3, i32 -4, i32 -5, i32 -6, i32 -7, i32 -8, i32 -9, i32 -10, i32 -11, i32 -12, i32 -13, i32 -14, i32 -15>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v32i16_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v32i16_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movabsq $-6192539683258388, %rax # imm = 0xFFE9FFEAFFEBFFEC
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movabsq $-5066622596284432, %rax # imm = 0xFFEDFFEEFFEFFFF0
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movabsq $-8444373857206300, %rax # imm = 0xFFE1FFE2FFE3FFE4
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movabsq $-7318456770232344, %rax # imm = 0xFFE5FFE6FFE7FFE8
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v32i16_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-1.6853227412070812E+308,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-1.2358925997317751E+308,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v32i16_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movabsq $-6192539683258388, %rax # imm = 0xFFE9FFEAFFEBFFEC
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movabsq $-5066622596284432, %rax # imm = 0xFFEDFFEEFFEFFFF0
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movabsq $-8444373857206300, %rax # imm = 0xFFE1FFE2FFE3FFE4
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movabsq $-7318456770232344, %rax # imm = 0xFFE5FFE6FFE7FFE8
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v32i16_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movabsq $-6192539683258388, %rax # imm = 0xFFE9FFEAFFEBFFEC
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movabsq $-5066622596284432, %rax # imm = 0xFFEDFFEEFFEFFFF0
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movabsq $-8444373857206300, %rax # imm = 0xFFE1FFE2FFE3FFE4
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movabsq $-7318456770232344, %rax # imm = 0xFFE5FFE6FFE7FFE8
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v32i16_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movabsq $-6192539683258388, %rax # imm = 0xFFE9FFEAFFEBFFEC
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movabsq $-5066622596284432, %rax # imm = 0xFFEDFFEEFFEFFFF0
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movabsq $-8444373857206300, %rax # imm = 0xFFE1FFE2FFE3FFE4
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movabsq $-7318456770232344, %rax # imm = 0xFFE5FFE6FFE7FFE8
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v32i16_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $-1688871335362564, %rax # imm = 0xFFF9FFFAFFFBFFFC
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $-562954248454144, %rax # imm = 0xFFFDFFFEFFFF0000
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movabsq $-3940705509310476, %rax # imm = 0xFFF1FFF2FFF3FFF4
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movabsq $-2814788422336520, %rax # imm = 0xFFF5FFF6FFF7FFF8
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movabsq $-6192539683258388, %rax # imm = 0xFFE9FFEAFFEBFFEC
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movabsq $-5066622596284432, %rax # imm = 0xFFEDFFEEFFEFFFF0
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movabsq $-8444373857206300, %rax # imm = 0xFFE1FFE2FFE3FFE4
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movabsq $-7318456770232344, %rax # imm = 0xFFE5FFE6FFE7FFE8
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <32 x i16> <i16 0, i16 -1, i16 -2, i16 -3, i16 -4, i16 -5, i16 -6, i16 -7, i16 -8, i16 -9, i16 -10, i16 -11, i16 -12, i16 -13, i16 -14, i16 -15, i16 -16, i16 -17, i16 -18, i16 -19, i16 -20, i16 -21, i16 -22, i16 -23, i16 -24, i16 -25, i16 -26, i16 -27, i16 -28, i16 -29, i16 -30, i16 -31>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_constant_v64i8_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_constant_v64i8_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movabsq $-3327364263599220776, %rax # imm = 0xD1D2D3D4D5D6D7D8
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movabsq $-2748642880894607392, %rax # imm = 0xD9DADBDCDDDEDFE0
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movabsq $-4484807029008447544, %rax # imm = 0xC1C2C3C4C5C6C7C8
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movabsq $-3906085646303834160, %rax # imm = 0xC9CACBCCCDCECFD0
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_constant_v64i8_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-3.826728214441238E+279,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-1.6485712323024388E+202,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-7.1020783099933495E+124,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movsd {{.*#+}} xmm0 = [-3.0595730451167367E+47,0.0E+0]
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_constant_v64i8_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movabsq $-3327364263599220776, %rax # imm = 0xD1D2D3D4D5D6D7D8
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movabsq $-2748642880894607392, %rax # imm = 0xD9DADBDCDDDEDFE0
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movabsq $-4484807029008447544, %rax # imm = 0xC1C2C3C4C5C6C7C8
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movabsq $-3906085646303834160, %rax # imm = 0xC9CACBCCCDCECFD0
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_constant_v64i8_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movabsq $-3327364263599220776, %rax # imm = 0xD1D2D3D4D5D6D7D8
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movabsq $-2748642880894607392, %rax # imm = 0xD9DADBDCDDDEDFE0
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movabsq $-4484807029008447544, %rax # imm = 0xC1C2C3C4C5C6C7C8
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movabsq $-3906085646303834160, %rax # imm = 0xC9CACBCCCDCECFD0
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_constant_v64i8_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movabsq $-3327364263599220776, %rax # imm = 0xD1D2D3D4D5D6D7D8
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movabsq $-2748642880894607392, %rax # imm = 0xD9DADBDCDDDEDFE0
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movabsq $-4484807029008447544, %rax # imm = 0xC1C2C3C4C5C6C7C8
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movabsq $-3906085646303834160, %rax # imm = 0xC9CACBCCCDCECFD0
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_constant_v64i8_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movabsq $-1012478732780767240, %rax # imm = 0xF1F2F3F4F5F6F7F8
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movabsq $-433757350076154112, %rax # imm = 0xF9FAFBFCFDFEFF00
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movabsq $-2169921498189994008, %rax # imm = 0xE1E2E3E4E5E6E7E8
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movabsq $-1591200115485380624, %rax # imm = 0xE9EAEBECEDEEEFF0
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movabsq $-3327364263599220776, %rax # imm = 0xD1D2D3D4D5D6D7D8
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movabsq $-2748642880894607392, %rax # imm = 0xD9DADBDCDDDEDFE0
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movabsq $-4484807029008447544, %rax # imm = 0xC1C2C3C4C5C6C7C8
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movabsq $-3906085646303834160, %rax # imm = 0xC9CACBCCCDCECFD0
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <64 x i8> <i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15, i8 -16, i8 -17, i8 -18, i8 -19, i8 -20, i8 -21, i8 -22, i8 -23, i8 -24, i8 -25, i8 -26, i8 -27, i8 -28, i8 -29, i8 -30, i8 -31, i8 -32, i8 -33, i8 -34, i8 -35, i8 -36, i8 -37, i8 -38, i8 -39, i8 -40, i8 -41, i8 -42, i8 -43, i8 -44, i8 -45, i8 -46, i8 -47, i8 -48, i8 -49, i8 -50, i8 -51, i8 -52, i8 -53, i8 -54, i8 -55, i8 -56, i8 -57, i8 -58, i8 -59, i8 -60, i8 -61, i8 -62, i8 -63>, ptr %dst, align 1, !nontemporal !1
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index b5adfb3733357..0cfe3f60595a1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3169,48 +3169,18 @@ entry:
 declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
 
 define void @PR43024() {
-; SSE2-LABEL: PR43024:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSE2-NEXT:    movaps %xmm0, (%rax)
-; SSE2-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm0
-; SSE2-NEXT:    addss %xmm1, %xmm0
-; SSE2-NEXT:    movss %xmm0, (%rax)
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: PR43024:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSSE3-NEXT:    movaps %xmm0, (%rax)
-; SSSE3-NEXT:    addss %xmm0, %xmm0
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    addss %xmm1, %xmm0
-; SSSE3-NEXT:    addss %xmm1, %xmm0
-; SSSE3-NEXT:    movss %xmm0, (%rax)
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: PR43024:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSE41-NEXT:    movaps %xmm0, (%rax)
-; SSE41-NEXT:    addss %xmm0, %xmm0
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    addss %xmm1, %xmm0
-; SSE41-NEXT:    addss %xmm1, %xmm0
-; SSE41-NEXT:    movss %xmm0, (%rax)
-; SSE41-NEXT:    retq
+; SSE-LABEL: PR43024:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE-NEXT:    movaps %xmm0, (%rax)
+; SSE-NEXT:    movl $2143289344, (%rax) # imm = 0x7FC00000
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: PR43024:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
 ; AVX-NEXT:    vmovaps %xmm0, (%rax)
-; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovss %xmm0, (%rax)
+; AVX-NEXT:    movl $2143289344, (%rax) # imm = 0x7FC00000
 ; AVX-NEXT:    retq
   store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, ptr undef, align 16
   %1 = load <4 x float>, ptr undef, align 16
