[Mlir-commits] [clang] [llvm] [mlir] Remove the `x86_mmx` IR type. (PR #98505)
James Y Knight
llvmlistbot at llvm.org
Thu Jul 11 10:11:06 PDT 2024
https://github.com/jyknight updated https://github.com/llvm/llvm-project/pull/98505
>From bf94321b3247ab5eb2d898e23530a1bebb307e12 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Wed, 10 Jul 2024 00:49:25 -0400
Subject: [PATCH 1/4] Clang: don't unnecessarily convert inline-asm operands to
x86mmx in IR.
The SelectionDAG asm-lowering code can already handle conversion of other vector types to MMX if needed.
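As an illustration (a minimal sketch, not part of the patch; the function name is made up), the "y" constraint can now be applied directly to an ordinary 64-bit vector operand, mirroring the new test added below — SelectionDAG moves the value into an %mm register around the asm block:

define <8 x i8> @asm_y_sketch(<8 x i8> %a) nounwind {
  ; "y"-constrained operand as a plain 64-bit vector; Clang no longer
  ; bitcasts it to x86_mmx, the asm lowering handles the conversion.
  %r = tail call <8 x i8> asm sideeffect "# $0 = $1", "=y,y"(<8 x i8> %a)
  ret <8 x i8> %r
}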
---
clang/lib/CodeGen/Targets/X86.cpp | 13 -------------
clang/test/CodeGen/X86/mmx-inline-asm.c | 2 +-
clang/test/CodeGen/asm-inout.c | 6 +++---
llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++---
llvm/test/CodeGen/X86/mmx-inlineasm.ll | 20 ++++++++++++++++++++
5 files changed, 27 insertions(+), 20 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/mmx-inlineasm.ll
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 1dc3172a6bdf9..8913b188f6aec 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -27,19 +27,6 @@ bool IsX86_MMXType(llvm::Type *IRType) {
static llvm::Type* X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
StringRef Constraint,
llvm::Type* Ty) {
- bool IsMMXCons = llvm::StringSwitch<bool>(Constraint)
- .Cases("y", "&y", "^Ym", true)
- .Default(false);
- if (IsMMXCons && Ty->isVectorTy()) {
- if (cast<llvm::VectorType>(Ty)->getPrimitiveSizeInBits().getFixedValue() !=
- 64) {
- // Invalid MMX constraint
- return nullptr;
- }
-
- return llvm::Type::getX86_MMXTy(CGF.getLLVMContext());
- }
-
if (Constraint == "k") {
llvm::Type *Int1Ty = llvm::Type::getInt1Ty(CGF.getLLVMContext());
return llvm::FixedVectorType::get(Int1Ty, Ty->getScalarSizeInBits());
diff --git a/clang/test/CodeGen/X86/mmx-inline-asm.c b/clang/test/CodeGen/X86/mmx-inline-asm.c
index 19c24a3a91e14..a0702c7f780d1 100644
--- a/clang/test/CodeGen/X86/mmx-inline-asm.c
+++ b/clang/test/CodeGen/X86/mmx-inline-asm.c
@@ -1,7 +1,7 @@
// RUN: %clang_cc1 -emit-llvm -triple i386 -target-feature +mmx %s -o - | FileCheck %s
#include <mmintrin.h>
-// CHECK: { x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx, x86_mmx }
+// CHECK: { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }
void foo(long long fill) {
__m64 vfill = _mm_cvtsi64_m64(fill);
diff --git a/clang/test/CodeGen/asm-inout.c b/clang/test/CodeGen/asm-inout.c
index 1383a421efbc2..6d40451b778d9 100644
--- a/clang/test/CodeGen/asm-inout.c
+++ b/clang/test/CodeGen/asm-inout.c
@@ -38,11 +38,11 @@ int test4(volatile int *addr) {
return (int)oldval;
}
-// This should have both inputs be of type x86_mmx.
+// This should have both inputs be of type <1 x i64>.
// CHECK: @test5
typedef long long __m64 __attribute__((__vector_size__(8)));
__m64 test5(__m64 __A, __m64 __B) {
- // CHECK: call x86_mmx asm "pmulhuw $1, $0\0A\09", "=y,y,0,~{dirflag},~{fpsr},~{flags}"(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+ // CHECK: call <1 x i64> asm "pmulhuw $1, $0\0A\09", "=y,y,0,~{dirflag},~{fpsr},~{flags}"(<1 x i64> %{{.*}}, <1 x i64> %{{.*}})
asm ("pmulhuw %1, %0\n\t" : "+y" (__A) : "y" (__B));
return __A;
}
@@ -51,7 +51,7 @@ __m64 test5(__m64 __A, __m64 __B) {
int test6(void) {
typedef unsigned char __attribute__((vector_size(8))) _m64u8;
_m64u8 __attribute__((aligned(16))) Mu8_0, __attribute__((aligned(16))) Mu8_1;
- // CHECK: call x86_mmx asm "nop", "=y,0,~{dirflag},~{fpsr},~{flags}"(x86_mmx %1)
+ // CHECK: call <8 x i8> asm "nop", "=y,0,~{dirflag},~{fpsr},~{flags}"(<8 x i8> %0)
asm ("nop" : "=y"(Mu8_1 ) : "0"(Mu8_0 ));
return 0;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e116285d043c0..fa26849e0bc5a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58236,7 +58236,7 @@ X86TargetLowering::getSingleConstraintMatchWeight(
Wt = CW_SpecificReg;
break;
case 'y':
- if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
+ if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
Wt = CW_SpecificReg;
break;
case 'Y':
@@ -58259,8 +58259,8 @@ X86TargetLowering::getSingleConstraintMatchWeight(
return CW_Invalid;
// Any MMX reg
case 'm':
- if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
- return Wt;
+ if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
+ return CW_SpecificReg;
return CW_Invalid;
// Any SSE reg when ISA >= SSE2, same as 'x'
case 'i':
diff --git a/llvm/test/CodeGen/X86/mmx-inlineasm.ll b/llvm/test/CodeGen/X86/mmx-inlineasm.ll
new file mode 100644
index 0000000000000..5a15600a4b312
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mmx-inlineasm.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx | FileCheck %s
+
+;; Verify that the mmx 'y' constraint works with arbitrary IR types.
+define <2 x i32> @test_mmx_asm(<2 x i32> %a) nounwind {
+; CHECK-LABEL: test_mmx_asm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movdq2q %xmm0, %mm0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: # %mm0 = %mm0
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: #APP
+; CHECK-NEXT: # %mm0 = %mm0
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = tail call i64 asm sideeffect "# $0 = $1", "=y,y"(<2 x i32> %a)
+ %2 = tail call <2 x i32> asm sideeffect "# $0 = $1", "=y,y"(i64 %1)
+ ret <2 x i32> %2
+}
>From da88aa73ad9a34dcc6745279a00edd4ee0c59501 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Wed, 10 Jul 2024 19:01:55 -0400
Subject: [PATCH 2/4] Remove the `x86_mmx` IR type.
It is now translated to `<1 x i64>`, which allows the removal of a bunch of special casing.
This changes the ABI of any LLVM IR function with `x86_mmx` arguments or returns: instead of being passed in MMX registers, such values are now passed in integer registers. However, the real-world incompatibility this introduces is minimal, since Clang never uses the x86_mmx type -- it lowers `__m64` to either `<1 x i64>` or `double`, depending on the ABI.
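For example (a hypothetical function, not taken from the patch), IR that was previously written with x86_mmx now round-trips as a plain vector, and on x86-64 the argument arrives in an integer register rather than %mm0:

; Old syntax:  define x86_mmx @id_mmx(x86_mmx %v) { ret x86_mmx %v }
; After this change the same function is written (and bitcode is decoded) as:
define <1 x i64> @id_mmx(<1 x i64> %v) {
  ; %v is now passed and returned via the integer-register ABI (e.g. %rdi/%rax).
  ret <1 x i64> %v
}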
This change does _not_ eliminate the SelectionDAG `MVT::x86mmx` type. That type simply no longer corresponds to an IR type, and is used only by MMX intrinsics and inline-asm operands.
In order to correctly handle the MMX intrinsics, a hack has been added to `SelectionDAGBuilder::visitTargetIntrinsic`, because there's no generic way to specify a custom translation from LLVM IR type to SelectionDAG type for an intrinsic lowering. (This may be a short-lived hack, if all the MMX intrinsics can be removed in upcoming changes.)
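Concretely, MMX intrinsic calls are now written with <1 x i64> operands, and the hack above bitcasts those operands to MVT::x86mmx (and the x86mmx result back to v1i64) while building the DAG. A small sketch of such a call — the function name is invented, but llvm.x86.mmx.padd.b is one of the intrinsics affected:

declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)

define <1 x i64> @padd_sketch(<1 x i64> %a, <1 x i64> %b) nounwind {
  ; visitTargetIntrinsic bitcasts the v1i64 operands to x86mmx for the
  ; intrinsic's DAG node and bitcasts the x86mmx result back to v1i64.
  %r = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a, <1 x i64> %b)
  ret <1 x i64> %r
}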
---
clang/lib/CodeGen/CGBuiltin.cpp | 4 +-
llvm/bindings/ocaml/llvm/llvm.mli | 4 -
llvm/bindings/ocaml/llvm/llvm_ocaml.c | 5 -
llvm/include/llvm-c/Core.h | 48 +-
llvm/include/llvm/IR/DataLayout.h | 1 -
llvm/include/llvm/IR/Type.h | 12 +-
llvm/lib/Analysis/ConstantFolding.cpp | 8 +-
llvm/lib/AsmParser/LLLexer.cpp | 3 +-
llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 4 +-
llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 5 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 17 +
llvm/lib/CodeGen/ValueTypes.cpp | 7 +-
llvm/lib/IR/AsmWriter.cpp | 5 +-
llvm/lib/IR/ConstantFold.cpp | 2 +-
llvm/lib/IR/Core.cpp | 8 -
llvm/lib/IR/DataLayout.cpp | 1 -
llvm/lib/IR/Function.cpp | 14 +-
llvm/lib/IR/Instructions.cpp | 9 -
llvm/lib/IR/LLVMContextImpl.cpp | 6 +-
llvm/lib/IR/LLVMContextImpl.h | 2 +-
llvm/lib/IR/Type.cpp | 15 +-
.../DirectX/DXILWriter/DXILBitcodeWriter.cpp | 3 -
.../Hexagon/HexagonTargetObjectFile.cpp | 1 -
llvm/lib/Target/X86/X86CallingConv.td | 34 -
.../Target/X86/X86InstCombineIntrinsic.cpp | 8 +-
.../IPO/DeadArgumentElimination.cpp | 6 +-
.../InstCombine/InstCombineCasts.cpp | 7 -
.../Instrumentation/MemorySanitizer.cpp | 42 +-
llvm/test/Assembler/x86mmx.ll | 9 -
llvm/test/Bitcode/bcanalyzer-types.ll | 6 -
llvm/test/Bitcode/compatibility-3.6.ll | 2 +-
llvm/test/Bitcode/compatibility-3.7.ll | 2 +-
llvm/test/Bitcode/compatibility-3.8.ll | 2 +-
llvm/test/Bitcode/compatibility-3.9.ll | 2 +-
llvm/test/Bitcode/compatibility-4.0.ll | 2 +-
llvm/test/Bitcode/compatibility-5.0.ll | 2 +-
llvm/test/Bitcode/compatibility-6.0.ll | 2 +-
llvm/test/Bitcode/compatibility.ll | 2 +-
.../CodeGen/X86/2008-09-05-sinttofp-2xi32.ll | 21 +-
llvm/test/CodeGen/X86/3dnow-intrinsics.ll | 68 +-
llvm/test/CodeGen/X86/avx-vbroadcast.ll | 8 +-
llvm/test/CodeGen/X86/avx2-vbroadcast.ll | 11 +-
llvm/test/CodeGen/X86/fast-isel-bc.ll | 9 +-
.../test/CodeGen/X86/fast-isel-nontemporal.ll | 3 +-
.../CodeGen/X86/mmx-arg-passing-x86-64.ll | 15 +-
llvm/test/CodeGen/X86/mmx-arg-passing.ll | 10 +-
llvm/test/CodeGen/X86/mmx-bitcast-fold.ll | 2 +-
llvm/test/CodeGen/X86/mmx-bitcast.ll | 4 +-
llvm/test/CodeGen/X86/mmx-fold-load.ll | 107 +-
llvm/test/CodeGen/X86/mmx-intrinsics.ll | 398 ++--
llvm/test/CodeGen/X86/pr23246.ll | 2 +-
llvm/test/CodeGen/X86/select-mmx.ll | 35 +-
llvm/test/CodeGen/X86/stack-folding-3dnow.ll | 140 +-
llvm/test/CodeGen/X86/stack-folding-mmx.ll | 508 ++---
llvm/test/CodeGen/X86/vec_extract-mmx.ll | 11 +-
llvm/test/CodeGen/X86/vec_insert-7.ll | 3 +-
llvm/test/CodeGen/X86/vec_insert-mmx.ll | 8 +-
.../MemorySanitizer/X86/mmx-intrinsics.ll | 1667 +++++++++--------
.../MemorySanitizer/vector_arith.ll | 12 +-
.../MemorySanitizer/vector_cvt.ll | 6 +-
.../MemorySanitizer/vector_pack.ll | 15 +-
.../MemorySanitizer/vector_shift.ll | 10 +-
.../Transforms/InstCombine/X86/x86-movmsk.ll | 14 +-
.../bitcast-vec-canon-inseltpoison.ll | 45 -
.../InstCombine/bitcast-vec-canon.ll | 44 -
.../InstSimplify/ConstProp/loads.ll | 13 -
llvm/test/Transforms/LoopUnroll/X86/mmx.ll | 35 -
.../Transforms/SLPVectorizer/X86/bad_types.ll | 62 -
llvm/test/Transforms/SROA/pr57796.ll | 6 +-
llvm/tools/llvm-c-test/echo.cpp | 2 -
llvm/tools/llvm-stress/llvm-stress.cpp | 8 +-
llvm/unittests/IR/InstructionsTest.cpp | 9 +-
mlir/docs/Dialects/LLVM.md | 2 -
mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h | 1 -
mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 1 -
mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp | 2 -
mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp | 5 +-
mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp | 2 -
mlir/lib/Target/LLVMIR/TypeToLLVM.cpp | 3 -
mlir/test/Dialect/LLVMIR/types.mlir | 2 -
mlir/test/Target/LLVMIR/llvmir-types.mlir | 2 -
81 files changed, 1777 insertions(+), 1866 deletions(-)
delete mode 100644 llvm/test/Assembler/x86mmx.ll
delete mode 100644 llvm/test/Transforms/LoopUnroll/X86/mmx.ll
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 6cc0d9485720c..36853098b118d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14386,7 +14386,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_vec_init_v4hi:
case X86::BI__builtin_ia32_vec_init_v2si:
return Builder.CreateBitCast(BuildVector(Ops),
- llvm::Type::getX86_MMXTy(getLLVMContext()));
+ llvm::FixedVectorType::get(Int64Ty, 1));
case X86::BI__builtin_ia32_vec_ext_v2si:
case X86::BI__builtin_ia32_vec_ext_v16qi:
case X86::BI__builtin_ia32_vec_ext_v8hi:
@@ -15971,7 +15971,7 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
// 3DNow!
case X86::BI__builtin_ia32_pswapdsf:
case X86::BI__builtin_ia32_pswapdsi: {
- llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
+ llvm::Type *MMXTy = llvm::FixedVectorType::get(Int64Ty, 1);
Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
return Builder.CreateCall(F, Ops, "pswapd");
diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli
index c16530d3a70cb..b8a430adf6cf2 100644
--- a/llvm/bindings/ocaml/llvm/llvm.mli
+++ b/llvm/bindings/ocaml/llvm/llvm.mli
@@ -760,10 +760,6 @@ val void_type : llcontext -> lltype
[llvm::Type::LabelTy]. *)
val label_type : llcontext -> lltype
-(** [x86_mmx_type c] returns the x86 64-bit MMX register type in the
- context [c]. See [llvm::Type::X86_MMXTy]. *)
-val x86_mmx_type : llcontext -> lltype
-
(** [type_by_name m name] returns the specified type from the current module
if it exists.
See the method [llvm::Module::getTypeByName] *)
diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
index 4ac824cd6a98a..5906f427e6907 100644
--- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
@@ -686,11 +686,6 @@ value llvm_label_type(value Context) {
return to_val(LLVMLabelTypeInContext(Context_val(Context)));
}
-/* llcontext -> lltype */
-value llvm_x86_mmx_type(value Context) {
- return to_val(LLVMX86MMXTypeInContext(Context_val(Context)));
-}
-
/* llmodule -> string -> lltype option */
value llvm_type_by_name(value M, value Name) {
return ptr_to_option(LLVMGetTypeByName(Module_val(M), String_val(Name)));
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 9867db4839fe1..1b18f31e3925c 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -146,27 +146,27 @@ typedef enum {
} LLVMOpcode;
typedef enum {
- LLVMVoidTypeKind, /**< type with no size */
- LLVMHalfTypeKind, /**< 16 bit floating point type */
- LLVMFloatTypeKind, /**< 32 bit floating point type */
- LLVMDoubleTypeKind, /**< 64 bit floating point type */
- LLVMX86_FP80TypeKind, /**< 80 bit floating point type (X87) */
- LLVMFP128TypeKind, /**< 128 bit floating point type (112-bit mantissa)*/
- LLVMPPC_FP128TypeKind, /**< 128 bit floating point type (two 64-bits) */
- LLVMLabelTypeKind, /**< Labels */
- LLVMIntegerTypeKind, /**< Arbitrary bit width integers */
- LLVMFunctionTypeKind, /**< Functions */
- LLVMStructTypeKind, /**< Structures */
- LLVMArrayTypeKind, /**< Arrays */
- LLVMPointerTypeKind, /**< Pointers */
- LLVMVectorTypeKind, /**< Fixed width SIMD vector type */
- LLVMMetadataTypeKind, /**< Metadata */
- LLVMX86_MMXTypeKind, /**< X86 MMX */
- LLVMTokenTypeKind, /**< Tokens */
- LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */
- LLVMBFloatTypeKind, /**< 16 bit brain floating point type */
- LLVMX86_AMXTypeKind, /**< X86 AMX */
- LLVMTargetExtTypeKind, /**< Target extension type */
+ LLVMVoidTypeKind = 0, /**< type with no size */
+ LLVMHalfTypeKind = 1, /**< 16 bit floating point type */
+ LLVMFloatTypeKind = 2, /**< 32 bit floating point type */
+ LLVMDoubleTypeKind = 3, /**< 64 bit floating point type */
+ LLVMX86_FP80TypeKind = 4, /**< 80 bit floating point type (X87) */
+ LLVMFP128TypeKind = 5, /**< 128 bit floating point type (112-bit mantissa)*/
+ LLVMPPC_FP128TypeKind = 6, /**< 128 bit floating point type (two 64-bits) */
+ LLVMLabelTypeKind = 7, /**< Labels */
+ LLVMIntegerTypeKind = 8, /**< Arbitrary bit width integers */
+ LLVMFunctionTypeKind = 9, /**< Functions */
+ LLVMStructTypeKind = 10, /**< Structures */
+ LLVMArrayTypeKind = 11, /**< Arrays */
+ LLVMPointerTypeKind = 12, /**< Pointers */
+ LLVMVectorTypeKind = 13, /**< Fixed width SIMD vector type */
+ LLVMMetadataTypeKind = 14, /**< Metadata */
+ /* 15 previously used by LLVMX86_MMXTypeKind */
+ LLVMTokenTypeKind = 16, /**< Tokens */
+ LLVMScalableVectorTypeKind = 17, /**< Scalable SIMD vector type */
+ LLVMBFloatTypeKind = 18, /**< 16 bit brain floating point type */
+ LLVMX86_AMXTypeKind = 19, /**< X86 AMX */
+ LLVMTargetExtTypeKind = 20, /**< Target extension type */
} LLVMTypeKind;
typedef enum {
@@ -1672,11 +1672,6 @@ LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C);
*/
LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C);
-/**
- * Create a X86 MMX type in a context.
- */
-LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C);
-
/**
* Create a X86 AMX type in a context.
*/
@@ -1698,7 +1693,6 @@ LLVMTypeRef LLVMMetadataTypeInContext(LLVMContextRef C);
*/
LLVMTypeRef LLVMVoidType(void);
LLVMTypeRef LLVMLabelType(void);
-LLVMTypeRef LLVMX86MMXType(void);
LLVMTypeRef LLVMX86AMXType(void);
/**
diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index d14adfe1590be..5f7034b5ee36f 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -693,7 +693,6 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const {
case Type::FloatTyID:
return TypeSize::getFixed(32);
case Type::DoubleTyID:
- case Type::X86_MMXTyID:
return TypeSize::getFixed(64);
case Type::PPC_FP128TyID:
case Type::FP128TyID:
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
index 1f0133c08e7d6..c74f9e9d24800 100644
--- a/llvm/include/llvm/IR/Type.h
+++ b/llvm/include/llvm/IR/Type.h
@@ -63,7 +63,6 @@ class Type {
VoidTyID, ///< type with no size
LabelTyID, ///< Labels
MetadataTyID, ///< Metadata
- X86_MMXTyID, ///< MMX vectors (64 bits, X86 specific)
X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific)
TokenTyID, ///< Tokens
@@ -197,9 +196,6 @@ class Type {
const fltSemantics &getFltSemantics() const;
- /// Return true if this is X86 MMX.
- bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; }
-
/// Return true if this is X86 AMX.
bool isX86_AMXTy() const { return getTypeID() == X86_AMXTyID; }
@@ -285,8 +281,8 @@ class Type {
/// Return true if the type is a valid type for a register in codegen. This
/// includes all first-class types except struct and array types.
bool isSingleValueType() const {
- return isFloatingPointTy() || isX86_MMXTy() || isIntegerTy() ||
- isPointerTy() || isVectorTy() || isX86_AMXTy() || isTargetExtTy();
+ return isFloatingPointTy() || isIntegerTy() || isPointerTy() ||
+ isVectorTy() || isX86_AMXTy() || isTargetExtTy();
}
/// Return true if the type is an aggregate type. This means it is valid as
@@ -302,8 +298,7 @@ class Type {
bool isSized(SmallPtrSetImpl<Type*> *Visited = nullptr) const {
// If it's a primitive, it is always sized.
if (getTypeID() == IntegerTyID || isFloatingPointTy() ||
- getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID ||
- getTypeID() == X86_AMXTyID)
+ getTypeID() == PointerTyID || getTypeID() == X86_AMXTyID)
return true;
// If it is not something that can have a size (e.g. a function or label),
// it doesn't have a size.
@@ -453,7 +448,6 @@ class Type {
static Type *getX86_FP80Ty(LLVMContext &C);
static Type *getFP128Ty(LLVMContext &C);
static Type *getPPC_FP128Ty(LLVMContext &C);
- static Type *getX86_MMXTy(LLVMContext &C);
static Type *getX86_AMXTy(LLVMContext &C);
static Type *getTokenTy(LLVMContext &C);
static IntegerType *getIntNTy(LLVMContext &C, unsigned N);
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 962880f68f076..0dbe85631df04 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -564,16 +564,14 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy,
Type *MapTy = Type::getIntNTy(C->getContext(),
DL.getTypeSizeInBits(LoadTy).getFixedValue());
if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) {
- if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
- !LoadTy->isX86_AMXTy())
+ if (Res->isNullValue() && !LoadTy->isX86_AMXTy())
// Materializing a zero can be done trivially without a bitcast
return Constant::getNullValue(LoadTy);
Type *CastTy = LoadTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(LoadTy) : LoadTy;
Res = FoldBitCast(Res, CastTy, DL);
if (LoadTy->isPtrOrPtrVectorTy()) {
// For vector of pointer, we needed to first convert to a vector of integer, then do vector inttoptr
- if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
- !LoadTy->isX86_AMXTy())
+ if (Res->isNullValue() && !LoadTy->isX86_AMXTy())
return Constant::getNullValue(LoadTy);
if (DL.isNonIntegralPointerType(LoadTy->getScalarType()))
// Be careful not to replace a load of an addrspace value with an inttoptr here
@@ -764,7 +762,7 @@ Constant *llvm::ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty,
// uniform.
if (!DL.typeSizeEqualsStoreSize(C->getType()))
return nullptr;
- if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy())
+ if (C->isNullValue() && !Ty->isX86_AMXTy())
return Constant::getNullValue(Ty);
if (C->isAllOnesValue() &&
(Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy()))
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 7d7fe19568e8a..c82e74972b67c 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -838,7 +838,8 @@ lltok::Kind LLLexer::LexIdentifier() {
TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context));
TYPEKEYWORD("label", Type::getLabelTy(Context));
TYPEKEYWORD("metadata", Type::getMetadataTy(Context));
- TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context));
+ TYPEKEYWORD("x86_mmx", llvm::FixedVectorType::get(
+ llvm::IntegerType::get(Context, 64), 1));
TYPEKEYWORD("x86_amx", Type::getX86_AMXTy(Context));
TYPEKEYWORD("token", Type::getTokenTy(Context));
TYPEKEYWORD("ptr", PointerType::getUnqual(Context));
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index f56b2b32ff98f..7c9bc66a237d5 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2496,7 +2496,9 @@ Error BitcodeReader::parseTypeTableBody() {
ResultTy = Type::getMetadataTy(Context);
break;
case bitc::TYPE_CODE_X86_MMX: // X86_MMX
- ResultTy = Type::getX86_MMXTy(Context);
+ // Deprecated: decodes as <1 x i64>
+ ResultTy =
+ llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
break;
case bitc::TYPE_CODE_X86_AMX: // X86_AMX
ResultTy = Type::getX86_AMXTy(Context);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 3378931065f9b..216a0cc8e94e3 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1086,8 +1086,9 @@ void ModuleBitcodeWriter::writeTypeTable() {
case Type::FP128TyID: Code = bitc::TYPE_CODE_FP128; break;
case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break;
case Type::LabelTyID: Code = bitc::TYPE_CODE_LABEL; break;
- case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break;
- case Type::X86_MMXTyID: Code = bitc::TYPE_CODE_X86_MMX; break;
+ case Type::MetadataTyID:
+ Code = bitc::TYPE_CODE_METADATA;
+ break;
case Type::X86_AMXTyID: Code = bitc::TYPE_CODE_X86_AMX; break;
case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break;
case Type::IntegerTyID:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 276d980c1dcca..54f7f127ae663 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5225,6 +5225,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
// Ignore the callsite's attributes. A specific call site may be marked with
// readnone, but the lowering code will expect the chain based on the
// definition.
+ const auto &Triple = DAG.getTarget().getTargetTriple();
const Function *F = I.getCalledFunction();
bool HasChain = !F->doesNotAccessMemory();
bool OnlyLoad = HasChain && F->onlyReadsMemory();
@@ -5272,10 +5273,21 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
DAG.getTargetConstantFP(*cast<ConstantFP>(Arg), SDLoc(), VT));
}
}
+ if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) {
+ for (SDValue &Op : Ops) {
+ if (Op.getValueType() == MVT::v1i64)
+ Op = DAG.getBitcast(MVT::x86mmx, Op);
+ }
+ }
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
+ if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) {
+ if (ValueVTs.size() == 1 && ValueVTs[0] == MVT::v1i64)
+ ValueVTs[0] = MVT::x86mmx;
+ }
+
if (HasChain)
ValueVTs.push_back(MVT::Other);
@@ -5344,6 +5356,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
}
+ if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) {
+ if (Result.getValueType() == MVT::x86mmx)
+ Result = DAG.getBitcast(MVT::v1i64, Result);
+ }
+
setValue(&I, Result);
}
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index b0f736a49c20e..0c6b726a28a24 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -207,7 +207,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
assert(isExtended() && "Type is not extended!");
return LLVMTy;
case MVT::isVoid: return Type::getVoidTy(Context);
- case MVT::x86mmx: return Type::getX86_MMXTy(Context);
+ case MVT::x86mmx: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
case MVT::aarch64svcount:
return TargetExtType::get(Context, "aarch64.svcount");
case MVT::x86amx: return Type::getX86_AMXTy(Context);
@@ -241,8 +241,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
case Type::BFloatTyID: return MVT(MVT::bf16);
case Type::FloatTyID: return MVT(MVT::f32);
case Type::DoubleTyID: return MVT(MVT::f64);
- case Type::X86_FP80TyID: return MVT(MVT::f80);
- case Type::X86_MMXTyID: return MVT(MVT::x86mmx);
+ case Type::X86_FP80TyID:
+ return MVT(MVT::f80);
case Type::TargetExtTyID: {
TargetExtType *TargetExtTy = cast<TargetExtType>(Ty);
if (TargetExtTy->getName() == "aarch64.svcount")
@@ -302,4 +302,3 @@ void MVT::print(raw_ostream &OS) const {
else
OS << EVT(*this).getEVTString();
}
-
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 6599730590de6..01a16ccd688f4 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -573,8 +573,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
case Type::FP128TyID: OS << "fp128"; return;
case Type::PPC_FP128TyID: OS << "ppc_fp128"; return;
case Type::LabelTyID: OS << "label"; return;
- case Type::MetadataTyID: OS << "metadata"; return;
- case Type::X86_MMXTyID: OS << "x86_mmx"; return;
+ case Type::MetadataTyID:
+ OS << "metadata";
+ return;
case Type::X86_AMXTyID: OS << "x86_amx"; return;
case Type::TokenTyID: OS << "token"; return;
case Type::IntegerTyID:
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 693674ae0d06f..05ab0968ef6f3 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -142,7 +142,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
return UndefValue::get(DestTy);
}
- if (V->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy() &&
+ if (V->isNullValue() && !DestTy->isX86_AMXTy() &&
opc != Instruction::AddrSpaceCast)
return Constant::getNullValue(DestTy);
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 9ba7873106043..b28c3ec56827a 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -609,8 +609,6 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
return LLVMPointerTypeKind;
case Type::FixedVectorTyID:
return LLVMVectorTypeKind;
- case Type::X86_MMXTyID:
- return LLVMX86_MMXTypeKind;
case Type::X86_AMXTyID:
return LLVMX86_AMXTypeKind;
case Type::TokenTyID:
@@ -725,9 +723,6 @@ LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) {
LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C));
}
-LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) {
- return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C));
-}
LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getX86_AMXTy(*unwrap(C));
}
@@ -753,9 +748,6 @@ LLVMTypeRef LLVMFP128Type(void) {
LLVMTypeRef LLVMPPCFP128Type(void) {
return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext());
}
-LLVMTypeRef LLVMX86MMXType(void) {
- return LLVMX86MMXTypeInContext(LLVMGetGlobalContext());
-}
LLVMTypeRef LLVMX86AMXType(void) {
return LLVMX86AMXTypeInContext(LLVMGetGlobalContext());
}
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index 2741165332487..17897f77b4edb 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -835,7 +835,6 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
// layout.
return Align(PowerOf2Ceil(BitWidth / 8));
}
- case Type::X86_MMXTyID:
case Type::FixedVectorTyID:
case Type::ScalableVectorTyID: {
unsigned BitWidth = getTypeSizeInBits(Ty).getKnownMinValue();
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 20871982afb06..9b0dd5fca7e0e 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1052,8 +1052,9 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) {
case Type::DoubleTyID: Result += "f64"; break;
case Type::X86_FP80TyID: Result += "f80"; break;
case Type::FP128TyID: Result += "f128"; break;
- case Type::PPC_FP128TyID: Result += "ppcf128"; break;
- case Type::X86_MMXTyID: Result += "x86mmx"; break;
+ case Type::PPC_FP128TyID:
+ Result += "ppcf128";
+ break;
case Type::X86_AMXTyID: Result += "x86amx"; break;
case Type::IntegerTyID:
Result += "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
@@ -1397,7 +1398,8 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
switch (D.Kind) {
case IITDescriptor::Void: return Type::getVoidTy(Context);
case IITDescriptor::VarArg: return Type::getVoidTy(Context);
- case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
+ case IITDescriptor::MMX:
+ return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
case IITDescriptor::AMX: return Type::getX86_AMXTy(Context);
case IITDescriptor::Token: return Type::getTokenTy(Context);
case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
@@ -1580,7 +1582,11 @@ static bool matchIntrinsicType(
switch (D.Kind) {
case IITDescriptor::Void: return !Ty->isVoidTy();
case IITDescriptor::VarArg: return true;
- case IITDescriptor::MMX: return !Ty->isX86_MMXTy();
+ case IITDescriptor::MMX: {
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
+ return !VT || VT->getNumElements() != 1 ||
+ !VT->getElementType()->isIntegerTy(64);
+ }
case IITDescriptor::AMX: return !Ty->isX86_AMXTy();
case IITDescriptor::Token: return !Ty->isTokenTy();
case IITDescriptor::Metadata: return !Ty->isMetadataTy();
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 7a8cf8c230498..58ebe7e95cd06 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -3116,9 +3116,6 @@ bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) {
if (SrcBits != DestBits)
return false;
- if (DestTy->isX86_MMXTy() || SrcTy->isX86_MMXTy())
- return false;
-
return true;
}
@@ -3228,12 +3225,6 @@ CastInst::getCastOpcode(
return IntToPtr; // int -> ptr
}
llvm_unreachable("Casting pointer to other than pointer or int");
- } else if (DestTy->isX86_MMXTy()) {
- if (SrcTy->isVectorTy()) {
- assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX");
- return BitCast; // 64-bit vector to MMX
- }
- llvm_unreachable("Illegal cast to X86_MMX");
}
llvm_unreachable("Casting to type that is not first-class");
}
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 0a376179d609c..4f1ef8cec3213 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -40,9 +40,9 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
FloatTy(C, Type::FloatTyID), DoubleTy(C, Type::DoubleTyID),
MetadataTy(C, Type::MetadataTyID), TokenTy(C, Type::TokenTyID),
X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID),
- PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID),
- X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8),
- Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) {}
+ PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_AMXTy(C, Type::X86_AMXTyID),
+ Int1Ty(C, 1), Int8Ty(C, 8), Int16Ty(C, 16), Int32Ty(C, 32),
+ Int64Ty(C, 64), Int128Ty(C, 128) {}
LLVMContextImpl::~LLVMContextImpl() {
#ifndef NDEBUG
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 937a87d686175..8e9ca21d149f6 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1582,7 +1582,7 @@ class LLVMContextImpl {
// Basic type instances.
Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy,
TokenTy;
- Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy, X86_AMXTy;
+ Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_AMXTy;
IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty;
std::unique_ptr<ConstantTokenNone> TheNoneToken;
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 5c61ad9f000b0..18a547e75fe1e 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -44,8 +44,8 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
case FP128TyID : return getFP128Ty(C);
case PPC_FP128TyID : return getPPC_FP128Ty(C);
case LabelTyID : return getLabelTy(C);
- case MetadataTyID : return getMetadataTy(C);
- case X86_MMXTyID : return getX86_MMXTy(C);
+ case MetadataTyID:
+ return getMetadataTy(C);
case X86_AMXTyID : return getX86_AMXTy(C);
case TokenTyID : return getTokenTy(C);
default:
@@ -125,14 +125,6 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const {
if (isa<VectorType>(this) && isa<VectorType>(Ty))
return getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits();
- // 64-bit fixed width vector types can be losslessly converted to x86mmx.
- if (((isa<FixedVectorType>(this)) && Ty->isX86_MMXTy()) &&
- getPrimitiveSizeInBits().getFixedValue() == 64)
- return true;
- if ((isX86_MMXTy() && isa<FixedVectorType>(Ty)) &&
- Ty->getPrimitiveSizeInBits().getFixedValue() == 64)
- return true;
-
// 8192-bit fixed width vector types can be losslessly converted to x86amx.
if (((isa<FixedVectorType>(this)) && Ty->isX86_AMXTy()) &&
getPrimitiveSizeInBits().getFixedValue() == 8192)
@@ -179,8 +171,6 @@ TypeSize Type::getPrimitiveSizeInBits() const {
return TypeSize::getFixed(128);
case Type::PPC_FP128TyID:
return TypeSize::getFixed(128);
- case Type::X86_MMXTyID:
- return TypeSize::getFixed(64);
case Type::X86_AMXTyID:
return TypeSize::getFixed(8192);
case Type::IntegerTyID:
@@ -245,7 +235,6 @@ Type *Type::getTokenTy(LLVMContext &C) { return &C.pImpl->TokenTy; }
Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; }
Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; }
Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; }
-Type *Type::getX86_MMXTy(LLVMContext &C) { return &C.pImpl->X86_MMXTy; }
Type *Type::getX86_AMXTy(LLVMContext &C) { return &C.pImpl->X86_AMXTy; }
IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; }
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 3433408f05171..cd0d6d34e9a67 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -1048,9 +1048,6 @@ void DXILBitcodeWriter::writeTypeTable() {
case Type::MetadataTyID:
Code = bitc::TYPE_CODE_METADATA;
break;
- case Type::X86_MMXTyID:
- Code = bitc::TYPE_CODE_X86_MMX;
- break;
case Type::IntegerTyID:
// INTEGER: [width]
Code = bitc::TYPE_CODE_INTEGER;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index e5d10a75728bf..0c1b0aea41f41 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -329,7 +329,6 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
case Type::PPC_FP128TyID:
case Type::LabelTyID:
case Type::MetadataTyID:
- case Type::X86_MMXTyID:
case Type::X86_AMXTyID:
case Type::TokenTyID:
case Type::TypedPointerTyID:
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 9ec68bfb8e0f7..c55ff3dfc9c8e 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -168,10 +168,6 @@ def CC_#NAME : CallingConv<[
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64], CCAssignToStack<8, 4>>,
- // MMX type gets 8 byte slot in stack , while alignment depends on target
- CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
- CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
-
// float 128 get stack slots whose size and alignment depends
// on the subtarget.
CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
@@ -286,10 +282,6 @@ def RetCC_X86Common : CallingConv<[
CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64],
CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
- // MMX vector types are always returned in MM0. If the target doesn't have
- // MM0, it doesn't support these vector types.
- CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
-
// Long double types are always returned in FP0 (even with SSE),
// except on Win64.
CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>>
@@ -376,9 +368,6 @@ def RetCC_X86_64_C : CallingConv<[
CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
- // MMX vector types are always returned in XMM0.
- CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
-
// Pointers are always returned in full 64-bit registers.
CCIfPtr<CCCustom<"CC_X86_64_Pointer">>,
@@ -389,9 +378,6 @@ def RetCC_X86_64_C : CallingConv<[
// X86-Win64 C return-value convention.
def RetCC_X86_Win64_C : CallingConv<[
- // The X86-Win64 calling convention always returns __m64 values in RAX.
- CCIfType<[x86mmx], CCBitConvertToType<i64>>,
-
// GCC returns FP values in RAX on Win64.
CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,
@@ -436,8 +422,6 @@ def RetCC_X86_64_Swift : CallingConv<[
CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
- // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3.
- CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
CCDelegateTo<RetCC_X86Common>
]>;
@@ -572,12 +556,6 @@ def CC_X86_64_C : CallingConv<[
CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
- // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
- CCIfType<[x86mmx],
- CCIfSubtarget<"isTargetDarwin()",
- CCIfSubtarget<"hasSSE2()",
- CCPromoteToType<v2i64>>>>,
-
// Boolean vectors of AVX-512 are passed in SIMD registers.
// The call from AVX to AVX-512 function should work,
// since the boolean types in AVX/AVX2 are promoted by default.
@@ -666,9 +644,6 @@ def CC_X86_Win64_C : CallingConv<[
// Long doubles are passed by pointer
CCIfType<[f80], CCPassIndirect<i64>>,
- // The first 4 MMX vector arguments are passed in GPRs.
- CCIfType<[x86mmx], CCBitConvertToType<i64>>,
-
// If SSE was disabled, pass FP values smaller than 64-bits as integers in
// GPRs or on the stack.
CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
@@ -843,11 +818,6 @@ def CC_X86_32_Common : CallingConv<[
CCIfNotVarArg<CCIfInReg<CCIfType<[f16], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
- // The first 3 __m64 vector arguments are passed in mmx registers if the
- // call is not a vararg call.
- CCIfNotVarArg<CCIfType<[x86mmx],
- CCAssignToReg<[MM0, MM1, MM2]>>>,
-
CCIfType<[f16], CCAssignToStack<4, 4>>,
// Integer/Float values get stored in stack slots that are 4 bytes in
@@ -870,10 +840,6 @@ def CC_X86_32_Common : CallingConv<[
CCIfType<[v32i1], CCPromoteToType<v32i8>>,
CCIfType<[v64i1], CCPromoteToType<v64i8>>,
- // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
- // passed in the parameter area.
- CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
-
// Darwin passes vectors in a form that differs from the i386 psABI
CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>,
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 322cb6f6f5819..793d62ba2a8e7 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -623,11 +623,13 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II,
if (isa<UndefValue>(Arg))
return Constant::getNullValue(ResTy);
- auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
- // We can't easily peek through x86_mmx types.
- if (!ArgTy)
+ // Preserve previous behavior and give up.
+ // TODO: treat as <8 x i8>.
+ if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
return nullptr;
+ auto *ArgTy = cast<FixedVectorType>(Arg->getType());
+
// Expand MOVMSK to compare/bitcast/zext:
// e.g. PMOVMSKB(v16i8 x):
// %cmp = icmp slt <16 x i8> %x, zeroinitializer
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index a164c82bdf75d..f5a7ab26a49e9 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -962,8 +962,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
} else if (NewCB->getType()->isVoidTy()) {
// If the return value is dead, replace any uses of it with poison
// (any non-debug value uses will get removed later on).
- if (!CB.getType()->isX86_MMXTy())
- CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
+ CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
} else {
assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
"Return type changed, but not into a void. The old return type"
@@ -1027,8 +1026,7 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
} else {
// If this argument is dead, replace any uses of it with poison
// (any non-debug value uses will get removed later on).
- if (!I->getType()->isX86_MMXTy())
- I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+ I->replaceAllUsesWith(PoisonValue::get(I->getType()));
}
// If we change the return value of the function we must rewrite any return
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 7b1268939e9c4..7fcbeb35de71d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2677,13 +2677,6 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
return replaceInstUsesWith(CI, Src);
if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
- // Beware: messing with this target-specific oddity may cause trouble.
- if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
- Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
- return InsertElementInst::Create(PoisonValue::get(DestTy), Elem,
- Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
- }
-
if (isa<IntegerType>(SrcTy)) {
// If this is a cast from an integer to vector, check to see if the input
// is a trunc or zext of a bitcast from vector. If so, we can replace all
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index c7d41f6298372..07f84479bf74a 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2977,8 +2977,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// Caller guarantees that this intrinsic does not access memory.
bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
Type *RetTy = I.getType();
- if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy() ||
- RetTy->isX86_MMXTy()))
+ if (!(RetTy->isIntOrIntVectorTy() || RetTy->isFPOrFPVectorTy()))
return false;
unsigned NumArgOperands = I.arg_size();
@@ -3208,7 +3207,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // Get an X86_MMX-sized vector type.
+ // Get an MMX-sized vector type.
Type *getMMXVectorTy(unsigned EltSizeInBits) {
const unsigned X86_MMXSizeInBits = 64;
assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
@@ -3254,20 +3253,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// packs elements of 2 input vectors into half as many bits with saturation.
// Shadow is propagated with the signed variant of the same intrinsic applied
// to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
- // EltSizeInBits is used only for x86mmx arguments.
- void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+ // MMXEltSizeInBits is used only for x86mmx arguments.
+ void handleVectorPackIntrinsic(IntrinsicInst &I,
+ unsigned MMXEltSizeInBits = 0) {
assert(I.arg_size() == 2);
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
IRBuilder<> IRB(&I);
Value *S1 = getShadow(&I, 0);
Value *S2 = getShadow(&I, 1);
- assert(isX86_MMX || S1->getType()->isVectorTy());
+ assert(S1->getType()->isVectorTy());
// SExt and ICmpNE below must apply to individual elements of input vectors.
// In case of x86mmx arguments, cast them to appropriate vector types and
// back.
- Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
- if (isX86_MMX) {
+ Type *T =
+ MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits) : S1->getType();
+ if (MMXEltSizeInBits) {
S1 = IRB.CreateBitCast(S1, T);
S2 = IRB.CreateBitCast(S2, T);
}
@@ -3275,10 +3275,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRB.CreateSExt(IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T);
Value *S2_ext =
IRB.CreateSExt(IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T);
- if (isX86_MMX) {
- Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
- S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
- S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+ if (MMXEltSizeInBits) {
+ S1_ext = IRB.CreateBitCast(S1_ext, getMMXVectorTy(64));
+ S2_ext = IRB.CreateBitCast(S2_ext, getMMXVectorTy(64));
}
Function *ShadowFn = Intrinsic::getDeclaration(
@@ -3286,7 +3285,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *S =
IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
- if (isX86_MMX)
+ if (MMXEltSizeInBits)
S = IRB.CreateBitCast(S, getShadowTy(&I));
setShadow(&I, S);
setOriginForNaryOp(I);
@@ -3393,10 +3392,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
// Instrument sum-of-absolute-differences intrinsic.
- void handleVectorSadIntrinsic(IntrinsicInst &I) {
+ void handleVectorSadIntrinsic(IntrinsicInst &I, bool IsMMX = false) {
const unsigned SignificantBitsPerResultElement = 16;
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
- Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
+ Type *ResTy = IsMMX ? IntegerType::get(*MS.C, 64) : I.getType();
unsigned ZeroBitsPerResultElement =
ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
@@ -3415,9 +3413,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Instrument multiply-add intrinsic.
void handleVectorPmaddIntrinsic(IntrinsicInst &I,
- unsigned EltSizeInBits = 0) {
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
- Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
+ unsigned MMXEltSizeInBits = 0) {
+ Type *ResTy =
+ MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType();
IRBuilder<> IRB(&I);
auto *Shadow0 = getShadow(&I, 0);
auto *Shadow1 = getShadow(&I, 1);
@@ -4088,6 +4086,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
break;
case Intrinsic::x86_mmx_psad_bw:
+ handleVectorSadIntrinsic(I, true);
+ break;
case Intrinsic::x86_sse2_psad_bw:
case Intrinsic::x86_avx2_psad_bw:
handleVectorSadIntrinsic(I);
@@ -4968,7 +4968,7 @@ struct VarArgAMD64Helper : public VarArgHelperBase {
Type *T = arg->getType();
if (T->isX86_FP80Ty())
return AK_Memory;
- if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+ if (T->isFPOrFPVectorTy())
return AK_FloatingPoint;
if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
return AK_GeneralPurpose;
diff --git a/llvm/test/Assembler/x86mmx.ll b/llvm/test/Assembler/x86mmx.ll
deleted file mode 100644
index 608347e0fceb1..0000000000000
--- a/llvm/test/Assembler/x86mmx.ll
+++ /dev/null
@@ -1,9 +0,0 @@
-; RUN: llvm-as < %s | llvm-dis | FileCheck %s
-; RUN: verify-uselistorder %s
-; Basic smoke test for x86_mmx type.
-
-; CHECK: define x86_mmx @sh16
-define x86_mmx @sh16(x86_mmx %A) {
-; CHECK: ret x86_mmx %A
- ret x86_mmx %A
-}
diff --git a/llvm/test/Bitcode/bcanalyzer-types.ll b/llvm/test/Bitcode/bcanalyzer-types.ll
index cbe6f5d22c947..f1732db174c29 100644
--- a/llvm/test/Bitcode/bcanalyzer-types.ll
+++ b/llvm/test/Bitcode/bcanalyzer-types.ll
@@ -3,7 +3,6 @@
; CHECK: Block ID {{.*}} (TYPE_BLOCK_ID)
; CHECK: BFLOAT
; CHECK: TOKEN
-; CHECK: X86_MMX
; CHECK: HALF
; CHECK: Block ID
@@ -12,11 +11,6 @@ define half @test_half(half %x, half %y) {
ret half %a
}
-define x86_mmx @test_mmx(<2 x i32> %x) {
- %a = bitcast <2 x i32> %x to x86_mmx
- ret x86_mmx %a
-}
-
define bfloat @test_bfloat(i16 %x) {
%a = bitcast i16 %x to bfloat
ret bfloat %a
diff --git a/llvm/test/Bitcode/compatibility-3.6.ll b/llvm/test/Bitcode/compatibility-3.6.ll
index 2190e2fbccf28..37a87eea41ad3 100644
--- a/llvm/test/Bitcode/compatibility-3.6.ll
+++ b/llvm/test/Bitcode/compatibility-3.6.ll
@@ -645,7 +645,7 @@ define void @typesystem() {
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca x86_mmx
+ ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca %opaquety*
; CHECK: %t8 = alloca ptr
diff --git a/llvm/test/Bitcode/compatibility-3.7.ll b/llvm/test/Bitcode/compatibility-3.7.ll
index 7e59b5c1be6e2..8de2132d7ec89 100644
--- a/llvm/test/Bitcode/compatibility-3.7.ll
+++ b/llvm/test/Bitcode/compatibility-3.7.ll
@@ -689,7 +689,7 @@ define void @typesystem() {
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca x86_mmx
+ ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca %opaquety*
; CHECK: %t8 = alloca ptr
diff --git a/llvm/test/Bitcode/compatibility-3.8.ll b/llvm/test/Bitcode/compatibility-3.8.ll
index ebd1f2fff8c94..7f766aa34a005 100644
--- a/llvm/test/Bitcode/compatibility-3.8.ll
+++ b/llvm/test/Bitcode/compatibility-3.8.ll
@@ -742,7 +742,7 @@ define void @typesystem() {
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca x86_mmx
+ ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca %opaquety*
; CHECK: %t8 = alloca ptr
diff --git a/llvm/test/Bitcode/compatibility-3.9.ll b/llvm/test/Bitcode/compatibility-3.9.ll
index c34f04ceb0de3..c8309175e063f 100644
--- a/llvm/test/Bitcode/compatibility-3.9.ll
+++ b/llvm/test/Bitcode/compatibility-3.9.ll
@@ -813,7 +813,7 @@ define void @typesystem() {
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca x86_mmx
+ ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca %opaquety*
; CHECK: %t8 = alloca ptr
diff --git a/llvm/test/Bitcode/compatibility-4.0.ll b/llvm/test/Bitcode/compatibility-4.0.ll
index 05bffda1d117a..adbd91ac6c7fe 100644
--- a/llvm/test/Bitcode/compatibility-4.0.ll
+++ b/llvm/test/Bitcode/compatibility-4.0.ll
@@ -813,7 +813,7 @@ define void @typesystem() {
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca x86_mmx
+ ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca %opaquety*
; CHECK: %t8 = alloca ptr
diff --git a/llvm/test/Bitcode/compatibility-5.0.ll b/llvm/test/Bitcode/compatibility-5.0.ll
index 0c872289c62ba..1b500da69568a 100644
--- a/llvm/test/Bitcode/compatibility-5.0.ll
+++ b/llvm/test/Bitcode/compatibility-5.0.ll
@@ -820,7 +820,7 @@ define void @typesystem() {
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca x86_mmx
+ ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca %opaquety*
; CHECK: %t8 = alloca ptr
diff --git a/llvm/test/Bitcode/compatibility-6.0.ll b/llvm/test/Bitcode/compatibility-6.0.ll
index 44c680885be34..c1abbf0cda6eb 100644
--- a/llvm/test/Bitcode/compatibility-6.0.ll
+++ b/llvm/test/Bitcode/compatibility-6.0.ll
@@ -830,7 +830,7 @@ define void @typesystem() {
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca x86_mmx
+ ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca %opaquety*
; CHECK: %t8 = alloca ptr
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index e437c37d8d1c8..a7567038b7a7b 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1113,7 +1113,7 @@ define void @typesystem() {
%t6 = alloca ppc_fp128
; CHECK: %t6 = alloca ppc_fp128
%t7 = alloca x86_mmx
- ; CHECK: %t7 = alloca x86_mmx
+ ; CHECK: %t7 = alloca <1 x i64>
%t8 = alloca ptr
; CHECK: %t8 = alloca ptr
%t9 = alloca <4 x i32>
diff --git a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
index ac86279ca6667..3a112ae2a2113 100644
--- a/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
+++ b/llvm/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
@@ -29,7 +29,17 @@ entry:
define <2 x double> @a2(x86_mmx %x) nounwind {
; CHECK-LABEL: a2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvtpi2pd %mm0, %xmm0
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: andl $-8, %esp
+; CHECK-NEXT: subl $8, %esp
+; CHECK-NEXT: movl 8(%ebp), %eax
+; CHECK-NEXT: movl 12(%ebp), %ecx
+; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl %eax, (%esp)
+; CHECK-NEXT: cvtpi2pd (%esp), %xmm0
+; CHECK-NEXT: movl %ebp, %esp
+; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
entry:
%y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x)
@@ -39,7 +49,16 @@ entry:
define x86_mmx @b2(<2 x double> %x) nounwind {
; CHECK-LABEL: b2:
; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: andl $-8, %esp
+; CHECK-NEXT: subl $8, %esp
; CHECK-NEXT: cvttpd2pi %xmm0, %mm0
+; CHECK-NEXT: movq %mm0, (%esp)
+; CHECK-NEXT: movl (%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl %ebp, %esp
+; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
entry:
%y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
diff --git a/llvm/test/CodeGen/X86/3dnow-intrinsics.ll b/llvm/test/CodeGen/X86/3dnow-intrinsics.ll
index a82f705b77d84..73870d57cb79a 100644
--- a/llvm/test/CodeGen/X86/3dnow-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/3dnow-intrinsics.ll
@@ -5,15 +5,32 @@
define <8 x i8> @test_pavgusb(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
; X86-LABEL: test_pavgusb:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pavgusb %mm1, %mm0
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movq (%esp), %mm0
+; X86-NEXT: pavgusb {{[0-9]+}}(%esp), %mm0
; X86-NEXT: movq %mm0, (%eax)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
;
; X64-LABEL: test_pavgusb:
; X64: # %bb.0: # %entry
-; X64-NEXT: pavgusb %mm1, %mm0
-; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pavgusb %mm0, %mm1
+; X64-NEXT: movq2dq %mm1, %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast x86_mmx %a.coerce to <8 x i8>
@@ -638,8 +655,12 @@ define <2 x float> @test_pi2fd(x86_mmx %a.coerce) nounwind readnone {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pi2fd %mm0, %mm0
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: pi2fd {{[0-9]+}}(%esp), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
@@ -649,6 +670,7 @@ define <2 x float> @test_pi2fd(x86_mmx %a.coerce) nounwind readnone {
;
; X64-LABEL: test_pi2fd:
; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rdi, %mm0
; X64-NEXT: pi2fd %mm0, %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
@@ -666,15 +688,32 @@ declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
define <4 x i16> @test_pmulhrw(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
; X86-LABEL: test_pmulhrw:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: pmulhrw %mm1, %mm0
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl 20(%ebp), %eax
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movq (%esp), %mm0
+; X86-NEXT: pmulhrw {{[0-9]+}}(%esp), %mm0
; X86-NEXT: movq %mm0, (%eax)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
;
; X64-LABEL: test_pmulhrw:
; X64: # %bb.0: # %entry
-; X64-NEXT: pmulhrw %mm1, %mm0
-; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pmulhrw %mm0, %mm1
+; X64-NEXT: movq2dq %mm1, %xmm0
; X64-NEXT: retq
entry:
%0 = bitcast x86_mmx %a.coerce to <4 x i16>
@@ -805,8 +844,12 @@ define <2 x float> @test_pi2fw(x86_mmx %a.coerce) nounwind readnone {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pi2fw %mm0, %mm0
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: pi2fw {{[0-9]+}}(%esp), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
@@ -816,6 +859,7 @@ define <2 x float> @test_pi2fw(x86_mmx %a.coerce) nounwind readnone {
;
; X64-LABEL: test_pi2fw:
; X64: # %bb.0: # %entry
+; X64-NEXT: movq %rdi, %mm0
; X64-NEXT: pi2fw %mm0, %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
index b442a6337e3b8..3f6f8c01b9049 100644
--- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll
@@ -1014,17 +1014,13 @@ define float @broadcast_lifetime() nounwind {
define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86: ## %bb.0: ## %bb
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
;
; X64-LABEL: broadcast_x86_mmx:
; X64: ## %bb.0: ## %bb
-; X64-NEXT: movdq2q %xmm0, %mm0
-; X64-NEXT: movq2dq %mm0, %xmm0
-; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: vmovq %rdi, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
bb:
%tmp1 = bitcast x86_mmx %tmp to i64
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index b7516d30df5f6..fed6c2eb8ba0a 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1452,25 +1452,18 @@ eintry:
define <8 x i16> @broadcast_x86_mmx(x86_mmx %tmp) nounwind {
; X86-LABEL: broadcast_x86_mmx:
; X86: ## %bb.0: ## %bb
-; X86-NEXT: subl $12, %esp
-; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
;
; X64-AVX2-LABEL: broadcast_x86_mmx:
; X64-AVX2: ## %bb.0: ## %bb
-; X64-AVX2-NEXT: movdq2q %xmm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, %rax
-; X64-AVX2-NEXT: vmovq %rax, %xmm0
+; X64-AVX2-NEXT: vmovq %rdi, %xmm0
; X64-AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: broadcast_x86_mmx:
; X64-AVX512VL: ## %bb.0: ## %bb
-; X64-AVX512VL-NEXT: movdq2q %xmm0, %mm0
-; X64-AVX512VL-NEXT: movq %mm0, %rax
-; X64-AVX512VL-NEXT: vpbroadcastq %rax, %xmm0
+; X64-AVX512VL-NEXT: vpbroadcastq %rdi, %xmm0
; X64-AVX512VL-NEXT: retq
bb:
%tmp1 = bitcast x86_mmx %tmp to i64
diff --git a/llvm/test/CodeGen/X86/fast-isel-bc.ll b/llvm/test/CodeGen/X86/fast-isel-bc.ll
index 0fbc9fab05681..e3bb5e7176e57 100644
--- a/llvm/test/CodeGen/X86/fast-isel-bc.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-bc.ll
@@ -12,7 +12,11 @@ define void @func1() nounwind {
; X86-LABEL: func1:
; X86: ## %bb.0:
; X86-NEXT: subl $12, %esp
-; X86-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0 ## mm0 = 0x200000000
+; X86-NEXT: movl $2, %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: movl %esp, %eax
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: calll _func2
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
@@ -20,8 +24,7 @@ define void @func1() nounwind {
; X64-LABEL: func1:
; X64: ## %bb.0:
; X64-NEXT: pushq %rax
-; X64-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %mm0 ## mm0 = 0x200000000
-; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movabsq $8589934592, %rdi ## imm = 0x200000000
; X64-NEXT: callq _func2
; X64-NEXT: popq %rax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
index c13fdae540d0b..fd9f4fa63a090 100644
--- a/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -94,6 +94,7 @@ entry:
;
; MMX Store
+; Note: doesn't actually emit a non-temporal store here.
;
define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
@@ -101,7 +102,7 @@ define void @test_mmx(ptr nocapture %a0, ptr nocapture %a1) {
; ALL: # %bb.0: # %entry
; ALL-NEXT: movq (%rdi), %mm0
; ALL-NEXT: psrlq $3, %mm0
-; ALL-NEXT: movntq %mm0, (%rsi)
+; ALL-NEXT: movq %mm0, (%rsi)
; ALL-NEXT: retq
entry:
%0 = load x86_mmx, ptr %a0
diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
index a4dbb10e0d7a0..54f048eb697f6 100644
--- a/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
+++ b/llvm/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -10,8 +10,8 @@ define void @t3() nounwind {
; X86-64-LABEL: t3:
; X86-64: ## %bb.0:
; X86-64-NEXT: movq _g_v8qi at GOTPCREL(%rip), %rax
-; X86-64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-64-NEXT: movb $1, %al
+; X86-64-NEXT: movq (%rax), %rdi
+; X86-64-NEXT: xorl %eax, %eax
; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL
%tmp3 = load <8 x i8>, ptr @g_v8qi, align 8
%tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
@@ -22,12 +22,11 @@ define void @t3() nounwind {
define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind {
; X86-64-LABEL: t4:
; X86-64: ## %bb.0:
-; X86-64-NEXT: movdq2q %xmm1, %mm0
-; X86-64-NEXT: movdq2q %xmm0, %mm1
-; X86-64-NEXT: movq2dq %mm1, %xmm1
-; X86-64-NEXT: movq2dq %mm0, %xmm0
-; X86-64-NEXT: paddb %xmm1, %xmm0
-; X86-64-NEXT: movb $1, %al
+; X86-64-NEXT: movq %rdi, %xmm0
+; X86-64-NEXT: movq %rsi, %xmm1
+; X86-64-NEXT: paddb %xmm0, %xmm1
+; X86-64-NEXT: movq %xmm1, %rdi
+; X86-64-NEXT: xorl %eax, %eax
; X86-64-NEXT: jmp _pass_v8qi ## TAILCALL
%v1a = bitcast x86_mmx %v1 to <8 x i8>
%v2b = bitcast x86_mmx %v2 to <8 x i8>
diff --git a/llvm/test/CodeGen/X86/mmx-arg-passing.ll b/llvm/test/CodeGen/X86/mmx-arg-passing.ll
index af116a2ac281b..1ae9920873faf 100644
--- a/llvm/test/CodeGen/X86/mmx-arg-passing.ll
+++ b/llvm/test/CodeGen/X86/mmx-arg-passing.ll
@@ -13,15 +13,17 @@
define void @t1(x86_mmx %v1) nounwind {
; X86-32-LABEL: t1:
; X86-32: ## %bb.0:
-; X86-32-NEXT: movl L_u1$non_lazy_ptr, %eax
-; X86-32-NEXT: movq %mm0, (%eax)
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: movl L_u1$non_lazy_ptr, %edx
+; X86-32-NEXT: movl %ecx, 4(%edx)
+; X86-32-NEXT: movl %eax, (%edx)
; X86-32-NEXT: retl
;
; X86-64-LABEL: t1:
; X86-64: ## %bb.0:
-; X86-64-NEXT: movdq2q %xmm0, %mm0
; X86-64-NEXT: movq _u1 at GOTPCREL(%rip), %rax
-; X86-64-NEXT: movq %mm0, (%rax)
+; X86-64-NEXT: movq %rdi, (%rax)
; X86-64-NEXT: retq
store x86_mmx %v1, ptr @u1, align 8
ret void
diff --git a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
index 0fa7b24ff445a..a1240911cd36a 100644
--- a/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
+++ b/llvm/test/CodeGen/X86/mmx-bitcast-fold.ll
@@ -1,6 +1,6 @@
; RUN: opt -mtriple=x86_64-- -passes=early-cse -earlycse-debug-hash < %s -S | FileCheck %s
-; CHECK: @foo(x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
+; CHECK: @foo(<1 x i64> zeroinitializer)
define void @bar() {
entry:
diff --git a/llvm/test/CodeGen/X86/mmx-bitcast.ll b/llvm/test/CodeGen/X86/mmx-bitcast.ll
index f914b8622fcf4..49c2027f06604 100644
--- a/llvm/test/CodeGen/X86/mmx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/mmx-bitcast.ll
@@ -58,8 +58,8 @@ define i64 @t3(ptr %p) {
define void @t4(<1 x i64> %A, <1 x i64> %B) {
; CHECK-LABEL: t4:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: movq %rdi, %mm0
-; CHECK-NEXT: movq %rsi, %mm1
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
; CHECK-NEXT: paddusw %mm0, %mm1
; CHECK-NEXT: movq _R at GOTPCREL(%rip), %rax
; CHECK-NEXT: movq %mm1, (%rax)
diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll
index 73df6be8d7989..a31339902bb64 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-load.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -288,8 +288,13 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddb (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -301,7 +306,8 @@ define i64 @tt0(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt0:
; X64: # %bb.0: # %entry
-; X64-NEXT: paddb (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: paddb (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -321,8 +327,13 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddw (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -334,7 +345,8 @@ define i64 @tt1(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt1:
; X64: # %bb.0: # %entry
-; X64-NEXT: paddw (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: paddw (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -353,8 +365,13 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -366,7 +383,8 @@ define i64 @tt2(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt2:
; X64: # %bb.0: # %entry
-; X64-NEXT: paddd (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: paddd (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -385,8 +403,13 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddq (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -398,7 +421,8 @@ define i64 @tt3(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt3:
; X64: # %bb.0: # %entry
-; X64-NEXT: paddq (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: paddq (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -417,8 +441,13 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddusb (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -430,7 +459,8 @@ define i64 @tt4(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt4:
; X64: # %bb.0: # %entry
-; X64-NEXT: paddusb (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: paddusb (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -449,8 +479,13 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddusw (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -462,7 +497,8 @@ define i64 @tt5(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt5:
; X64: # %bb.0: # %entry
-; X64-NEXT: paddusw (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: paddusw (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -481,8 +517,13 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: psrlw (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -494,7 +535,8 @@ define i64 @tt6(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt6:
; X64: # %bb.0: # %entry
-; X64-NEXT: psrlw (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: psrlw (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -513,8 +555,13 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: psrld (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -526,7 +573,8 @@ define i64 @tt7(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt7:
; X64: # %bb.0: # %entry
-; X64-NEXT: psrld (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: psrld (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -545,8 +593,13 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: psrlq (%eax), %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -558,7 +611,8 @@ define i64 @tt8(x86_mmx %t, ptr %q) nounwind {
;
; X64-LABEL: tt8:
; X64: # %bb.0: # %entry
-; X64-NEXT: psrlq (%rdi), %mm0
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: psrlq (%rsi), %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: emms
; X64-NEXT: retq
@@ -612,16 +666,29 @@ declare void @llvm.lifetime.end(i64, ptr nocapture)
define x86_mmx @vec_load(ptr %x) {
; X86-LABEL: vec_load:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1]
; X86-NEXT: paddsb %mm0, %mm0
+; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: vec_load:
; X64: # %bb.0:
; X64-NEXT: pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1]
; X64-NEXT: paddsb %mm0, %mm0
-; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
%z = load <4 x float>, ptr %x
%y = extractelement <4 x float> %z, i32 0
diff --git a/llvm/test/CodeGen/X86/mmx-intrinsics.ll b/llvm/test/CodeGen/X86/mmx-intrinsics.ll
index a43d9400cde6c..69fc636107544 100644
--- a/llvm/test/CodeGen/X86/mmx-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/mmx-intrinsics.ll
@@ -32,10 +32,10 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test1:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: phaddw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: phaddw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -77,10 +77,10 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test88:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pcmpgtd %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pcmpgtd %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -122,10 +122,10 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test87:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pcmpgtw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pcmpgtw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -167,10 +167,10 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test86:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pcmpgtb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pcmpgtb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -212,10 +212,10 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test85:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pcmpeqd %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pcmpeqd %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -257,10 +257,10 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test84:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pcmpeqw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pcmpeqw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -302,10 +302,10 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test83:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pcmpeqb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pcmpeqb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -347,10 +347,10 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test82:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -392,10 +392,10 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test81:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -437,10 +437,10 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test80:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3]
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -482,10 +482,10 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test79:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1]
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1]
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -527,10 +527,10 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test78:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3]
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -572,10 +572,10 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test77:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7]
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7]
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -617,10 +617,10 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test76:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: packuswb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: packuswb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -662,10 +662,10 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test75:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: packssdw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: packssdw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -707,10 +707,10 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test74:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: packsswb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: packsswb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -1459,8 +1459,8 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test56:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pxor %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -1504,8 +1504,8 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test55:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: por %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -1549,10 +1549,10 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test54:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pandn %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pandn %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -1594,8 +1594,8 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test53:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pand %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -1639,8 +1639,8 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test52:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmullw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -1682,8 +1682,8 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test51:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmullw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -1727,8 +1727,8 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test50:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmulhw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -1772,8 +1772,8 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test49:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmaddwd %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -1817,10 +1817,10 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test48:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psubusw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psubusw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -1862,10 +1862,10 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test47:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psubusb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psubusb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -1907,10 +1907,10 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test46:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psubsw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psubsw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -1952,10 +1952,10 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test45:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psubsb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psubsb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -2032,10 +2032,10 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test43:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psubd %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psubd %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -2077,10 +2077,10 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test42:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psubw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psubw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -2122,10 +2122,10 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test41:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psubb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psubb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -2167,8 +2167,8 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test40:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: paddusw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2212,8 +2212,8 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test39:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: paddusb %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2257,8 +2257,8 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test38:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: paddsw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2302,8 +2302,8 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test37:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: paddsb %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2382,8 +2382,8 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test35:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: paddd %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2427,8 +2427,8 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test34:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: paddw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2472,8 +2472,8 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test33:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: paddb %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2517,8 +2517,8 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test32:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: psadbw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2560,8 +2560,8 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test31:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pminsw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2605,8 +2605,8 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test30:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pminub %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2650,8 +2650,8 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test29:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmaxsw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2695,8 +2695,8 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test28:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmaxub %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2740,8 +2740,8 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test27:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pavgw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2785,8 +2785,8 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test26:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pavgb %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -2884,10 +2884,10 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp {
;
; X64-LABEL: test23:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: maskmovq %mm1, %mm0
+; X64-NEXT: maskmovq %mm0, %mm1
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %n to <8 x i8>
@@ -2926,8 +2926,8 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test22:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmulhuw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -3041,8 +3041,8 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test20:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmuludq %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -3320,10 +3320,10 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test12:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psignd %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psignd %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -3365,10 +3365,10 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test11:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psignw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psignw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -3410,10 +3410,10 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test10:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: psignb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: psignb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -3455,10 +3455,10 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test9:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pshufb %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pshufb %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -3500,8 +3500,8 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test8:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
; X64-NEXT: pmulhrsw %mm0, %mm1
; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
@@ -3545,10 +3545,10 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test7:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: pmaddubsw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: pmaddubsw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <8 x i8>
@@ -3590,10 +3590,10 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test6:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: phsubsw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: phsubsw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -3635,10 +3635,10 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test5:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: phsubd %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: phsubd %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -3680,10 +3680,10 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test4:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: phsubw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: phsubw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -3725,10 +3725,10 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test3:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: phaddsw %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: phaddsw %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <4 x i16>
@@ -3770,10 +3770,10 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
;
; X64-LABEL: test2:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq %rdi, %mm0
-; X64-NEXT: movq %rsi, %mm1
-; X64-NEXT: phaddd %mm1, %mm0
-; X64-NEXT: movq %mm0, %rax
+; X64-NEXT: movq %rsi, %mm0
+; X64-NEXT: movq %rdi, %mm1
+; X64-NEXT: phaddd %mm0, %mm1
+; X64-NEXT: movq %mm1, %rax
; X64-NEXT: retq
entry:
%0 = bitcast <1 x i64> %b to <2 x i32>
@@ -3788,10 +3788,26 @@ entry:
}
define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind {
-; ALL-LABEL: test89:
-; ALL: # %bb.0:
-; ALL-NEXT: cvtpi2ps %mm0, %xmm0
-; ALL-NEXT: ret{{[l|q]}}
+; X86-LABEL: test89:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: cvtpi2ps (%esp), %xmm0
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: test89:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %mm0
+; X64-NEXT: cvtpi2ps %mm0, %xmm0
+; X64-NEXT: retq
%c = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a, x86_mmx %b)
ret <4 x float> %c
}
diff --git a/llvm/test/CodeGen/X86/pr23246.ll b/llvm/test/CodeGen/X86/pr23246.ll
index 45587b8c69cd4..cd0ece12a1916 100644
--- a/llvm/test/CodeGen/X86/pr23246.ll
+++ b/llvm/test/CodeGen/X86/pr23246.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define <2 x i64> @test(x86_mmx %a) #0 {
; CHECK-LABEL: test:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %rdi, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; CHECK-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/select-mmx.ll b/llvm/test/CodeGen/X86/select-mmx.ll
index 27b7ebb8381cd..8339cb71d4671 100644
--- a/llvm/test/CodeGen/X86/select-mmx.ll
+++ b/llvm/test/CodeGen/X86/select-mmx.ll
@@ -14,15 +14,11 @@ define i64 @test47(i64 %arg) {
;
; X64-LABEL: test47:
; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: testq %rdi, %rdi
-; X64-NEXT: je .LBB0_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: pxor %mm0, %mm0
-; X64-NEXT: jmp .LBB0_3
-; X64-NEXT: .LBB0_1:
-; X64-NEXT: movl $7, %eax
-; X64-NEXT: movd %eax, %mm0
-; X64-NEXT: .LBB0_3:
+; X64-NEXT: movl $7, %ecx
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %mm0
; X64-NEXT: psllw %mm0, %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
@@ -35,17 +31,17 @@ define i64 @test47(i64 %arg) {
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
+; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: orl 12(%ebp), %eax
-; X86-NEXT: je .LBB0_1
-; X86-NEXT: # %bb.2:
-; X86-NEXT: pxor %mm0, %mm0
-; X86-NEXT: jmp .LBB0_3
-; X86-NEXT: .LBB0_1:
; X86-NEXT: movl $7, %eax
-; X86-NEXT: movd %eax, %mm0
-; X86-NEXT: .LBB0_3:
+; X86-NEXT: je .LBB0_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB0_2:
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: psllw %mm0, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
@@ -74,13 +70,8 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
; X64-LABEL: test49:
; X64: # %bb.0:
; X64-NEXT: testq %rdi, %rdi
-; X64-NEXT: je .LBB1_1
-; X64-NEXT: # %bb.2:
-; X64-NEXT: movq %rdx, %mm0
-; X64-NEXT: jmp .LBB1_3
-; X64-NEXT: .LBB1_1:
+; X64-NEXT: cmovneq %rdx, %rsi
; X64-NEXT: movq %rsi, %mm0
-; X64-NEXT: .LBB1_3:
; X64-NEXT: psllw %mm0, %mm0
; X64-NEXT: movq %mm0, %rax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/stack-folding-3dnow.ll b/llvm/test/CodeGen/X86/stack-folding-3dnow.ll
index 1cbd61567f327..d4821c5fa3d41 100644
--- a/llvm/test/CodeGen/X86/stack-folding-3dnow.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-3dnow.ll
@@ -4,12 +4,13 @@
define x86_mmx @stack_fold_pavgusb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pavgusb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pavgusb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pavgusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -20,12 +21,13 @@ declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pf2id(x86_mmx %a) {
; CHECK-LABEL: stack_fold_pf2id:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pf2id {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %a) nounwind readnone
@@ -36,12 +38,13 @@ declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pf2iw(x86_mmx %a) {
; CHECK-LABEL: stack_fold_pf2iw:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pf2iw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %a) nounwind readnone
@@ -52,12 +55,13 @@ declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfacc(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfacc:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfacc %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -68,12 +72,13 @@ declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfadd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfadd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfadd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfadd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -84,12 +89,13 @@ declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfcmpeq(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfcmpeq:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfcmpeq %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfcmpeq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -100,12 +106,13 @@ declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfcmpge(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfcmpge:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfcmpge %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfcmpge {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -116,12 +123,13 @@ declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfcmpgt(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfcmpgt:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfcmpgt %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfcmpgt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -132,12 +140,13 @@ declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfmax(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfmax:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfmax %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfmax {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -148,12 +157,13 @@ declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfmin(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfmin:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfmin %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfmin {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -164,12 +174,13 @@ declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfmul(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfmul:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfmul %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfmul {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -180,12 +191,13 @@ declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfnacc(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfnacc:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfnacc %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -196,12 +208,13 @@ declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfpnacc(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfpnacc:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfpnacc %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfpnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -212,12 +225,13 @@ declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfrcp(x86_mmx %a) {
; CHECK-LABEL: stack_fold_pfrcp:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pfrcp {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %a) nounwind readnone
@@ -228,12 +242,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfrcpit1(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfrcpit1:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfrcpit1 %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfrcpit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -244,12 +259,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfrcpit2(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfrcpit2:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfrcpit2 %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfrcpit2 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -260,12 +276,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfrsqit1(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfrsqit1:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfrsqit1 %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfrsqit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -276,12 +293,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfrsqrt(x86_mmx %a) {
; CHECK-LABEL: stack_fold_pfrsqrt:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pfrsqrt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %a) nounwind readnone
@@ -292,12 +310,13 @@ declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfsub(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfsub:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfsub %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfsub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -308,12 +327,13 @@ declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pfsubr(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pfsubr:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pfsubr %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pfsubr {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -324,12 +344,13 @@ declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pi2fd(x86_mmx %a) {
; CHECK-LABEL: stack_fold_pi2fd:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pi2fd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %a) nounwind readnone
@@ -340,12 +361,13 @@ declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pi2fw(x86_mmx %a) {
; CHECK-LABEL: stack_fold_pi2fw:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pi2fw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %a) nounwind readnone
@@ -356,12 +378,13 @@ declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmulhrw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmulhrw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmulhrw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmulhrw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -372,13 +395,14 @@ declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pswapd(x86_mmx %a) {
; CHECK-LABEL: stack_fold_pswapd:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pswapd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; CHECK-NEXT: # mm0 = mem[1,0]
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %a) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/stack-folding-mmx.ll b/llvm/test/CodeGen/X86/stack-folding-mmx.ll
index 11ca9e2a547ee..6652a8ca0dbd5 100644
--- a/llvm/test/CodeGen/X86/stack-folding-mmx.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-mmx.ll
@@ -9,7 +9,7 @@ define x86_mmx @stack_fold_cvtpd2pi(<2 x double> %a0) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: cvtpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0) nounwind readnone
@@ -20,6 +20,7 @@ declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
define <2 x double> @stack_fold_cvtpi2pd(x86_mmx %a0) {
; CHECK-LABEL: stack_fold_cvtpi2pd:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
@@ -35,6 +36,7 @@ declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
define <4 x float> @stack_fold_cvtpi2ps(<4 x float> %a0, x86_mmx %a1) {
; CHECK-LABEL: stack_fold_cvtpi2ps:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
@@ -55,7 +57,7 @@ define x86_mmx @stack_fold_cvtps2pi(<4 x float> %a0) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: cvtps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0) nounwind readnone
@@ -71,7 +73,7 @@ define x86_mmx @stack_fold_cvttpd2pi(<2 x double> %a0) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: cvttpd2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0) nounwind readnone
@@ -87,7 +89,7 @@ define x86_mmx @stack_fold_cvttps2pi(<4 x float> %a0) {
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: cvttps2pi {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 16-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0) nounwind readnone
@@ -107,6 +109,7 @@ define i32 @stack_fold_movd_store(x86_mmx %a0) nounwind {
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: paddb %mm0, %mm0
; CHECK-NEXT: movd %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: #APP
@@ -139,6 +142,7 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: paddb %mm0, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
@@ -161,12 +165,13 @@ define i64 @stack_fold_movq_store(x86_mmx %a0) nounwind {
define x86_mmx @stack_fold_pabsb(x86_mmx %a0) {
; CHECK-LABEL: stack_fold_pabsb:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pabsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %a0) nounwind readnone
@@ -177,12 +182,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pabsd(x86_mmx %a0) {
; CHECK-LABEL: stack_fold_pabsd:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pabsd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %a0) nounwind readnone
@@ -193,12 +199,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pabsw(x86_mmx %a0) {
; CHECK-LABEL: stack_fold_pabsw:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pabsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %a0) nounwind readnone
@@ -209,12 +216,13 @@ declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
define x86_mmx @stack_fold_packssdw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_packssdw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: packssdw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -225,12 +233,13 @@ declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_packsswb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_packsswb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: packsswb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: packsswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -241,12 +250,13 @@ declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_packuswb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_packuswb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: packuswb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -257,12 +267,13 @@ declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_paddb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_paddb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: paddb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: paddb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -273,12 +284,13 @@ declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_paddd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_paddd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: paddd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -289,12 +301,13 @@ declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_paddq(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_paddq:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: paddq %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: paddq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -305,12 +318,13 @@ declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_paddsb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_paddsb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: paddsb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: paddsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -321,12 +335,13 @@ declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_paddsw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_paddsw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: paddsw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: paddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -337,12 +352,13 @@ declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_paddusb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_paddusb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: paddusb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: paddusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -353,12 +369,13 @@ declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_paddusw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_paddusw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: paddusw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: paddusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -369,12 +386,13 @@ declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_paddw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_paddw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: paddw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: paddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -385,12 +403,13 @@ declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_palignr(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_palignr:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: palignr $1, %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %a, x86_mmx %b, i8 1) nounwind readnone
@@ -401,12 +420,13 @@ declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
define x86_mmx @stack_fold_pand(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pand:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pand %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pand {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -417,12 +437,13 @@ declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pandn(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pandn:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pandn %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -433,12 +454,13 @@ declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pavgb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pavgb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pavgb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pavgb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -449,12 +471,13 @@ declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pavgw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pavgw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pavgw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pavgw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -465,12 +488,13 @@ declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pcmpeqb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pcmpeqb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pcmpeqb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -481,12 +505,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pcmpeqd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pcmpeqd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pcmpeqd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -497,12 +522,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pcmpeqw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pcmpeqw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pcmpeqw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -513,12 +539,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pcmpgtb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pcmpgtb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pcmpgtb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -529,12 +556,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pcmpgtd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pcmpgtd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pcmpgtd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -545,12 +573,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pcmpgtw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pcmpgtw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pcmpgtw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -561,12 +590,13 @@ declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_phaddd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_phaddd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: phaddd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: phaddd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -577,12 +607,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_phaddsw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_phaddsw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: phaddsw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -593,12 +624,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_phaddw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_phaddw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: phaddw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: phaddw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -609,12 +641,13 @@ declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_phsubd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_phsubd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: phsubd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: phsubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -625,12 +658,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_phsubsw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_phsubsw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: phsubsw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -641,12 +675,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_phsubw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_phsubw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: phsubw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: phsubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -659,12 +694,13 @@ declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmaddubsw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmaddubsw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmaddubsw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -675,12 +711,13 @@ declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmaddwd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmaddwd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmaddwd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -691,12 +728,13 @@ declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmaxsw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmaxsw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -707,12 +745,13 @@ declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmaxub(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmaxub:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmaxub %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -723,12 +762,13 @@ declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pminsw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pminsw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pminsw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pminsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -739,12 +779,13 @@ declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pminub(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pminub:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pminub %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pminub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -755,12 +796,13 @@ declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmulhrsw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmulhrsw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmulhrsw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -771,12 +813,13 @@ declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmulhuw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmulhuw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmulhuw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -787,12 +830,13 @@ declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmulhw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmulhw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmulhw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -803,12 +847,13 @@ declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmullw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmullw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmullw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmullw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -819,12 +864,13 @@ declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pmuludq(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pmuludq:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pmuludq %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -835,12 +881,13 @@ declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_por(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_por:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: por %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -851,12 +898,13 @@ declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psadbw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psadbw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psadbw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psadbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -867,14 +915,13 @@ declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pshufb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pshufb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pshufb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Reload
-; CHECK-NEXT: pshufb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -885,13 +932,14 @@ declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pshufw(x86_mmx %a) {
; CHECK-LABEL: stack_fold_pshufw:
; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %mm0
; CHECK-NEXT: movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: pshufw $1, {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
; CHECK-NEXT: # mm0 = mem[1,0,0,0]
-; CHECK-NEXT: movq2dq %mm0, %xmm0
+; CHECK-NEXT: movq %mm0, %rax
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %a, i8 1) nounwind readnone
@@ -902,12 +950,13 @@ declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
define x86_mmx @stack_fold_psignb(x86_mmx %a0, x86_mmx %a1) {
; CHECK-LABEL: stack_fold_psignb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psignb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psignb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %a0, x86_mmx %a1) nounwind readnone
@@ -918,12 +967,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psignd(x86_mmx %a0, x86_mmx %a1) {
; CHECK-LABEL: stack_fold_psignd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psignd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psignd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %a0, x86_mmx %a1) nounwind readnone
@@ -934,12 +984,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psignw(x86_mmx %a0, x86_mmx %a1) {
; CHECK-LABEL: stack_fold_psignw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psignw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psignw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %a0, x86_mmx %a1) nounwind readnone
@@ -950,12 +1001,13 @@ declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pslld(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pslld:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pslld %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pslld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -966,12 +1018,13 @@ declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psllq(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psllq:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psllq %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psllq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -982,12 +1035,13 @@ declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psllw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psllw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psllw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psllw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -998,12 +1052,13 @@ declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psrad(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psrad:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psrad %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psrad {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1014,12 +1069,13 @@ declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psraw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psraw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psraw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psraw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1030,12 +1086,13 @@ declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psrld(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psrld:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psrld %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psrld {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1046,12 +1103,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psrlq(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psrlq:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psrlq %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psrlq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1062,12 +1120,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psrlw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psrlw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psrlw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psrlw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1078,12 +1137,13 @@ declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psubb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psubb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psubb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psubb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1094,12 +1154,13 @@ declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psubd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psubd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psubd %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psubd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1110,12 +1171,13 @@ declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psubq(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psubq:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psubq %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psubq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1126,12 +1188,13 @@ declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psubsb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psubsb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psubsb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psubsb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1142,12 +1205,13 @@ declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psubsw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psubsw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psubsw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psubsw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1158,12 +1222,13 @@ declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psubusb(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psubusb:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psubusb %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psubusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1174,12 +1239,13 @@ declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psubusw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psubusw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psubusw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psubusw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1190,12 +1256,13 @@ declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_psubw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_psubw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: psubw %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: psubw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1206,13 +1273,13 @@ declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_punpckhbw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_punpckhbw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: punpckhbw %mm0, %mm1 # mm1 = mm1[4],mm0[4],mm1[5],mm0[5],mm1[6],mm0[6],mm1[7],mm0[7]
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7]
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1223,13 +1290,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_punpckhdq(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_punpckhdq:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: punpckhdq %mm0, %mm1 # mm1 = mm1[1],mm0[1]
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: # mm0 = mm0[1],mem[1]
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1240,13 +1307,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_punpckhwd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_punpckhwd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: punpckhwd %mm0, %mm1 # mm1 = mm1[2],mm0[2],mm1[3],mm0[3]
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: # mm0 = mm0[2],mem[2],mm0[3],mem[3]
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1257,13 +1324,13 @@ declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_punpcklbw(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_punpcklbw:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3]
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1274,13 +1341,13 @@ declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_punpckldq(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_punpckldq:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: # mm0 = mm0[0],mem[0]
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1291,13 +1358,13 @@ declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_punpcklwd(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_punpcklwd:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: # mm0 = mm0[0],mem[0],mm0[1],mem[1]
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a, x86_mmx %b) nounwind readnone
@@ -1308,12 +1375,13 @@ declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
define x86_mmx @stack_fold_pxor(x86_mmx %a, x86_mmx %b) {
; CHECK-LABEL: stack_fold_pxor:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rsi, %mm0
+; CHECK-NEXT: movq %rdi, %mm1
+; CHECK-NEXT: pxor %mm0, %mm1
+; CHECK-NEXT: movq %mm1, %rax
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: pxor {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT: movq2dq %mm0, %xmm0
; CHECK-NEXT: retq
%1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
%2 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a, x86_mmx %b) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
index 672b4591316ce..6fd90243a9303 100644
--- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
@@ -96,12 +96,13 @@ entry:
define i32 @test3(x86_mmx %a) nounwind {
; X86-LABEL: test3:
; X86: # %bb.0:
-; X86-NEXT: movd %mm0, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-LABEL: test3:
; X64: # %bb.0:
-; X64-NEXT: movd %mm0, %eax
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
; X64-NEXT: retq
%tmp0 = bitcast x86_mmx %a to <2 x i32>
%tmp1 = extractelement <2 x i32> %tmp0, i32 0
@@ -112,14 +113,12 @@ define i32 @test3(x86_mmx %a) nounwind {
define i32 @test4(x86_mmx %a) nounwind {
; X86-LABEL: test4:
; X86: # %bb.0:
-; X86-NEXT: movq2dq %mm0, %xmm0
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-LABEL: test4:
; X64: # %bb.0:
-; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll
index cea047453de43..aed8782ba40aa 100644
--- a/llvm/test/CodeGen/X86/vec_insert-7.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -9,13 +9,12 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X86-LABEL: mmx_movzl:
; X86: ## %bb.0:
; X86-NEXT: movl $32, %eax
-; X86-NEXT: movd %eax, %mm0
+; X86-NEXT: xorl %edx, %edx
; X86-NEXT: retl
;
; X64-LABEL: mmx_movzl:
; X64: ## %bb.0:
; X64-NEXT: movl $32, %eax
-; X64-NEXT: movq %rax, %xmm0
; X64-NEXT: retq
%tmp = bitcast x86_mmx %x to <2 x i32>
%tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0
diff --git a/llvm/test/CodeGen/X86/vec_insert-mmx.ll b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
index f561a2a20e194..c00417080fe36 100644
--- a/llvm/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-mmx.ll
@@ -6,15 +6,15 @@
define x86_mmx @t0(i32 %A) nounwind {
; X86-LABEL: t0:
; X86: ## %bb.0:
-; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
-; X86-NEXT: pxor %mm0, %mm0
-; X86-NEXT: punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0]
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: retl
;
; X64-LABEL: t0:
; X64: ## %bb.0:
; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: retq
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1
%tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index 39b2b6225d8b1..dce1aa2bcd1d4 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -17,16 +17,16 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -57,16 +57,16 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -97,16 +97,16 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -137,16 +137,16 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpgt.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -177,16 +177,16 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -217,16 +217,16 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -257,16 +257,16 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pcmpeq.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -297,16 +297,16 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckldq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -337,16 +337,16 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -377,16 +377,16 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpcklbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -417,16 +417,16 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhdq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -457,16 +457,16 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhwd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -497,16 +497,16 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.punpckhbw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -537,23 +537,22 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16>
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packuswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -586,23 +585,22 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <2 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <2 x i32>
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <2 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP8]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = sext <2 x i1> [[TMP12]] to <2 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to x86_mmx
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packssdw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <4 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <4 x i16>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -635,23 +633,22 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP17]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP23]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP7]] to <4 x i16>
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <4 x i16> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP23]] to <4 x i16>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to x86_mmx
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[TMP14]], x86_mmx [[TMP15]])
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[_MSPROP_VECTOR_PACK]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP2]] to <8 x i8>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <8 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP_VECTOR_PACK:%.*]] = call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[TMP14]], <1 x i64> [[TMP15]])
+; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.mmx.packsswb(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[_MSPROP_VECTOR_PACK]] to <8 x i8>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
@@ -681,17 +678,15 @@ define i64 @test73(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -717,17 +712,15 @@ define i64 @test72(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -751,17 +744,15 @@ define i64 @test72_2(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrai.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -787,13 +778,13 @@ define i64 @test71(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[TMP6]], i32 3)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[TMP2]], i32 3)
+; CHECK-NEXT: [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP4]]
;
@@ -815,17 +806,15 @@ define i64 @test70(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -849,17 +838,15 @@ define i64 @test70_2(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.d(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -885,17 +872,15 @@ define i64 @test69(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -921,13 +906,13 @@ define i64 @test68(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[TMP6]], i32 3)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[TMP2]], i32 3)
+; CHECK-NEXT: [[TMP6:%.*]] = or <1 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP4]]
;
@@ -949,17 +934,15 @@ define i64 @test67(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.d(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -985,17 +968,15 @@ define i64 @test66(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 3)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 3) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 3)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 3) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -1019,17 +1000,15 @@ define i64 @test66_2(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64 [[TMP9]] to x86_mmx
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[TMP10]], i32 0)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx [[MMX_VAR_I]], i32 0) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast x86_mmx [[TMP3]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[TMP1]], i32 0)
+; CHECK-NEXT: [[TMP11:%.*]] = or <1 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pslli.w(<1 x i64> [[MMX_VAR_I]], i32 0) #[[ATTR2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
@@ -1056,20 +1035,21 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1100,20 +1080,21 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psra.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1144,18 +1125,21 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64
; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP5]]
;
@@ -1180,20 +1164,21 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1224,20 +1209,21 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psrl.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1268,18 +1254,21 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP8]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP13]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[_MSPROP]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[TMP6]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP10]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[TMP3]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.q(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP12]] to i64
; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP5]]
;
@@ -1304,20 +1293,21 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1348,20 +1338,21 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP10]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSPROP]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP15]], 0
; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[TMP12]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[TMP8]], x86_mmx [[MMX_VAR1_I]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP3]], [[TMP14]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i64 [[TMP15]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64 [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[TMP4]], <1 x i64> [[MMX_VAR1_I]])
+; CHECK-NEXT: [[TMP12:%.*]] = or <1 x i64> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP21]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP18]] to <1 x i64>
; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -1394,16 +1385,16 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pxor(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pxor(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1434,16 +1425,16 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.por(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.por(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1474,16 +1465,16 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pandn(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pandn(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1514,16 +1505,16 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pand(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1554,16 +1545,16 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1592,16 +1583,16 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmull.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1632,16 +1623,16 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulh.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1672,22 +1663,22 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[TMP8]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer
; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP12]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32>
; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP18]]
;
@@ -1716,16 +1707,16 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1756,16 +1747,16 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1796,16 +1787,16 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1836,16 +1827,16 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psubs.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1872,13 +1863,16 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]])
+; CHECK-NEXT: [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
; CHECK-NEXT: store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP3]]
;
@@ -1907,16 +1901,16 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1947,16 +1941,16 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -1987,16 +1981,16 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psub.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2027,16 +2021,16 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2067,16 +2061,16 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.paddus.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2107,16 +2101,16 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2147,16 +2141,16 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padds.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2185,13 +2179,16 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP5]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]])
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or <1 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.q(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]])
+; CHECK-NEXT: [[_MSPROP2:%.*]] = bitcast <1 x i64> [[_MSPROP3]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
; CHECK-NEXT: store i64 [[_MSPROP2]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP3]]
;
@@ -2218,16 +2215,16 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2258,16 +2255,16 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2298,16 +2295,16 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2338,17 +2335,20 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP16:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP16]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 48
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
-; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64 [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.psad.bw(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <1 x i64> [[TMP17]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP14]] to i64
+; CHECK-NEXT: store i64 [[TMP15]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP3]]
;
entry:
@@ -2374,16 +2374,16 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmins.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2414,16 +2414,16 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pminu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2454,16 +2454,16 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxs.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2494,16 +2494,16 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmaxu.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2534,16 +2534,16 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2574,16 +2574,16 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pavg.b(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2612,16 +2612,19 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: store <1 x i64> [[TMP3]], ptr [[TMP6]], align 1
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[_MSPROP]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
-; CHECK: 3:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: 7:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], x86_mmx [[MMX_VAR_I]]) #[[ATTR2]]
+; CHECK: 8:
+; CHECK-NEXT: tail call void @llvm.x86.mmx.movnt.dq(ptr [[P]], <1 x i64> [[MMX_VAR_I]]) #[[ATTR2]]
; CHECK-NEXT: ret void
;
entry:
@@ -2641,15 +2644,16 @@ define i32 @test24(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP6]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK: 4:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP7:%.*]], !prof [[PROF0]]
+; CHECK: 5:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[MMX_VAR_I]]) #[[ATTR2]]
+; CHECK: 6:
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[MMX_VAR_I]]) #[[ATTR2]]
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[TMP1]]
;
@@ -2674,21 +2678,23 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[N]] to <8 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[D]] to <8 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP5]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP3]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP9]] to i64
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP8]], 0
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
-; CHECK: 9:
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK: 11:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: tail call void @llvm.x86.mmx.maskmovq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]]
+; CHECK: 12:
+; CHECK-NEXT: tail call void @llvm.x86.mmx.maskmovq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]], ptr [[P]]) #[[ATTR2]]
; CHECK-NEXT: ret void
;
entry:
@@ -2713,16 +2719,16 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP14]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP15:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulhu.w(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP15]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <1 x i64> [[TMP12]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -2750,16 +2756,17 @@ define i64 @test21(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK: 5:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK: 6:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK: 7:
+; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
@@ -2783,16 +2790,17 @@ define i32 @test21_2(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK: 5:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF0]]
+; CHECK: 6:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 3) #[[ATTR5]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK: 7:
+; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
@@ -2821,13 +2829,14 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to i64
-; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to i64
-; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx [[MMX_VAR_I]], x86_mmx [[MMX_VAR1_I]]) #[[ATTR2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP9]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmulu.dq(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]]
+; CHECK-NEXT: [[_MSPROP:%.*]] = bitcast <1 x i64> [[_MSPROP1]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP3]]
;
@@ -2851,15 +2860,16 @@ define <2 x double> @test19(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP4]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP7]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP7]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
-; CHECK: 5:
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK: 6:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx [[TMP1]]) #[[ATTR5]]
+; CHECK: 7:
+; CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> [[TMP8]]) #[[ATTR5]]
; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <2 x double> [[TMP2]]
;
@@ -2885,8 +2895,8 @@ define i64 @test18(<2 x double> %a) #0 {
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
; CHECK: 3:
-; CHECK-NEXT: [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> [[A]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
@@ -2915,8 +2925,8 @@ define i64 @test17(<2 x double> %a) #0 {
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
; CHECK: 3:
-; CHECK-NEXT: [[TMP0:%.*]] = tail call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[TMP0]] to <2 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = tail call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> [[A]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
@@ -2941,20 +2951,24 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
-; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to x86_mmx
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[_MSPROP]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR:%.*]] = bitcast i64 [[TMP0]] to <1 x i64>
; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[B]], i32 0
-; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[_MSPROP1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[_MSPROP1]] to <1 x i64>
+; CHECK-NEXT: [[MMX_VAR1:%.*]] = bitcast i64 [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP12]], 0
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK: 4:
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK: 8:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx [[MMX_VAR]], x86_mmx [[MMX_VAR1]], i8 16)
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to i64
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = tail call <1 x i64> @llvm.x86.mmx.palignr.b(<1 x i64> [[MMX_VAR]], <1 x i64> [[MMX_VAR1]], i8 16)
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i64 [[TMP3]]
;
@@ -2978,13 +2992,13 @@ define i64 @test15(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <2 x i32>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <2 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <2 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.d(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <2 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP6]] to <1 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3010,13 +3024,13 @@ define i64 @test14(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <4 x i16>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <4 x i16>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.w(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <4 x i16>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <4 x i16>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP6]] to <1 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3042,13 +3056,13 @@ define i64 @test13(<1 x i64> %a) #0 {
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx [[TMP1]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP11]] to <8 x i8>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast x86_mmx [[TMP2]] to <8 x i8>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pabs.b(<1 x i64> [[TMP1]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP11]] to <8 x i8>
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP12]] to <8 x i8>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <1 x i64> [[TMP9]], i32 0
; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
@@ -3077,16 +3091,16 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3117,16 +3131,16 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3157,16 +3171,16 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.psign.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3197,16 +3211,16 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <8 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3237,16 +3251,16 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmul.hr.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3277,18 +3291,18 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
; CHECK-NEXT: [[TMP18:%.*]] = bitcast <1 x i64> [[TMP15]] to <8 x i8>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP21]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[TMP10]] to <4 x i16>
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i8> [[TMP18]] to <1 x i64>
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64>
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer
; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP14]] to <8 x i8>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <8 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64>
; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -3321,16 +3335,16 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3361,16 +3375,16 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3401,16 +3415,16 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phsub.w(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3441,16 +3455,16 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <4 x i16>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <4 x i16>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <4 x i16>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.sw(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <4 x i16>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <4 x i16>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3481,16 +3495,16 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <2 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP10]] to <2 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to x86_mmx
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to i64
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to x86_mmx
-; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP16]], [[TMP8]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx [[TMP2]], x86_mmx [[TMP3]]) #[[ATTR5]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64 [[_MSPROP]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast x86_mmx [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP13]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP12]] to <1 x i64>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP0]] to <1 x i64>
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.phadd.d(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <2 x i32>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <2 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP5]] to <1 x i64>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP19]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0
; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8
@@ -3514,20 +3528,21 @@ define <4 x float> @test89(<4 x float> %a, x86_mmx %b) nounwind #0 {
; ALL-NEXT: cvtpi2ps %mm0, %xmm0
; ALL-NEXT: ret{{[l|q]}}
; CHECK-LABEL: define <4 x float> @test89(
-; CHECK-SAME: <4 x float> [[A:%.*]], x86_mmx [[B:%.*]]) #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: <4 x float> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR4:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]]
-; CHECK: 4:
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK: 5:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], x86_mmx [[B]])
+; CHECK: 6:
+; CHECK-NEXT: [[C:%.*]] = tail call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> [[A]], <1 x i64> [[B]])
; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <4 x float> [[C]]
;
@@ -3561,8 +3576,8 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[A_COERCE]] to <1 x i64>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP6]], 0
; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
@@ -3571,8 +3586,8 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
; CHECK: 5:
-; CHECK-NEXT: [[TMP1:%.*]] = tail call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx [[TMP0]], i32 [[D]], i32 2)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast x86_mmx [[TMP1]] to <1 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pinsr.w(<1 x i64> [[TMP8]], i32 [[D]], i32 2)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP9]] to <1 x i64>
; CHECK-NEXT: store <1 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <1 x i64> [[TMP2]]
;
@@ -3591,15 +3606,15 @@ define i32 @test_mm_extract_pi16(<1 x i64> %a.coerce) nounwind #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[A_COERCE]] to <1 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A_COERCE]] to x86_mmx
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
; CHECK: 3:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
; CHECK: 4:
-; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(x86_mmx [[TMP0]], i32 2)
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.x86.mmx.pextr.w(<1 x i64> [[TMP6]], i32 2)
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret i32 [[TMP1]]
;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
index 57d6003b3873f..5197f3277ed80 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll
@@ -31,12 +31,12 @@ entry:
}
; CHECK-LABEL: @Test_ssse3_pmadd_ub_sw(
-; CHECK: or i64
-; CHECK: bitcast i64 {{.*}} to <4 x i16>
+; CHECK: or <1 x i64>
+; CHECK: bitcast <1 x i64> {{.*}} to <4 x i16>
; CHECK: icmp ne <4 x i16> {{.*}}, zeroinitializer
; CHECK: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK: bitcast <4 x i16> {{.*}} to i64
-; CHECK: ret x86_mmx
+; CHECK: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK: ret <1 x i64>
define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_memory {
@@ -60,8 +60,8 @@ entry:
}
; CHECK-LABEL: @Test_x86_mmx_psad_bw(
-; CHECK: or i64
+; CHECK: or <1 x i64>
; CHECK: icmp ne i64
; CHECK: sext i1 {{.*}} to i64
; CHECK: lshr i64 {{.*}}, 48
-; CHECK: ret x86_mmx
+; CHECK: ret <1 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
index 52acbfe0a0e77..6ae03f288e2c0 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cvt.ll
@@ -42,9 +42,9 @@ entry:
; CHECK: icmp ne {{.*}}[[S]], 0
; CHECK: br
; CHECK: call void @__msan_warning_noreturn()
-; CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
-; CHECK: store i64 0, {{.*}} @__msan_retval_tls
-; CHECK: ret x86_mmx
+; CHECK: call <1 x i64> @llvm.x86.sse.cvtps2pi
+; CHECK: store <1 x i64> zeroinitializer, {{.*}} @__msan_retval_tls
+; CHECK: ret <1 x i64>
; avx512 rounding conversion.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
index 4f08ea7c00afe..1289abd63667e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll
@@ -48,15 +48,14 @@ entry:
}
; CHECK-LABEL: @Test_mmx_packuswb(
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast i64 {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
+; CHECK-DAG: bitcast <1 x i64> {{.*}} to <4 x i16>
; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
; CHECK-DAG: icmp ne <4 x i16> {{.*}}, zeroinitializer
; CHECK-DAG: sext <4 x i1> {{.*}} to <4 x i16>
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: bitcast <4 x i16> {{.*}} to x86_mmx
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packsswb({{.*}}
-; CHECK-DAG: bitcast x86_mmx {{.*}} to i64
-; CHECK-DAG: call x86_mmx @llvm.x86.mmx.packuswb({{.*}}
-; CHECK: ret x86_mmx
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: bitcast <4 x i16> {{.*}} to <1 x i64>
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packsswb({{.*}}
+; CHECK-DAG: call <1 x i64> @llvm.x86.mmx.packuswb({{.*}}
+; CHECK: ret <1 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
index 196285d910a6d..3c6c44194e3ac 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll
@@ -30,11 +30,11 @@ entry:
; CHECK-LABEL: @test_mmx
; CHECK: = icmp ne i64 {{.*}}, 0
-; CHECK: [[C:%.*]] = sext i1 {{.*}} to i64
-; CHECK: [[A:%.*]] = call x86_mmx @llvm.x86.mmx.psll.d(
-; CHECK: [[B:%.*]] = bitcast x86_mmx {{.*}}[[A]] to i64
-; CHECK: = or i64 {{.*}}[[B]], {{.*}}[[C]]
-; CHECK: call x86_mmx @llvm.x86.mmx.psll.d(
+; CHECK: [[B:%.*]] = sext i1 {{.*}} to i64
+; CHECK: [[C:%.*]] = bitcast i64 [[B]] to <1 x i64>
+; CHECK: [[A:%.*]] = call <1 x i64> @llvm.x86.mmx.psll.d(
+; CHECK: = or <1 x i64> {{.*}}[[A]], {{.*}}[[C]]
+; CHECK: call <1 x i64> @llvm.x86.mmx.psll.d(
; CHECK: ret i64
diff --git a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
index 63114288fc581..9fbc39241d8e9 100644
--- a/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
+++ b/llvm/test/Transforms/InstCombine/X86/x86-movmsk.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i32 @test_upper_x86_mmx_pmovmskb(x86_mmx %a0) {
; CHECK-LABEL: @test_upper_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx [[A0:%.*]])
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> [[A0:%.*]])
; CHECK-NEXT: ret i32 [[TMP1]]
;
%1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
@@ -207,16 +207,6 @@ define i32 @undef_x86_avx2_pmovmskb() {
; Constant Folding (ZERO -> ZERO)
;
-define i32 @zero_x86_mmx_pmovmskb() {
-; CHECK-LABEL: @zero_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<1 x i64> zeroinitializer to x86_mmx))
-; CHECK-NEXT: ret i32 [[TMP1]]
-;
- %1 = bitcast <1 x i64> zeroinitializer to x86_mmx
- %2 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %1)
- ret i32 %2
-}
-
define i32 @zero_x86_sse_movmsk_ps() {
; CHECK-LABEL: @zero_x86_sse_movmsk_ps(
; CHECK-NEXT: ret i32 0
@@ -271,7 +261,7 @@ define i32 @zero_x86_avx2_pmovmskb() {
define i32 @fold_x86_mmx_pmovmskb() {
; CHECK-LABEL: @fold_x86_mmx_pmovmskb(
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx bitcast (<8 x i8> <i8 0, i8 -1, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 0> to x86_mmx))
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.x86.mmx.pmovmskb(<1 x i64> <i64 18084223940296448>)
; CHECK-NEXT: ret i32 [[TMP1]]
;
%1 = bitcast <8 x i8> <i8 0, i8 255, i8 -1, i8 127, i8 -127, i8 63, i8 64, i8 256> to x86_mmx
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
index 38a7391a1a1e3..d4ec9e3aae679 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon-inseltpoison.ll
@@ -38,38 +38,6 @@ define <1 x i64> @d(i64 %y) {
ret <1 x i64> %c
}
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT: [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: ret x86_mmx [[C]]
-;
- %c = bitcast <1 x i64> %y to x86_mmx
- ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT: [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT: ret <1 x i64> [[C]]
-;
- %c = bitcast x86_mmx %y to <1 x i64>
- ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT: ret double [[TMP0]]
-;
-entry:
- %0 = bitcast x86_mmx %x to <1 x i64>
- %1 = bitcast <1 x i64> %0 to double
- ret double %1
-}
-
; FP source is ok.
define <3 x i64> @bitcast_inselt_undef(double %x, i32 %idx) {
@@ -137,19 +105,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
ret <3 x i64> %i
}
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT: [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT: [[I:%.*]] = insertelement <3 x i64> poison, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT: ret <3 x i64> [[I]]
-;
- %xb = bitcast x86_mmx %x to i64
- %i = insertelement <3 x i64> poison, i64 %xb, i32 %idx
- ret <3 x i64> %i
-}
-
; Reduce number of casts
define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
index 8b8325b147263..f787b3c4cc9ac 100644
--- a/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
+++ b/llvm/test/Transforms/InstCombine/bitcast-vec-canon.ll
@@ -38,37 +38,6 @@ define <1 x i64> @d(i64 %y) {
ret <1 x i64> %c
}
-define x86_mmx @e(<1 x i64> %y) {
-; CHECK-LABEL: @e(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x i64> [[Y:%.*]], i64 0
-; CHECK-NEXT: [[C:%.*]] = bitcast i64 [[TMP1]] to x86_mmx
-; CHECK-NEXT: ret x86_mmx [[C]]
-;
- %c = bitcast <1 x i64> %y to x86_mmx
- ret x86_mmx %c
-}
-
-define <1 x i64> @f(x86_mmx %y) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast x86_mmx [[Y:%.*]] to i64
-; CHECK-NEXT: [[C:%.*]] = insertelement <1 x i64> poison, i64 [[TMP1]], i64 0
-; CHECK-NEXT: ret <1 x i64> [[C]]
-;
- %c = bitcast x86_mmx %y to <1 x i64>
- ret <1 x i64> %c
-}
-
-define double @g(x86_mmx %x) {
-; CHECK-LABEL: @g(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast x86_mmx [[X:%.*]] to double
-; CHECK-NEXT: ret double [[TMP0]]
-;
-entry:
- %0 = bitcast x86_mmx %x to <1 x i64>
- %1 = bitcast <1 x i64> %0 to double
- ret double %1
-}
; FP source is ok.
@@ -137,19 +106,6 @@ define <3 x i64> @bitcast_inselt_undef_vec_src(<2 x i32> %x, i32 %idx) {
ret <3 x i64> %i
}
-; Negative test - source type must be scalar
-
-define <3 x i64> @bitcast_inselt_undef_from_mmx(x86_mmx %x, i32 %idx) {
-; CHECK-LABEL: @bitcast_inselt_undef_from_mmx(
-; CHECK-NEXT: [[XB:%.*]] = bitcast x86_mmx [[X:%.*]] to i64
-; CHECK-NEXT: [[I:%.*]] = insertelement <3 x i64> undef, i64 [[XB]], i32 [[IDX:%.*]]
-; CHECK-NEXT: ret <3 x i64> [[I]]
-;
- %xb = bitcast x86_mmx %x to i64
- %i = insertelement <3 x i64> undef, i64 %xb, i32 %idx
- ret <3 x i64> %i
-}
-
; Reduce number of casts
define <2 x i64> @PR45748(double %x, double %y) {
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
index d4c49faf91b09..dd75560e25ced 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
@@ -335,19 +335,6 @@ define { i64, i64 } @test_load_struct() {
ret { i64, i64 } %v
}
-@m64 = internal constant [2 x i64] zeroinitializer
-@idx = external global i32
-
-; This should not try to create an x86_mmx null value.
-define x86_mmx @load_mmx() {
-; CHECK-LABEL: @load_mmx(
-; CHECK-NEXT: [[TEMP:%.*]] = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64)), align 8
-; CHECK-NEXT: ret x86_mmx [[TEMP]]
-;
- %temp = load x86_mmx, ptr getelementptr ([2 x i64], ptr @m64, i64 0, i64 ptrtoint (ptr @idx to i64))
- ret x86_mmx %temp
-}
-
@g_offset = external global i64
@g_neg_one_vec = constant <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>
diff --git a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll b/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
deleted file mode 100644
index b460b79d0640a..0000000000000
--- a/llvm/test/Transforms/LoopUnroll/X86/mmx.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: opt < %s -S -passes=loop-unroll | FileCheck %s
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define x86_mmx @f() #0 {
-; CHECK-LABEL: define x86_mmx @f
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[PHI]], 7
-; CHECK-NEXT: [[ADD_7]] = add i32 [[PHI]], 8
-; CHECK-NEXT: [[CMP_7:%.*]] = icmp eq i32 [[ADD_6]], 0
-; CHECK-NEXT: br i1 [[CMP_7]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: [[RET:%.*]] = phi x86_mmx [ undef, [[FOR_BODY]] ]
-; CHECK-NEXT: ret x86_mmx [[RET]]
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %for.body, %entry
- %phi = phi i32 [ 1, %entry ], [ %add, %for.body ]
- %add = add i32 %phi, 1
- %cmp = icmp eq i32 %phi, 0
- br i1 %cmp, label %exit, label %for.body
-
-exit: ; preds = %for.body
- %ret = phi x86_mmx [ undef, %for.body ]
- ret x86_mmx %ret
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
index 7476ddb0fb873..19ca68fc9cb2c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
@@ -4,68 +4,6 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-define void @test1(x86_mmx %a, x86_mmx %b, ptr %ptr) {
-; Ensure we can handle x86_mmx values which are primitive and can be bitcast
-; with integer types but can't be put into a vector.
-;
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64
-; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64
-; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42
-; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i32 1
-; CHECK-NEXT: store i64 [[A_AND]], ptr [[PTR]], align 8
-; CHECK-NEXT: store i64 [[B_AND]], ptr [[GEP]], align 8
-; CHECK-NEXT: ret void
-;
-entry:
- %a.cast = bitcast x86_mmx %a to i64
- %b.cast = bitcast x86_mmx %b to i64
- %a.and = and i64 %a.cast, 42
- %b.and = and i64 %b.cast, 42
- %gep = getelementptr i64, ptr %ptr, i32 1
- store i64 %a.and, ptr %ptr
- store i64 %b.and, ptr %gep
- ret void
-}
-
-define void @test2(x86_mmx %a, x86_mmx %b) {
-; Same as @test1 but using phi-input vectorization instead of store
-; vectorization.
-;
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[EXIT:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64
-; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64
-; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42
-; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[A_PHI:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[A_AND]], [[IF_THEN]] ]
-; CHECK-NEXT: [[B_PHI:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[B_AND]], [[IF_THEN]] ]
-; CHECK-NEXT: tail call void @f(i64 [[A_PHI]], i64 [[B_PHI]])
-; CHECK-NEXT: ret void
-;
-entry:
- br i1 undef, label %if.then, label %exit
-
-if.then:
- %a.cast = bitcast x86_mmx %a to i64
- %b.cast = bitcast x86_mmx %b to i64
- %a.and = and i64 %a.cast, 42
- %b.and = and i64 %b.cast, 42
- br label %exit
-
-exit:
- %a.phi = phi i64 [ 0, %entry ], [ %a.and, %if.then ]
- %b.phi = phi i64 [ 0, %entry ], [ %b.and, %if.then ]
- tail call void @f(i64 %a.phi, i64 %b.phi)
- ret void
-}
-
define i8 @test3(ptr %addr) {
; Check that we do not vectorize types that are padded to a bigger ones.
;
diff --git a/llvm/test/Transforms/SROA/pr57796.ll b/llvm/test/Transforms/SROA/pr57796.ll
index 1bf1ad7ee934a..dbcb6d0784971 100644
--- a/llvm/test/Transforms/SROA/pr57796.ll
+++ b/llvm/test/Transforms/SROA/pr57796.ll
@@ -17,9 +17,9 @@ define void @foo() {
; CHECK-NEXT: [[CALL_I:%.*]] = call align 32 ptr @value_set_type(ptr align 32 [[REF_TMP_I]])
; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, ptr [[CALL_I]], align 32
; CHECK-NEXT: [[REF_TMP_SROA_0_0_VEC_EXTRACT:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to x86_mmx
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx [[TMP1]], i8 0)
-; CHECK-NEXT: store x86_mmx [[TMP2]], ptr @A, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[REF_TMP_SROA_0_0_VEC_EXTRACT]] to <1 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP1]], i8 0)
+; CHECK-NEXT: store <1 x i64> [[TMP2]], ptr @A, align 8
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index 6fa36421810f0..1e77930933c7a 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -153,8 +153,6 @@ struct TypeCloner {
return LLVMMetadataTypeInContext(Ctx);
case LLVMX86_AMXTypeKind:
return LLVMX86AMXTypeInContext(Ctx);
- case LLVMX86_MMXTypeKind:
- return LLVMX86MMXTypeInContext(Ctx);
case LLVMTokenTypeKind:
return LLVMTokenTypeInContext(Ctx);
case LLVMTargetExtTypeKind: {
diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp
index 758643f1414c1..80fb21038d304 100644
--- a/llvm/tools/llvm-stress/llvm-stress.cpp
+++ b/llvm/tools/llvm-stress/llvm-stress.cpp
@@ -173,8 +173,6 @@ struct Modifier {
Ty = Type::getX86_FP80Ty(Context);
else if (Arg == "ppc_fp128")
Ty = Type::getPPC_FP128Ty(Context);
- else if (Arg == "x86_mmx")
- Ty = Type::getX86_MMXTy(Context);
else if (Arg.starts_with("i")) {
unsigned N = 0;
Arg.drop_front().getAsInteger(10, N);
@@ -294,11 +292,7 @@ struct Modifier {
/// Pick a random vector type.
Type *pickVectorType(VectorType *VTy = nullptr) {
- // Vectors of x86mmx are illegal; keep trying till we get something else.
- Type *Ty;
- do {
- Ty = pickScalarType();
- } while (Ty->isX86_MMXTy());
+ Type *Ty = pickScalarType();
if (VTy)
return VectorType::get(Ty, VTy->getElementCount());
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index 4c1e9a9acb29a..44b25035dde2c 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -205,7 +205,6 @@ TEST(InstructionsTest, CastInst) {
Type *Int64Ty = Type::getInt64Ty(C);
Type *V8x8Ty = FixedVectorType::get(Int8Ty, 8);
Type *V8x64Ty = FixedVectorType::get(Int64Ty, 8);
- Type *X86MMXTy = Type::getX86_MMXTy(C);
Type *HalfTy = Type::getHalfTy(C);
Type *FloatTy = Type::getFloatTy(C);
@@ -248,9 +247,6 @@ TEST(InstructionsTest, CastInst) {
EXPECT_EQ(CastInst::Trunc, CastInst::getCastOpcode(c64, true, V8x8Ty, true));
EXPECT_EQ(CastInst::SExt, CastInst::getCastOpcode(c8, true, V8x64Ty, true));
- EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, X86MMXTy));
- EXPECT_FALSE(CastInst::isBitCastable(X86MMXTy, V8x8Ty));
- EXPECT_FALSE(CastInst::isBitCastable(Int64Ty, X86MMXTy));
EXPECT_FALSE(CastInst::isBitCastable(V8x64Ty, V8x8Ty));
EXPECT_FALSE(CastInst::isBitCastable(V8x8Ty, V8x64Ty));
@@ -1745,7 +1741,7 @@ TEST(InstructionsTest, AllocaInst) {
%A = alloca i32, i32 1
%B = alloca i32, i32 4
%C = alloca i32, i32 %n
- %D = alloca <8 x double>
+ %D = alloca double
%E = alloca <vscale x 8 x double>
%F = alloca [2 x half]
%G = alloca [2 x [3 x i128]]
@@ -1771,7 +1767,8 @@ TEST(InstructionsTest, AllocaInst) {
EXPECT_EQ(A.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
EXPECT_EQ(B.getAllocationSizeInBits(DL), TypeSize::getFixed(128));
EXPECT_FALSE(C.getAllocationSizeInBits(DL));
- EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(512));
+ EXPECT_EQ(DL.getTypeSizeInBits(D.getAllocatedType()), TypeSize::getFixed(64));
+ EXPECT_EQ(D.getAllocationSizeInBits(DL), TypeSize::getFixed(64));
EXPECT_EQ(E.getAllocationSizeInBits(DL), TypeSize::getScalable(512));
EXPECT_EQ(F.getAllocationSizeInBits(DL), TypeSize::getFixed(32));
EXPECT_EQ(G.getAllocationSizeInBits(DL), TypeSize::getFixed(768));
diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md
index bc0f484108fac..fadc81b567b4e 100644
--- a/mlir/docs/Dialects/LLVM.md
+++ b/mlir/docs/Dialects/LLVM.md
@@ -240,8 +240,6 @@ dialect as there is no corresponding built-in type.
The following non-parametric types derived from the LLVM IR are available in the
LLVM dialect:
-- `!llvm.x86_mmx` (`LLVMX86MMXType`) - value held in an MMX register on x86
- machine.
- `!llvm.ppc_fp128` (`LLVMPPCFP128Type`) - 128-bit floating-point value (two
64 bits).
- `!llvm.token` (`LLVMTokenType`) - a non-inspectable value associated with an
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
index 93733ccd4929a..1befdfa74f67c 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.h
@@ -67,7 +67,6 @@ namespace LLVM {
DEFINE_TRIVIAL_LLVM_TYPE(LLVMVoidType, "llvm.void");
DEFINE_TRIVIAL_LLVM_TYPE(LLVMPPCFP128Type, "llvm.ppc_fp128");
-DEFINE_TRIVIAL_LLVM_TYPE(LLVMX86MMXType, "llvm.x86_mmx");
DEFINE_TRIVIAL_LLVM_TYPE(LLVMTokenType, "llvm.token");
DEFINE_TRIVIAL_LLVM_TYPE(LLVMLabelType, "llvm.label");
DEFINE_TRIVIAL_LLVM_TYPE(LLVMMetadataType, "llvm.metadata");
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 9372caf6e32a7..c08360fc6101d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -3132,7 +3132,6 @@ void LLVMDialect::initialize() {
// clang-format off
addTypes<LLVMVoidType,
LLVMPPCFP128Type,
- LLVMX86MMXType,
LLVMTokenType,
LLVMLabelType,
LLVMMetadataType,
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
index 9810d4d643677..c4708d826f2b3 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp
@@ -35,7 +35,6 @@ static StringRef getTypeKeyword(Type type) {
return TypeSwitch<Type, StringRef>(type)
.Case<LLVMVoidType>([&](Type) { return "void"; })
.Case<LLVMPPCFP128Type>([&](Type) { return "ppc_fp128"; })
- .Case<LLVMX86MMXType>([&](Type) { return "x86_mmx"; })
.Case<LLVMTokenType>([&](Type) { return "token"; })
.Case<LLVMLabelType>([&](Type) { return "label"; })
.Case<LLVMMetadataType>([&](Type) { return "metadata"; })
@@ -309,7 +308,6 @@ static Type dispatchParse(AsmParser &parser, bool allowAny = true) {
return StringSwitch<function_ref<Type()>>(key)
.Case("void", [&] { return LLVMVoidType::get(ctx); })
.Case("ppc_fp128", [&] { return LLVMPPCFP128Type::get(ctx); })
- .Case("x86_mmx", [&] { return LLVMX86MMXType::get(ctx); })
.Case("token", [&] { return LLVMTokenType::get(ctx); })
.Case("label", [&] { return LLVMLabelType::get(ctx); })
.Case("metadata", [&] { return LLVMMetadataType::get(ctx); })
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index cf3f38b710130..e536c4a792732 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -781,7 +781,6 @@ bool mlir::LLVM::isCompatibleOuterType(Type type) {
LLVMScalableVectorType,
LLVMTargetExtType,
LLVMVoidType,
- LLVMX86MMXType
>(type)) {
// clang-format on
return true;
@@ -844,7 +843,6 @@ static bool isCompatibleImpl(Type type, DenseSet<Type> &compatibleTypes) {
LLVMPPCFP128Type,
LLVMTokenType,
LLVMVoidType,
- LLVMX86MMXType
>([](Type) { return true; })
// clang-format on
.Default([](Type) { return false; });
@@ -986,8 +984,7 @@ llvm::TypeSize mlir::LLVM::getPrimitiveTypeSizeInBits(Type type) {
.Case<BFloat16Type, Float16Type>(
[](Type) { return llvm::TypeSize::getFixed(16); })
.Case<Float32Type>([](Type) { return llvm::TypeSize::getFixed(32); })
- .Case<Float64Type, LLVMX86MMXType>(
- [](Type) { return llvm::TypeSize::getFixed(64); })
+ .Case<Float64Type>([](Type) { return llvm::TypeSize::getFixed(64); })
.Case<Float80Type>([](Type) { return llvm::TypeSize::getFixed(80); })
.Case<Float128Type>([](Type) { return llvm::TypeSize::getFixed(128); })
.Case<IntegerType>([](IntegerType intTy) {
diff --git a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
index a4db958207756..db184ae8e6e83 100644
--- a/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeFromLLVM.cpp
@@ -65,8 +65,6 @@ class TypeFromLLVMIRTranslatorImpl {
return Float80Type::get(&context);
if (type->isPPC_FP128Ty())
return LLVM::LLVMPPCFP128Type::get(&context);
- if (type->isX86_MMXTy())
- return LLVM::LLVMX86MMXType::get(&context);
if (type->isLabelTy())
return LLVM::LLVMLabelType::get(&context);
if (type->isMetadataTy())
diff --git a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
index 6d8b415ff09dc..6591502723880 100644
--- a/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
+++ b/mlir/lib/Target/LLVMIR/TypeToLLVM.cpp
@@ -58,9 +58,6 @@ class TypeToLLVMIRTranslatorImpl {
.Case([this](LLVM::LLVMPPCFP128Type) {
return llvm::Type::getPPC_FP128Ty(context);
})
- .Case([this](LLVM::LLVMX86MMXType) {
- return llvm::Type::getX86_MMXTy(context);
- })
.Case([this](LLVM::LLVMTokenType) {
return llvm::Type::getTokenTy(context);
})
diff --git a/mlir/test/Dialect/LLVMIR/types.mlir b/mlir/test/Dialect/LLVMIR/types.mlir
index 2dd292408fa60..42d370a5477c2 100644
--- a/mlir/test/Dialect/LLVMIR/types.mlir
+++ b/mlir/test/Dialect/LLVMIR/types.mlir
@@ -6,8 +6,6 @@ func.func @primitive() {
"some.op"() : () -> !llvm.void
// CHECK: !llvm.ppc_fp128
"some.op"() : () -> !llvm.ppc_fp128
- // CHECK: !llvm.x86_mmx
- "some.op"() : () -> !llvm.x86_mmx
// CHECK: !llvm.token
"some.op"() : () -> !llvm.token
// CHECK: !llvm.label
diff --git a/mlir/test/Target/LLVMIR/llvmir-types.mlir b/mlir/test/Target/LLVMIR/llvmir-types.mlir
index c85fa0101c00d..3e533211b0d0c 100644
--- a/mlir/test/Target/LLVMIR/llvmir-types.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-types.mlir
@@ -20,8 +20,6 @@ llvm.func @return_fp128() -> f128
llvm.func @return_x86_fp80() -> f80
// CHECK: declare ppc_fp128 @return_ppc_fp128()
llvm.func @return_ppc_fp128() -> !llvm.ppc_fp128
-// CHECK: declare x86_mmx @return_x86_mmx()
-llvm.func @return_x86_mmx() -> !llvm.x86_mmx
//
// Functions.
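
(Illustrative MLIR sketch, not part of the patch; the op name is a placeholder in the style of the tests above. With `!llvm.x86_mmx` gone from the LLVM dialect, a 64-bit MMX-style value would instead be modelled with an ordinary LLVM-compatible built-in type such as `i64` or `vector<1xi64>`:)

  // No longer valid after this change:
  //   "some.op"() : () -> !llvm.x86_mmx
  // A compatible built-in type can be used instead, e.g.:
  "some.op"() : () -> vector<1xi64>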
>From 4b625fdb92e4a13a3cbb0b20d40d3c81bddb7cc4 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Wed, 10 Jul 2024 19:04:31 -0400
Subject: [PATCH 3/4] Move the MMX intrinsic cast hack from generic
SelectionDAGBuilder to X86 DAGCombine.
---
.../SelectionDAG/SelectionDAGBuilder.cpp | 16 ---
llvm/lib/Target/X86/X86ISelLowering.cpp | 92 ++++++++++++-
llvm/lib/Target/X86/X86IntrinsicsInfo.h | 121 +++++++++++++++++-
llvm/test/CodeGen/X86/mmx-arith.ll | 73 ++++++-----
llvm/test/CodeGen/X86/mmx-cvt.ll | 98 ++------------
llvm/test/CodeGen/X86/pr29222.ll | 6 +-
6 files changed, 271 insertions(+), 135 deletions(-)
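
As a minimal sketch of the IR shape this combine deals with (illustrative only, not taken from the patch; the function name is a placeholder, though the intrinsic is one of the real MMX intrinsics tagged INTR_TYPE_CAST_MMX below): MMX intrinsics are now expressed on <1 x i64>, so SelectionDAGBuilder initially builds the call with v1i64 operands, and the new X86 combine bitcasts those to x86mmx before instruction selection.

  declare <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64>, <1 x i64>)

  define <1 x i64> @padd_example(<1 x i64> %a, <1 x i64> %b) {
    ; Built as an INTRINSIC_WO_CHAIN node with v1i64 operands; the
    ; INTR_TYPE_CAST_MMX table entry makes the DAG combine rewrite it
    ; to operate on x86mmx before instruction selection.
    %r = call <1 x i64> @llvm.x86.mmx.padd.b(<1 x i64> %a, <1 x i64> %b)
    ret <1 x i64> %r
  }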
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 54f7f127ae663..b19047e03b149 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5273,21 +5273,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
DAG.getTargetConstantFP(*cast<ConstantFP>(Arg), SDLoc(), VT));
}
}
- if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) {
- for (SDValue &Op : Ops) {
- if (Op.getValueType() == MVT::v1i64)
- Op = DAG.getBitcast(MVT::x86mmx, Op);
- }
- }
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
- if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) {
- if (ValueVTs.size() == 1 && ValueVTs[0] == MVT::v1i64)
- ValueVTs[0] = MVT::x86mmx;
- }
-
if (HasChain)
ValueVTs.push_back(MVT::Other);
@@ -5356,11 +5345,6 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
}
- if (Triple.getArch() == Triple::x86 || Triple.getArch() == Triple::x86_64) {
- if (Result.getValueType() == MVT::x86mmx)
- Result = DAG.getBitcast(MVT::v1i64, Result);
- }
-
setValue(&I, Result);
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fa26849e0bc5a..18f8c1a22ff5a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2565,7 +2565,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::FP_EXTEND,
ISD::STRICT_FP_EXTEND,
ISD::FP_ROUND,
- ISD::STRICT_FP_ROUND});
+ ISD::STRICT_FP_ROUND,
+ ISD::INTRINSIC_VOID,
+ ISD::INTRINSIC_WO_CHAIN,
+ ISD::INTRINSIC_W_CHAIN});
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -27299,6 +27302,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
llvm_unreachable("Unsupported truncstore intrinsic");
}
}
+ case INTR_TYPE_CAST_MMX:
+ return SDValue(); // handled in combineINTRINSIC_*
}
}
@@ -57594,6 +57599,86 @@ static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
+// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
+// use x86mmx instead.
+static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
+ SDLoc dl(N);
+
+ bool MadeChange = false, CastReturnVal = false;
+ SmallVector<SDValue, 8> Args;
+ for (const SDValue &Arg : N->op_values()) {
+ if (Arg.getValueType() == MVT::v1i64) {
+ MadeChange = true;
+ Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
+ } else
+ Args.push_back(Arg);
+ }
+ SDVTList VTs = N->getVTList();
+ SDVTList NewVTs = VTs;
+ if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
+ SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
+ NewVTArr[0] = MVT::x86mmx;
+ NewVTs = DAG.getVTList(NewVTArr);
+ MadeChange = true;
+ CastReturnVal = true;
+ }
+
+ if (MadeChange) {
+ SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
+ if (CastReturnVal) {
+ SmallVector<SDValue, 2> Returns;
+ for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
+ Returns.push_back(Result.getValue(i));
+ Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
+ return DAG.getMergeValues(Returns, dl);
+ }
+ return Result;
+ }
+ return SDValue();
+}
+static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ unsigned IntNo = N->getConstantOperandVal(0);
+ const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
+
+ if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+ return FixupMMXIntrinsicTypes(N, DAG);
+
+ return SDValue();
+}
+
+static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ unsigned IntNo = N->getConstantOperandVal(1);
+ const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
+
+ if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+ return FixupMMXIntrinsicTypes(N, DAG);
+
+ return SDValue();
+}
+
+static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ unsigned IntNo = N->getConstantOperandVal(1);
+ const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
+
+ if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
+ return FixupMMXIntrinsicTypes(N, DAG);
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -57784,7 +57869,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
- // clang-format on
+ case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
+ case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
+ case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
+ // clang-format on
}
return SDValue();
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 717541cf6c559..a336ea75afdae 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -73,7 +73,8 @@ enum IntrinsicType : uint16_t {
GATHER_AVX2,
ROUNDP,
ROUNDS,
- RDPRU
+ RDPRU,
+ INTR_TYPE_CAST_MMX
};
struct IntrinsicData {
@@ -323,6 +324,8 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, 0, 0),
+ X86_INTRINSIC_DATA(mmx_maskmovq, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_movnt_dq, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(rdpmc, RDPMC, X86::RDPMC, 0),
X86_INTRINSIC_DATA(rdpru, RDPRU, X86::RDPRU, 0),
X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
@@ -352,6 +355,31 @@ static const IntrinsicData *getIntrinsicWithChain(unsigned IntNo) {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(3dnow_pavgusb, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pf2id, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfacc, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfadd, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfcmpeq, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfcmpge, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfcmpgt, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfmax, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfmin, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfmul, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfrcp, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfrcpit1, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfrcpit2, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfrsqit1, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfrsqrt, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfsub, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pfsubr, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pi2fd, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnow_pmulhrw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnowa_pf2iw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnowa_pfnacc, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnowa_pfpnacc, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnowa_pi2fw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(3dnowa_pswapd, INTR_TYPE_CAST_MMX, 0, 0),
+
X86_INTRINSIC_DATA(addcarry_32, ADX, X86ISD::ADC, X86ISD::ADD),
X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD),
X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
@@ -1495,6 +1523,75 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB,
0),
+
+ X86_INTRINSIC_DATA(mmx_packssdw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_packsswb, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_packuswb, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_padd_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_padd_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_padd_q, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_padd_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_padds_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_padds_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_paddus_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_paddus_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_palignr_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pand, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pandn, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pavg_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pavg_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pcmpeq_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pcmpeq_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pcmpeq_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pcmpgt_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pcmpgt_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pcmpgt_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pextr_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pinsr_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmadd_wd, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmaxs_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmaxu_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmins_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pminu_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmovmskb, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmulh_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmulhu_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmull_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pmulu_dq, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_por, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psad_bw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psll_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psll_q, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psll_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pslli_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pslli_q, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pslli_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psra_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psra_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psrai_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psrai_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psrl_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psrl_q, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psrl_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psrli_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psrli_q, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psrli_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psub_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psub_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psub_q, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psub_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psubs_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psubs_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psubus_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_psubus_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_punpckhbw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_punpckhdq, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_punpckhwd, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_punpcklbw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_punpckldq, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_punpcklwd, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(mmx_pxor, INTR_TYPE_CAST_MMX, 0, 0),
+
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
@@ -1503,8 +1600,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse_cvtpd2pi, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(sse_cvtpi2pd, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(sse_cvtpi2ps, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(sse_cvtps2pi, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(sse_cvtss2si, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
X86_INTRINSIC_DATA(sse_cvtss2si64, INTR_TYPE_1OP, X86ISD::CVTS2SI, 0),
+ X86_INTRINSIC_DATA(sse_cvttpd2pi, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(sse_cvttps2pi, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(sse_cvttss2si, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
X86_INTRINSIC_DATA(sse_cvttss2si64, INTR_TYPE_1OP, X86ISD::CVTTS2SI, 0),
X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
@@ -1512,6 +1615,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0),
X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
+ X86_INTRINSIC_DATA(sse_pshuf_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
@@ -1593,14 +1697,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP_IMM8, X86ISD::INSERTQI, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_w, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_d, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_sw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_phadd_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_d, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_sw, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_phsub_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
+ X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW,
0),
+ X86_INTRINSIC_DATA(ssse3_pmul_hr_sw, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
+ X86_INTRINSIC_DATA(ssse3_pshuf_b, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(ssse3_psign_b, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_psign_d, INTR_TYPE_CAST_MMX, 0, 0),
+ X86_INTRINSIC_DATA(ssse3_psign_w, INTR_TYPE_CAST_MMX, 0, 0),
X86_INTRINSIC_DATA(subborrow_32, ADX, X86ISD::SBB, X86ISD::SUB),
X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTRI, 0),
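(Editorial note, not part of the patch.) The new INTR_TYPE_CAST_MMX entries above cover the legacy MMX intrinsics, which after this change operate on plain <1 x i64> values in IR rather than the removed x86_mmx type. A minimal sketch of what such a call now looks like, assuming the post-change intrinsic signatures; the actual lowering is handled elsewhere in the patch:

; The MMX add-words intrinsic, now typed entirely on <1 x i64>.
declare <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64>, <1 x i64>)

define <1 x i64> @add_words(<1 x i64> %a, <1 x i64> %b) {
  ; Handled via the INTR_TYPE_CAST_MMX path, which is expected to move the
  ; <1 x i64> operands into MMX registers around the underlying paddw.
  %r = call <1 x i64> @llvm.x86.mmx.padd.w(<1 x i64> %a, <1 x i64> %b)
  ret <1 x i64> %r
}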
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 68287a4feee47..230e763a7c734 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -18,8 +18,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
; X86-NEXT: paddsb (%ecx), %mm0
; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: paddusb (%ecx), %mm0
-; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: movq2dq %mm0, %xmm0
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: psubb %xmm1, %xmm0
; X86-NEXT: movdq2q %xmm0, %mm0
@@ -27,8 +27,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
; X86-NEXT: psubsb (%ecx), %mm0
; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: psubusb (%ecx), %mm0
-; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: movq2dq %mm0, %xmm0
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -58,8 +58,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
; X64-NEXT: paddsb (%rsi), %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: paddusb (%rsi), %mm0
-; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: psubb %xmm1, %xmm0
; X64-NEXT: movdq2q %xmm0, %mm0
@@ -67,8 +67,8 @@ define void @test0(ptr %A, ptr %B) nounwind {
; X64-NEXT: psubsb (%rsi), %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: psubusb (%rsi), %mm0
-; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -239,8 +239,13 @@ entry:
define void @test2(ptr %A, ptr %B) nounwind {
; X86-LABEL: test2:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-8, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: paddw %xmm0, %xmm1
@@ -249,8 +254,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
; X86-NEXT: paddsw (%ecx), %mm0
; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: paddusw (%ecx), %mm0
-; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: movq2dq %mm0, %xmm0
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: psubw %xmm1, %xmm0
; X86-NEXT: movdq2q %xmm0, %mm0
@@ -258,8 +263,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
; X86-NEXT: psubsw (%ecx), %mm0
; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: psubusw (%ecx), %mm0
-; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: movq2dq %mm0, %xmm0
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: pmullw %xmm0, %xmm1
; X86-NEXT: movdq2q %xmm1, %mm0
@@ -267,18 +272,26 @@ define void @test2(ptr %A, ptr %B) nounwind {
; X86-NEXT: pmulhw (%ecx), %mm0
; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: pmaddwd (%ecx), %mm0
+; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: movl (%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movq %mm0, (%eax)
-; X86-NEXT: movq2dq %mm0, %xmm0
-; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: andps %xmm0, %xmm1
-; X86-NEXT: movlps %xmm1, (%eax)
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: orps %xmm1, %xmm0
-; X86-NEXT: movlps %xmm0, (%eax)
-; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: xorps %xmm0, %xmm1
-; X86-NEXT: movlps %xmm1, (%eax)
+; X86-NEXT: andl 4(%ecx), %esi
+; X86-NEXT: movd %esi, %xmm0
+; X86-NEXT: andl (%ecx), %edx
+; X86-NEXT: movd %edx, %xmm1
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movq %xmm1, (%eax)
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: por %xmm1, %xmm0
+; X86-NEXT: movq %xmm0, (%eax)
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: pxor %xmm0, %xmm1
+; X86-NEXT: movq %xmm1, (%eax)
; X86-NEXT: emms
+; X86-NEXT: leal -4(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: test2:
@@ -291,8 +304,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
; X64-NEXT: paddsw (%rsi), %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: paddusw (%rsi), %mm0
-; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: psubw %xmm1, %xmm0
; X64-NEXT: movdq2q %xmm0, %mm0
@@ -300,8 +313,8 @@ define void @test2(ptr %A, ptr %B) nounwind {
; X64-NEXT: psubsw (%rsi), %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: psubusw (%rsi), %mm0
-; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq2dq %mm0, %xmm0
+; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: pmullw %xmm0, %xmm1
; X64-NEXT: movdq2q %xmm1, %mm0
@@ -309,17 +322,17 @@ define void @test2(ptr %A, ptr %B) nounwind {
; X64-NEXT: pmulhw (%rsi), %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: pmaddwd (%rsi), %mm0
+; X64-NEXT: movq %mm0, %rax
; X64-NEXT: movq %mm0, (%rdi)
-; X64-NEXT: movq2dq %mm0, %xmm0
-; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: andps %xmm0, %xmm1
-; X64-NEXT: movlps %xmm1, (%rdi)
-; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: orps %xmm1, %xmm0
-; X64-NEXT: movlps %xmm0, (%rdi)
-; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: xorps %xmm0, %xmm1
-; X64-NEXT: movlps %xmm1, (%rdi)
+; X64-NEXT: andq (%rsi), %rax
+; X64-NEXT: movq %rax, %xmm0
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: por %xmm0, %xmm1
+; X64-NEXT: movq %xmm1, (%rdi)
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: pxor %xmm1, %xmm0
+; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: emms
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/mmx-cvt.ll b/llvm/test/CodeGen/X86/mmx-cvt.ll
index c09c417c11c96..11473f3f6c236 100644
--- a/llvm/test/CodeGen/X86/mmx-cvt.ll
+++ b/llvm/test/CodeGen/X86/mmx-cvt.ll
@@ -8,20 +8,10 @@
define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
; X86-LABEL: cvt_v2f64_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cvtpd2pi %xmm0, %mm0
; X86-NEXT: paddd %mm0, %mm0
-; X86-NEXT: movq %mm0, (%esp)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: cvt_v2f64_v2i32:
@@ -44,20 +34,10 @@ define void @cvt_v2f64_v2i32(<2 x double>, ptr) nounwind {
define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
; X86-LABEL: cvtt_v2f64_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cvttpd2pi %xmm0, %mm0
; X86-NEXT: paddd %mm0, %mm0
-; X86-NEXT: movq %mm0, (%esp)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: cvtt_v2f64_v2i32:
@@ -80,20 +60,10 @@ define void @cvtt_v2f64_v2i32(<2 x double>, ptr) nounwind {
define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
; X86-LABEL: fptosi_v2f64_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cvttpd2pi %xmm0, %mm0
; X86-NEXT: paddd %mm0, %mm0
-; X86-NEXT: movq %mm0, (%esp)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: fptosi_v2f64_v2i32:
@@ -114,20 +84,10 @@ define void @fptosi_v2f64_v2i32(<2 x double>, ptr) nounwind {
define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
; X86-LABEL: cvt_v2f32_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cvtps2pi %xmm0, %mm0
; X86-NEXT: paddd %mm0, %mm0
-; X86-NEXT: movq %mm0, (%esp)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: cvt_v2f32_v2i32:
@@ -150,20 +110,10 @@ define void @cvt_v2f32_v2i32(<4 x float>, ptr) nounwind {
define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
; X86-LABEL: cvtt_v2f32_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cvttps2pi %xmm0, %mm0
; X86-NEXT: paddd %mm0, %mm0
-; X86-NEXT: movq %mm0, (%esp)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: cvtt_v2f32_v2i32:
@@ -186,20 +136,10 @@ define void @cvtt_v2f32_v2i32(<4 x float>, ptr) nounwind {
define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
; X86-LABEL: fptosi_v4f32_v4i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cvttps2pi %xmm0, %mm0
; X86-NEXT: paddd %mm0, %mm0
-; X86-NEXT: movq %mm0, (%esp)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: fptosi_v4f32_v4i32:
@@ -221,20 +161,10 @@ define void @fptosi_v4f32_v4i32(<4 x float>, ptr) nounwind {
define void @fptosi_v2f32_v2i32(<4 x float>, ptr) nounwind {
; X86-LABEL: fptosi_v2f32_v2i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cvttps2pi %xmm0, %mm0
; X86-NEXT: paddd %mm0, %mm0
-; X86-NEXT: movq %mm0, (%esp)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
+; X86-NEXT: movq %mm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: fptosi_v2f32_v2i32:
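(Editorial note, not part of the patch.) The mmx-cvt.ll diffs above reflect that the SSE cvt*2pi intrinsics now return <1 x i64> directly, so the result can be stored as a single 64-bit value instead of being spilled to the stack and reloaded in two 32-bit halves. A minimal sketch of the IR pattern these tests exercise, assuming the post-change intrinsic signatures and not the literal test bodies:

define void @cvt_store(<2 x double> %v, ptr %p) {
  ; cvtpd2pi now yields a <1 x i64> result rather than an x86_mmx value.
  %m = call <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %v)
  ; The doubling via paddd is still an MMX intrinsic, also on <1 x i64>.
  %s = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %m, <1 x i64> %m)
  ; The result can be stored with one 64-bit store (movq %mm0, (%eax)).
  store <1 x i64> %s, ptr %p
  ret void
}

declare <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double>)
declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)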
diff --git a/llvm/test/CodeGen/X86/pr29222.ll b/llvm/test/CodeGen/X86/pr29222.ll
index 9a38515b65594..1ddcb1fb56524 100644
--- a/llvm/test/CodeGen/X86/pr29222.ll
+++ b/llvm/test/CodeGen/X86/pr29222.ll
@@ -32,7 +32,7 @@ define i32 @PR29222(i32) nounwind {
; X86-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
; X86-AVX-NEXT: packsswb %mm0, %mm0
; X86-AVX-NEXT: movq %mm0, (%esp)
-; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX-NEXT: vpbroadcastq (%esp), %xmm0
; X86-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; X86-AVX-NEXT: vmovd %xmm0, %eax
; X86-AVX-NEXT: movl %ebp, %esp
@@ -54,7 +54,9 @@ define i32 @PR29222(i32) nounwind {
; X64-AVX-NEXT: movd %edi, %mm0
; X64-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
; X64-AVX-NEXT: packsswb %mm0, %mm0
-; X64-AVX-NEXT: movq2dq %mm0, %xmm0
+; X64-AVX-NEXT: movq %mm0, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm0
+; X64-AVX-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT: vmovd %xmm0, %eax
; X64-AVX-NEXT: retq
>From 98a8b237810812c2f429aa5555fa8de65a1e0962 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Thu, 11 Jul 2024 13:10:51 -0400
Subject: [PATCH 4/4] Add release notes and adjust documentation for change.
Fix a couple of nits.
---
clang/lib/CodeGen/Targets/X86.cpp | 4 ++--
llvm/docs/BitCodeFormat.rst | 2 +-
llvm/docs/LangRef.rst | 20 +------------------
llvm/docs/ReleaseNotes.rst | 13 ++++++++++++
.../SelectionDAG/SelectionDAGBuilder.cpp | 1 -
5 files changed, 17 insertions(+), 23 deletions(-)
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 8913b188f6aec..16d52bee3490b 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -24,9 +24,9 @@ bool IsX86_MMXType(llvm::Type *IRType) {
IRType->getScalarSizeInBits() != 64;
}
-static llvm::Type* X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
+static llvm::Type *X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
StringRef Constraint,
- llvm::Type* Ty) {
+ llvm::Type *Ty) {
if (Constraint == "k") {
llvm::Type *Int1Ty = llvm::Type::getInt1Ty(CGF.getLLVMContext());
return llvm::FixedVectorType::get(Int1Ty, Ty->getScalarSizeInBits());
diff --git a/llvm/docs/BitCodeFormat.rst b/llvm/docs/BitCodeFormat.rst
index 46af2e421a258..1a724a58f58e0 100644
--- a/llvm/docs/BitCodeFormat.rst
+++ b/llvm/docs/BitCodeFormat.rst
@@ -1291,7 +1291,7 @@ TYPE_CODE_X86_MMX Record
``[X86_MMX]``
-The ``X86_MMX`` record (code 17) adds an ``x86_mmx`` type to the type table.
+The ``X86_MMX`` record (code 17) is deprecated; it is imported as a ``<1 x i64>`` vector.
TYPE_CODE_STRUCT_ANON Record
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ae39217dc8ff8..2ff5a35ad650b 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3945,24 +3945,6 @@ or constants of this type.
x86_amx
-X86_mmx Type
-""""""""""""
-
-:Overview:
-
-The x86_mmx type represents a value held in an MMX register on an x86
-machine. The operations allowed on it are quite limited: parameters and
-return values, load and store, and bitcast. User-specified MMX
-instructions are represented as intrinsic or asm calls with arguments
-and/or results of this type. There are no arrays, vectors or constants
-of this type.
-
-:Syntax:
-
-::
-
- x86_mmx
-
.. _t_pointer:
@@ -4396,7 +4378,7 @@ represented by ``0xH`` followed by 4 hexadecimal digits. The bfloat 16-bit
format is represented by ``0xR`` followed by 4 hexadecimal digits. All
hexadecimal formats are big-endian (sign bit at the left).
-There are no constants of type x86_mmx and x86_amx.
+There are no constants of type x86_amx.
.. _complexconstants:
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 4f442bde239a9..0060917259b8f 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -79,6 +79,8 @@ Changes to the LLVM IR
* ``llvm.instprof.mcdc.tvbitmap.update``: 3rd argument has been
removed. The next argument has been changed from byte index to bit
index.
+* The ``x86_mmx`` IR type has been removed. It will be translated to
+  the standard vector type ``<1 x i64>`` during bitcode upgrade.
Changes to LLVM infrastructure
------------------------------
@@ -209,6 +211,11 @@ Changes to the X86 Backend
- Removed knl/knm specific ISA intrinsics: AVX512PF, AVX512ER, PREFETCHWT1,
while assembly encoding/decoding supports are kept.
+- Due to the removal of the ``x86_mmx`` IR type, functions with
+  ``x86_mmx`` arguments or return values will use a different,
+  incompatible calling convention (ABI). Such functions are not
+  generally seen in the wild (Clang never generates them!), so this is
+  not expected to result in real-world compatibility problems.
Changes to the OCaml bindings
-----------------------------
@@ -301,6 +308,12 @@ They are described in detail in the `debug info migration guide <https://llvm.or
* ``LLVMGetTargetExtTypeNumTypeParams``/``LLVMGetTargetExtTypeTypeParam``
* ``LLVMGetTargetExtTypeNumIntParams``/``LLVMGetTargetExtTypeIntParam``
+* The following symbols are deleted due to the removal of the ``x86_mmx`` IR type:
+
+ * ``LLVMX86_MMXTypeKind``
+ * ``LLVMX86MMXTypeInContext``
+ * ``LLVMX86MMXType``
+
Changes to the CodeGen infrastructure
-------------------------------------
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b19047e03b149..276d980c1dcca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5225,7 +5225,6 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
// Ignore the callsite's attributes. A specific call site may be marked with
// readnone, but the lowering code will expect the chain based on the
// definition.
- const auto &Triple = DAG.getTarget().getTargetTriple();
const Function *F = I.getCalledFunction();
bool HasChain = !F->doesNotAccessMemory();
bool OnlyLoad = HasChain && F->onlyReadsMemory();
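(Editorial note, not part of the patch.) For readers tracking the LangRef and release-note changes above: with x86_mmx gone, a 64-bit MMX payload is an ordinary <1 x i64> vector value, so the usual vector operations apply to it directly. A minimal illustrative example:

define i64 @mmx_payload_bits(double %d) {
  ; What previously required an x86_mmx bitcast is now a plain vector bitcast.
  %v = bitcast double %d to <1 x i64>
  ; Standard vector operations such as extractelement work on the value.
  %b = extractelement <1 x i64> %v, i32 0
  ret i64 %b
}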