[clang] [llvm] Remove mmx 3dnow (PR #96246)

James Y Knight via cfe-commits cfe-commits at lists.llvm.org
Mon Jun 24 13:57:21 PDT 2024


https://github.com/jyknight updated https://github.com/llvm/llvm-project/pull/96246

>From 28be235255725f9ef36193fc4302fb62a682701a Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Thu, 20 Jun 2024 14:28:04 -0400
Subject: [PATCH 1/6] Clang: Remove support for 3DNow!, both intrinsics and
 builtins.

This set of instructions was only supported by AMD chips, starting
with the K6-2 (introduced in 1998) and ending before the "Bulldozer"
family (2011). They were never much used, as they were effectively
superseded by the more-widely-implemented SSE (first implemented on
the AMD side in the Athlon XP in 2001).

This is being done as a precursor to the general removal of MMX
register usage. Since there is almost no usage of the 3DNow!
intrinsics, and no modern hardware even implements them, simple
removal seems like the best option.

Support for the underlying LLVM intrinsics remains, for the
moment. They will be removed in a future patch.

(Originally uploaded in https://reviews.llvm.org/D94213)

Works towards issue #41665.
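
For code that still uses the removed _m_* intrinsics, the same
operations can be expressed with ordinary SSE. As a rough sketch only
(the helper name and data layout here are illustrative, not part of
this patch), a replacement for a _m_pfadd-style packed float add
could look like:

  #include <xmmintrin.h>

  /* Adds two pairs of floats, roughly what _m_pfadd did on an __m64
     value. Illustrative sketch; runs on any SSE-capable CPU. */
  static inline void add_2xfloat(const float a[2], const float b[2],
                                 float out[2]) {
    __m128 va = _mm_setr_ps(a[0], a[1], 0.0f, 0.0f); /* data in low lanes */
    __m128 vb = _mm_setr_ps(b[0], b[1], 0.0f, 0.0f);
    float tmp[4];
    _mm_storeu_ps(tmp, _mm_add_ps(va, vb)); /* packed single-precision add */
    out[0] = tmp[0];
    out[1] = tmp[1];
  }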
---
 clang/docs/ReleaseNotes.rst                   |  17 ++
 clang/include/clang/Basic/BuiltinsX86.def     |  30 ---
 clang/include/clang/Driver/Options.td         |  10 +-
 clang/lib/Basic/Targets/X86.cpp               |  29 +--
 clang/lib/Basic/Targets/X86.h                 |  10 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |   8 -
 clang/lib/Driver/ToolChains/Arch/X86.cpp      |  13 ++
 clang/lib/Headers/mm3dnow.h                   | 143 +-------------
 clang/lib/Headers/x86intrin.h                 |   4 -
 clang/test/CodeGen/X86/3dnow-builtins.c       | 181 ------------------
 clang/test/CodeGen/builtins-x86.c             |  32 +---
 clang/test/Driver/x86-target-features.c       |   9 +-
 clang/test/Headers/mm3dnow.c                  |   9 +-
 .../Preprocessor/predefined-arch-macros.c     |  88 ---------
 clang/test/Preprocessor/x86_target_features.c |   4 +-
 llvm/include/llvm/IR/IntrinsicsX86.td         |  49 ++---
 16 files changed, 85 insertions(+), 551 deletions(-)
 delete mode 100644 clang/test/CodeGen/X86/3dnow-builtins.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 36e23981cc5df..cfe3526283d69 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -936,6 +936,23 @@ X86 Support
 ^^^^^^^^^^^
 
 - Remove knl/knm specific ISA supports: AVX512PF, AVX512ER, PREFETCHWT1
+- Support has been removed for the AMD "3DNow!" instruction set.
+  Neither modern AMD CPUs nor any Intel CPUs implement these
+  instructions, and they were never widely used.
+  * The options ``-m3dnow`` and ``-m3dnowa`` are no longer honored, and will emit a warning if used.
+  * The macros ``__3dNOW__`` and ``__3dNOW_A__`` are no longer ever set by the compiler.
+  * The header ``<mm3dnow.h>`` still exists, but all of the 3dNow!
+    intrinsic functions have been removed: ``_m_femms``,
+    ``_m_pavgusb``, ``_m_pf2id``, ``_m_pfacc``, ``_m_pfadd``,
+    ``_m_pfcmpeq``, ``_m_pfcmpge``, ``_m_pfcmpgt``, ``_m_pfmax``,
+    ``_m_pfmin``, ``_m_pfmul``, ``_m_pfrcp``, ``_m_pfrcpit1``,
+    ``_m_pfrcpit2``, ``_m_pfrsqrt``, ``_m_pfrsqrtit1``, ``_m_pfsub``,
+    ``_m_pfsubr``, ``_m_pi2fd``, ``_m_pmulhrw``, ``_m_pf2iw``,
+    ``_m_pfnacc``, ``_m_pfpnacc``, ``_m_pi2fw``, ``_m_pswapdsf``,
+    ``_m_pswapdsi``
+  * The compiler builtins (``__builtin_ia32_femms``, and so on)
+    corresponding to each of the above intrinsics have been removed.
+
 
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 7074479786b97..a85e7918f4d7e 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -37,36 +37,6 @@ TARGET_BUILTIN(__builtin_ia32_undef512, "V8d", "ncV:512:", "")
 TARGET_BUILTIN(__builtin_ia32_readeflags_u32, "Ui", "n", "")
 TARGET_BUILTIN(__builtin_ia32_writeeflags_u32, "vUi", "n", "")
 
-// 3DNow!
-//
-TARGET_BUILTIN(__builtin_ia32_femms, "v", "n", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pavgusb, "V8cV8cV8c", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pf2id, "V2iV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfacc, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfadd, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfcmpeq, "V2iV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfcmpge, "V2iV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfcmpgt, "V2iV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfmax, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfmin, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfmul, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfrcp, "V2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfrcpit1, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfrcpit2, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfrsqrt, "V2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfrsqit1, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfsub, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pfsubr, "V2fV2fV2f", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pi2fd, "V2fV2i", "ncV:64:", "3dnow")
-TARGET_BUILTIN(__builtin_ia32_pmulhrw, "V4sV4sV4s", "ncV:64:", "3dnow")
-// 3DNow! Extensions (3dnowa).
-TARGET_BUILTIN(__builtin_ia32_pf2iw, "V2iV2f", "ncV:64:", "3dnowa")
-TARGET_BUILTIN(__builtin_ia32_pfnacc, "V2fV2fV2f", "ncV:64:", "3dnowa")
-TARGET_BUILTIN(__builtin_ia32_pfpnacc, "V2fV2fV2f", "ncV:64:", "3dnowa")
-TARGET_BUILTIN(__builtin_ia32_pi2fw, "V2fV2i", "ncV:64:", "3dnowa")
-TARGET_BUILTIN(__builtin_ia32_pswapdsf, "V2fV2f", "ncV:64:", "3dnowa")
-TARGET_BUILTIN(__builtin_ia32_pswapdsi, "V2iV2i", "ncV:64:", "3dnowa")
-
 // MMX
 //
 // All MMX instructions will be generated via builtins. Any MMX vector
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index bbf860aa491e1..78286fecad24f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6094,10 +6094,6 @@ def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
 def mno_fp_ret_in_387 : Flag<["-"], "mno-fp-ret-in-387">, Alias<mno_x87>;
 def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>;
 def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>;
-def m3dnow : Flag<["-"], "m3dnow">, Group<m_x86_Features_Group>;
-def mno_3dnow : Flag<["-"], "mno-3dnow">, Group<m_x86_Features_Group>;
-def m3dnowa : Flag<["-"], "m3dnowa">, Group<m_x86_Features_Group>;
-def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group<m_x86_Features_Group>;
 def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group<m_x86_Features_Group>;
 def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group<m_x86_Features_Group>;
 def mamx_complex : Flag<["-"], "mamx-complex">, Group<m_x86_Features_Group>;
@@ -6331,6 +6327,12 @@ def mvevpu : Flag<["-"], "mvevpu">, Group<m_ve_Features_Group>,
 def mno_vevpu : Flag<["-"], "mno-vevpu">, Group<m_ve_Features_Group>;
 } // let Flags = [TargetSpecific]
 
+// Unsupported X86 feature flags (each triggers a warning if used)
+def m3dnow : Flag<["-"], "m3dnow">;
+def mno_3dnow : Flag<["-"], "mno-3dnow">;
+def m3dnowa : Flag<["-"], "m3dnowa">;
+def mno_3dnowa : Flag<["-"], "mno-3dnowa">;
+
 // These are legacy user-facing driver-level option spellings. They are always
 // aliases for options that are spelled using the more common Unix / GNU flag
 // style of double-dash and equals-joined flags.
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 036a655a4d073..6083be5ee698e 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -258,7 +258,9 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
     if (Feature[0] != '+')
       continue;
 
-    if (Feature == "+aes") {
+    if (Feature == "+mmx") {
+      HasMMX = true;
+    } else if (Feature == "+aes") {
       HasAES = true;
     } else if (Feature == "+vaes") {
       HasVAES = true;
@@ -483,13 +485,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
     // for bfloat16 arithmetic operations in the front-end.
     HasBFloat16 = SSELevel >= SSE2;
 
-    MMX3DNowEnum ThreeDNowLevel = llvm::StringSwitch<MMX3DNowEnum>(Feature)
-                                      .Case("+3dnowa", AMD3DNowAthlon)
-                                      .Case("+3dnow", AMD3DNow)
-                                      .Case("+mmx", MMX)
-                                      .Default(NoMMX3DNow);
-    MMX3DNowLevel = std::max(MMX3DNowLevel, ThreeDNowLevel);
-
     XOPEnum XLevel = llvm::StringSwitch<XOPEnum>(Feature)
                          .Case("+xop", XOP)
                          .Case("+fma4", FMA4)
@@ -1025,18 +1020,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
   }
 
   // Each case falls through to the previous one here.
-  switch (MMX3DNowLevel) {
-  case AMD3DNowAthlon:
-    Builder.defineMacro("__3dNOW_A__");
-    [[fallthrough]];
-  case AMD3DNow:
-    Builder.defineMacro("__3dNOW__");
-    [[fallthrough]];
-  case MMX:
+  if (HasMMX) {
     Builder.defineMacro("__MMX__");
-    [[fallthrough]];
-  case NoMMX3DNow:
-    break;
   }
 
   if (CPU >= CK_i486 || CPU == CK_None) {
@@ -1055,8 +1040,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
 
 bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
   return llvm::StringSwitch<bool>(Name)
-      .Case("3dnow", true)
-      .Case("3dnowa", true)
       .Case("adx", true)
       .Case("aes", true)
       .Case("amx-bf16", true)
@@ -1225,9 +1208,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("widekl", HasWIDEKL)
       .Case("lwp", HasLWP)
       .Case("lzcnt", HasLZCNT)
-      .Case("mm3dnow", MMX3DNowLevel >= AMD3DNow)
-      .Case("mm3dnowa", MMX3DNowLevel >= AMD3DNowAthlon)
-      .Case("mmx", MMX3DNowLevel >= MMX)
+      .Case("mmx", HasMMX)
       .Case("movbe", HasMOVBE)
       .Case("movdiri", HasMOVDIRI)
       .Case("movdir64b", HasMOVDIR64B)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 9b2ae87adb2e7..2ef404dfa18c8 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -67,12 +67,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
     AVX2,
     AVX512F
   } SSELevel = NoSSE;
-  enum MMX3DNowEnum {
-    NoMMX3DNow,
-    MMX,
-    AMD3DNow,
-    AMD3DNowAthlon
-  } MMX3DNowLevel = NoMMX3DNow;
+  bool HasMMX = false;
   enum XOPEnum { NoXOP, SSE4A, FMA4, XOP } XOPLevel = NoXOP;
   enum AddrSpace { ptr32_sptr = 270, ptr32_uptr = 271, ptr64 = 272 };
 
@@ -346,8 +341,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
       return "avx512";
     if (getTriple().getArch() == llvm::Triple::x86_64 && SSELevel >= AVX)
       return "avx";
-    if (getTriple().getArch() == llvm::Triple::x86 &&
-        MMX3DNowLevel == NoMMX3DNow)
+    if (getTriple().getArch() == llvm::Triple::x86 && !HasMMX)
       return "no-mmx";
     return "";
   }
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 931726a78dae9..54fd0de3e20ed 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -15936,14 +15936,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(F, {Ops[0]});
   }
 
-  // 3DNow!
-  case X86::BI__builtin_ia32_pswapdsf:
-  case X86::BI__builtin_ia32_pswapdsi: {
-    llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
-    Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
-    llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
-    return Builder.CreateCall(F, Ops, "pswapd");
-  }
   case X86::BI__builtin_ia32_rdrand16_step:
   case X86::BI__builtin_ia32_rdrand32_step:
   case X86::BI__builtin_ia32_rdrand64_step:
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index 75f9c99d5d0bf..067d11162cb19 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -312,4 +312,17 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
     Features.push_back("+prefer-no-scatter");
   if (Args.hasArg(options::OPT_mapx_inline_asm_use_gpr32))
     Features.push_back("+inline-asm-use-gpr32");
+
+  // Warn for removed 3dnow support
+  if (const Arg *A =
+          Args.getLastArg(options::OPT_m3dnowa, options::OPT_mno_3dnowa,
+                          options::OPT_mno_3dnow)) {
+    if (A->getOption().matches(options::OPT_m3dnowa))
+      D.Diag(diag::warn_drv_clang_unsupported) << A->getAsString(Args);
+  }
+  if (const Arg *A =
+          Args.getLastArg(options::OPT_m3dnow, options::OPT_mno_3dnow)) {
+    if (A->getOption().matches(options::OPT_m3dnow))
+      D.Diag(diag::warn_drv_clang_unsupported) << A->getAsString(Args);
+  }
 }
diff --git a/clang/lib/Headers/mm3dnow.h b/clang/lib/Headers/mm3dnow.h
index 22ab13aa33409..10049553969f2 100644
--- a/clang/lib/Headers/mm3dnow.h
+++ b/clang/lib/Headers/mm3dnow.h
@@ -7,151 +7,16 @@
  *===-----------------------------------------------------------------------===
  */
 
+// 3dNow intrinsics are no longer supported, and this header remains only as a
+// stub for users who were including it to get to _m_prefetch or
+// _m_prefetchw. Such uses should prefer x86intrin.h.
+
 #ifndef _MM3DNOW_H_INCLUDED
 #define _MM3DNOW_H_INCLUDED
 
 #include <mmintrin.h>
 #include <prfchwintrin.h>
 
-typedef float __v2sf __attribute__((__vector_size__(8)));
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64)))
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
-_m_femms(void) {
-  __builtin_ia32_femms();
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pavgusb(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pf2id(__m64 __m) {
-  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfacc(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfadd(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpeq(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpge(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpgt(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmax(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmin(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmul(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcp(__m64 __m) {
-  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcpit1(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcpit2(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrsqrt(__m64 __m) {
-  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfsub(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfsubr(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pi2fd(__m64 __m) {
-  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pmulhrw(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/* Handle the 3dnowa instructions here. */
-#undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64)))
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pf2iw(__m64 __m) {
-  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfnacc(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfpnacc(__m64 __m1, __m64 __m2) {
-  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pi2fw(__m64 __m) {
-  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pswapdsf(__m64 __m) {
-  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pswapdsi(__m64 __m) {
-  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
-}
-
 #undef __DEFAULT_FN_ATTRS
 
 #endif
diff --git a/clang/lib/Headers/x86intrin.h b/clang/lib/Headers/x86intrin.h
index c20bfbb8fe46e..f42e9e580f883 100644
--- a/clang/lib/Headers/x86intrin.h
+++ b/clang/lib/Headers/x86intrin.h
@@ -14,10 +14,6 @@
 
 #include <immintrin.h>
 
-#if !defined(__SCE__) || __has_feature(modules) || defined(__3dNOW__)
-#include <mm3dnow.h>
-#endif
-
 #if !defined(__SCE__) || __has_feature(modules) || defined(__PRFCHW__)
 #include <prfchwintrin.h>
 #endif
diff --git a/clang/test/CodeGen/X86/3dnow-builtins.c b/clang/test/CodeGen/X86/3dnow-builtins.c
deleted file mode 100644
index af754b71555c4..0000000000000
--- a/clang/test/CodeGen/X86/3dnow-builtins.c
+++ /dev/null
@@ -1,181 +0,0 @@
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=GCC -check-prefix=CHECK
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-scei-ps4 -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=PS4 -check-prefix=CHECK
-// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-sie-ps5  -target-feature +3dnowa -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=PS4 -check-prefix=CHECK
-
-
-#include <x86intrin.h>
-
-__m64 test_m_pavgusb(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pavgusb
-  // GCC-LABEL: define{{.*}} double @test_m_pavgusb
-  // CHECK: @llvm.x86.3dnow.pavgusb
-  return _m_pavgusb(m1, m2);
-}
-
-__m64 test_m_pf2id(__m64 m) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pf2id
-  // GCC-LABEL: define{{.*}} double @test_m_pf2id
-  // CHECK: @llvm.x86.3dnow.pf2id
-  return _m_pf2id(m);
-}
-
-__m64 test_m_pfacc(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfacc
-  // GCC-LABEL: define{{.*}} double @test_m_pfacc
-  // CHECK: @llvm.x86.3dnow.pfacc
-  return _m_pfacc(m1, m2);
-}
-
-__m64 test_m_pfadd(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfadd
-  // GCC-LABEL: define{{.*}} double @test_m_pfadd
-  // CHECK: @llvm.x86.3dnow.pfadd
-  return _m_pfadd(m1, m2);
-}
-
-__m64 test_m_pfcmpeq(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfcmpeq
-  // GCC-LABEL: define{{.*}} double @test_m_pfcmpeq
-  // CHECK: @llvm.x86.3dnow.pfcmpeq
-  return _m_pfcmpeq(m1, m2);
-}
-
-__m64 test_m_pfcmpge(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfcmpge
-  // GCC-LABEL: define{{.*}} double @test_m_pfcmpge
-  // CHECK: @llvm.x86.3dnow.pfcmpge
-  return _m_pfcmpge(m1, m2);
-}
-
-__m64 test_m_pfcmpgt(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfcmpgt
-  // GCC-LABEL: define{{.*}} double @test_m_pfcmpgt
-  // CHECK: @llvm.x86.3dnow.pfcmpgt
-  return _m_pfcmpgt(m1, m2);
-}
-
-__m64 test_m_pfmax(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfmax
-  // GCC-LABEL: define{{.*}} double @test_m_pfmax
-  // CHECK: @llvm.x86.3dnow.pfmax
-  return _m_pfmax(m1, m2);
-}
-
-__m64 test_m_pfmin(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfmin
-  // GCC-LABEL: define{{.*}} double @test_m_pfmin
-  // CHECK: @llvm.x86.3dnow.pfmin
-  return _m_pfmin(m1, m2);
-}
-
-__m64 test_m_pfmul(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfmul
-  // GCC-LABEL: define{{.*}} double @test_m_pfmul
-  // CHECK: @llvm.x86.3dnow.pfmul
-  return _m_pfmul(m1, m2);
-}
-
-__m64 test_m_pfrcp(__m64 m) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfrcp
-  // GCC-LABEL: define{{.*}} double @test_m_pfrcp
-  // CHECK: @llvm.x86.3dnow.pfrcp
-  return _m_pfrcp(m);
-}
-
-__m64 test_m_pfrcpit1(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfrcpit1
-  // GCC-LABEL: define{{.*}} double @test_m_pfrcpit1
-  // CHECK: @llvm.x86.3dnow.pfrcpit1
-  return _m_pfrcpit1(m1, m2);
-}
-
-__m64 test_m_pfrcpit2(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfrcpit2
-  // GCC-LABEL: define{{.*}} double @test_m_pfrcpit2
-  // CHECK: @llvm.x86.3dnow.pfrcpit2
-  return _m_pfrcpit2(m1, m2);
-}
-
-__m64 test_m_pfrsqrt(__m64 m) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfrsqrt
-  // GCC-LABEL: define{{.*}} double @test_m_pfrsqrt
-  // CHECK: @llvm.x86.3dnow.pfrsqrt
-  return _m_pfrsqrt(m);
-}
-
-__m64 test_m_pfrsqrtit1(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfrsqrtit1
-  // GCC-LABEL: define{{.*}} double @test_m_pfrsqrtit1
-  // CHECK: @llvm.x86.3dnow.pfrsqit1
-  return _m_pfrsqrtit1(m1, m2);
-}
-
-__m64 test_m_pfsub(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfsub
-  // GCC-LABEL: define{{.*}} double @test_m_pfsub
-  // CHECK: @llvm.x86.3dnow.pfsub
-  return _m_pfsub(m1, m2);
-}
-
-__m64 test_m_pfsubr(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfsubr
-  // GCC-LABEL: define{{.*}} double @test_m_pfsubr
-  // CHECK: @llvm.x86.3dnow.pfsubr
-  return _m_pfsubr(m1, m2);
-}
-
-__m64 test_m_pi2fd(__m64 m) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pi2fd
-  // GCC-LABEL: define{{.*}} double @test_m_pi2fd
-  // CHECK: @llvm.x86.3dnow.pi2fd
-  return _m_pi2fd(m);
-}
-
-__m64 test_m_pmulhrw(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pmulhrw
-  // GCC-LABEL: define{{.*}} double @test_m_pmulhrw
-  // CHECK: @llvm.x86.3dnow.pmulhrw
-  return _m_pmulhrw(m1, m2);
-}
-
-__m64 test_m_pf2iw(__m64 m) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pf2iw
-  // GCC-LABEL: define{{.*}} double @test_m_pf2iw
-  // CHECK: @llvm.x86.3dnowa.pf2iw
-  return _m_pf2iw(m);
-}
-
-__m64 test_m_pfnacc(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfnacc
-  // GCC-LABEL: define{{.*}} double @test_m_pfnacc
-  // CHECK: @llvm.x86.3dnowa.pfnacc
-  return _m_pfnacc(m1, m2);
-}
-
-__m64 test_m_pfpnacc(__m64 m1, __m64 m2) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pfpnacc
-  // GCC-LABEL: define{{.*}} double @test_m_pfpnacc
-  // CHECK: @llvm.x86.3dnowa.pfpnacc
-  return _m_pfpnacc(m1, m2);
-}
-
-__m64 test_m_pi2fw(__m64 m) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pi2fw
-  // GCC-LABEL: define{{.*}} double @test_m_pi2fw
-  // CHECK: @llvm.x86.3dnowa.pi2fw
-  return _m_pi2fw(m);
-}
-
-__m64 test_m_pswapdsf(__m64 m) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pswapdsf
-  // GCC-LABEL: define{{.*}} double @test_m_pswapdsf
-  // CHECK: @llvm.x86.3dnowa.pswapd
-  return _m_pswapdsf(m);
-}
-
-__m64 test_m_pswapdsi(__m64 m) {
-  // PS4-LABEL: define{{.*}} i64 @test_m_pswapdsi
-  // GCC-LABEL: define{{.*}} double @test_m_pswapdsi
-  // CHECK: @llvm.x86.3dnowa.pswapd
-  return _m_pswapdsi(m);
-}
diff --git a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c
index e0f220dbeafcc..de31a4db5b0c1 100644
--- a/clang/test/CodeGen/builtins-x86.c
+++ b/clang/test/CodeGen/builtins-x86.c
@@ -3,7 +3,6 @@
 // RUN: %clang_cc1 -DUSE_64 -DOPENCL -x cl -cl-std=CL2.0 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-feature +shstk -target-feature +wbnoinvd -target-feature +cldemote -emit-llvm -o %t %s
 
 #ifdef USE_ALL
-#define USE_3DNOW
 #define USE_64
 #define USE_SSE4
 #endif
@@ -96,9 +95,6 @@ void f0(void) {
   V4s    tmp_V4s;
   V2i    tmp_V2i;
   V1LLi  tmp_V1LLi;
-#ifdef USE_3DNOW
-  V2f    tmp_V2f;
-#endif
 
   // 128-bit
   V16c   tmp_V16c;
@@ -513,33 +509,7 @@ void f0(void) {
   __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d);
   __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f);
 
-#ifdef USE_3DNOW
-  tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c);
-  tmp_V2i = __builtin_ia32_pf2id(tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f);
-  tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f);
-  tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f);
-  tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i);
-  tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s);
-  tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f);
-  tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i);
-  tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f);
-  tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i);
-
+#if USE_ALL
   tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4);
   tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i);
   tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i);
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 3022ed1250d59..1790e27e7502b 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -8,8 +8,13 @@
 
 // RUN: %clang --target=i386 -march=i386 -mmmx -m3dnow -m3dnowa %s -### 2>&1 | FileCheck -check-prefix=MMX %s
 // RUN: %clang --target=i386 -march=i386 -mno-mmx -mno-3dnow -mno-3dnowa %s -### 2>&1 | FileCheck -check-prefix=NO-MMX %s
-// MMX: "-target-feature" "+mmx" "-target-feature" "+3dnow" "-target-feature" "+3dnowa"
-// NO-MMX: "-target-feature" "-mmx" "-target-feature" "-3dnow" "-target-feature" "-3dnowa"
+// MMX: warning: the clang compiler does not support '-m3dnowa'
+// MMX: warning: the clang compiler does not support '-m3dnow'
+// MMX-NOT: "3dnow"
+// MMX: "-target-feature" "+mmx"
+// MMX-NOT: "3dnow"
+// NO-MMX-NOT: warning
+// NO-MMX: "-target-feature" "-mmx"
 
 // RUN: %clang --target=i386 -march=i386 -msse -msse2 -msse3 -mssse3 -msse4a -msse4.1 -msse4.2 %s -### 2>&1 | FileCheck -check-prefix=SSE %s
 // RUN: %clang --target=i386 -march=i386 -mno-sse -mno-sse2 -mno-sse3 -mno-ssse3 -mno-sse4a -mno-sse4.1 -mno-sse4.2 %s -### 2>&1 | FileCheck -check-prefix=NO-SSE %s
diff --git a/clang/test/Headers/mm3dnow.c b/clang/test/Headers/mm3dnow.c
index 255483cb9b836..f63882688b612 100644
--- a/clang/test/Headers/mm3dnow.c
+++ b/clang/test/Headers/mm3dnow.c
@@ -5,12 +5,9 @@
 #if defined(i386) || defined(__x86_64__)
 #include <mm3dnow.h>
 
-int __attribute__((__target__(("3dnow")))) foo(int a) {
-  _m_femms();
+int foo(void *x) {
+  _m_prefetch(x);
+  _m_prefetchw(x);
   return 4;
 }
-
-__m64 __attribute__((__target__(("3dnowa")))) bar(__m64 a) {
-  return _m_pf2iw(a);
-}
 #endif
diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index f0a2ef851287f..6f470d85ca563 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -99,7 +99,6 @@
 // RUN: %clang -march=winchip2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP2_M32
-// CHECK_WINCHIP2_M32: #define __3dNOW__ 1
 // CHECK_WINCHIP2_M32: #define __MMX__ 1
 // CHECK_WINCHIP2_M32: #define __i386 1
 // CHECK_WINCHIP2_M32: #define __i386__ 1
@@ -115,7 +114,6 @@
 // RUN: %clang -march=c3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_M32
-// CHECK_C3_M32: #define __3dNOW__ 1
 // CHECK_C3_M32: #define __MMX__ 1
 // CHECK_C3_M32: #define __i386 1
 // CHECK_C3_M32: #define __i386__ 1
@@ -2707,8 +2705,6 @@
 // RUN: %clang -march=geode -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_GEODE_M32
-// CHECK_GEODE_M32: #define __3dNOW_A__ 1
-// CHECK_GEODE_M32: #define __3dNOW__ 1
 // CHECK_GEODE_M32: #define __MMX__ 1
 // CHECK_GEODE_M32: #define __geode 1
 // CHECK_GEODE_M32: #define __geode__ 1
@@ -2739,7 +2735,6 @@
 // RUN: %clang -march=k6-2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_2_M32
-// CHECK_K6_2_M32: #define __3dNOW__ 1
 // CHECK_K6_2_M32: #define __MMX__ 1
 // CHECK_K6_2_M32: #define __i386 1
 // CHECK_K6_2_M32: #define __i386__ 1
@@ -2757,7 +2752,6 @@
 // RUN: %clang -march=k6-3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_3_M32
-// CHECK_K6_3_M32: #define __3dNOW__ 1
 // CHECK_K6_3_M32: #define __MMX__ 1
 // CHECK_K6_3_M32: #define __i386 1
 // CHECK_K6_3_M32: #define __i386__ 1
@@ -2775,8 +2769,6 @@
 // RUN: %clang -march=athlon -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_M32
-// CHECK_ATHLON_M32: #define __3dNOW_A__ 1
-// CHECK_ATHLON_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_M32: #define __MMX__ 1
 // CHECK_ATHLON_M32: #define __athlon 1
 // CHECK_ATHLON_M32: #define __athlon__ 1
@@ -2792,8 +2784,6 @@
 // RUN: %clang -march=athlon-tbird -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_TBIRD_M32
-// CHECK_ATHLON_TBIRD_M32: #define __3dNOW_A__ 1
-// CHECK_ATHLON_TBIRD_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_TBIRD_M32: #define __MMX__ 1
 // CHECK_ATHLON_TBIRD_M32: #define __athlon 1
 // CHECK_ATHLON_TBIRD_M32: #define __athlon__ 1
@@ -2809,8 +2799,6 @@
 // RUN: %clang -march=athlon-4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_4_M32
-// CHECK_ATHLON_4_M32: #define __3dNOW_A__ 1
-// CHECK_ATHLON_4_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_4_M32: #define __MMX__ 1
 // CHECK_ATHLON_4_M32: #define __SSE__ 1
 // CHECK_ATHLON_4_M32: #define __athlon 1
@@ -2829,8 +2817,6 @@
 // RUN: %clang -march=athlon-xp -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_XP_M32
-// CHECK_ATHLON_XP_M32: #define __3dNOW_A__ 1
-// CHECK_ATHLON_XP_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_XP_M32: #define __MMX__ 1
 // CHECK_ATHLON_XP_M32: #define __SSE__ 1
 // CHECK_ATHLON_XP_M32: #define __athlon 1
@@ -2849,8 +2835,6 @@
 // RUN: %clang -march=athlon-mp -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_MP_M32
-// CHECK_ATHLON_MP_M32: #define __3dNOW_A__ 1
-// CHECK_ATHLON_MP_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_MP_M32: #define __MMX__ 1
 // CHECK_ATHLON_MP_M32: #define __SSE__ 1
 // CHECK_ATHLON_MP_M32: #define __athlon 1
@@ -2881,8 +2865,6 @@
 // RUN: %clang -march=k8 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_M32
-// CHECK_K8_M32: #define __3dNOW_A__ 1
-// CHECK_K8_M32: #define __3dNOW__ 1
 // CHECK_K8_M32: #define __MMX__ 1
 // CHECK_K8_M32: #define __SSE2__ 1
 // CHECK_K8_M32: #define __SSE__ 1
@@ -2896,8 +2878,6 @@
 // RUN: %clang -march=k8 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_M64
-// CHECK_K8_M64: #define __3dNOW_A__ 1
-// CHECK_K8_M64: #define __3dNOW__ 1
 // CHECK_K8_M64: #define __MMX__ 1
 // CHECK_K8_M64: #define __SSE2_MATH__ 1
 // CHECK_K8_M64: #define __SSE2__ 1
@@ -2914,8 +2894,6 @@
 // RUN: %clang -march=k8-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_SSE3_M32
-// CHECK_K8_SSE3_M32: #define __3dNOW_A__ 1
-// CHECK_K8_SSE3_M32: #define __3dNOW__ 1
 // CHECK_K8_SSE3_M32: #define __MMX__ 1
 // CHECK_K8_SSE3_M32: #define __SSE2__ 1
 // CHECK_K8_SSE3_M32: #define __SSE3__ 1
@@ -2930,8 +2908,6 @@
 // RUN: %clang -march=k8-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_SSE3_M64
-// CHECK_K8_SSE3_M64: #define __3dNOW_A__ 1
-// CHECK_K8_SSE3_M64: #define __3dNOW__ 1
 // CHECK_K8_SSE3_M64: #define __MMX__ 1
 // CHECK_K8_SSE3_M64: #define __SSE2_MATH__ 1
 // CHECK_K8_SSE3_M64: #define __SSE2__ 1
@@ -2949,8 +2925,6 @@
 // RUN: %clang -march=opteron -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_M32
-// CHECK_OPTERON_M32: #define __3dNOW_A__ 1
-// CHECK_OPTERON_M32: #define __3dNOW__ 1
 // CHECK_OPTERON_M32: #define __MMX__ 1
 // CHECK_OPTERON_M32: #define __SSE2__ 1
 // CHECK_OPTERON_M32: #define __SSE__ 1
@@ -2964,8 +2938,6 @@
 // RUN: %clang -march=opteron -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_M64
-// CHECK_OPTERON_M64: #define __3dNOW_A__ 1
-// CHECK_OPTERON_M64: #define __3dNOW__ 1
 // CHECK_OPTERON_M64: #define __MMX__ 1
 // CHECK_OPTERON_M64: #define __SSE2_MATH__ 1
 // CHECK_OPTERON_M64: #define __SSE2__ 1
@@ -2982,8 +2954,6 @@
 // RUN: %clang -march=opteron-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_SSE3_M32
-// CHECK_OPTERON_SSE3_M32: #define __3dNOW_A__ 1
-// CHECK_OPTERON_SSE3_M32: #define __3dNOW__ 1
 // CHECK_OPTERON_SSE3_M32: #define __MMX__ 1
 // CHECK_OPTERON_SSE3_M32: #define __SSE2__ 1
 // CHECK_OPTERON_SSE3_M32: #define __SSE3__ 1
@@ -2998,8 +2968,6 @@
 // RUN: %clang -march=opteron-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_SSE3_M64
-// CHECK_OPTERON_SSE3_M64: #define __3dNOW_A__ 1
-// CHECK_OPTERON_SSE3_M64: #define __3dNOW__ 1
 // CHECK_OPTERON_SSE3_M64: #define __MMX__ 1
 // CHECK_OPTERON_SSE3_M64: #define __SSE2_MATH__ 1
 // CHECK_OPTERON_SSE3_M64: #define __SSE2__ 1
@@ -3017,8 +2985,6 @@
 // RUN: %clang -march=athlon64 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_M32
-// CHECK_ATHLON64_M32: #define __3dNOW_A__ 1
-// CHECK_ATHLON64_M32: #define __3dNOW__ 1
 // CHECK_ATHLON64_M32: #define __MMX__ 1
 // CHECK_ATHLON64_M32: #define __SSE2__ 1
 // CHECK_ATHLON64_M32: #define __SSE__ 1
@@ -3032,8 +2998,6 @@
 // RUN: %clang -march=athlon64 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_M64
-// CHECK_ATHLON64_M64: #define __3dNOW_A__ 1
-// CHECK_ATHLON64_M64: #define __3dNOW__ 1
 // CHECK_ATHLON64_M64: #define __MMX__ 1
 // CHECK_ATHLON64_M64: #define __SSE2_MATH__ 1
 // CHECK_ATHLON64_M64: #define __SSE2__ 1
@@ -3050,8 +3014,6 @@
 // RUN: %clang -march=athlon64-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_SSE3_M32
-// CHECK_ATHLON64_SSE3_M32: #define __3dNOW_A__ 1
-// CHECK_ATHLON64_SSE3_M32: #define __3dNOW__ 1
 // CHECK_ATHLON64_SSE3_M32: #define __MMX__ 1
 // CHECK_ATHLON64_SSE3_M32: #define __SSE2__ 1
 // CHECK_ATHLON64_SSE3_M32: #define __SSE3__ 1
@@ -3066,8 +3028,6 @@
 // RUN: %clang -march=athlon64-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_SSE3_M64
-// CHECK_ATHLON64_SSE3_M64: #define __3dNOW_A__ 1
-// CHECK_ATHLON64_SSE3_M64: #define __3dNOW__ 1
 // CHECK_ATHLON64_SSE3_M64: #define __MMX__ 1
 // CHECK_ATHLON64_SSE3_M64: #define __SSE2_MATH__ 1
 // CHECK_ATHLON64_SSE3_M64: #define __SSE2__ 1
@@ -3085,8 +3045,6 @@
 // RUN: %clang -march=athlon-fx -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_FX_M32
-// CHECK_ATHLON_FX_M32: #define __3dNOW_A__ 1
-// CHECK_ATHLON_FX_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_FX_M32: #define __MMX__ 1
 // CHECK_ATHLON_FX_M32: #define __SSE2__ 1
 // CHECK_ATHLON_FX_M32: #define __SSE__ 1
@@ -3100,8 +3058,6 @@
 // RUN: %clang -march=athlon-fx -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_FX_M64
-// CHECK_ATHLON_FX_M64: #define __3dNOW_A__ 1
-// CHECK_ATHLON_FX_M64: #define __3dNOW__ 1
 // CHECK_ATHLON_FX_M64: #define __MMX__ 1
 // CHECK_ATHLON_FX_M64: #define __SSE2_MATH__ 1
 // CHECK_ATHLON_FX_M64: #define __SSE2__ 1
@@ -3118,8 +3074,6 @@
 // RUN: %clang -march=amdfam10 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M32
-// CHECK_AMDFAM10_M32: #define __3dNOW_A__ 1
-// CHECK_AMDFAM10_M32: #define __3dNOW__ 1
 // CHECK_AMDFAM10_M32: #define __LAHF_SAHF__ 1
 // CHECK_AMDFAM10_M32: #define __LZCNT__ 1
 // CHECK_AMDFAM10_M32: #define __MMX__ 1
@@ -3141,8 +3095,6 @@
 // RUN: %clang -march=amdfam10 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M64
-// CHECK_AMDFAM10_M64: #define __3dNOW_A__ 1
-// CHECK_AMDFAM10_M64: #define __3dNOW__ 1
 // CHECK_AMDFAM10_M64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1
 // CHECK_AMDFAM10_M64: #define __LAHF_SAHF__ 1
 // CHECK_AMDFAM10_M64: #define __LZCNT__ 1
@@ -3167,8 +3119,6 @@
 // RUN: %clang -march=btver1 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER1_M32
-// CHECK_BTVER1_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_BTVER1_M32-NOT: #define __3dNOW__ 1
 // CHECK_BTVER1_M32: #define __LAHF_SAHF__ 1
 // CHECK_BTVER1_M32: #define __LZCNT__ 1
 // CHECK_BTVER1_M32: #define __MMX__ 1
@@ -3190,8 +3140,6 @@
 // RUN: %clang -march=btver1 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER1_M64
-// CHECK_BTVER1_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_BTVER1_M64-NOT: #define __3dNOW__ 1
 // CHECK_BTVER1_M64: #define __LAHF_SAHF__ 1
 // CHECK_BTVER1_M64: #define __LZCNT__ 1
 // CHECK_BTVER1_M64: #define __MMX__ 1
@@ -3215,8 +3163,6 @@
 // RUN: %clang -march=btver2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER2_M32
-// CHECK_BTVER2_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_BTVER2_M32-NOT: #define __3dNOW__ 1
 // CHECK_BTVER2_M32: #define __AES__ 1
 // CHECK_BTVER2_M32: #define __AVX__ 1
 // CHECK_BTVER2_M32: #define __BMI__ 1
@@ -3245,8 +3191,6 @@
 // RUN: %clang -march=btver2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER2_M64
-// CHECK_BTVER2_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_BTVER2_M64-NOT: #define __3dNOW__ 1
 // CHECK_BTVER2_M64: #define __AES__ 1
 // CHECK_BTVER2_M64: #define __AVX__ 1
 // CHECK_BTVER2_M64: #define __BMI__ 1
@@ -3277,8 +3221,6 @@
 // RUN: %clang -march=bdver1 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER1_M32
-// CHECK_BDVER1_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_BDVER1_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER1_M32: #define __AES__ 1
 // CHECK_BDVER1_M32: #define __AVX__ 1
 // CHECK_BDVER1_M32: #define __FMA4__ 1
@@ -3308,8 +3250,6 @@
 // RUN: %clang -march=bdver1 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER1_M64
-// CHECK_BDVER1_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_BDVER1_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER1_M64: #define __AES__ 1
 // CHECK_BDVER1_M64: #define __AVX__ 1
 // CHECK_BDVER1_M64: #define __FMA4__ 1
@@ -3341,8 +3281,6 @@
 // RUN: %clang -march=bdver2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER2_M32
-// CHECK_BDVER2_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_BDVER2_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER2_M32: #define __AES__ 1
 // CHECK_BDVER2_M32: #define __AVX__ 1
 // CHECK_BDVER2_M32: #define __BMI__ 1
@@ -3376,8 +3314,6 @@
 // RUN: %clang -march=bdver2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER2_M64
-// CHECK_BDVER2_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_BDVER2_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER2_M64: #define __AES__ 1
 // CHECK_BDVER2_M64: #define __AVX__ 1
 // CHECK_BDVER2_M64: #define __BMI__ 1
@@ -3413,8 +3349,6 @@
 // RUN: %clang -march=bdver3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER3_M32
-// CHECK_BDVER3_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_BDVER3_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER3_M32: #define __AES__ 1
 // CHECK_BDVER3_M32: #define __AVX__ 1
 // CHECK_BDVER3_M32: #define __BMI__ 1
@@ -3450,8 +3384,6 @@
 // RUN: %clang -march=bdver3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER3_M64
-// CHECK_BDVER3_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_BDVER3_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER3_M64: #define __AES__ 1
 // CHECK_BDVER3_M64: #define __AVX__ 1
 // CHECK_BDVER3_M64: #define __BMI__ 1
@@ -3489,8 +3421,6 @@
 // RUN: %clang -march=bdver4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER4_M32
-// CHECK_BDVER4_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_BDVER4_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER4_M32: #define __AES__ 1
 // CHECK_BDVER4_M32: #define __AVX2__ 1
 // CHECK_BDVER4_M32: #define __AVX__ 1
@@ -3529,8 +3459,6 @@
 // RUN: %clang -march=bdver4 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER4_M64
-// CHECK_BDVER4_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_BDVER4_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER4_M64: #define __AES__ 1
 // CHECK_BDVER4_M64: #define __AVX2__ 1
 // CHECK_BDVER4_M64: #define __AVX__ 1
@@ -3571,8 +3499,6 @@
 // RUN: %clang -march=znver1 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER1_M32
-// CHECK_ZNVER1_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_ZNVER1_M32-NOT: #define __3dNOW__ 1
 // CHECK_ZNVER1_M32: #define __ADX__ 1
 // CHECK_ZNVER1_M32: #define __AES__ 1
 // CHECK_ZNVER1_M32: #define __AVX2__ 1
@@ -3618,8 +3544,6 @@
 // RUN: %clang -march=znver1 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER1_M64
-// CHECK_ZNVER1_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_ZNVER1_M64-NOT: #define __3dNOW__ 1
 // CHECK_ZNVER1_M64: #define __ADX__ 1
 // CHECK_ZNVER1_M64: #define __AES__ 1
 // CHECK_ZNVER1_M64: #define __AVX2__ 1
@@ -3668,8 +3592,6 @@
 // RUN: %clang -march=znver2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER2_M32
-// CHECK_ZNVER2_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_ZNVER2_M32-NOT: #define __3dNOW__ 1
 // CHECK_ZNVER2_M32: #define __ADX__ 1
 // CHECK_ZNVER2_M32: #define __AES__ 1
 // CHECK_ZNVER2_M32: #define __AVX2__ 1
@@ -3719,8 +3641,6 @@
 // RUN: %clang -march=znver2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER2_M64
-// CHECK_ZNVER2_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_ZNVER2_M64-NOT: #define __3dNOW__ 1
 // CHECK_ZNVER2_M64: #define __ADX__ 1
 // CHECK_ZNVER2_M64: #define __AES__ 1
 // CHECK_ZNVER2_M64: #define __AVX2__ 1
@@ -3772,8 +3692,6 @@
 // RUN: %clang -march=znver3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER3_M32
-// CHECK_ZNVER3_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_ZNVER3_M32-NOT: #define __3dNOW__ 1
 // CHECK_ZNVER3_M32: #define __ADX__ 1
 // CHECK_ZNVER3_M32: #define __AES__ 1
 // CHECK_ZNVER3_M32: #define __AVX2__ 1
@@ -3823,8 +3741,6 @@
 // RUN: %clang -march=znver3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER3_M64
-// CHECK_ZNVER3_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_ZNVER3_M64-NOT: #define __3dNOW__ 1
 // CHECK_ZNVER3_M64: #define __ADX__ 1
 // CHECK_ZNVER3_M64: #define __AES__ 1
 // CHECK_ZNVER3_M64: #define __AVX2__ 1
@@ -3878,8 +3794,6 @@
 // RUN: %clang -march=znver4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER4_M32
-// CHECK_ZNVER4_M32-NOT: #define __3dNOW_A__ 1
-// CHECK_ZNVER4_M32-NOT: #define __3dNOW__ 1
 // CHECK_ZNVER4_M32: #define __ADX__ 1
 // CHECK_ZNVER4_M32: #define __AES__ 1
 // CHECK_ZNVER4_M32: #define __AVX2__ 1
@@ -3944,8 +3858,6 @@
 // RUN: %clang -march=znver4 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER4_M64
-// CHECK_ZNVER4_M64-NOT: #define __3dNOW_A__ 1
-// CHECK_ZNVER4_M64-NOT: #define __3dNOW__ 1
 // CHECK_ZNVER4_M64: #define __ADX__ 1
 // CHECK_ZNVER4_M64: #define __AES__ 1
 // CHECK_ZNVER4_M64: #define __AVX2__ 1
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 3e63e2c77fddf..80d8711f1aadb 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -348,12 +348,12 @@
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -m3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=3DNOWPRFCHW %s
 
-// 3DNOWPRFCHW: #define __3dNOW__ 1
+// 3DNOWPRFCHW-NOT: #define __3dNOW__ 1
 // 3DNOWPRFCHW-NOT: #define __PRFCHW__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mno-prfchw -m3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=3DNOWNOPRFCHW %s
 
-// 3DNOWNOPRFCHW: #define __3dNOW__ 1
+// 3DNOWNOPRFCHW-NOT: #define __3dNOW__ 1
 // 3DNOWNOPRFCHW-NOT: #define __PRFCHW__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -mno-3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NO3DNOWPRFCHW %s
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index aee804047e1b0..51a80e7be22ce 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -133,57 +133,57 @@ let TargetPrefix = "x86" in {
 // 3DNow!
 
 let TargetPrefix = "x86" in {
-  def int_x86_3dnow_pavgusb : ClangBuiltin<"__builtin_ia32_pavgusb">,
+  def int_x86_3dnow_pavgusb :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pf2id : ClangBuiltin<"__builtin_ia32_pf2id">,
+  def int_x86_3dnow_pf2id :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfacc : ClangBuiltin<"__builtin_ia32_pfacc">,
+  def int_x86_3dnow_pfacc :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfadd : ClangBuiltin<"__builtin_ia32_pfadd">,
+  def int_x86_3dnow_pfadd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpeq : ClangBuiltin<"__builtin_ia32_pfcmpeq">,
+  def int_x86_3dnow_pfcmpeq :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpge : ClangBuiltin<"__builtin_ia32_pfcmpge">,
+  def int_x86_3dnow_pfcmpge :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpgt : ClangBuiltin<"__builtin_ia32_pfcmpgt">,
+  def int_x86_3dnow_pfcmpgt :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfmax : ClangBuiltin<"__builtin_ia32_pfmax">,
+  def int_x86_3dnow_pfmax :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfmin : ClangBuiltin<"__builtin_ia32_pfmin">,
+  def int_x86_3dnow_pfmin :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfmul : ClangBuiltin<"__builtin_ia32_pfmul">,
+  def int_x86_3dnow_pfmul :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfrcp : ClangBuiltin<"__builtin_ia32_pfrcp">,
+  def int_x86_3dnow_pfrcp :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrcpit1 : ClangBuiltin<"__builtin_ia32_pfrcpit1">,
+  def int_x86_3dnow_pfrcpit1 :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfrcpit2 : ClangBuiltin<"__builtin_ia32_pfrcpit2">,
+  def int_x86_3dnow_pfrcpit2 :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfrsqrt : ClangBuiltin<"__builtin_ia32_pfrsqrt">,
+  def int_x86_3dnow_pfrsqrt :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrsqit1 : ClangBuiltin<"__builtin_ia32_pfrsqit1">,
+  def int_x86_3dnow_pfrsqit1 :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfsub : ClangBuiltin<"__builtin_ia32_pfsub">,
+  def int_x86_3dnow_pfsub :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pfsubr : ClangBuiltin<"__builtin_ia32_pfsubr">,
+  def int_x86_3dnow_pfsubr :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnow_pi2fd : ClangBuiltin<"__builtin_ia32_pi2fd">,
+  def int_x86_3dnow_pi2fd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pmulhrw : ClangBuiltin<"__builtin_ia32_pmulhrw">,
+  def int_x86_3dnow_pmulhrw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
 }
@@ -192,15 +192,15 @@ let TargetPrefix = "x86" in {
 // 3DNow! extensions
 
 let TargetPrefix = "x86" in {
-  def int_x86_3dnowa_pf2iw : ClangBuiltin<"__builtin_ia32_pf2iw">,
+  def int_x86_3dnowa_pf2iw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnowa_pfnacc : ClangBuiltin<"__builtin_ia32_pfnacc">,
+  def int_x86_3dnowa_pfnacc :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnowa_pfpnacc : ClangBuiltin<"__builtin_ia32_pfpnacc">,
+  def int_x86_3dnowa_pfpnacc :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                             [IntrNoMem]>;
-  def int_x86_3dnowa_pi2fw : ClangBuiltin<"__builtin_ia32_pi2fw">,
+  def int_x86_3dnowa_pi2fw :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
   def int_x86_3dnowa_pswapd :
       DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
@@ -2332,7 +2332,8 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_mmx_emms  : ClangBuiltin<"__builtin_ia32_emms">,
               Intrinsic<[], [], []>;
-  def int_x86_mmx_femms : ClangBuiltin<"__builtin_ia32_femms">,
+  // 3DNow! extension
+  def int_x86_mmx_femms :
               Intrinsic<[], [], []>;
 }
 

>From d2b65609edc95931bc852bdb1145764a96a6dc36 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Thu, 20 Jun 2024 17:37:33 -0400
Subject: [PATCH 2/6] LLVM: Remove support for 3DNow! intrinsics.

This set of instructions was only supported by AMD chips, starting
with the K6-2 (introduced in 1998) and ending before the "Bulldozer"
family (2011). They were never much used, as they were effectively
superseded by the more-widely-implemented SSE (first implemented on
the AMD side in the Athlon XP in 2001).

This is being done as a precursor to the general removal of MMX
register usage. Since there is almost no usage of the 3DNow!
intrinsics, and no modern hardware even implements them, simple
removal seems like the best option.

Works towards issue #41665.
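
Code that used the FEMMS path (the _m_femms builtin in C, or the
llvm.x86.mmx.femms intrinsic in IR) can switch to the plain EMMS
path, which clears the MMX state for the same purpose and remains
supported. A minimal sketch, with an illustrative function name:

  #include <mmintrin.h>

  /* Clear MMX state at the end of an MMX region; EMMS stands in for
     the removed 3DNow! FEMMS variant. Illustrative sketch only. */
  static inline void finish_mmx_region(void) {
    _mm_empty(); /* was: _m_femms(); */
  }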
---
 llvm/docs/ReleaseNotes.rst                    |   3 +
 llvm/include/llvm/IR/IntrinsicsX86.td         |  80 --
 llvm/lib/Target/X86/X86.td                    |  27 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   2 +-
 llvm/lib/Target/X86/X86Instr3DNow.td          |  87 +-
 llvm/lib/Target/X86/X86InstrPredicates.td     |   3 -
 llvm/lib/Target/X86/X86Schedule.td            |   2 +-
 llvm/lib/Target/X86/X86ScheduleZnver3.td      |   2 +-
 llvm/lib/Target/X86/X86ScheduleZnver4.td      |   2 +-
 llvm/lib/Target/X86/X86Subtarget.cpp          |   2 +-
 llvm/lib/Target/X86/X86Subtarget.h            |  22 +-
 llvm/test/CodeGen/X86/3dnow-intrinsics.ll     | 896 ------------------
 llvm/test/CodeGen/X86/commute-3dnow.ll        | 270 ------
 .../CodeGen/X86/expand-vr64-gr64-copy.mir     |  17 +-
 llvm/test/CodeGen/X86/pr35982.ll              |  45 -
 llvm/test/CodeGen/X86/prefetch.ll             |  23 +-
 llvm/test/CodeGen/X86/stack-folding-3dnow.ll  | 387 --------
 17 files changed, 70 insertions(+), 1800 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/3dnow-intrinsics.ll
 delete mode 100644 llvm/test/CodeGen/X86/commute-3dnow.ll
 delete mode 100644 llvm/test/CodeGen/X86/stack-folding-3dnow.ll

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 61f63c59cbc57..088f48d135e1a 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -196,6 +196,12 @@ Changes to the X86 Backend
 - Removed knl/knm specific ISA intrinsics: AVX512PF, AVX512ER, PREFETCHWT1,
   while assembly encoding/decoding supports are kept.
 
+- Removed ``3DNow!``-specific ISA intrinsics and codegen support. The
+  ``3dnow`` and ``3dnowa`` target features are no longer supported. The
+  intrinsics ``llvm.x86.3dnow.*``, ``llvm.x86.3dnowa.*``, and
+  ``llvm.x86.mmx.femms`` have been removed. Assembly encoding/decoding for
+  the corresponding instructions remains supported.
+
 Changes to the OCaml bindings
 -----------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 51a80e7be22ce..adc46f9789ebb 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -129,83 +129,6 @@ let TargetPrefix = "x86" in {
               Intrinsic<[], [llvm_ptr_ty], []>;
 }
 
-//===----------------------------------------------------------------------===//
-// 3DNow!
-
-let TargetPrefix = "x86" in {
-  def int_x86_3dnow_pavgusb :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pf2id :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfacc :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfadd :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpeq :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpge :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfcmpgt :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfmax :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfmin :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfmul :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfrcp :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrcpit1 :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfrcpit2 :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfrsqrt :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pfrsqit1 :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfsub :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pfsubr :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnow_pi2fd :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnow_pmulhrw :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-}
-
-//===----------------------------------------------------------------------===//
-// 3DNow! extensions
-
-let TargetPrefix = "x86" in {
-  def int_x86_3dnowa_pf2iw :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnowa_pfnacc :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnowa_pfpnacc :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
-                            [IntrNoMem]>;
-  def int_x86_3dnowa_pi2fw :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-  def int_x86_3dnowa_pswapd :
-      DefaultAttrsIntrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
-}
-
 //===----------------------------------------------------------------------===//
 // SSE1
 
@@ -2332,9 +2255,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_mmx_emms  : ClangBuiltin<"__builtin_ia32_emms">,
               Intrinsic<[], [], []>;
-  // 3DNow! extension
-  def int_x86_mmx_femms :
-              Intrinsic<[], [], []>;
 }
 
 // Integer arithmetic ops.
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 628ff560017ed..242ba2937aa4b 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -86,14 +86,8 @@ def FeatureSSE42   : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
 // The MMX subtarget feature is separate from the rest of the SSE features
 // because it's important (for odd compatibility reasons) to be able to
 // turn it off explicitly while allowing SSE+ to be on.
-def FeatureMMX     : SubtargetFeature<"mmx","X863DNowLevel", "MMX",
+def FeatureMMX     : SubtargetFeature<"mmx","HasMMX", "true",
                                       "Enable MMX instructions">;
-def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
-                                      "Enable 3DNow! instructions",
-                                      [FeatureMMX]>;
-def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
-                                      "Enable 3DNow! Athlon instructions",
-                                      [Feature3DNow]>;
 // All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
 // feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
 // without disabling 64-bit mode. Nothing should imply this feature bit. It
@@ -1332,7 +1326,6 @@ def ProcessorFeatures {
   list<SubtargetFeature> BarcelonaFeatures = [FeatureX87,
                                               FeatureCX8,
                                               FeatureSSE4A,
-                                              Feature3DNowA,
                                               FeatureFXSR,
                                               FeatureNOPL,
                                               FeatureCX16,
@@ -1825,32 +1818,32 @@ def : ProcModel<P, SapphireRapidsModel,
 
 def : Proc<"k6",   [FeatureX87, FeatureCX8, FeatureMMX],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"k6-2", [FeatureX87, FeatureCX8, Feature3DNow],
+def : Proc<"k6-2", [FeatureX87, FeatureCX8],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"k6-3", [FeatureX87, FeatureCX8, Feature3DNow],
+def : Proc<"k6-3", [FeatureX87, FeatureCX8],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 
 foreach P = ["athlon", "athlon-tbird"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV, Feature3DNowA,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV,
                  FeatureNOPL],
                 [TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
   def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV,
-                 FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL],
+                 FeatureSSE1, FeatureFXSR, FeatureNOPL],
                 [TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE2, Feature3DNowA,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE2,
                  FeatureFXSR, FeatureNOPL, FeatureX86_64, FeatureCMOV],
                 [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
                  TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE3, Feature3DNowA,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE3,
                  FeatureFXSR, FeatureNOPL, FeatureCX16, FeatureCMOV,
                  FeatureX86_64],
                 [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
@@ -1891,14 +1884,14 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
 def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features,
            ProcessorFeatures.ZN4Tuning>;
 
-def : Proc<"geode",           [FeatureX87, FeatureCX8, Feature3DNowA],
+def : Proc<"geode",           [FeatureX87, FeatureCX8],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 
 def : Proc<"winchip-c6",      [FeatureX87, FeatureMMX],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"winchip2",        [FeatureX87, Feature3DNow],
+def : Proc<"winchip2",        [FeatureX87],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"c3",              [FeatureX87, Feature3DNow],
+def : Proc<"c3",              [FeatureX87],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 def : Proc<"c3-2",            [FeatureX87, FeatureCX8, FeatureMMX,
                                FeatureSSE1, FeatureFXSR, FeatureCMOV],
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2d5066dd8854c..b0f1641a271cd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -535,7 +535,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SRL_PARTS, VT, Custom);
   }
 
-  if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
+  if (Subtarget.hasSSEPrefetch())
     setOperationAction(ISD::PREFETCH      , MVT::Other, Custom);
 
   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
diff --git a/llvm/lib/Target/X86/X86Instr3DNow.td b/llvm/lib/Target/X86/X86Instr3DNow.td
index 03612de0fad94..444b1f0962955 100644
--- a/llvm/lib/Target/X86/X86Instr3DNow.td
+++ b/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
-      : I<o, F, outs, ins, asm, pat>, Requires<[Has3DNow]> {
+      : I<o, F, outs, ins, asm, pat> {
 }
 
 class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
@@ -25,66 +25,53 @@ class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
       : I3DNow<o, F, (outs VR64:$dst), ins,
           !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, ThreeDNow;
 
-multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn,
-                               X86FoldableSchedWrite sched, bit Commutable = 0,
-                               string Ver = ""> {
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn,
+                           X86FoldableSchedWrite sched, bit Commutable = 0> {
   let isCommutable = Commutable in
   def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
-    [(set VR64:$dst, (!cast<Intrinsic>(
-      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>,
-      Sched<[sched]>;
+    []>, Sched<[sched]>;
   def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
-    [(set VR64:$dst, (!cast<Intrinsic>(
-      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
-        (bitconvert (load_mmx addr:$src2))))]>,
-        Sched<[sched.Folded, sched.ReadAfterFold]>;
+    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
-multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn,
-                              X86FoldableSchedWrite sched, string Ver = ""> {
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn,
+                          X86FoldableSchedWrite sched> {
   def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
-    [(set VR64:$dst, (!cast<Intrinsic>(
-      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>,
-      Sched<[sched]>;
+    []>, Sched<[sched]>;
   def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
-    [(set VR64:$dst, (!cast<Intrinsic>(
-      !strconcat("int_x86_3dnow", Ver, "_", Mn))
-        (bitconvert (load_mmx addr:$src))))]>,
-        Sched<[sched.Folded, sched.ReadAfterFold]>;
+    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
-defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>;
-defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id", WriteCvtPS2I>;
-defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc", WriteFAdd>;
-defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd", WriteFAdd, 1>;
-defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq", WriteFAdd, 1>;
-defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge", WriteFAdd>;
-defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt", WriteFAdd>;
-defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax", WriteFAdd>;
-defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin", WriteFAdd>;
-defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul", WriteFAdd, 1>;
-defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp", WriteFAdd>;
-defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", WriteFAdd>;
-defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", WriteFAdd>;
-defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", WriteFAdd>;
-defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt", WriteFAdd>;
-defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub", WriteFAdd, 1>;
-defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>;
-defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>;
-defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>;
+defm PAVGUSB  : I3DNow_binop_rm<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>;
+defm PF2ID    : I3DNow_conv_rm<0x1D, "pf2id", WriteCvtPS2I>;
+defm PFACC    : I3DNow_binop_rm<0xAE, "pfacc", WriteFAdd>;
+defm PFADD    : I3DNow_binop_rm<0x9E, "pfadd", WriteFAdd, 1>;
+defm PFCMPEQ  : I3DNow_binop_rm<0xB0, "pfcmpeq", WriteFAdd, 1>;
+defm PFCMPGE  : I3DNow_binop_rm<0x90, "pfcmpge", WriteFAdd>;
+defm PFCMPGT  : I3DNow_binop_rm<0xA0, "pfcmpgt", WriteFAdd>;
+defm PFMAX    : I3DNow_binop_rm<0xA4, "pfmax", WriteFAdd>;
+defm PFMIN    : I3DNow_binop_rm<0x94, "pfmin", WriteFAdd>;
+defm PFMUL    : I3DNow_binop_rm<0xB4, "pfmul", WriteFAdd, 1>;
+defm PFRCP    : I3DNow_conv_rm<0x96, "pfrcp", WriteFAdd>;
+defm PFRCPIT1 : I3DNow_binop_rm<0xA6, "pfrcpit1", WriteFAdd>;
+defm PFRCPIT2 : I3DNow_binop_rm<0xB6, "pfrcpit2", WriteFAdd>;
+defm PFRSQIT1 : I3DNow_binop_rm<0xA7, "pfrsqit1", WriteFAdd>;
+defm PFRSQRT  : I3DNow_conv_rm<0x97, "pfrsqrt", WriteFAdd>;
+defm PFSUB    : I3DNow_binop_rm<0x9A, "pfsub", WriteFAdd, 1>;
+defm PFSUBR   : I3DNow_binop_rm<0xAA, "pfsubr", WriteFAdd, 1>;
+defm PI2FD    : I3DNow_conv_rm<0x0D, "pi2fd", WriteCvtI2PS>;
+defm PMULHRW  : I3DNow_binop_rm<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>;
 
-let SchedRW = [WriteEMMS],
-    Defs = [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
-            ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7] in
+let SchedRW = [WriteEMMS] in
 def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
-                   [(int_x86_mmx_femms)]>, TB;
+                   []>, TB;
 
 let SchedRW = [WriteLoad] in {
-let Predicates = [Has3DNow, NoSSEPrefetch] in
 def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
                       "prefetch\t$addr",
-                      [(prefetch addr:$addr, timm, timm, (i32 1))]>, TB;
+                      []>, TB;
 
+// Note: PREFETCHW and PREFETCHWT1 are the only instructions in this file
+// which are NOT specific to 3DNow!; they are gated by their own feature bits.
 def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
                   [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))]>,
                   TB, Requires<[HasPrefetchW]>;
@@ -94,8 +81,8 @@ def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr"
 }
 
 // "3DNowA" instructions
-defm PF2IW    : I3DNow_conv_rm_int<0x1C, "pf2iw", WriteCvtPS2I, "a">;
-defm PI2FW    : I3DNow_conv_rm_int<0x0C, "pi2fw", WriteCvtI2PS, "a">;
-defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", WriteFAdd, 0, "a">;
-defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", WriteFAdd, 0, "a">;
-defm PSWAPD   : I3DNow_conv_rm_int<0xBB, "pswapd", SchedWriteShuffle.MMX, "a">;
+defm PF2IW    : I3DNow_conv_rm<0x1C, "pf2iw", WriteCvtPS2I>;
+defm PI2FW    : I3DNow_conv_rm<0x0C, "pi2fw", WriteCvtI2PS>;
+defm PFNACC   : I3DNow_binop_rm<0x8A, "pfnacc", WriteFAdd, 0>;
+defm PFPNACC  : I3DNow_binop_rm<0x8E, "pfpnacc", WriteFAdd, 0>;
+defm PSWAPD   : I3DNow_conv_rm<0xBB, "pswapd", SchedWriteShuffle.MMX>;
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 419ff9e6f5c0f..f6038cf7a94cb 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -50,8 +50,6 @@ def HasCMOV      : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV       : Predicate<"!Subtarget->canUseCMOV()">;
 def HasNOPL      : Predicate<"Subtarget->hasNOPL()">;
 def HasMMX       : Predicate<"Subtarget->hasMMX()">;
-def Has3DNow     : Predicate<"Subtarget->hasThreeDNow()">;
-def Has3DNowA    : Predicate<"Subtarget->hasThreeDNowA()">;
 def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
 def UseSSE1      : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
 def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
@@ -141,7 +139,6 @@ def HasSGX       : Predicate<"Subtarget->hasSGX()">;
 def HasSM3       : Predicate<"Subtarget->hasSM3()">;
 def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
 def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
-def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
 def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
 def HasPREFETCHI : Predicate<"Subtarget->hasPREFETCHI()">;
 def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">;
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index faa8be05d179c..3365667bdc20e 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -481,7 +481,7 @@ defm WriteAESKeyGen : X86SchedWritePair<ReadAfterVecXLd>; // Key Generation.
 // Carry-less multiplication instructions.
 defm WriteCLMul : X86SchedWritePair<ReadAfterVecXLd>;
 
-// EMMS/FEMMS
+// EMMS
 def WriteEMMS : SchedWrite;
 
 // Load/store MXCSR
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index cbf1de8408798..29e5f0154fca2 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -1301,7 +1301,7 @@ defm : Zn3WriteResXMMPair<WriteAESKeyGen, [Zn3FPAES01], 4, [1], 1>; // Key Gener
 // Carry-less multiplication instructions.
 defm : Zn3WriteResXMMPair<WriteCLMul, [Zn3FPCLM01], 4, [4], 4>;
 
-// EMMS/FEMMS
+// EMMS
 defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
 
 // Load/store MXCSR
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 420c42928c1c4..d929b66d357c2 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1341,7 +1341,7 @@ defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Gener
 // Carry-less multiplication instructions.
 defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
 
-// EMMS/FEMMS
+// EMMS
 defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
 
 // Load/store MXCSR
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index c2e6ddd7e7fa2..11538534c238b 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -289,7 +289,7 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
     IsUnalignedMem16Slow = false;
 
   LLVM_DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
-                    << ", 3DNowLevel " << X863DNowLevel << ", 64bit "
+                    << ", MMX " << HasMMX << ", 64bit "
                     << HasX86_64 << "\n");
   if (Is64Bit && !HasX86_64)
     report_fatal_error("64-bit code requested on a subtarget that doesn't "
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 4532db134fcb4..e3cb9ee8ce190 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -55,10 +55,6 @@ class X86Subtarget final : public X86GenSubtargetInfo {
     NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512
   };
 
-  enum X863DNowEnum {
-    NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
-  };
-
   /// Which PIC style to use
   PICStyles::Style PICStyle;
 
@@ -67,9 +63,6 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported.
   X86SSEEnum X86SSELevel = NoSSE;
 
-  /// MMX, 3DNow, 3DNow Athlon, or none supported.
-  X863DNowEnum X863DNowLevel = NoThreeDNow;
-
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
   bool ATTRIBUTE = DEFAULT;
 #include "X86GenSubtargetInfo.inc"
@@ -207,21 +200,16 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   bool hasAVX2() const { return X86SSELevel >= AVX2; }
   bool hasAVX512() const { return X86SSELevel >= AVX512; }
   bool hasInt256() const { return hasAVX2(); }
-  bool hasMMX() const { return X863DNowLevel >= MMX; }
-  bool hasThreeDNow() const { return X863DNowLevel >= ThreeDNow; }
-  bool hasThreeDNowA() const { return X863DNowLevel >= ThreeDNowA; }
   bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
   bool hasPrefetchW() const {
     // The PREFETCHW instruction was added with 3DNow but later CPUs gave it
-    // its own CPUID bit as part of deprecating 3DNow. We assume the
-    // L1 version exists if the L2 version does.
-    return hasThreeDNow() || hasPRFCHW();
+    // its own CPUID bit as part of deprecating 3DNow.
+    return hasPRFCHW();
   }
   bool hasSSEPrefetch() const {
-    // We implicitly enable these when we have a write prefix supporting cache
-    // level OR if we have prfchw, but don't already have a read prefetch from
-    // 3dnow.
-    return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHI();
+    // We also implicitly enable these when we have a write prefix supporting
+    // cache level OR if we have prfchw.
+    return hasSSE1() || hasPRFCHW() || hasPREFETCHI();
   }
   bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); }
   // These are generic getters that OR together all of the thunk types
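
With hasSSEPrefetch() and hasPrefetchW() reduced to the remaining feature
bits, prefetching from IR continues to go through the generic llvm.prefetch
intrinsic rather than the removed 3DNow!-specific PREFETCH pattern. A minimal
sketch (hypothetical function; assumes a subtarget with SSE, PRFCHW, or
PREFETCHI enabled; on other subtargets the call is presumably dropped during
lowering):

  ; Target-independent read prefetch of the data cache:
  ; rw=0 (read), locality=3 (keep in all caches), cache type=1 (data).
  declare void @llvm.prefetch.p0(ptr, i32, i32, i32)

  define void @warm(ptr %p) {
  entry:
    call void @llvm.prefetch.p0(ptr %p, i32 0, i32 3, i32 1)
    ret void
  }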
diff --git a/llvm/test/CodeGen/X86/3dnow-intrinsics.ll b/llvm/test/CodeGen/X86/3dnow-intrinsics.ll
deleted file mode 100644
index a82f705b77d84..0000000000000
--- a/llvm/test/CodeGen/X86/3dnow-intrinsics.ll
+++ /dev/null
@@ -1,896 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+3dnow | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+3dnow | FileCheck %s --check-prefix=X64
-
-define <8 x i8> @test_pavgusb(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
-; X86-LABEL: test_pavgusb:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pavgusb %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%eax)
-; X86-NEXT:    retl $4
-;
-; X64-LABEL: test_pavgusb:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pavgusb %mm1, %mm0
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast x86_mmx %a.coerce to <8 x i8>
-  %1 = bitcast x86_mmx %b.coerce to <8 x i8>
-  %2 = bitcast <8 x i8> %0 to x86_mmx
-  %3 = bitcast <8 x i8> %1 to x86_mmx
-  %4 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %2, x86_mmx %3)
-  %5 = bitcast x86_mmx %4 to <8 x i8>
-  ret <8 x i8> %5
-}
-
-declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x i32> @test_pf2id(<2 x float> %a) nounwind readnone {
-; X86-LABEL: test_pf2id:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    pf2id %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pf2id:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    pf2id %mm0, %mm0
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %0)
-  %2 = bitcast x86_mmx %1 to <2 x i32>
-  ret <2 x i32> %2
-}
-
-declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfacc:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfacc %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfacc:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfacc %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfadd(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfadd:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfadd %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfadd:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfadd %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x i32> @test_pfcmpeq(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfcmpeq:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfcmpeq %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfcmpeq:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfcmpeq %mm0, %mm1
-; X64-NEXT:    movq2dq %mm1, %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  ret <2 x i32> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x i32> @test_pfcmpge(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfcmpge:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfcmpge %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfcmpge:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfcmpge %mm0, %mm1
-; X64-NEXT:    movq2dq %mm1, %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  ret <2 x i32> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x i32> @test_pfcmpgt(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfcmpgt:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfcmpgt %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfcmpgt:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfcmpgt %mm0, %mm1
-; X64-NEXT:    movq2dq %mm1, %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x i32>
-  ret <2 x i32> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfmax(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfmax:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfmax %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfmax:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfmax %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfmin(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfmin:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfmin %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfmin:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfmin %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfmul(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfmul:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfmul %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfmul:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfmul %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfrcp(<2 x float> %a) nounwind readnone {
-; X86-LABEL: test_pfrcp:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    pfrcp %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfrcp:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    pfrcp %mm0, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %0)
-  %2 = bitcast x86_mmx %1 to <2 x float>
-  ret <2 x float> %2
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfrcpit1(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfrcpit1:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfrcpit1 %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfrcpit1:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfrcpit1 %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfrcpit2(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfrcpit2:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfrcpit2 %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfrcpit2:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfrcpit2 %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfrsqrt(<2 x float> %a) nounwind readnone {
-; X86-LABEL: test_pfrsqrt:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    pfrsqrt %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfrsqrt:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    pfrsqrt %mm0, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %0)
-  %2 = bitcast x86_mmx %1 to <2 x float>
-  ret <2 x float> %2
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfrsqit1(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfrsqit1:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfrsqit1 %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfrsqit1:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfrsqit1 %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfsub(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfsub:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfsub %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfsub:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfsub %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfsubr(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfsubr:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfsubr %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfsubr:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfsubr %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pi2fd(x86_mmx %a.coerce) nounwind readnone {
-; X86-LABEL: test_pi2fd:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    pi2fd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pi2fd:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pi2fd %mm0, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast x86_mmx %a.coerce to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
-
-define <4 x i16> @test_pmulhrw(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
-; X86-LABEL: test_pmulhrw:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pmulhrw %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%eax)
-; X86-NEXT:    retl $4
-;
-; X64-LABEL: test_pmulhrw:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pmulhrw %mm1, %mm0
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast x86_mmx %a.coerce to <4 x i16>
-  %1 = bitcast x86_mmx %b.coerce to <4 x i16>
-  %2 = bitcast <4 x i16> %0 to x86_mmx
-  %3 = bitcast <4 x i16> %1 to x86_mmx
-  %4 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %2, x86_mmx %3)
-  %5 = bitcast x86_mmx %4 to <4 x i16>
-  ret <4 x i16> %5
-}
-
-declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x i32> @test_pf2iw(<2 x float> %a) nounwind readnone {
-; X86-LABEL: test_pf2iw:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    pf2iw %mm1, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pf2iw:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    pf2iw %mm0, %mm0
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %0)
-  %2 = bitcast x86_mmx %1 to <2 x i32>
-  ret <2 x i32> %2
-}
-
-declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfnacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfnacc:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfnacc %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfnacc:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfnacc %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pfpnacc(<2 x float> %a, <2 x float> %b) nounwind readnone {
-; X86-LABEL: test_pfpnacc:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 20(%ebp), %mm0
-; X86-NEXT:    movd 16(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm2
-; X86-NEXT:    punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
-; X86-NEXT:    pfpnacc %mm1, %mm2
-; X86-NEXT:    movq %mm2, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pfpnacc:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm1, %mm0
-; X64-NEXT:    movdq2q %xmm0, %mm1
-; X64-NEXT:    pfpnacc %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = bitcast <2 x float> %b to x86_mmx
-  %2 = tail call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %0, x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone
-
-define <2 x float> @test_pi2fw(x86_mmx %a.coerce) nounwind readnone {
-; X86-LABEL: test_pi2fw:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    pi2fw %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pi2fw:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    pi2fw %mm0, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast x86_mmx %a.coerce to <2 x i32>
-  %1 = bitcast <2 x i32> %0 to x86_mmx
-  %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %1)
-  %3 = bitcast x86_mmx %2 to <2 x float>
-  ret <2 x float> %3
-}
-
-declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone
-
-define <2 x float> @test_pswapdsf(<2 x float> %a) nounwind readnone {
-; X86-LABEL: test_pswapdsf:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    pswapd %mm1, %mm0 # mm0 = mm1[1,0]
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    flds {{[0-9]+}}(%esp)
-; X86-NEXT:    flds (%esp)
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pswapdsf:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    pswapd %mm0, %mm0 # mm0 = mm0[1,0]
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x float> %a to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0)
-  %2 = bitcast x86_mmx %1 to <2 x float>
-  ret <2 x float> %2
-}
-
-define <2 x i32> @test_pswapdsi(<2 x i32> %a) nounwind readnone {
-; X86-LABEL: test_pswapdsi:
-; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movd 12(%ebp), %mm0
-; X86-NEXT:    movd 8(%ebp), %mm1
-; X86-NEXT:    punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
-; X86-NEXT:    pswapd %mm1, %mm0 # mm0 = mm1[1,0]
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movl (%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_pswapdsi:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    pswapd %mm0, %mm0 # mm0 = mm0[1,0]
-; X64-NEXT:    movq2dq %mm0, %xmm0
-; X64-NEXT:    retq
-entry:
-  %0 = bitcast <2 x i32> %a to x86_mmx
-  %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0)
-  %2 = bitcast x86_mmx %1 to <2 x i32>
-  ret <2 x i32> %2
-}
-
-declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/commute-3dnow.ll b/llvm/test/CodeGen/X86/commute-3dnow.ll
deleted file mode 100644
index dc3910920365d..0000000000000
--- a/llvm/test/CodeGen/X86/commute-3dnow.ll
+++ /dev/null
@@ -1,270 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+3dnow | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+3dnow | FileCheck %s --check-prefix=X64
-
-define void @commute_m_pfadd(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pfadd:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    pfadd (%eax), %mm0
-; X86-NEXT:    pfadd (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pfadd:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    pfadd (%rsi), %mm0
-; X64-NEXT:    pfadd (%rdx), %mm0
-; X64-NEXT:    movq %mm0, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx)
-
-define void @commute_m_pfsub(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pfsub:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    pfsub (%eax), %mm0
-; X86-NEXT:    pfsubr (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pfsub:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    pfsub (%rsi), %mm0
-; X64-NEXT:    pfsubr (%rdx), %mm0
-; X64-NEXT:    movq %mm0, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx)
-
-define void @commute_m_pfsubr(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pfsubr:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    pfsubr (%eax), %mm0
-; X86-NEXT:    pfsub (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pfsubr:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    pfsubr (%rsi), %mm0
-; X64-NEXT:    pfsub (%rdx), %mm0
-; X64-NEXT:    movq %mm0, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx)
-
-define void @commute_m_pfmul(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pfmul:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    pfmul (%eax), %mm0
-; X86-NEXT:    pfmul (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pfmul:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    pfmul (%rsi), %mm0
-; X64-NEXT:    pfmul (%rdx), %mm0
-; X64-NEXT:    movq %mm0, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx)
-
-; PFMAX can't commute without fast-math.
-define void @commute_m_pfmax(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pfmax:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    movq (%ecx), %mm1
-; X86-NEXT:    pfmax (%eax), %mm0
-; X86-NEXT:    pfmax %mm0, %mm1
-; X86-NEXT:    movq %mm1, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pfmax:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    movq (%rdx), %mm1
-; X64-NEXT:    pfmax (%rsi), %mm0
-; X64-NEXT:    pfmax %mm0, %mm1
-; X64-NEXT:    movq %mm1, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx)
-
-; PFMIN can't commute without fast-math.
-define void @commute_m_pfmin(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pfmin:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    movq (%ecx), %mm1
-; X86-NEXT:    pfmin (%eax), %mm0
-; X86-NEXT:    pfmin %mm0, %mm1
-; X86-NEXT:    movq %mm1, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pfmin:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    movq (%rdx), %mm1
-; X64-NEXT:    pfmin (%rsi), %mm0
-; X64-NEXT:    pfmin %mm0, %mm1
-; X64-NEXT:    movq %mm1, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx)
-
-define void @commute_m_pfcmpeq(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pfcmpeq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    pfcmpeq (%eax), %mm0
-; X86-NEXT:    pfcmpeq (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pfcmpeq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    pfcmpeq (%rsi), %mm0
-; X64-NEXT:    pfcmpeq (%rdx), %mm0
-; X64-NEXT:    movq %mm0, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx)
-
-define void @commute_m_pavgusb(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pavgusb:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    pavgusb (%eax), %mm0
-; X86-NEXT:    pavgusb (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pavgusb:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    pavgusb (%rsi), %mm0
-; X64-NEXT:    pavgusb (%rdx), %mm0
-; X64-NEXT:    movq %mm0, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx)
-
-define void @commute_m_pmulhrw(ptr%a0, ptr%a1, ptr%a2) nounwind {
-; X86-LABEL: commute_m_pmulhrw:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movq (%edx), %mm0
-; X86-NEXT:    pmulhrw (%eax), %mm0
-; X86-NEXT:    pmulhrw (%ecx), %mm0
-; X86-NEXT:    movq %mm0, (%ecx)
-; X86-NEXT:    retl
-;
-; X64-LABEL: commute_m_pmulhrw:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %mm0
-; X64-NEXT:    pmulhrw (%rsi), %mm0
-; X64-NEXT:    pmulhrw (%rdx), %mm0
-; X64-NEXT:    movq %mm0, (%rdx)
-; X64-NEXT:    retq
-  %1 = load x86_mmx, ptr %a0
-  %2 = load x86_mmx, ptr %a1
-  %3 = load x86_mmx, ptr %a2
-  %4 = tail call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %1, x86_mmx %2)
-  %5 = tail call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %3, x86_mmx %4)
-  store x86_mmx %5, ptr %a2
-  ret void
-}
-declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx)
diff --git a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
index 800af1ce5432e..f0496a2f56ea6 100644
--- a/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
+++ b/llvm/test/CodeGen/X86/expand-vr64-gr64-copy.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass postrapseudos -mtriple=x86_64-unknown-unknown -mattr=+3dnow -o - %s | FileCheck %s
+# RUN: llc -run-pass postrapseudos -mtriple=x86_64-unknown-unknown -mattr=+mmx -o - %s | FileCheck %s
 # This test verifies that the ExpandPostRA pass expands the GR64 <-> VR64
 # copies into appropriate MMX_MOV instructions.
 
@@ -7,7 +7,7 @@
   define <2 x i32> @test_pswapdsi(<2 x i32> %a) nounwind readnone {
   entry:
     %0 = bitcast <2 x i32> %a to x86_mmx
-    %1 = tail call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %0)
+    %1 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %0, x86_mmx %0)
     %2 = bitcast x86_mmx %1 to <2 x i32>
     ret <2 x i32> %2
   }
@@ -21,16 +21,13 @@ tracksRegLiveness: true
 body: |
   bb.0.entry:
     liveins: $xmm0
-
-    $xmm0 = PSHUFDri killed $xmm0, -24
-    MOVPQI2QImr $rsp, 1, $noreg, -8, $noreg, killed $xmm0
-    $mm0 = PSWAPDrm $rsp, 1, $noreg, -8, $noreg
+    $mm0 = MMX_MOVDQ2Qrr killed $xmm0
+    $mm0 = MMX_PADDWrr killed $mm0, $mm0
+  ; Inserted dummy copy here, for test:
   ; CHECK:      $rax = MMX_MOVD64from64rr $mm0
   ; CHECK-NEXT: $mm0 = MMX_MOVD64to64rr $rax
     $rax = COPY $mm0
     $mm0 = COPY $rax
-    MMX_MOVQ64mr $rsp, 1, $noreg, -16, $noreg, killed $mm0
-    $xmm0 = MOVQI2PQIrm $rsp, 1, $noreg, -16, $noreg
-    $xmm0 = PSHUFDri killed $xmm0, -44
-    RET64 $xmm0
+    $xmm0 = MMX_MOVQ2DQrr killed $mm0
+    RET 0, $xmm0
 ...
diff --git a/llvm/test/CodeGen/X86/pr35982.ll b/llvm/test/CodeGen/X86/pr35982.ll
index 4a79a109f8b60..b6022698edaeb 100644
--- a/llvm/test/CodeGen/X86/pr35982.ll
+++ b/llvm/test/CodeGen/X86/pr35982.ll
@@ -46,50 +46,5 @@ define float @PR35982_emms(<1 x i64>) nounwind {
   ret float %11
 }
 
-define float @PR35982_femms(<1 x i64>) nounwind {
-; NO-POSTRA-LABEL: PR35982_femms:
-; NO-POSTRA:       # %bb.0:
-; NO-POSTRA-NEXT:    subl $8, %esp
-; NO-POSTRA-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; NO-POSTRA-NEXT:    movq {{[0-9]+}}(%esp), %mm0
-; NO-POSTRA-NEXT:    punpckhdq %mm0, %mm0 # mm0 = mm0[1,1]
-; NO-POSTRA-NEXT:    movd %mm0, %ecx
-; NO-POSTRA-NEXT:    femms
-; NO-POSTRA-NEXT:    movl %eax, (%esp)
-; NO-POSTRA-NEXT:    fildl (%esp)
-; NO-POSTRA-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; NO-POSTRA-NEXT:    fiaddl {{[0-9]+}}(%esp)
-; NO-POSTRA-NEXT:    addl $8, %esp
-; NO-POSTRA-NEXT:    retl
-;
-; POSTRA-LABEL: PR35982_femms:
-; POSTRA:       # %bb.0:
-; POSTRA-NEXT:    subl $8, %esp
-; POSTRA-NEXT:    movq {{[0-9]+}}(%esp), %mm0
-; POSTRA-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; POSTRA-NEXT:    punpckhdq %mm0, %mm0 # mm0 = mm0[1,1]
-; POSTRA-NEXT:    movd %mm0, %ecx
-; POSTRA-NEXT:    femms
-; POSTRA-NEXT:    movl %eax, (%esp)
-; POSTRA-NEXT:    fildl (%esp)
-; POSTRA-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; POSTRA-NEXT:    fiaddl {{[0-9]+}}(%esp)
-; POSTRA-NEXT:    addl $8, %esp
-; POSTRA-NEXT:    retl
-  %2 = bitcast <1 x i64> %0 to <2 x i32>
-  %3 = extractelement <2 x i32> %2, i32 0
-  %4 = extractelement <1 x i64> %0, i32 0
-  %5 = bitcast i64 %4 to x86_mmx
-  %6 = tail call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %5, x86_mmx %5)
-  %7 = bitcast x86_mmx %6 to <2 x i32>
-  %8 = extractelement <2 x i32> %7, i32 0
-  tail call void @llvm.x86.mmx.femms()
-  %9 = sitofp i32 %3 to float
-  %10 = sitofp i32 %8 to float
-  %11 = fadd float %9, %10
-  ret float %11
-}
-
 declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx)
-declare void @llvm.x86.mmx.femms()
 declare void @llvm.x86.mmx.emms()
diff --git a/llvm/test/CodeGen/X86/prefetch.ll b/llvm/test/CodeGen/X86/prefetch.ll
index c10e0526787d5..f761d9e140b03 100644
--- a/llvm/test/CodeGen/X86/prefetch.ll
+++ b/llvm/test/CodeGen/X86/prefetch.ll
@@ -6,16 +6,11 @@
 ; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=X86-PRFCHWSSE
 ; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=X86-PRFCHWSSE
 ; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=X86-SSE
-; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=X86-3DNOW
-; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=X86-3DNOW
+; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=X86-SSE
 
 ; Rules:
-; 3dnow by itself get you just the single prefetch instruction with no hints
 ; sse provides prefetch0/1/2/nta
-; supporting prefetchw, but not 3dnow implicitly provides prefetcht0/1/2/nta regardless of sse setting as we need something to fall back to for the non-write hint.
-; 3dnow prefetch instruction will only get used if you have no other prefetch instructions enabled
-
-; rdar://10538297
+; supporting prefetchw implicitly provides prefetcht0/1/2/nta as well, as we need something to fall back to for the non-write hint.
 
 define void @t(ptr %ptr) nounwind  {
 ; X86-SSE-LABEL: t:
@@ -43,19 +38,7 @@ define void @t(ptr %ptr) nounwind  {
 ; X86-PRFCHWSSE-NEXT:    prefetchw (%eax)
 ; X86-PRFCHWSSE-NEXT:    prefetchw (%eax)
 ; X86-PRFCHWSSE-NEXT:    retl
-;
-; X86-3DNOW-LABEL: t:
-; X86-3DNOW:       # %bb.0: # %entry
-; X86-3DNOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-3DNOW-NEXT:    prefetch (%eax)
-; X86-3DNOW-NEXT:    prefetch (%eax)
-; X86-3DNOW-NEXT:    prefetch (%eax)
-; X86-3DNOW-NEXT:    prefetch (%eax)
-; X86-3DNOW-NEXT:    prefetchw (%eax)
-; X86-3DNOW-NEXT:    prefetchw (%eax)
-; X86-3DNOW-NEXT:    prefetchw (%eax)
-; X86-3DNOW-NEXT:    prefetchw (%eax)
-; X86-3DNOW-NEXT:    retl
+
 entry:
   tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 1, i32 1 )
   tail call void @llvm.prefetch( ptr %ptr, i32 0, i32 2, i32 1 )
diff --git a/llvm/test/CodeGen/X86/stack-folding-3dnow.ll b/llvm/test/CodeGen/X86/stack-folding-3dnow.ll
deleted file mode 100644
index 1cbd61567f327..0000000000000
--- a/llvm/test/CodeGen/X86/stack-folding-3dnow.ll
+++ /dev/null
@@ -1,387 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+3dnow | FileCheck %s
-
-define x86_mmx @stack_fold_pavgusb(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pavgusb:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pavgusb {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pf2id(x86_mmx %a) {
-; CHECK-LABEL: stack_fold_pf2id:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pf2id {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %a) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pf2iw(x86_mmx %a) {
-; CHECK-LABEL: stack_fold_pf2iw:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pf2iw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %a) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfacc(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfacc:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfadd(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfadd:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfadd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfcmpeq(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfcmpeq:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfcmpeq {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfcmpge(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfcmpge:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfcmpge {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfcmpgt(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfcmpgt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfcmpgt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfmax(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfmax:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfmax {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfmin(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfmin:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfmin {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfmul(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfmul:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfmul {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfnacc(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfnacc:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfpnacc(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfpnacc:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfpnacc {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfrcp(x86_mmx %a) {
-; CHECK-LABEL: stack_fold_pfrcp:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfrcp {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %a) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfrcpit1(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfrcpit1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfrcpit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfrcpit2(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfrcpit2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfrcpit2 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfrsqit1(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfrsqit1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfrsqit1 {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfrsqrt(x86_mmx %a) {
-; CHECK-LABEL: stack_fold_pfrsqrt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfrsqrt {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %a) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfsub(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfsub:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfsub {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pfsubr(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pfsubr:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pfsubr {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pi2fd(x86_mmx %a) {
-; CHECK-LABEL: stack_fold_pi2fd:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pi2fd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %a) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pi2fw(x86_mmx %a) {
-; CHECK-LABEL: stack_fold_pi2fw:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pi2fw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %a) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pmulhrw(x86_mmx %a, x86_mmx %b) {
-; CHECK-LABEL: stack_fold_pmulhrw:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pmulhrw {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a, x86_mmx %b) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone
-
-define x86_mmx @stack_fold_pswapd(x86_mmx %a) {
-; CHECK-LABEL: stack_fold_pswapd:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq %mm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    pswapd {{[-0-9]+}}(%r{{[sb]}}p), %mm0 # 8-byte Folded Reload
-; CHECK-NEXT:    # mm0 = mem[1,0]
-; CHECK-NEXT:    movq2dq %mm0, %xmm0
-; CHECK-NEXT:    retq
-  %1 = tail call x86_mmx asm sideeffect "nop", "=y,~{mm1},~{mm2},~{mm3},~{mm4},~{mm5},~{mm6},~{mm7}"()
-  %2 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %a) nounwind readnone
-  ret x86_mmx %2
-}
-declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone

>From f16755931b2087c084758a0768023fcfb1b8ccc4 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Fri, 21 Jun 2024 12:41:00 -0400
Subject: [PATCH 3/6] Fix bugs: - Add FeatureMMX where needed. - Add mayLoad,
 mayStore, hasSideEffects as appropriate, where they were previously inferred
 from the pattern.

---
 llvm/lib/Target/X86/X86.td           | 18 ++++++++---------
 llvm/lib/Target/X86/X86Instr3DNow.td | 29 +++++++++++++++++-----------
 llvm/test/CodeGen/X86/prefetch.ll    |  2 +-
 3 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 242ba2937aa4b..2a5a28daea2c8 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1818,32 +1818,32 @@ def : ProcModel<P, SapphireRapidsModel,
 
 def : Proc<"k6",   [FeatureX87, FeatureCX8, FeatureMMX],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"k6-2", [FeatureX87, FeatureCX8],
+def : Proc<"k6-2", [FeatureX87, FeatureCX8, FeatureMMX],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"k6-3", [FeatureX87, FeatureCX8],
+def : Proc<"k6-3", [FeatureX87, FeatureCX8, FeatureMMX],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 
 foreach P = ["athlon", "athlon-tbird"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX,
                  FeatureNOPL],
                 [TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
   def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV,
-                 FeatureSSE1, FeatureFXSR, FeatureNOPL],
+                 FeatureSSE1, FeatureMMX, FeatureFXSR, FeatureNOPL],
                 [TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE2,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE2, FeatureMMX,
                  FeatureFXSR, FeatureNOPL, FeatureX86_64, FeatureCMOV],
                 [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
                  TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE3,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE3, FeatureMMX,
                  FeatureFXSR, FeatureNOPL, FeatureCX16, FeatureCMOV,
                  FeatureX86_64],
                 [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
@@ -1884,14 +1884,14 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
 def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features,
            ProcessorFeatures.ZN4Tuning>;
 
-def : Proc<"geode",           [FeatureX87, FeatureCX8],
+def : Proc<"geode",           [FeatureX87, FeatureCX8, FeatureMMX],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 
 def : Proc<"winchip-c6",      [FeatureX87, FeatureMMX],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"winchip2",        [FeatureX87],
+def : Proc<"winchip2",        [FeatureX87, FeatureMMX],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"c3",              [FeatureX87],
+def : Proc<"c3",              [FeatureX87, FeatureMMX],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 def : Proc<"c3-2",            [FeatureX87, FeatureCX8, FeatureMMX,
                                FeatureSSE1, FeatureFXSR, FeatureCMOV],
diff --git a/llvm/lib/Target/X86/X86Instr3DNow.td b/llvm/lib/Target/X86/X86Instr3DNow.td
index 444b1f0962955..13fe7d2ccbe77 100644
--- a/llvm/lib/Target/X86/X86Instr3DNow.td
+++ b/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -27,19 +27,26 @@ class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
 
 multiclass I3DNow_binop_rm<bits<8> opc, string Mn,
                            X86FoldableSchedWrite sched, bit Commutable = 0> {
-  let isCommutable = Commutable in
-  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
-    []>, Sched<[sched]>;
-  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
-    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+  let mayStore=0, hasSideEffects=0 in {
+    let isCommutable = Commutable, mayLoad=0 in
+    def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+      []>, Sched<[sched]>;
+    let mayLoad=1 in
+    def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+      []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+  }
 }
 
 multiclass I3DNow_conv_rm<bits<8> opc, string Mn,
                               X86FoldableSchedWrite sched> {
-  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
-    []>, Sched<[sched]>;
-  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
-    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+  let mayStore=0, hasSideEffects=0 in {
+    let mayLoad=0 in
+    def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+      []>, Sched<[sched]>;
+    let mayLoad=1 in
+    def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+      []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+  }
 }
 
 defm PAVGUSB  : I3DNow_binop_rm<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>;
@@ -62,11 +69,11 @@ defm PFSUBR   : I3DNow_binop_rm<0xAA, "pfsubr", WriteFAdd, 1>;
 defm PI2FD    : I3DNow_conv_rm<0x0D, "pi2fd", WriteCvtI2PS>;
 defm PMULHRW  : I3DNow_binop_rm<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>;
 
-let SchedRW = [WriteEMMS] in
+let SchedRW = [WriteEMMS], mayLoad=1, mayStore=1, hasSideEffects=1 in
 def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
                    []>, TB;
 
-let SchedRW = [WriteLoad] in {
+let SchedRW = [WriteLoad], mayLoad=1, mayStore=1, hasSideEffects=0 in {
 def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
                       "prefetch\t$addr",
                       []>, TB;
diff --git a/llvm/test/CodeGen/X86/prefetch.ll b/llvm/test/CodeGen/X86/prefetch.ll
index f761d9e140b03..c3551644dfb7f 100644
--- a/llvm/test/CodeGen/X86/prefetch.ll
+++ b/llvm/test/CodeGen/X86/prefetch.ll
@@ -6,7 +6,7 @@
 ; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=X86-PRFCHWSSE
 ; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=X86-PRFCHWSSE
 ; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=X86-SSE
-; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=X86-SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=X86-PRFCHWSSE
 
 ; Rules:
 ; sse provides prefetch0/1/2/nta

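As a hedged aside (illustrative code, not part of the patch): the prefetch-selection rules restated in the updated test comments above can be exercised from C via __builtin_prefetch(addr, rw, locality). The function name below is made up for illustration; the expected lowering is that read hints use the SSE prefetcht0/1/2/nta forms, and a write hint selects prefetchw when +prfchw is available.

/* Hedged sketch of the prefetch rules above; warm_cache is an illustrative name. */
void warm_cache(const char *p) {
  __builtin_prefetch(p, 0, 3); /* read hint, high locality -> prefetcht0 (SSE)    */
  __builtin_prefetch(p, 0, 0); /* read hint, non-temporal  -> prefetchnta (SSE)   */
  __builtin_prefetch(p, 1, 3); /* write hint               -> prefetchw (+prfchw) */
}
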
>From 8a0252f670f7339a8c8b63db9bbb9a8c9fd0e8d6 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Fri, 21 Jun 2024 12:47:32 -0400
Subject: [PATCH 4/6] Fix clang release notes formatting.

---
 clang/docs/ReleaseNotes.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index cfe3526283d69..a7b69c9cfab6c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -939,6 +939,7 @@ X86 Support
 - Support has been removed for the AMD "3DNow!" instruction-set.
   Neither modern AMD CPUs, nor any Intel CPUs implement these
   instructions, and they were never widely used.
+
   * The options ``-m3dnow`` and ``-m3dnowa`` are no longer honored, and will emit a warning if used.
   * The macros ``__3dNOW__`` and ``__3dNOW_A__`` are no longer ever set by the compiler.
   * The header ``<mm3dnow.h>`` still exists, but all of the the 3dNow
@@ -950,8 +951,8 @@ X86 Support
     ``_m_pfsubr``, ``_m_pi2fd``, ``_m_pmulhrw``, ``_m_pf2iw``,
     ``_m_pfnacc``, ``_m_pfpnacc``, ``_m_pi2fw``, ``_m_pswapdsf``,
     ``_m_pswapdsi``
-  * The compiler builtins (``__builtin_ia32_femms``, and so on)
-    corresponding to each of the above intrinsics have been removed.
+  * The compiler builtins corresponding to each of the above
+    intrinsics have also been removed  (``__builtin_ia32_femms``, and so on).
 
 
 Arm and AArch64 Support

>From ad026004efdb09ef6aa95ab1a18c74e2517744c6 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Fri, 21 Jun 2024 13:53:26 -0400
Subject: [PATCH 5/6] Mark mm3dnow.h header as deprecated.

---
 clang/docs/ReleaseNotes.rst  |  6 +++---
 clang/lib/Headers/mm3dnow.h  | 10 +++++-----
 clang/test/Headers/mm3dnow.c | 10 +++++++++-
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a7b69c9cfab6c..9c90242356bff 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -942,15 +942,15 @@ X86 Support
 
   * The options ``-m3dnow`` and ``-m3dnowa`` are no longer honored, and will emit a warning if used.
   * The macros ``__3dNOW__`` and ``__3dNOW_A__`` are no longer ever set by the compiler.
-  * The header ``<mm3dnow.h>`` still exists, but all of the the 3dNow
-    intrinsic functions have been removed: ``_m_femms``,
+  * The header ``<mm3dnow.h>`` is deprecated, and emits a warning if included.
+  * The 3dNow intrinsic functions have been removed: ``_m_femms``,
     ``_m_pavgusb``, ``_m_pf2id``, ``_m_pfacc``, ``_m_pfadd``,
     ``_m_pfcmpeq``, ``_m_pfcmpge``, ``_m_pfcmpgt``, ``_m_pfmax``,
     ``_m_pfmin``, ``_m_pfmul``, ``_m_pfrcp``, ``_m_pfrcpit1``,
     ``_m_pfrcpit2``, ``_m_pfrsqrt``, ``_m_pfrsqrtit1``, ``_m_pfsub``,
     ``_m_pfsubr``, ``_m_pi2fd``, ``_m_pmulhrw``, ``_m_pf2iw``,
     ``_m_pfnacc``, ``_m_pfpnacc``, ``_m_pi2fw``, ``_m_pswapdsf``,
-    ``_m_pswapdsi``
+    ``_m_pswapdsi``.
   * The compiler builtins corresponding to each of the above
     intrinsics have also been removed  (``__builtin_ia32_femms``, and so on).
 
diff --git a/clang/lib/Headers/mm3dnow.h b/clang/lib/Headers/mm3dnow.h
index 10049553969f2..afffba3a9c75e 100644
--- a/clang/lib/Headers/mm3dnow.h
+++ b/clang/lib/Headers/mm3dnow.h
@@ -7,16 +7,16 @@
  *===-----------------------------------------------------------------------===
  */
 
-// 3dNow intrinsics are no longer supported, and this header remains only as a
-// stub for users who were including it to get to _m_prefetch or
-// _m_prefetchw. Such uses should prefer x86intrin.h.
+// 3dNow intrinsics are no longer supported.
 
 #ifndef _MM3DNOW_H_INCLUDED
 #define _MM3DNOW_H_INCLUDED
 
+#ifndef _CLANG_DISABLE_CRT_DEPRECATION_WARNINGS
+#warning "The <mm3dnow.h> header is deprecated, and 3dNow! intrinsics are unsupported. For other intrinsics, include <x86intrin.h>, instead."
+#endif
+
 #include <mmintrin.h>
 #include <prfchwintrin.h>
 
-#undef __DEFAULT_FN_ATTRS
-
 #endif
diff --git a/clang/test/Headers/mm3dnow.c b/clang/test/Headers/mm3dnow.c
index f63882688b612..a9b6dd88f8034 100644
--- a/clang/test/Headers/mm3dnow.c
+++ b/clang/test/Headers/mm3dnow.c
@@ -1,8 +1,14 @@
 // RUN: %clang_cc1 -fsyntax-only -ffreestanding %s -verify
+// RUN: %clang_cc1 -fsyntax-only -D_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS -ffreestanding %s -verify
 // RUN: %clang_cc1 -fsyntax-only -ffreestanding -x c++ %s -verify
-// expected-no-diagnostics
 
 #if defined(i386) || defined(__x86_64__)
+#ifndef _CLANG_DISABLE_CRT_DEPRECATION_WARNINGS
+// expected-warning at mm3dnow.h:*{{The <mm3dnow.h> header is deprecated}}
+#else
+// expected-no-diagnostics
+#endif
+
 #include <mm3dnow.h>
 
 int foo(void *x) {
@@ -10,4 +16,6 @@ int foo(void *x) {
   _m_prefetchw(x);
   return 4;
 }
+#else
+// expected-no-diagnostics
 #endif

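A hedged usage sketch of the deprecation introduced in this patch (illustrative code, not part of the diff): including <mm3dnow.h> now warns unless _CLANG_DISABLE_CRT_DEPRECATION_WARNINGS is defined, while the remaining prefetch helpers stay reachable through <x86intrin.h>, as the warning text suggests.

/* Preferred usage: no deprecation warning. The function name is illustrative. */
#include <x86intrin.h>

int touch(void *p) {
  _m_prefetch(p);  /* read prefetch  */
  _m_prefetchw(p); /* write prefetch */
  return 0;
}

/* Including <mm3dnow.h> instead would emit the new warning unless
   -D_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS is passed on the command line. */
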
>From 5e2446ded5ff89ae4e9cf2b917d75102a0cfa252 Mon Sep 17 00:00:00 2001
From: James Y Knight <jyknight at google.com>
Date: Mon, 24 Jun 2024 15:42:23 -0400
Subject: [PATCH 6/6] Also add the prfchw feature where needed.

---
 llvm/lib/Target/X86/X86.td | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 2a5a28daea2c8..d19a34df2e1e8 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1818,32 +1818,32 @@ def : ProcModel<P, SapphireRapidsModel,
 
 def : Proc<"k6",   [FeatureX87, FeatureCX8, FeatureMMX],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"k6-2", [FeatureX87, FeatureCX8, FeatureMMX],
+def : Proc<"k6-2", [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"k6-3", [FeatureX87, FeatureCX8, FeatureMMX],
+def : Proc<"k6-3", [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW],
                    [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 
 foreach P = ["athlon", "athlon-tbird"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeaturePRFCHW,
                  FeatureNOPL],
                 [TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
   def : Proc<P, [FeatureX87, FeatureCX8, FeatureCMOV,
-                 FeatureSSE1, FeatureMMX, FeatureFXSR, FeatureNOPL],
+                 FeatureSSE1, FeatureMMX, FeaturePRFCHW, FeatureFXSR, FeatureNOPL],
                 [TuningSlowSHLD, TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE2, FeatureMMX,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE2, FeatureMMX, FeaturePRFCHW,
                  FeatureFXSR, FeatureNOPL, FeatureX86_64, FeatureCMOV],
                 [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
                  TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
 }
 
 foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
-  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE3, FeatureMMX,
+  def : Proc<P, [FeatureX87, FeatureCX8, FeatureSSE3, FeatureMMX, FeaturePRFCHW,
                  FeatureFXSR, FeatureNOPL, FeatureCX16, FeatureCMOV,
                  FeatureX86_64],
                 [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
@@ -1884,14 +1884,14 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
 def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features,
            ProcessorFeatures.ZN4Tuning>;
 
-def : Proc<"geode",           [FeatureX87, FeatureCX8, FeatureMMX],
+def : Proc<"geode",           [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 
 def : Proc<"winchip-c6",      [FeatureX87, FeatureMMX],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"winchip2",        [FeatureX87, FeatureMMX],
+def : Proc<"winchip2",        [FeatureX87, FeatureMMX, FeaturePRFCHW],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
-def : Proc<"c3",              [FeatureX87, FeatureMMX],
+def : Proc<"c3",              [FeatureX87, FeatureMMX, FeaturePRFCHW],
                               [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
 def : Proc<"c3-2",            [FeatureX87, FeatureCX8, FeatureMMX,
                                FeatureSSE1, FeatureFXSR, FeatureCMOV],


