[llvm] 79b69bf - Autogenerate stack-folding-fp X86 tests. NFC

Amaury Séchet via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 26 07:29:29 PDT 2022


Author: Amaury Séchet
Date: 2022-09-26T14:26:08Z
New Revision: 79b69bf8c930036edc9ea09c0c334533ebbcda6f

URL: https://github.com/llvm/llvm-project/commit/79b69bf8c930036edc9ea09c0c334533ebbcda6f
DIFF: https://github.com/llvm/llvm-project/commit/79b69bf8c930036edc9ea09c0c334533ebbcda6f.diff

LOG: Autogenerate stack-folding-fp X86 tests. NFC
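
For reference, CHECK lines like the ones in the hunks below are normally regenerated by running utils/update_llc_test_checks.py over the test file; a minimal sketch of that workflow (the build-directory path is an assumption for illustration, not part of this commit):

    # Assumes an llvm-project checkout with llc already built; the llc to use
    # is passed explicitly so the script exercises the right backend.
    llvm/utils/update_llc_test_checks.py \
        --llc-binary build/bin/llc \
        llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll

The script reads each RUN line, invokes llc on the file, and rewrites the "; CHECK" assertions in place, which is why the hand-written regex patterns in the diff are replaced with full autogenerated instruction sequences.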

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
    llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
    llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
    llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll
    llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
index 76a2767d3f3e..a75cdf9db619 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -336,7 +336,7 @@ define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, p
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    kandb %k0, %k1, %k1
 ; CHECK-NEXT:    vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; CHECK-NEXT:    vmovupd (%rsp), %zmm1  # 64-byte Reload
+; CHECK-NEXT:    vmovupd (%rsp), %zmm1 # 64-byte Reload
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    addq $136, %rsp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8

diff  --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
index 090c78cd364e..f36fc3fa0d17 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16-fma.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,8 +10,14 @@ target triple = "x86_64-unknown-unknown"
 ; relevant registers and check that the reload is correctly folded into the instruction.
 
 define <32 x half> @stack_fold_fmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
   ret <32 x half> %2
@@ -18,48 +25,87 @@ define <32 x half> @stack_fold_fmadd123ph(<32 x half> %a0, <32 x half> %a1, <32
 declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
 
 define <32 x half> @stack_fold_fmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph_mask:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
@@ -69,8 +115,17 @@ define <32 x half> @stack_fold_fmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph_mask:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
@@ -80,8 +135,17 @@ define <32 x half> @stack_fold_fmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph_mask:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
@@ -91,8 +155,17 @@ define <32 x half> @stack_fold_fmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph_mask:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
@@ -102,8 +175,17 @@ define <32 x half> @stack_fold_fmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph_mask:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
@@ -113,8 +195,17 @@ define <32 x half> @stack_fold_fmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph_mask:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
@@ -124,8 +215,15 @@ define <32 x half> @stack_fold_fmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph_maskz:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
   %3 = load i32, ptr %mask
@@ -135,8 +233,15 @@ define <32 x half> @stack_fold_fmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph_maskz:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
   %3 = load i32, ptr %mask
@@ -146,8 +251,15 @@ define <32 x half> @stack_fold_fmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph_maskz:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
   %3 = load i32, ptr %mask
@@ -157,8 +269,15 @@ define <32 x half> @stack_fold_fmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph_maskz:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
   %3 = load i32, ptr %mask
@@ -168,8 +287,15 @@ define <32 x half> @stack_fold_fmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph_maskz:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
   %3 = load i32, ptr %mask
@@ -179,8 +305,15 @@ define <32 x half> @stack_fold_fmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph_maskz:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
   %3 = load i32, ptr %mask
@@ -190,8 +323,14 @@ define <32 x half> @stack_fold_fmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a2
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %2)
@@ -199,8 +338,14 @@ define <32 x half> @stack_fold_fmsub123ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a2
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %2)
@@ -208,8 +353,14 @@ define <32 x half> @stack_fold_fmsub213ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a0
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %2)
@@ -217,8 +368,14 @@ define <32 x half> @stack_fold_fmsub231ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a0
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %2)
@@ -226,8 +383,14 @@ define <32 x half> @stack_fold_fmsub321ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a1
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %2)
@@ -235,8 +398,14 @@ define <32 x half> @stack_fold_fmsub132ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a1
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %2)
@@ -244,8 +413,17 @@ define <32 x half> @stack_fold_fmsub312ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph_mask:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a2
@@ -256,8 +434,17 @@ define <32 x half> @stack_fold_fmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph_mask:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a2
@@ -268,8 +455,17 @@ define <32 x half> @stack_fold_fmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph_mask:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a0
@@ -280,8 +476,17 @@ define <32 x half> @stack_fold_fmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph_mask:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a0
@@ -292,8 +497,17 @@ define <32 x half> @stack_fold_fmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph_mask:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a1
@@ -304,8 +518,17 @@ define <32 x half> @stack_fold_fmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph_mask:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a1
@@ -316,8 +539,15 @@ define <32 x half> @stack_fold_fmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x ha
 }
 
 define <32 x half> @stack_fold_fmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph_maskz:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a2
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
@@ -328,8 +558,15 @@ define <32 x half> @stack_fold_fmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph_maskz:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a2
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
@@ -340,8 +577,15 @@ define <32 x half> @stack_fold_fmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph_maskz:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a0
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
@@ -352,8 +596,15 @@ define <32 x half> @stack_fold_fmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph_maskz:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a0
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
@@ -364,8 +615,15 @@ define <32 x half> @stack_fold_fmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph_maskz:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a1
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
@@ -376,8 +634,15 @@ define <32 x half> @stack_fold_fmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph_maskz:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a1
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
@@ -388,8 +653,14 @@ define <32 x half> @stack_fold_fmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1
 }
 
 define <32 x half> @stack_fold_fnmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a0
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a2)
@@ -397,8 +668,14 @@ define <32 x half> @stack_fold_fnmadd123ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a1
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a2)
@@ -406,8 +683,14 @@ define <32 x half> @stack_fold_fnmadd213ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a1
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a0)
@@ -415,8 +698,14 @@ define <32 x half> @stack_fold_fnmadd231ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a2
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a0)
@@ -424,8 +713,14 @@ define <32 x half> @stack_fold_fnmadd321ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a0
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a1)
@@ -433,8 +728,14 @@ define <32 x half> @stack_fold_fnmadd132ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a2
   %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a1)
@@ -442,8 +743,17 @@ define <32 x half> @stack_fold_fnmadd312ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph_mask:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a0
@@ -454,8 +764,17 @@ define <32 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph_mask:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a1
@@ -466,8 +785,17 @@ define <32 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph_mask:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a1
@@ -478,8 +806,17 @@ define <32 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph_mask:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a2
@@ -490,8 +827,17 @@ define <32 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph_mask:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a0
@@ -502,8 +848,17 @@ define <32 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph_mask:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a2
@@ -514,8 +869,15 @@ define <32 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a0
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
@@ -526,8 +888,15 @@ define <32 x half> @stack_fold_fnmadd123ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a1
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
@@ -538,8 +907,15 @@ define <32 x half> @stack_fold_fnmadd213ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a1
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
@@ -550,8 +926,15 @@ define <32 x half> @stack_fold_fnmadd231ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a2
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
@@ -562,8 +945,15 @@ define <32 x half> @stack_fold_fnmadd321ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a0
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
@@ -574,8 +964,15 @@ define <32 x half> @stack_fold_fnmadd132ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a2
   %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
@@ -586,8 +983,14 @@ define <32 x half> @stack_fold_fnmadd312ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a0
   %3 = fneg <32 x half> %a2
@@ -596,8 +999,14 @@ define <32 x half> @stack_fold_fnmsub123ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a1
   %3 = fneg <32 x half> %a2
@@ -606,8 +1015,14 @@ define <32 x half> @stack_fold_fnmsub213ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a1
   %3 = fneg <32 x half> %a0
@@ -616,8 +1031,14 @@ define <32 x half> @stack_fold_fnmsub231ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a2
   %3 = fneg <32 x half> %a0
@@ -626,8 +1047,14 @@ define <32 x half> @stack_fold_fnmsub321ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a0
   %3 = fneg <32 x half> %a1
@@ -636,8 +1063,14 @@ define <32 x half> @stack_fold_fnmsub132ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a2
   %3 = fneg <32 x half> %a1
@@ -646,8 +1079,17 @@ define <32 x half> @stack_fold_fnmsub312ph(<32 x half> %a0, <32 x half> %a1, <32
 }
 
 define <32 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph_mask:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a2
@@ -659,8 +1101,17 @@ define <32 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph_mask:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a2
@@ -672,8 +1123,17 @@ define <32 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph_mask:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a0
@@ -685,8 +1145,17 @@ define <32 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph_mask:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a0
@@ -698,8 +1167,17 @@ define <32 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph_mask:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a1
@@ -711,8 +1189,17 @@ define <32 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph_mask:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a1
@@ -724,8 +1211,15 @@ define <32 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x h
 }
 
 define <32 x half> @stack_fold_fnmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a2
   %neg1 = fneg <32 x half> %a0
@@ -737,8 +1231,15 @@ define <32 x half> @stack_fold_fnmsub123ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a2
   %neg1 = fneg <32 x half> %a1
@@ -750,8 +1251,15 @@ define <32 x half> @stack_fold_fnmsub213ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a0
   %neg1 = fneg <32 x half> %a1
@@ -763,8 +1271,15 @@ define <32 x half> @stack_fold_fnmsub231ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a0
   %neg1 = fneg <32 x half> %a2
@@ -776,8 +1291,15 @@ define <32 x half> @stack_fold_fnmsub321ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a1
   %neg1 = fneg <32 x half> %a0
@@ -789,8 +1311,15 @@ define <32 x half> @stack_fold_fnmsub132ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a1
   %neg1 = fneg <32 x half> %a2
@@ -802,8 +1331,14 @@ define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a
 }
 
 define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd123sh:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
   ret half %2
@@ -811,48 +1346,84 @@ define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) {
 declare half @llvm.fma.f16(half, half, half)
 
 define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd213sh:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
   ret half %2
 }
 
 define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd231sh:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
   ret half %2
 }
 
 define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd321sh:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
   ret half %2
 }
 
 define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd132sh:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
   ret half %2
 }
 
 define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd312sh:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
   ret half %2
 }
 
 define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub123sh:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2)
@@ -860,8 +1431,14 @@ define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub213sh:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2)
@@ -869,8 +1446,14 @@ define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub231sh:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2)
@@ -878,8 +1461,14 @@ define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub321sh:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2)
@@ -887,8 +1476,14 @@ define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub132sh:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2)
@@ -896,8 +1491,14 @@ define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub312sh:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2)
@@ -905,8 +1506,14 @@ define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd123sh:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2)
@@ -914,8 +1521,14 @@ define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd213sh:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2)
@@ -923,8 +1536,14 @@ define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd231sh:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0)
@@ -932,8 +1551,14 @@ define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd321sh:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0)
@@ -941,8 +1566,14 @@ define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd132sh:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1)
@@ -950,8 +1581,14 @@ define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd312sh:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1)
@@ -959,8 +1596,14 @@ define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub123sh:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = fneg half %a2
@@ -969,8 +1612,14 @@ define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub213sh:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = fneg half %a2
@@ -979,8 +1628,14 @@ define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub231sh:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a1
   %3 = fneg half %a0
@@ -989,8 +1644,14 @@ define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub321sh:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = fneg half %a0
@@ -999,8 +1660,14 @@ define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub132sh:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a0
   %3 = fneg half %a1
@@ -1009,8 +1676,14 @@ define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
 }
 
 define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub312sh:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312sh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg half %a2
   %3 = fneg half %a1
@@ -1019,8 +1692,14 @@ define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
 }
 
 define <8 x half> @stack_fold_fmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmadd123sh_int:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1031,8 +1710,14 @@ define <8 x half> @stack_fold_fmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmadd213sh_int:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1043,8 +1728,14 @@ define <8 x half> @stack_fold_fmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmadd231sh_int:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1055,8 +1746,14 @@ define <8 x half> @stack_fold_fmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmadd321sh_int:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1067,8 +1764,14 @@ define <8 x half> @stack_fold_fmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmadd132sh_int:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1079,8 +1782,14 @@ define <8 x half> @stack_fold_fmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmadd312sh_int:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1091,8 +1800,14 @@ define <8 x half> @stack_fold_fmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmsub123sh_int:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1104,8 +1819,14 @@ define <8 x half> @stack_fold_fmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmsub213sh_int:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1117,8 +1838,14 @@ define <8 x half> @stack_fold_fmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmsub231sh_int:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1130,8 +1857,14 @@ define <8 x half> @stack_fold_fmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmsub321sh_int:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1143,8 +1876,14 @@ define <8 x half> @stack_fold_fmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmsub132sh_int:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1156,8 +1895,14 @@ define <8 x half> @stack_fold_fmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fmsub312sh_int:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1169,8 +1914,14 @@ define <8 x half> @stack_fold_fmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <
 }
 
 define <8 x half> @stack_fold_fnmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmadd123sh_int:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1182,8 +1933,14 @@ define <8 x half> @stack_fold_fnmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmadd213sh_int:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1195,8 +1952,14 @@ define <8 x half> @stack_fold_fnmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmadd231sh_int:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1208,8 +1971,14 @@ define <8 x half> @stack_fold_fnmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmadd321sh_int:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1221,8 +1990,14 @@ define <8 x half> @stack_fold_fnmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmadd132sh_int:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1234,8 +2009,14 @@ define <8 x half> @stack_fold_fnmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmadd312sh_int:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1247,8 +2028,14 @@ define <8 x half> @stack_fold_fnmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmsub123sh_int:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1261,8 +2048,14 @@ define <8 x half> @stack_fold_fnmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmsub213sh_int:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1275,8 +2068,14 @@ define <8 x half> @stack_fold_fnmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmsub231sh_int:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1289,8 +2088,14 @@ define <8 x half> @stack_fold_fnmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmsub321sh_int:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1303,8 +2108,14 @@ define <8 x half> @stack_fold_fnmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmsub132sh_int:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1317,8 +2128,14 @@ define <8 x half> @stack_fold_fnmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
-  ;CHECK-LABEL: stack_fold_fnmsub312sh_int:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312sh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1331,8 +2148,15 @@ define <8 x half> @stack_fold_fnmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd123sh_intk:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1347,8 +2171,15 @@ define <8 x half> @stack_fold_fmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd213sh_intk:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1363,8 +2194,15 @@ define <8 x half> @stack_fold_fmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd231sh_intk:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1379,8 +2217,15 @@ define <8 x half> @stack_fold_fmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd321sh_intk:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1395,8 +2240,15 @@ define <8 x half> @stack_fold_fmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd132sh_intk:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1411,8 +2263,15 @@ define <8 x half> @stack_fold_fmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd312sh_intk:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1427,8 +2286,15 @@ define <8 x half> @stack_fold_fmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub123sh_intk:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1444,8 +2310,15 @@ define <8 x half> @stack_fold_fmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub213sh_intk:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1461,8 +2334,15 @@ define <8 x half> @stack_fold_fmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub231sh_intk:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1478,8 +2358,15 @@ define <8 x half> @stack_fold_fmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub321sh_intk:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1495,8 +2382,15 @@ define <8 x half> @stack_fold_fmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub132sh_intk:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1512,8 +2406,15 @@ define <8 x half> @stack_fold_fmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub312sh_intk:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1529,8 +2430,15 @@ define <8 x half> @stack_fold_fmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd123sh_intk:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1546,8 +2454,15 @@ define <8 x half> @stack_fold_fnmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd213sh_intk:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1563,8 +2478,15 @@ define <8 x half> @stack_fold_fnmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd231sh_intk:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1580,8 +2502,15 @@ define <8 x half> @stack_fold_fnmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd321sh_intk:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1597,8 +2526,15 @@ define <8 x half> @stack_fold_fnmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd132sh_intk:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1614,8 +2550,15 @@ define <8 x half> @stack_fold_fnmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd312sh_intk:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1631,8 +2574,15 @@ define <8 x half> @stack_fold_fnmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub123sh_intk:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1649,8 +2599,15 @@ define <8 x half> @stack_fold_fnmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub213sh_intk:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1667,8 +2624,15 @@ define <8 x half> @stack_fold_fnmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub231sh_intk:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1685,8 +2649,15 @@ define <8 x half> @stack_fold_fnmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub321sh_intk:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1703,8 +2674,15 @@ define <8 x half> @stack_fold_fnmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub132sh_intk:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1721,8 +2699,15 @@ define <8 x half> @stack_fold_fnmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub312sh_intk:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312sh_intk:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1739,8 +2724,15 @@ define <8 x half> @stack_fold_fnmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd123sh_intkz:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1755,8 +2747,15 @@ define <8 x half> @stack_fold_fmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd213sh_intkz:
-  ;CHECK:       vfmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1771,8 +2770,15 @@ define <8 x half> @stack_fold_fmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd231sh_intkz:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1787,8 +2793,15 @@ define <8 x half> @stack_fold_fmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd321sh_intkz:
-  ;CHECK:       vfmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1803,8 +2816,15 @@ define <8 x half> @stack_fold_fmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd132sh_intkz:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1819,8 +2839,15 @@ define <8 x half> @stack_fold_fmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd312sh_intkz:
-  ;CHECK:       vfmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1835,8 +2862,15 @@ define <8 x half> @stack_fold_fmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub123sh_intkz:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1852,8 +2886,15 @@ define <8 x half> @stack_fold_fmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub213sh_intkz:
-  ;CHECK:       vfmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1869,8 +2910,15 @@ define <8 x half> @stack_fold_fmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub231sh_intkz:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1886,8 +2934,15 @@ define <8 x half> @stack_fold_fmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub321sh_intkz:
-  ;CHECK:       vfmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1903,8 +2958,15 @@ define <8 x half> @stack_fold_fmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub132sh_intkz:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1920,8 +2982,15 @@ define <8 x half> @stack_fold_fmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub312sh_intkz:
-  ;CHECK:       vfmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1937,8 +3006,15 @@ define <8 x half> @stack_fold_fmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v,
 }
 
 define <8 x half> @stack_fold_fnmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd123sh_intkz:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1954,8 +3030,15 @@ define <8 x half> @stack_fold_fnmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd213sh_intkz:
-  ;CHECK:       vfnmadd213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1971,8 +3054,15 @@ define <8 x half> @stack_fold_fnmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd231sh_intkz:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -1988,8 +3078,15 @@ define <8 x half> @stack_fold_fnmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd321sh_intkz:
-  ;CHECK:       vfnmadd231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2005,8 +3102,15 @@ define <8 x half> @stack_fold_fnmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd132sh_intkz:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2022,8 +3126,15 @@ define <8 x half> @stack_fold_fnmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd312sh_intkz:
-  ;CHECK:       vfnmadd132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2039,8 +3150,15 @@ define <8 x half> @stack_fold_fnmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub123sh_intkz:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2057,8 +3175,15 @@ define <8 x half> @stack_fold_fnmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub213sh_intkz:
-  ;CHECK:       vfnmsub213sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2075,8 +3200,15 @@ define <8 x half> @stack_fold_fnmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub231sh_intkz:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2093,8 +3225,15 @@ define <8 x half> @stack_fold_fnmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub321sh_intkz:
-  ;CHECK:       vfnmsub231sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2111,8 +3250,15 @@ define <8 x half> @stack_fold_fnmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub132sh_intkz:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2129,8 +3275,15 @@ define <8 x half> @stack_fold_fnmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <8 x half> @stack_fold_fnmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub312sh_intkz:
-  ;CHECK:       vfnmsub132sh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312sh_intkz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = extractelement <8 x half> %a0v, i64 0
   %a1 = extractelement <8 x half> %a1v, i64 0
@@ -2147,8 +3300,14 @@ define <8 x half> @stack_fold_fnmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v
 }
 
 define <32 x half> @stack_fold_fmaddsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddsub123ph:
-  ;CHECK:       vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
   ret <32 x half> %2
@@ -2156,48 +3315,87 @@ define <32 x half> @stack_fold_fmaddsub123ph(<32 x half> %a0, <32 x half> %a1, <
 declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)
 
 define <32 x half> @stack_fold_fmaddsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddsub213ph:
-  ;CHECK:       vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmaddsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddsub231ph:
-  ;CHECK:       vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmaddsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddsub321ph:
-  ;CHECK:       vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmaddsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddsub132ph:
-  ;CHECK:       vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmaddsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddsub312ph:
-  ;CHECK:       vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_fmaddsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub123ph_mask:
-  ;CHECK:       vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
@@ -2207,8 +3405,17 @@ define <32 x half> @stack_fold_fmaddsub123ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmaddsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub213ph_mask:
-  ;CHECK:       vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
@@ -2218,8 +3425,17 @@ define <32 x half> @stack_fold_fmaddsub213ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmaddsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub231ph_mask:
-  ;CHECK:       vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
@@ -2229,8 +3445,17 @@ define <32 x half> @stack_fold_fmaddsub231ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmaddsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub321ph_mask:
-  ;CHECK:       vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
@@ -2240,8 +3465,17 @@ define <32 x half> @stack_fold_fmaddsub321ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmaddsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub132ph_mask:
-  ;CHECK:       vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
@@ -2251,8 +3485,17 @@ define <32 x half> @stack_fold_fmaddsub132ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmaddsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub312ph_mask:
-  ;CHECK:       vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
@@ -2262,8 +3505,15 @@ define <32 x half> @stack_fold_fmaddsub312ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmaddsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub123ph_maskz:
-  ;CHECK:       vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
   %3 = load i32, ptr %mask
@@ -2273,8 +3523,15 @@ define <32 x half> @stack_fold_fmaddsub123ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmaddsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub213ph_maskz:
-  ;CHECK:       vfmaddsub213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
   %3 = load i32, ptr %mask
@@ -2284,8 +3541,15 @@ define <32 x half> @stack_fold_fmaddsub213ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmaddsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub231ph_maskz:
-  ;CHECK:       vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
   %3 = load i32, ptr %mask
@@ -2295,8 +3559,15 @@ define <32 x half> @stack_fold_fmaddsub231ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmaddsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub321ph_maskz:
-  ;CHECK:       vfmaddsub231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = load i32, ptr %mask
@@ -2306,8 +3577,15 @@ define <32 x half> @stack_fold_fmaddsub321ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmaddsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub132ph_maskz:
-  ;CHECK:       vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
   %3 = load i32, ptr %mask
@@ -2317,8 +3595,15 @@ define <32 x half> @stack_fold_fmaddsub132ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmaddsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddsub312ph_maskz:
-  ;CHECK:       vfmaddsub132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddsub312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = load i32, ptr %mask
@@ -2328,8 +3613,14 @@ define <32 x half> @stack_fold_fmaddsub312ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmsubadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsubadd123ph:
-  ;CHECK:       vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a2
   %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %2, i32 4)
@@ -2337,8 +3628,14 @@ define <32 x half> @stack_fold_fmsubadd123ph(<32 x half> %a0, <32 x half> %a1, <
 }
 
 define <32 x half> @stack_fold_fmsubadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsubadd213ph:
-  ;CHECK:       vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a2
   %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %2, i32 4)
@@ -2346,8 +3643,14 @@ define <32 x half> @stack_fold_fmsubadd213ph(<32 x half> %a0, <32 x half> %a1, <
 }
 
 define <32 x half> @stack_fold_fmsubadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsubadd231ph:
-  ;CHECK:       vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a0
   %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %2, i32 4)
@@ -2355,8 +3658,14 @@ define <32 x half> @stack_fold_fmsubadd231ph(<32 x half> %a0, <32 x half> %a1, <
 }
 
 define <32 x half> @stack_fold_fmsubadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsubadd321ph:
-  ;CHECK:       vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a0
   %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %2, i32 4)
@@ -2364,8 +3673,14 @@ define <32 x half> @stack_fold_fmsubadd321ph(<32 x half> %a0, <32 x half> %a1, <
 }
 
 define <32 x half> @stack_fold_fmsubadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsubadd132ph:
-  ;CHECK:       vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a1
   %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %2, i32 4)
@@ -2373,8 +3688,14 @@ define <32 x half> @stack_fold_fmsubadd132ph(<32 x half> %a0, <32 x half> %a1, <
 }
 
 define <32 x half> @stack_fold_fmsubadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsubadd312ph:
-  ;CHECK:       vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <32 x half> %a1
   %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %2, i32 4)
@@ -2382,8 +3703,17 @@ define <32 x half> @stack_fold_fmsubadd312ph(<32 x half> %a0, <32 x half> %a1, <
 }
 
 define <32 x half> @stack_fold_fmsubadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd123ph_mask:
-  ;CHECK:       vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a2
@@ -2394,8 +3724,17 @@ define <32 x half> @stack_fold_fmsubadd123ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmsubadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd213ph_mask:
-  ;CHECK:       vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a2
@@ -2406,8 +3745,17 @@ define <32 x half> @stack_fold_fmsubadd213ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmsubadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd231ph_mask:
-  ;CHECK:       vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a0
@@ -2418,8 +3766,17 @@ define <32 x half> @stack_fold_fmsubadd231ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmsubadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd321ph_mask:
-  ;CHECK:       vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a0
@@ -2430,8 +3787,17 @@ define <32 x half> @stack_fold_fmsubadd321ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmsubadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd132ph_mask:
-  ;CHECK:       vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a1
@@ -2442,8 +3808,17 @@ define <32 x half> @stack_fold_fmsubadd132ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmsubadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd312ph_mask:
-  ;CHECK:       vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <32 x half>, ptr %p
   %neg = fneg <32 x half> %a1
@@ -2454,8 +3829,15 @@ define <32 x half> @stack_fold_fmsubadd312ph_mask(ptr %p, <32 x half> %a1, <32 x
 }
 
 define <32 x half> @stack_fold_fmsubadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd123ph_maskz:
-  ;CHECK:       vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a2
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
@@ -2466,8 +3848,15 @@ define <32 x half> @stack_fold_fmsubadd123ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmsubadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd213ph_maskz:
-  ;CHECK:       vfmsubadd213ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a2
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
@@ -2478,8 +3867,15 @@ define <32 x half> @stack_fold_fmsubadd213ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmsubadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd231ph_maskz:
-  ;CHECK:       vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a0
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
@@ -2490,8 +3886,15 @@ define <32 x half> @stack_fold_fmsubadd231ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmsubadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd321ph_maskz:
-  ;CHECK:       vfmsubadd231ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a0
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
@@ -2502,8 +3905,15 @@ define <32 x half> @stack_fold_fmsubadd321ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmsubadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd132ph_maskz:
-  ;CHECK:       vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a1
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
@@ -2514,8 +3924,15 @@ define <32 x half> @stack_fold_fmsubadd132ph_maskz(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_fmsubadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsubadd312ph_maskz:
-  ;CHECK:       vfmsubadd132ph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsubadd312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <32 x half> %a1
   %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)

diff  --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index ea32a5209716..f6863be4b919 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,16 +10,31 @@ target triple = "x86_64-unknown-unknown"
 ; relevant registers and check that the reload is correctly folded into the instruction.
 
 define <32 x half> @stack_fold_addph_zmm(<32 x half> %a0, <32 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_addph_zmm
-  ;CHECK:       vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_addph_zmm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fadd <32 x half> %a0, %a1
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_addph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
-  ;CHECK-LABEL: stack_fold_addph_zmm_k:
-  ;CHECK:       vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_addph_zmm_k:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fadd <32 x half> %a0, %a1
   %3 = bitcast i32 %mask to <32 x i1>
@@ -28,8 +44,17 @@ define <32 x half> @stack_fold_addph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
 }
 
 define <32 x half> @stack_fold_addph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
-  ;CHECK-LABEL: stack_fold_addph_zmm_k_commuted:
-  ;CHECK:       vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_addph_zmm_k_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fadd <32 x half> %a1, %a0
   %3 = bitcast i32 %mask to <32 x i1>
@@ -39,8 +64,15 @@ define <32 x half> @stack_fold_addph_zmm_k_commuted(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_addph_zmm_kz
-  ;CHECK:       vaddph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_addph_zmm_kz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vaddph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fadd <32 x half> %a1, %a0
   %3 = bitcast i32 %mask to <32 x i1>
@@ -49,16 +81,28 @@ define <32 x half> @stack_fold_addph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
 }
 
 define half @stack_fold_addsh(half %a0, half %a1) {
-  ;CHECK-LABEL: stack_fold_addsh
-  ;CHECK:       vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_addsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vaddsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fadd half %a0, %a1
   ret half %2
 }
 
 define <8 x half> @stack_fold_addsh_int(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_addsh_int
-  ;CHECK:       vaddsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_addsh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vaddsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = extractelement <8 x half> %a0, i32 0
   %3 = extractelement <8 x half> %a1, i32 0
@@ -68,8 +112,16 @@ define <8 x half> @stack_fold_addsh_int(<8 x half> %a0, <8 x half> %a1) {
 }
 
 define i32 @stack_fold_cmpph(<32 x half> %a0, <32 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_cmpph
-  ;CHECK:       vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_cmpph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %res = call <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half> %a0, <32 x half> %a1, i32 0, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
   %2 = bitcast <32 x i1> %res to i32
@@ -78,8 +130,27 @@ define i32 @stack_fold_cmpph(<32 x half> %a0, <32 x half> %a1) {
 declare <32 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.512(<32 x half>, <32 x half>, i32, <32 x i1>, i32)
 
 define <32 x half> @stack_fold_cmpph_mask(<32 x half> %a0, <32 x half> %a1, ptr %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) {
-  ;CHECK-LABEL: stack_fold_cmpph_mask:
-  ;CHECK:       vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_cmpph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $136, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 144
+; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vaddph (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kandd %k0, %k1, %k1
+; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vmovdqu16 (%rsp), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
   %2 = load <32 x half>, ptr %a2
@@ -92,8 +163,27 @@ define <32 x half> @stack_fold_cmpph_mask(<32 x half> %a0, <32 x half> %a1, ptr
 }
 
 define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half> %a1, ptr %a2, i32 %mask, <32 x half> %b0, <32 x half> %b1) {
-  ;CHECK-LABEL: stack_fold_cmpph_mask_commuted:
-  ;CHECK:       vcmpeqph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_cmpph_mask_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $136, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 144
+; CHECK-NEXT:    vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vmovups %zmm2, (%rsp) # 64-byte Spill
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vaddph (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kandd %k0, %k1, %k1
+; CHECK-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT:    vmovdqu16 (%rsp), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    addq $136, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
   %2 = load <32 x half>, ptr %a2
@@ -106,16 +196,28 @@ define <32 x half> @stack_fold_cmpph_mask_commuted(<32 x half> %a0, <32 x half>
 }
 
 define half @stack_fold_divsh(half %a0, half %a1) {
-  ;CHECK-LABEL: stack_fold_divsh
-  ;CHECK:       vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_divsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vdivsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fdiv half %a0, %a1
   ret half %2
 }
 
 define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_divsh_int
-  ;CHECK:       vdivsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_divsh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vdivsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = extractelement <8 x half> %a0, i32 0
   %3 = extractelement <8 x half> %a1, i32 0
@@ -125,8 +227,16 @@ define <8 x half> @stack_fold_divsh_int(<8 x half> %a0, <8 x half> %a1) {
 }
 
 define i32 @stack_fold_fpclassph(<32 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_fpclassph:
-  ;CHECK:       vfpclassphz $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fpclassph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfpclassphz $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 64-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4)
   %3 = bitcast <32 x i1> %2 to i32
@@ -135,8 +245,17 @@ define i32 @stack_fold_fpclassph(<32 x half> %a0) {
 declare <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half>, i32)
 
 define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, ptr %p) {
-  ;CHECK-LABEL: stack_fold_fpclassph_mask:
-  ;CHECK:       vfpclassphz $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fpclassph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vfpclassphz $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> %a0, i32 4)
   %mask = load <32 x i1>, ptr %p
@@ -147,7 +266,16 @@ define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, ptr %p) {
 
 define i8 @stack_fold_fpclasssh(<8 x half> %a0) {
   ;CHECK-LABEl: stack_fold_fpclasssh:
-  ;CHECK:       vfpclasssh $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fpclasssh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfpclasssh $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 16-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 -1)
   ret i8 %2
@@ -155,8 +283,17 @@ define i8 @stack_fold_fpclasssh(<8 x half> %a0) {
 declare i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half>, i32, i8)
 
 define i8 @stack_fold_fpclasssh_mask(<8 x half> %a0, ptr %p) {
-  ;CHECK-LABEL: stack_fold_fpclasssh_mask:
-  ;CHECK:       vfpclasssh $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fpclasssh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfpclasssh $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %mask = load i8, ptr %p
   %2 = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> %a0, i32 4, i8 %mask)
@@ -164,8 +301,14 @@ define i8 @stack_fold_fpclasssh_mask(<8 x half> %a0, ptr %p) {
 }
 
 define <32 x half> @stack_fold_getexpph(<32 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_getexpph:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1, i32 4)
   ret <32 x half> %2
@@ -173,8 +316,17 @@ define <32 x half> @stack_fold_getexpph(<32 x half> %a0) {
 declare <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half>, <32 x half>, i32, i32)
 
 define <32 x half> @stack_fold_getexpph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_getexpph_mask:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <32 x half>, ptr %passthru
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask, i32 4)
@@ -182,8 +334,15 @@ define <32 x half> @stack_fold_getexpph_mask(<32 x half> %a0, ptr %passthru, i32
 }
 
 define <32 x half> @stack_fold_getexpph_maskz(<32 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_getexpph_maskz:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i32, ptr %mask
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2, i32 4)
@@ -191,8 +350,14 @@ define <32 x half> @stack_fold_getexpph_maskz(<32 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_getexpsh(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_getexpsh:
-  ;CHECK:       vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
   ret <8 x half> %2
@@ -200,8 +365,17 @@ define <8 x half> @stack_fold_getexpsh(<8 x half> %a0, <8 x half> %a1) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32)
 
 define <8 x half> @stack_fold_getexpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_getexpsh_mask:
-  ;CHECK:       vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
@@ -209,8 +383,15 @@ define <8 x half> @stack_fold_getexpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr
 }
 
 define <8 x half> @stack_fold_getexpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_getexpsh_maskz:
-  ;CHECK:       vgetexpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vgetexpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4)
@@ -218,8 +399,14 @@ define <8 x half> @stack_fold_getexpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr
 }
 
 define <32 x half> @stack_fold_getmantph(<32 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_getmantph:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
   ret <32 x half> %2
@@ -227,8 +414,17 @@ define <32 x half> @stack_fold_getmantph(<32 x half> %a0) {
 declare <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
 
 define <32 x half> @stack_fold_getmantph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_getmantph_mask:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <32 x half>, ptr %passthru
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
@@ -236,8 +432,15 @@ define <32 x half> @stack_fold_getmantph_mask(<32 x half> %a0, ptr %passthru, i3
 }
 
 define <32 x half> @stack_fold_getmantph_maskz(<32 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_getmantph_maskz:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i32, ptr %mask
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
@@ -245,8 +448,14 @@ define <32 x half> @stack_fold_getmantph_maskz(<32 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_getmantsh(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_getmantsh:
-  ;CHECK:       vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> undef, i8 -1, i32 4)
   ret <8 x half> %2
@@ -254,8 +463,17 @@ define <8 x half> @stack_fold_getmantsh(<8 x half> %a0, <8 x half> %a1) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half>, <8 x half>, i32, <8 x half>, i8, i32)
 
 define <8 x half> @stack_fold_getmantsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_getmantsh_mask:
-  ;CHECK:       vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> %2, i8 %mask, i32 4)
@@ -263,8 +481,15 @@ define <8 x half> @stack_fold_getmantsh_mask(<8 x half> %a0, <8 x half> %a1, ptr
 }
 
 define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_getmantsh_maskz:
-  ;CHECK:       vgetmantsh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vgetmantsh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> %a0, <8 x half> %a1, i32 8, <8 x half> zeroinitializer, i8 %2, i32 4)
@@ -272,8 +497,14 @@ define <8 x half> @stack_fold_getmantsh_maskz(<8 x half> %a0, <8 x half> %a1, pt
 }
 
 define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   ret <32 x half> %2
@@ -281,16 +512,32 @@ define <32 x half> @stack_fold_maxph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
 declare <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
 
 define <32 x half> @stack_fold_maxph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_commuted:
-  ;CHECK-NOT:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vmaxph %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_k:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_k:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -300,8 +547,18 @@ define <32 x half> @stack_fold_maxph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
 }
 
 define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_k_commuted:
-  ;CHECK-NOT:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_k_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vmaxph %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -311,8 +568,15 @@ define <32 x half> @stack_fold_maxph_zmm_k_commuted(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_kz:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_kz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -321,8 +585,16 @@ define <32 x half> @stack_fold_maxph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
 }
 
 define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted:
-  ;CHECK-NOT:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_kz_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vmaxph %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -331,24 +603,45 @@ define <32 x half> @stack_fold_maxph_zmm_kz_commuted(<32 x half> %a0, <32 x half
 }
 
 define <32 x half> @stack_fold_maxph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_commutable:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_commutable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_maxph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_commuted:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_commutable_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_maxph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_k:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_commutable_k:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -358,8 +651,17 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_k(<32 x half> %a0, <32 x hal
 }
 
 define <32 x half> @stack_fold_maxph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_k_commuted:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_commutable_k_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -369,8 +671,15 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_k_commuted(<32 x half> %a0,
 }
 
 define <32 x half> @stack_fold_maxph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -379,8 +688,15 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_kz(<32 x half> %a0, <32 x ha
 }
 
 define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
-  ;CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz_commuted:
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_zmm_commutable_kz_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -389,8 +705,14 @@ define <32 x half> @stack_fold_maxph_zmm_commutable_kz_commuted(<32 x half> %a0,
 }
 
 define half @stack_fold_maxsh(half %a0, half %a1) #0 {
-  ;CHECK-LABEL: stack_fold_maxsh:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp ogt half %a0, %a1
   %3 = select i1 %2, half %a0, half %a1
@@ -398,8 +720,15 @@ define half @stack_fold_maxsh(half %a0, half %a1) #0 {
 }
 
 define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
-  ;CHECK-LABEL: stack_fold_maxsh_commuted:
-  ;CHECK-NOT:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxsh_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    vmaxsh %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp ogt half %a1, %a0
   %3 = select i1 %2, half %a1, half %a0
@@ -407,8 +736,14 @@ define half @stack_fold_maxsh_commuted(half %a0, half %a1) #0 {
 }
 
 define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 {
-  ;CHECK-LABEL: stack_fold_maxsh_commutable:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxsh_commutable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp ogt half %a0, %a1
   %3 = select i1 %2, half %a0, half %a1
@@ -416,8 +751,14 @@ define half @stack_fold_maxsh_commutable(half %a0, half %a1) #1 {
 }
 
 define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
-  ;CHECK-LABEL: stack_fold_maxsh_commutable_commuted:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxsh_commutable_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp ogt half %a1, %a0
   %3 = select i1 %2, half %a1, half %a0
@@ -425,8 +766,14 @@ define half @stack_fold_maxsh_commutable_commuted(half %a0, half %a1) #1 {
 }
 
 define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_maxsh_int:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxsh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
   ret <8 x half> %2
@@ -434,8 +781,17 @@ define <8 x half> @stack_fold_maxsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
 declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32)
 
 define <8 x half> @stack_fold_maxsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, ptr %passthru) {
-  ;CHECK-LABEL: stack_fold_maxsh_mask:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rsi), %xmm2
+; CHECK-NEXT:    vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
@@ -443,16 +799,29 @@ define <8 x half> @stack_fold_maxsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mas
 }
 
 define <8 x half> @stack_fold_maxsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_maxsh_maskz:
-  ;CHECK:       vmaxsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4)
   ret <8 x half> %2
 }
 
 define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_minph_zmm:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   ret <32 x half> %2
@@ -460,16 +829,32 @@ define <32 x half> @stack_fold_minph_zmm(<32 x half> %a0, <32 x half> %a1) #0 {
 declare <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half>, <32 x half>, i32) nounwind readnone
 
 define <32 x half> @stack_fold_minph_zmm_commuted(<32 x half> %a0, <32 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_commuted:
-  ;CHECK-NOT:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vminph %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_k:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_k:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -479,8 +864,18 @@ define <32 x half> @stack_fold_minph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
 }
 
 define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #0 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_k_commuted:
-  ;CHECK-NOT:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_k_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vminph %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -490,8 +885,15 @@ define <32 x half> @stack_fold_minph_zmm_k_commuted(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_kz:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_kz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -500,8 +902,16 @@ define <32 x half> @stack_fold_minph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
 }
 
 define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_kz_commuted:
-  ;CHECK-NOT:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_kz_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vminph %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -510,24 +920,45 @@ define <32 x half> @stack_fold_minph_zmm_kz_commuted(<32 x half> %a0, <32 x half
 }
 
 define <32 x half> @stack_fold_minph_zmm_commutable(<32 x half> %a0, <32 x half> %a1) #1 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_commutable:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_commutable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_minph_zmm_commutable_commuted(<32 x half> %a0, <32 x half> %a1) #1 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_commutable_commuted:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_commutable_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_minph_zmm_commutable_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_commutable_k:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_commutable_k:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -537,8 +968,17 @@ define <32 x half> @stack_fold_minph_zmm_commutable_k(<32 x half> %a0, <32 x hal
 }
 
 define <32 x half> @stack_fold_minph_zmm_commutable_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) #1 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_commutable_k_commuted:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_commutable_k_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -548,8 +988,15 @@ define <32 x half> @stack_fold_minph_zmm_commutable_k_commuted(<32 x half> %a0,
 }
 
 define <32 x half> @stack_fold_minph_zmm_commutable_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_commutable_kz:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_commutable_kz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a0, <32 x half> %a1, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -558,8 +1005,15 @@ define <32 x half> @stack_fold_minph_zmm_commutable_kz(<32 x half> %a0, <32 x ha
 }
 
 define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask) #1 {
-  ;CHECK-LABEL: stack_fold_minph_zmm_commutable_kz_commuted:
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_zmm_commutable_kz_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> %a1, <32 x half> %a0, i32 4)
   %3 = bitcast i32 %mask to <32 x i1>
@@ -568,8 +1022,14 @@ define <32 x half> @stack_fold_minph_zmm_commutable_kz_commuted(<32 x half> %a0,
 }
 
 define half @stack_fold_minsh(half %a0, half %a1) #0 {
-  ;CHECK-LABEL: stack_fold_minsh:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_minsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp olt half %a0, %a1
   %3 = select i1 %2, half %a0, half %a1
@@ -577,8 +1037,15 @@ define half @stack_fold_minsh(half %a0, half %a1) #0 {
 }
 
 define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
-  ;CHECK-LABEL: stack_fold_minsh_commuted:
-  ;CHECK-NOT:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_minsh_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT:    vminsh %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp olt half %a1, %a0
   %3 = select i1 %2, half %a1, half %a0
@@ -586,8 +1053,14 @@ define half @stack_fold_minsh_commuted(half %a0, half %a1) #0 {
 }
 
 define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 {
-  ;CHECK-LABEL: stack_fold_minsh_commutable:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_minsh_commutable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp olt half %a0, %a1
   %3 = select i1 %2, half %a0, half %a1
@@ -595,8 +1068,14 @@ define half @stack_fold_minsh_commutable(half %a0, half %a1) #1 {
 }
 
 define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
-  ;CHECK-LABEL: stack_fold_minsh_commutable_commuted:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_minsh_commutable_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fcmp olt half %a1, %a0
   %3 = select i1 %2, half %a1, half %a0
@@ -604,8 +1083,14 @@ define half @stack_fold_minsh_commutable_commuted(half %a0, half %a1) #1 {
 }
 
 define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_minsh_int:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_minsh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
   ret <8 x half> %2
@@ -613,8 +1098,17 @@ define <8 x half> @stack_fold_minsh_int(<8 x half> %a0, <8 x half> %a1) #0 {
 declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>, <8 x half>, i8, i32)
 
 define <8 x half> @stack_fold_minsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mask, ptr %passthru) {
-  ;CHECK-LABEL: stack_fold_minsh_mask:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_minsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rsi), %xmm2
+; CHECK-NEXT:    vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
@@ -622,24 +1116,46 @@ define <8 x half> @stack_fold_minsh_mask(<8 x half> %a0, <8 x half> %a1, i8 %mas
 }
 
 define <8 x half> @stack_fold_minsh_maskz(<8 x half> %a0, <8 x half> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_minsh_maskz:
-  ;CHECK:       vminsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_minsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %mask, i32 4)
   ret <8 x half> %2
 }
 
 define <32 x half> @stack_fold_mulph_zmm(<32 x half> %a0, <32 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_mulph_zmm
-  ;CHECK:       vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_mulph_zmm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fmul <32 x half> %a0, %a1
   ret <32 x half> %2
 }
 
 define <32 x half> @stack_fold_mulph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
-  ;CHECK-LABEL: stack_fold_mulph_zmm_k:
-  ;CHECK:       vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_mulph_zmm_k:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fmul <32 x half> %a0, %a1
   %3 = bitcast i32 %mask to <32 x i1>
@@ -649,8 +1165,17 @@ define <32 x half> @stack_fold_mulph_zmm_k(<32 x half> %a0, <32 x half> %a1, i32
 }
 
 define <32 x half> @stack_fold_mulph_zmm_k_commuted(<32 x half> %a0, <32 x half> %a1, i32 %mask, ptr %passthru) {
-  ;CHECK-LABEL: stack_fold_mulph_zmm_k_commuted:
-  ;CHECK:       vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_mulph_zmm_k_commuted:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmovaps (%rsi), %zmm2
+; CHECK-NEXT:    vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fmul <32 x half> %a1, %a0
   %3 = bitcast i32 %mask to <32 x i1>
@@ -660,8 +1185,15 @@ define <32 x half> @stack_fold_mulph_zmm_k_commuted(<32 x half> %a0, <32 x half>
 }
 
 define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_mulph_zmm_kz
-  ;CHECK:       vmulph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_mulph_zmm_kz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vmulph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fmul <32 x half> %a1, %a0
   %3 = bitcast i32 %mask to <32 x i1>
@@ -670,16 +1202,28 @@ define <32 x half> @stack_fold_mulph_zmm_kz(<32 x half> %a0, <32 x half> %a1, i3
 }
 
 define half @stack_fold_mulsh(half %a0, half %a1) {
-  ;CHECK-LABEL: stack_fold_mulsh
-  ;CHECK-NOT:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_mulsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmulsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fmul half %a0, %a1
   ret half %2
 }
 
 define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_mulsh_int
-  ;CHECK-NOT:       vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_mulsh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmulsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = extractelement <8 x half> %a0, i32 0
   %3 = extractelement <8 x half> %a1, i32 0
@@ -689,8 +1233,14 @@ define <8 x half> @stack_fold_mulsh_int(<8 x half> %a0, <8 x half> %a1) {
 }
 
 define <32 x half> @stack_fold_rcpph(<32 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rcpph:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1)
   ret <32 x half> %2
@@ -698,8 +1248,17 @@ define <32 x half> @stack_fold_rcpph(<32 x half> %a0) {
 declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half>, i32)
 
 define <32 x half> @stack_fold_rcpph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_rcpph_mask:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <32 x half>, ptr %passthru
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask)
@@ -707,8 +1266,15 @@ define <32 x half> @stack_fold_rcpph_mask(<32 x half> %a0, ptr %passthru, i32 %m
 }
 
 define <32 x half> @stack_fold_rcpph_maskz(<32 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rcpph_maskz:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i32, ptr %mask
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2)
@@ -716,8 +1282,14 @@ define <32 x half> @stack_fold_rcpph_maskz(<32 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_rcpsh(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_rcpsh:
-  ;CHECK:       vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
   ret <8 x half> %2
@@ -725,8 +1297,17 @@ define <8 x half> @stack_fold_rcpsh(<8 x half> %a0, <8 x half> %a1) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half>, <8 x half>, <8 x half>, i8)
 
 define <8 x half> @stack_fold_rcpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_rcpsh_mask:
-  ;CHECK:       vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask)
@@ -734,8 +1315,15 @@ define <8 x half> @stack_fold_rcpsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %pa
 }
 
 define <8 x half> @stack_fold_rcpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rcpsh_maskz:
-  ;CHECK:       vrcpsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vrcpsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2)
@@ -743,8 +1331,14 @@ define <8 x half> @stack_fold_rcpsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %m
 }
 
 define <32 x half> @stack_fold_reduceph(<32 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_reduceph:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
   ret <32 x half> %2
@@ -752,8 +1346,17 @@ define <32 x half> @stack_fold_reduceph(<32 x half> %a0) {
 declare <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
 
 define <32 x half> @stack_fold_reduceph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_reduceph_mask:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <32 x half>, ptr %passthru
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
@@ -761,8 +1364,15 @@ define <32 x half> @stack_fold_reduceph_mask(<32 x half> %a0, ptr %passthru, i32
 }
 
 define <32 x half> @stack_fold_reduceph_maskz(<32 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_reduceph_maskz:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i32, ptr %mask
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
@@ -770,8 +1380,14 @@ define <32 x half> @stack_fold_reduceph_maskz(<32 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_reducesh(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_reducesh:
-  ;CHECK:       vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_reducesh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4)
   ret <8 x half> %2
@@ -779,8 +1395,17 @@ define <8 x half> @stack_fold_reducesh(<8 x half> %a0, <8 x half> %a1) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32)
 
 define <8 x half> @stack_fold_reducesh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_reducesh_mask:
-  ;CHECK:       vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_reducesh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4)
@@ -788,8 +1413,15 @@ define <8 x half> @stack_fold_reducesh_mask(<8 x half> %a0, <8 x half> %a1, ptr
 }
 
 define <8 x half> @stack_fold_reducesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_reducesh_maskz:
-  ;CHECK:       vreducesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_reducesh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vreducesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4)
@@ -797,8 +1429,14 @@ define <8 x half> @stack_fold_reducesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr
 }
 
 define <32 x half> @stack_fold_rndscaleph(<32 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rndscaleph:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> undef, i32 -1, i32 4)
   ret <32 x half> %2
@@ -806,8 +1444,17 @@ define <32 x half> @stack_fold_rndscaleph(<32 x half> %a0) {
 declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, <32 x half>, i32, i32)
 
 define <32 x half> @stack_fold_rndscaleph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_rndscaleph_mask:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <32 x half>, ptr %passthru
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> %2, i32 %mask, i32 4)
@@ -815,8 +1462,15 @@ define <32 x half> @stack_fold_rndscaleph_mask(<32 x half> %a0, ptr %passthru, i
 }
 
 define <32 x half> @stack_fold_rndscaleph_maskz(<32 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rndscaleph_maskz:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i32, ptr %mask
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %a0, i32 8, <32 x half> zeroinitializer, i32 %2, i32 4)
@@ -824,8 +1478,14 @@ define <32 x half> @stack_fold_rndscaleph_maskz(<32 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_rndscalesh(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_rndscalesh:
-  ;CHECK:       vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscalesh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 8, i32 4)
   ret <8 x half> %2
@@ -833,8 +1493,17 @@ define <8 x half> @stack_fold_rndscalesh(<8 x half> %a0, <8 x half> %a1) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32, i32)
 
 define <8 x half> @stack_fold_rndscalesh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_rndscalesh_mask:
-  ;CHECK:       vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscalesh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 8, i32 4)
@@ -842,8 +1511,15 @@ define <8 x half> @stack_fold_rndscalesh_mask(<8 x half> %a0, <8 x half> %a1, pt
 }
 
 define <8 x half> @stack_fold_rndscalesh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rndscalesh_maskz:
-  ;CHECK:       vrndscalesh $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscalesh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vrndscalesh $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 8, i32 4)
@@ -851,8 +1527,14 @@ define <8 x half> @stack_fold_rndscalesh_maskz(<8 x half> %a0, <8 x half> %a1, p
 }
 
 define <32 x half> @stack_fold_rsqrtph(<32 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rsqrtph:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> undef, i32 -1)
   ret <32 x half> %2
@@ -860,8 +1542,17 @@ define <32 x half> @stack_fold_rsqrtph(<32 x half> %a0) {
 declare <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half>, <32 x half>, i32)
 
 define <32 x half> @stack_fold_rsqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_rsqrtph_mask:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <32 x half>, ptr %passthru
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> %2, i32 %mask)
@@ -869,8 +1560,15 @@ define <32 x half> @stack_fold_rsqrtph_mask(<32 x half> %a0, ptr %passthru, i32
 }
 
 define <32 x half> @stack_fold_rsqrtph_maskz(<32 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rsqrtph_maskz:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i32, ptr %mask
   %3 = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> %a0, <32 x half> zeroinitializer, i32 %2)
@@ -878,8 +1576,14 @@ define <32 x half> @stack_fold_rsqrtph_maskz(<32 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_rsqrtsh(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_rsqrtsh:
-  ;CHECK:       vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1)
   ret <8 x half> %2
@@ -887,8 +1591,17 @@ define <8 x half> @stack_fold_rsqrtsh(<8 x half> %a0, <8 x half> %a1) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8)
 
 define <8 x half> @stack_fold_rsqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_rsqrtsh_mask:
-  ;CHECK:       vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask)
@@ -896,8 +1609,15 @@ define <8 x half> @stack_fold_rsqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %
 }
 
 define <8 x half> @stack_fold_rsqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rsqrtsh_maskz:
-  ;CHECK:       vrsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vrsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2)
@@ -905,8 +1625,14 @@ define <8 x half> @stack_fold_rsqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr
 }
 
 define <32 x half> @stack_fold_sqrtph(<32 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_sqrtph:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
   ret <32 x half> %2
@@ -914,8 +1640,17 @@ define <32 x half> @stack_fold_sqrtph(<32 x half> %a0) {
 declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
 
 define <32 x half> @stack_fold_sqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_sqrtph_mask:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm1
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <32 x half>, ptr %passthru
   %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
@@ -925,8 +1660,15 @@ define <32 x half> @stack_fold_sqrtph_mask(<32 x half> %a0, ptr %passthru, i32 %
 }
 
 define <32 x half> @stack_fold_sqrtph_maskz(<32 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_sqrtph_maskz:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovd (%rdi), %k1
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i32, ptr %mask
   %3 = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0)
@@ -936,8 +1678,14 @@ define <32 x half> @stack_fold_sqrtph_maskz(<32 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_sqrtsh(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_sqrtsh:
-  ;CHECK:       vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> undef, i8 -1, i32 4)
   ret <8 x half> %2
@@ -945,8 +1693,17 @@ define <8 x half> @stack_fold_sqrtsh(<8 x half> %a0, <8 x half> %a1) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half>, <8 x half>, <8 x half>, i8, i32)
 
 define <8 x half> @stack_fold_sqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_sqrtsh_mask:
-  ;CHECK:       vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %2, i8 %mask, i32 4)
@@ -954,8 +1711,15 @@ define <8 x half> @stack_fold_sqrtsh_mask(<8 x half> %a0, <8 x half> %a1, ptr %p
 }
 
 define <8 x half> @stack_fold_sqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_sqrtsh_maskz:
-  ;CHECK:       vsqrtsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vsqrtsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %a0, <8 x half> %a1, <8 x half> zeroinitializer, i8 %2, i32 4)
@@ -963,24 +1727,42 @@ define <8 x half> @stack_fold_sqrtsh_maskz(<8 x half> %a0, <8 x half> %a1, ptr %
 }
 
 define <32 x half> @stack_fold_subph_zmm(<32 x half> %a0, <32 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_subph_zmm
-  ;CHECK:       vsubph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_subph_zmm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsubph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fsub <32 x half> %a0, %a1
   ret <32 x half> %2
 }
 
 define half @stack_fold_subsh(half %a0, half %a1) {
-  ;CHECK-LABEL: stack_fold_subsh
-  ;CHECK:       vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_subsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsh %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsubsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fsub half %a0, %a1
   ret half %2
 }
 
 define <8 x half> @stack_fold_subsh_int(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_subsh_int
-  ;CHECK:       vsubsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_subsh_int:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsubsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = extractelement <8 x half> %a0, i32 0
   %3 = extractelement <8 x half> %a1, i32 0
@@ -990,16 +1772,30 @@ define <8 x half> @stack_fold_subsh_int(<8 x half> %a0, <8 x half> %a1) {
 }
 
 define <16 x float> @stack_fold_fmulcph(<16 x float> %a0, <16 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fmulcph:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulcph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
   ret <16 x float> %2
 }
 
 define <16 x float> @stack_fold_fmulcph_commute(<16 x float> %a0, <16 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fmulcph_commute:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulcph_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
   ret <16 x float> %2
@@ -1007,8 +1803,17 @@ define <16 x float> @stack_fold_fmulcph_commute(<16 x float> %a0, <16 x float> %
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 
 define <16 x float> @stack_fold_fmulcph_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmulcph_mask:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulcph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x float>, ptr %passthru
   %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4)
@@ -1016,8 +1821,16 @@ define <16 x float> @stack_fold_fmulcph_mask(<16 x float> %a0, <16 x float> %a1,
 }
 
 define <16 x float> @stack_fold_fmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmulcph_maskz:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulcph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
@@ -1025,17 +1838,31 @@ define <16 x float> @stack_fold_fmulcph_maskz(<16 x float> %a0, <16 x float> %a1
 }
 
 define <16 x float> @stack_fold_fcmulcph(<16 x float> %a0, <16 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fcmulcph:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulcph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
   ret <16 x float> %2
 }
 
 define <16 x float> @stack_fold_fcmulcph_commute(<16 x float> %a0, <16 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fcmulcph_commute:
-  ;CHECK:       vmovups {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Reload
-  ;CHECK:       vfcmulcph {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}}
+; CHECK-LABEL: stack_fold_fcmulcph_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; CHECK-NEXT:    vfcmulcph %zmm0, %zmm1, %zmm2
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
   ret <16 x float> %2
@@ -1043,8 +1870,17 @@ define <16 x float> @stack_fold_fcmulcph_commute(<16 x float> %a0, <16 x float>
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 
 define <16 x float> @stack_fold_fcmulcph_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fcmulcph_mask:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulcph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x float>, ptr %passthru
   %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %2, i16 %mask, i32 4)
@@ -1052,8 +1888,16 @@ define <16 x float> @stack_fold_fcmulcph_mask(<16 x float> %a0, <16 x float> %a1
 }
 
 define <16 x float> @stack_fold_fcmulcph_maskz(<16 x float> %a0, <16 x float> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fcmulcph_maskz:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulcph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> zeroinitializer, i16 %2, i32 4)
@@ -1061,16 +1905,28 @@ define <16 x float> @stack_fold_fcmulcph_maskz(<16 x float> %a0, <16 x float> %a
 }
 
 define <16 x float> @stack_fold_fmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddcph:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddcph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4)
   ret <16 x float> %2
 }
 
 define <16 x float> @stack_fold_fmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddcph_commute:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddcph_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4)
   ret <16 x float> %2
@@ -1078,8 +1934,17 @@ define <16 x float> @stack_fold_fmaddcph_commute(<16 x float> %a0, <16 x float>
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 
 define <16 x float> @stack_fold_fmaddcph_mask(ptr %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddcph_mask:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddcph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x float>, ptr %p
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4)
@@ -1087,8 +1952,16 @@ define <16 x float> @stack_fold_fmaddcph_mask(ptr %p, <16 x float> %a1, <16 x fl
 }
 
 define <16 x float> @stack_fold_fmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddcph_maskz:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddcph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4)
@@ -1097,17 +1970,29 @@ define <16 x float> @stack_fold_fmaddcph_maskz(<16 x float> %a0, <16 x float> %a
 declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 
 define <16 x float> @stack_fold_fcmaddcph(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fcmaddcph:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddcph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4)
   ret <16 x float> %2
 }
 
 define <16 x float> @stack_fold_fcmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fcmaddcph_commute:
-  ;CHECK:       vmovups {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Reload
-  ;CHECK:       vfcmaddcph {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}}
+; CHECK-LABEL: stack_fold_fcmaddcph_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; CHECK-NEXT:    vfcmaddcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4)
   ret <16 x float> %2
@@ -1115,8 +2000,17 @@ define <16 x float> @stack_fold_fcmaddcph_commute(<16 x float> %a0, <16 x float>
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 
 define <16 x float> @stack_fold_fcmaddcph_mask(ptr %p, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fcmaddcph_mask:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddcph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %zmm2
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x float>, ptr %p
   %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4)
@@ -1124,8 +2018,16 @@ define <16 x float> @stack_fold_fcmaddcph_mask(ptr %p, <16 x float> %a1, <16 x f
 }
 
 define <16 x float> @stack_fold_fcmaddcph_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fcmaddcph_maskz:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddcph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4)
@@ -1134,16 +2036,30 @@ define <16 x float> @stack_fold_fcmaddcph_maskz(<16 x float> %a0, <16 x float> %
 declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 
 define <4 x float> @stack_fold_fmulcsh(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fmulcsh:
-  ;CHECK:       vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulcsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
   ret <4 x float> %2
 }
 
 define <4 x float> @stack_fold_fmulcsh_commute(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fmulcsh_commute:
-  ;CHECK:       vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulcsh_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4)
   ret <4 x float> %2
@@ -1151,8 +2067,17 @@ define <4 x float> @stack_fold_fmulcsh_commute(<4 x float> %a0, <4 x float> %a1)
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @stack_fold_fmulcsh_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmulcsh_mask:
-  ;CHECK:       vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulcsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <4 x float>, ptr %passthru
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4)
@@ -1160,8 +2085,16 @@ define <4 x float> @stack_fold_fmulcsh_mask(<4 x float> %a0, <4 x float> %a1, pt
 }
 
 define <4 x float> @stack_fold_fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmulcsh_maskz:
-  ;CHECK:       vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulcsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
@@ -1169,17 +2102,31 @@ define <4 x float> @stack_fold_fmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, p
 }
 
 define <4 x float> @stack_fold_fcmulcsh(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fcmulcsh:
-  ;CHECK:       vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulcsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
   ret <4 x float> %2
 }
 
 define <4 x float> @stack_fold_fcmulcsh_commute(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fcmulcsh_commute:
-  ;CHECK:       vmovaps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Reload
-  ;CHECK:       vfcmulcsh {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}
+; CHECK-LABEL: stack_fold_fcmulcsh_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    vfcmulcsh %xmm0, %xmm1, %xmm2
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4)
   ret <4 x float> %2
@@ -1187,8 +2134,17 @@ define <4 x float> @stack_fold_fcmulcsh_commute(<4 x float> %a0, <4 x float> %a1
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @stack_fold_fcmulcsh_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fcmulcsh_mask:
-  ;CHECK:       vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulcsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <4 x float>, ptr %passthru
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask, i32 4)
@@ -1196,8 +2152,16 @@ define <4 x float> @stack_fold_fcmulcsh_mask(<4 x float> %a0, <4 x float> %a1, p
 }
 
 define <4 x float> @stack_fold_fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fcmulcsh_maskz:
-  ;CHECK:       vfcmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulcsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfcmulcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2, i32 4)
@@ -1205,16 +2169,28 @@ define <4 x float> @stack_fold_fcmulcsh_maskz(<4 x float> %a0, <4 x float> %a1,
 }
 
 define <4 x float> @stack_fold_fmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddcsh:
-  ;CHECK:       vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddcsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4)
   ret <4 x float> %2
 }
 
 define <4 x float> @stack_fold_fmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddcsh_commute:
-  ;CHECK:       vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddcsh_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4)
   ret <4 x float> %2
@@ -1222,8 +2198,17 @@ define <4 x float> @stack_fold_fmaddcsh_commute(<4 x float> %a0, <4 x float> %a1
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @stack_fold_fmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddcsh_mask:
-  ;CHECK:       vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddcsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <4 x float>, ptr %p
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4)
@@ -1231,8 +2216,16 @@ define <4 x float> @stack_fold_fmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float
 }
 
 define <4 x float> @stack_fold_fmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddcsh_maskz:
-  ;CHECK:       vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddcsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vfmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4)
@@ -1241,17 +2234,29 @@ define <4 x float> @stack_fold_fmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1,
 declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @stack_fold_fcmaddcsh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fcmaddcsh:
-  ;CHECK:       vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddcsh:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4)
   ret <4 x float> %2
 }
 
 define <4 x float> @stack_fold_fcmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fcmaddcsh_commute:
-  ;CHECK:       vmovaps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Reload
-  ;CHECK:       vfcmaddcsh {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}
+; CHECK-LABEL: stack_fold_fcmaddcsh_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; CHECK-NEXT:    vfcmaddcsh %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4)
   ret <4 x float> %2
@@ -1259,8 +2264,17 @@ define <4 x float> @stack_fold_fcmaddcsh_commute(<4 x float> %a0, <4 x float> %a
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @stack_fold_fcmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fcmaddcsh_mask:
-  ;CHECK:       vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddcsh_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <4 x float>, ptr %p
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4)
@@ -1268,8 +2282,16 @@ define <4 x float> @stack_fold_fcmaddcsh_mask(ptr %p, <4 x float> %a1, <4 x floa
 }
 
 define <4 x float> @stack_fold_fcmaddcsh_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fcmaddcsh_maskz:
-  ;CHECK:       vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddcsh_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vfcmaddcsh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4)

diff  --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll
index 0494cbe26bfb..bae213bc5f21 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl-fma.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,8 +10,14 @@ target triple = "x86_64-unknown-unknown"
 ; relevant registers and check that the reload is correctly folded into the instruction.
 
 define <8 x half> @stack_fold_fmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
   ret <8 x half> %2
@@ -18,48 +25,87 @@ define <8 x half> @stack_fold_fmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x ha
 declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
 
 define <8 x half> @stack_fold_fmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
   ret <8 x half> %2
 }
 
 define <8 x half> @stack_fold_fmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
   ret <8 x half> %2
 }
 
 define <8 x half> @stack_fold_fmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
   ret <8 x half> %2
 }
 
 define <8 x half> @stack_fold_fmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
   ret <8 x half> %2
 }
 
 define <8 x half> @stack_fold_fmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
   ret <8 x half> %2
 }
 
 define <8 x half> @stack_fold_fmadd123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph_mask:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
@@ -69,8 +115,17 @@ define <8 x half> @stack_fold_fmadd123ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmadd213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph_mask:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
@@ -80,8 +135,17 @@ define <8 x half> @stack_fold_fmadd213ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmadd231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph_mask:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
@@ -91,8 +155,17 @@ define <8 x half> @stack_fold_fmadd231ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmadd321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph_mask:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
@@ -102,8 +175,17 @@ define <8 x half> @stack_fold_fmadd321ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmadd132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph_mask:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
@@ -113,8 +195,17 @@ define <8 x half> @stack_fold_fmadd132ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmadd312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph_mask:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
@@ -124,8 +215,15 @@ define <8 x half> @stack_fold_fmadd312ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph_maskz:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2)
   %3 = load i8, ptr %mask
@@ -135,8 +233,15 @@ define <8 x half> @stack_fold_fmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph_maskz:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %a2)
   %3 = load i8, ptr %mask
@@ -146,8 +251,15 @@ define <8 x half> @stack_fold_fmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph_maskz:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %a0)
   %3 = load i8, ptr %mask
@@ -157,8 +269,15 @@ define <8 x half> @stack_fold_fmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph_maskz:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %a0)
   %3 = load i8, ptr %mask
@@ -168,8 +287,15 @@ define <8 x half> @stack_fold_fmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph_maskz:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %a1)
   %3 = load i8, ptr %mask
@@ -179,8 +305,15 @@ define <8 x half> @stack_fold_fmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph_maskz:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %a1)
   %3 = load i8, ptr %mask
@@ -190,8 +323,14 @@ define <8 x half> @stack_fold_fmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a2
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %2)
@@ -199,8 +338,14 @@ define <8 x half> @stack_fold_fmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x ha
 }
 
 define <8 x half> @stack_fold_fmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a2
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %2)
@@ -208,8 +353,14 @@ define <8 x half> @stack_fold_fmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x ha
 }
 
 define <8 x half> @stack_fold_fmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a0
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %2)
@@ -217,8 +368,14 @@ define <8 x half> @stack_fold_fmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x ha
 }
 
 define <8 x half> @stack_fold_fmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a0
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %2)
@@ -226,8 +383,14 @@ define <8 x half> @stack_fold_fmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x ha
 }
 
 define <8 x half> @stack_fold_fmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a1
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %2)
@@ -235,8 +398,14 @@ define <8 x half> @stack_fold_fmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x ha
 }
 
 define <8 x half> @stack_fold_fmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a1
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %2)
@@ -244,8 +413,17 @@ define <8 x half> @stack_fold_fmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x ha
 }
 
 define <8 x half> @stack_fold_fmsub123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph_mask:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a2
@@ -256,8 +434,17 @@ define <8 x half> @stack_fold_fmsub123ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmsub213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph_mask:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a2
@@ -268,8 +455,17 @@ define <8 x half> @stack_fold_fmsub213ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmsub231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph_mask:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a0
@@ -280,8 +476,17 @@ define <8 x half> @stack_fold_fmsub231ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmsub321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph_mask:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a0
@@ -292,8 +497,17 @@ define <8 x half> @stack_fold_fmsub321ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmsub132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph_mask:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a1
@@ -304,8 +518,17 @@ define <8 x half> @stack_fold_fmsub132ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmsub312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph_mask:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a1
@@ -316,8 +539,15 @@ define <8 x half> @stack_fold_fmsub312ph_mask(ptr %p, <8 x half> %a1, <8 x half>
 }
 
 define <8 x half> @stack_fold_fmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph_maskz:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a2
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> %neg)
@@ -328,8 +558,15 @@ define <8 x half> @stack_fold_fmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph_maskz:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a2
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a0, <8 x half> %neg)
@@ -340,8 +577,15 @@ define <8 x half> @stack_fold_fmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph_maskz:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a0
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a1, <8 x half> %a2, <8 x half> %neg)
@@ -352,8 +596,15 @@ define <8 x half> @stack_fold_fmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph_maskz:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a0
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a1, <8 x half> %neg)
@@ -364,8 +615,15 @@ define <8 x half> @stack_fold_fmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph_maskz:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a1
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a0, <8 x half> %a2, <8 x half> %neg)
@@ -376,8 +634,15 @@ define <8 x half> @stack_fold_fmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph_maskz:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a1
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %a2, <8 x half> %a0, <8 x half> %neg)
@@ -388,8 +653,14 @@ define <8 x half> @stack_fold_fmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <
 }
 
 define <8 x half> @stack_fold_fnmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a0
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a2)
@@ -397,8 +668,14 @@ define <8 x half> @stack_fold_fnmadd123ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a1
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a2)
@@ -406,8 +683,14 @@ define <8 x half> @stack_fold_fnmadd213ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a1
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a0)
@@ -415,8 +698,14 @@ define <8 x half> @stack_fold_fnmadd231ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a2
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a1, <8 x half> %a0)
@@ -424,8 +713,14 @@ define <8 x half> @stack_fold_fnmadd321ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a0
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a2, <8 x half> %a1)
@@ -433,8 +728,14 @@ define <8 x half> @stack_fold_fnmadd132ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a2
   %3 = call <8 x half> @llvm.fma.v8f16(<8 x half> %2, <8 x half> %a0, <8 x half> %a1)
@@ -442,8 +743,17 @@ define <8 x half> @stack_fold_fnmadd312ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph_mask:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a0
@@ -454,8 +764,17 @@ define <8 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph_mask:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a1
@@ -466,8 +785,17 @@ define <8 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph_mask:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a1
@@ -478,8 +806,17 @@ define <8 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph_mask:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a2
@@ -490,8 +827,17 @@ define <8 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph_mask:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a0
@@ -502,8 +848,17 @@ define <8 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph_mask:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a2
@@ -514,8 +869,15 @@ define <8 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a0
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a2)
@@ -526,8 +888,15 @@ define <8 x half> @stack_fold_fnmadd123ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a1
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a2)
@@ -538,8 +907,15 @@ define <8 x half> @stack_fold_fnmadd213ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a1
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a0)
@@ -550,8 +926,15 @@ define <8 x half> @stack_fold_fnmadd231ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a2
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a1, <8 x half> %a0)
@@ -562,8 +945,15 @@ define <8 x half> @stack_fold_fnmadd321ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a0
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a2, <8 x half> %a1)
@@ -574,8 +964,15 @@ define <8 x half> @stack_fold_fnmadd132ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a2
   %2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %neg, <8 x half> %a0, <8 x half> %a1)
@@ -586,8 +983,14 @@ define <8 x half> @stack_fold_fnmadd312ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a0
   %3 = fneg <8 x half> %a2
@@ -596,8 +999,14 @@ define <8 x half> @stack_fold_fnmsub123ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a1
   %3 = fneg <8 x half> %a2
@@ -606,8 +1015,14 @@ define <8 x half> @stack_fold_fnmsub213ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a1
   %3 = fneg <8 x half> %a0
@@ -616,8 +1031,14 @@ define <8 x half> @stack_fold_fnmsub231ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a2
   %3 = fneg <8 x half> %a0
@@ -626,8 +1047,14 @@ define <8 x half> @stack_fold_fnmsub321ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a0
   %3 = fneg <8 x half> %a1
@@ -636,8 +1063,14 @@ define <8 x half> @stack_fold_fnmsub132ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <8 x half> %a2
   %3 = fneg <8 x half> %a1
@@ -646,8 +1079,17 @@ define <8 x half> @stack_fold_fnmsub312ph(<8 x half> %a0, <8 x half> %a1, <8 x h
 }
 
 define <8 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph_mask:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a2
@@ -659,8 +1101,17 @@ define <8 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph_mask:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a2
@@ -672,8 +1123,17 @@ define <8 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph_mask:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a0
@@ -685,8 +1145,17 @@ define <8 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph_mask:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a0
@@ -698,8 +1167,17 @@ define <8 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph_mask:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a1
@@ -711,8 +1189,17 @@ define <8 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <8 x half> %a1, <8 x half> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph_mask:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x half>, ptr %p
   %neg = fneg <8 x half> %a1
@@ -724,8 +1211,15 @@ define <8 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <8 x half> %a1, <8 x half
 }
 
 define <8 x half> @stack_fold_fnmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a2
   %neg1 = fneg <8 x half> %a0
@@ -737,8 +1231,15 @@ define <8 x half> @stack_fold_fnmsub123ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a2
   %neg1 = fneg <8 x half> %a1
@@ -750,8 +1251,15 @@ define <8 x half> @stack_fold_fnmsub213ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a0
   %neg1 = fneg <8 x half> %a1
@@ -763,8 +1271,15 @@ define <8 x half> @stack_fold_fnmsub231ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a0
   %neg1 = fneg <8 x half> %a2
@@ -776,8 +1291,15 @@ define <8 x half> @stack_fold_fnmsub321ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a1
   %neg1 = fneg <8 x half> %a0
@@ -789,8 +1311,15 @@ define <8 x half> @stack_fold_fnmsub132ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <8 x half> @stack_fold_fnmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <8 x half> %a1
   %neg1 = fneg <8 x half> %a2
@@ -802,8 +1331,14 @@ define <8 x half> @stack_fold_fnmsub312ph_maskz(<8 x half> %a0, <8 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph_ymm:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
   ret <16 x half> %2
@@ -811,48 +1346,87 @@ define <16 x half> @stack_fold_fmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1,
 declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
 
 define <16 x half> @stack_fold_fmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph_ymm:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
   ret <16 x half> %2
 }
 
 define <16 x half> @stack_fold_fmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph_ymm:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
   ret <16 x half> %2
 }
 
 define <16 x half> @stack_fold_fmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph_ymm:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
   ret <16 x half> %2
 }
 
 define <16 x half> @stack_fold_fmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph_ymm:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
   ret <16 x half> %2
 }
 
 define <16 x half> @stack_fold_fmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph_ymm:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
   ret <16 x half> %2
 }
 
 define <16 x half> @stack_fold_fmadd123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph_mask_ymm:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
@@ -862,8 +1436,17 @@ define <16 x half> @stack_fold_fmadd123ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmadd213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph_mask_ymm:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
@@ -873,8 +1456,17 @@ define <16 x half> @stack_fold_fmadd213ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmadd231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph_mask_ymm:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
@@ -884,8 +1476,17 @@ define <16 x half> @stack_fold_fmadd231ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmadd321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph_mask_ymm:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
@@ -895,8 +1496,17 @@ define <16 x half> @stack_fold_fmadd321ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmadd132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph_mask_ymm:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
@@ -906,8 +1516,17 @@ define <16 x half> @stack_fold_fmadd132ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmadd312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph_mask_ymm:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
@@ -917,8 +1536,15 @@ define <16 x half> @stack_fold_fmadd312ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd123ph_maskz_ymm:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd123ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2)
   %3 = load i16, ptr %mask
@@ -928,8 +1554,15 @@ define <16 x half> @stack_fold_fmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd213ph_maskz_ymm:
-  ;CHECK:       vfmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd213ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %a2)
   %3 = load i16, ptr %mask
@@ -939,8 +1572,15 @@ define <16 x half> @stack_fold_fmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd231ph_maskz_ymm:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd231ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %a0)
   %3 = load i16, ptr %mask
@@ -950,8 +1590,15 @@ define <16 x half> @stack_fold_fmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd321ph_maskz_ymm:
-  ;CHECK:       vfmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd321ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %a0)
   %3 = load i16, ptr %mask
@@ -961,8 +1608,15 @@ define <16 x half> @stack_fold_fmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd132ph_maskz_ymm:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd132ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %a1)
   %3 = load i16, ptr %mask
@@ -972,8 +1626,15 @@ define <16 x half> @stack_fold_fmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmadd312ph_maskz_ymm:
-  ;CHECK:       vfmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmadd312ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %a1)
   %3 = load i16, ptr %mask
@@ -983,8 +1644,14 @@ define <16 x half> @stack_fold_fmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph_ymm:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a2
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %2)
@@ -992,8 +1659,14 @@ define <16 x half> @stack_fold_fmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph_ymm:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a2
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %2)
@@ -1001,8 +1674,14 @@ define <16 x half> @stack_fold_fmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph_ymm:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a0
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %2)
@@ -1010,8 +1689,14 @@ define <16 x half> @stack_fold_fmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph_ymm:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a0
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %2)
@@ -1019,8 +1704,14 @@ define <16 x half> @stack_fold_fmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph_ymm:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a1
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %2)
@@ -1028,8 +1719,14 @@ define <16 x half> @stack_fold_fmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph_ymm:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a1
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %2)
@@ -1037,8 +1734,17 @@ define <16 x half> @stack_fold_fmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fmsub123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph_mask_ymm:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a2
@@ -1049,8 +1755,17 @@ define <16 x half> @stack_fold_fmsub123ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmsub213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph_mask_ymm:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a2
@@ -1061,8 +1776,17 @@ define <16 x half> @stack_fold_fmsub213ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmsub231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph_mask_ymm:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a0
@@ -1073,8 +1797,17 @@ define <16 x half> @stack_fold_fmsub231ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmsub321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph_mask_ymm:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a0
@@ -1085,8 +1818,17 @@ define <16 x half> @stack_fold_fmsub321ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmsub132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph_mask_ymm:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a1
@@ -1097,8 +1839,17 @@ define <16 x half> @stack_fold_fmsub132ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmsub312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph_mask_ymm:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a1
@@ -1109,8 +1860,15 @@ define <16 x half> @stack_fold_fmsub312ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub123ph_maskz_ymm:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub123ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a2
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> %neg)
@@ -1121,8 +1879,15 @@ define <16 x half> @stack_fold_fmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub213ph_maskz_ymm:
-  ;CHECK:       vfmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub213ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a2
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a0, <16 x half> %neg)
@@ -1133,8 +1898,15 @@ define <16 x half> @stack_fold_fmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub231ph_maskz_ymm:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub231ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a0
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a1, <16 x half> %a2, <16 x half> %neg)
@@ -1145,8 +1917,15 @@ define <16 x half> @stack_fold_fmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub321ph_maskz_ymm:
-  ;CHECK:       vfmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub321ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a0
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a1, <16 x half> %neg)
@@ -1157,8 +1936,15 @@ define <16 x half> @stack_fold_fmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub132ph_maskz_ymm:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub132ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a1
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a0, <16 x half> %a2, <16 x half> %neg)
@@ -1169,8 +1955,15 @@ define <16 x half> @stack_fold_fmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmsub312ph_maskz_ymm:
-  ;CHECK:       vfmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmsub312ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a1
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %a2, <16 x half> %a0, <16 x half> %neg)
@@ -1181,8 +1974,14 @@ define <16 x half> @stack_fold_fmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half>
 }
 
 define <16 x half> @stack_fold_fnmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph_ymm:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a0
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a2)
@@ -1190,8 +1989,14 @@ define <16 x half> @stack_fold_fnmadd123ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph_ymm:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a1
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a2)
@@ -1199,8 +2004,14 @@ define <16 x half> @stack_fold_fnmadd213ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph_ymm:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a1
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a0)
@@ -1208,8 +2019,14 @@ define <16 x half> @stack_fold_fnmadd231ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph_ymm:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a2
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a1, <16 x half> %a0)
@@ -1217,8 +2034,14 @@ define <16 x half> @stack_fold_fnmadd321ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph_ymm:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a0
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a2, <16 x half> %a1)
@@ -1226,8 +2049,14 @@ define <16 x half> @stack_fold_fnmadd132ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph_ymm:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a2
   %3 = call <16 x half> @llvm.fma.v16f16(<16 x half> %2, <16 x half> %a0, <16 x half> %a1)
@@ -1235,8 +2064,17 @@ define <16 x half> @stack_fold_fnmadd312ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmadd123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph_mask_ymm:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a0
@@ -1247,8 +2085,17 @@ define <16 x half> @stack_fold_fnmadd123ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmadd213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph_mask_ymm:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a1
@@ -1259,8 +2106,17 @@ define <16 x half> @stack_fold_fnmadd213ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmadd231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph_mask_ymm:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a1
@@ -1271,8 +2127,17 @@ define <16 x half> @stack_fold_fnmadd231ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmadd321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph_mask_ymm:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a2
@@ -1283,8 +2148,17 @@ define <16 x half> @stack_fold_fnmadd321ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmadd132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph_mask_ymm:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a0
@@ -1295,8 +2169,17 @@ define <16 x half> @stack_fold_fnmadd132ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmadd312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph_mask_ymm:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a2
@@ -1307,8 +2190,15 @@ define <16 x half> @stack_fold_fnmadd312ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd123ph_maskz_ymm:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd123ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a0
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a2)
@@ -1319,8 +2209,15 @@ define <16 x half> @stack_fold_fnmadd123ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd213ph_maskz_ymm:
-  ;CHECK:       vfnmadd213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd213ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a1
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a2)
@@ -1331,8 +2228,15 @@ define <16 x half> @stack_fold_fnmadd213ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd231ph_maskz_ymm:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd231ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a1
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a0)
@@ -1343,8 +2247,15 @@ define <16 x half> @stack_fold_fnmadd231ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd321ph_maskz_ymm:
-  ;CHECK:       vfnmadd231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd321ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a2
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a1, <16 x half> %a0)
@@ -1355,8 +2266,15 @@ define <16 x half> @stack_fold_fnmadd321ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd132ph_maskz_ymm:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd132ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a0
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a2, <16 x half> %a1)
@@ -1367,8 +2285,15 @@ define <16 x half> @stack_fold_fnmadd132ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmadd312ph_maskz_ymm:
-  ;CHECK:       vfnmadd132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmadd312ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a2
   %2 = call <16 x half> @llvm.fma.v16f16(<16 x half> %neg, <16 x half> %a0, <16 x half> %a1)
@@ -1379,8 +2304,14 @@ define <16 x half> @stack_fold_fnmadd312ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph_ymm:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a0
   %3 = fneg <16 x half> %a2
@@ -1389,8 +2320,14 @@ define <16 x half> @stack_fold_fnmsub123ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph_ymm:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a1
   %3 = fneg <16 x half> %a2
@@ -1399,8 +2336,14 @@ define <16 x half> @stack_fold_fnmsub213ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph_ymm:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a1
   %3 = fneg <16 x half> %a0
@@ -1409,8 +2352,14 @@ define <16 x half> @stack_fold_fnmsub231ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph_ymm:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a2
   %3 = fneg <16 x half> %a0
@@ -1419,8 +2368,14 @@ define <16 x half> @stack_fold_fnmsub321ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph_ymm:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a0
   %3 = fneg <16 x half> %a1
@@ -1429,8 +2384,14 @@ define <16 x half> @stack_fold_fnmsub132ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph_ymm:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fneg <16 x half> %a2
   %3 = fneg <16 x half> %a1
@@ -1439,8 +2400,17 @@ define <16 x half> @stack_fold_fnmsub312ph_ymm(<16 x half> %a0, <16 x half> %a1,
 }
 
 define <16 x half> @stack_fold_fnmsub123ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph_mask_ymm:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a2
@@ -1452,8 +2422,17 @@ define <16 x half> @stack_fold_fnmsub123ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmsub213ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph_mask_ymm:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a2
@@ -1465,8 +2444,17 @@ define <16 x half> @stack_fold_fnmsub213ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmsub231ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph_mask_ymm:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a0
@@ -1478,8 +2466,17 @@ define <16 x half> @stack_fold_fnmsub231ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmsub321ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph_mask_ymm:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a0
@@ -1491,8 +2488,17 @@ define <16 x half> @stack_fold_fnmsub321ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmsub132ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph_mask_ymm:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a1
@@ -1504,8 +2510,17 @@ define <16 x half> @stack_fold_fnmsub132ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmsub312ph_mask_ymm(ptr %p, <16 x half> %a1, <16 x half> %a2, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph_mask_ymm:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <16 x half>, ptr %p
   %neg = fneg <16 x half> %a1
@@ -1517,8 +2532,15 @@ define <16 x half> @stack_fold_fnmsub312ph_mask_ymm(ptr %p, <16 x half> %a1, <16
 }
 
 define <16 x half> @stack_fold_fnmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub123ph_maskz_ymm:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub123ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a2
   %neg1 = fneg <16 x half> %a0
@@ -1530,8 +2552,15 @@ define <16 x half> @stack_fold_fnmsub123ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub213ph_maskz_ymm:
-  ;CHECK:       vfnmsub213ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub213ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a2
   %neg1 = fneg <16 x half> %a1
@@ -1543,8 +2572,15 @@ define <16 x half> @stack_fold_fnmsub213ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub231ph_maskz_ymm:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub231ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a0
   %neg1 = fneg <16 x half> %a1
@@ -1556,8 +2592,15 @@ define <16 x half> @stack_fold_fnmsub231ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub321ph_maskz_ymm:
-  ;CHECK:       vfnmsub231ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub321ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a0
   %neg1 = fneg <16 x half> %a2
@@ -1569,8 +2612,15 @@ define <16 x half> @stack_fold_fnmsub321ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub132ph_maskz_ymm:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub132ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a1
   %neg1 = fneg <16 x half> %a0
@@ -1582,8 +2632,15 @@ define <16 x half> @stack_fold_fnmsub132ph_maskz_ymm(<16 x half> %a0, <16 x half
 }
 
 define <16 x half> @stack_fold_fnmsub312ph_maskz_ymm(<16 x half> %a0, <16 x half> %a1, <16 x half> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fnmsub312ph_maskz_ymm:
-  ;CHECK:       vfnmsub132ph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fnmsub312ph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %neg = fneg <16 x half> %a1
   %neg1 = fneg <16 x half> %a2

diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
index f2fd7e3e88a0..e2ed997783f5 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512fp16 < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,24 +10,44 @@ target triple = "x86_64-unknown-unknown"
 ; relevant registers and check that the reload is correctly folded into the instruction.
 
 define <8 x half> @stack_fold_addph(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_addph
-  ;CHECK:       vaddph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_addph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vaddph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fadd <8 x half> %a0, %a1
   ret <8 x half> %2
 }
 
 define <16 x half> @stack_fold_addph_ymm(<16 x half> %a0, <16 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_addph_ymm
-  ;CHECK:       vaddph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_addph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vaddph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fadd <16 x half> %a0, %a1
   ret <16 x half> %2
 }
 
 define i8 @stack_fold_cmpph(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_cmpph
-  ;CHECK:       vcmpeqph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_cmpph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %res = call <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half> %a0, <8 x half> %a1, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   %2 = bitcast <8 x i1> %res to i8
@@ -35,8 +56,17 @@ define i8 @stack_fold_cmpph(<8 x half> %a0, <8 x half> %a1) {
 declare <8 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.128(<8 x half>, <8 x half>, i32,  <8 x i1>)
 
 define i16 @stack_fold_cmpph_ymm(<16 x half> %a0, <16 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_cmpph_ymm
-  ;CHECK:       vcmpeqph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_cmpph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vcmpeqph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 # 32-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %res = call <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half> %a0, <16 x half> %a1, i32 0, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   %2 = bitcast <16 x i1> %res to i16
@@ -45,24 +75,44 @@ define i16 @stack_fold_cmpph_ymm(<16 x half> %a0, <16 x half> %a1) {
 declare <16 x i1> @llvm.x86.avx512fp16.mask.cmp.ph.256(<16 x half>, <16 x half>, i32, <16 x i1>)
 
 define <8 x half> @stack_fold_divph(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_divph
-  ;CHECK:       vdivph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_divph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vdivph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fdiv <8 x half> %a0, %a1
   ret <8 x half> %2
 }
 
 define <16 x half> @stack_fold_divph_ymm(<16 x half> %a0, <16 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_divph_ymm
-  ;CHECK:       vdivph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_divph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vdivph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fdiv <16 x half> %a0, %a1
   ret <16 x half> %2
 }
 
 define i8 @stack_fold_fpclassph(<8 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_fpclassph:
-  ;CHECK:       fpclassphx $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fpclassph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfpclassphx $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 16-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4)
   %3 = bitcast <8 x i1> %2 to i8
@@ -71,8 +121,17 @@ define i8 @stack_fold_fpclassph(<8 x half> %a0) {
 declare <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half>, i32)
 
 define i8 @stack_fold_fpclassph_mask(<8 x half> %a0, ptr %p) {
-  ;CHECK-LABEL: stack_fold_fpclassph_mask:
-  ;CHECK:       fpclassphx $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fpclassph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfpclassphx $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x i1> @llvm.x86.avx512fp16.fpclass.ph.128(<8 x half> %a0, i32 4)
   %mask = load <8 x i1>, ptr %p
@@ -82,8 +141,17 @@ define i8 @stack_fold_fpclassph_mask(<8 x half> %a0, ptr %p) {
 }
 
 define i16 @stack_fold_fpclassph_ymm(<16 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_fpclassph_ymm:
-  ;CHECK:       fpclassphy $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fpclassph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfpclassphy $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 32-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4)
   %3 = bitcast <16 x i1> %2 to i16
@@ -92,8 +160,18 @@ define i16 @stack_fold_fpclassph_ymm(<16 x half> %a0) {
 declare <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half>, i32)
 
 define i16 @stack_fold_fpclassph_mask_ymm(<16 x half> %a0, ptr %p) {
-  ;CHECK-LABEL: stack_fold_fpclassph_mask_ymm:
-  ;CHECK:       fpclassphy $4, {{-?[0-9]*}}(%rsp), {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fpclassph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vfpclassphy $4, {{[-0-9]+}}(%r{{[sb]}}p), %k0 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x i1> @llvm.x86.avx512fp16.fpclass.ph.256(<16 x half> %a0, i32 4)
   %mask = load <16 x i1>, ptr %p
@@ -103,8 +181,14 @@ define i16 @stack_fold_fpclassph_mask_ymm(<16 x half> %a0, ptr %p) {
 }
 
 define <8 x half> @stack_fold_getexpph(<8 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_getexpph:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
   ret <8 x half> %2
@@ -112,8 +196,17 @@ define <8 x half> @stack_fold_getexpph(<8 x half> %a0) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half>, <8 x half>, i8)
 
 define <8 x half> @stack_fold_getexpph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_getexpph_mask:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
@@ -121,8 +214,15 @@ define <8 x half> @stack_fold_getexpph_mask(<8 x half> %a0, ptr %passthru, i8 %m
 }
 
 define <8 x half> @stack_fold_getexpph_maskz(<8 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_getexpph_maskz:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
@@ -130,8 +230,14 @@ define <8 x half> @stack_fold_getexpph_maskz(<8 x half> %a0, ptr %mask) {
 }
 
 define <16 x half> @stack_fold_getexpph_ymm(<16 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_getexpph_ymm:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
   ret <16 x half> %2
@@ -139,8 +245,17 @@ define <16 x half> @stack_fold_getexpph_ymm(<16 x half> %a0) {
 declare <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half>, <16 x half>, i16)
 
 define <16 x half> @stack_fold_getexpph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_getexpph_mask_ymm:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x half>, ptr %passthru
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
@@ -148,8 +263,15 @@ define <16 x half> @stack_fold_getexpph_mask_ymm(<16 x half> %a0, ptr %passthru,
 }
 
 define <16 x half> @stack_fold_getexpph_maskz_ymm(<16 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_getexpph_maskz_ymm:
-  ;CHECK:       vgetexpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_getexpph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vgetexpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getexp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
@@ -157,8 +279,14 @@ define <16 x half> @stack_fold_getexpph_maskz_ymm(<16 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_getmantph(<8 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_getmantph:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
   ret <8 x half> %2
@@ -166,8 +294,17 @@ define <8 x half> @stack_fold_getmantph(<8 x half> %a0) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half>, i32, <8 x half>, i8)
 
 define <8 x half> @stack_fold_getmantph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_getmantph_mask:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
@@ -175,8 +312,15 @@ define <8 x half> @stack_fold_getmantph_mask(<8 x half> %a0, ptr %passthru, i8 %
 }
 
 define <8 x half> @stack_fold_getmantph_maskz(<8 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_getmantph_maskz:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
@@ -184,8 +328,14 @@ define <8 x half> @stack_fold_getmantph_maskz(<8 x half> %a0, ptr %mask) {
 }
 
 define <16 x half> @stack_fold_getmantph_ymm(<16 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_getmantph_ymm:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
   ret <16 x half> %2
@@ -193,8 +343,17 @@ define <16 x half> @stack_fold_getmantph_ymm(<16 x half> %a0) {
 declare <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half>, i32, <16 x half>, i16)
 
 define <16 x half> @stack_fold_getmantph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_getmantph_mask_ymm:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x half>, ptr %passthru
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
@@ -202,8 +361,15 @@ define <16 x half> @stack_fold_getmantph_mask_ymm(<16 x half> %a0, ptr %passthru
 }
 
 define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_getmantph_maskz_ymm:
-  ;CHECK:       vgetmantph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_getmantph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vgetmantph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.getmant.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
@@ -211,8 +377,14 @@ define <16 x half> @stack_fold_getmantph_maskz_ymm(<16 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_maxph
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1)
   ret <8 x half> %2
@@ -220,16 +392,28 @@ define <8 x half> @stack_fold_maxph(<8 x half> %a0, <8 x half> %a1) #0 {
 declare <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half>, <8 x half>) nounwind readnone
 
 define <8 x half> @stack_fold_maxph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
-  ;CHECK-LABEL: stack_fold_maxph_commutable
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_commutable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> %a0, <8 x half> %a1)
   ret <8 x half> %2
 }
 
 define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_maxph_ymm
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1)
   ret <16 x half> %2
@@ -237,16 +421,28 @@ define <16 x half> @stack_fold_maxph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
 declare <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half>, <16 x half>) nounwind readnone
 
 define <16 x half> @stack_fold_maxph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
-  ;CHECK-LABEL: stack_fold_maxph_ymm_commutable
-  ;CHECK:       vmaxph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_maxph_ymm_commutable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmaxph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> %a0, <16 x half> %a1)
   ret <16 x half> %2
 }
 
 define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_minph
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1)
   ret <8 x half> %2
@@ -254,16 +450,28 @@ define <8 x half> @stack_fold_minph(<8 x half> %a0, <8 x half> %a1) #0 {
 declare <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half>, <8 x half>) nounwind readnone
 
 define <8 x half> @stack_fold_minph_commutable(<8 x half> %a0, <8 x half> %a1) #1 {
-  ;CHECK-LABEL: stack_fold_minph_commutable
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_commutable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> %a0, <8 x half> %a1)
   ret <8 x half> %2
 }
 
 define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
-  ;CHECK-LABEL: stack_fold_minph_ymm
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1)
   ret <16 x half> %2
@@ -271,32 +479,56 @@ define <16 x half> @stack_fold_minph_ymm(<16 x half> %a0, <16 x half> %a1) #0 {
 declare <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half>, <16 x half>) nounwind readnone
 
 define <16 x half> @stack_fold_minph_ymm_commutable(<16 x half> %a0, <16 x half> %a1) #1 {
-  ;CHECK-LABEL: stack_fold_minph_ymm_commutable
-  ;CHECK:       vminph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_minph_ymm_commutable:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vminph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> %a0, <16 x half> %a1)
   ret <16 x half> %2
 }
 
 define <8 x half> @stack_fold_mulph(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_mulph
-  ;CHECK:       vmulph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_mulph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmulph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fmul <8 x half> %a0, %a1
   ret <8 x half> %2
 }
 
 define <16 x half> @stack_fold_mulph_ymm(<16 x half> %a0, <16 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_mulph_ymm
-  ;CHECK:       vmulph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_mulph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmulph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fmul <16 x half> %a0, %a1
   ret <16 x half> %2
 }
 
 define <8 x half> @stack_fold_rcpph(<8 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rcpph:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
   ret <8 x half> %2
@@ -304,8 +536,17 @@ define <8 x half> @stack_fold_rcpph(<8 x half> %a0) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half>, <8 x half>, i8)
 
 define <8 x half> @stack_fold_rcpph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_rcpph_mask:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
@@ -313,8 +554,15 @@ define <8 x half> @stack_fold_rcpph_mask(<8 x half> %a0, ptr %passthru, i8 %mask
 }
 
 define <8 x half> @stack_fold_rcpph_maskz(<8 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rcpph_maskz:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
@@ -322,8 +570,14 @@ define <8 x half> @stack_fold_rcpph_maskz(<8 x half> %a0, ptr %mask) {
 }
 
 define <16 x half> @stack_fold_rcpph_ymm(<16 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rcpph_ymm:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
   ret <16 x half> %2
@@ -331,8 +585,17 @@ define <16 x half> @stack_fold_rcpph_ymm(<16 x half> %a0) {
 declare <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half>, <16 x half>, i16)
 
 define <16 x half> @stack_fold_rcpph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_rcpph_mask_ymm:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x half>, ptr %passthru
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
@@ -340,8 +603,15 @@ define <16 x half> @stack_fold_rcpph_mask_ymm(<16 x half> %a0, ptr %passthru, i1
 }
 
 define <16 x half> @stack_fold_rcpph_maskz_ymm(<16 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rcpph_maskz_ymm:
-  ;CHECK:       vrcpph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rcpph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vrcpph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rcp.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
@@ -349,8 +619,14 @@ define <16 x half> @stack_fold_rcpph_maskz_ymm(<16 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_reduceph(<8 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_reduceph:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
   ret <8 x half> %2
@@ -358,8 +634,17 @@ define <8 x half> @stack_fold_reduceph(<8 x half> %a0) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half>, i32, <8 x half>, i8)
 
 define <8 x half> @stack_fold_reduceph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_reduceph_mask:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
@@ -367,8 +652,15 @@ define <8 x half> @stack_fold_reduceph_mask(<8 x half> %a0, ptr %passthru, i8 %m
 }
 
 define <8 x half> @stack_fold_reduceph_maskz(<8 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_reduceph_maskz:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
@@ -376,8 +668,14 @@ define <8 x half> @stack_fold_reduceph_maskz(<8 x half> %a0, ptr %mask) {
 }
 
 define <16 x half> @stack_fold_reduceph_ymm(<16 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_reduceph_ymm:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
   ret <16 x half> %2
@@ -385,8 +683,17 @@ define <16 x half> @stack_fold_reduceph_ymm(<16 x half> %a0) {
 declare <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half>, i32, <16 x half>, i16)
 
 define <16 x half> @stack_fold_reduceph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_reduceph_mask_ymm:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x half>, ptr %passthru
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
@@ -394,8 +701,15 @@ define <16 x half> @stack_fold_reduceph_mask_ymm(<16 x half> %a0, ptr %passthru,
 }
 
 define <16 x half> @stack_fold_reduceph_maskz_ymm(<16 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_reduceph_maskz_ymm:
-  ;CHECK:       vreduceph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_reduceph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vreduceph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.reduce.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
@@ -403,8 +717,14 @@ define <16 x half> @stack_fold_reduceph_maskz_ymm(<16 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_rndscaleph(<8 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rndscaleph:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> undef, i8 -1)
   ret <8 x half> %2
@@ -412,8 +732,17 @@ define <8 x half> @stack_fold_rndscaleph(<8 x half> %a0) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half>, i32, <8 x half>, i8)
 
 define <8 x half> @stack_fold_rndscaleph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_rndscaleph_mask:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> %2, i8 %mask)
@@ -421,8 +750,15 @@ define <8 x half> @stack_fold_rndscaleph_mask(<8 x half> %a0, ptr %passthru, i8
 }
 
 define <8 x half> @stack_fold_rndscaleph_maskz(<8 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rndscaleph_maskz:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.128(<8 x half> %a0, i32 8, <8 x half> zeroinitializer, i8 %2)
@@ -430,8 +766,14 @@ define <8 x half> @stack_fold_rndscaleph_maskz(<8 x half> %a0, ptr %mask) {
 }
 
 define <16 x half> @stack_fold_rndscaleph_ymm(<16 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rndscaleph_ymm:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> undef, i16 -1)
   ret <16 x half> %2
@@ -439,8 +781,17 @@ define <16 x half> @stack_fold_rndscaleph_ymm(<16 x half> %a0) {
 declare <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half>, i32, <16 x half>, i16)
 
 define <16 x half> @stack_fold_rndscaleph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_rndscaleph_mask_ymm:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x half>, ptr %passthru
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> %2, i16 %mask)
@@ -448,8 +799,15 @@ define <16 x half> @stack_fold_rndscaleph_mask_ymm(<16 x half> %a0, ptr %passthr
 }
 
 define <16 x half> @stack_fold_rndscaleph_maskz_ymm(<16 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rndscaleph_maskz_ymm:
-  ;CHECK:       vrndscaleph $8, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rndscaleph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vrndscaleph $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256(<16 x half> %a0, i32 8, <16 x half> zeroinitializer, i16 %2)
@@ -457,8 +815,14 @@ define <16 x half> @stack_fold_rndscaleph_maskz_ymm(<16 x half> %a0, ptr %mask)
 }
 
 define <8 x half> @stack_fold_rsqrtph(<8 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rsqrtph:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> undef, i8 -1)
   ret <8 x half> %2
@@ -466,8 +830,17 @@ define <8 x half> @stack_fold_rsqrtph(<8 x half> %a0) {
 declare <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half>, <8 x half>, i8)
 
 define <8 x half> @stack_fold_rsqrtph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_rsqrtph_mask:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> %2, i8 %mask)
@@ -475,8 +848,15 @@ define <8 x half> @stack_fold_rsqrtph_mask(<8 x half> %a0, ptr %passthru, i8 %ma
 }
 
 define <8 x half> @stack_fold_rsqrtph_maskz(<8 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rsqrtph_maskz:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.128(<8 x half> %a0, <8 x half> zeroinitializer, i8 %2)
@@ -484,8 +864,14 @@ define <8 x half> @stack_fold_rsqrtph_maskz(<8 x half> %a0, ptr %mask) {
 }
 
 define <16 x half> @stack_fold_rsqrtph_ymm(<16 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_rsqrtph_ymm:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> undef, i16 -1)
   ret <16 x half> %2
@@ -493,8 +879,17 @@ define <16 x half> @stack_fold_rsqrtph_ymm(<16 x half> %a0) {
 declare <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half>, <16 x half>, i16)
 
 define <16 x half> @stack_fold_rsqrtph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_rsqrtph_mask_ymm:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x half>, ptr %passthru
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> %2, i16 %mask)
@@ -502,8 +897,15 @@ define <16 x half> @stack_fold_rsqrtph_mask_ymm(<16 x half> %a0, ptr %passthru,
 }
 
 define <16 x half> @stack_fold_rsqrtph_maskz_ymm(<16 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_rsqrtph_maskz_ymm:
-  ;CHECK:       vrsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_rsqrtph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vrsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.256(<16 x half> %a0, <16 x half> zeroinitializer, i16 %2)
@@ -511,8 +913,14 @@ define <16 x half> @stack_fold_rsqrtph_maskz_ymm(<16 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_sqrtph(<8 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_sqrtph:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
   ret <8 x half> %2
@@ -520,8 +928,17 @@ define <8 x half> @stack_fold_sqrtph(<8 x half> %a0) {
 declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
 
 define <8 x half> @stack_fold_sqrtph_mask(<8 x half> %a0, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_sqrtph_mask:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm1
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x half>, ptr %passthru
   %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
@@ -531,8 +948,15 @@ define <8 x half> @stack_fold_sqrtph_mask(<8 x half> %a0, ptr %passthru, i8 %mas
 }
 
 define <8 x half> @stack_fold_sqrtph_maskz(<8 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_sqrtph_maskz:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a0)
@@ -542,8 +966,14 @@ define <8 x half> @stack_fold_sqrtph_maskz(<8 x half> %a0, ptr %mask) {
 }
 
 define <16 x half> @stack_fold_sqrtph_ymm(<16 x half> %a0) {
-  ;CHECK-LABEL: stack_fold_sqrtph_ymm:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
   ret <16 x half> %2
@@ -551,8 +981,17 @@ define <16 x half> @stack_fold_sqrtph_ymm(<16 x half> %a0) {
 declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
 
 define <16 x half> @stack_fold_sqrtph_mask_ymm(<16 x half> %a0, ptr %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_sqrtph_mask_ymm:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm1
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <16 x half>, ptr %passthru
   %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
@@ -562,8 +1001,15 @@ define <16 x half> @stack_fold_sqrtph_mask_ymm(<16 x half> %a0, ptr %passthru, i
 }
 
 define <16 x half> @stack_fold_sqrtph_maskz_ymm(<16 x half> %a0, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_sqrtph_maskz_ymm:
-  ;CHECK:       vsqrtph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_sqrtph_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovw (%rdi), %k1
+; CHECK-NEXT:    vsqrtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i16, ptr %mask
   %3 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %a0)
@@ -573,24 +1019,43 @@ define <16 x half> @stack_fold_sqrtph_maskz_ymm(<16 x half> %a0, ptr %mask) {
 }
 
 define <8 x half> @stack_fold_subph(<8 x half> %a0, <8 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_subph
-  ;CHECK:       vsubph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_subph:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsubph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fsub <8 x half> %a0, %a1
   ret <8 x half> %2
 }
 
 define <16 x half> @stack_fold_subph_ymm(<16 x half> %a0, <16 x half> %a1) {
-  ;CHECK-LABEL: stack_fold_subph_ymm
-  ;CHECK:       vsubph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_subph_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vsubph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = fsub <16 x half> %a0, %a1
   ret <16 x half> %2
 }
 
 define <4 x float> @stack_fold_fmulc(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fmulc:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
   ret <4 x float> %2
@@ -598,8 +1063,17 @@ define <4 x float> @stack_fold_fmulc(<4 x float> %a0, <4 x float> %a1) {
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
 
 define <4 x float> @stack_fold_fmulc_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmulc_mask:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulc_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <4 x float>, ptr %passthru
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask)
@@ -607,8 +1081,16 @@ define <4 x float> @stack_fold_fmulc_mask(<4 x float> %a0, <4 x float> %a1, ptr
 }
 
 define <4 x float> @stack_fold_fmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmulc_maskz:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulc_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
@@ -616,8 +1098,15 @@ define <4 x float> @stack_fold_fmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr
 }
 
 define <4 x float> @stack_fold_fcmulc(<4 x float> %a0, <4 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fcmulc:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1)
   ret <4 x float> %2
@@ -625,8 +1114,17 @@ define <4 x float> @stack_fold_fcmulc(<4 x float> %a0, <4 x float> %a1) {
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
 
 define <4 x float> @stack_fold_fcmulc_mask(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fcmulc_mask:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulc_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <4 x float>, ptr %passthru
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %2, i8 %mask)
@@ -634,8 +1132,16 @@ define <4 x float> @stack_fold_fcmulc_mask(<4 x float> %a0, <4 x float> %a1, ptr
 }
 
 define <4 x float> @stack_fold_fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fcmulc_maskz:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulc_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %2)
@@ -643,8 +1149,14 @@ define <4 x float> @stack_fold_fcmulc_maskz(<4 x float> %a0, <4 x float> %a1, pt
 }
 
 define <4 x float> @stack_fold_fmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddc:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1)
   ret <4 x float> %2
@@ -652,8 +1164,17 @@ define <4 x float> @stack_fold_fmaddc(<4 x float> %a0, <4 x float> %a1, <4 x flo
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
 
 define <4 x float> @stack_fold_fmaddc_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddc_mask:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddc_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <4 x float>, ptr %p
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask)
@@ -661,8 +1182,16 @@ define <4 x float> @stack_fold_fmaddc_mask(ptr %p, <4 x float> %a1, <4 x float>
 }
 
 define <4 x float> @stack_fold_fmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddc_maskz:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddc_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2)
@@ -671,8 +1200,14 @@ define <4 x float> @stack_fold_fmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4
 declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
 
 define <4 x float> @stack_fold_fcmaddc(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fcmaddc:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddc:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1)
   ret <4 x float> %2
@@ -680,8 +1215,17 @@ define <4 x float> @stack_fold_fcmaddc(<4 x float> %a0, <4 x float> %a1, <4 x fl
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
 
 define <4 x float> @stack_fold_fcmaddc_mask(ptr %p, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fcmaddc_mask:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddc_mask:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %xmm2
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <4 x float>, ptr %p
   %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask)
@@ -689,8 +1233,16 @@ define <4 x float> @stack_fold_fcmaddc_mask(ptr %p, <4 x float> %a1, <4 x float>
 }
 
 define <4 x float> @stack_fold_fcmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fcmaddc_maskz:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddc_maskz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2)
@@ -699,8 +1251,15 @@ define <4 x float> @stack_fold_fcmaddc_maskz(<4 x float> %a0, <4 x float> %a1, <
 declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
 
 define <8 x float> @stack_fold_fmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fmulc_ymm:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulc_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
   ret <8 x float> %2
@@ -708,8 +1267,17 @@ define <8 x float> @stack_fold_fmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 
 define <8 x float> @stack_fold_fmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmulc_mask_ymm:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulc_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x float>, ptr %passthru
   %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask)
@@ -717,8 +1285,16 @@ define <8 x float> @stack_fold_fmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1,
 }
 
 define <8 x float> @stack_fold_fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmulc_maskz_ymm:
-  ;CHECK:       vfmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmulc_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
@@ -726,8 +1302,15 @@ define <8 x float> @stack_fold_fmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1,
 }
 
 define <8 x float> @stack_fold_fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
-  ;CHECK-LABEL: stack_fold_fcmulc_ymm:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulc_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> undef, i8 -1)
   ret <8 x float> %2
@@ -735,8 +1318,17 @@ define <8 x float> @stack_fold_fcmulc_ymm(<8 x float> %a0, <8 x float> %a1) {
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 
 define <8 x float> @stack_fold_fcmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1, ptr %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fcmulc_mask_ymm:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulc_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load <8 x float>, ptr %passthru
   %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %2, i8 %mask)
@@ -744,8 +1336,16 @@ define <8 x float> @stack_fold_fcmulc_mask_ymm(<8 x float> %a0, <8 x float> %a1,
 }
 
 define <8 x float> @stack_fold_fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fcmulc_maskz_ymm:
-  ;CHECK:       vfcmulcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmulc_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vfcmulcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer, i8 %2)
@@ -753,8 +1353,14 @@ define <8 x float> @stack_fold_fcmulc_maskz_ymm(<8 x float> %a0, <8 x float> %a1
 }
 
 define <8 x float> @stack_fold_fmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fmaddc_ymm:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddc_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 -1)
   ret <8 x float> %2
@@ -762,8 +1368,17 @@ define <8 x float> @stack_fold_fmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 
 define <8 x float> @stack_fold_fmaddc_mask_ymm(ptr %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddc_mask_ymm:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddc_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x float>, ptr %p
   %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 %mask)
@@ -771,8 +1386,16 @@ define <8 x float> @stack_fold_fmaddc_mask_ymm(ptr %p, <8 x float> %a1, <8 x flo
 }
 
 define <8 x float> @stack_fold_fmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fmaddc_maskz_ymm:
-  ;CHECK:       vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fmaddc_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vfmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> zeroinitializer, i8 %2)
@@ -781,8 +1404,14 @@ define <8 x float> @stack_fold_fmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1
 declare <8 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 
 define <8 x float> @stack_fold_fcmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
-  ;CHECK-LABEL: stack_fold_fcmaddc_ymm:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddc_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 -1)
   ret <8 x float> %2
@@ -790,8 +1419,17 @@ define <8 x float> @stack_fold_fcmaddc_ymm(<8 x float> %a0, <8 x float> %a1, <8
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 
 define <8 x float> @stack_fold_fcmaddc_mask_ymm(ptr %p, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_fcmaddc_mask_ymm:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddc_mask_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vmovaps (%rdi), %ymm2
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %a0 = load <8 x float>, ptr %p
   %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 %mask)
@@ -799,8 +1437,16 @@ define <8 x float> @stack_fold_fcmaddc_mask_ymm(ptr %p, <8 x float> %a1, <8 x fl
 }
 
 define <8 x float> @stack_fold_fcmaddc_maskz_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, ptr %mask) {
-  ;CHECK-LABEL: stack_fold_fcmaddc_maskz_ymm:
-  ;CHECK:       vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_fcmaddc_maskz_ymm:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    kmovb (%rdi), %k1
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vfcmaddcph {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = load i8, ptr %mask
   %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> zeroinitializer, i8 %2)