[llvm] r328451 - [X86] Update cost model for Goldmont. Add fsqrt costs for Silvermont

Sun Mar 25 08:58:13 PDT 2018

Author: ctopper
Date: Sun Mar 25 08:58:12 2018
New Revision: 328451

URL: http://llvm.org/viewvc/llvm-project?rev=328451&view=rev
Log:
[X86] Update cost model for Goldmont. Add fsqrt costs for Silvermont

Add fdiv costs for Goldmont using table 16-17 of the Intel Optimization Manual. Also add overrides for FSQRT for Goldmont and Silvermont.

Reviewers: RKSimon

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D44644

Modified:
    llvm/trunk/lib/Target/X86/X86Subtarget.h
    llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
    llvm/trunk/test/Analysis/CostModel/X86/arith.ll

Modified: llvm/trunk/lib/Target/X86/X86Subtarget.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86Subtarget.h?rev=328451&r1=328450&r2=328451&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86Subtarget.h (original)
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h Sun Mar 25 08:58:12 2018
@@ -655,6 +655,7 @@ public:
   /// TODO: to be removed later and replaced with suitable properties
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
   bool isSLM() const { return X86ProcFamily == IntelSLM; }
+  bool isGLM() const { return X86ProcFamily == IntelGLM; }
   bool useSoftFloat() const { return UseSoftFloat; }
 
   /// Use mfence if we have SSE2 or we're on x86-64 (even if we asked for

Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=328451&r1=328450&r2=328451&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Sun Mar 25 08:58:12 2018
@@ -181,28 +181,40 @@ int X86TTIImpl::getArithmeticInstrCost(
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  static const CostTblEntry GLMCostTable[] = {
+    { ISD::FDIV,  MVT::f32,   18 }, // divss
+    { ISD::FDIV,  MVT::v4f32, 35 }, // divps
+    { ISD::FDIV,  MVT::f64,   33 }, // divsd
+    { ISD::FDIV,  MVT::v2f64, 65 }, // divpd
+  };
+
+  if (ST->isGLM())
+    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+
   static const CostTblEntry SLMCostTable[] = {
-    { ISD::MUL,  MVT::v4i32, 11 }, // pmulld
-    { ISD::MUL,  MVT::v8i16, 2  }, // pmullw
-    { ISD::MUL,  MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
-    { ISD::FMUL, MVT::f64,   2  }, // mulsd
-    { ISD::FMUL, MVT::v2f64, 4  }, // mulpd
-    { ISD::FMUL, MVT::v4f32, 2  }, // mulps
-    { ISD::FDIV, MVT::f32,   17 }, // divss
-    { ISD::FDIV, MVT::v4f32, 39 }, // divps
-    { ISD::FDIV, MVT::f64,   32 }, // divsd
-    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
-    { ISD::FADD, MVT::v2f64, 2  }, // addpd
-    { ISD::FSUB, MVT::v2f64, 2  }, // subpd
+    { ISD::MUL,   MVT::v4i32, 11 }, // pmulld
+    { ISD::MUL,   MVT::v8i16, 2  }, // pmullw
+    { ISD::MUL,   MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
+    { ISD::FMUL,  MVT::f64,   2  }, // mulsd
+    { ISD::FMUL,  MVT::v2f64, 4  }, // mulpd
+    { ISD::FMUL,  MVT::v4f32, 2  }, // mulps
+    { ISD::FDIV,  MVT::f32,   17 }, // divss
+    { ISD::FDIV,  MVT::v4f32, 39 }, // divps
+    { ISD::FDIV,  MVT::f64,   32 }, // divsd
+    { ISD::FDIV,  MVT::v2f64, 69 }, // divpd
+    { ISD::FADD,  MVT::v2f64, 2  }, // addpd
+    { ISD::FSUB,  MVT::v2f64, 2  }, // subpd
     // v2i64/v4i64 mul is custom lowered as a series of long:
     // multiplies(3), shifts(3) and adds(2)
     // slm muldq version throughput is 2 and addq throughput 4
     // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
     //       3X4 (addq throughput) = 17
-    { ISD::MUL,  MVT::v2i64, 17 },
+    { ISD::MUL,   MVT::v2i64, 17 },
     // slm addq\subq throughput is 4
-    { ISD::ADD,  MVT::v2i64, 4  },
-    { ISD::SUB,  MVT::v2i64, 4  },
+    { ISD::ADD,   MVT::v2i64, 4  },
+    { ISD::SUB,   MVT::v2i64, 4  },
   };
 
   if (ST->isSLM()) {
@@ -225,6 +237,7 @@ int X86TTIImpl::getArithmeticInstrCost(
       if (!signedMode && OpMinSize <= 16)
         return LT.first * 5; // pmullw/pmulhw/pshuf
     }
+
     if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                             LT.second)) {
       return LT.first * Entry->Cost;
@@ -1665,6 +1678,18 @@ int X86TTIImpl::getIntrinsicInstrCost(In
     { ISD::FSQRT,      MVT::v2f64,  21 }, // SNB from http://www.agner.org/
     { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
   };
+  static const CostTblEntry GLMCostTbl[] = {
+    { ISD::FSQRT, MVT::f32,   19 }, // sqrtss
+    { ISD::FSQRT, MVT::v4f32, 37 }, // sqrtps
+    { ISD::FSQRT, MVT::f64,   34 }, // sqrtsd
+    { ISD::FSQRT, MVT::v2f64, 67 }, // sqrtpd
+  };
+  static const CostTblEntry SLMCostTbl[] = {
+    { ISD::FSQRT, MVT::f32,   20 }, // sqrtss
+    { ISD::FSQRT, MVT::v4f32, 40 }, // sqrtps
+    { ISD::FSQRT, MVT::f64,   35 }, // sqrtsd
+    { ISD::FSQRT, MVT::v2f64, 70 }, // sqrtpd
+  };
   static const CostTblEntry SSE42CostTbl[] = {
     { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
     { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
@@ -1755,6 +1780,14 @@ int X86TTIImpl::getIntrinsicInstrCost(In
   MVT MTy = LT.second;
 
   // Attempt to lookup cost.
+  if (ST->isGLM())
+    if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (ST->isSLM())
+    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
   if (ST->hasCDI())
     if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
       return LT.first * Entry->Cost;

Modified: llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll?rev=328451&r1=328450&r2=328451&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/arith-fp.ll Sun Mar 25 08:58:12 2018
@@ -5,6 +5,7 @@
 ; RUN: opt < %s -enable-no-nans-fp-math  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: opt < %s -enable-no-nans-fp-math  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
 ; RUN: opt < %s -enable-no-nans-fp-math  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -enable-no-nans-fp-math  -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=GLM
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -17,6 +18,7 @@ define i32 @fadd(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F32 = fadd
   ; AVX512: cost of 1 {{.*}} %F32 = fadd
   ; SLM: cost of 1 {{.*}} %F32 = fadd
+  ; GLM: cost of 1 {{.*}} %F32 = fadd
   %F32 = fadd float undef, undef
   ; SSE2: cost of 2 {{.*}} %V4F32 = fadd
   ; SSE42: cost of 1 {{.*}} %V4F32 = fadd
@@ -24,6 +26,7 @@ define i32 @fadd(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V4F32 = fadd
   ; AVX512: cost of 1 {{.*}} %V4F32 = fadd
   ; SLM: cost of 1 {{.*}} %V4F32 = fadd
+  ; GLM: cost of 1 {{.*}} %V4F32 = fadd
   %V4F32 = fadd <4 x float> undef, undef
   ; SSE2: cost of 4 {{.*}} %V8F32 = fadd
   ; SSE42: cost of 2 {{.*}} %V8F32 = fadd
@@ -31,6 +34,7 @@ define i32 @fadd(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V8F32 = fadd
   ; AVX512: cost of 1 {{.*}} %V8F32 = fadd
   ; SLM: cost of 2 {{.*}} %V8F32 = fadd
+  ; GLM: cost of 2 {{.*}} %V8F32 = fadd
   %V8F32 = fadd <8 x float> undef, undef
   ; SSE2: cost of 8 {{.*}} %V16F32 = fadd
   ; SSE42: cost of 4 {{.*}} %V16F32 = fadd
@@ -38,6 +42,7 @@ define i32 @fadd(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V16F32 = fadd
   ; AVX512: cost of 1 {{.*}} %V16F32 = fadd
   ; SLM: cost of 4 {{.*}} %V16F32 = fadd
+  ; GLM: cost of 4 {{.*}} %V16F32 = fadd
   %V16F32 = fadd <16 x float> undef, undef
 
   ; SSE2: cost of 2 {{.*}} %F64 = fadd
@@ -46,6 +51,7 @@ define i32 @fadd(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F64 = fadd
   ; AVX512: cost of 1 {{.*}} %F64 = fadd
   ; SLM: cost of 1 {{.*}} %F64 = fadd
+  ; GLM: cost of 1 {{.*}} %F64 = fadd
   %F64 = fadd double undef, undef
   ; SSE2: cost of 2 {{.*}} %V2F64 = fadd
   ; SSE42: cost of 1 {{.*}} %V2F64 = fadd
@@ -53,6 +59,7 @@ define i32 @fadd(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V2F64 = fadd
   ; AVX512: cost of 1 {{.*}} %V2F64 = fadd
   ; SLM: cost of 2 {{.*}} %V2F64 = fadd
+  ; GLM: cost of 1 {{.*}} %V2F64 = fadd
   %V2F64 = fadd <2 x double> undef, undef
   ; SSE2: cost of 4 {{.*}} %V4F64 = fadd
   ; SSE42: cost of 2 {{.*}} %V4F64 = fadd
@@ -60,6 +67,7 @@ define i32 @fadd(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V4F64 = fadd
   ; AVX512: cost of 1 {{.*}} %V4F64 = fadd
   ; SLM: cost of 4 {{.*}} %V4F64 = fadd
+  ; GLM: cost of 2 {{.*}} %V4F64 = fadd
   %V4F64 = fadd <4 x double> undef, undef
   ; SSE2: cost of 8 {{.*}} %V8F64 = fadd
   ; SSE42: cost of 4 {{.*}} %V8F64 = fadd
@@ -67,6 +75,7 @@ define i32 @fadd(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V8F64 = fadd
   ; AVX512: cost of 1 {{.*}} %V8F64 = fadd
   ; SLM: cost of 8 {{.*}} %V8F64 = fadd
+  ; GLM: cost of 4 {{.*}} %V8F64 = fadd
   %V8F64 = fadd <8 x double> undef, undef
 
   ret i32 undef
@@ -80,6 +89,7 @@ define i32 @fsub(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F32 = fsub
   ; AVX512: cost of 1 {{.*}} %F32 = fsub
   ; SLM: cost of 1 {{.*}} %F32 = fsub
+  ; GLM: cost of 1 {{.*}} %F32 = fsub
   %F32 = fsub float undef, undef
   ; SSE2: cost of 2 {{.*}} %V4F32 = fsub
   ; SSE42: cost of 1 {{.*}} %V4F32 = fsub
@@ -87,6 +97,7 @@ define i32 @fsub(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V4F32 = fsub
   ; AVX512: cost of 1 {{.*}} %V4F32 = fsub
   ; SLM: cost of 1 {{.*}} %V4F32 = fsub
+  ; GLM: cost of 1 {{.*}} %V4F32 = fsub
   %V4F32 = fsub <4 x float> undef, undef
   ; SSE2: cost of 4 {{.*}} %V8F32 = fsub
   ; SSE42: cost of 2 {{.*}} %V8F32 = fsub
@@ -94,6 +105,7 @@ define i32 @fsub(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V8F32 = fsub
   ; AVX512: cost of 1 {{.*}} %V8F32 = fsub
   ; SLM: cost of 2 {{.*}} %V8F32 = fsub
+  ; GLM: cost of 2 {{.*}} %V8F32 = fsub
   %V8F32 = fsub <8 x float> undef, undef
   ; SSE2: cost of 8 {{.*}} %V16F32 = fsub
   ; SSE42: cost of 4 {{.*}} %V16F32 = fsub
@@ -101,6 +113,7 @@ define i32 @fsub(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V16F32 = fsub
   ; AVX512: cost of 1 {{.*}} %V16F32 = fsub
   ; SLM: cost of 4 {{.*}} %V16F32 = fsub
+  ; GLM: cost of 4 {{.*}} %V16F32 = fsub
   %V16F32 = fsub <16 x float> undef, undef
 
   ; SSE2: cost of 2 {{.*}} %F64 = fsub
@@ -109,6 +122,7 @@ define i32 @fsub(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F64 = fsub
   ; AVX512: cost of 1 {{.*}} %F64 = fsub
   ; SLM: cost of 1 {{.*}} %F64 = fsub
+  ; GLM: cost of 1 {{.*}} %F64 = fsub
   %F64 = fsub double undef, undef
   ; SSE2: cost of 2 {{.*}} %V2F64 = fsub
   ; SSE42: cost of 1 {{.*}} %V2F64 = fsub
@@ -116,6 +130,7 @@ define i32 @fsub(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V2F64 = fsub
   ; AVX512: cost of 1 {{.*}} %V2F64 = fsub
   ; SLM: cost of 2 {{.*}} %V2F64 = fsub
+  ; GLM: cost of 1 {{.*}} %V2F64 = fsub
   %V2F64 = fsub <2 x double> undef, undef
   ; SSE2: cost of 4 {{.*}} %V4F64 = fsub
   ; SSE42: cost of 2 {{.*}} %V4F64 = fsub
@@ -123,6 +138,7 @@ define i32 @fsub(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V4F64 = fsub
   ; AVX512: cost of 1 {{.*}} %V4F64 = fsub
   ; SLM: cost of 4 {{.*}} %V4F64 = fsub
+  ; GLM: cost of 2 {{.*}} %V4F64 = fsub
   %V4F64 = fsub <4 x double> undef, undef
   ; SSE2: cost of 8 {{.*}} %V8F64 = fsub
   ; SSE42: cost of 4 {{.*}} %V8F64 = fsub
@@ -130,6 +146,7 @@ define i32 @fsub(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V8F64 = fsub
   ; AVX512: cost of 1 {{.*}} %V8F64 = fsub
   ; SLM: cost of 8 {{.*}} %V8F64 = fsub
+  ; GLM: cost of 4 {{.*}} %V8F64 = fsub
   %V8F64 = fsub <8 x double> undef, undef
 
   ret i32 undef
@@ -143,6 +160,7 @@ define i32 @fmul(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F32 = fmul
   ; AVX512: cost of 1 {{.*}} %F32 = fmul
   ; SLM: cost of 1 {{.*}} %F32 = fmul
+  ; GLM: cost of 1 {{.*}} %F32 = fmul
   %F32 = fmul float undef, undef
   ; SSE2: cost of 2 {{.*}} %V4F32 = fmul
   ; SSE42: cost of 1 {{.*}} %V4F32 = fmul
@@ -150,6 +168,7 @@ define i32 @fmul(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V4F32 = fmul
   ; AVX512: cost of 1 {{.*}} %V4F32 = fmul
   ; SLM: cost of 2 {{.*}} %V4F32 = fmul
+  ; GLM: cost of 1 {{.*}} %V4F32 = fmul
   %V4F32 = fmul <4 x float> undef, undef
   ; SSE2: cost of 4 {{.*}} %V8F32 = fmul
   ; SSE42: cost of 2 {{.*}} %V8F32 = fmul
@@ -157,6 +176,7 @@ define i32 @fmul(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V8F32 = fmul
   ; AVX512: cost of 1 {{.*}} %V8F32 = fmul
   ; SLM: cost of 4 {{.*}} %V8F32 = fmul
+  ; GLM: cost of 2 {{.*}} %V8F32 = fmul
   %V8F32 = fmul <8 x float> undef, undef
   ; SSE2: cost of 8 {{.*}} %V16F32 = fmul
   ; SSE42: cost of 4 {{.*}} %V16F32 = fmul
@@ -164,6 +184,7 @@ define i32 @fmul(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V16F32 = fmul
   ; AVX512: cost of 1 {{.*}} %V16F32 = fmul
   ; SLM: cost of 8 {{.*}} %V16F32 = fmul
+  ; GLM: cost of 4 {{.*}} %V16F32 = fmul
   %V16F32 = fmul <16 x float> undef, undef
 
   ; SSE2: cost of 2 {{.*}} %F64 = fmul
@@ -172,6 +193,7 @@ define i32 @fmul(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F64 = fmul
   ; AVX512: cost of 1 {{.*}} %F64 = fmul
   ; SLM: cost of 2 {{.*}} %F64 = fmul
+  ; GLM: cost of 1 {{.*}} %F64 = fmul
   %F64 = fmul double undef, undef
   ; SSE2: cost of 2 {{.*}} %V2F64 = fmul
   ; SSE42: cost of 1 {{.*}} %V2F64 = fmul
@@ -179,6 +201,7 @@ define i32 @fmul(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V2F64 = fmul
   ; AVX512: cost of 1 {{.*}} %V2F64 = fmul
   ; SLM: cost of 4 {{.*}} %V2F64 = fmul
+  ; GLM: cost of 1 {{.*}} %V2F64 = fmul
   %V2F64 = fmul <2 x double> undef, undef
   ; SSE2: cost of 4 {{.*}} %V4F64 = fmul
   ; SSE42: cost of 2 {{.*}} %V4F64 = fmul
@@ -186,6 +209,7 @@ define i32 @fmul(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V4F64 = fmul
   ; AVX512: cost of 1 {{.*}} %V4F64 = fmul
   ; SLM: cost of 8 {{.*}} %V4F64 = fmul
+  ; GLM: cost of 2 {{.*}} %V4F64 = fmul
   %V4F64 = fmul <4 x double> undef, undef
   ; SSE2: cost of 8 {{.*}} %V8F64 = fmul
   ; SSE42: cost of 4 {{.*}} %V8F64 = fmul
@@ -193,6 +217,7 @@ define i32 @fmul(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V8F64 = fmul
   ; AVX512: cost of 1 {{.*}} %V8F64 = fmul
   ; SLM: cost of 16 {{.*}} %V8F64 = fmul
+  ; GLM: cost of 4 {{.*}} %V8F64 = fmul
   %V8F64 = fmul <8 x double> undef, undef
 
   ret i32 undef
@@ -206,6 +231,7 @@ define i32 @fdiv(i32 %arg) {
   ; AVX2: cost of 7 {{.*}} %F32 = fdiv
   ; AVX512: cost of 7 {{.*}} %F32 = fdiv
   ; SLM: cost of 17 {{.*}} %F32 = fdiv
+  ; GLM: cost of 18 {{.*}} %F32 = fdiv
   %F32 = fdiv float undef, undef
   ; SSE2: cost of 39 {{.*}} %V4F32 = fdiv
   ; SSE42: cost of 14 {{.*}} %V4F32 = fdiv
@@ -213,6 +239,7 @@ define i32 @fdiv(i32 %arg) {
   ; AVX2: cost of 7 {{.*}} %V4F32 = fdiv
   ; AVX512: cost of 7 {{.*}} %V4F32 = fdiv
   ; SLM: cost of 39 {{.*}} %V4F32 = fdiv
+  ; GLM: cost of 35 {{.*}} %V4F32 = fdiv
   %V4F32 = fdiv <4 x float> undef, undef
   ; SSE2: cost of 78 {{.*}} %V8F32 = fdiv
   ; SSE42: cost of 28 {{.*}} %V8F32 = fdiv
@@ -220,6 +247,7 @@ define i32 @fdiv(i32 %arg) {
   ; AVX2: cost of 14 {{.*}} %V8F32 = fdiv
   ; AVX512: cost of 14 {{.*}} %V8F32 = fdiv
   ; SLM: cost of 78 {{.*}} %V8F32 = fdiv
+  ; GLM: cost of 70 {{.*}} %V8F32 = fdiv
   %V8F32 = fdiv <8 x float> undef, undef
   ; SSE2: cost of 156 {{.*}} %V16F32 = fdiv
   ; SSE42: cost of 56 {{.*}} %V16F32 = fdiv
@@ -227,6 +255,7 @@ define i32 @fdiv(i32 %arg) {
   ; AVX2: cost of 28 {{.*}} %V16F32 = fdiv
   ; AVX512: cost of 2 {{.*}} %V16F32 = fdiv
   ; SLM: cost of 156 {{.*}} %V16F32 = fdiv
+  ; GLM: cost of 140 {{.*}} %V16F32 = fdiv
   %V16F32 = fdiv <16 x float> undef, undef
 
   ; SSE2: cost of 38 {{.*}} %F64 = fdiv
@@ -235,6 +264,7 @@ define i32 @fdiv(i32 %arg) {
   ; AVX2: cost of 14 {{.*}} %F64 = fdiv
   ; AVX512: cost of 14 {{.*}} %F64 = fdiv
   ; SLM: cost of 32 {{.*}} %F64 = fdiv
+  ; GLM: cost of 33 {{.*}} %F64 = fdiv
   %F64 = fdiv double undef, undef
   ; SSE2: cost of 69 {{.*}} %V2F64 = fdiv
   ; SSE42: cost of 22 {{.*}} %V2F64 = fdiv
@@ -242,6 +272,7 @@ define i32 @fdiv(i32 %arg) {
   ; AVX2: cost of 14 {{.*}} %V2F64 = fdiv
   ; AVX512: cost of 14 {{.*}} %V2F64 = fdiv
   ; SLM: cost of 69 {{.*}} %V2F64 = fdiv
+  ; GLM: cost of 65 {{.*}} %V2F64 = fdiv
   %V2F64 = fdiv <2 x double> undef, undef
   ; SSE2: cost of 138 {{.*}} %V4F64 = fdiv
   ; SSE42: cost of 44 {{.*}} %V4F64 = fdiv
@@ -249,6 +280,7 @@ define i32 @fdiv(i32 %arg) {
   ; AVX2: cost of 28 {{.*}} %V4F64 = fdiv
   ; AVX512: cost of 28 {{.*}} %V4F64 = fdiv
   ; SLM: cost of 138 {{.*}} %V4F64 = fdiv
+  ; GLM: cost of 130 {{.*}} %V4F64 = fdiv
   %V4F64 = fdiv <4 x double> undef, undef
   ; SSE2: cost of 276 {{.*}} %V8F64 = fdiv
   ; SSE42: cost of 88 {{.*}} %V8F64 = fdiv
@@ -256,6 +288,7 @@ define i32 @fdiv(i32 %arg) {
   ; AVX2: cost of 56 {{.*}} %V8F64 = fdiv
   ; AVX512: cost of 2 {{.*}} %V8F64 = fdiv
   ; SLM: cost of 276 {{.*}} %V8F64 = fdiv
+  ; GLM: cost of 260 {{.*}} %V8F64 = fdiv
   %V8F64 = fdiv <8 x double> undef, undef
 
   ret i32 undef
@@ -269,6 +302,7 @@ define i32 @frem(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %F32 = frem
   ; AVX512: cost of 2 {{.*}} %F32 = frem
   ; SLM: cost of 2 {{.*}} %F32 = frem
+  ; GLM: cost of 2 {{.*}} %F32 = frem
   %F32 = frem float undef, undef
   ; SSE2: cost of 14 {{.*}} %V4F32 = frem
   ; SSE42: cost of 14 {{.*}} %V4F32 = frem
@@ -276,6 +310,7 @@ define i32 @frem(i32 %arg) {
   ; AVX2: cost of 14 {{.*}} %V4F32 = frem
   ; AVX512: cost of 14 {{.*}} %V4F32 = frem
   ; SLM: cost of 14 {{.*}} %V4F32 = frem
+  ; GLM: cost of 14 {{.*}} %V4F32 = frem
   %V4F32 = frem <4 x float> undef, undef
   ; SSE2: cost of 28 {{.*}} %V8F32 = frem
   ; SSE42: cost of 28 {{.*}} %V8F32 = frem
@@ -283,6 +318,7 @@ define i32 @frem(i32 %arg) {
   ; AVX2: cost of 30 {{.*}} %V8F32 = frem
   ; AVX512: cost of 30 {{.*}} %V8F32 = frem
   ; SLM: cost of 28 {{.*}} %V8F32 = frem
+  ; GLM: cost of 28 {{.*}} %V8F32 = frem
   %V8F32 = frem <8 x float> undef, undef
   ; SSE2: cost of 56 {{.*}} %V16F32 = frem
   ; SSE42: cost of 56 {{.*}} %V16F32 = frem
@@ -290,6 +326,7 @@ define i32 @frem(i32 %arg) {
   ; AVX2: cost of 60 {{.*}} %V16F32 = frem
   ; AVX512: cost of 62 {{.*}} %V16F32 = frem
   ; SLM: cost of 56 {{.*}} %V16F32 = frem
+  ; GLM: cost of 56 {{.*}} %V16F32 = frem
   %V16F32 = frem <16 x float> undef, undef
 
   ; SSE2: cost of 2 {{.*}} %F64 = frem
@@ -298,6 +335,7 @@ define i32 @frem(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %F64 = frem
   ; AVX512: cost of 2 {{.*}} %F64 = frem
   ; SLM: cost of 2 {{.*}} %F64 = frem
+  ; GLM: cost of 2 {{.*}} %F64 = frem
   %F64 = frem double undef, undef
   ; SSE2: cost of 6 {{.*}} %V2F64 = frem
   ; SSE42: cost of 6 {{.*}} %V2F64 = frem
@@ -305,6 +343,7 @@ define i32 @frem(i32 %arg) {
   ; AVX2: cost of 6 {{.*}} %V2F64 = frem
   ; AVX512: cost of 6 {{.*}} %V2F64 = frem
   ; SLM: cost of 6 {{.*}} %V2F64 = frem
+  ; GLM: cost of 6 {{.*}} %V2F64 = frem
   %V2F64 = frem <2 x double> undef, undef
   ; SSE2: cost of 12 {{.*}} %V4F64 = frem
   ; SSE42: cost of 12 {{.*}} %V4F64 = frem
@@ -312,6 +351,7 @@ define i32 @frem(i32 %arg) {
   ; AVX2: cost of 14 {{.*}} %V4F64 = frem
   ; AVX512: cost of 14 {{.*}} %V4F64 = frem
   ; SLM: cost of 12 {{.*}} %V4F64 = frem
+  ; GLM: cost of 12 {{.*}} %V4F64 = frem
   %V4F64 = frem <4 x double> undef, undef
   ; SSE2: cost of 24 {{.*}} %V8F64 = frem
   ; SSE42: cost of 24 {{.*}} %V8F64 = frem
@@ -319,6 +359,7 @@ define i32 @frem(i32 %arg) {
   ; AVX2: cost of 28 {{.*}} %V8F64 = frem
   ; AVX512: cost of 30 {{.*}} %V8F64 = frem
   ; SLM: cost of 24 {{.*}} %V8F64 = frem
+  ; GLM: cost of 24 {{.*}} %V8F64 = frem
   %V8F64 = frem <8 x double> undef, undef
 
   ret i32 undef
@@ -331,28 +372,32 @@ define i32 @fsqrt(i32 %arg) {
   ; AVX: cost of 14 {{.*}} %F32 = call float @llvm.sqrt.f32
   ; AVX2: cost of 7 {{.*}} %F32 = call float @llvm.sqrt.f32
   ; AVX512: cost of 7 {{.*}} %F32 = call float @llvm.sqrt.f32
-  ; SLM: cost of 18 {{.*}} %F32 = call float @llvm.sqrt.f32
+  ; SLM: cost of 20 {{.*}} %F32 = call float @llvm.sqrt.f32
+  ; GLM: cost of 19 {{.*}} %F32 = call float @llvm.sqrt.f32
   %F32 = call float @llvm.sqrt.f32(float undef)
   ; SSE2: cost of 56 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32
   ; SSE42: cost of 18 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32
   ; AVX: cost of 14 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32
   ; AVX2: cost of 7 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32
   ; AVX512: cost of 7 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32
-  ; SLM: cost of 18 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32
+  ; SLM: cost of 40 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32
+  ; GLM: cost of 37 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32
   %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
   ; SSE2: cost of 112 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32
   ; SSE42: cost of 36 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32
   ; AVX: cost of 28 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32
   ; AVX2: cost of 14 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32
   ; AVX512: cost of 14 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32
-  ; SLM: cost of 36 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32
+  ; SLM: cost of 80 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32
+  ; GLM: cost of 74 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32
   %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef)
   ; SSE2: cost of 224 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32
   ; SSE42: cost of 72 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32
   ; AVX: cost of 56 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32
   ; AVX2: cost of 28 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32
   ; AVX512: cost of 1 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32
-  ; SLM: cost of 72 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32
+  ; SLM: cost of 160 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32
+  ; GLM: cost of 148 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32
   %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef)
 
   ; SSE2: cost of 32 {{.*}} %F64 = call double @llvm.sqrt.f64
@@ -360,28 +405,32 @@ define i32 @fsqrt(i32 %arg) {
   ; AVX: cost of 21 {{.*}} %F64 = call double @llvm.sqrt.f64
   ; AVX2: cost of 14 {{.*}} %F64 = call double @llvm.sqrt.f64
   ; AVX512: cost of 14 {{.*}} %F64 = call double @llvm.sqrt.f64
-  ; SLM: cost of 32 {{.*}} %F64 = call double @llvm.sqrt.f64
+  ; SLM: cost of 35 {{.*}} %F64 = call double @llvm.sqrt.f64
+  ; GLM: cost of 34 {{.*}} %F64 = call double @llvm.sqrt.f64
   %F64 = call double @llvm.sqrt.f64(double undef)
   ; SSE2: cost of 32 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64
   ; SSE42: cost of 32 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64
   ; AVX: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64
   ; AVX2: cost of 14 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64
   ; AVX512: cost of 14 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64
-  ; SLM: cost of 32 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64
+  ; SLM: cost of 70 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64
+  ; GLM: cost of 67 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64
   %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef)
   ; SSE2: cost of 64 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64
   ; SSE42: cost of 64 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64
   ; AVX: cost of 43 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64
   ; AVX2: cost of 28 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64
   ; AVX512: cost of 28 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64
-  ; SLM: cost of 64 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64
+  ; SLM: cost of 140 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64
+  ; GLM: cost of 134 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64
   %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef)
   ; SSE2: cost of 128 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64
   ; SSE42: cost of 128 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64
   ; AVX: cost of 86 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64
   ; AVX2: cost of 56 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64
   ; AVX512: cost of 1 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64
-  ; SLM: cost of 128 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64
+  ; SLM: cost of 280 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64
+  ; GLM: cost of 268 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64
   %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef)
 
   ret i32 undef
@@ -395,6 +444,7 @@ define i32 @fabs(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32
   ; AVX512: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32
   ; SLM: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32
+  ; GLM: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32
   %F32 = call float @llvm.fabs.f32(float undef)
   ; SSE2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32
   ; SSE42: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32
@@ -402,6 +452,7 @@ define i32 @fabs(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32
   ; AVX512: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32
   ; SLM: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32
+  ; GLM: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32
   %V4F32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
   ; SSE2: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32
   ; SSE42: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32
@@ -409,6 +460,7 @@ define i32 @fabs(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32
   ; AVX512: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32
   ; SLM: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32
+  ; GLM: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32
   %V8F32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef)
   ; SSE2: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32
   ; SSE42: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32
@@ -416,6 +468,7 @@ define i32 @fabs(i32 %arg) {
   ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32
   ; AVX512: cost of 2 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32
   ; SLM: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32
+  ; GLM: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32
   %V16F32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef)
 
   ; SSE2: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64
@@ -424,6 +477,7 @@ define i32 @fabs(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64
   ; AVX512: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64
   ; SLM: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64
+  ; GLM: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64
   %F64 = call double @llvm.fabs.f64(double undef)
   ; SSE2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64
   ; SSE42: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64
@@ -431,6 +485,7 @@ define i32 @fabs(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64
   ; AVX512: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64
   ; SLM: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64
+  ; GLM: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64
   %V2F64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef)
   ; SSE2: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64
   ; SSE42: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64
@@ -438,6 +493,7 @@ define i32 @fabs(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64
   ; AVX512: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64
   ; SLM: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64
+  ; GLM: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64
   %V4F64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef)
   ; SSE2: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64
   ; SSE42: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64
@@ -445,6 +501,7 @@ define i32 @fabs(i32 %arg) {
   ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64
   ; AVX512: cost of 2 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64
   ; SLM: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64
+  ; GLM: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64
   %V8F64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef)
 
   ret i32 undef
@@ -458,6 +515,7 @@ define i32 @fcopysign(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
   ; AVX512: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
   ; SLM: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
+  ; GLM: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32
   %F32 = call float @llvm.copysign.f32(float undef, float undef)
   ; SSE2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
   ; SSE42: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
@@ -465,6 +523,7 @@ define i32 @fcopysign(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
   ; AVX512: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
   ; SLM: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
+  ; GLM: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32
   %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef)
   ; SSE2: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
   ; SSE42: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
@@ -472,6 +531,7 @@ define i32 @fcopysign(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
   ; AVX512: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
   ; SLM: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
+  ; GLM: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32
   %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef)
   ; SSE2: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   ; SSE42: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
@@ -479,6 +539,7 @@ define i32 @fcopysign(i32 %arg) {
   ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   ; AVX512: cost of 2 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   ; SLM: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
+  ; GLM: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32
   %V16F32 = call <16 x float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef)
 
   ; SSE2: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
@@ -487,6 +548,7 @@ define i32 @fcopysign(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
   ; AVX512: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
   ; SLM: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
+  ; GLM: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64
   %F64 = call double @llvm.copysign.f64(double undef, double undef)
   ; SSE2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
   ; SSE42: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
@@ -494,6 +556,7 @@ define i32 @fcopysign(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
   ; AVX512: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
   ; SLM: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
+  ; GLM: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64
   %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
   ; SSE2: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
   ; SSE42: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
@@ -501,6 +564,7 @@ define i32 @fcopysign(i32 %arg) {
   ; AVX2: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
   ; AVX512: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
   ; SLM: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
+  ; GLM: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64
   %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
   ; SSE2: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   ; SSE42: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
@@ -508,6 +572,7 @@ define i32 @fcopysign(i32 %arg) {
   ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   ; AVX512: cost of 2 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   ; SLM: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
+  ; GLM: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64
   %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
 
   ret i32 undef
@@ -521,6 +586,7 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
   ; AVX512: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
   ; SLM: cost of 10 {{.*}} %F32 = call float @llvm.fma.f32
+  ; GLM: cost of 10 {{.*}} %F32 = call float @llvm.fma.f32
   %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
   ; SSE2: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; SSE42: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
@@ -528,6 +594,7 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; AVX512: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; SLM: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+  ; GLM: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
   ; SSE2: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; SSE42: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
@@ -535,6 +602,7 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; AVX512: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; SLM: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+  ; GLM: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
   ; SSE2: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; SSE42: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
@@ -542,6 +610,7 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; AVX512: cost of 1 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; SLM: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+  ; GLM: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
 
   ; SSE2: cost of 10 {{.*}} %F64 = call double @llvm.fma.f64
@@ -550,6 +619,7 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
   ; AVX512: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
   ; SLM: cost of 10 {{.*}} %F64 = call double @llvm.fma.f64
+  ; GLM: cost of 10 {{.*}} %F64 = call double @llvm.fma.f64
   %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
   ; SSE2: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; SSE42: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
@@ -557,6 +627,7 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; AVX512: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; SLM: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+  ; GLM: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
   ; SSE2: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; SSE42: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
@@ -564,6 +635,7 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; AVX512: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; SLM: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+  ; GLM: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
   ; SSE2: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; SSE42: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
@@ -571,6 +643,7 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; AVX512: cost of 1 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; SLM: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+  ; GLM: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
 
   ret i32 undef

Modified: llvm/trunk/test/Analysis/CostModel/X86/arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Analysis/CostModel/X86/arith.ll?rev=328451&r1=328450&r2=328451&view=diff
==============================================================================
--- llvm/trunk/test/Analysis/CostModel/X86/arith.ll (original)
+++ llvm/trunk/test/Analysis/CostModel/X86/arith.ll Sun Mar 25 08:58:12 2018
@@ -5,6 +5,8 @@
 ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
 ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=GLM
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -15,18 +17,24 @@ define i32 @add(i32 %arg) {
   %I64 = add i64 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V2I64 = add
   ; SSE42: cost of 1 {{.*}} %V2I64 = add
+  ; SLM: cost of 4 {{.*}} %V2I64 = add
+  ; GLM: cost of 1 {{.*}} %V2I64 = add
   ; AVX: cost of 1 {{.*}} %V2I64 = add
   ; AVX2: cost of 1 {{.*}} %V2I64 = add
   ; AVX512: cost of 1 {{.*}} %V2I64 = add
   %V2I64 = add <2 x i64> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V4I64 = add
   ; SSE42: cost of 2 {{.*}} %V4I64 = add
+  ; SLM: cost of 8 {{.*}} %V4I64 = add
+  ; GLM: cost of 2 {{.*}} %V4I64 = add
   ; AVX: cost of 4 {{.*}} %V4I64 = add
   ; AVX2: cost of 1 {{.*}} %V4I64 = add
   ; AVX512: cost of 1 {{.*}} %V4I64 = add
   %V4I64 = add <4 x i64> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V8I64 = add
   ; SSE42: cost of 4 {{.*}} %V8I64 = add
+  ; SLM: cost of 16 {{.*}} %V8I64 = add
+  ; GLM: cost of 4 {{.*}} %V8I64 = add
   ; AVX: cost of 8 {{.*}} %V8I64 = add
   ; AVX2: cost of 2 {{.*}} %V8I64 = add
   ; AVX512: cost of 1 {{.*}} %V8I64 = add
@@ -36,18 +44,24 @@ define i32 @add(i32 %arg) {
   %I32 = add i32 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V4I32 = add
   ; SSE42: cost of 1 {{.*}} %V4I32 = add
+  ; SLM: cost of 1 {{.*}} %V4I32 = add
+  ; GLM: cost of 1 {{.*}} %V4I32 = add
   ; AVX: cost of 1 {{.*}} %V4I32 = add
   ; AVX2: cost of 1 {{.*}} %V4I32 = add
   ; AVX512: cost of 1 {{.*}} %V4I32 = add
   %V4I32 = add <4 x i32> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V8I32 = add
   ; SSE42: cost of 2 {{.*}} %V8I32 = add
+  ; SLM: cost of 2 {{.*}} %V8I32 = add
+  ; GLM: cost of 2 {{.*}} %V8I32 = add
   ; AVX: cost of 4 {{.*}} %V8I32 = add
   ; AVX2: cost of 1 {{.*}} %V8I32 = add
   ; AVX512: cost of 1 {{.*}} %V8I32 = add
   %V8I32 = add <8 x i32> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V16I32 = add
   ; SSE42: cost of 4 {{.*}} %V16I32 = add
+  ; SLM: cost of 4 {{.*}} %V16I32 = add
+  ; GLM: cost of 4 {{.*}} %V16I32 = add
   ; AVX: cost of 8 {{.*}} %V16I32 = add
   ; AVX2: cost of 2 {{.*}} %V16I32 = add
   ; AVX512: cost of 1 {{.*}} %V16I32 = add
@@ -57,18 +71,24 @@ define i32 @add(i32 %arg) {
   %I16 = add i16 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V8I16 = add
   ; SSE42: cost of 1 {{.*}} %V8I16 = add
+  ; SLM: cost of 1 {{.*}} %V8I16 = add
+  ; GLM: cost of 1 {{.*}} %V8I16 = add
   ; AVX: cost of 1 {{.*}} %V8I16 = add
   ; AVX2: cost of 1 {{.*}} %V8I16 = add
   ; AVX512: cost of 1 {{.*}} %V8I16 = add
   %V8I16 = add <8 x i16> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V16I16 = add
   ; SSE42: cost of 2 {{.*}} %V16I16 = add
+  ; SLM: cost of 2 {{.*}} %V16I16 = add
+  ; GLM: cost of 2 {{.*}} %V16I16 = add
   ; AVX: cost of 4 {{.*}} %V16I16 = add
   ; AVX2: cost of 1 {{.*}} %V16I16 = add
   ; AVX512: cost of 1 {{.*}} %V16I16 = add
   %V16I16 = add <16 x i16> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V32I16 = add
   ; SSE42: cost of 4 {{.*}} %V32I16 = add
+  ; SLM: cost of 4 {{.*}} %V32I16 = add
+  ; GLM: cost of 4 {{.*}} %V32I16 = add
   ; AVX: cost of 8 {{.*}} %V32I16 = add
   ; AVX2: cost of 2 {{.*}} %V32I16 = add
   ; AVX512F: cost of 2 {{.*}} %V32I16 = add
@@ -79,18 +99,24 @@ define i32 @add(i32 %arg) {
   %I8 = add i8 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V16I8 = add
   ; SSE42: cost of 1 {{.*}} %V16I8 = add
+  ; SLM: cost of 1 {{.*}} %V16I8 = add
+  ; GLM: cost of 1 {{.*}} %V16I8 = add
   ; AVX: cost of 1 {{.*}} %V16I8 = add
   ; AVX2: cost of 1 {{.*}} %V16I8 = add
   ; AVX512: cost of 1 {{.*}} %V16I8 = add
   %V16I8 = add <16 x i8> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V32I8 = add
   ; SSE42: cost of 2 {{.*}} %V32I8 = add
+  ; SLM: cost of 2 {{.*}} %V32I8 = add
+  ; GLM: cost of 2 {{.*}} %V32I8 = add
   ; AVX: cost of 4 {{.*}} %V32I8 = add
   ; AVX2: cost of 1 {{.*}} %V32I8 = add
   ; AVX512: cost of 1 {{.*}} %V32I8 = add
   %V32I8 = add <32 x i8> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V64I8 = add
   ; SSE42: cost of 4 {{.*}} %V64I8 = add
+  ; SLM: cost of 4 {{.*}} %V64I8 = add
+  ; GLM: cost of 4 {{.*}} %V64I8 = add
   ; AVX: cost of 8 {{.*}} %V64I8 = add
   ; AVX2: cost of 2 {{.*}} %V64I8 = add
   ; AVX512F: cost of 2 {{.*}} %V64I8 = add
@@ -106,18 +132,24 @@ define i32 @sub(i32 %arg) {
   %I64 = sub i64 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V2I64 = sub
   ; SSE42: cost of 1 {{.*}} %V2I64 = sub
+  ; SLM: cost of 4 {{.*}} %V2I64 = sub
+  ; GLM: cost of 1 {{.*}} %V2I64 = sub
   ; AVX: cost of 1 {{.*}} %V2I64 = sub
   ; AVX2: cost of 1 {{.*}} %V2I64 = sub
   ; AVX512: cost of 1 {{.*}} %V2I64 = sub
   %V2I64 = sub <2 x i64> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V4I64 = sub
   ; SSE42: cost of 2 {{.*}} %V4I64 = sub
+  ; SLM: cost of 8 {{.*}} %V4I64 = sub
+  ; GLM: cost of 2 {{.*}} %V4I64 = sub
   ; AVX: cost of 4 {{.*}} %V4I64 = sub
   ; AVX2: cost of 1 {{.*}} %V4I64 = sub
   ; AVX512: cost of 1 {{.*}} %V4I64 = sub
   %V4I64 = sub <4 x i64> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V8I64 = sub
   ; SSE42: cost of 4 {{.*}} %V8I64 = sub
+  ; SLM: cost of 16 {{.*}} %V8I64 = sub
+  ; GLM: cost of 4 {{.*}} %V8I64 = sub
   ; AVX: cost of 8 {{.*}} %V8I64 = sub
   ; AVX2: cost of 2 {{.*}} %V8I64 = sub
   ; AVX512: cost of 1 {{.*}} %V8I64 = sub
@@ -127,18 +159,24 @@ define i32 @sub(i32 %arg) {
   %I32 = sub i32 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V4I32 = sub
   ; SSE42: cost of 1 {{.*}} %V4I32 = sub
+  ; SLM: cost of 1 {{.*}} %V4I32 = sub
+  ; GLM: cost of 1 {{.*}} %V4I32 = sub
   ; AVX: cost of 1 {{.*}} %V4I32 = sub
   ; AVX2: cost of 1 {{.*}} %V4I32 = sub
   ; AVX512: cost of 1 {{.*}} %V4I32 = sub
   %V4I32 = sub <4 x i32> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V8I32 = sub
   ; SSE42: cost of 2 {{.*}} %V8I32 = sub
+  ; SLM: cost of 2 {{.*}} %V8I32 = sub
+  ; GLM: cost of 2 {{.*}} %V8I32 = sub
   ; AVX: cost of 4 {{.*}} %V8I32 = sub
   ; AVX2: cost of 1 {{.*}} %V8I32 = sub
   ; AVX512: cost of 1 {{.*}} %V8I32 = sub
   %V8I32 = sub <8 x i32> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V16I32 = sub
   ; SSE42: cost of 4 {{.*}} %V16I32 = sub
+  ; SLM: cost of 4 {{.*}} %V16I32 = sub
+  ; GLM: cost of 4 {{.*}} %V16I32 = sub
   ; AVX: cost of 8 {{.*}} %V16I32 = sub
   ; AVX2: cost of 2 {{.*}} %V16I32 = sub
   ; AVX512: cost of 1 {{.*}} %V16I32 = sub
@@ -148,18 +186,24 @@ define i32 @sub(i32 %arg) {
   %I16 = sub i16 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V8I16 = sub
   ; SSE42: cost of 1 {{.*}} %V8I16 = sub
+  ; SLM: cost of 1 {{.*}} %V8I16 = sub
+  ; GLM: cost of 1 {{.*}} %V8I16 = sub
   ; AVX: cost of 1 {{.*}} %V8I16 = sub
   ; AVX2: cost of 1 {{.*}} %V8I16 = sub
   ; AVX512: cost of 1 {{.*}} %V8I16 = sub
   %V8I16 = sub <8 x i16> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V16I16 = sub
   ; SSE42: cost of 2 {{.*}} %V16I16 = sub
+  ; SLM: cost of 2 {{.*}} %V16I16 = sub
+  ; GLM: cost of 2 {{.*}} %V16I16 = sub
   ; AVX: cost of 4 {{.*}} %V16I16 = sub
   ; AVX2: cost of 1 {{.*}} %V16I16 = sub
   ; AVX512: cost of 1 {{.*}} %V16I16 = sub
   %V16I16 = sub <16 x i16> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V32I16 = sub
   ; SSE42: cost of 4 {{.*}} %V32I16 = sub
+  ; SLM: cost of 4 {{.*}} %V32I16 = sub
+  ; GLM: cost of 4 {{.*}} %V32I16 = sub
   ; AVX: cost of 8 {{.*}} %V32I16 = sub
   ; AVX2: cost of 2 {{.*}} %V32I16 = sub
   ; AVX512F: cost of 2 {{.*}} %V32I16 = sub
@@ -170,18 +214,24 @@ define i32 @sub(i32 %arg) {
   %I8 = sub i8 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V16I8 = sub
   ; SSE42: cost of 1 {{.*}} %V16I8 = sub
+  ; SLM: cost of 1 {{.*}} %V16I8 = sub
+  ; GLM: cost of 1 {{.*}} %V16I8 = sub
   ; AVX: cost of 1 {{.*}} %V16I8 = sub
   ; AVX2: cost of 1 {{.*}} %V16I8 = sub
   ; AVX512: cost of 1 {{.*}} %V16I8 = sub
   %V16I8 = sub <16 x i8> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V32I8 = sub
   ; SSE42: cost of 2 {{.*}} %V32I8 = sub
+  ; SLM: cost of 2 {{.*}} %V32I8 = sub
+  ; GLM: cost of 2 {{.*}} %V32I8 = sub
   ; AVX: cost of 4 {{.*}} %V32I8 = sub
   ; AVX2: cost of 1 {{.*}} %V32I8 = sub
   ; AVX512: cost of 1 {{.*}} %V32I8 = sub
   %V32I8 = sub <32 x i8> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V64I8 = sub
   ; SSE42: cost of 4 {{.*}} %V64I8 = sub
+  ; SLM: cost of 4 {{.*}} %V64I8 = sub
+  ; GLM: cost of 4 {{.*}} %V64I8 = sub
   ; AVX: cost of 8 {{.*}} %V64I8 = sub
   ; AVX2: cost of 2 {{.*}} %V64I8 = sub
   ; AVX512F: cost of 2 {{.*}} %V64I8 = sub
@@ -197,18 +247,24 @@ define i32 @or(i32 %arg) {
   %I64 = or i64 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V2I64 = or
   ; SSE42: cost of 1 {{.*}} %V2I64 = or
+  ; SLM: cost of 1 {{.*}} %V2I64 = or
+  ; GLM: cost of 1 {{.*}} %V2I64 = or
   ; AVX: cost of 1 {{.*}} %V2I64 = or
   ; AVX2: cost of 1 {{.*}} %V2I64 = or
   ; AVX512: cost of 1 {{.*}} %V2I64 = or
   %V2I64 = or <2 x i64> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V4I64 = or
   ; SSE42: cost of 2 {{.*}} %V4I64 = or
+  ; SLM: cost of 2 {{.*}} %V4I64 = or
+  ; GLM: cost of 2 {{.*}} %V4I64 = or
   ; AVX: cost of 1 {{.*}} %V4I64 = or
   ; AVX2: cost of 1 {{.*}} %V4I64 = or
   ; AVX512: cost of 1 {{.*}} %V4I64 = or
   %V4I64 = or <4 x i64> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V8I64 = or
   ; SSE42: cost of 4 {{.*}} %V8I64 = or
+  ; SLM: cost of 4 {{.*}} %V8I64 = or
+  ; GLM: cost of 4 {{.*}} %V8I64 = or
   ; AVX: cost of 2 {{.*}} %V8I64 = or
   ; AVX2: cost of 2 {{.*}} %V8I64 = or
   ; AVX512: cost of 1 {{.*}} %V8I64 = or
@@ -218,18 +274,24 @@ define i32 @or(i32 %arg) {
   %I32 = or i32 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V4I32 = or
   ; SSE42: cost of 1 {{.*}} %V4I32 = or
+  ; SLM: cost of 1 {{.*}} %V4I32 = or
+  ; GLM: cost of 1 {{.*}} %V4I32 = or
   ; AVX: cost of 1 {{.*}} %V4I32 = or
   ; AVX2: cost of 1 {{.*}} %V4I32 = or
   ; AVX512: cost of 1 {{.*}} %V4I32 = or
   %V4I32 = or <4 x i32> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V8I32 = or
   ; SSE42: cost of 2 {{.*}} %V8I32 = or
+  ; SLM: cost of 2 {{.*}} %V8I32 = or
+  ; GLM: cost of 2 {{.*}} %V8I32 = or
   ; AVX: cost of 1 {{.*}} %V8I32 = or
   ; AVX2: cost of 1 {{.*}} %V8I32 = or
   ; AVX512: cost of 1 {{.*}} %V8I32 = or
   %V8I32 = or <8 x i32> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V16I32 = or
   ; SSE42: cost of 4 {{.*}} %V16I32 = or
+  ; SLM: cost of 4 {{.*}} %V16I32 = or
+  ; GLM: cost of 4 {{.*}} %V16I32 = or
   ; AVX: cost of 2 {{.*}} %V16I32 = or
   ; AVX2: cost of 2 {{.*}} %V16I32 = or
   ; AVX512: cost of 1 {{.*}} %V16I32 = or
@@ -239,18 +301,24 @@ define i32 @or(i32 %arg) {
   %I16 = or i16 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V8I16 = or
   ; SSE42: cost of 1 {{.*}} %V8I16 = or
+  ; SLM: cost of 1 {{.*}} %V8I16 = or
+  ; GLM: cost of 1 {{.*}} %V8I16 = or
   ; AVX: cost of 1 {{.*}} %V8I16 = or
   ; AVX2: cost of 1 {{.*}} %V8I16 = or
   ; AVX512: cost of 1 {{.*}} %V8I16 = or
   %V8I16 = or <8 x i16> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V16I16 = or
   ; SSE42: cost of 2 {{.*}} %V16I16 = or
+  ; SLM: cost of 2 {{.*}} %V16I16 = or
+  ; GLM: cost of 2 {{.*}} %V16I16 = or
   ; AVX: cost of 1 {{.*}} %V16I16 = or
   ; AVX2: cost of 1 {{.*}} %V16I16 = or
   ; AVX512: cost of 1 {{.*}} %V16I16 = or
   %V16I16 = or <16 x i16> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V32I16 = or
   ; SSE42: cost of 4 {{.*}} %V32I16 = or
+  ; SLM: cost of 4 {{.*}} %V32I16 = or
+  ; GLM: cost of 4 {{.*}} %V32I16 = or
   ; AVX: cost of 2 {{.*}} %V32I16 = or
   ; AVX2: cost of 2 {{.*}} %V32I16 = or
   ; AVX512F: cost of 2 {{.*}} %V32I16 = or
@@ -261,18 +329,24 @@ define i32 @or(i32 %arg) {
   %I8 = or i8 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V16I8 = or
   ; SSE42: cost of 1 {{.*}} %V16I8 = or
+  ; SLM: cost of 1 {{.*}} %V16I8 = or
+  ; GLM: cost of 1 {{.*}} %V16I8 = or
   ; AVX: cost of 1 {{.*}} %V16I8 = or
   ; AVX2: cost of 1 {{.*}} %V16I8 = or
   ; AVX512: cost of 1 {{.*}} %V16I8 = or
   %V16I8 = or <16 x i8> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V32I8 = or
   ; SSE42: cost of 2 {{.*}} %V32I8 = or
+  ; SLM: cost of 2 {{.*}} %V32I8 = or
+  ; GLM: cost of 2 {{.*}} %V32I8 = or
   ; AVX: cost of 1 {{.*}} %V32I8 = or
   ; AVX2: cost of 1 {{.*}} %V32I8 = or
   ; AVX512: cost of 1 {{.*}} %V32I8 = or
   %V32I8 = or <32 x i8> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V64I8 = or
   ; SSE42: cost of 4 {{.*}} %V64I8 = or
+  ; SLM: cost of 4 {{.*}} %V64I8 = or
+  ; GLM: cost of 4 {{.*}} %V64I8 = or
   ; AVX: cost of 2 {{.*}} %V64I8 = or
   ; AVX2: cost of 2 {{.*}} %V64I8 = or
   ; AVX512F: cost of 2 {{.*}} %V64I8 = or
@@ -288,18 +362,24 @@ define i32 @xor(i32 %arg) {
   %I64 = xor i64 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V2I64 = xor
   ; SSE42: cost of 1 {{.*}} %V2I64 = xor
+  ; SLM: cost of 1 {{.*}} %V2I64 = xor
+  ; GLM: cost of 1 {{.*}} %V2I64 = xor
   ; AVX: cost of 1 {{.*}} %V2I64 = xor
   ; AVX2: cost of 1 {{.*}} %V2I64 = xor
   ; AVX512: cost of 1 {{.*}} %V2I64 = xor
   %V2I64 = xor <2 x i64> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V4I64 = xor
   ; SSE42: cost of 2 {{.*}} %V4I64 = xor
+  ; SLM: cost of 2 {{.*}} %V4I64 = xor
+  ; GLM: cost of 2 {{.*}} %V4I64 = xor
   ; AVX: cost of 1 {{.*}} %V4I64 = xor
   ; AVX2: cost of 1 {{.*}} %V4I64 = xor
   ; AVX512: cost of 1 {{.*}} %V4I64 = xor
   %V4I64 = xor <4 x i64> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V8I64 = xor
   ; SSE42: cost of 4 {{.*}} %V8I64 = xor
+  ; SLM: cost of 4 {{.*}} %V8I64 = xor
+  ; GLM: cost of 4 {{.*}} %V8I64 = xor
   ; AVX: cost of 2 {{.*}} %V8I64 = xor
   ; AVX2: cost of 2 {{.*}} %V8I64 = xor
   ; AVX512: cost of 1 {{.*}} %V8I64 = xor
@@ -309,18 +389,24 @@ define i32 @xor(i32 %arg) {
   %I32 = xor i32 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V4I32 = xor
   ; SSE42: cost of 1 {{.*}} %V4I32 = xor
+  ; SLM: cost of 1 {{.*}} %V4I32 = xor
+  ; GLM: cost of 1 {{.*}} %V4I32 = xor
   ; AVX: cost of 1 {{.*}} %V4I32 = xor
   ; AVX2: cost of 1 {{.*}} %V4I32 = xor
   ; AVX512: cost of 1 {{.*}} %V4I32 = xor
   %V4I32 = xor <4 x i32> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V8I32 = xor
   ; SSE42: cost of 2 {{.*}} %V8I32 = xor
+  ; SLM: cost of 2 {{.*}} %V8I32 = xor
+  ; GLM: cost of 2 {{.*}} %V8I32 = xor
   ; AVX: cost of 1 {{.*}} %V8I32 = xor
   ; AVX2: cost of 1 {{.*}} %V8I32 = xor
   ; AVX512: cost of 1 {{.*}} %V8I32 = xor
   %V8I32 = xor <8 x i32> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V16I32 = xor
   ; SSE42: cost of 4 {{.*}} %V16I32 = xor
+  ; SLM: cost of 4 {{.*}} %V16I32 = xor
+  ; GLM: cost of 4 {{.*}} %V16I32 = xor
   ; AVX: cost of 2 {{.*}} %V16I32 = xor
   ; AVX2: cost of 2 {{.*}} %V16I32 = xor
   ; AVX512: cost of 1 {{.*}} %V16I32 = xor
@@ -330,18 +416,24 @@ define i32 @xor(i32 %arg) {
   %I16 = xor i16 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V8I16 = xor
   ; SSE42: cost of 1 {{.*}} %V8I16 = xor
+  ; SLM: cost of 1 {{.*}} %V8I16 = xor
+  ; GLM: cost of 1 {{.*}} %V8I16 = xor
   ; AVX: cost of 1 {{.*}} %V8I16 = xor
   ; AVX2: cost of 1 {{.*}} %V8I16 = xor
   ; AVX512: cost of 1 {{.*}} %V8I16 = xor
   %V8I16 = xor <8 x i16> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V16I16 = xor
   ; SSE42: cost of 2 {{.*}} %V16I16 = xor
+  ; SLM: cost of 2 {{.*}} %V16I16 = xor
+  ; GLM: cost of 2 {{.*}} %V16I16 = xor
   ; AVX: cost of 1 {{.*}} %V16I16 = xor
   ; AVX2: cost of 1 {{.*}} %V16I16 = xor
   ; AVX512: cost of 1 {{.*}} %V16I16 = xor
   %V16I16 = xor <16 x i16> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V32I16 = xor
   ; SSE42: cost of 4 {{.*}} %V32I16 = xor
+  ; SLM: cost of 4 {{.*}} %V32I16 = xor
+  ; GLM: cost of 4 {{.*}} %V32I16 = xor
   ; AVX: cost of 2 {{.*}} %V32I16 = xor
   ; AVX2: cost of 2 {{.*}} %V32I16 = xor
   ; AVX512F: cost of 2 {{.*}} %V32I16 = xor
@@ -352,18 +444,24 @@ define i32 @xor(i32 %arg) {
   %I8 = xor i8 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V16I8 = xor
   ; SSE42: cost of 1 {{.*}} %V16I8 = xor
+  ; SLM: cost of 1 {{.*}} %V16I8 = xor
+  ; GLM: cost of 1 {{.*}} %V16I8 = xor
   ; AVX: cost of 1 {{.*}} %V16I8 = xor
   ; AVX2: cost of 1 {{.*}} %V16I8 = xor
   ; AVX512: cost of 1 {{.*}} %V16I8 = xor
   %V16I8 = xor <16 x i8> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V32I8 = xor
   ; SSE42: cost of 2 {{.*}} %V32I8 = xor
+  ; SLM: cost of 2 {{.*}} %V32I8 = xor
+  ; GLM: cost of 2 {{.*}} %V32I8 = xor
   ; AVX: cost of 1 {{.*}} %V32I8 = xor
   ; AVX2: cost of 1 {{.*}} %V32I8 = xor
   ; AVX512: cost of 1 {{.*}} %V32I8 = xor
   %V32I8 = xor <32 x i8> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V64I8 = xor
   ; SSE42: cost of 4 {{.*}} %V64I8 = xor
+  ; SLM: cost of 4 {{.*}} %V64I8 = xor
+  ; GLM: cost of 4 {{.*}} %V64I8 = xor
   ; AVX: cost of 2 {{.*}} %V64I8 = xor
   ; AVX2: cost of 2 {{.*}} %V64I8 = xor
   ; AVX512F: cost of 2 {{.*}} %V64I8 = xor
@@ -379,18 +477,24 @@ define i32 @and(i32 %arg) {
   %I64 = and i64 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V2I64 = and
   ; SSE42: cost of 1 {{.*}} %V2I64 = and
+  ; SLM: cost of 1 {{.*}} %V2I64 = and
+  ; GLM: cost of 1 {{.*}} %V2I64 = and
   ; AVX: cost of 1 {{.*}} %V2I64 = and
   ; AVX2: cost of 1 {{.*}} %V2I64 = and
   ; AVX512: cost of 1 {{.*}} %V2I64 = and
   %V2I64 = and <2 x i64> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V4I64 = and
   ; SSE42: cost of 2 {{.*}} %V4I64 = and
+  ; SLM: cost of 2 {{.*}} %V4I64 = and
+  ; GLM: cost of 2 {{.*}} %V4I64 = and
   ; AVX: cost of 1 {{.*}} %V4I64 = and
   ; AVX2: cost of 1 {{.*}} %V4I64 = and
   ; AVX512: cost of 1 {{.*}} %V4I64 = and
   %V4I64 = and <4 x i64> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V8I64 = and
   ; SSE42: cost of 4 {{.*}} %V8I64 = and
+  ; SLM: cost of 4 {{.*}} %V8I64 = and
+  ; GLM: cost of 4 {{.*}} %V8I64 = and
   ; AVX: cost of 2 {{.*}} %V8I64 = and
   ; AVX2: cost of 2 {{.*}} %V8I64 = and
   ; AVX512: cost of 1 {{.*}} %V8I64 = and
@@ -400,18 +504,24 @@ define i32 @and(i32 %arg) {
   %I32 = and i32 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V4I32 = and
   ; SSE42: cost of 1 {{.*}} %V4I32 = and
+  ; SLM: cost of 1 {{.*}} %V4I32 = and
+  ; GLM: cost of 1 {{.*}} %V4I32 = and
   ; AVX: cost of 1 {{.*}} %V4I32 = and
   ; AVX2: cost of 1 {{.*}} %V4I32 = and
   ; AVX512: cost of 1 {{.*}} %V4I32 = and
   %V4I32 = and <4 x i32> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V8I32 = and
   ; SSE42: cost of 2 {{.*}} %V8I32 = and
+  ; SLM: cost of 2 {{.*}} %V8I32 = and
+  ; GLM: cost of 2 {{.*}} %V8I32 = and
   ; AVX: cost of 1 {{.*}} %V8I32 = and
   ; AVX2: cost of 1 {{.*}} %V8I32 = and
   ; AVX512: cost of 1 {{.*}} %V8I32 = and
   %V8I32 = and <8 x i32> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V16I32 = and
   ; SSE42: cost of 4 {{.*}} %V16I32 = and
+  ; SLM: cost of 4 {{.*}} %V16I32 = and
+  ; GLM: cost of 4 {{.*}} %V16I32 = and
   ; AVX: cost of 2 {{.*}} %V16I32 = and
   ; AVX2: cost of 2 {{.*}} %V16I32 = and
   ; AVX512: cost of 1 {{.*}} %V16I32 = and
@@ -421,18 +531,24 @@ define i32 @and(i32 %arg) {
   %I16 = and i16 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V8I16 = and
   ; SSE42: cost of 1 {{.*}} %V8I16 = and
+  ; SLM: cost of 1 {{.*}} %V8I16 = and
+  ; GLM: cost of 1 {{.*}} %V8I16 = and
   ; AVX: cost of 1 {{.*}} %V8I16 = and
   ; AVX2: cost of 1 {{.*}} %V8I16 = and
   ; AVX512: cost of 1 {{.*}} %V8I16 = and
   %V8I16 = and <8 x i16> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V16I16 = and
   ; SSE42: cost of 2 {{.*}} %V16I16 = and
+  ; SLM: cost of 2 {{.*}} %V16I16 = and
+  ; GLM: cost of 2 {{.*}} %V16I16 = and
   ; AVX: cost of 1 {{.*}} %V16I16 = and
   ; AVX2: cost of 1 {{.*}} %V16I16 = and
   ; AVX512: cost of 1 {{.*}} %V16I16 = and
   %V16I16 = and <16 x i16> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V32I16 = and
   ; SSE42: cost of 4 {{.*}} %V32I16 = and
+  ; SLM: cost of 4 {{.*}} %V32I16 = and
+  ; GLM: cost of 4 {{.*}} %V32I16 = and
   ; AVX: cost of 2 {{.*}} %V32I16 = and
   ; AVX2: cost of 2 {{.*}} %V32I16 = and
   ; AVX512F: cost of 2 {{.*}} %V32I16 = and
@@ -443,18 +559,24 @@ define i32 @and(i32 %arg) {
   %I8 = and i8 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V16I8 = and
   ; SSE42: cost of 1 {{.*}} %V16I8 = and
+  ; SLM: cost of 1 {{.*}} %V16I8 = and
+  ; GLM: cost of 1 {{.*}} %V16I8 = and
   ; AVX: cost of 1 {{.*}} %V16I8 = and
   ; AVX2: cost of 1 {{.*}} %V16I8 = and
   ; AVX512: cost of 1 {{.*}} %V16I8 = and
   %V16I8 = and <16 x i8> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V32I8 = and
   ; SSE42: cost of 2 {{.*}} %V32I8 = and
+  ; SLM: cost of 2 {{.*}} %V32I8 = and
+  ; GLM: cost of 2 {{.*}} %V32I8 = and
   ; AVX: cost of 1 {{.*}} %V32I8 = and
   ; AVX2: cost of 1 {{.*}} %V32I8 = and
   ; AVX512: cost of 1 {{.*}} %V32I8 = and
   %V32I8 = and <32 x i8> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V64I8 = and
   ; SSE42: cost of 4 {{.*}} %V64I8 = and
+  ; SLM: cost of 4 {{.*}} %V64I8 = and
+  ; GLM: cost of 4 {{.*}} %V64I8 = and
   ; AVX: cost of 2 {{.*}} %V64I8 = and
   ; AVX2: cost of 2 {{.*}} %V64I8 = and
   ; AVX512F: cost of 2 {{.*}} %V64I8 = and
@@ -470,6 +592,8 @@ define i32 @mul(i32 %arg) {
   %I64 = mul i64 undef, undef
   ; SSSE3: cost of 8 {{.*}} %V2I64 = mul
   ; SSE42: cost of 8 {{.*}} %V2I64 = mul
+  ; SLM: cost of 17 {{.*}} %V2I64 = mul
+  ; GLM: cost of 8 {{.*}} %V2I64 = mul
   ; AVX: cost of 8 {{.*}} %V2I64 = mul
   ; AVX2: cost of 8 {{.*}} %V2I64 = mul
   ; AVX512F: cost of 8 {{.*}} %V2I64 = mul
@@ -478,6 +602,8 @@ define i32 @mul(i32 %arg) {
   %V2I64 = mul <2 x i64> undef, undef
   ; SSSE3: cost of 16 {{.*}} %V4I64 = mul
   ; SSE42: cost of 16 {{.*}} %V4I64 = mul
+  ; SLM: cost of 34 {{.*}} %V4I64 = mul
+  ; GLM: cost of 16 {{.*}} %V4I64 = mul
   ; AVX: cost of 18 {{.*}} %V4I64 = mul
   ; AVX2: cost of 8 {{.*}} %V4I64 = mul
   ; AVX512F: cost of 8 {{.*}} %V4I64 = mul
@@ -486,6 +612,8 @@ define i32 @mul(i32 %arg) {
   %V4I64 = mul <4 x i64> undef, undef
   ; SSSE3: cost of 32 {{.*}} %V8I64 = mul
   ; SSE42: cost of 32 {{.*}} %V8I64 = mul
+  ; SLM: cost of 68 {{.*}} %V8I64 = mul
+  ; GLM: cost of 32 {{.*}} %V8I64 = mul
   ; AVX: cost of 36 {{.*}} %V8I64 = mul
   ; AVX2: cost of 16 {{.*}} %V8I64 = mul
   ; AVX512F: cost of 8 {{.*}} %V8I64 = mul
@@ -497,18 +625,24 @@ define i32 @mul(i32 %arg) {
   %I32 = mul i32 undef, undef
   ; SSSE3: cost of 6 {{.*}} %V4I32 = mul
   ; SSE42: cost of 2 {{.*}} %V4I32 = mul
+  ; SLM: cost of 11 {{.*}} %V4I32 = mul
+  ; GLM: cost of 2 {{.*}} %V4I32 = mul
   ; AVX: cost of 2 {{.*}} %V4I32 = mul
   ; AVX2: cost of 2 {{.*}} %V4I32 = mul
   ; AVX512: cost of 1 {{.*}} %V4I32 = mul
   %V4I32 = mul <4 x i32> undef, undef
   ; SSSE3: cost of 12 {{.*}} %V8I32 = mul
   ; SSE42: cost of 4 {{.*}} %V8I32 = mul
+  ; SLM: cost of 22 {{.*}} %V8I32 = mul
+  ; GLM: cost of 4 {{.*}} %V8I32 = mul
   ; AVX: cost of 4 {{.*}} %V8I32 = mul
   ; AVX2: cost of 2 {{.*}} %V8I32 = mul
   ; AVX512: cost of 1 {{.*}} %V8I32 = mul
   %V8I32 = mul <8 x i32> undef, undef
   ; SSSE3: cost of 24 {{.*}} %V16I32 = mul
   ; SSE42: cost of 8 {{.*}} %V16I32 = mul
+  ; SLM: cost of 44 {{.*}} %V16I32 = mul
+  ; GLM: cost of 8 {{.*}} %V16I32 = mul
   ; AVX: cost of 8 {{.*}} %V16I32 = mul
   ; AVX2: cost of 4 {{.*}} %V16I32 = mul
   ; AVX512: cost of 1 {{.*}} %V16I32 = mul
@@ -518,18 +652,24 @@ define i32 @mul(i32 %arg) {
   %I16 = mul i16 undef, undef
   ; SSSE3: cost of 1 {{.*}} %V8I16 = mul
   ; SSE42: cost of 1 {{.*}} %V8I16 = mul
+  ; SLM: cost of 2 {{.*}} %V8I16 = mul
+  ; GLM: cost of 1 {{.*}} %V8I16 = mul
   ; AVX: cost of 1 {{.*}} %V8I16 = mul
   ; AVX2: cost of 1 {{.*}} %V8I16 = mul
   ; AVX512: cost of 1 {{.*}} %V8I16 = mul
   %V8I16 = mul <8 x i16> undef, undef
   ; SSSE3: cost of 2 {{.*}} %V16I16 = mul
   ; SSE42: cost of 2 {{.*}} %V16I16 = mul
+  ; SLM: cost of 4 {{.*}} %V16I16 = mul
+  ; GLM: cost of 2 {{.*}} %V16I16 = mul
   ; AVX: cost of 4 {{.*}} %V16I16 = mul
   ; AVX2: cost of 1 {{.*}} %V16I16 = mul
   ; AVX512: cost of 1 {{.*}} %V16I16 = mul
   %V16I16 = mul <16 x i16> undef, undef
   ; SSSE3: cost of 4 {{.*}} %V32I16 = mul
   ; SSE42: cost of 4 {{.*}} %V32I16 = mul
+  ; SLM: cost of 8 {{.*}} %V32I16 = mul
+  ; GLM: cost of 4 {{.*}} %V32I16 = mul
   ; AVX: cost of 8 {{.*}} %V32I16 = mul
   ; AVX2: cost of 2 {{.*}} %V32I16 = mul
   ; AVX512F: cost of 2 {{.*}} %V32I16 = mul
@@ -540,6 +680,8 @@ define i32 @mul(i32 %arg) {
   %I8 = mul i8 undef, undef
   ; SSSE3: cost of 12 {{.*}} %V16I8 = mul
   ; SSE42: cost of 12 {{.*}} %V16I8 = mul
+  ; SLM: cost of 14 {{.*}} %V16I8 = mul
+  ; GLM: cost of 12 {{.*}} %V16I8 = mul
   ; AVX: cost of 12 {{.*}} %V16I8 = mul
   ; AVX2: cost of 7 {{.*}} %V16I8 = mul
   ; AVX512F: cost of 5 {{.*}} %V16I8 = mul
@@ -547,6 +689,8 @@ define i32 @mul(i32 %arg) {
   %V16I8 = mul <16 x i8> undef, undef
   ; SSSE3: cost of 24 {{.*}} %V32I8 = mul
   ; SSE42: cost of 24 {{.*}} %V32I8 = mul
+  ; SLM: cost of 28 {{.*}} %V32I8 = mul
+  ; GLM: cost of 24 {{.*}} %V32I8 = mul
   ; AVX: cost of 26 {{.*}} %V32I8 = mul
   ; AVX2: cost of 17 {{.*}} %V32I8 = mul
   ; AVX512F: cost of 13 {{.*}} %V32I8 = mul
@@ -554,6 +698,8 @@ define i32 @mul(i32 %arg) {
   %V32I8 = mul <32 x i8> undef, undef
   ; SSSE3: cost of 48 {{.*}} %V64I8 = mul
   ; SSE42: cost of 48 {{.*}} %V64I8 = mul
+  ; SLM: cost of 56 {{.*}} %V64I8 = mul
+  ; GLM: cost of 48 {{.*}} %V64I8 = mul
   ; AVX: cost of 52 {{.*}} %V64I8 = mul
   ; AVX2: cost of 34 {{.*}} %V64I8 = mul
   ; AVX512F: cost of 26 {{.*}} %V64I8 = mul
@@ -570,6 +716,8 @@ define void @mul_2i32() {
   ; 3 PMULUDQ and 2 PADDS and 4 shifts.
   ; SSSE3: cost of 8 {{.*}} %A0 = mul
   ; SSE42: cost of 8 {{.*}} %A0 = mul
+  ; SLM: cost of 17 {{.*}} %A0 = mul
+  ; GLM: cost of 8 {{.*}} %A0 = mul
   ; AVX: cost of 8 {{.*}} %A0 = mul
   ; AVX2: cost of 8 {{.*}} %A0 = mul
   ; AVX512F: cost of 8 {{.*}} %A0 = mul