[llvm] 0df3651 - [X86] matchVPMADD52 - only use 512-bit MADD52 on AVX512IFMA targets (#161011)

via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 27 09:44:02 PDT 2025


Author: Simon Pilgrim
Date: 2025-09-27T16:43:58Z
New Revision: 0df3651802d35b26ae857b549de9edf73b67fb98

URL: https://github.com/llvm/llvm-project/commit/0df3651802d35b26ae857b549de9edf73b67fb98
DIFF: https://github.com/llvm/llvm-project/commit/0df3651802d35b26ae857b549de9edf73b67fb98.diff

LOG: [X86] matchVPMADD52 - only use 512-bit MADD52 on AVX512IFMA targets (#161011)

If we have an AVX512 target capable of AVXIFMA but not AVX512IFMA, then we must split 512-bit (or larger) types into 256-bit subvectors.

Fixes #160928
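
As a minimal illustration (adapted from the test file below; the function name is made up), the combine fires on IR like this, where both operands are masked to 26 bits so the full product fits in 52 bits:

    define <8 x i64> @madd52_example(<8 x i64> %x, <8 x i64> %y, <8 x i64> %acc) {
      ; (1 << 26) - 1 == 67108863, so %xm * %ym fits in 52 bits
      %xm  = and <8 x i64> %x, splat (i64 67108863)
      %ym  = and <8 x i64> %y, splat (i64 67108863)
      %mul = mul nuw nsw <8 x i64> %xm, %ym
      %res = add <8 x i64> %mul, %acc
      ret <8 x i64> %res
    }

With -mattr=+avxifma,+avx512vl (AVXIFMA but no AVX512IFMA) this must now lower to two 256-bit {vex} vpmadd52luq instructions rather than a single 512-bit vpmadd52luq that the target cannot execute.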

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3af673d951f65..efeddd7c9bd4b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4452,11 +4452,12 @@ static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
 template <typename F>
 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                          const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
-                         F Builder, bool CheckBWI = true) {
+                         F Builder, bool CheckBWI = true,
+                         bool AllowAVX512 = true) {
   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
   unsigned NumSubs = 1;
   if ((CheckBWI && Subtarget.useBWIRegs()) ||
-      (!CheckBWI && Subtarget.useAVX512Regs())) {
+      (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) {
     if (VT.getSizeInBits() > 512) {
       NumSubs = VT.getSizeInBits() / 512;
       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -58076,7 +58077,8 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
   };
 
   return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
-                          /*CheckBWI*/ false);
+                          /*CheckBWI*/ false,
+                          /*AllowAVX512*/ Subtarget.hasIFMA());
 }
 
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,

diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index aebfc7d483d6f..3ece4beb9c22e 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -1,25 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX,AVXIFMA
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512-NOIFMA
 
 ; 67108863 == (1 << 26) - 1
 ; 4503599627370496 == (1 << 52)
 ; 4503599627370495 == (1 << 52) - 1
 
 define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
-; AVX-LABEL: test_512_combine:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863]
-; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX-NEXT:    vpand %ymm6, %ymm0, %ymm0
-; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
-; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm0
-; AVX-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
-; AVX-NEXT:    vmovdqa %ymm4, %ymm0
-; AVX-NEXT:    vmovdqa %ymm5, %ymm1
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_512_combine:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863]
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm0, %ymm0
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm3, %ymm0
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm1, %ymm1
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVXIFMA-NEXT:    vmovdqa %ymm4, %ymm0
+; AVXIFMA-NEXT:    vmovdqa %ymm5, %ymm1
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_512_combine:
 ; AVX512:       # %bb.0:
@@ -29,6 +30,19 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ; AVX512-NEXT:    vpmadd52luq %zmm1, %zmm0, %zmm2
 ; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_512_combine:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512-NOIFMA-NEXT:    vpandq %zmm3, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm2, %ymm5
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm3, %ymm4, %ymm5
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX512-NOIFMA-NEXT:    vinserti64x4 $1, %ymm5, %zmm2, %zmm0
+; AVX512-NOIFMA-NEXT:    retq
   %x_masked = and <8 x i64> %x, splat (i64 67108863)
   %y_masked = and <8 x i64> %y, splat (i64 67108863)
   %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -37,19 +51,19 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 }
 
 define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
-; AVX-LABEL: test_512_combine_v2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3]
-; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623]
-; AVX-NEXT:    vpand %ymm7, %ymm0, %ymm0
-; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
-; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm0
-; AVX-NEXT:    vpand %ymm7, %ymm1, %ymm1
-; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
-; AVX-NEXT:    vmovdqa %ymm4, %ymm0
-; AVX-NEXT:    vmovdqa %ymm5, %ymm1
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_512_combine_v2:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3]
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623]
+; AVXIFMA-NEXT:    vpand %ymm7, %ymm0, %ymm0
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm3, %ymm0
+; AVXIFMA-NEXT:    vpand %ymm7, %ymm1, %ymm1
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
+; AVXIFMA-NEXT:    vmovdqa %ymm4, %ymm0
+; AVXIFMA-NEXT:    vmovdqa %ymm5, %ymm1
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_512_combine_v2:
 ; AVX512:       # %bb.0:
@@ -58,6 +72,18 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z)
 ; AVX512-NEXT:    vpmadd52luq %zmm1, %zmm0, %zmm2
 ; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_512_combine_v2:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm4, %ymm5, %ymm3
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
+; AVX512-NOIFMA-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; AVX512-NOIFMA-NEXT:    retq
   %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1
   %y_masked = and <8 x i64> %y, splat (i64 3)
   %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -66,32 +92,32 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z)
 }
 
 define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
-; AVX-LABEL: test_512_no_combine:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495]
-; AVX-NEXT:    vpand %ymm6, %ymm0, %ymm7
-; AVX-NEXT:    vpand %ymm6, %ymm1, %ymm8
-; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm9
-; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm6
-; AVX-NEXT:    vpsrlq $32, %ymm8, %ymm8
-; AVX-NEXT:    vpmuludq %ymm3, %ymm8, %ymm8
-; AVX-NEXT:    vpsrlq $32, %ymm6, %ymm6
-; AVX-NEXT:    vpmuludq %ymm6, %ymm1, %ymm6
-; AVX-NEXT:    vpaddq %ymm6, %ymm8, %ymm6
-; AVX-NEXT:    vpsllq $32, %ymm6, %ymm6
-; AVX-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX-NEXT:    vpsrlq $32, %ymm7, %ymm3
-; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
-; AVX-NEXT:    vpsrlq $32, %ymm9, %ymm7
-; AVX-NEXT:    vpmuludq %ymm7, %ymm0, %ymm7
-; AVX-NEXT:    vpaddq %ymm3, %ymm7, %ymm3
-; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
-; AVX-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
-; AVX-NEXT:    vpaddq %ymm6, %ymm1, %ymm1
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_512_no_combine:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495]
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm0, %ymm7
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm1, %ymm8
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm2, %ymm9
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm3, %ymm6
+; AVXIFMA-NEXT:    vpsrlq $32, %ymm8, %ymm8
+; AVXIFMA-NEXT:    vpmuludq %ymm3, %ymm8, %ymm8
+; AVXIFMA-NEXT:    vpsrlq $32, %ymm6, %ymm6
+; AVXIFMA-NEXT:    vpmuludq %ymm6, %ymm1, %ymm6
+; AVXIFMA-NEXT:    vpaddq %ymm6, %ymm8, %ymm6
+; AVXIFMA-NEXT:    vpsllq $32, %ymm6, %ymm6
+; AVXIFMA-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVXIFMA-NEXT:    vpsrlq $32, %ymm7, %ymm3
+; AVXIFMA-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVXIFMA-NEXT:    vpsrlq $32, %ymm9, %ymm7
+; AVXIFMA-NEXT:    vpmuludq %ymm7, %ymm0, %ymm7
+; AVXIFMA-NEXT:    vpaddq %ymm3, %ymm7, %ymm3
+; AVXIFMA-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVXIFMA-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
+; AVXIFMA-NEXT:    vpaddq %ymm6, %ymm1, %ymm1
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_512_no_combine:
 ; AVX512:       # %bb.0:
@@ -108,6 +134,22 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z)
 ; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_512_no_combine:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495]
+; AVX512-NOIFMA-NEXT:    vpandq %zmm3, %zmm0, %zmm4
+; AVX512-NOIFMA-NEXT:    vpandq %zmm3, %zmm1, %zmm3
+; AVX512-NOIFMA-NEXT:    vpsrlq $32, %zmm4, %zmm4
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm1, %zmm4, %zmm4
+; AVX512-NOIFMA-NEXT:    vpsrlq $32, %zmm3, %zmm3
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512-NOIFMA-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    retq
   %x_masked = and <8 x i64> %x, splat (i64 4503599627370495)
   %y_masked = and <8 x i64> %y, splat (i64 4503599627370495)
   %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
@@ -116,27 +158,27 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z)
 }
 
 define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
-; AVX-LABEL: test_512_no_combine_v2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm6
-; AVX-NEXT:    vpmuludq %ymm3, %ymm6, %ymm6
-; AVX-NEXT:    vpsrlq $32, %ymm3, %ymm7
-; AVX-NEXT:    vpmuludq %ymm7, %ymm1, %ymm7
-; AVX-NEXT:    vpaddq %ymm6, %ymm7, %ymm6
-; AVX-NEXT:    vpsllq $32, %ymm6, %ymm6
-; AVX-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX-NEXT:    vpsrlq $32, %ymm0, %ymm3
-; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
-; AVX-NEXT:    vpsrlq $32, %ymm2, %ymm7
-; AVX-NEXT:    vpmuludq %ymm7, %ymm0, %ymm7
-; AVX-NEXT:    vpaddq %ymm3, %ymm7, %ymm3
-; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
-; AVX-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
-; AVX-NEXT:    vpaddq %ymm6, %ymm1, %ymm1
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_512_no_combine_v2:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    vpsrlq $32, %ymm1, %ymm6
+; AVXIFMA-NEXT:    vpmuludq %ymm3, %ymm6, %ymm6
+; AVXIFMA-NEXT:    vpsrlq $32, %ymm3, %ymm7
+; AVXIFMA-NEXT:    vpmuludq %ymm7, %ymm1, %ymm7
+; AVXIFMA-NEXT:    vpaddq %ymm6, %ymm7, %ymm6
+; AVXIFMA-NEXT:    vpsllq $32, %ymm6, %ymm6
+; AVXIFMA-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVXIFMA-NEXT:    vpsrlq $32, %ymm0, %ymm3
+; AVXIFMA-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVXIFMA-NEXT:    vpsrlq $32, %ymm2, %ymm7
+; AVXIFMA-NEXT:    vpmuludq %ymm7, %ymm0, %ymm7
+; AVXIFMA-NEXT:    vpaddq %ymm3, %ymm7, %ymm3
+; AVXIFMA-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVXIFMA-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
+; AVXIFMA-NEXT:    vpaddq %ymm6, %ymm1, %ymm1
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_512_no_combine_v2:
 ; AVX512:       # %bb.0:
@@ -150,6 +192,19 @@ define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %
 ; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_512_no_combine_v2:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpsrlq $32, %zmm0, %zmm3
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm1, %zmm3, %zmm3
+; AVX512-NOIFMA-NEXT:    vpsrlq $32, %zmm1, %zmm4
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm4, %zmm0, %zmm4
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm3, %zmm4, %zmm3
+; AVX512-NOIFMA-NEXT:    vpsllq $32, %zmm3, %zmm3
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    retq
   %mul = mul <8 x i64> %x, %y
   %res = add <8 x i64> %mul, %z
   ret <8 x i64> %res
@@ -255,25 +310,25 @@ define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z)
 
 ; 40-bit and 13-bit, too wide
 define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
-; AVX-LABEL: test_mixed_width_too_wide:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191]
-; AVX-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052]
-; AVX-NEXT:    vpshufb %ymm6, %ymm1, %ymm7
-; AVX-NEXT:    vpmuludq %ymm3, %ymm7, %ymm7
-; AVX-NEXT:    vpsllq $32, %ymm7, %ymm7
-; AVX-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX-NEXT:    vpshufb %ymm6, %ymm0, %ymm3
-; AVX-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
-; AVX-NEXT:    vpsllq $32, %ymm3, %ymm3
-; AVX-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
-; AVX-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm1, %ymm5, %ymm1
-; AVX-NEXT:    vpaddq %ymm7, %ymm1, %ymm1
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_mixed_width_too_wide:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191]
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm2, %ymm2
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm3, %ymm3
+; AVXIFMA-NEXT:    vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052]
+; AVXIFMA-NEXT:    vpshufb %ymm6, %ymm1, %ymm7
+; AVXIFMA-NEXT:    vpmuludq %ymm3, %ymm7, %ymm7
+; AVXIFMA-NEXT:    vpsllq $32, %ymm7, %ymm7
+; AVXIFMA-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
+; AVXIFMA-NEXT:    vpshufb %ymm6, %ymm0, %ymm3
+; AVXIFMA-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
+; AVXIFMA-NEXT:    vpsllq $32, %ymm3, %ymm3
+; AVXIFMA-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm1, %ymm5, %ymm1
+; AVXIFMA-NEXT:    vpaddq %ymm7, %ymm1, %ymm1
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_mixed_width_too_wide:
 ; AVX512:       # %bb.0:
@@ -286,6 +341,18 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64
 ; AVX512-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
 ; AVX512-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_mixed_width_too_wide:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm1, %zmm0, %zmm3
+; AVX512-NOIFMA-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpsllq $32, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; AVX512-NOIFMA-NEXT:    retq
   %x40 = and <8 x i64> %x, splat (i64 1099511627775)
   %y13 = and <8 x i64> %y, splat (i64 8191)
   %mul = mul <8 x i64> %x40, %y13
@@ -294,19 +361,19 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64
 }
 
 define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) {
-; AVX-LABEL: test_zext32_inputs_not_safe:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX-NEXT:    vpmuludq %ymm5, %ymm4, %ymm4
-; AVX-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
-; AVX-NEXT:    vpaddq %ymm4, %ymm2, %ymm0
-; AVX-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_zext32_inputs_not_safe:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVXIFMA-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVXIFMA-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVXIFMA-NEXT:    vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVXIFMA-NEXT:    vpmuludq %ymm5, %ymm4, %ymm4
+; AVXIFMA-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVXIFMA-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVXIFMA-NEXT:    vpmuludq %ymm1, %ymm0, %ymm1
+; AVXIFMA-NEXT:    vpaddq %ymm4, %ymm2, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zext32_inputs_not_safe:
 ; AVX512:       # %bb.0:
@@ -315,6 +382,14 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32,
 ; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_zext32_inputs_not_safe:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512-NOIFMA-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; AVX512-NOIFMA-NEXT:    retq
   %x = zext <8 x i32> %xi32 to <8 x i64>
   %y = zext <8 x i32> %yi32 to <8 x i64>
   %mul = mul <8 x i64> %x, %y
@@ -323,36 +398,36 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32,
 }
 
 define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) nounwind {
-; AVX-LABEL: test_1024_combine_split:
-; AVX:       # %bb.0:
-; AVX-NEXT:    pushq %rbp
-; AVX-NEXT:    movq %rsp, %rbp
-; AVX-NEXT:    andq $-32, %rsp
-; AVX-NEXT:    subq $32, %rsp
-; AVX-NEXT:    vmovdqa 112(%rbp), %ymm8
-; AVX-NEXT:    vmovdqa 80(%rbp), %ymm9
-; AVX-NEXT:    vmovdqa 48(%rbp), %ymm10
-; AVX-NEXT:    vmovdqa 16(%rbp), %ymm11
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863]
-; AVX-NEXT:    vpand %ymm3, %ymm12, %ymm3
-; AVX-NEXT:    vpand %ymm2, %ymm12, %ymm2
-; AVX-NEXT:    vpand %ymm1, %ymm12, %ymm1
-; AVX-NEXT:    vpand %ymm0, %ymm12, %ymm0
-; AVX-NEXT:    vpand %ymm7, %ymm12, %ymm7
-; AVX-NEXT:    {vex} vpmadd52luq %ymm7, %ymm3, %ymm8
-; AVX-NEXT:    vpand %ymm6, %ymm12, %ymm3
-; AVX-NEXT:    {vex} vpmadd52luq %ymm3, %ymm2, %ymm9
-; AVX-NEXT:    vpand %ymm5, %ymm12, %ymm2
-; AVX-NEXT:    {vex} vpmadd52luq %ymm2, %ymm1, %ymm10
-; AVX-NEXT:    vpand %ymm4, %ymm12, %ymm1
-; AVX-NEXT:    {vex} vpmadd52luq %ymm1, %ymm0, %ymm11
-; AVX-NEXT:    vmovdqa %ymm11, %ymm0
-; AVX-NEXT:    vmovdqa %ymm10, %ymm1
-; AVX-NEXT:    vmovdqa %ymm9, %ymm2
-; AVX-NEXT:    vmovdqa %ymm8, %ymm3
-; AVX-NEXT:    movq %rbp, %rsp
-; AVX-NEXT:    popq %rbp
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_1024_combine_split:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    pushq %rbp
+; AVXIFMA-NEXT:    movq %rsp, %rbp
+; AVXIFMA-NEXT:    andq $-32, %rsp
+; AVXIFMA-NEXT:    subq $32, %rsp
+; AVXIFMA-NEXT:    vmovdqa 112(%rbp), %ymm8
+; AVXIFMA-NEXT:    vmovdqa 80(%rbp), %ymm9
+; AVXIFMA-NEXT:    vmovdqa 48(%rbp), %ymm10
+; AVXIFMA-NEXT:    vmovdqa 16(%rbp), %ymm11
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863]
+; AVXIFMA-NEXT:    vpand %ymm3, %ymm12, %ymm3
+; AVXIFMA-NEXT:    vpand %ymm2, %ymm12, %ymm2
+; AVXIFMA-NEXT:    vpand %ymm1, %ymm12, %ymm1
+; AVXIFMA-NEXT:    vpand %ymm0, %ymm12, %ymm0
+; AVXIFMA-NEXT:    vpand %ymm7, %ymm12, %ymm7
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm7, %ymm3, %ymm8
+; AVXIFMA-NEXT:    vpand %ymm6, %ymm12, %ymm3
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm3, %ymm2, %ymm9
+; AVXIFMA-NEXT:    vpand %ymm5, %ymm12, %ymm2
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm2, %ymm1, %ymm10
+; AVXIFMA-NEXT:    vpand %ymm4, %ymm12, %ymm1
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm1, %ymm0, %ymm11
+; AVXIFMA-NEXT:    vmovdqa %ymm11, %ymm0
+; AVXIFMA-NEXT:    vmovdqa %ymm10, %ymm1
+; AVXIFMA-NEXT:    vmovdqa %ymm9, %ymm2
+; AVXIFMA-NEXT:    vmovdqa %ymm8, %ymm3
+; AVXIFMA-NEXT:    movq %rbp, %rsp
+; AVXIFMA-NEXT:    popq %rbp
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_1024_combine_split:
 ; AVX512:       # %bb.0:
@@ -366,6 +441,27 @@ define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i
 ; AVX512-NEXT:    vmovdqa64 %zmm4, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_1024_combine_split:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
+; AVX512-NOIFMA-NEXT:    vpandq %zmm6, %zmm1, %zmm1
+; AVX512-NOIFMA-NEXT:    vpandq %zmm6, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpandq %zmm6, %zmm3, %zmm3
+; AVX512-NOIFMA-NEXT:    vpandq %zmm6, %zmm2, %zmm2
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm2, %ymm6
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm0, %ymm7
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm4, %ymm8
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm6, %ymm7, %ymm8
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVX512-NOIFMA-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm0
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm5, %ymm6
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm2, %ymm4, %ymm6
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm3, %ymm1, %ymm5
+; AVX512-NOIFMA-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm1
+; AVX512-NOIFMA-NEXT:    retq
   %x_masked = and <16 x i64> %x, splat (i64 67108863)
   %y_masked = and <16 x i64> %y, splat (i64 67108863)
   %mul = mul <16 x i64> %x_masked, %y_masked
@@ -388,13 +484,13 @@ define <1 x i64> @test_not_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
 }
 
 define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) {
-; AVX-LABEL: test_v3i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863]
-; AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_v3i64:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863]
+; AVXIFMA-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-NOVL-LABEL: test_v3i64:
 ; AVX512-NOVL:       # %bb.0:
@@ -410,6 +506,13 @@ define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) {
 ; AVX512VL-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_v3i64:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512-NOIFMA-NEXT:    vpmuludq %ymm0, %ymm0, %ymm0
+; AVX512-NOIFMA-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512-NOIFMA-NEXT:    retq
   %x_masked = and <3 x i64> %x, splat (i64 67108863)
   %y_masked = and <3 x i64> %x, splat (i64 67108863)
   %mul = mul <3 x i64> %x_masked, %y_masked
@@ -418,35 +521,35 @@ define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) {
 }
 
 define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) {
-; AVX-LABEL: test_v5i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq %rdi, %rax
-; AVX-NEXT:    vmovq %r8, %xmm0
-; AVX-NEXT:    vmovq %rcx, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    vmovq %rdx, %xmm1
-; AVX-NEXT:    vmovq %rsi, %xmm2
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm2
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
-; AVX-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
-; AVX-NEXT:    vmovq %rcx, %xmm3
-; AVX-NEXT:    vmovq %r9, %xmm4
-; AVX-NEXT:    vpand %xmm3, %xmm4, %xmm3
-; AVX-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
-; AVX-NEXT:    vpsllq $33, %xmm4, %xmm4
-; AVX-NEXT:    vpmuludq %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm2
-; AVX-NEXT:    vmovdqa %ymm2, (%rdi)
-; AVX-NEXT:    vmovq %xmm1, 32(%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_v5i64:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    movq %rdi, %rax
+; AVXIFMA-NEXT:    vmovq %r8, %xmm0
+; AVXIFMA-NEXT:    vmovq %rcx, %xmm1
+; AVXIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVXIFMA-NEXT:    vmovq %rdx, %xmm1
+; AVXIFMA-NEXT:    vmovq %rsi, %xmm2
+; AVXIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVXIFMA-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVXIFMA-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVXIFMA-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm2
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
+; AVXIFMA-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVXIFMA-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVXIFMA-NEXT:    vmovq %rcx, %xmm3
+; AVXIFMA-NEXT:    vmovq %r9, %xmm4
+; AVXIFMA-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVXIFMA-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVXIFMA-NEXT:    vpmuludq %xmm4, %xmm3, %xmm4
+; AVXIFMA-NEXT:    vpsllq $33, %xmm4, %xmm4
+; AVXIFMA-NEXT:    vpmuludq %xmm3, %xmm3, %xmm3
+; AVXIFMA-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVXIFMA-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm2
+; AVXIFMA-NEXT:    vmovdqa %ymm2, (%rdi)
+; AVXIFMA-NEXT:    vmovq %xmm1, 32(%rdi)
+; AVXIFMA-NEXT:    vzeroupper
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v5i64:
 ; AVX512:       # %bb.0:
@@ -454,6 +557,13 @@ define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) {
 ; AVX512-NEXT:    vpmuludq %zmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_v5i64:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm0, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    retq
   %x_masked = and <5 x i64> %x, splat (i64 67108863)
   %y_masked = and <5 x i64> %x, splat (i64 67108863)
   %mul = mul <5 x i64> %x_masked, %y_masked
@@ -462,30 +572,30 @@ define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) {
 }
 
 define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) {
-; AVX-LABEL: test_v6i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq %rdi, %rax
-; AVX-NEXT:    vmovq %r8, %xmm0
-; AVX-NEXT:    vmovq %rcx, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    vmovq %rdx, %xmm1
-; AVX-NEXT:    vmovq %rsi, %xmm2
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm1
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863]
-; AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm1
-; AVX-NEXT:    vmovq %r9, %xmm0
-; AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rdi)
-; AVX-NEXT:    vmovdqa %ymm1, (%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_v6i64:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    movq %rdi, %rax
+; AVXIFMA-NEXT:    vmovq %r8, %xmm0
+; AVXIFMA-NEXT:    vmovq %rcx, %xmm1
+; AVXIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVXIFMA-NEXT:    vmovq %rdx, %xmm1
+; AVXIFMA-NEXT:    vmovq %rsi, %xmm2
+; AVXIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVXIFMA-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVXIFMA-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm1
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863]
+; AVXIFMA-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm1
+; AVXIFMA-NEXT:    vmovq %r9, %xmm0
+; AVXIFMA-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVXIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVXIFMA-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVXIFMA-NEXT:    vpmuldq %xmm0, %xmm0, %xmm0
+; AVXIFMA-NEXT:    vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVXIFMA-NEXT:    vmovdqa %xmm0, 32(%rdi)
+; AVXIFMA-NEXT:    vmovdqa %ymm1, (%rdi)
+; AVXIFMA-NEXT:    vzeroupper
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v6i64:
 ; AVX512:       # %bb.0:
@@ -493,6 +603,13 @@ define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) {
 ; AVX512-NEXT:    vpmuludq %zmm0, %zmm0, %zmm0
 ; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_v6i64:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpmuludq %zmm0, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    retq
   %x_masked = and <6 x i64> %x, splat (i64 67108863)
   %y_masked = and <6 x i64> %x, splat (i64 67108863)
   %mul = mul <6 x i64> %x_masked, %y_masked
@@ -501,43 +618,43 @@ define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) {
 }
 
 define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) {
-; AVX-LABEL: test_v9i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movq %rdi, %rax
-; AVX-NEXT:    vmovq %r8, %xmm0
-; AVX-NEXT:    vmovq %rcx, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    vmovq %rdx, %xmm1
-; AVX-NEXT:    vmovq %rsi, %xmm2
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT:    vmovq %r9, %xmm1
-; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX-NEXT:    vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
-; AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm3
-; AVX-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm4
-; AVX-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863]
-; AVX-NEXT:    vpand %ymm5, %ymm0, %ymm0
-; AVX-NEXT:    vpand %ymm5, %ymm1, %ymm1
-; AVX-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
-; AVX-NEXT:    vmovq %rcx, %xmm5
-; AVX-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
-; AVX-NEXT:    vpand %xmm5, %xmm6, %xmm5
-; AVX-NEXT:    vpsrlq $32, %xmm5, %xmm6
-; AVX-NEXT:    vpmuludq %xmm6, %xmm5, %xmm6
-; AVX-NEXT:    vpsllq $33, %xmm6, %xmm6
-; AVX-NEXT:    vpmuludq %xmm5, %xmm5, %xmm5
-; AVX-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
-; AVX-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm4
-; AVX-NEXT:    {vex} vpmadd52luq %ymm1, %ymm1, %ymm3
-; AVX-NEXT:    vmovdqa %ymm3, 32(%rdi)
-; AVX-NEXT:    vmovdqa %ymm4, (%rdi)
-; AVX-NEXT:    vmovq %xmm2, 64(%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVXIFMA-LABEL: test_v9i64:
+; AVXIFMA:       # %bb.0:
+; AVXIFMA-NEXT:    movq %rdi, %rax
+; AVXIFMA-NEXT:    vmovq %r8, %xmm0
+; AVXIFMA-NEXT:    vmovq %rcx, %xmm1
+; AVXIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVXIFMA-NEXT:    vmovq %rdx, %xmm1
+; AVXIFMA-NEXT:    vmovq %rsi, %xmm2
+; AVXIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVXIFMA-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVXIFMA-NEXT:    vmovq %r9, %xmm1
+; AVXIFMA-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVXIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVXIFMA-NEXT:    vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVXIFMA-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVXIFMA-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm3
+; AVXIFMA-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm4
+; AVXIFMA-NEXT:    vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863]
+; AVXIFMA-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVXIFMA-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVXIFMA-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVXIFMA-NEXT:    vmovq %rcx, %xmm5
+; AVXIFMA-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
+; AVXIFMA-NEXT:    vpand %xmm5, %xmm6, %xmm5
+; AVXIFMA-NEXT:    vpsrlq $32, %xmm5, %xmm6
+; AVXIFMA-NEXT:    vpmuludq %xmm6, %xmm5, %xmm6
+; AVXIFMA-NEXT:    vpsllq $33, %xmm6, %xmm6
+; AVXIFMA-NEXT:    vpmuludq %xmm5, %xmm5, %xmm5
+; AVXIFMA-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVXIFMA-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm4
+; AVXIFMA-NEXT:    {vex} vpmadd52luq %ymm1, %ymm1, %ymm3
+; AVXIFMA-NEXT:    vmovdqa %ymm3, 32(%rdi)
+; AVXIFMA-NEXT:    vmovdqa %ymm4, (%rdi)
+; AVXIFMA-NEXT:    vmovq %xmm2, 64(%rdi)
+; AVXIFMA-NEXT:    vzeroupper
+; AVXIFMA-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v9i64:
 ; AVX512:       # %bb.0:
@@ -572,6 +689,44 @@ define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) {
 ; AVX512-NEXT:    vmovdqa64 %zmm2, (%rdi)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
+;
+; AVX512-NOIFMA-LABEL: test_v9i64:
+; AVX512-NOIFMA:       # %bb.0:
+; AVX512-NOIFMA-NEXT:    movq %rdi, %rax
+; AVX512-NOIFMA-NEXT:    vmovq %r8, %xmm0
+; AVX512-NOIFMA-NEXT:    vmovq %rcx, %xmm1
+; AVX512-NOIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NOIFMA-NEXT:    vmovq %rdx, %xmm1
+; AVX512-NOIFMA-NEXT:    vmovq %rsi, %xmm2
+; AVX512-NOIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NOIFMA-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NOIFMA-NEXT:    vmovq %r9, %xmm1
+; AVX512-NOIFMA-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NOIFMA-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512-NOIFMA-NEXT:    vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512-NOIFMA-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NOIFMA-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512-NOIFMA-NEXT:    movl $67108863, %ecx # imm = 0x3FFFFFF
+; AVX512-NOIFMA-NEXT:    vmovq %rcx, %xmm2
+; AVX512-NOIFMA-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NOIFMA-NEXT:    vpand %xmm2, %xmm3, %xmm2
+; AVX512-NOIFMA-NEXT:    vpsrlq $32, %xmm2, %xmm3
+; AVX512-NOIFMA-NEXT:    vpmuludq %xmm3, %xmm2, %xmm3
+; AVX512-NOIFMA-NEXT:    vpsllq $33, %xmm3, %xmm3
+; AVX512-NOIFMA-NEXT:    vpmuludq %xmm2, %xmm2, %xmm2
+; AVX512-NOIFMA-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX512-NOIFMA-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
+; AVX512-NOIFMA-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512-NOIFMA-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm3
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm2, %ymm2, %ymm3
+; AVX512-NOIFMA-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm2
+; AVX512-NOIFMA-NEXT:    {vex} vpmadd52luq %ymm0, %ymm0, %ymm2
+; AVX512-NOIFMA-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; AVX512-NOIFMA-NEXT:    vmovq %xmm1, 64(%rdi)
+; AVX512-NOIFMA-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512-NOIFMA-NEXT:    vzeroupper
+; AVX512-NOIFMA-NEXT:    retq
   %x_masked = and <9 x i64> %x, splat (i64 67108863)
   %y_masked = and <9 x i64> %x, splat (i64 67108863)
   %mul = mul <9 x i64> %x_masked, %y_masked
