[llvm] [X86] Lower i512 ADD/SUB using Kogge-Stone on AVX512 (PR #174761)

Islam Imad via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 7 04:40:55 PST 2026


https://github.com/Islam-Imad created https://github.com/llvm/llvm-project/pull/174761

Fixes #173996

>From cd4d64fc410a36f593d7975c859aed040dda98a6 Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 7 Jan 2026 14:27:11 +0200
Subject: [PATCH] [X86] Lower i512 ADD/SUB using Kogge-Stone on AVX512 Fixes
 #173996

---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  62 +++++
 llvm/test/CodeGen/X86/add-i512.ll       | 306 ++++++++++++++--------
 llvm/test/CodeGen/X86/sub-i512.ll       | 322 +++++++++++++++---------
 3 files changed, 464 insertions(+), 226 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6501aa3746a0f..29bb306c88add 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -1847,6 +1848,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::AND, MVT::i512, Custom);
     setOperationAction(ISD::OR, MVT::i512, Custom);
     setOperationAction(ISD::XOR, MVT::i512, Custom);
+    setOperationAction(ISD::ADD, MVT::i512, Custom);
+    setOperationAction(ISD::SUB, MVT::i512, Custom);
 
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
       setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
@@ -34031,6 +34034,65 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(DAG.getBitcast(VT, Op));
     return;
   }
+  case ISD::ADD:
+  case ISD::SUB: {
+    // Use Kogge-Stone parallel carry/borrow propagation for i512 add/sub.
+    // TODO: ISD::UADDO_CARRY
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+    EVT VT = N->getValueType(0);
+    bool IsAdd = Opc == ISD::ADD;
+
+    assert(Subtarget.useAVX512Regs() && "AVX512 required");
+    assert(VT == MVT::i512 && "Unexpected VT!");
+
+    if (!mayFoldIntoVector(LHS, Subtarget) ||
+        !mayFoldIntoVector(RHS, Subtarget))
+      return;
+
+    MVT VecVT = MVT::getVectorVT(MVT::i64, 8);
+    MVT BoolVT = MVT::getVectorVT(MVT::i1, 8);
+    SDValue Vec0 = DAG.getBitcast(VecVT, LHS);
+    SDValue Vec1 = DAG.getBitcast(VecVT, RHS);
+    SDValue AllOnes = DAG.getAllOnesConstant(dl, VecVT);
+
+    // Compute partial sum/difference (per-lane, no carry propagation).
+    SDValue Partial = DAG.getNode(Opc, dl, VecVT, Vec0, Vec1);
+
+    // Detect carry/borrow generation.
+    ISD::CondCode CarryCC = IsAdd ? ISD::SETULT : ISD::SETUGT;
+    SDValue Carry = DAG.getSetCC(dl, BoolVT, Partial, Vec0, CarryCC);
+
+    // Detect propagate lanes.
+    SDValue PropCmp = IsAdd ? AllOnes : DAG.getConstant(0, dl, VecVT);
+    SDValue Propagate = DAG.getSetCC(dl, BoolVT, Partial, PropCmp, ISD::SETEQ);
+
+    // Convert masks to scalar for Kogge-Stone propagation.
+    SDValue CarryIn = DAG.getNode(ISD::BITCAST, dl, MVT::i8, Carry);
+    SDValue PropIn = DAG.getNode(ISD::BITCAST, dl, MVT::i8, Propagate);
+    CarryIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, CarryIn);
+    PropIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, PropIn);
+
+    // Kogge-Stone: shift carry left and add propagate.
+    SDValue ShiftedCarry = DAG.getNode(ISD::SHL, dl, MVT::i32, CarryIn,
+                                       DAG.getConstant(1, dl, MVT::i8));
+    SDValue CarryOut =
+        DAG.getNode(ISD::ADD, dl, MVT::i32, ShiftedCarry, PropIn);
+
+    // Correction mask: lanes that received a carry/borrow.
+    SDValue CorrMask = DAG.getNode(ISD::XOR, dl, MVT::i32, PropIn, CarryOut);
+    CorrMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CorrMask);
+    SDValue CorrVec = DAG.getNode(ISD::BITCAST, dl, BoolVT, CorrMask);
+
+    // Apply correction: +1 for ADD (via -(-1)), -1 for SUB (via +(-1)).
+    unsigned AdjustOpc = IsAdd ? ISD::SUB : ISD::ADD;
+    SDValue Adjusted = DAG.getNode(AdjustOpc, dl, VecVT, Partial, AllOnes);
+    SDValue Res =
+        DAG.getNode(ISD::VSELECT, dl, VecVT, CorrVec, Adjusted, Partial);
+
+    Results.push_back(DAG.getBitcast(VT, Res));
+    return;
+  }
   case ISD::CTPOP: {
     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
     // If we have at most 32 active bits, then perform as i32 CTPOP.
diff --git a/llvm/test/CodeGen/X86/add-i512.ll b/llvm/test/CodeGen/X86/add-i512.ll
index 16ba701072031..1e31fd4fe374f 100644
--- a/llvm/test/CodeGen/X86/add-i512.ll
+++ b/llvm/test/CodeGen/X86/add-i512.ll
@@ -58,23 +58,31 @@ define i512 @test_add_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
 ; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
 ; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
+; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT:    vmovq %r9, %xmm0
+; AVX512F-NEXT:    vmovq %r10, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    vmovq %rdi, %xmm1
 ; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    movq %rcx, 16(%rax)
-; AVX512F-NEXT:    movq %rdx, 8(%rax)
-; AVX512F-NEXT:    movq %rsi, (%rax)
-; AVX512F-NEXT:    movq %r8, 24(%rax)
-; AVX512F-NEXT:    movq %r9, 32(%rax)
-; AVX512F-NEXT:    movq %rdi, 40(%rax)
-; AVX512F-NEXT:    movq %r10, 48(%rax)
-; AVX512F-NEXT:    movq %r11, 56(%rax)
+; AVX512F-NEXT:    vmovq %r11, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovq %rcx, %xmm1
+; AVX512F-NEXT:    vmovq %r8, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vmovq %rdx, %xmm2
+; AVX512F-NEXT:    vmovq %rsi, %xmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_add_i512_reg_reg:
@@ -82,23 +90,32 @@ define i512 @test_add_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512VL-NEXT:    addq {{[0-9]+}}(%rsp), %rsi
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %rdx
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %r9
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT:    vmovq %r9, %xmm0
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT:    vmovq %r11, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT:    vmovq %rdi, %xmm1
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq %rcx, 16(%rax)
-; AVX512VL-NEXT:    movq %rdx, 8(%rax)
-; AVX512VL-NEXT:    movq %rsi, (%rax)
-; AVX512VL-NEXT:    movq %r8, 24(%rax)
-; AVX512VL-NEXT:    movq %r9, 32(%rax)
-; AVX512VL-NEXT:    movq %r11, 40(%rax)
-; AVX512VL-NEXT:    movq %rdi, 48(%rax)
-; AVX512VL-NEXT:    movq %r10, 56(%rax)
+; AVX512VL-NEXT:    vmovq %r10, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovq %rcx, %xmm1
+; AVX512VL-NEXT:    vmovq %r8, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vmovq %rdx, %xmm2
+; AVX512VL-NEXT:    vmovq %rsi, %xmm3
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %r = add i512 %a0, %a1
   ret i512 %r
@@ -162,27 +179,35 @@ define i512 @test_add_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
 ; AVX512F-LABEL: test_add_i512_reg_mem:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    addq (%r10), %rsi
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512F-NEXT:    adcq 8(%r10), %rdx
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT:    vmovq %rsi, %xmm0
+; AVX512F-NEXT:    vmovq %rdx, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    adcq 16(%r10), %rcx
+; AVX512F-NEXT:    vmovq %rcx, %xmm1
 ; AVX512F-NEXT:    adcq 24(%r10), %r8
-; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vmovq %r8, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT:    adcq 32(%r10), %r9
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT:    adcq 40(%r10), %rdi
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    adcq 48(%r10), %r11
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT:    adcq 56(%r10), %rbx
-; AVX512F-NEXT:    movq %rsi, (%rax)
-; AVX512F-NEXT:    movq %rdx, 8(%rax)
-; AVX512F-NEXT:    movq %rcx, 16(%rax)
-; AVX512F-NEXT:    movq %r8, 24(%rax)
-; AVX512F-NEXT:    movq %r9, 32(%rax)
-; AVX512F-NEXT:    movq %rdi, 40(%rax)
-; AVX512F-NEXT:    movq %r11, 48(%rax)
-; AVX512F-NEXT:    movq %rbx, 56(%rax)
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    adcq 40(%r10), %rbx
+; AVX512F-NEXT:    vmovq %r9, %xmm1
+; AVX512F-NEXT:    adcq 48(%r10), %rdi
+; AVX512F-NEXT:    vmovq %rbx, %xmm2
+; AVX512F-NEXT:    adcq 56(%r10), %r11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vmovq %rdi, %xmm2
+; AVX512F-NEXT:    vmovq %r11, %xmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    retq
 ;
@@ -190,27 +215,36 @@ define i512 @test_add_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; AVX512VL-NEXT:    addq (%rbx), %rsi
+; AVX512VL-NEXT:    vmovq %rsi, %xmm0
 ; AVX512VL-NEXT:    adcq 8(%rbx), %rdx
+; AVX512VL-NEXT:    vmovq %rdx, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    adcq 16(%rbx), %rcx
+; AVX512VL-NEXT:    vmovq %rcx, %xmm1
 ; AVX512VL-NEXT:    adcq 24(%rbx), %r8
+; AVX512VL-NEXT:    vmovq %r8, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    adcq 32(%rbx), %r9
+; AVX512VL-NEXT:    vmovq %r9, %xmm1
 ; AVX512VL-NEXT:    adcq 40(%rbx), %r11
-; AVX512VL-NEXT:    adcq 48(%rbx), %rdi
-; AVX512VL-NEXT:    adcq 56(%rbx), %r10
-; AVX512VL-NEXT:    movq %rsi, (%rax)
-; AVX512VL-NEXT:    movq %rdx, 8(%rax)
-; AVX512VL-NEXT:    movq %rcx, 16(%rax)
-; AVX512VL-NEXT:    movq %r8, 24(%rax)
-; AVX512VL-NEXT:    movq %r9, 32(%rax)
-; AVX512VL-NEXT:    movq %r11, 40(%rax)
-; AVX512VL-NEXT:    movq %rdi, 48(%rax)
-; AVX512VL-NEXT:    movq %r10, 56(%rax)
+; AVX512VL-NEXT:    vmovq %r11, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    adcq 48(%rbx), %r10
+; AVX512VL-NEXT:    vmovq %r10, %xmm2
+; AVX512VL-NEXT:    adcq 56(%rbx), %rdi
+; AVX512VL-NEXT:    vmovq %rdi, %xmm3
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a1 = load i512, ptr %p1
   %r = add i512 %a0, %a1
@@ -270,27 +304,37 @@ define i512 @test_add_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: test_add_i512_mem_reg:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movq 56(%rsi), %rdi
+; AVX512F-NEXT:    movq 48(%rsi), %r10
 ; AVX512F-NEXT:    addq (%rsi), %rdx
+; AVX512F-NEXT:    movq 32(%rsi), %r11
 ; AVX512F-NEXT:    adcq 8(%rsi), %rcx
+; AVX512F-NEXT:    movq 40(%rsi), %rbx
+; AVX512F-NEXT:    vmovq %rdx, %xmm0
+; AVX512F-NEXT:    vmovq %rcx, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    adcq 16(%rsi), %r8
-; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vmovq %r8, %xmm1
 ; AVX512F-NEXT:    adcq 24(%rsi), %r9
-; AVX512F-NEXT:    movq 32(%rsi), %rdi
-; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT:    movq 40(%rsi), %r10
-; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    movq 48(%rsi), %r11
+; AVX512F-NEXT:    vmovq %r9, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    movq 56(%rsi), %rsi
-; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT:    movq %rdx, (%rax)
-; AVX512F-NEXT:    movq %rcx, 8(%rax)
-; AVX512F-NEXT:    movq %r8, 16(%rax)
-; AVX512F-NEXT:    movq %r9, 24(%rax)
-; AVX512F-NEXT:    movq %rdi, 32(%rax)
-; AVX512F-NEXT:    movq %r10, 40(%rax)
-; AVX512F-NEXT:    movq %r11, 48(%rax)
-; AVX512F-NEXT:    movq %rsi, 56(%rax)
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT:    vmovq %r11, %xmm1
+; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    vmovq %rbx, %xmm2
+; AVX512F-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vmovq %r10, %xmm2
+; AVX512F-NEXT:    vmovq %rdi, %xmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
+; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_add_i512_mem_reg:
@@ -302,22 +346,31 @@ define i512 @test_add_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq 32(%rsi), %r11
 ; AVX512VL-NEXT:    movq 40(%rsi), %rbx
 ; AVX512VL-NEXT:    addq (%rsi), %rdx
+; AVX512VL-NEXT:    vmovq %rdx, %xmm0
 ; AVX512VL-NEXT:    adcq 8(%rsi), %rcx
+; AVX512VL-NEXT:    vmovq %rcx, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    adcq 16(%rsi), %r8
+; AVX512VL-NEXT:    vmovq %r8, %xmm1
 ; AVX512VL-NEXT:    adcq 24(%rsi), %r9
+; AVX512VL-NEXT:    vmovq %r9, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT:    vmovq %r11, %xmm1
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT:    vmovq %rbx, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT:    vmovq %r10, %xmm2
 ; AVX512VL-NEXT:    adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT:    movq %rdx, (%rax)
-; AVX512VL-NEXT:    movq %rcx, 8(%rax)
-; AVX512VL-NEXT:    movq %r8, 16(%rax)
-; AVX512VL-NEXT:    movq %r9, 24(%rax)
-; AVX512VL-NEXT:    movq %r11, 32(%rax)
-; AVX512VL-NEXT:    movq %rbx, 40(%rax)
-; AVX512VL-NEXT:    movq %r10, 48(%rax)
-; AVX512VL-NEXT:    movq %rdi, 56(%rax)
+; AVX512VL-NEXT:    vmovq %rdi, %xmm3
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a0 = load i512, ptr %p0
   %r = add i512 %a0, %a1
@@ -383,34 +436,43 @@ define i512 @test_inc_i512_mem(ptr %p0) nounwind {
 ; AVX2-NEXT:    movq %rcx, 56(%rax)
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_inc_i512_mem:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movq %rdi, %rax
-; AVX512-NEXT:    movq 56(%rsi), %rcx
-; AVX512-NEXT:    movq 48(%rsi), %rdx
-; AVX512-NEXT:    movq 40(%rsi), %rdi
-; AVX512-NEXT:    movq 32(%rsi), %r8
-; AVX512-NEXT:    movq 24(%rsi), %r9
-; AVX512-NEXT:    movq 16(%rsi), %r10
-; AVX512-NEXT:    movq (%rsi), %r11
-; AVX512-NEXT:    movq 8(%rsi), %rsi
-; AVX512-NEXT:    addq $1, %r11
-; AVX512-NEXT:    adcq $0, %rsi
-; AVX512-NEXT:    adcq $0, %r10
-; AVX512-NEXT:    adcq $0, %r9
-; AVX512-NEXT:    adcq $0, %r8
-; AVX512-NEXT:    adcq $0, %rdi
-; AVX512-NEXT:    adcq $0, %rdx
-; AVX512-NEXT:    adcq $0, %rcx
-; AVX512-NEXT:    movq %r11, (%rax)
-; AVX512-NEXT:    movq %rsi, 8(%rax)
-; AVX512-NEXT:    movq %r10, 16(%rax)
-; AVX512-NEXT:    movq %r9, 24(%rax)
-; AVX512-NEXT:    movq %r8, 32(%rax)
-; AVX512-NEXT:    movq %rdi, 40(%rax)
-; AVX512-NEXT:    movq %rdx, 48(%rax)
-; AVX512-NEXT:    movq %rcx, 56(%rax)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_inc_i512_mem:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: test_inc_i512_mem:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT:    kmovd %k0, %ecx
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT:    kmovb %k0, %edx
+; AVX512VL-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT:    xorl %edx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
   %a0 = load i512, ptr %p0
   %r = add i512 %a0, 1
   ret i512 %r
@@ -429,19 +491,45 @@ define void @test_inc_i512_rmw(ptr %p0) nounwind {
 ; CHECK-NEXT:    adcq $0, 56(%rdi)
 ; CHECK-NEXT:    retq
 ;
-; AVX512-LABEL: test_inc_i512_rmw:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    addq $1, (%rdi)
-; AVX512-NEXT:    adcq $0, 8(%rdi)
-; AVX512-NEXT:    adcq $0, 16(%rdi)
-; AVX512-NEXT:    adcq $0, 24(%rdi)
-; AVX512-NEXT:    adcq $0, 32(%rdi)
-; AVX512-NEXT:    adcq $0, 40(%rdi)
-; AVX512-NEXT:    adcq $0, 48(%rdi)
-; AVX512-NEXT:    adcq $0, 56(%rdi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_inc_i512_rmw:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    movzbl %cl, %ecx
+; AVX512F-NEXT:    leal (%rcx,%rax,2), %eax
+; AVX512F-NEXT:    xorl %ecx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: test_inc_i512_rmw:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT:    vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT:    kmovd %k0, %eax
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT:    kmovb %k0, %ecx
+; AVX512VL-NEXT:    leal (%rcx,%rax,2), %eax
+; AVX512VL-NEXT:    xorl %ecx, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
   %a0 = load i512, ptr %p0
   %r = add i512 %a0, 1
   store i512 %r, ptr %p0
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sub-i512.ll b/llvm/test/CodeGen/X86/sub-i512.ll
index b2b57fe923adc..65c084623d06c 100644
--- a/llvm/test/CodeGen/X86/sub-i512.ll
+++ b/llvm/test/CodeGen/X86/sub-i512.ll
@@ -58,23 +58,31 @@ define i512 @test_sub_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    subq {{[0-9]+}}(%rsp), %rsi
 ; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
 ; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r8
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r9
+; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r8
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r9
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT:    vmovq %r9, %xmm0
+; AVX512F-NEXT:    vmovq %r10, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    vmovq %rdi, %xmm1
 ; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    movq %rcx, 16(%rax)
-; AVX512F-NEXT:    movq %rdx, 8(%rax)
-; AVX512F-NEXT:    movq %rsi, (%rax)
-; AVX512F-NEXT:    movq %r8, 24(%rax)
-; AVX512F-NEXT:    movq %r9, 32(%rax)
-; AVX512F-NEXT:    movq %rdi, 40(%rax)
-; AVX512F-NEXT:    movq %r10, 48(%rax)
-; AVX512F-NEXT:    movq %r11, 56(%rax)
+; AVX512F-NEXT:    vmovq %r11, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovq %rcx, %xmm1
+; AVX512F-NEXT:    vmovq %r8, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vmovq %rdx, %xmm2
+; AVX512F-NEXT:    vmovq %rsi, %xmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_sub_i512_reg_reg:
@@ -82,23 +90,32 @@ define i512 @test_sub_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512VL-NEXT:    subq {{[0-9]+}}(%rsp), %rsi
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %r8
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %r9
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT:    vmovq %r9, %xmm0
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT:    vmovq %r11, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT:    vmovq %rdi, %xmm1
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq %rcx, 16(%rax)
-; AVX512VL-NEXT:    movq %rdx, 8(%rax)
-; AVX512VL-NEXT:    movq %rsi, (%rax)
-; AVX512VL-NEXT:    movq %r8, 24(%rax)
-; AVX512VL-NEXT:    movq %r9, 32(%rax)
-; AVX512VL-NEXT:    movq %r11, 40(%rax)
-; AVX512VL-NEXT:    movq %rdi, 48(%rax)
-; AVX512VL-NEXT:    movq %r10, 56(%rax)
+; AVX512VL-NEXT:    vmovq %r10, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovq %rcx, %xmm1
+; AVX512VL-NEXT:    vmovq %r8, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vmovq %rdx, %xmm2
+; AVX512VL-NEXT:    vmovq %rsi, %xmm3
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %r = sub i512 %a0, %a1
   ret i512 %r
@@ -162,27 +179,35 @@ define i512 @test_sub_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
 ; AVX512F-LABEL: test_sub_i512_reg_mem:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    subq (%r10), %rsi
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512F-NEXT:    sbbq 8(%r10), %rdx
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT:    vmovq %rsi, %xmm0
+; AVX512F-NEXT:    vmovq %rdx, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    sbbq 16(%r10), %rcx
+; AVX512F-NEXT:    vmovq %rcx, %xmm1
 ; AVX512F-NEXT:    sbbq 24(%r10), %r8
-; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vmovq %r8, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT:    sbbq 32(%r10), %r9
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT:    sbbq 40(%r10), %rdi
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    sbbq 48(%r10), %r11
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT:    sbbq 56(%r10), %rbx
-; AVX512F-NEXT:    movq %rsi, (%rax)
-; AVX512F-NEXT:    movq %rdx, 8(%rax)
-; AVX512F-NEXT:    movq %rcx, 16(%rax)
-; AVX512F-NEXT:    movq %r8, 24(%rax)
-; AVX512F-NEXT:    movq %r9, 32(%rax)
-; AVX512F-NEXT:    movq %rdi, 40(%rax)
-; AVX512F-NEXT:    movq %r11, 48(%rax)
-; AVX512F-NEXT:    movq %rbx, 56(%rax)
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    sbbq 40(%r10), %rbx
+; AVX512F-NEXT:    vmovq %r9, %xmm1
+; AVX512F-NEXT:    sbbq 48(%r10), %rdi
+; AVX512F-NEXT:    vmovq %rbx, %xmm2
+; AVX512F-NEXT:    sbbq 56(%r10), %r11
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vmovq %rdi, %xmm2
+; AVX512F-NEXT:    vmovq %r11, %xmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    retq
 ;
@@ -190,27 +215,36 @@ define i512 @test_sub_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; AVX512VL-NEXT:    subq (%rbx), %rsi
+; AVX512VL-NEXT:    vmovq %rsi, %xmm0
 ; AVX512VL-NEXT:    sbbq 8(%rbx), %rdx
+; AVX512VL-NEXT:    vmovq %rdx, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    sbbq 16(%rbx), %rcx
+; AVX512VL-NEXT:    vmovq %rcx, %xmm1
 ; AVX512VL-NEXT:    sbbq 24(%rbx), %r8
+; AVX512VL-NEXT:    vmovq %r8, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    sbbq 32(%rbx), %r9
+; AVX512VL-NEXT:    vmovq %r9, %xmm1
 ; AVX512VL-NEXT:    sbbq 40(%rbx), %r11
-; AVX512VL-NEXT:    sbbq 48(%rbx), %rdi
-; AVX512VL-NEXT:    sbbq 56(%rbx), %r10
-; AVX512VL-NEXT:    movq %rsi, (%rax)
-; AVX512VL-NEXT:    movq %rdx, 8(%rax)
-; AVX512VL-NEXT:    movq %rcx, 16(%rax)
-; AVX512VL-NEXT:    movq %r8, 24(%rax)
-; AVX512VL-NEXT:    movq %r9, 32(%rax)
-; AVX512VL-NEXT:    movq %r11, 40(%rax)
-; AVX512VL-NEXT:    movq %rdi, 48(%rax)
-; AVX512VL-NEXT:    movq %r10, 56(%rax)
+; AVX512VL-NEXT:    vmovq %r11, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    sbbq 48(%rbx), %r10
+; AVX512VL-NEXT:    vmovq %r10, %xmm2
+; AVX512VL-NEXT:    sbbq 56(%rbx), %rdi
+; AVX512VL-NEXT:    vmovq %rdi, %xmm3
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a1 = load i512, ptr %p1
   %r = sub i512 %a0, %a1
@@ -284,33 +318,47 @@ define i512 @test_sub_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: test_sub_i512_mem_reg:
 ; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r15
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %r12
 ; AVX512F-NEXT:    pushq %rbx
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq 24(%rsi), %rdi
-; AVX512F-NEXT:    movq 16(%rsi), %r10
-; AVX512F-NEXT:    movq (%rsi), %r11
-; AVX512F-NEXT:    movq 8(%rsi), %rbx
-; AVX512F-NEXT:    subq %rdx, %r11
-; AVX512F-NEXT:    sbbq %rcx, %rbx
-; AVX512F-NEXT:    sbbq %r8, %r10
-; AVX512F-NEXT:    sbbq %r9, %rdi
-; AVX512F-NEXT:    movq 32(%rsi), %rcx
-; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT:    movq 40(%rsi), %rdx
-; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
-; AVX512F-NEXT:    movq 48(%rsi), %r8
-; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT:    movq 56(%rsi), %rsi
-; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT:    movq %r11, (%rax)
-; AVX512F-NEXT:    movq %rbx, 8(%rax)
-; AVX512F-NEXT:    movq %r10, 16(%rax)
-; AVX512F-NEXT:    movq %rdi, 24(%rax)
-; AVX512F-NEXT:    movq %rcx, 32(%rax)
-; AVX512F-NEXT:    movq %rdx, 40(%rax)
-; AVX512F-NEXT:    movq %r8, 48(%rax)
-; AVX512F-NEXT:    movq %rsi, 56(%rax)
+; AVX512F-NEXT:    movq 56(%rsi), %rdi
+; AVX512F-NEXT:    movq 48(%rsi), %r10
+; AVX512F-NEXT:    movq 40(%rsi), %r11
+; AVX512F-NEXT:    movq 32(%rsi), %rbx
+; AVX512F-NEXT:    movq 24(%rsi), %r14
+; AVX512F-NEXT:    movq 16(%rsi), %r15
+; AVX512F-NEXT:    movq (%rsi), %r12
+; AVX512F-NEXT:    movq 8(%rsi), %rsi
+; AVX512F-NEXT:    subq %rdx, %r12
+; AVX512F-NEXT:    vmovq %r12, %xmm0
+; AVX512F-NEXT:    sbbq %rcx, %rsi
+; AVX512F-NEXT:    vmovq %rsi, %xmm1
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    sbbq %r8, %r15
+; AVX512F-NEXT:    vmovq %r15, %xmm1
+; AVX512F-NEXT:    sbbq %r9, %r14
+; AVX512F-NEXT:    vmovq %r14, %xmm2
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT:    vmovq %rbx, %xmm1
+; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    vmovq %r11, %xmm2
+; AVX512F-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT:    vmovq %r10, %xmm2
+; AVX512F-NEXT:    vmovq %rdi, %xmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r12
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %r15
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_sub_i512_mem_reg:
@@ -329,25 +377,34 @@ define i512 @test_sub_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq (%rsi), %r12
 ; AVX512VL-NEXT:    movq 8(%rsi), %rsi
 ; AVX512VL-NEXT:    subq %rdx, %r12
+; AVX512VL-NEXT:    vmovq %r12, %xmm0
 ; AVX512VL-NEXT:    sbbq %rcx, %rsi
+; AVX512VL-NEXT:    vmovq %rsi, %xmm1
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    sbbq %r8, %r15
+; AVX512VL-NEXT:    vmovq %r15, %xmm1
 ; AVX512VL-NEXT:    sbbq %r9, %r14
+; AVX512VL-NEXT:    vmovq %r14, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT:    vmovq %rbx, %xmm1
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT:    vmovq %r11, %xmm2
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT:    vmovq %r10, %xmm2
 ; AVX512VL-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT:    movq %r12, (%rax)
-; AVX512VL-NEXT:    movq %rsi, 8(%rax)
-; AVX512VL-NEXT:    movq %r15, 16(%rax)
-; AVX512VL-NEXT:    movq %r14, 24(%rax)
-; AVX512VL-NEXT:    movq %rbx, 32(%rax)
-; AVX512VL-NEXT:    movq %r11, 40(%rax)
-; AVX512VL-NEXT:    movq %r10, 48(%rax)
-; AVX512VL-NEXT:    movq %rdi, 56(%rax)
+; AVX512VL-NEXT:    vmovq %rdi, %xmm3
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r12
 ; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %a0 = load i512, ptr %p0
   %r = sub i512 %a0, %a1
@@ -413,34 +470,41 @@ define i512 @test_dec_i512_mem(ptr %p0) nounwind {
 ; AVX2-NEXT:    movq %rcx, 56(%rax)
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_dec_i512_mem:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movq %rdi, %rax
-; AVX512-NEXT:    movq 56(%rsi), %rcx
-; AVX512-NEXT:    movq 48(%rsi), %rdx
-; AVX512-NEXT:    movq 40(%rsi), %rdi
-; AVX512-NEXT:    movq 32(%rsi), %r8
-; AVX512-NEXT:    movq 24(%rsi), %r9
-; AVX512-NEXT:    movq 16(%rsi), %r10
-; AVX512-NEXT:    movq (%rsi), %r11
-; AVX512-NEXT:    movq 8(%rsi), %rsi
-; AVX512-NEXT:    addq $-1, %r11
-; AVX512-NEXT:    adcq $-1, %rsi
-; AVX512-NEXT:    adcq $-1, %r10
-; AVX512-NEXT:    adcq $-1, %r9
-; AVX512-NEXT:    adcq $-1, %r8
-; AVX512-NEXT:    adcq $-1, %rdi
-; AVX512-NEXT:    adcq $-1, %rdx
-; AVX512-NEXT:    adcq $-1, %rcx
-; AVX512-NEXT:    movq %r11, (%rax)
-; AVX512-NEXT:    movq %rsi, 8(%rax)
-; AVX512-NEXT:    movq %r10, 16(%rax)
-; AVX512-NEXT:    movq %r9, 24(%rax)
-; AVX512-NEXT:    movq %r8, 32(%rax)
-; AVX512-NEXT:    movq %rdi, 40(%rax)
-; AVX512-NEXT:    movq %rdx, 48(%rax)
-; AVX512-NEXT:    movq %rcx, 56(%rax)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_dec_i512_mem:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: test_dec_i512_mem:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    kmovb %k0, %ecx
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT:    kmovd %k0, %edx
+; AVX512VL-NEXT:    leal (%rcx,%rdx,2), %edx
+; AVX512VL-NEXT:    xorl %ecx, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
   %a0 = load i512, ptr %p0
   %r = sub i512 %a0, 1
   ret i512 %r
@@ -459,19 +523,43 @@ define void @test_dec_i512_rmw(ptr %p0) nounwind {
 ; CHECK-NEXT:    adcq $-1, 56(%rdi)
 ; CHECK-NEXT:    retq
 ;
-; AVX512-LABEL: test_dec_i512_rmw:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    addq $-1, (%rdi)
-; AVX512-NEXT:    adcq $-1, 8(%rdi)
-; AVX512-NEXT:    adcq $-1, 16(%rdi)
-; AVX512-NEXT:    adcq $-1, 24(%rdi)
-; AVX512-NEXT:    adcq $-1, 32(%rdi)
-; AVX512-NEXT:    adcq $-1, 40(%rdi)
-; AVX512-NEXT:    adcq $-1, 48(%rdi)
-; AVX512-NEXT:    adcq $-1, 56(%rdi)
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: test_dec_i512_rmw:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    movzbl %cl, %ecx
+; AVX512F-NEXT:    leal (%rcx,%rax,2), %eax
+; AVX512F-NEXT:    xorl %ecx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k0
+; AVX512F-NEXT:    knotw %k0, %k1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: test_dec_i512_rmw:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    kmovb %k0, %eax
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT:    kmovd %k0, %ecx
+; AVX512VL-NEXT:    leal (%rax,%rcx,2), %ecx
+; AVX512VL-NEXT:    xorl %eax, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k0
+; AVX512VL-NEXT:    knotb %k0, %k1
+; AVX512VL-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
   %a0 = load i512, ptr %p0
   %r = sub i512 %a0, 1
   store i512 %r, ptr %p0
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}



More information about the llvm-commits mailing list