[llvm] [X86] Lower i512 ADD/SUB using Kogge-Stone on AVX512 (PR #174761)
Islam Imad via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 7 04:40:55 PST 2026
https://github.com/Islam-Imad created https://github.com/llvm/llvm-project/pull/174761
Fixes #173996
>From cd4d64fc410a36f593d7975c859aed040dda98a6 Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 7 Jan 2026 14:27:11 +0200
Subject: [PATCH] [X86] Lower i512 ADD/SUB using Kogge-Stone on AVX512 Fixes
#173996
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 62 +++++
llvm/test/CodeGen/X86/add-i512.ll | 306 ++++++++++++++--------
llvm/test/CodeGen/X86/sub-i512.ll | 322 +++++++++++++++---------
3 files changed, 464 insertions(+), 226 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6501aa3746a0f..29bb306c88add 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -1847,6 +1848,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::AND, MVT::i512, Custom);
setOperationAction(ISD::OR, MVT::i512, Custom);
setOperationAction(ISD::XOR, MVT::i512, Custom);
+ setOperationAction(ISD::ADD, MVT::i512, Custom);
+ setOperationAction(ISD::SUB, MVT::i512, Custom);
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
@@ -34031,6 +34034,65 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getBitcast(VT, Op));
return;
}
+ case ISD::ADD:
+ case ISD::SUB: {
+    // Lower i512 add/sub by computing per-lane carry/borrow generate and
+    // propagate masks, then resolving them with one scalar add: ((g << 1) + p) ^ p.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ bool IsAdd = Opc == ISD::ADD;
+
+ assert(Subtarget.useAVX512Regs() && "AVX512 required");
+ assert(VT == MVT::i512 && "Unexpected VT!");
+
+ if (!mayFoldIntoVector(LHS, Subtarget) ||
+ !mayFoldIntoVector(RHS, Subtarget))
+ return;
+
+ MVT VecVT = MVT::getVectorVT(MVT::i64, 8);
+ MVT BoolVT = MVT::getVectorVT(MVT::i1, 8);
+ SDValue Vec0 = DAG.getBitcast(VecVT, LHS);
+ SDValue Vec1 = DAG.getBitcast(VecVT, RHS);
+ SDValue AllOnes = DAG.getAllOnesConstant(dl, VecVT);
+
+ // Compute partial sum/difference (per-lane, no carry propagation).
+ SDValue Partial = DAG.getNode(Opc, dl, VecVT, Vec0, Vec1);
+
+ // Detect carry/borrow generation.
+ ISD::CondCode CarryCC = IsAdd ? ISD::SETULT : ISD::SETUGT;
+ SDValue Carry = DAG.getSetCC(dl, BoolVT, Partial, Vec0, CarryCC);
+
+ // Detect propagate lanes.
+ SDValue PropCmp = IsAdd ? AllOnes : DAG.getConstant(0, dl, VecVT);
+ SDValue Propagate = DAG.getSetCC(dl, BoolVT, Partial, PropCmp, ISD::SETEQ);
+
+ // Convert masks to scalar for Kogge-Stone propagation.
+ SDValue CarryIn = DAG.getNode(ISD::BITCAST, dl, MVT::i8, Carry);
+ SDValue PropIn = DAG.getNode(ISD::BITCAST, dl, MVT::i8, Propagate);
+ CarryIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, CarryIn);
+ PropIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, PropIn);
+
+ // Kogge-Stone: shift carry left and add propagate.
+ SDValue ShiftedCarry = DAG.getNode(ISD::SHL, dl, MVT::i32, CarryIn,
+ DAG.getConstant(1, dl, MVT::i8));
+ SDValue CarryOut =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, ShiftedCarry, PropIn);
+
+ // Correction mask: lanes that received a carry/borrow.
+ SDValue CorrMask = DAG.getNode(ISD::XOR, dl, MVT::i32, PropIn, CarryOut);
+ CorrMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CorrMask);
+ SDValue CorrVec = DAG.getNode(ISD::BITCAST, dl, BoolVT, CorrMask);
+
+ // Apply correction: +1 for ADD (via -(-1)), -1 for SUB (via +(-1)).
+ unsigned AdjustOpc = IsAdd ? ISD::SUB : ISD::ADD;
+ SDValue Adjusted = DAG.getNode(AdjustOpc, dl, VecVT, Partial, AllOnes);
+ SDValue Res =
+ DAG.getNode(ISD::VSELECT, dl, VecVT, CorrVec, Adjusted, Partial);
+
+ Results.push_back(DAG.getBitcast(VT, Res));
+ return;
+ }
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// If we have at most 32 active bits, then perform as i32 CTPOP.
diff --git a/llvm/test/CodeGen/X86/add-i512.ll b/llvm/test/CodeGen/X86/add-i512.ll
index 16ba701072031..1e31fd4fe374f 100644
--- a/llvm/test/CodeGen/X86/add-i512.ll
+++ b/llvm/test/CodeGen/X86/add-i512.ll
@@ -58,23 +58,31 @@ define i512 @test_add_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: addq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r9
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: vmovq %r9, %xmm0
+; AVX512F-NEXT: vmovq %r10, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm1
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq %rcx, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r10, 48(%rax)
-; AVX512F-NEXT: movq %r11, 56(%rax)
+; AVX512F-NEXT: vmovq %r11, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vmovq %rsi, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_add_i512_reg_reg:
@@ -82,23 +90,32 @@ define i512 @test_add_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: addq {{[0-9]+}}(%rsp), %rsi
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r9, %xmm0
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r11, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: vmovq %rdi, %xmm1
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %rcx, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 56(%rax)
+; AVX512VL-NEXT: vmovq %r10, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vmovq %rdx, %xmm2
+; AVX512VL-NEXT: vmovq %rsi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%r = add i512 %a0, %a1
ret i512 %r
@@ -162,27 +179,35 @@ define i512 @test_add_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512F-LABEL: test_add_i512_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: addq (%r10), %rsi
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512F-NEXT: adcq 8(%r10), %rdx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: vmovq %rsi, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: adcq 16(%r10), %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: adcq 24(%r10), %r8
-; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: adcq 32(%r10), %r9
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: adcq 40(%r10), %rdi
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: adcq 48(%r10), %r11
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: adcq 56(%r10), %rbx
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rcx, 16(%rax)
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r11, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 56(%rax)
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: adcq 40(%r10), %rbx
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: adcq 48(%r10), %rdi
+; AVX512F-NEXT: vmovq %rbx, %xmm2
+; AVX512F-NEXT: adcq 56(%r10), %r11
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vmovq %r11, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
@@ -190,27 +215,36 @@ define i512 @test_add_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512VL-NEXT: addq (%rbx), %rsi
+; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: adcq 8(%rbx), %rdx
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq 16(%rbx), %rcx
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: adcq 24(%rbx), %r8
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: adcq 32(%rbx), %r9
+; AVX512VL-NEXT: vmovq %r9, %xmm1
; AVX512VL-NEXT: adcq 40(%rbx), %r11
-; AVX512VL-NEXT: adcq 48(%rbx), %rdi
-; AVX512VL-NEXT: adcq 56(%rbx), %r10
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %rcx, 16(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 56(%rax)
+; AVX512VL-NEXT: vmovq %r11, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: adcq 48(%rbx), %r10
+; AVX512VL-NEXT: vmovq %r10, %xmm2
+; AVX512VL-NEXT: adcq 56(%rbx), %rdi
+; AVX512VL-NEXT: vmovq %rdi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a1 = load i512, ptr %p1
%r = add i512 %a0, %a1
@@ -270,27 +304,37 @@ define i512 @test_add_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: test_add_i512_mem_reg:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq 56(%rsi), %rdi
+; AVX512F-NEXT: movq 48(%rsi), %r10
; AVX512F-NEXT: addq (%rsi), %rdx
+; AVX512F-NEXT: movq 32(%rsi), %r11
; AVX512F-NEXT: adcq 8(%rsi), %rcx
+; AVX512F-NEXT: movq 40(%rsi), %rbx
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: adcq 16(%rsi), %r8
-; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovq %r8, %xmm1
; AVX512F-NEXT: adcq 24(%rsi), %r9
-; AVX512F-NEXT: movq 32(%rsi), %rdi
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: movq 40(%rsi), %r10
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movq 48(%rsi), %r11
+; AVX512F-NEXT: vmovq %r9, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq 56(%rsi), %rsi
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: movq %rdx, (%rax)
-; AVX512F-NEXT: movq %rcx, 8(%rax)
-; AVX512F-NEXT: movq %r8, 16(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rdi, 32(%rax)
-; AVX512F-NEXT: movq %r10, 40(%rax)
-; AVX512F-NEXT: movq %r11, 48(%rax)
-; AVX512F-NEXT: movq %rsi, 56(%rax)
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: vmovq %r11, %xmm1
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: vmovq %rbx, %xmm2
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %r10, %xmm2
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_add_i512_mem_reg:
@@ -302,22 +346,31 @@ define i512 @test_add_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq 32(%rsi), %r11
; AVX512VL-NEXT: movq 40(%rsi), %rbx
; AVX512VL-NEXT: addq (%rsi), %rdx
+; AVX512VL-NEXT: vmovq %rdx, %xmm0
; AVX512VL-NEXT: adcq 8(%rsi), %rcx
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq 16(%rsi), %r8
+; AVX512VL-NEXT: vmovq %r8, %xmm1
; AVX512VL-NEXT: adcq 24(%rsi), %r9
+; AVX512VL-NEXT: vmovq %r9, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r11, %xmm1
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: vmovq %rbx, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: vmovq %r10, %xmm2
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT: movq %rdx, (%rax)
-; AVX512VL-NEXT: movq %rcx, 8(%rax)
-; AVX512VL-NEXT: movq %r8, 16(%rax)
-; AVX512VL-NEXT: movq %r9, 24(%rax)
-; AVX512VL-NEXT: movq %r11, 32(%rax)
-; AVX512VL-NEXT: movq %rbx, 40(%rax)
-; AVX512VL-NEXT: movq %r10, 48(%rax)
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
+; AVX512VL-NEXT: vmovq %rdi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = add i512 %a0, %a1
@@ -383,34 +436,43 @@ define i512 @test_inc_i512_mem(ptr %p0) nounwind {
; AVX2-NEXT: movq %rcx, 56(%rax)
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_inc_i512_mem:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: movq 56(%rsi), %rcx
-; AVX512-NEXT: movq 48(%rsi), %rdx
-; AVX512-NEXT: movq 40(%rsi), %rdi
-; AVX512-NEXT: movq 32(%rsi), %r8
-; AVX512-NEXT: movq 24(%rsi), %r9
-; AVX512-NEXT: movq 16(%rsi), %r10
-; AVX512-NEXT: movq (%rsi), %r11
-; AVX512-NEXT: movq 8(%rsi), %rsi
-; AVX512-NEXT: addq $1, %r11
-; AVX512-NEXT: adcq $0, %rsi
-; AVX512-NEXT: adcq $0, %r10
-; AVX512-NEXT: adcq $0, %r9
-; AVX512-NEXT: adcq $0, %r8
-; AVX512-NEXT: adcq $0, %rdi
-; AVX512-NEXT: adcq $0, %rdx
-; AVX512-NEXT: adcq $0, %rcx
-; AVX512-NEXT: movq %r11, (%rax)
-; AVX512-NEXT: movq %rsi, 8(%rax)
-; AVX512-NEXT: movq %r10, 16(%rax)
-; AVX512-NEXT: movq %r9, 24(%rax)
-; AVX512-NEXT: movq %r8, 32(%rax)
-; AVX512-NEXT: movq %rdi, 40(%rax)
-; AVX512-NEXT: movq %rdx, 48(%rax)
-; AVX512-NEXT: movq %rcx, 56(%rax)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_inc_i512_mem:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_inc_i512_mem:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %edx
+; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = add i512 %a0, 1
ret i512 %r
@@ -429,19 +491,45 @@ define void @test_inc_i512_rmw(ptr %p0) nounwind {
; CHECK-NEXT: adcq $0, 56(%rdi)
; CHECK-NEXT: retq
;
-; AVX512-LABEL: test_inc_i512_rmw:
-; AVX512: # %bb.0:
-; AVX512-NEXT: addq $1, (%rdi)
-; AVX512-NEXT: adcq $0, 8(%rdi)
-; AVX512-NEXT: adcq $0, 16(%rdi)
-; AVX512-NEXT: adcq $0, 24(%rdi)
-; AVX512-NEXT: adcq $0, 32(%rdi)
-; AVX512-NEXT: adcq $0, 40(%rdi)
-; AVX512-NEXT: adcq $0, 48(%rdi)
-; AVX512-NEXT: adcq $0, 56(%rdi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_inc_i512_rmw:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: movzbl %cl, %ecx
+; AVX512F-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512F-NEXT: xorl %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_inc_i512_rmw:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %eax
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %ecx
+; AVX512VL-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512VL-NEXT: xorl %ecx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = add i512 %a0, 1
store i512 %r, ptr %p0
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sub-i512.ll b/llvm/test/CodeGen/X86/sub-i512.ll
index b2b57fe923adc..65c084623d06c 100644
--- a/llvm/test/CodeGen/X86/sub-i512.ll
+++ b/llvm/test/CodeGen/X86/sub-i512.ll
@@ -58,23 +58,31 @@ define i512 @test_sub_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: subq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r9
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r9
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: vmovq %r9, %xmm0
+; AVX512F-NEXT: vmovq %r10, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm1
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq %rcx, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r10, 48(%rax)
-; AVX512F-NEXT: movq %r11, 56(%rax)
+; AVX512F-NEXT: vmovq %r11, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vmovq %rsi, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_sub_i512_reg_reg:
@@ -82,23 +90,32 @@ define i512 @test_sub_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: subq {{[0-9]+}}(%rsp), %rsi
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r9
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r9, %xmm0
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r11, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: vmovq %rdi, %xmm1
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %rcx, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 56(%rax)
+; AVX512VL-NEXT: vmovq %r10, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vmovq %rdx, %xmm2
+; AVX512VL-NEXT: vmovq %rsi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%r = sub i512 %a0, %a1
ret i512 %r
@@ -162,27 +179,35 @@ define i512 @test_sub_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512F-LABEL: test_sub_i512_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: subq (%r10), %rsi
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512F-NEXT: sbbq 8(%r10), %rdx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: vmovq %rsi, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: sbbq 16(%r10), %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: sbbq 24(%r10), %r8
-; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: sbbq 32(%r10), %r9
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: sbbq 40(%r10), %rdi
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: sbbq 48(%r10), %r11
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: sbbq 56(%r10), %rbx
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rcx, 16(%rax)
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r11, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 56(%rax)
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: sbbq 40(%r10), %rbx
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: sbbq 48(%r10), %rdi
+; AVX512F-NEXT: vmovq %rbx, %xmm2
+; AVX512F-NEXT: sbbq 56(%r10), %r11
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vmovq %r11, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
@@ -190,27 +215,36 @@ define i512 @test_sub_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512VL-NEXT: subq (%rbx), %rsi
+; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: sbbq 8(%rbx), %rdx
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq 16(%rbx), %rcx
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: sbbq 24(%rbx), %r8
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: sbbq 32(%rbx), %r9
+; AVX512VL-NEXT: vmovq %r9, %xmm1
; AVX512VL-NEXT: sbbq 40(%rbx), %r11
-; AVX512VL-NEXT: sbbq 48(%rbx), %rdi
-; AVX512VL-NEXT: sbbq 56(%rbx), %r10
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %rcx, 16(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 56(%rax)
+; AVX512VL-NEXT: vmovq %r11, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: sbbq 48(%rbx), %r10
+; AVX512VL-NEXT: vmovq %r10, %xmm2
+; AVX512VL-NEXT: sbbq 56(%rbx), %rdi
+; AVX512VL-NEXT: vmovq %rdi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a1 = load i512, ptr %p1
%r = sub i512 %a0, %a1
@@ -284,33 +318,47 @@ define i512 @test_sub_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: test_sub_i512_mem_reg:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r15
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq 24(%rsi), %rdi
-; AVX512F-NEXT: movq 16(%rsi), %r10
-; AVX512F-NEXT: movq (%rsi), %r11
-; AVX512F-NEXT: movq 8(%rsi), %rbx
-; AVX512F-NEXT: subq %rdx, %r11
-; AVX512F-NEXT: sbbq %rcx, %rbx
-; AVX512F-NEXT: sbbq %r8, %r10
-; AVX512F-NEXT: sbbq %r9, %rdi
-; AVX512F-NEXT: movq 32(%rsi), %rcx
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT: movq 40(%rsi), %rdx
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
-; AVX512F-NEXT: movq 48(%rsi), %r8
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT: movq 56(%rsi), %rsi
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: movq %r11, (%rax)
-; AVX512F-NEXT: movq %rbx, 8(%rax)
-; AVX512F-NEXT: movq %r10, 16(%rax)
-; AVX512F-NEXT: movq %rdi, 24(%rax)
-; AVX512F-NEXT: movq %rcx, 32(%rax)
-; AVX512F-NEXT: movq %rdx, 40(%rax)
-; AVX512F-NEXT: movq %r8, 48(%rax)
-; AVX512F-NEXT: movq %rsi, 56(%rax)
+; AVX512F-NEXT: movq 56(%rsi), %rdi
+; AVX512F-NEXT: movq 48(%rsi), %r10
+; AVX512F-NEXT: movq 40(%rsi), %r11
+; AVX512F-NEXT: movq 32(%rsi), %rbx
+; AVX512F-NEXT: movq 24(%rsi), %r14
+; AVX512F-NEXT: movq 16(%rsi), %r15
+; AVX512F-NEXT: movq (%rsi), %r12
+; AVX512F-NEXT: movq 8(%rsi), %rsi
+; AVX512F-NEXT: subq %rdx, %r12
+; AVX512F-NEXT: vmovq %r12, %xmm0
+; AVX512F-NEXT: sbbq %rcx, %rsi
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: sbbq %r8, %r15
+; AVX512F-NEXT: vmovq %r15, %xmm1
+; AVX512F-NEXT: sbbq %r9, %r14
+; AVX512F-NEXT: vmovq %r14, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: vmovq %rbx, %xmm1
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: vmovq %r11, %xmm2
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %r10, %xmm2
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r12
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %r15
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_sub_i512_mem_reg:
@@ -329,25 +377,34 @@ define i512 @test_sub_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq (%rsi), %r12
; AVX512VL-NEXT: movq 8(%rsi), %rsi
; AVX512VL-NEXT: subq %rdx, %r12
+; AVX512VL-NEXT: vmovq %r12, %xmm0
; AVX512VL-NEXT: sbbq %rcx, %rsi
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq %r8, %r15
+; AVX512VL-NEXT: vmovq %r15, %xmm1
; AVX512VL-NEXT: sbbq %r9, %r14
+; AVX512VL-NEXT: vmovq %r14, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: vmovq %rbx, %xmm1
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r11, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: vmovq %r10, %xmm2
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT: movq %r12, (%rax)
-; AVX512VL-NEXT: movq %rsi, 8(%rax)
-; AVX512VL-NEXT: movq %r15, 16(%rax)
-; AVX512VL-NEXT: movq %r14, 24(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %r10, 48(%rax)
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
+; AVX512VL-NEXT: vmovq %rdi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r12
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = sub i512 %a0, %a1
@@ -413,34 +470,41 @@ define i512 @test_dec_i512_mem(ptr %p0) nounwind {
; AVX2-NEXT: movq %rcx, 56(%rax)
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_dec_i512_mem:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: movq 56(%rsi), %rcx
-; AVX512-NEXT: movq 48(%rsi), %rdx
-; AVX512-NEXT: movq 40(%rsi), %rdi
-; AVX512-NEXT: movq 32(%rsi), %r8
-; AVX512-NEXT: movq 24(%rsi), %r9
-; AVX512-NEXT: movq 16(%rsi), %r10
-; AVX512-NEXT: movq (%rsi), %r11
-; AVX512-NEXT: movq 8(%rsi), %rsi
-; AVX512-NEXT: addq $-1, %r11
-; AVX512-NEXT: adcq $-1, %rsi
-; AVX512-NEXT: adcq $-1, %r10
-; AVX512-NEXT: adcq $-1, %r9
-; AVX512-NEXT: adcq $-1, %r8
-; AVX512-NEXT: adcq $-1, %rdi
-; AVX512-NEXT: adcq $-1, %rdx
-; AVX512-NEXT: adcq $-1, %rcx
-; AVX512-NEXT: movq %r11, (%rax)
-; AVX512-NEXT: movq %rsi, 8(%rax)
-; AVX512-NEXT: movq %r10, 16(%rax)
-; AVX512-NEXT: movq %r9, 24(%rax)
-; AVX512-NEXT: movq %r8, 32(%rax)
-; AVX512-NEXT: movq %rdi, 40(%rax)
-; AVX512-NEXT: movq %rdx, 48(%rax)
-; AVX512-NEXT: movq %rcx, 56(%rax)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_dec_i512_mem:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_dec_i512_mem:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %ecx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %edx
+; AVX512VL-NEXT: leal (%rcx,%rdx,2), %edx
+; AVX512VL-NEXT: xorl %ecx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = sub i512 %a0, 1
ret i512 %r
@@ -459,19 +523,43 @@ define void @test_dec_i512_rmw(ptr %p0) nounwind {
; CHECK-NEXT: adcq $-1, 56(%rdi)
; CHECK-NEXT: retq
;
-; AVX512-LABEL: test_dec_i512_rmw:
-; AVX512: # %bb.0:
-; AVX512-NEXT: addq $-1, (%rdi)
-; AVX512-NEXT: adcq $-1, 8(%rdi)
-; AVX512-NEXT: adcq $-1, 16(%rdi)
-; AVX512-NEXT: adcq $-1, 24(%rdi)
-; AVX512-NEXT: adcq $-1, 32(%rdi)
-; AVX512-NEXT: adcq $-1, 40(%rdi)
-; AVX512-NEXT: adcq $-1, 48(%rdi)
-; AVX512-NEXT: adcq $-1, 56(%rdi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_dec_i512_rmw:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: movzbl %cl, %ecx
+; AVX512F-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512F-NEXT: xorl %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k0
+; AVX512F-NEXT: knotw %k0, %k1
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_dec_i512_rmw:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %eax
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: leal (%rax,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k0
+; AVX512VL-NEXT: knotb %k0, %k1
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = sub i512 %a0, 1
store i512 %r, ptr %p0
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
More information about the llvm-commits
mailing list