[llvm] [X86] Lower i512 ADD/SUB using Kogge-Stone on AVX512 (PR #174761)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 4 07:34:14 PST 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/174761
>From 98d7ac643a991dc9a8616c0fdaeddbb0327bdf65 Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 7 Jan 2026 14:27:11 +0200
Subject: [PATCH 1/8] [X86] Lower i512 ADD/SUB using Kogge-Stone on AVX512
Fixes #173996
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 62 +++++
llvm/test/CodeGen/X86/add-i512.ll | 306 ++++++++++++++--------
llvm/test/CodeGen/X86/sub-i512.ll | 322 +++++++++++++++---------
3 files changed, 464 insertions(+), 226 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b2fac92676eaa..9112bddae6358 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -1874,6 +1875,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::AND, MVT::i512, Custom);
setOperationAction(ISD::OR, MVT::i512, Custom);
setOperationAction(ISD::XOR, MVT::i512, Custom);
+ setOperationAction(ISD::ADD, MVT::i512, Custom);
+ setOperationAction(ISD::SUB, MVT::i512, Custom);
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
@@ -34255,6 +34258,65 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getBitcast(VT, Op));
return;
}
+ case ISD::ADD:
+ case ISD::SUB: {
+ // Use Kogge-Stone parallel carry/borrow propagation for i512 add/sub.
+ // TODO: ISD::UADDO_CARRY
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ bool IsAdd = Opc == ISD::ADD;
+
+ assert(Subtarget.useAVX512Regs() && "AVX512 required");
+ assert(VT == MVT::i512 && "Unexpected VT!");
+
+ if (!mayFoldIntoVector(LHS, Subtarget) ||
+ !mayFoldIntoVector(RHS, Subtarget))
+ return;
+
+ MVT VecVT = MVT::getVectorVT(MVT::i64, 8);
+ MVT BoolVT = MVT::getVectorVT(MVT::i1, 8);
+ SDValue Vec0 = DAG.getBitcast(VecVT, LHS);
+ SDValue Vec1 = DAG.getBitcast(VecVT, RHS);
+ SDValue AllOnes = DAG.getAllOnesConstant(dl, VecVT);
+
+ // Compute partial sum/difference (per-lane, no carry propagation).
+ SDValue Partial = DAG.getNode(Opc, dl, VecVT, Vec0, Vec1);
+
+ // Detect carry/borrow generation.
+ ISD::CondCode CarryCC = IsAdd ? ISD::SETULT : ISD::SETUGT;
+ SDValue Carry = DAG.getSetCC(dl, BoolVT, Partial, Vec0, CarryCC);
+
+ // Detect propagate lanes.
+ SDValue PropCmp = IsAdd ? AllOnes : DAG.getConstant(0, dl, VecVT);
+ SDValue Propagate = DAG.getSetCC(dl, BoolVT, Partial, PropCmp, ISD::SETEQ);
+
+ // Convert masks to scalar for Kogge-Stone propagation.
+ SDValue CarryIn = DAG.getNode(ISD::BITCAST, dl, MVT::i8, Carry);
+ SDValue PropIn = DAG.getNode(ISD::BITCAST, dl, MVT::i8, Propagate);
+ CarryIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, CarryIn);
+ PropIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, PropIn);
+
+ // Kogge-Stone: shift carry left and add propagate.
+ SDValue ShiftedCarry = DAG.getNode(ISD::SHL, dl, MVT::i32, CarryIn,
+ DAG.getConstant(1, dl, MVT::i8));
+ SDValue CarryOut =
+ DAG.getNode(ISD::ADD, dl, MVT::i32, ShiftedCarry, PropIn);
+
+ // Correction mask: lanes that received a carry/borrow.
+ SDValue CorrMask = DAG.getNode(ISD::XOR, dl, MVT::i32, PropIn, CarryOut);
+ CorrMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CorrMask);
+ SDValue CorrVec = DAG.getNode(ISD::BITCAST, dl, BoolVT, CorrMask);
+
+ // Apply correction: +1 for ADD (via -(-1)), -1 for SUB (via +(-1)).
+ unsigned AdjustOpc = IsAdd ? ISD::SUB : ISD::ADD;
+ SDValue Adjusted = DAG.getNode(AdjustOpc, dl, VecVT, Partial, AllOnes);
+ SDValue Res =
+ DAG.getNode(ISD::VSELECT, dl, VecVT, CorrVec, Adjusted, Partial);
+
+ Results.push_back(DAG.getBitcast(VT, Res));
+ return;
+ }
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// If we have at most 32 active bits, then perform as i32 CTPOP.
diff --git a/llvm/test/CodeGen/X86/add-i512.ll b/llvm/test/CodeGen/X86/add-i512.ll
index 16ba701072031..1e31fd4fe374f 100644
--- a/llvm/test/CodeGen/X86/add-i512.ll
+++ b/llvm/test/CodeGen/X86/add-i512.ll
@@ -58,23 +58,31 @@ define i512 @test_add_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: addq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r9
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: vmovq %r9, %xmm0
+; AVX512F-NEXT: vmovq %r10, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm1
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq %rcx, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r10, 48(%rax)
-; AVX512F-NEXT: movq %r11, 56(%rax)
+; AVX512F-NEXT: vmovq %r11, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vmovq %rsi, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_add_i512_reg_reg:
@@ -82,23 +90,32 @@ define i512 @test_add_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: addq {{[0-9]+}}(%rsp), %rsi
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r9, %xmm0
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r11, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: vmovq %rdi, %xmm1
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %rcx, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 56(%rax)
+; AVX512VL-NEXT: vmovq %r10, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vmovq %rdx, %xmm2
+; AVX512VL-NEXT: vmovq %rsi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%r = add i512 %a0, %a1
ret i512 %r
@@ -162,27 +179,35 @@ define i512 @test_add_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512F-LABEL: test_add_i512_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: addq (%r10), %rsi
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512F-NEXT: adcq 8(%r10), %rdx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: vmovq %rsi, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: adcq 16(%r10), %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: adcq 24(%r10), %r8
-; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: adcq 32(%r10), %r9
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: adcq 40(%r10), %rdi
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: adcq 48(%r10), %r11
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: adcq 56(%r10), %rbx
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rcx, 16(%rax)
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r11, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 56(%rax)
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: adcq 40(%r10), %rbx
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: adcq 48(%r10), %rdi
+; AVX512F-NEXT: vmovq %rbx, %xmm2
+; AVX512F-NEXT: adcq 56(%r10), %r11
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vmovq %r11, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
@@ -190,27 +215,36 @@ define i512 @test_add_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512VL-NEXT: addq (%rbx), %rsi
+; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: adcq 8(%rbx), %rdx
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq 16(%rbx), %rcx
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: adcq 24(%rbx), %r8
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: adcq 32(%rbx), %r9
+; AVX512VL-NEXT: vmovq %r9, %xmm1
; AVX512VL-NEXT: adcq 40(%rbx), %r11
-; AVX512VL-NEXT: adcq 48(%rbx), %rdi
-; AVX512VL-NEXT: adcq 56(%rbx), %r10
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %rcx, 16(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 56(%rax)
+; AVX512VL-NEXT: vmovq %r11, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: adcq 48(%rbx), %r10
+; AVX512VL-NEXT: vmovq %r10, %xmm2
+; AVX512VL-NEXT: adcq 56(%rbx), %rdi
+; AVX512VL-NEXT: vmovq %rdi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a1 = load i512, ptr %p1
%r = add i512 %a0, %a1
@@ -270,27 +304,37 @@ define i512 @test_add_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: test_add_i512_mem_reg:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq 56(%rsi), %rdi
+; AVX512F-NEXT: movq 48(%rsi), %r10
; AVX512F-NEXT: addq (%rsi), %rdx
+; AVX512F-NEXT: movq 32(%rsi), %r11
; AVX512F-NEXT: adcq 8(%rsi), %rcx
+; AVX512F-NEXT: movq 40(%rsi), %rbx
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: adcq 16(%rsi), %r8
-; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovq %r8, %xmm1
; AVX512F-NEXT: adcq 24(%rsi), %r9
-; AVX512F-NEXT: movq 32(%rsi), %rdi
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: movq 40(%rsi), %r10
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movq 48(%rsi), %r11
+; AVX512F-NEXT: vmovq %r9, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq 56(%rsi), %rsi
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: movq %rdx, (%rax)
-; AVX512F-NEXT: movq %rcx, 8(%rax)
-; AVX512F-NEXT: movq %r8, 16(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rdi, 32(%rax)
-; AVX512F-NEXT: movq %r10, 40(%rax)
-; AVX512F-NEXT: movq %r11, 48(%rax)
-; AVX512F-NEXT: movq %rsi, 56(%rax)
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: vmovq %r11, %xmm1
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: vmovq %rbx, %xmm2
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %r10, %xmm2
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_add_i512_mem_reg:
@@ -302,22 +346,31 @@ define i512 @test_add_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq 32(%rsi), %r11
; AVX512VL-NEXT: movq 40(%rsi), %rbx
; AVX512VL-NEXT: addq (%rsi), %rdx
+; AVX512VL-NEXT: vmovq %rdx, %xmm0
; AVX512VL-NEXT: adcq 8(%rsi), %rcx
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq 16(%rsi), %r8
+; AVX512VL-NEXT: vmovq %r8, %xmm1
; AVX512VL-NEXT: adcq 24(%rsi), %r9
+; AVX512VL-NEXT: vmovq %r9, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r11, %xmm1
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: vmovq %rbx, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: vmovq %r10, %xmm2
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT: movq %rdx, (%rax)
-; AVX512VL-NEXT: movq %rcx, 8(%rax)
-; AVX512VL-NEXT: movq %r8, 16(%rax)
-; AVX512VL-NEXT: movq %r9, 24(%rax)
-; AVX512VL-NEXT: movq %r11, 32(%rax)
-; AVX512VL-NEXT: movq %rbx, 40(%rax)
-; AVX512VL-NEXT: movq %r10, 48(%rax)
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
+; AVX512VL-NEXT: vmovq %rdi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = add i512 %a0, %a1
@@ -383,34 +436,43 @@ define i512 @test_inc_i512_mem(ptr %p0) nounwind {
; AVX2-NEXT: movq %rcx, 56(%rax)
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_inc_i512_mem:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: movq 56(%rsi), %rcx
-; AVX512-NEXT: movq 48(%rsi), %rdx
-; AVX512-NEXT: movq 40(%rsi), %rdi
-; AVX512-NEXT: movq 32(%rsi), %r8
-; AVX512-NEXT: movq 24(%rsi), %r9
-; AVX512-NEXT: movq 16(%rsi), %r10
-; AVX512-NEXT: movq (%rsi), %r11
-; AVX512-NEXT: movq 8(%rsi), %rsi
-; AVX512-NEXT: addq $1, %r11
-; AVX512-NEXT: adcq $0, %rsi
-; AVX512-NEXT: adcq $0, %r10
-; AVX512-NEXT: adcq $0, %r9
-; AVX512-NEXT: adcq $0, %r8
-; AVX512-NEXT: adcq $0, %rdi
-; AVX512-NEXT: adcq $0, %rdx
-; AVX512-NEXT: adcq $0, %rcx
-; AVX512-NEXT: movq %r11, (%rax)
-; AVX512-NEXT: movq %rsi, 8(%rax)
-; AVX512-NEXT: movq %r10, 16(%rax)
-; AVX512-NEXT: movq %r9, 24(%rax)
-; AVX512-NEXT: movq %r8, 32(%rax)
-; AVX512-NEXT: movq %rdi, 40(%rax)
-; AVX512-NEXT: movq %rdx, 48(%rax)
-; AVX512-NEXT: movq %rcx, 56(%rax)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_inc_i512_mem:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_inc_i512_mem:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %edx
+; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = add i512 %a0, 1
ret i512 %r
@@ -429,19 +491,45 @@ define void @test_inc_i512_rmw(ptr %p0) nounwind {
; CHECK-NEXT: adcq $0, 56(%rdi)
; CHECK-NEXT: retq
;
-; AVX512-LABEL: test_inc_i512_rmw:
-; AVX512: # %bb.0:
-; AVX512-NEXT: addq $1, (%rdi)
-; AVX512-NEXT: adcq $0, 8(%rdi)
-; AVX512-NEXT: adcq $0, 16(%rdi)
-; AVX512-NEXT: adcq $0, 24(%rdi)
-; AVX512-NEXT: adcq $0, 32(%rdi)
-; AVX512-NEXT: adcq $0, 40(%rdi)
-; AVX512-NEXT: adcq $0, 48(%rdi)
-; AVX512-NEXT: adcq $0, 56(%rdi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_inc_i512_rmw:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: movzbl %cl, %ecx
+; AVX512F-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512F-NEXT: xorl %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_inc_i512_rmw:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %eax
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %ecx
+; AVX512VL-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512VL-NEXT: xorl %ecx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = add i512 %a0, 1
store i512 %r, ptr %p0
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sub-i512.ll b/llvm/test/CodeGen/X86/sub-i512.ll
index b2b57fe923adc..65c084623d06c 100644
--- a/llvm/test/CodeGen/X86/sub-i512.ll
+++ b/llvm/test/CodeGen/X86/sub-i512.ll
@@ -58,23 +58,31 @@ define i512 @test_sub_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: subq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r9
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r9
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: vmovq %r9, %xmm0
+; AVX512F-NEXT: vmovq %r10, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm1
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq %rcx, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r10, 48(%rax)
-; AVX512F-NEXT: movq %r11, 56(%rax)
+; AVX512F-NEXT: vmovq %r11, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vmovq %rsi, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_sub_i512_reg_reg:
@@ -82,23 +90,32 @@ define i512 @test_sub_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: subq {{[0-9]+}}(%rsp), %rsi
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r9
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r9, %xmm0
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r11, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: vmovq %rdi, %xmm1
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %rcx, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 56(%rax)
+; AVX512VL-NEXT: vmovq %r10, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vmovq %rdx, %xmm2
+; AVX512VL-NEXT: vmovq %rsi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%r = sub i512 %a0, %a1
ret i512 %r
@@ -162,27 +179,35 @@ define i512 @test_sub_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512F-LABEL: test_sub_i512_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: subq (%r10), %rsi
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512F-NEXT: sbbq 8(%r10), %rdx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: vmovq %rsi, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: sbbq 16(%r10), %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: sbbq 24(%r10), %r8
-; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: sbbq 32(%r10), %r9
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: sbbq 40(%r10), %rdi
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: sbbq 48(%r10), %r11
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: sbbq 56(%r10), %rbx
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rcx, 16(%rax)
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r11, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 56(%rax)
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: sbbq 40(%r10), %rbx
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: sbbq 48(%r10), %rdi
+; AVX512F-NEXT: vmovq %rbx, %xmm2
+; AVX512F-NEXT: sbbq 56(%r10), %r11
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vmovq %r11, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
@@ -190,27 +215,36 @@ define i512 @test_sub_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512VL-NEXT: subq (%rbx), %rsi
+; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: sbbq 8(%rbx), %rdx
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq 16(%rbx), %rcx
+; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: sbbq 24(%rbx), %r8
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: sbbq 32(%rbx), %r9
+; AVX512VL-NEXT: vmovq %r9, %xmm1
; AVX512VL-NEXT: sbbq 40(%rbx), %r11
-; AVX512VL-NEXT: sbbq 48(%rbx), %rdi
-; AVX512VL-NEXT: sbbq 56(%rbx), %r10
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %rcx, 16(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 56(%rax)
+; AVX512VL-NEXT: vmovq %r11, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: sbbq 48(%rbx), %r10
+; AVX512VL-NEXT: vmovq %r10, %xmm2
+; AVX512VL-NEXT: sbbq 56(%rbx), %rdi
+; AVX512VL-NEXT: vmovq %rdi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a1 = load i512, ptr %p1
%r = sub i512 %a0, %a1
@@ -284,33 +318,47 @@ define i512 @test_sub_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: test_sub_i512_mem_reg:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r15
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq 24(%rsi), %rdi
-; AVX512F-NEXT: movq 16(%rsi), %r10
-; AVX512F-NEXT: movq (%rsi), %r11
-; AVX512F-NEXT: movq 8(%rsi), %rbx
-; AVX512F-NEXT: subq %rdx, %r11
-; AVX512F-NEXT: sbbq %rcx, %rbx
-; AVX512F-NEXT: sbbq %r8, %r10
-; AVX512F-NEXT: sbbq %r9, %rdi
-; AVX512F-NEXT: movq 32(%rsi), %rcx
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT: movq 40(%rsi), %rdx
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
-; AVX512F-NEXT: movq 48(%rsi), %r8
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT: movq 56(%rsi), %rsi
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: movq %r11, (%rax)
-; AVX512F-NEXT: movq %rbx, 8(%rax)
-; AVX512F-NEXT: movq %r10, 16(%rax)
-; AVX512F-NEXT: movq %rdi, 24(%rax)
-; AVX512F-NEXT: movq %rcx, 32(%rax)
-; AVX512F-NEXT: movq %rdx, 40(%rax)
-; AVX512F-NEXT: movq %r8, 48(%rax)
-; AVX512F-NEXT: movq %rsi, 56(%rax)
+; AVX512F-NEXT: movq 56(%rsi), %rdi
+; AVX512F-NEXT: movq 48(%rsi), %r10
+; AVX512F-NEXT: movq 40(%rsi), %r11
+; AVX512F-NEXT: movq 32(%rsi), %rbx
+; AVX512F-NEXT: movq 24(%rsi), %r14
+; AVX512F-NEXT: movq 16(%rsi), %r15
+; AVX512F-NEXT: movq (%rsi), %r12
+; AVX512F-NEXT: movq 8(%rsi), %rsi
+; AVX512F-NEXT: subq %rdx, %r12
+; AVX512F-NEXT: vmovq %r12, %xmm0
+; AVX512F-NEXT: sbbq %rcx, %rsi
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: sbbq %r8, %r15
+; AVX512F-NEXT: vmovq %r15, %xmm1
+; AVX512F-NEXT: sbbq %r9, %r14
+; AVX512F-NEXT: vmovq %r14, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: vmovq %rbx, %xmm1
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: vmovq %r11, %xmm2
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovq %r10, %xmm2
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r12
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %r15
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_sub_i512_mem_reg:
@@ -329,25 +377,34 @@ define i512 @test_sub_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq (%rsi), %r12
; AVX512VL-NEXT: movq 8(%rsi), %rsi
; AVX512VL-NEXT: subq %rdx, %r12
+; AVX512VL-NEXT: vmovq %r12, %xmm0
; AVX512VL-NEXT: sbbq %rcx, %rsi
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq %r8, %r15
+; AVX512VL-NEXT: vmovq %r15, %xmm1
; AVX512VL-NEXT: sbbq %r9, %r14
+; AVX512VL-NEXT: vmovq %r14, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: vmovq %rbx, %xmm1
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: vmovq %r11, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: vmovq %r10, %xmm2
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT: movq %r12, (%rax)
-; AVX512VL-NEXT: movq %rsi, 8(%rax)
-; AVX512VL-NEXT: movq %r15, 16(%rax)
-; AVX512VL-NEXT: movq %r14, 24(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 40(%rax)
-; AVX512VL-NEXT: movq %r10, 48(%rax)
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
+; AVX512VL-NEXT: vmovq %rdi, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r12
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = sub i512 %a0, %a1
@@ -413,34 +470,41 @@ define i512 @test_dec_i512_mem(ptr %p0) nounwind {
; AVX2-NEXT: movq %rcx, 56(%rax)
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_dec_i512_mem:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: movq 56(%rsi), %rcx
-; AVX512-NEXT: movq 48(%rsi), %rdx
-; AVX512-NEXT: movq 40(%rsi), %rdi
-; AVX512-NEXT: movq 32(%rsi), %r8
-; AVX512-NEXT: movq 24(%rsi), %r9
-; AVX512-NEXT: movq 16(%rsi), %r10
-; AVX512-NEXT: movq (%rsi), %r11
-; AVX512-NEXT: movq 8(%rsi), %rsi
-; AVX512-NEXT: addq $-1, %r11
-; AVX512-NEXT: adcq $-1, %rsi
-; AVX512-NEXT: adcq $-1, %r10
-; AVX512-NEXT: adcq $-1, %r9
-; AVX512-NEXT: adcq $-1, %r8
-; AVX512-NEXT: adcq $-1, %rdi
-; AVX512-NEXT: adcq $-1, %rdx
-; AVX512-NEXT: adcq $-1, %rcx
-; AVX512-NEXT: movq %r11, (%rax)
-; AVX512-NEXT: movq %rsi, 8(%rax)
-; AVX512-NEXT: movq %r10, 16(%rax)
-; AVX512-NEXT: movq %r9, 24(%rax)
-; AVX512-NEXT: movq %r8, 32(%rax)
-; AVX512-NEXT: movq %rdi, 40(%rax)
-; AVX512-NEXT: movq %rdx, 48(%rax)
-; AVX512-NEXT: movq %rcx, 56(%rax)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_dec_i512_mem:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_dec_i512_mem:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %ecx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %edx
+; AVX512VL-NEXT: leal (%rcx,%rdx,2), %edx
+; AVX512VL-NEXT: xorl %ecx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = sub i512 %a0, 1
ret i512 %r
@@ -459,19 +523,43 @@ define void @test_dec_i512_rmw(ptr %p0) nounwind {
; CHECK-NEXT: adcq $-1, 56(%rdi)
; CHECK-NEXT: retq
;
-; AVX512-LABEL: test_dec_i512_rmw:
-; AVX512: # %bb.0:
-; AVX512-NEXT: addq $-1, (%rdi)
-; AVX512-NEXT: adcq $-1, 8(%rdi)
-; AVX512-NEXT: adcq $-1, 16(%rdi)
-; AVX512-NEXT: adcq $-1, 24(%rdi)
-; AVX512-NEXT: adcq $-1, 32(%rdi)
-; AVX512-NEXT: adcq $-1, 40(%rdi)
-; AVX512-NEXT: adcq $-1, 48(%rdi)
-; AVX512-NEXT: adcq $-1, 56(%rdi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_dec_i512_rmw:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: movzbl %cl, %ecx
+; AVX512F-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512F-NEXT: xorl %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k0
+; AVX512F-NEXT: knotw %k0, %k1
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_dec_i512_rmw:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %eax
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: leal (%rax,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k0
+; AVX512VL-NEXT: knotb %k0, %k1
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = sub i512 %a0, 1
store i512 %r, ptr %p0
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
>From 19289e6722e09c2e00357159b02cc4a87b49538e Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Sun, 1 Feb 2026 18:05:37 +0200
Subject: [PATCH 2/8] Fix : regression issue when add/sub i512 is computed on
GPRs instead of avx512
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 23 ++-
llvm/test/CodeGen/X86/add-i512.ll | 193 ++++++++-------------
llvm/test/CodeGen/X86/sub-i512.ll | 213 +++++++++---------------
3 files changed, 159 insertions(+), 270 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9112bddae6358..b95569e934f1a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29,7 +29,6 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -34261,6 +34260,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ADD:
case ISD::SUB: {
// Use Kogge-Stone parallel carry/borrow propagation for i512 add/sub.
+ // See: https://www.numberworld.org/y-cruncher/internals/addition.html
+ // Related work: combineStore -> if (VT == MVT::i256 || VT == MVT::i512)
// TODO: ISD::UADDO_CARRY
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -34280,35 +34281,29 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Vec1 = DAG.getBitcast(VecVT, RHS);
SDValue AllOnes = DAG.getAllOnesConstant(dl, VecVT);
- // Compute partial sum/difference (per-lane, no carry propagation).
SDValue Partial = DAG.getNode(Opc, dl, VecVT, Vec0, Vec1);
- // Detect carry/borrow generation.
ISD::CondCode CarryCC = IsAdd ? ISD::SETULT : ISD::SETUGT;
SDValue Carry = DAG.getSetCC(dl, BoolVT, Partial, Vec0, CarryCC);
- // Detect propagate lanes.
SDValue PropCmp = IsAdd ? AllOnes : DAG.getConstant(0, dl, VecVT);
SDValue Propagate = DAG.getSetCC(dl, BoolVT, Partial, PropCmp, ISD::SETEQ);
- // Convert masks to scalar for Kogge-Stone propagation.
- SDValue CarryIn = DAG.getNode(ISD::BITCAST, dl, MVT::i8, Carry);
- SDValue PropIn = DAG.getNode(ISD::BITCAST, dl, MVT::i8, Propagate);
+ SDValue CarryIn = DAG.getBitcast(MVT::i8, Carry);
+ SDValue PropIn = DAG.getBitcast(MVT::i8, Propagate);
CarryIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, CarryIn);
PropIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, PropIn);
- // Kogge-Stone: shift carry left and add propagate.
- SDValue ShiftedCarry = DAG.getNode(ISD::SHL, dl, MVT::i32, CarryIn,
- DAG.getConstant(1, dl, MVT::i8));
+ SDValue ShiftedCarry =
+ DAG.getNode(ISD::SHL, dl, MVT::i32, CarryIn,
+ DAG.getShiftAmountConstant(1, MVT::i8, dl));
SDValue CarryOut =
DAG.getNode(ISD::ADD, dl, MVT::i32, ShiftedCarry, PropIn);
- // Correction mask: lanes that received a carry/borrow.
SDValue CorrMask = DAG.getNode(ISD::XOR, dl, MVT::i32, PropIn, CarryOut);
CorrMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CorrMask);
SDValue CorrVec = DAG.getNode(ISD::BITCAST, dl, BoolVT, CorrMask);
- // Apply correction: +1 for ADD (via -(-1)), -1 for SUB (via +(-1)).
unsigned AdjustOpc = IsAdd ? ISD::SUB : ISD::ADD;
SDValue Adjusted = DAG.getNode(AdjustOpc, dl, VecVT, Partial, AllOnes);
SDValue Res =
@@ -54626,6 +54621,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// vector type or the operation is likely to expand to a vector type
// (legalization can scalarize back if it the op failed).
if (VT == MVT::i256 || VT == MVT::i512) {
+ // Issue #173996, PRs [174761, 179503]: lowering i512 add/sub on AVX512
+ // caused a regression, so only allow this combine when the operation is
+ // handled by our custom lowering in the
+ // X86TargetLowering::ReplaceNodeResults (ADD/SUB) cases.
MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
if (TLI.isTypeLegal(VecVT) && ISD::isNormalStore(St) &&
mayFoldIntoVector(StoredVal, Subtarget))
diff --git a/llvm/test/CodeGen/X86/add-i512.ll b/llvm/test/CodeGen/X86/add-i512.ll
index 1e31fd4fe374f..3b0755cf2183a 100644
--- a/llvm/test/CodeGen/X86/add-i512.ll
+++ b/llvm/test/CodeGen/X86/add-i512.ll
@@ -58,31 +58,23 @@ define i512 @test_add_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: addq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r9
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: vmovq %r9, %xmm0
-; AVX512F-NEXT: vmovq %r10, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: vmovq %rdi, %xmm1
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: vmovq %r11, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vmovq %rcx, %xmm1
-; AVX512F-NEXT: vmovq %r8, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vmovq %rsi, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512F-NEXT: movq %rcx, 16(%rax)
+; AVX512F-NEXT: movq %rdx, 8(%rax)
+; AVX512F-NEXT: movq %rsi, (%rax)
+; AVX512F-NEXT: movq %r8, 24(%rax)
+; AVX512F-NEXT: movq %r9, 32(%rax)
+; AVX512F-NEXT: movq %rdi, 40(%rax)
+; AVX512F-NEXT: movq %r10, 48(%rax)
+; AVX512F-NEXT: movq %r11, 56(%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_add_i512_reg_reg:
@@ -90,32 +82,23 @@ define i512 @test_add_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: addq {{[0-9]+}}(%rsp), %rsi
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdx
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rcx
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r8
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r9
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: vmovq %r9, %xmm0
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: vmovq %r11, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT: vmovq %rdi, %xmm1
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: vmovq %r10, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovq %rcx, %xmm1
-; AVX512VL-NEXT: vmovq %r8, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vmovq %rdx, %xmm2
-; AVX512VL-NEXT: vmovq %rsi, %xmm3
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
-; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: movq %rcx, 16(%rax)
+; AVX512VL-NEXT: movq %rdx, 8(%rax)
+; AVX512VL-NEXT: movq %rsi, (%rax)
+; AVX512VL-NEXT: movq %r8, 24(%rax)
+; AVX512VL-NEXT: movq %r9, 32(%rax)
+; AVX512VL-NEXT: movq %r11, 40(%rax)
+; AVX512VL-NEXT: movq %rdi, 48(%rax)
+; AVX512VL-NEXT: movq %r10, 56(%rax)
; AVX512VL-NEXT: retq
%r = add i512 %a0, %a1
ret i512 %r
@@ -179,35 +162,27 @@ define i512 @test_add_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512F-LABEL: test_add_i512_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: addq (%r10), %rsi
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512F-NEXT: adcq 8(%r10), %rdx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: vmovq %rsi, %xmm0
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: adcq 16(%r10), %rcx
-; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: adcq 24(%r10), %r8
-; AVX512F-NEXT: vmovq %r8, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: adcq 32(%r10), %r9
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: adcq 40(%r10), %rbx
-; AVX512F-NEXT: vmovq %r9, %xmm1
-; AVX512F-NEXT: adcq 48(%r10), %rdi
-; AVX512F-NEXT: vmovq %rbx, %xmm2
-; AVX512F-NEXT: adcq 56(%r10), %r11
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vmovq %rdi, %xmm2
-; AVX512F-NEXT: vmovq %r11, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: adcq 40(%r10), %rdi
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: adcq 48(%r10), %r11
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: adcq 56(%r10), %rbx
+; AVX512F-NEXT: movq %rsi, (%rax)
+; AVX512F-NEXT: movq %rdx, 8(%rax)
+; AVX512F-NEXT: movq %rcx, 16(%rax)
+; AVX512F-NEXT: movq %r8, 24(%rax)
+; AVX512F-NEXT: movq %r9, 32(%rax)
+; AVX512F-NEXT: movq %rdi, 40(%rax)
+; AVX512F-NEXT: movq %r11, 48(%rax)
+; AVX512F-NEXT: movq %rbx, 56(%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
@@ -215,36 +190,27 @@ define i512 @test_add_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512VL-NEXT: addq (%rbx), %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: adcq 8(%rbx), %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq 16(%rbx), %rcx
-; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: adcq 24(%rbx), %r8
-; AVX512VL-NEXT: vmovq %r8, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: adcq 32(%rbx), %r9
-; AVX512VL-NEXT: vmovq %r9, %xmm1
; AVX512VL-NEXT: adcq 40(%rbx), %r11
-; AVX512VL-NEXT: vmovq %r11, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: adcq 48(%rbx), %r10
-; AVX512VL-NEXT: vmovq %r10, %xmm2
-; AVX512VL-NEXT: adcq 56(%rbx), %rdi
-; AVX512VL-NEXT: vmovq %rdi, %xmm3
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT: adcq 48(%rbx), %rdi
+; AVX512VL-NEXT: adcq 56(%rbx), %r10
+; AVX512VL-NEXT: movq %rsi, (%rax)
+; AVX512VL-NEXT: movq %rdx, 8(%rax)
+; AVX512VL-NEXT: movq %rcx, 16(%rax)
+; AVX512VL-NEXT: movq %r8, 24(%rax)
+; AVX512VL-NEXT: movq %r9, 32(%rax)
+; AVX512VL-NEXT: movq %r11, 40(%rax)
+; AVX512VL-NEXT: movq %rdi, 48(%rax)
+; AVX512VL-NEXT: movq %r10, 56(%rax)
; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a1 = load i512, ptr %p1
%r = add i512 %a0, %a1
@@ -304,37 +270,27 @@ define i512 @test_add_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: test_add_i512_mem_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq 56(%rsi), %rdi
-; AVX512F-NEXT: movq 48(%rsi), %r10
; AVX512F-NEXT: addq (%rsi), %rdx
-; AVX512F-NEXT: movq 32(%rsi), %r11
; AVX512F-NEXT: adcq 8(%rsi), %rcx
-; AVX512F-NEXT: movq 40(%rsi), %rbx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vmovq %rcx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: adcq 16(%rsi), %r8
-; AVX512F-NEXT: vmovq %r8, %xmm1
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: adcq 24(%rsi), %r9
-; AVX512F-NEXT: vmovq %r9, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: vmovq %r11, %xmm1
-; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vmovq %rbx, %xmm2
+; AVX512F-NEXT: movq 32(%rsi), %rdi
; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vmovq %r10, %xmm2
-; AVX512F-NEXT: vmovq %rdi, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
-; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: movq 40(%rsi), %r10
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq 48(%rsi), %r11
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: movq 56(%rsi), %rsi
+; AVX512F-NEXT: adcq {{[0-9]+}}(%rsp), %rsi
+; AVX512F-NEXT: movq %rdx, (%rax)
+; AVX512F-NEXT: movq %rcx, 8(%rax)
+; AVX512F-NEXT: movq %r8, 16(%rax)
+; AVX512F-NEXT: movq %r9, 24(%rax)
+; AVX512F-NEXT: movq %rdi, 32(%rax)
+; AVX512F-NEXT: movq %r10, 40(%rax)
+; AVX512F-NEXT: movq %r11, 48(%rax)
+; AVX512F-NEXT: movq %rsi, 56(%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_add_i512_mem_reg:
@@ -346,31 +302,22 @@ define i512 @test_add_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq 32(%rsi), %r11
; AVX512VL-NEXT: movq 40(%rsi), %rbx
; AVX512VL-NEXT: addq (%rsi), %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
; AVX512VL-NEXT: adcq 8(%rsi), %rcx
-; AVX512VL-NEXT: vmovq %rcx, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: adcq 16(%rsi), %r8
-; AVX512VL-NEXT: vmovq %r8, %xmm1
; AVX512VL-NEXT: adcq 24(%rsi), %r9
-; AVX512VL-NEXT: vmovq %r9, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: vmovq %r11, %xmm1
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT: vmovq %rbx, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: vmovq %r10, %xmm2
; AVX512VL-NEXT: adcq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT: vmovq %rdi, %xmm3
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT: movq %rdx, (%rax)
+; AVX512VL-NEXT: movq %rcx, 8(%rax)
+; AVX512VL-NEXT: movq %r8, 16(%rax)
+; AVX512VL-NEXT: movq %r9, 24(%rax)
+; AVX512VL-NEXT: movq %r11, 32(%rax)
+; AVX512VL-NEXT: movq %rbx, 40(%rax)
+; AVX512VL-NEXT: movq %r10, 48(%rax)
+; AVX512VL-NEXT: movq %rdi, 56(%rax)
; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = add i512 %a0, %a1
diff --git a/llvm/test/CodeGen/X86/sub-i512.ll b/llvm/test/CodeGen/X86/sub-i512.ll
index 65c084623d06c..1576489fd89ff 100644
--- a/llvm/test/CodeGen/X86/sub-i512.ll
+++ b/llvm/test/CodeGen/X86/sub-i512.ll
@@ -58,31 +58,23 @@ define i512 @test_sub_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: subq {{[0-9]+}}(%rsp), %rsi
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
-; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r9
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: vmovq %r9, %xmm0
-; AVX512F-NEXT: vmovq %r10, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: vmovq %rdi, %xmm1
; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: vmovq %r11, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vmovq %rcx, %xmm1
-; AVX512F-NEXT: vmovq %r8, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vmovq %rsi, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512F-NEXT: movq %rcx, 16(%rax)
+; AVX512F-NEXT: movq %rdx, 8(%rax)
+; AVX512F-NEXT: movq %rsi, (%rax)
+; AVX512F-NEXT: movq %r8, 24(%rax)
+; AVX512F-NEXT: movq %r9, 32(%rax)
+; AVX512F-NEXT: movq %rdi, 40(%rax)
+; AVX512F-NEXT: movq %r10, 48(%rax)
+; AVX512F-NEXT: movq %r11, 56(%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_sub_i512_reg_reg:
@@ -90,32 +82,23 @@ define i512 @test_sub_i512_reg_reg(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: subq {{[0-9]+}}(%rsp), %rsi
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r9
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: vmovq %r9, %xmm0
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: vmovq %r11, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT: vmovq %rdi, %xmm1
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: vmovq %r10, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovq %rcx, %xmm1
-; AVX512VL-NEXT: vmovq %r8, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vmovq %rdx, %xmm2
-; AVX512VL-NEXT: vmovq %rsi, %xmm3
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
-; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: movq %rcx, 16(%rax)
+; AVX512VL-NEXT: movq %rdx, 8(%rax)
+; AVX512VL-NEXT: movq %rsi, (%rax)
+; AVX512VL-NEXT: movq %r8, 24(%rax)
+; AVX512VL-NEXT: movq %r9, 32(%rax)
+; AVX512VL-NEXT: movq %r11, 40(%rax)
+; AVX512VL-NEXT: movq %rdi, 48(%rax)
+; AVX512VL-NEXT: movq %r10, 56(%rax)
; AVX512VL-NEXT: retq
%r = sub i512 %a0, %a1
ret i512 %r
@@ -179,35 +162,27 @@ define i512 @test_sub_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512F-LABEL: test_sub_i512_reg_mem:
; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: subq (%r10), %rsi
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512F-NEXT: sbbq 8(%r10), %rdx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: vmovq %rsi, %xmm0
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: sbbq 16(%r10), %rcx
-; AVX512F-NEXT: vmovq %rcx, %xmm1
; AVX512F-NEXT: sbbq 24(%r10), %r8
-; AVX512F-NEXT: vmovq %r8, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: sbbq 32(%r10), %r9
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: sbbq 40(%r10), %rbx
-; AVX512F-NEXT: vmovq %r9, %xmm1
-; AVX512F-NEXT: sbbq 48(%r10), %rdi
-; AVX512F-NEXT: vmovq %rbx, %xmm2
-; AVX512F-NEXT: sbbq 56(%r10), %r11
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vmovq %rdi, %xmm2
-; AVX512F-NEXT: vmovq %r11, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: sbbq 40(%r10), %rdi
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: sbbq 48(%r10), %r11
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: sbbq 56(%r10), %rbx
+; AVX512F-NEXT: movq %rsi, (%rax)
+; AVX512F-NEXT: movq %rdx, 8(%rax)
+; AVX512F-NEXT: movq %rcx, 16(%rax)
+; AVX512F-NEXT: movq %r8, 24(%rax)
+; AVX512F-NEXT: movq %r9, 32(%rax)
+; AVX512F-NEXT: movq %rdi, 40(%rax)
+; AVX512F-NEXT: movq %r11, 48(%rax)
+; AVX512F-NEXT: movq %rbx, 56(%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
@@ -215,36 +190,27 @@ define i512 @test_sub_i512_reg_mem(i512 %a0, ptr %p1) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX512VL-NEXT: subq (%rbx), %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm0
; AVX512VL-NEXT: sbbq 8(%rbx), %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq 16(%rbx), %rcx
-; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: sbbq 24(%rbx), %r8
-; AVX512VL-NEXT: vmovq %r8, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: sbbq 32(%rbx), %r9
-; AVX512VL-NEXT: vmovq %r9, %xmm1
; AVX512VL-NEXT: sbbq 40(%rbx), %r11
-; AVX512VL-NEXT: vmovq %r11, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: sbbq 48(%rbx), %r10
-; AVX512VL-NEXT: vmovq %r10, %xmm2
-; AVX512VL-NEXT: sbbq 56(%rbx), %rdi
-; AVX512VL-NEXT: vmovq %rdi, %xmm3
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT: sbbq 48(%rbx), %rdi
+; AVX512VL-NEXT: sbbq 56(%rbx), %r10
+; AVX512VL-NEXT: movq %rsi, (%rax)
+; AVX512VL-NEXT: movq %rdx, 8(%rax)
+; AVX512VL-NEXT: movq %rcx, 16(%rax)
+; AVX512VL-NEXT: movq %r8, 24(%rax)
+; AVX512VL-NEXT: movq %r9, 32(%rax)
+; AVX512VL-NEXT: movq %r11, 40(%rax)
+; AVX512VL-NEXT: movq %rdi, 48(%rax)
+; AVX512VL-NEXT: movq %r10, 56(%rax)
; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a1 = load i512, ptr %p1
%r = sub i512 %a0, %a1
@@ -318,47 +284,33 @@ define i512 @test_sub_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: test_sub_i512_mem_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq 56(%rsi), %rdi
-; AVX512F-NEXT: movq 48(%rsi), %r10
-; AVX512F-NEXT: movq 40(%rsi), %r11
-; AVX512F-NEXT: movq 32(%rsi), %rbx
-; AVX512F-NEXT: movq 24(%rsi), %r14
-; AVX512F-NEXT: movq 16(%rsi), %r15
-; AVX512F-NEXT: movq (%rsi), %r12
-; AVX512F-NEXT: movq 8(%rsi), %rsi
-; AVX512F-NEXT: subq %rdx, %r12
-; AVX512F-NEXT: vmovq %r12, %xmm0
-; AVX512F-NEXT: sbbq %rcx, %rsi
-; AVX512F-NEXT: vmovq %rsi, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: sbbq %r8, %r15
-; AVX512F-NEXT: vmovq %r15, %xmm1
-; AVX512F-NEXT: sbbq %r9, %r14
-; AVX512F-NEXT: vmovq %r14, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: vmovq %rbx, %xmm1
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vmovq %r11, %xmm2
-; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vmovq %r10, %xmm2
-; AVX512F-NEXT: vmovq %rdi, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512F-NEXT: movq 24(%rsi), %rdi
+; AVX512F-NEXT: movq 16(%rsi), %r10
+; AVX512F-NEXT: movq (%rsi), %r11
+; AVX512F-NEXT: movq 8(%rsi), %rbx
+; AVX512F-NEXT: subq %rdx, %r11
+; AVX512F-NEXT: sbbq %rcx, %rbx
+; AVX512F-NEXT: sbbq %r8, %r10
+; AVX512F-NEXT: sbbq %r9, %rdi
+; AVX512F-NEXT: movq 32(%rsi), %rcx
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT: movq 40(%rsi), %rdx
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
+; AVX512F-NEXT: movq 48(%rsi), %r8
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
+; AVX512F-NEXT: movq 56(%rsi), %rsi
+; AVX512F-NEXT: sbbq {{[0-9]+}}(%rsp), %rsi
+; AVX512F-NEXT: movq %r11, (%rax)
+; AVX512F-NEXT: movq %rbx, 8(%rax)
+; AVX512F-NEXT: movq %r10, 16(%rax)
+; AVX512F-NEXT: movq %rdi, 24(%rax)
+; AVX512F-NEXT: movq %rcx, 32(%rax)
+; AVX512F-NEXT: movq %rdx, 40(%rax)
+; AVX512F-NEXT: movq %r8, 48(%rax)
+; AVX512F-NEXT: movq %rsi, 56(%rax)
; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_sub_i512_mem_reg:
@@ -377,34 +329,25 @@ define i512 @test_sub_i512_mem_reg(ptr %p0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq (%rsi), %r12
; AVX512VL-NEXT: movq 8(%rsi), %rsi
; AVX512VL-NEXT: subq %rdx, %r12
-; AVX512VL-NEXT: vmovq %r12, %xmm0
; AVX512VL-NEXT: sbbq %rcx, %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: sbbq %r8, %r15
-; AVX512VL-NEXT: vmovq %r15, %xmm1
; AVX512VL-NEXT: sbbq %r9, %r14
-; AVX512VL-NEXT: vmovq %r14, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT: vmovq %rbx, %xmm1
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: vmovq %r11, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: vmovq %r10, %xmm2
; AVX512VL-NEXT: sbbq {{[0-9]+}}(%rsp), %rdi
-; AVX512VL-NEXT: vmovq %rdi, %xmm3
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VL-NEXT: movq %r12, (%rax)
+; AVX512VL-NEXT: movq %rsi, 8(%rax)
+; AVX512VL-NEXT: movq %r15, 16(%rax)
+; AVX512VL-NEXT: movq %r14, 24(%rax)
+; AVX512VL-NEXT: movq %rbx, 32(%rax)
+; AVX512VL-NEXT: movq %r11, 40(%rax)
+; AVX512VL-NEXT: movq %r10, 48(%rax)
+; AVX512VL-NEXT: movq %rdi, 56(%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r12
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
%r = sub i512 %a0, %a1
>From 9282d694c412ec253f6b20a24ccb260f0f773e6a Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 4 Feb 2026 00:50:45 +0200
Subject: [PATCH 3/8] - move regression problem handling into mayFoldIntoVector
- optimize inc/dec operation
---
llvm/include/llvm/CodeGen/ISDOpcodes.h | 5 +++
llvm/lib/Target/X86/X86ISelLowering.cpp | 22 +++++++---
llvm/test/CodeGen/X86/add-i512.ll | 54 ++++++++++++-------------
3 files changed, 46 insertions(+), 35 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 2ebd2641944f5..4b39a308e8472 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1618,6 +1618,11 @@ inline bool isBitwiseLogicOp(unsigned Opcode) {
return Opcode == ISD::AND || Opcode == ISD::OR || Opcode == ISD::XOR;
}
+/// Whether this is add/sub opcode.
+inline bool isAddSubOp(unsigned Opcode) {
+ return Opcode == ISD::ADD || Opcode == ISD::SUB;
+}
+
/// Given a \p MinMaxOpc of ISD::(U|S)MIN or ISD::(U|S)MAX, returns
/// ISD::(U|S)MAX and ISD::(U|S)MIN, respectively.
LLVM_ABI NodeType getInverseMinMaxOpcode(unsigned MinMaxOpc);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b95569e934f1a..7b7798b0d4722 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2904,8 +2904,9 @@ static bool mayFoldIntoVector(SDValue Op, const X86Subtarget &Subtarget,
if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
return true;
EVT VT = Op.getValueType();
- if (ISD::isBitwiseLogicOp(Op.getOpcode()) &&
- (VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512))
+ bool ValidOp = (ISD::isBitwiseLogicOp(Op.getOpcode()) ||
+ ISD::isAddSubOp(Op.getOpcode()));
+ if (ValidOp && (VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512))
return mayFoldIntoVector(Op.getOperand(0), Subtarget) &&
mayFoldIntoVector(Op.getOperand(1), Subtarget);
return X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse,
@@ -34267,7 +34268,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
bool IsAdd = Opc == ISD::ADD;
-
assert(Subtarget.useAVX512Regs() && "AVX512 required");
assert(VT == MVT::i512 && "Unexpected VT!");
@@ -34275,11 +34275,21 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
!mayFoldIntoVector(RHS, Subtarget))
return;
- MVT VecVT = MVT::getVectorVT(MVT::i64, 8);
- MVT BoolVT = MVT::getVectorVT(MVT::i1, 8);
+ MVT VecVT = MVT::v8i64;
+ MVT BoolVT = MVT::v8i1;
+ SDValue AllOnes = DAG.getAllOnesConstant(dl, VecVT);
+
+ if (isOneConstant(RHS)) {
+ RHS = AllOnes;
+ Opc = (IsAdd ? ISD::SUB : ISD::ADD);
+ IsAdd = !IsAdd;
+    // LHS + 1 => LHS - (-1), LHS - 1 => LHS + (-1)
+    // We reuse the `AllOnes` constant to do less work; this optimization makes
+    // sense since inc/dec operations are common :)
+ }
+
SDValue Vec0 = DAG.getBitcast(VecVT, LHS);
SDValue Vec1 = DAG.getBitcast(VecVT, RHS);
- SDValue AllOnes = DAG.getAllOnesConstant(dl, VecVT);
SDValue Partial = DAG.getNode(Opc, dl, VecVT, Vec0, Vec1);
diff --git a/llvm/test/CodeGen/X86/add-i512.ll b/llvm/test/CodeGen/X86/add-i512.ll
index 3b0755cf2183a..855435c80b1bf 100644
--- a/llvm/test/CodeGen/X86/add-i512.ll
+++ b/llvm/test/CodeGen/X86/add-i512.ll
@@ -387,36 +387,34 @@ define i512 @test_inc_i512_mem(ptr %p0) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
-; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpnleuq %zmm0, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, %ecx
-; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, %edx
; AVX512F-NEXT: movzbl %dl, %edx
; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
; AVX512F-NEXT: xorl %edx, %ecx
; AVX512F-NEXT: kmovw %ecx, %k1
-; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_inc_i512_mem:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
-; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpnleuq %zmm0, %zmm1, %k0
; AVX512VL-NEXT: kmovd %k0, %ecx
-; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT: vptestnmq %zmm1, %zmm1, %k0
; AVX512VL-NEXT: kmovb %k0, %edx
; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
; AVX512VL-NEXT: xorl %edx, %ecx
; AVX512VL-NEXT: kmovd %ecx, %k1
-; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -441,36 +439,34 @@ define void @test_inc_i512_rmw(ptr %p0) nounwind {
; AVX512F-LABEL: test_inc_i512_rmw:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
-; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpnleuq %zmm0, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: movzbl %cl, %ecx
; AVX512F-NEXT: leal (%rcx,%rax,2), %eax
; AVX512F-NEXT: xorl %ecx, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: kmovw %eax, %k0
+; AVX512F-NEXT: knotw %k0, %k1
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_inc_i512_rmw:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
-; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpnleuq %zmm0, %zmm1, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT: vptestnmq %zmm1, %zmm1, %k0
; AVX512VL-NEXT: kmovb %k0, %ecx
; AVX512VL-NEXT: leal (%rcx,%rax,2), %eax
; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: kmovd %eax, %k1
-; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
-; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: kmovd %eax, %k0
+; AVX512VL-NEXT: knotb %k0, %k1
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a0 = load i512, ptr %p0
>From 8c0e80592e6a291dc182ede1dd665f414095e655 Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 4 Feb 2026 00:58:53 +0200
Subject: [PATCH 4/8] fix format issue
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7b7798b0d4722..b756a6ccb1eb2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -54631,9 +54631,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// vector type or the operation is likely to expand to a vector type
// (legalization can scalarize back if it the op failed).
if (VT == MVT::i256 || VT == MVT::i512) {
- // Issue : 173996 , PRs : [174761,179503] : when add/sub lowered on avx512 we hit a
- // regression issue. my approach is to allow the combine only when the
- // operation is done by our custome handling.
+ // Issue : 173996 , PRs : [174761,179503] : when add/sub lowered on avx512
+ // we hit a regression issue. my approach is to allow the combine only
+ // when the operation is done by our custome handling.
// X86TargetLowering::ReplaceNodeResults (ADD/SUB) cases.
MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
if (TLI.isTypeLegal(VecVT) && ISD::isNormalStore(St) &&
>From 3f1a2963a97d23274add2dbdaab378e649b8c81a Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 4 Feb 2026 11:33:52 +0200
Subject: [PATCH 5/8] added more tests to add/sub i512
---
llvm/test/CodeGen/X86/add-i512.ll | 158 ++++++++++++++++++++++++++++++
llvm/test/CodeGen/X86/sub-i512.ll | 158 ++++++++++++++++++++++++++++++
2 files changed, 316 insertions(+)
diff --git a/llvm/test/CodeGen/X86/add-i512.ll b/llvm/test/CodeGen/X86/add-i512.ll
index 855435c80b1bf..b7ed76da4922b 100644
--- a/llvm/test/CodeGen/X86/add-i512.ll
+++ b/llvm/test/CodeGen/X86/add-i512.ll
@@ -423,6 +423,110 @@ define i512 @test_inc_i512_mem(ptr %p0) nounwind {
ret i512 %r
}
+define i512 @test_add_i512_mem_mem(ptr %p0, ptr %p1) nounwind {
+; SSE-LABEL: test_add_i512_mem_mem:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movq 56(%rsi), %rcx
+; SSE-NEXT: movq (%rsi), %rdi
+; SSE-NEXT: addq (%rdx), %rdi
+; SSE-NEXT: movq 8(%rsi), %r8
+; SSE-NEXT: adcq 8(%rdx), %r8
+; SSE-NEXT: movq 16(%rsi), %r9
+; SSE-NEXT: adcq 16(%rdx), %r9
+; SSE-NEXT: movq 24(%rsi), %r10
+; SSE-NEXT: adcq 24(%rdx), %r10
+; SSE-NEXT: movq 32(%rsi), %r11
+; SSE-NEXT: adcq 32(%rdx), %r11
+; SSE-NEXT: movq 40(%rsi), %rbx
+; SSE-NEXT: adcq 40(%rdx), %rbx
+; SSE-NEXT: movq 48(%rsi), %rsi
+; SSE-NEXT: adcq 48(%rdx), %rsi
+; SSE-NEXT: adcq 56(%rdx), %rcx
+; SSE-NEXT: movq %rdi, (%rax)
+; SSE-NEXT: movq %r8, 8(%rax)
+; SSE-NEXT: movq %r9, 16(%rax)
+; SSE-NEXT: movq %r10, 24(%rax)
+; SSE-NEXT: movq %r11, 32(%rax)
+; SSE-NEXT: movq %rbx, 40(%rax)
+; SSE-NEXT: movq %rsi, 48(%rax)
+; SSE-NEXT: movq %rcx, 56(%rax)
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_add_i512_mem_mem:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq (%rsi), %rcx
+; AVX2-NEXT: addq (%rdx), %rcx
+; AVX2-NEXT: movq 8(%rsi), %rdi
+; AVX2-NEXT: adcq 8(%rdx), %rdi
+; AVX2-NEXT: movq 16(%rsi), %r8
+; AVX2-NEXT: adcq 16(%rdx), %r8
+; AVX2-NEXT: movq 24(%rsi), %r9
+; AVX2-NEXT: adcq 24(%rdx), %r9
+; AVX2-NEXT: movq 32(%rsi), %r10
+; AVX2-NEXT: adcq 32(%rdx), %r10
+; AVX2-NEXT: movq 40(%rsi), %r11
+; AVX2-NEXT: adcq 40(%rdx), %r11
+; AVX2-NEXT: movq 48(%rsi), %rbx
+; AVX2-NEXT: adcq 48(%rdx), %rbx
+; AVX2-NEXT: movq 56(%rsi), %rsi
+; AVX2-NEXT: adcq 56(%rdx), %rsi
+; AVX2-NEXT: movq %rcx, (%rax)
+; AVX2-NEXT: movq %rdi, 8(%rax)
+; AVX2-NEXT: movq %r8, 16(%rax)
+; AVX2-NEXT: movq %r9, 24(%rax)
+; AVX2-NEXT: movq %r10, 32(%rax)
+; AVX2-NEXT: movq %r11, 40(%rax)
+; AVX2-NEXT: movq %rbx, 48(%rax)
+; AVX2-NEXT: movq %rsi, 56(%rax)
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_add_i512_mem_mem:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vpaddq (%rdx), %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512F-NEXT: vpcmpeqq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpsubq %zmm0, %zmm1, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_add_i512_mem_mem:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vpaddq (%rdx), %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512VL-NEXT: vpcmpeqq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %edx
+; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpsubq %zmm0, %zmm1, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %a0 = load i512, ptr %p0
+ %a1 = load i512, ptr %p1
+ %r = add i512 %a0, %a1
+ ret i512 %r
+}
+
define void @test_inc_i512_rmw(ptr %p0) nounwind {
; CHECK-LABEL: test_inc_i512_rmw:
; CHECK: # %bb.0:
@@ -474,5 +578,59 @@ define void @test_inc_i512_rmw(ptr %p0) nounwind {
store i512 %r, ptr %p0
ret void
}
+
+define void @test_add_i512_rmw(ptr %p0) nounwind {
+; CHECK-LABEL: test_add_i512_rmw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq $9, (%rdi)
+; CHECK-NEXT: adcq $0, 8(%rdi)
+; CHECK-NEXT: adcq $0, 16(%rdi)
+; CHECK-NEXT: adcq $0, 24(%rdi)
+; CHECK-NEXT: adcq $0, 32(%rdi)
+; CHECK-NEXT: adcq $0, 40(%rdi)
+; CHECK-NEXT: adcq $0, 48(%rdi)
+; CHECK-NEXT: adcq $0, 56(%rdi)
+; CHECK-NEXT: retq
+;
+; AVX512F-LABEL: test_add_i512_rmw:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [9,0,0,0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: movzbl %cl, %ecx
+; AVX512F-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512F-NEXT: xorl %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_add_i512_rmw:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [9,0,0,0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %eax
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %ecx
+; AVX512VL-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512VL-NEXT: xorl %ecx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %a0 = load i512, ptr %p0
+ %r = add i512 %a0, 9
+ store i512 %r, ptr %p0
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sub-i512.ll b/llvm/test/CodeGen/X86/sub-i512.ll
index 1576489fd89ff..e968e5e27e2ce 100644
--- a/llvm/test/CodeGen/X86/sub-i512.ll
+++ b/llvm/test/CodeGen/X86/sub-i512.ll
@@ -453,6 +453,110 @@ define i512 @test_dec_i512_mem(ptr %p0) nounwind {
ret i512 %r
}
+define i512 @test_sub_i512_mem_mem(ptr %p0, ptr %p1) nounwind {
+; SSE-LABEL: test_sub_i512_mem_mem:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movq 56(%rsi), %rcx
+; SSE-NEXT: movq (%rsi), %rdi
+; SSE-NEXT: subq (%rdx), %rdi
+; SSE-NEXT: movq 8(%rsi), %r8
+; SSE-NEXT: sbbq 8(%rdx), %r8
+; SSE-NEXT: movq 16(%rsi), %r9
+; SSE-NEXT: sbbq 16(%rdx), %r9
+; SSE-NEXT: movq 24(%rsi), %r10
+; SSE-NEXT: sbbq 24(%rdx), %r10
+; SSE-NEXT: movq 32(%rsi), %r11
+; SSE-NEXT: sbbq 32(%rdx), %r11
+; SSE-NEXT: movq 40(%rsi), %rbx
+; SSE-NEXT: sbbq 40(%rdx), %rbx
+; SSE-NEXT: movq 48(%rsi), %rsi
+; SSE-NEXT: sbbq 48(%rdx), %rsi
+; SSE-NEXT: sbbq 56(%rdx), %rcx
+; SSE-NEXT: movq %rdi, (%rax)
+; SSE-NEXT: movq %r8, 8(%rax)
+; SSE-NEXT: movq %r9, 16(%rax)
+; SSE-NEXT: movq %r10, 24(%rax)
+; SSE-NEXT: movq %r11, 32(%rax)
+; SSE-NEXT: movq %rbx, 40(%rax)
+; SSE-NEXT: movq %rsi, 48(%rax)
+; SSE-NEXT: movq %rcx, 56(%rax)
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_sub_i512_mem_mem:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq (%rsi), %rcx
+; AVX2-NEXT: subq (%rdx), %rcx
+; AVX2-NEXT: movq 8(%rsi), %rdi
+; AVX2-NEXT: sbbq 8(%rdx), %rdi
+; AVX2-NEXT: movq 16(%rsi), %r8
+; AVX2-NEXT: sbbq 16(%rdx), %r8
+; AVX2-NEXT: movq 24(%rsi), %r9
+; AVX2-NEXT: sbbq 24(%rdx), %r9
+; AVX2-NEXT: movq 32(%rsi), %r10
+; AVX2-NEXT: sbbq 32(%rdx), %r10
+; AVX2-NEXT: movq 40(%rsi), %r11
+; AVX2-NEXT: sbbq 40(%rdx), %r11
+; AVX2-NEXT: movq 48(%rsi), %rbx
+; AVX2-NEXT: sbbq 48(%rdx), %rbx
+; AVX2-NEXT: movq 56(%rsi), %rsi
+; AVX2-NEXT: sbbq 56(%rdx), %rsi
+; AVX2-NEXT: movq %rcx, (%rax)
+; AVX2-NEXT: movq %rdi, 8(%rax)
+; AVX2-NEXT: movq %r8, 16(%rax)
+; AVX2-NEXT: movq %r9, 24(%rax)
+; AVX2-NEXT: movq %r10, 32(%rax)
+; AVX2-NEXT: movq %r11, 40(%rax)
+; AVX2-NEXT: movq %rbx, 48(%rax)
+; AVX2-NEXT: movq %rsi, 56(%rax)
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_sub_i512_mem_mem:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vpsubq (%rdx), %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpnleuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512F-NEXT: vpaddq %zmm0, %zmm1, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_sub_i512_mem_mem:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vpsubq (%rdx), %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpnleuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %edx
+; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm1, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %a0 = load i512, ptr %p0
+ %a1 = load i512, ptr %p1
+ %r = sub i512 %a0, %a1
+ ret i512 %r
+}
+
define void @test_dec_i512_rmw(ptr %p0) nounwind {
; CHECK-LABEL: test_dec_i512_rmw:
; CHECK: # %bb.0:
@@ -504,5 +608,59 @@ define void @test_dec_i512_rmw(ptr %p0) nounwind {
store i512 %r, ptr %p0
ret void
}
+
+define void @test_sub_i512_rmw(ptr %p0) nounwind {
+; CHECK-LABEL: test_sub_i512_rmw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq $-9, (%rdi)
+; CHECK-NEXT: adcq $-1, 8(%rdi)
+; CHECK-NEXT: adcq $-1, 16(%rdi)
+; CHECK-NEXT: adcq $-1, 24(%rdi)
+; CHECK-NEXT: adcq $-1, 32(%rdi)
+; CHECK-NEXT: adcq $-1, 40(%rdi)
+; CHECK-NEXT: adcq $-1, 48(%rdi)
+; CHECK-NEXT: adcq $-1, 56(%rdi)
+; CHECK-NEXT: retq
+;
+; AVX512F-LABEL: test_sub_i512_rmw:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: movzbl %cl, %ecx
+; AVX512F-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512F-NEXT: xorl %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551608,0]
+; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_sub_i512_rmw:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %eax
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %ecx
+; AVX512VL-NEXT: leal (%rcx,%rax,2), %eax
+; AVX512VL-NEXT: xorl %ecx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551608,0]
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+ %a0 = load i512, ptr %p0
+ %r = sub i512 %a0, 9
+ store i512 %r, ptr %p0
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX512: {{.*}}
>From bf61eb7da17a438f43fcd844387abcccc1e45d81 Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 4 Feb 2026 12:13:05 +0200
Subject: [PATCH 6/8] refactor to use function call instead of assign to a
variable to not be confusing
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b756a6ccb1eb2..47d6b872e450f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34277,10 +34277,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
MVT VecVT = MVT::v8i64;
MVT BoolVT = MVT::v8i1;
- SDValue AllOnes = DAG.getAllOnesConstant(dl, VecVT);
if (isOneConstant(RHS)) {
- RHS = AllOnes;
+ RHS = DAG.getAllOnesConstant(dl, VecVT);
+ ;
Opc = (IsAdd ? ISD::SUB : ISD::ADD);
IsAdd = !IsAdd;
// LHS + 1 => LHS - (- 1 , LHS - 1 => LHS + (- 1)
@@ -34296,7 +34296,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ISD::CondCode CarryCC = IsAdd ? ISD::SETULT : ISD::SETUGT;
SDValue Carry = DAG.getSetCC(dl, BoolVT, Partial, Vec0, CarryCC);
- SDValue PropCmp = IsAdd ? AllOnes : DAG.getConstant(0, dl, VecVT);
+ SDValue PropCmp = IsAdd ? DAG.getAllOnesConstant(dl, VecVT)
+ : DAG.getConstant(0, dl, VecVT);
SDValue Propagate = DAG.getSetCC(dl, BoolVT, Partial, PropCmp, ISD::SETEQ);
SDValue CarryIn = DAG.getBitcast(MVT::i8, Carry);
@@ -34315,7 +34316,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue CorrVec = DAG.getNode(ISD::BITCAST, dl, BoolVT, CorrMask);
unsigned AdjustOpc = IsAdd ? ISD::SUB : ISD::ADD;
- SDValue Adjusted = DAG.getNode(AdjustOpc, dl, VecVT, Partial, AllOnes);
+ SDValue Adjusted = DAG.getNode(AdjustOpc, dl, VecVT, Partial,
+ DAG.getAllOnesConstant(dl, VecVT));
SDValue Res =
DAG.getNode(ISD::VSELECT, dl, VecVT, CorrVec, Adjusted, Partial);
>From 4af736479dea01cf41b3321352facf6cd35ff41f Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 4 Feb 2026 13:08:03 +0200
Subject: [PATCH 7/8] - removed non-wanted comments - split add/sub case
handling in mayfold function
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 489d927226abd..290d111463ef6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2917,9 +2917,12 @@ static bool mayFoldIntoVector(SDValue Op, const X86Subtarget &Subtarget,
if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
return true;
EVT VT = Op.getValueType();
- bool ValidOp = (ISD::isBitwiseLogicOp(Op.getOpcode()) ||
- ISD::isAddSubOp(Op.getOpcode()));
- if (ValidOp && (VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512))
+ // TODO : might have better handling by using
+ // `TargetLowering::LegalizeAction::Custom`
+ bool BitwiseCase = (ISD::isBitwiseLogicOp(Op.getOpcode()) &&
+ (VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512));
+ bool AddSubCase = (ISD::isAddSubOp(Op.getOpcode()) && (VT == MVT::i512));
+ if (BitwiseCase || AddSubCase)
return mayFoldIntoVector(Op.getOperand(0), Subtarget) &&
mayFoldIntoVector(Op.getOperand(1), Subtarget);
return X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse,
@@ -54657,10 +54660,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// vector type or the operation is likely to expand to a vector type
// (legalization can scalarize back if it the op failed).
if (VT == MVT::i256 || VT == MVT::i512) {
- // Issue : 173996 , PRs : [174761,179503] : when add/sub lowered on avx512
- // we hit a regression issue. my approach is to allow the combine only
- // when the operation is done by our custome handling.
- // X86TargetLowering::ReplaceNodeResults (ADD/SUB) cases.
MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
if (TLI.isTypeLegal(VecVT) && ISD::isNormalStore(St) &&
mayFoldIntoVector(StoredVal, Subtarget))
>From 81e3610251b0fab94f91de9c8217cda0b0f5f3bf Mon Sep 17 00:00:00 2001
From: Islam-Imad <islamimad404 at gmail.com>
Date: Wed, 4 Feb 2026 15:13:00 +0200
Subject: [PATCH 8/8] removed isAddSubOp function
---
llvm/include/llvm/CodeGen/ISDOpcodes.h | 5 -----
llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +++++------
2 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 4b39a308e8472..2ebd2641944f5 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1618,11 +1618,6 @@ inline bool isBitwiseLogicOp(unsigned Opcode) {
return Opcode == ISD::AND || Opcode == ISD::OR || Opcode == ISD::XOR;
}
-/// Whether this is add/sub opcode.
-inline bool isAddSubOp(unsigned Opcode) {
- return Opcode == ISD::ADD || Opcode == ISD::SUB;
-}
-
/// Given a \p MinMaxOpc of ISD::(U|S)MIN or ISD::(U|S)MAX, returns
/// ISD::(U|S)MAX and ISD::(U|S)MIN, respectively.
LLVM_ABI NodeType getInverseMinMaxOpcode(unsigned MinMaxOpc);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 290d111463ef6..75f7a89d2dac9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2919,9 +2919,11 @@ static bool mayFoldIntoVector(SDValue Op, const X86Subtarget &Subtarget,
EVT VT = Op.getValueType();
// TODO : might have better handling by using
// `TargetLowering::LegalizeAction::Custom`
- bool BitwiseCase = (ISD::isBitwiseLogicOp(Op.getOpcode()) &&
+ unsigned Opcode = Op.getOpcode();
+ bool BitwiseCase = (ISD::isBitwiseLogicOp(Opcode) &&
(VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512));
- bool AddSubCase = (ISD::isAddSubOp(Op.getOpcode()) && (VT == MVT::i512));
+ bool AddSubCase =
+ ((Opcode == ISD::ADD || Opcode == ISD::SUB) && (VT == MVT::i512));
if (BitwiseCase || AddSubCase)
return mayFoldIntoVector(Op.getOperand(0), Subtarget) &&
mayFoldIntoVector(Op.getOperand(1), Subtarget);
@@ -34267,9 +34269,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::ADD:
case ISD::SUB: {
- // Use Kogge-Stone parallel carry/borrow propagation for i512 add/sub.
- // Article : https://www.numberworld.org/y-cruncher/internals/addition.html
- // related work : combineStore -> if (VT == MVT::i256 || VT == MVT::i512)
// TODO: ISD::UADDO_CARRY
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
@@ -34320,7 +34319,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue CorrMask = DAG.getNode(ISD::XOR, dl, MVT::i32, PropIn, CarryOut);
CorrMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CorrMask);
- SDValue CorrVec = DAG.getNode(ISD::BITCAST, dl, BoolVT, CorrMask);
+ SDValue CorrVec = DAG.getBitcast(BoolVT, CorrMask);
unsigned AdjustOpc = IsAdd ? ISD::SUB : ISD::ADD;
SDValue Adjusted = DAG.getNode(AdjustOpc, dl, VecVT, Partial,
More information about the llvm-commits
mailing list