[llvm] [SelectionDAG][x86] Ensure vector reduction optimization (PR #144231)
Suhajda Tamás via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 19 00:14:39 PDT 2025
https://github.com/sutajo updated https://github.com/llvm/llvm-project/pull/144231
>From 2b2130a54aa74635ca194d6533e2e9ecc313f39e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Sat, 14 Jun 2025 19:04:49 +0200
Subject: [PATCH 1/8] [x86] Add test for reduction
---
llvm/test/CodeGen/X86/optimize-reduction.ll | 140 ++++++++++++++++++++
1 file changed, 140 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/optimize-reduction.ll
diff --git a/llvm/test/CodeGen/X86/optimize-reduction.ll b/llvm/test/CodeGen/X86/optimize-reduction.ll
new file mode 100644
index 0000000000000..003c41612b8bf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/optimize-reduction.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1,+fast-hops | FileCheck %s --check-prefixes=SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-hops | FileCheck %s --check-prefixes=AVX2
+
+define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y) {
+; SSE41-LABEL: test_reduce_v16i16_with_umin:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pminuw %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: pminuw %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; SSE41-NEXT: pminuw %xmm5, %xmm6
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: psrld $16, %xmm5
+; SSE41-NEXT: pminuw %xmm6, %xmm5
+; SSE41-NEXT: phminposuw %xmm4, %xmm4
+; SSE41-NEXT: movd %xmm4, %eax
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pminuw %xmm1, %xmm0
+; SSE41-NEXT: phminposuw %xmm0, %xmm0
+; SSE41-NEXT: movd %xmm0, %edx
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: # kill: def $dx killed $dx killed $edx
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: test_reduce_v16i16_with_umin:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpminuw %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpminuw %xmm3, %xmm2, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
+; AVX2-NEXT: vpminuw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsrld $16, %xmm3, %xmm4
+; AVX2-NEXT: vphminposuw %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpminuw %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: # kill: def $dx killed $dx killed $edx
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+ %min_x = tail call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %x)
+ %min_x_vec = insertelement <1 x i16> poison, i16 %min_x, i64 0
+ %min_x_splat = shufflevector <1 x i16> %min_x_vec, <1 x i16> poison, <16 x i32> zeroinitializer
+ %cmp = icmp eq <16 x i16> %x, %min_x_splat
+ %select = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> splat (i16 -1)
+ %select_min = tail call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %select)
+ %ret_0 = insertvalue { i16, i16 } poison, i16 %min_x, 0
+ %ret = insertvalue { i16, i16 } %ret_0, i16 %select_min, 1
+ ret { i16, i16 } %ret
+}
+
+define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
+; SSE41-LABEL: test_reduce_v16i16_with_add:
+; SSE41: # %bb.0: # %start
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: paddw %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE41-NEXT: paddw %xmm4, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
+; SSE41-NEXT: paddw %xmm5, %xmm4
+; SSE41-NEXT: phaddw %xmm4, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: phaddw %xmm0, %xmm5
+; SSE41-NEXT: phaddw %xmm5, %xmm5
+; SSE41-NEXT: phaddw %xmm5, %xmm5
+; SSE41-NEXT: phaddw %xmm5, %xmm5
+; SSE41-NEXT: movd %xmm5, %eax
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: pminuw %xmm1, %xmm0
+; SSE41-NEXT: phminposuw %xmm0, %xmm0
+; SSE41-NEXT: movd %xmm0, %edx
+; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE41-NEXT: # kill: def $dx killed $dx killed $edx
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: test_reduce_v16i16_with_add:
+; AVX2: # %bb.0: # %start
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; AVX2-NEXT: vpaddw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
+; AVX2-NEXT: vpaddw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vphaddw %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vphaddw %xmm0, %xmm2, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vpbroadcastw %xmm3, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %edx
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: # kill: def $dx killed $dx killed $edx
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+start:
+ %sum_x = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %x)
+ %sum_x_vec = insertelement <1 x i16> poison, i16 %sum_x, i64 0
+ %sum_x_splat = shufflevector <1 x i16> %sum_x_vec, <1 x i16> poison, <16 x i32> zeroinitializer
+ %cmp = icmp eq <16 x i16> %x, %sum_x_splat
+ %select = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> splat (i16 -1)
+ %select_min = tail call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %select)
+ %ret_0 = insertvalue { i16, i16 } poison, i16 %sum_x, 0
+ %ret = insertvalue { i16, i16 } %ret_0, i16 %select_min, 1
+ ret { i16, i16 } %ret
+}
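For context, the IR above exercises a paired-reduction pattern: the first reduction computes the unsigned minimum of %x, and the second takes the minimum of %y over the lanes where %x equals that minimum (all other lanes are forced to -1, the umin identity). A rough scalar equivalent, purely illustrative (the function and variable names below are made up and are not part of the patch):

  #include <cstdint>
  #include <limits>

  struct MinPair { uint16_t MinX, MinYAtMinX; };

  // Scalar sketch of test_reduce_v16i16_with_umin: find the minimum of x,
  // then the minimum of y restricted to lanes where x hits that minimum.
  MinPair reducePair(const uint16_t X[16], const uint16_t Y[16]) {
    uint16_t MinX = std::numeric_limits<uint16_t>::max();
    for (int I = 0; I < 16; ++I)
      MinX = X[I] < MinX ? X[I] : MinX;      // llvm.vector.reduce.umin(x)
    uint16_t MinY = std::numeric_limits<uint16_t>::max();
    for (int I = 0; I < 16; ++I)
      if (X[I] == MinX)                      // icmp eq x, splat(min_x)
        MinY = Y[I] < MinY ? Y[I] : MinY;    // select + reduce.umin
    return {MinX, MinY};
  }

The interesting part for codegen is that the first reduction's result is both extracted to a scalar and re-broadcast for the compare, so the backend should not keep the shuffle + binop pyramid alive just to feed the broadcast.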
>From 93df21fcff0b9f836947cfd9bb8e88b565b1c435 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Sat, 14 Jun 2025 19:09:44 +0200
Subject: [PATCH 2/8] [x86] Implement optimization and update tests
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 52 ++++++++++++++++++---
llvm/test/CodeGen/X86/optimize-reduction.ll | 40 +++-------------
2 files changed, 52 insertions(+), 40 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b4670e270141f..61e3979b6c0bb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47081,7 +47081,8 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ bool& TransformedBinOpReduction) {
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
@@ -47169,23 +47170,33 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
// Check whether this extract is the root of a sum of absolute differences
// pattern. This has to be done here because we really want it to happen
// pre-legalization,
- if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
+ if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) {
+ TransformedBinOpReduction = true;
return SAD;
+ }
- if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
+ if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget)) {
+ TransformedBinOpReduction = true;
return VPDPBUSD;
+ }
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
- if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
+ if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget)) {
+ TransformedBinOpReduction = true;
return Cmp;
+ }
// Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
- if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
+ if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget)) {
+ TransformedBinOpReduction = true;
return MinMax;
+ }
// Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
- if (SDValue V = combineArithReduction(N, DAG, Subtarget))
+ if (SDValue V = combineArithReduction(N, DAG, Subtarget)) {
+ TransformedBinOpReduction = true;
return V;
+ }
if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
return V;
@@ -47255,6 +47266,33 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineExtractVectorEltAndOperand(SDNode* N, SelectionDAG& DAG,
+ TargetLowering::DAGCombinerInfo& DCI,
+ const X86Subtarget& Subtarget)
+{
+ bool TransformedBinOpReduction = false;
+ auto Op = combineExtractVectorElt(N, DAG, DCI, Subtarget, TransformedBinOpReduction);
+
+ if (TransformedBinOpReduction)
+ {
+ // In case we simplified N = extract_vector_element(V, 0) with Op and V
+ // resulted from a reduction, then we need to replace all uses of V with
+ // scalar_to_vector(Op) to make sure that we eliminated the binop + shuffle
+ // pyramid. This is safe to do, because the elements of V are undefined except
+ // for the zeroth element.
+
+ auto OldV = N->getOperand(0);
+ auto NewV = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), OldV->getValueType(0), Op);
+
+ auto NV = DCI.CombineTo(N, Op);
+ DCI.CombineTo(OldV.getNode(), NewV);
+
+ Op = NV; // Return N so it doesn't get rechecked!
+ }
+
+ return Op;
+}
+
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
static SDValue combineToExtendBoolVectorInReg(
@@ -60702,7 +60740,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::EXTRACT_VECTOR_ELT:
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
- return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ return combineExtractVectorEltAndOperand(N, DAG, DCI, Subtarget);
case ISD::CONCAT_VECTORS:
return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
diff --git a/llvm/test/CodeGen/X86/optimize-reduction.ll b/llvm/test/CodeGen/X86/optimize-reduction.ll
index 003c41612b8bf..e51ac1bd3c13c 100644
--- a/llvm/test/CodeGen/X86/optimize-reduction.ll
+++ b/llvm/test/CodeGen/X86/optimize-reduction.ll
@@ -7,16 +7,9 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pminuw %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE41-NEXT: pminuw %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
-; SSE41-NEXT: pminuw %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: psrld $16, %xmm5
-; SSE41-NEXT: pminuw %xmm6, %xmm5
; SSE41-NEXT: phminposuw %xmm4, %xmm4
; SSE41-NEXT: movd %xmm4, %eax
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
@@ -36,14 +29,8 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpminuw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT: vpminuw %xmm3, %xmm2, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; AVX2-NEXT: vpminuw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpsrld $16, %xmm3, %xmm4
; AVX2-NEXT: vphminposuw %xmm2, %xmm2
; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vpminuw %xmm4, %xmm3, %xmm2
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -71,19 +58,12 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; SSE41-LABEL: test_reduce_v16i16_with_add:
; SSE41: # %bb.0: # %start
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: paddw %xmm1, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE41-NEXT: paddw %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
-; SSE41-NEXT: paddw %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: phaddw %xmm0, %xmm4
+; SSE41-NEXT: phaddw %xmm4, %xmm4
; SSE41-NEXT: phaddw %xmm4, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: phaddw %xmm0, %xmm5
-; SSE41-NEXT: phaddw %xmm5, %xmm5
-; SSE41-NEXT: phaddw %xmm5, %xmm5
-; SSE41-NEXT: phaddw %xmm5, %xmm5
-; SSE41-NEXT: movd %xmm5, %eax
+; SSE41-NEXT: phaddw %xmm4, %xmm4
+; SSE41-NEXT: movd %xmm4, %eax
; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE41-NEXT: pcmpeqw %xmm4, %xmm1
@@ -103,18 +83,12 @@ define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; AVX2-LABEL: test_reduce_v16i16_with_add:
; AVX2: # %bb.0: # %start
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX2-NEXT: vpaddw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
-; AVX2-NEXT: vpaddw %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vphaddw %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vphaddw %xmm0, %xmm2, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vpbroadcastw %xmm3, %ymm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
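The test diff above shows the intent of the new wrapper: once combineExtractVectorElt has turned extract_vector_elt(V, 0) of a reduction into a cheaper sequence (PHMINPOSUW here, horizontal adds for the add case), the remaining user of V -- the splat feeding the compare -- is redirected to scalar_to_vector of the new result, so the pyramid becomes dead. A condensed sketch of that wrapper (the function name below is made up; this restates the hunk above rather than adding anything new):

  static SDValue combineExtractAndReductionSource(
      SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI,
      const X86Subtarget &Subtarget) {
    bool TransformedBinOpReduction = false;
    SDValue Op = combineExtractVectorElt(N, DAG, DCI, Subtarget,
                                         TransformedBinOpReduction);
    if (!TransformedBinOpReduction)
      return Op;

    // N was extract_vector_elt(V, 0) and V was the reduction pyramid.
    // Point V's other users (e.g. the splat) at scalar_to_vector(Op) so the
    // pyramid becomes dead; the patch relies on those users only needing
    // element 0 of V.
    SDValue OldV = N->getOperand(0);
    SDValue NewV = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
                               OldV.getValueType(), Op);
    SDValue Res = DCI.CombineTo(N, Op);
    DCI.CombineTo(OldV.getNode(), NewV);
    return Res; // hand back N's replacement so N is not immediately revisited
  }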
>From 57a37886a8d426a049c1a1c63b5064b9c55cf086 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Sat, 14 Jun 2025 19:59:01 +0200
Subject: [PATCH 3/8] [x86] Assert that the new reduction does not depend on
the converted one
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 61e3979b6c0bb..b606de022daf0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47279,9 +47279,12 @@ static SDValue combineExtractVectorEltAndOperand(SDNode* N, SelectionDAG& DAG,
// resulted from a reduction, then we need to replace all uses of V with
// scalar_to_vector(Op) to make sure that we eliminated the binop + shuffle
// pyramid. This is safe to do, because the elements of V are undefined except
- // for the zeroth element.
+ // for the zeroth element and Op does not depend on V.
auto OldV = N->getOperand(0);
+ assert(!Op.getNode()->hasPredecessor(OldV.getNode()) &&
+ "Op must not depend on the converted reduction");
+
auto NewV = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), OldV->getValueType(0), Op);
auto NV = DCI.CombineTo(N, Op);
>From ebb3ba0cdf1a8159b279367e3e94f65dab38e654 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Tue, 17 Jun 2025 19:58:42 +0200
Subject: [PATCH 4/8] [x86] Add custom lowering for min/max vector reductions
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 163 ++++++++++++------
.../lib/Target/X86/X86TargetTransformInfo.cpp | 19 ++
llvm/lib/Target/X86/X86TargetTransformInfo.h | 1 +
llvm/test/CodeGen/X86/optimize-reduction.ll | 1 +
4 files changed, 130 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b606de022daf0..684092e416ca4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1435,6 +1435,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
+ // Vector min/max reductions
+ if (Subtarget.hasSSE41())
+ {
+ for (MVT VT : MVT::vector_valuetypes()) {
+ if (VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16)
+ {
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ }
+ }
+ }
+
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
@@ -25409,6 +25423,94 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
return SignExt;
}
+// Create a min/max v8i16/v16i8 horizontal reduction with PHMINPOSUW.
+static SDValue createMinMaxReduction(SDValue Src, EVT TargetVT, SDLoc DL,
+ ISD::NodeType BinOp, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget)
+{
+ assert(Subtarget.hasSSE41() && "The caller must check if SSE4.1 is available");
+
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getScalarType();
+
+ if (SrcSVT != TargetVT || (SrcVT.getSizeInBits() % 128) != 0)
+ return SDValue();
+
+ // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
+ while (SrcVT.getSizeInBits() > 128) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Src, DAG, DL);
+ SrcVT = Lo.getValueType();
+ Src = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
+ }
+ assert(((SrcVT == MVT::v8i16 && TargetVT == MVT::i16) ||
+ (SrcVT == MVT::v16i8 && TargetVT == MVT::i8)) &&
+ "Unexpected value type");
+
+ // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
+ // to flip the value accordingly.
+ SDValue Mask;
+ unsigned MaskEltsBits = TargetVT.getSizeInBits();
+ if (BinOp == ISD::SMAX)
+ Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
+ else if (BinOp == ISD::SMIN)
+ Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
+ else if (BinOp == ISD::UMAX)
+ Mask = DAG.getAllOnesConstant(DL, SrcVT);
+
+ if (Mask)
+ Src = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, Src);
+
+ // For v16i8 cases we need to perform UMIN on pairs of byte elements,
+ // shuffling each upper element down and insert zeros. This means that the
+ // v16i8 UMIN will leave the upper element as zero, performing zero-extension
+ // ready for the PHMINPOS.
+ if (TargetVT == MVT::i8) {
+ SDValue Upper = DAG.getVectorShuffle(
+ SrcVT, DL, Src, DAG.getConstant(0, DL, MVT::v16i8),
+ {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
+ Src = DAG.getNode(ISD::UMIN, DL, SrcVT, Src, Upper);
+ }
+
+ // Perform the PHMINPOS on a v8i16 vector,
+ Src = DAG.getBitcast(MVT::v8i16, Src);
+ Src = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, Src);
+ Src = DAG.getBitcast(SrcVT, Src);
+
+ if (Mask)
+ Src = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, Src);
+
+ return DAG.getExtractVectorElt(DL, TargetVT, Src, 0);
+}
+
+static SDValue LowerVECTOR_REDUCE_MINMAX(SDValue Op,
+ const X86Subtarget& Subtarget,
+ SelectionDAG& DAG)
+{
+ ISD::NodeType BinOp;
+ switch (Op.getOpcode())
+ {
+ default:
+ assert(false && "Expected min/max reduction");
+ break;
+ case ISD::VECREDUCE_UMIN:
+ BinOp = ISD::UMIN;
+ break;
+ case ISD::VECREDUCE_UMAX:
+ BinOp = ISD::UMAX;
+ break;
+ case ISD::VECREDUCE_SMIN:
+ BinOp = ISD::SMIN;
+ break;
+ case ISD::VECREDUCE_SMAX:
+ BinOp = ISD::SMAX;
+ break;
+ }
+
+ return createMinMaxReduction(Op->getOperand(0), Op.getValueType(), SDLoc(Op),
+ BinOp, DAG, Subtarget);
+}
+
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
@@ -33620,6 +33722,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ZERO_EXTEND_VECTOR_INREG:
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_SMAX:
+ return LowerVECTOR_REDUCE_MINMAX(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
@@ -46192,60 +46299,8 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
if (!Src)
return SDValue();
- EVT SrcVT = Src.getValueType();
- EVT SrcSVT = SrcVT.getScalarType();
- if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
- return SDValue();
-
- SDLoc DL(Extract);
- SDValue MinPos = Src;
-
- // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
- while (SrcVT.getSizeInBits() > 128) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
- SrcVT = Lo.getValueType();
- MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
- }
- assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
- (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
- "Unexpected value type");
-
- // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
- // to flip the value accordingly.
- SDValue Mask;
- unsigned MaskEltsBits = ExtractVT.getSizeInBits();
- if (BinOp == ISD::SMAX)
- Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
- else if (BinOp == ISD::SMIN)
- Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
- else if (BinOp == ISD::UMAX)
- Mask = DAG.getAllOnesConstant(DL, SrcVT);
-
- if (Mask)
- MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
-
- // For v16i8 cases we need to perform UMIN on pairs of byte elements,
- // shuffling each upper element down and insert zeros. This means that the
- // v16i8 UMIN will leave the upper element as zero, performing zero-extension
- // ready for the PHMINPOS.
- if (ExtractVT == MVT::i8) {
- SDValue Upper = DAG.getVectorShuffle(
- SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
- {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
- MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
- }
-
- // Perform the PHMINPOS on a v8i16 vector,
- MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
- MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
- MinPos = DAG.getBitcast(SrcVT, MinPos);
-
- if (Mask)
- MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
-
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
- DAG.getVectorIdxConstant(0, DL));
+ return createMinMaxReduction(Src, ExtractVT, SDLoc(Extract),
+ BinOp, DAG, Subtarget);
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index a1a177528eb23..3c479fc72ce30 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6575,6 +6575,25 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
return Options;
}
+bool llvm::X86TTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+ switch (II->getIntrinsicID()) {
+ default:
+ return true;
+
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_smax:
+ auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
+ auto SType = VType->getScalarType();
+ bool CanUsePHMINPOSUW =
+ ST->hasSSE41() && II->getType() == SType &&
+ (VType->getPrimitiveSizeInBits() % 128) == 0 &&
+ (SType->isIntegerTy(8) || SType->isIntegerTy(16));
+ return !CanUsePHMINPOSUW;
+ }
+}
+
bool X86TTIImpl::prefersVectorizedAddressing() const {
return supportsGather();
}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 72673d6fbd80f..5e2fe40f9f902 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -303,6 +303,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
TTI::MemCmpExpansionOptions
enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;
bool preferAlternateOpcodeVectorization() const override { return false; }
+ bool shouldExpandReduction(const IntrinsicInst *II) const override;
bool prefersVectorizedAddressing() const override;
bool supportsEfficientVectorElementLoadStore() const override;
bool enableInterleavedAccessVectorization() const override;
diff --git a/llvm/test/CodeGen/X86/optimize-reduction.ll b/llvm/test/CodeGen/X86/optimize-reduction.ll
index e51ac1bd3c13c..4e9732882d2bb 100644
--- a/llvm/test/CodeGen/X86/optimize-reduction.ll
+++ b/llvm/test/CodeGen/X86/optimize-reduction.ll
@@ -31,6 +31,7 @@ define { i16, i16 } @test_reduce_v16i16_with_umin(<16 x i16> %x, <16 x i16> %y)
; AVX2-NEXT: vpminuw %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vphminposuw %xmm2, %xmm2
; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
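A note on the PHMINPOSUW trick that createMinMaxReduction reuses: the instruction only computes an unsigned minimum over v8i16, so the other three reductions are obtained by XOR-ing with a mask that reorders the values, taking the unsigned minimum, and XOR-ing back. A standalone scalar sketch of the identities involved (illustrative only, not code from the patch):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    uint16_t A = 0x8001, B = 0x7FFE; // as signed: -32767 and 32766

    // UMAX: x ^ 0xFFFF reverses the unsigned order.
    assert((std::min<uint16_t>(A ^ 0xFFFF, B ^ 0xFFFF) ^ 0xFFFF) ==
           std::max(A, B));
    // SMIN: x ^ 0x8000 maps signed order onto unsigned order.
    assert((std::min<uint16_t>(A ^ 0x8000, B ^ 0x8000) ^ 0x8000) == A);
    // SMAX: x ^ 0x7FFF reverses the signed order when viewed as unsigned.
    assert((std::min<uint16_t>(A ^ 0x7FFF, B ^ 0x7FFF) ^ 0x7FFF) == B);
    return 0;
  }

These are the same masks the code builds with APInt::getSignedMaxValue, APInt::getSignedMinValue and getAllOnesConstant.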
>From 85e5195d1d3e3f9afd39ed9afd3f515ed6148d8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Wed, 18 Jun 2025 19:20:03 +0200
Subject: [PATCH 5/8] Add checks before custom lowering
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 89 ++++++++-----------
.../lib/Target/X86/X86TargetTransformInfo.cpp | 7 +-
2 files changed, 40 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 684092e416ca4..14169634b0e56 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1436,16 +1436,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Vector min/max reductions
- if (Subtarget.hasSSE41())
- {
+ // These are lowered to PHMINPOSUW if possible,
+ // otherwise they are expanded to shuffles + binops.
+ if (Subtarget.hasSSE41()) {
for (MVT VT : MVT::vector_valuetypes()) {
- if (VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16)
- {
- setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
- setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
- setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
- setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
- }
+ if (!VT.isFixedLengthVector() || (VT.getSizeInBits() % 128) != 0 ||
+ !(VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16))
+ continue;
+
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
}
}
@@ -25426,9 +25428,11 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
// Create a min/max v8i16/v16i8 horizontal reduction with PHMINPOSUW.
static SDValue createMinMaxReduction(SDValue Src, EVT TargetVT, SDLoc DL,
ISD::NodeType BinOp, SelectionDAG &DAG,
- const X86Subtarget &Subtarget)
-{
- assert(Subtarget.hasSSE41() && "The caller must check if SSE4.1 is available");
+ const X86Subtarget &Subtarget) {
+ assert(Subtarget.hasSSE41() &&
+ "The caller must check if SSE4.1 is available");
+ assert(TargetVT == MVT::i16 ||
+ TargetVT == MVT::i8 && "Unexpected return type");
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
@@ -25484,31 +25488,11 @@ static SDValue createMinMaxReduction(SDValue Src, EVT TargetVT, SDLoc DL,
}
static SDValue LowerVECTOR_REDUCE_MINMAX(SDValue Op,
- const X86Subtarget& Subtarget,
- SelectionDAG& DAG)
-{
- ISD::NodeType BinOp;
- switch (Op.getOpcode())
- {
- default:
- assert(false && "Expected min/max reduction");
- break;
- case ISD::VECREDUCE_UMIN:
- BinOp = ISD::UMIN;
- break;
- case ISD::VECREDUCE_UMAX:
- BinOp = ISD::UMAX;
- break;
- case ISD::VECREDUCE_SMIN:
- BinOp = ISD::SMIN;
- break;
- case ISD::VECREDUCE_SMAX:
- BinOp = ISD::SMAX;
- break;
- }
-
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ ISD::NodeType BinOp = ISD::getVecReduceBaseOpcode(Op.getOpcode());
return createMinMaxReduction(Op->getOperand(0), Op.getValueType(), SDLoc(Op),
- BinOp, DAG, Subtarget);
+ BinOp, DAG, Subtarget);
}
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
@@ -46299,8 +46283,8 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
if (!Src)
return SDValue();
- return createMinMaxReduction(Src, ExtractVT, SDLoc(Extract),
- BinOp, DAG, Subtarget);
+ return createMinMaxReduction(Src, ExtractVT, SDLoc(Extract), BinOp, DAG,
+ Subtarget);
}
// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
@@ -47136,8 +47120,8 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget,
- bool& TransformedBinOpReduction) {
+ const X86Subtarget &Subtarget,
+ bool &TransformedBinOpReduction) {
if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
@@ -47321,26 +47305,27 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineExtractVectorEltAndOperand(SDNode* N, SelectionDAG& DAG,
- TargetLowering::DAGCombinerInfo& DCI,
- const X86Subtarget& Subtarget)
-{
+static SDValue
+combineExtractVectorEltAndOperand(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
bool TransformedBinOpReduction = false;
- auto Op = combineExtractVectorElt(N, DAG, DCI, Subtarget, TransformedBinOpReduction);
+ auto Op = combineExtractVectorElt(N, DAG, DCI, Subtarget,
+ TransformedBinOpReduction);
- if (TransformedBinOpReduction)
- {
+ if (TransformedBinOpReduction) {
// In case we simplified N = extract_vector_element(V, 0) with Op and V
// resulted from a reduction, then we need to replace all uses of V with
// scalar_to_vector(Op) to make sure that we eliminated the binop + shuffle
- // pyramid. This is safe to do, because the elements of V are undefined except
- // for the zeroth element and Op does not depend on V.
+ // pyramid. This is safe to do, because the elements of V are undefined
+ // except for the zeroth element and Op does not depend on V.
auto OldV = N->getOperand(0);
- assert(!Op.getNode()->hasPredecessor(OldV.getNode()) &&
- "Op must not depend on the converted reduction");
+ assert(!Op.getNode()->hasPredecessor(OldV.getNode()) &&
+ "Op must not depend on the converted reduction");
- auto NewV = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), OldV->getValueType(0), Op);
+ auto NewV =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), OldV->getValueType(0), Op);
auto NV = DCI.CombineTo(N, Op);
DCI.CombineTo(OldV.getNode(), NewV);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3c479fc72ce30..e753946d0e19b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6586,10 +6586,9 @@ bool llvm::X86TTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
case Intrinsic::vector_reduce_smax:
auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
auto SType = VType->getScalarType();
- bool CanUsePHMINPOSUW =
- ST->hasSSE41() && II->getType() == SType &&
- (VType->getPrimitiveSizeInBits() % 128) == 0 &&
- (SType->isIntegerTy(8) || SType->isIntegerTy(16));
+ bool CanUsePHMINPOSUW = ST->hasSSE41() && II->getType() == SType &&
+ (VType->getPrimitiveSizeInBits() % 128) == 0 &&
+ (SType->isIntegerTy(8) || SType->isIntegerTy(16));
return !CanUsePHMINPOSUW;
}
}
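With the added guards, Custom lowering is only registered for types the PHMINPOSUW path can actually consume: fixed i8/i16 vectors whose total width is a multiple of 128 bits; everything else stays on the generic shuffle + binop expansion. A small standalone sketch of the filter (it mirrors the condition in the loop above; it is not code from the patch):

  // True for e.g. v16i8, v32i8, v8i16, v16i16, v32i16;
  // false for v4i16 (only 64 bits wide) or for i32/i64/float element types.
  static bool getsCustomMinMaxReduction(unsigned EltBits, unsigned NumElts) {
    return (EltBits == 8 || EltBits == 16) && (EltBits * NumElts) % 128 == 0;
  }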
>From 90da8ffb58ecf827aade1709775def678b9bce2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Wed, 18 Jun 2025 19:28:10 +0200
Subject: [PATCH 6/8] Add missing parentheses
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 14169634b0e56..e2ba9dbe2208d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25431,8 +25431,8 @@ static SDValue createMinMaxReduction(SDValue Src, EVT TargetVT, SDLoc DL,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasSSE41() &&
"The caller must check if SSE4.1 is available");
- assert(TargetVT == MVT::i16 ||
- TargetVT == MVT::i8 && "Unexpected return type");
+ assert((TargetVT == MVT::i16 || TargetVT == MVT::i8) &&
+ "Unexpected return type");
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
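For readers skimming this fixup: && binds tighter than ||, so the unparenthesized condition parsed as TargetVT == MVT::i16 || (TargetVT == MVT::i8 && "Unexpected return type"). Because a string literal is always true, the check happened to be equivalent, but it reads as if the message only guarded the i8 case and compilers typically warn about '&&' within '||'. A tiny standalone illustration (made-up variable names, not PR code):

  #include <cassert>

  int main() {
    int Kind = 16; // stand-in for MVT::i16
    // Unparenthesized: Kind == 16 || (Kind == 8 && "msg") -- still equivalent
    // to (Kind == 16 || Kind == 8), but warning-prone and easy to misread.
    // Parenthesized, the intent is explicit:
    assert((Kind == 16 || Kind == 8) && "Unexpected return type");
    return 0;
  }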
>From 376965edaa4a37a3e2d56196f832d89e2913cf49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Thu, 19 Jun 2025 01:06:34 +0200
Subject: [PATCH 7/8] [x86] Lower arithmetic vector reductions
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 425 ++++++++++--------
llvm/lib/Target/X86/X86ISelLowering.h | 5 +
.../lib/Target/X86/X86TargetTransformInfo.cpp | 38 +-
llvm/test/CodeGen/X86/horizontal-sum.ll | 146 +++---
llvm/test/CodeGen/X86/optimize-reduction.ll | 1 +
.../CodeGen/X86/vector-reduce-add-mask.ll | 42 +-
.../CodeGen/X86/vector-reduce-add-zext.ll | 59 ++-
llvm/test/CodeGen/X86/vector-reduce-fadd.ll | 390 +++++++++++-----
8 files changed, 697 insertions(+), 409 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e2ba9dbe2208d..cef1148504259 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1435,20 +1435,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
- // Vector min/max reductions
- // These are lowered to PHMINPOSUW if possible,
+ // Vector reductions
+ // These are lowered to fast implementations if possible,
// otherwise they are expanded to shuffles + binops.
- if (Subtarget.hasSSE41()) {
- for (MVT VT : MVT::vector_valuetypes()) {
- if (!VT.isFixedLengthVector() || (VT.getSizeInBits() % 128) != 0 ||
- !(VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16))
- continue;
-
- setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
- setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
- setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
- setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
- }
+ for (ISD::NodeType VectorOp :
+ {ISD::VECREDUCE_FADD, ISD::VECREDUCE_ADD, ISD::VECREDUCE_MUL,
+ ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, ISD::VECREDUCE_UMAX,
+ ISD::VECREDUCE_UMIN}) {
+ for (MVT VT : MVT::vector_valuetypes())
+ if (VT.isFixedLengthVector() &&
+ X86::isVectorReductionFast(Subtarget, VectorOp, VT))
+ setOperationAction(VectorOp, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
@@ -25433,6 +25430,9 @@ static SDValue createMinMaxReduction(SDValue Src, EVT TargetVT, SDLoc DL,
"The caller must check if SSE4.1 is available");
assert((TargetVT == MVT::i16 || TargetVT == MVT::i8) &&
"Unexpected return type");
+ assert((BinOp == ISD::UMIN || BinOp == ISD::UMAX || BinOp == ISD::SMIN ||
+ BinOp == ISD::SMAX) &&
+ "Invalid BinOp");
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
@@ -25495,6 +25495,189 @@ static SDValue LowerVECTOR_REDUCE_MINMAX(SDValue Op,
BinOp, DAG, Subtarget);
}
+static SDValue createArithReduction(SDValue V, EVT TargetVT, SDLoc DL,
+ ISD::NodeType BinOp,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert(Subtarget.hasSSE2() && "Expected to have SSE2");
+
+ EVT VecVT = V.getValueType();
+ EVT VT = TargetVT;
+ assert(VecVT.getScalarType() == VT && "Type mismatch");
+
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
+
+ // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
+ auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
+ if (V.getValueType() == MVT::v4i8) {
+ if (ZeroExtend && Subtarget.hasSSE41()) {
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+ DAG.getConstant(0, DL, MVT::v4i32),
+ DAG.getBitcast(MVT::i32, V),
+ DAG.getVectorIdxConstant(0, DL));
+ return DAG.getBitcast(MVT::v16i8, V);
+ }
+ V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
+ ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
+ : DAG.getUNDEF(MVT::v4i8));
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
+ DAG.getUNDEF(MVT::v8i8));
+ };
+
+ // vXi8 mul reduction - promote to vXi16 mul reduction.
+ if (BinOp == ISD::MUL) {
+ if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
+ return SDValue();
+ if (VecVT.getSizeInBits() >= 128) {
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
+ SDValue Lo = getUnpackl(DAG, DL, VecVT, V, DAG.getUNDEF(VecVT));
+ SDValue Hi = getUnpackh(DAG, DL, VecVT, V, DAG.getUNDEF(VecVT));
+ Lo = DAG.getBitcast(WideVT, Lo);
+ Hi = DAG.getBitcast(WideVT, Hi);
+ V = DAG.getNode(BinOp, DL, WideVT, Lo, Hi);
+ while (V.getValueSizeInBits() > 128) {
+ std::tie(Lo, Hi) = splitVector(V, DAG, DL);
+ V = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
+ }
+ } else {
+ V = WidenToV16I8(V, false);
+ V = getUnpackl(DAG, DL, MVT::v16i8, V, DAG.getUNDEF(MVT::v16i8));
+ V = DAG.getBitcast(MVT::v8i16, V);
+ }
+ if (NumElts >= 8)
+ V = DAG.getNode(BinOp, DL, MVT::v8i16, V,
+ DAG.getVectorShuffle(MVT::v8i16, DL, V, V,
+ {4, 5, 6, 7, -1, -1, -1, -1}));
+ V = DAG.getNode(BinOp, DL, MVT::v8i16, V,
+ DAG.getVectorShuffle(MVT::v8i16, DL, V, V,
+ {2, 3, -1, -1, -1, -1, -1, -1}));
+ V = DAG.getNode(BinOp, DL, MVT::v8i16, V,
+ DAG.getVectorShuffle(MVT::v8i16, DL, V, V,
+ {1, -1, -1, -1, -1, -1, -1, -1}));
+ V = DAG.getBitcast(MVT::v16i8, V);
+ return DAG.getExtractVectorElt(DL, VT, V, 0);
+ }
+
+ // vXi8 add reduction - sub 128-bit vector.
+ if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
+ V = WidenToV16I8(V, true);
+ V = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, V,
+ DAG.getConstant(0, DL, MVT::v16i8));
+ V = DAG.getBitcast(MVT::v16i8, V);
+ return DAG.getExtractVectorElt(DL, VT, V, 0);
+ }
+
+ // Must be a >=128-bit vector with pow2 elements.
+ if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ // vXi8 add reduction - sum lo/hi halves then use PSADBW.
+ if (VT == MVT::i8) {
+ while (V.getValueSizeInBits() > 128) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(V, DAG, DL);
+ VecVT = Lo.getValueType();
+ V = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
+ }
+ assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
+
+ SDValue Hi = DAG.getVectorShuffle(
+ MVT::v16i8, DL, V, V,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ V = DAG.getNode(ISD::ADD, DL, MVT::v16i8, V, Hi);
+ V = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, V,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ V = DAG.getBitcast(MVT::v16i8, V);
+ return DAG.getExtractVectorElt(DL, VT, V, 0);
+ }
+
+ // See if we can use vXi8 PSADBW add reduction for larger zext types.
+ // If the source vector values are 0-255, then we can use PSADBW to
+ // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
+ // TODO: See if it's worth avoiding vXi16/i32 truncations?
+ if (BinOp == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
+ DAG.computeKnownBits(V).getMaxValue().ule(255) &&
+ (EltSizeInBits == 16 || V.getOpcode() == ISD::ZERO_EXTEND ||
+ Subtarget.hasAVX512())) {
+ if (V.getValueType() == MVT::v8i16) {
+ V = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V,
+ DAG.getUNDEF(MVT::v8i16));
+ } else {
+ EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
+ V = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, V);
+ if (ByteVT.getSizeInBits() < 128)
+ V = WidenToV16I8(V, true);
+ }
+
+ // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
+ auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
+ SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
+ return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
+ };
+ MVT SadVT = MVT::getVectorVT(MVT::i64, V.getValueSizeInBits() / 64);
+ V = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {V}, PSADBWBuilder);
+
+ // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
+ while (V.getValueSizeInBits() > 128) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(V, DAG, DL);
+ VecVT = Lo.getValueType();
+ V = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
+ }
+ assert(V.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
+
+ if (NumElts > 8) {
+ SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, V, V, {1, -1});
+ V = DAG.getNode(ISD::ADD, DL, MVT::v2i64, V, RdxHi);
+ }
+
+ VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
+ V = DAG.getBitcast(VecVT, V);
+ return DAG.getExtractVectorElt(DL, VT, V, 0);
+ }
+
+ // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
+ if (!shouldUseHorizontalOp(true, DAG, Subtarget))
+ return SDValue();
+
+ unsigned HorizOpcode = BinOp == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+
+ // 256-bit horizontal instructions operate on 128-bit chunks rather than
+ // across the whole vector, so we need an extract + hop preliminary stage.
+ // This is the only step where the operands of the hop are not the same value.
+ // TODO: We could extend this to handle 512-bit or even longer vectors.
+ if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+ ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDValue Hi = extract128BitVector(V, NumElts / 2, DAG, DL);
+ SDValue Lo = extract128BitVector(V, 0, DAG, DL);
+ V = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
+ VecVT = V.getValueType();
+ }
+ if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
+ !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
+ return SDValue();
+
+ // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
+ unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
+ for (unsigned i = 0; i != ReductionSteps; ++i)
+ V = DAG.getNode(HorizOpcode, DL, VecVT, V, V);
+
+ return DAG.getExtractVectorElt(DL, VT, V, 0);
+}
+
+static SDValue LowerVECTOR_REDUCE_ADD_FADD_MUL(SDValue V,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ ISD::NodeType BinOp = ISD::getVecReduceBaseOpcode(V.getOpcode());
+ return createArithReduction(V.getOperand(0), V.getValueType(), SDLoc(V),
+ BinOp, Subtarget, DAG);
+}
+
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
@@ -27752,6 +27935,53 @@ bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
}
+bool llvm::X86::isVectorReductionFast(const X86Subtarget &Subtarget,
+ ISD::NodeType VectorOp, MVT VT) {
+ if (!VT.isFixedLengthVector())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ switch (VectorOp) {
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_SMAX:
+ return Subtarget.hasSSE41() && (VT.getSizeInBits() % 128) == 0 &&
+ (VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16);
+
+ case ISD::VECREDUCE_MUL:
+ return Subtarget.hasSSE2() && VT.getScalarType() == MVT::i8 &&
+ NumElts >= 4 && isPowerOf2_32(NumElts);
+
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_FADD:
+ if (!Subtarget.hasSSE2())
+ return false;
+
+ if (VT == MVT::v4i8 || VT == MVT::v8i8)
+ return true;
+
+ if ((VT.getSizeInBits() % 128) == 0 && isPowerOf2_32(NumElts)) {
+ bool CanUseHorizontalAdd =
+ Subtarget.hasFastHorizontalOps() &&
+ (((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+ ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasSSE3()) ||
+ ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
+ ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()));
+
+ return VT.getScalarType() == MVT::i8 || CanUseHorizontalAdd;
+ }
+
+ break;
+
+ default:
+ break;
+ }
+
+ return false;
+}
+
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = Op.getConstantOperandVal(1);
@@ -33711,6 +33941,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VECREDUCE_SMIN:
case ISD::VECREDUCE_SMAX:
return LowerVECTOR_REDUCE_MINMAX(Op, Subtarget, DAG);
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_MUL:
+ return LowerVECTOR_REDUCE_ADD_FADD_MUL(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
@@ -46948,170 +47182,7 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
if (VecVT.getScalarType() != VT)
return SDValue();
- SDLoc DL(ExtElt);
- unsigned NumElts = VecVT.getVectorNumElements();
- unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
-
- // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
- auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
- if (V.getValueType() == MVT::v4i8) {
- if (ZeroExtend && Subtarget.hasSSE41()) {
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
- DAG.getConstant(0, DL, MVT::v4i32),
- DAG.getBitcast(MVT::i32, V),
- DAG.getVectorIdxConstant(0, DL));
- return DAG.getBitcast(MVT::v16i8, V);
- }
- V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
- ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
- : DAG.getUNDEF(MVT::v4i8));
- }
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
- DAG.getUNDEF(MVT::v8i8));
- };
-
- // vXi8 mul reduction - promote to vXi16 mul reduction.
- if (Opc == ISD::MUL) {
- if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
- return SDValue();
- if (VecVT.getSizeInBits() >= 128) {
- EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
- SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
- SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
- Lo = DAG.getBitcast(WideVT, Lo);
- Hi = DAG.getBitcast(WideVT, Hi);
- Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
- while (Rdx.getValueSizeInBits() > 128) {
- std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
- Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
- }
- } else {
- Rdx = WidenToV16I8(Rdx, false);
- Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
- Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
- }
- if (NumElts >= 8)
- Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
- DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
- {4, 5, 6, 7, -1, -1, -1, -1}));
- Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
- DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
- {2, 3, -1, -1, -1, -1, -1, -1}));
- Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
- DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
- {1, -1, -1, -1, -1, -1, -1, -1}));
- Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
- }
-
- // vXi8 add reduction - sub 128-bit vector.
- if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
- Rdx = WidenToV16I8(Rdx, true);
- Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
- DAG.getConstant(0, DL, MVT::v16i8));
- Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
- }
-
- // Must be a >=128-bit vector with pow2 elements.
- if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
- return SDValue();
-
- // vXi8 add reduction - sum lo/hi halves then use PSADBW.
- if (VT == MVT::i8) {
- while (Rdx.getValueSizeInBits() > 128) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
- VecVT = Lo.getValueType();
- Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
- }
- assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
-
- SDValue Hi = DAG.getVectorShuffle(
- MVT::v16i8, DL, Rdx, Rdx,
- {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
- Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
- Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
- getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
- Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
- }
-
- // See if we can use vXi8 PSADBW add reduction for larger zext types.
- // If the source vector values are 0-255, then we can use PSADBW to
- // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
- // TODO: See if its worth avoiding vXi16/i32 truncations?
- if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
- DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
- (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
- Subtarget.hasAVX512())) {
- if (Rdx.getValueType() == MVT::v8i16) {
- Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
- DAG.getUNDEF(MVT::v8i16));
- } else {
- EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
- Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
- if (ByteVT.getSizeInBits() < 128)
- Rdx = WidenToV16I8(Rdx, true);
- }
-
- // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
- auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
- SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
- return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
- };
- MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
- Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
-
- // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
- while (Rdx.getValueSizeInBits() > 128) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
- VecVT = Lo.getValueType();
- Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
- }
- assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
-
- if (NumElts > 8) {
- SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
- Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
- }
-
- VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
- Rdx = DAG.getBitcast(VecVT, Rdx);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
- }
-
- // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
- if (!shouldUseHorizontalOp(true, DAG, Subtarget))
- return SDValue();
-
- unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
-
- // 256-bit horizontal instructions operate on 128-bit chunks rather than
- // across the whole vector, so we need an extract + hop preliminary stage.
- // This is the only step where the operands of the hop are not the same value.
- // TODO: We could extend this to handle 512-bit or even longer vectors.
- if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
- ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
- unsigned NumElts = VecVT.getVectorNumElements();
- SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
- SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
- Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
- VecVT = Rdx.getValueType();
- }
- if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
- !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
- return SDValue();
-
- // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
- unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
- for (unsigned i = 0; i != ReductionSteps; ++i)
- Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
-
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+ return createArithReduction(Rdx, VT, SDLoc(ExtElt), Opc, Subtarget, DAG);
}
/// Detect vector gather/scatter index generation and convert it from being a
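The vXi8 add-reduction paths moved into createArithReduction rely on PSADBW against an all-zero vector: per 64-bit lane, the sum of absolute differences between eight bytes and zero is exactly the sum of those bytes, zero-extended into the lane, which is what both the v4i8/v8i8 path and the wider zero-extended paths exploit. A scalar model of that building block (illustrative sketch only, not PR code):

  #include <cassert>
  #include <cstdint>

  // psadbw(src, 0), one 64-bit lane: |b0-0| + ... + |b7-0| = b0 + ... + b7,
  // zero-extended into the lane.  Eight bytes sum to at most 2040, so the
  // lane sum is always exact; for an i8-typed reduction only the low byte
  // of that sum is extracted afterwards.
  static uint64_t psadbwLaneVsZero(const uint8_t Bytes[8]) {
    uint64_t Sum = 0;
    for (int I = 0; I < 8; ++I)
      Sum += Bytes[I];
    return Sum;
  }

  int main() {
    uint8_t Lane[8] = {255, 255, 255, 255, 255, 255, 255, 255};
    assert(psadbwLaneVsZero(Lane) == 8 * 255);
    return 0;
  }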
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 5cb6b3e493a32..a2e746ed106fd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1058,6 +1058,11 @@ namespace llvm {
/// functions.
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
const MachineFunction &MF);
+
+ /// True if the target supports a fast implementation for the specific
+ /// operation and vector type combination.
+ bool isVectorReductionFast(const X86Subtarget &Subtarget, ISD::NodeType VectorOp, MVT VT);
+
} // end namespace X86
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index e753946d0e19b..7b915cb575c89 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -6576,21 +6576,41 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
}
bool llvm::X86TTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+ ISD::NodeType OpCode;
switch (II->getIntrinsicID()) {
- default:
- return true;
-
+ case Intrinsic::vector_reduce_add:
+ OpCode = ISD::VECREDUCE_ADD;
+ break;
+ case Intrinsic::vector_reduce_fadd:
+ OpCode = ISD::VECREDUCE_FADD;
+ break;
+ case Intrinsic::vector_reduce_mul:
+ OpCode = ISD::VECREDUCE_MUL;
+ break;
case Intrinsic::vector_reduce_umin:
+ OpCode = ISD::VECREDUCE_UMIN;
+ break;
case Intrinsic::vector_reduce_umax:
+ OpCode = ISD::VECREDUCE_UMAX;
+ break;
case Intrinsic::vector_reduce_smin:
+ OpCode = ISD::VECREDUCE_SMIN;
+ break;
case Intrinsic::vector_reduce_smax:
- auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
- auto SType = VType->getScalarType();
- bool CanUsePHMINPOSUW = ST->hasSSE41() && II->getType() == SType &&
- (VType->getPrimitiveSizeInBits() % 128) == 0 &&
- (SType->isIntegerTy(8) || SType->isIntegerTy(16));
- return !CanUsePHMINPOSUW;
+ OpCode = ISD::VECREDUCE_SMAX;
+ break;
+
+ default:
+ return true;
}
+
+ auto *VType = dyn_cast<FixedVectorType>(
+ II->getOperand(II->getIntrinsicID() == Intrinsic::vector_reduce_fadd ? 1
+ : 0)
+ ->getType());
+ auto VT = EVT::getEVT(VType).getSimpleVT();
+
+ return !X86::isVectorReductionFast(*ST, OpCode, VT);
}
bool X86TTIImpl::prefersVectorizedAddressing() const {
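One subtlety in the updated hook: llvm.vector.reduce.fadd takes the start/accumulator value as operand 0 and the vector as operand 1, while the integer reductions take the vector as operand 0 -- that is what the `? 1 : 0` above selects. A hedged IRBuilder-side sketch of the two call shapes (assuming the standard CreateAddReduce/CreateFAddReduce helpers; not part of the patch):

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  static void buildReductions(IRBuilder<> &B, Value *IntVec, Value *FPVec) {
    // llvm.vector.reduce.add(<N x iK> %v)           -- vector is operand 0.
    Value *IntSum = B.CreateAddReduce(IntVec);
    // llvm.vector.reduce.fadd(T %start, <N x T> %v) -- vector is operand 1.
    Value *FPSum = B.CreateFAddReduce(
        ConstantFP::get(FPVec->getType()->getScalarType(), 0.0), FPVec);
    (void)IntSum;
    (void)FPSum;
  }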
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 0afc4f784bc5e..438d9ccd4759a 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -833,34 +833,34 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
-; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm3, %xmm5
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSSE3-FAST-NEXT: movaps %xmm2, %xmm6
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm2, %xmm7
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm8
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm1[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm9
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1]
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm10
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm0[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm11
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: addss %xmm11, %xmm0
+; SSSE3-FAST-NEXT: addss %xmm10, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
+; SSSE3-FAST-NEXT: addss %xmm9, %xmm1
+; SSSE3-FAST-NEXT: addss %xmm8, %xmm1
+; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
+; SSSE3-FAST-NEXT: addss %xmm7, %xmm2
+; SSSE3-FAST-NEXT: addss %xmm6, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm3
+; SSSE3-FAST-NEXT: addss %xmm5, %xmm3
; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
@@ -899,28 +899,28 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
-; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm7 = xmm2[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm9 = xmm1[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm11 = xmm0[1,0]
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vaddss %xmm0, %xmm11, %xmm0
+; AVX-FAST-NEXT: vaddss %xmm0, %xmm10, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm1, %xmm9, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm1, %xmm8, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm7, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm6, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm5, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT: retq
%5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
@@ -964,21 +964,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSSE3-FAST-NEXT: addps %xmm4, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSSE3-FAST-NEXT: addps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
@@ -1002,17 +990,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
%5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
%6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
@@ -1051,17 +1031,9 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
;
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
-; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
-; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
+; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
@@ -1089,17 +1061,9 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
;
; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
diff --git a/llvm/test/CodeGen/X86/optimize-reduction.ll b/llvm/test/CodeGen/X86/optimize-reduction.ll
index 4e9732882d2bb..93ea5142f2fdc 100644
--- a/llvm/test/CodeGen/X86/optimize-reduction.ll
+++ b/llvm/test/CodeGen/X86/optimize-reduction.ll
@@ -89,6 +89,7 @@ define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 4898ae98faea2..9002317a03ca6 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -1006,20 +1006,34 @@ define i16 @test_v16i16_v16i8(<16 x i16> %a0) {
; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
+; AVX1-SLOW-LABEL: test_v16i16_v16i8:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-SLOW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-SLOW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v16i16_v16i8:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-FAST-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16i16_v16i8:
; AVX2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
index 937ac3d2db885..ff371322640c2 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
@@ -220,13 +220,21 @@ define i32 @test_v4i32(<4 x i8> %a0) {
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: test_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4i32:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4i32:
; AVX2: # %bb.0:
@@ -257,12 +265,37 @@ define i32 @test_v8i32_v8i8(<8 x i8> %a0) {
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8i32_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v8i32_v8i8:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v8i32_v8i8:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v8i32_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v8i32_v8i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: retq
%1 = zext <8 x i8> %a0 to <8 x i32>
%2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
ret i32 %2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
index 606beeaff750e..36ccecfce64bc 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
@@ -69,16 +69,38 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4f32:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4f32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
@@ -137,25 +159,65 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v8f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v8f32:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v8f32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm7 = xmm1[1,0]
+; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm8 = xmm1[1,1,3,3]
+; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm0, %xmm8, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm7, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm6, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm5, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v8f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
@@ -411,11 +473,11 @@ define float @test_v4f32_zero(<4 x float> %a0) {
;
; AVX1-FAST-LABEL: test_v4f32_zero:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32_zero:
@@ -503,19 +565,19 @@ define float @test_v8f32_zero(<8 x float> %a0) {
;
; AVX1-FAST-LABEL: test_v8f32_zero:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1
-; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm6, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm5, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
@@ -830,15 +892,35 @@ define float @test_v4f32_undef(<4 x float> %a0) {
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v4f32_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4f32_undef:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4f32_undef:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-FAST-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v4f32_undef:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
@@ -894,24 +976,62 @@ define float @test_v8f32_undef(<8 x float> %a0) {
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v8f32_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v8f32_undef:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v8f32_undef:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-FAST-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm6, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm5, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v8f32_undef:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
@@ -1094,12 +1214,26 @@ define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v2f64:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v2f64:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
@@ -1122,17 +1256,41 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4f64:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4f64:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
@@ -1398,11 +1556,11 @@ define double @test_v4f64_zero(<4 x double> %a0) {
;
; AVX1-FAST-LABEL: test_v4f64_zero:
; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
@@ -1720,16 +1878,38 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4f64_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4f64_undef:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4f64_undef:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v4f64_undef:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
>From 0bdc1227650b15548e2a276ea454963d50bc4e2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Suhajda=20Tam=C3=A1s?= <sutajo at gmail.com>
Date: Thu, 19 Jun 2025 09:14:19 +0200
Subject: [PATCH 8/8] Remove redundant check
---
CMakeFiles/3.21.3/CMakeSystem.cmake | 15 ++++++++++++
CMakeFiles/3.21.3/VCTargetsPath.vcxproj | 31 +++++++++++++++++++++++++
CMakeFiles/CMakeOutput.log | 1 +
llvm/.gitignore | 1 +
llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +--
llvm/updatetest.bat | 8 +++++++
6 files changed, 57 insertions(+), 2 deletions(-)
create mode 100644 CMakeFiles/3.21.3/CMakeSystem.cmake
create mode 100644 CMakeFiles/3.21.3/VCTargetsPath.vcxproj
create mode 100644 CMakeFiles/CMakeOutput.log
create mode 100644 llvm/updatetest.bat
diff --git a/CMakeFiles/3.21.3/CMakeSystem.cmake b/CMakeFiles/3.21.3/CMakeSystem.cmake
new file mode 100644
index 0000000000000..000cf69875ef8
--- /dev/null
+++ b/CMakeFiles/3.21.3/CMakeSystem.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_HOST_SYSTEM "Windows-10.0.26100")
+set(CMAKE_HOST_SYSTEM_NAME "Windows")
+set(CMAKE_HOST_SYSTEM_VERSION "10.0.26100")
+set(CMAKE_HOST_SYSTEM_PROCESSOR "AMD64")
+
+
+
+set(CMAKE_SYSTEM "Windows-10.0.26100")
+set(CMAKE_SYSTEM_NAME "Windows")
+set(CMAKE_SYSTEM_VERSION "10.0.26100")
+set(CMAKE_SYSTEM_PROCESSOR "AMD64")
+
+set(CMAKE_CROSSCOMPILING "FALSE")
+
+set(CMAKE_SYSTEM_LOADED 1)
diff --git a/CMakeFiles/3.21.3/VCTargetsPath.vcxproj b/CMakeFiles/3.21.3/VCTargetsPath.vcxproj
new file mode 100644
index 0000000000000..ccb7d6259f3d4
--- /dev/null
+++ b/CMakeFiles/3.21.3/VCTargetsPath.vcxproj
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{F3FC6D86-508D-3FB1-96D2-995F08B142EC}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <Platform>x64</Platform>
+ <WindowsTargetPlatformVersion>10.0.22621.0</WindowsTargetPlatformVersion>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props"/>
+ <PropertyGroup>
+ <PreferredToolArchitecture>x64</PreferredToolArchitecture>
+ </PropertyGroup>
+ <PropertyGroup Label="Configuration">
+ <ConfigurationType>Utility</ConfigurationType>
+ <CharacterSet>MultiByte</CharacterSet>
+ <PlatformToolset>v143</PlatformToolset>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props"/>
+ <ItemDefinitionGroup>
+ <PostBuildEvent>
+ <Command>echo VCTargetsPath=$(VCTargetsPath)</Command>
+ </PostBuildEvent>
+ </ItemDefinitionGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets"/>
+</Project>
diff --git a/CMakeFiles/CMakeOutput.log b/CMakeFiles/CMakeOutput.log
new file mode 100644
index 0000000000000..be1d66dec5a8f
--- /dev/null
+++ b/CMakeFiles/CMakeOutput.log
@@ -0,0 +1 @@
+The system is: Windows - 10.0.26100 - AMD64
diff --git a/llvm/.gitignore b/llvm/.gitignore
index 48dda423228c2..58c6730a8ca0c 100644
--- a/llvm/.gitignore
+++ b/llvm/.gitignore
@@ -40,6 +40,7 @@ autoconf/autom4te.cache
# Directories to ignore (do not add trailing '/'s, they skip symlinks).
#==============================================================================#
# External projects that are tracked independently.
+out
projects/*
!projects/*.*
!projects/Makefile
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cef1148504259..63524fa1b9290 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1443,8 +1443,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN, ISD::VECREDUCE_UMAX,
ISD::VECREDUCE_UMIN}) {
for (MVT VT : MVT::vector_valuetypes())
- if (VT.isFixedLengthVector() &&
- X86::isVectorReductionFast(Subtarget, VectorOp, VT))
+ if (X86::isVectorReductionFast(Subtarget, VectorOp, VT))
setOperationAction(VectorOp, VT, Custom);
}
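Context for the removed guard (an assumption based on the "Remove redundant
check" subject, not verified here): MVT::vector_valuetypes() also yields
scalable types such as nxv4i32, so dropping the isFixedLengthVector() test
relies on X86::isVectorReductionFast rejecting non-fixed types itself. An
illustrative reduction of that shape, which should therefore not be marked
Custom:

  ; Illustrative only; assumes the helper returns false for scalable VTs.
  define i32 @reduce_add_scalable(<vscale x 4 x i32> %v) {
    %r = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v)
    ret i32 %r
  }
  declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
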
diff --git a/llvm/updatetest.bat b/llvm/updatetest.bat
new file mode 100644
index 0000000000000..ffda5550f5f0b
--- /dev/null
+++ b/llvm/updatetest.bat
@@ -0,0 +1,8 @@
+python Q:/llvm-project/llvm/utils/update_llc_test_checks.py --force-update --llc-binary Q:\llvm-project\llvm\out\build\x64-Release\bin\llc Q:\llvm-project\llvm\test\CodeGen\X86\horizontal-sum.ll
+python Q:/llvm-project/llvm/utils/update_llc_test_checks.py --force-update --llc-binary Q:\llvm-project\llvm\out\build\x64-Release\bin\llc Q:\llvm-project\llvm\test\CodeGen\X86\optimize-reduction.ll
+python Q:/llvm-project/llvm/utils/update_llc_test_checks.py --force-update --llc-binary Q:\llvm-project\llvm\out\build\x64-Release\bin\llc Q:\llvm-project\llvm\test\CodeGen\X86\vector-reduce-add-mask.ll
+python Q:/llvm-project/llvm/utils/update_llc_test_checks.py --force-update --llc-binary Q:\llvm-project\llvm\out\build\x64-Release\bin\llc Q:\llvm-project\llvm\test\CodeGen\X86\vector-reduce-add-zext.ll
+python Q:/llvm-project/llvm/utils/update_llc_test_checks.py --force-update --llc-binary Q:\llvm-project\llvm\out\build\x64-Release\bin\llc Q:\llvm-project\llvm\test\CodeGen\X86\haddsub.ll
+python Q:/llvm-project/llvm/utils/update_llc_test_checks.py --force-update --llc-binary Q:\llvm-project\llvm\out\build\x64-Release\bin\llc Q:\llvm-project\llvm\test\CodeGen\X86\horizontal-sum.ll
+python Q:/llvm-project/llvm/utils/update_llc_test_checks.py --force-update --llc-binary Q:\llvm-project\llvm\out\build\x64-Release\bin\llc Q:\llvm-project\llvm\test\CodeGen\X86\vector-reduce-fadd-fast.ll
+python Q:/llvm-project/llvm/utils/update_llc_test_checks.py --force-update --llc-binary Q:\llvm-project\llvm\out\build\x64-Release\bin\llc Q:\llvm-project\llvm\test\CodeGen\X86\vector-reduce-fadd.ll
\ No newline at end of file