[llvm] [X86][AVX] Fix handling of out-of-bounds shift amounts in AVX2 vector logical shift nodes #83840 (PR #86922)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 11 23:03:33 PDT 2024
https://github.com/SahilPatidar updated https://github.com/llvm/llvm-project/pull/86922
>From 8c4e699979ca51abfcda3a711c93946017d8d68b Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001 at gmail.com>
Date: Tue, 26 Mar 2024 10:29:35 +0530
Subject: [PATCH 1/6] [X86][AVX] Fix handling of out-of-bounds shift amounts in
AVX2 vector logical shift nodes #83840
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 29 ++++++
llvm/test/CodeGen/X86/combine-srl.ll | 116 ++++++++++++++++++++++++
2 files changed, 145 insertions(+)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7b9e6c0a00273..61080006f48ec 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45950,6 +45950,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
+ if (N->getOpcode() == ISD::VSELECT && LHS.getOpcode() == ISD::SRL &&
+ supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
+ APInt SV;
+ if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == LHS.getOperand(1) &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETULT &&
+ ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) &&
+ ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
+ SV == VT.getScalarSizeInBits()) {
+ SDLoc DL(LHS);
+ return DAG.getNode(X86ISD::VSRLV, DL, LHS->getVTList(), LHS.getOperand(0), LHS.getOperand(1));
+ }
+ }
+
// Early exit check
if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
return SDValue();
@@ -47825,6 +47838,22 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
return V;
+ if (N0.getOpcode() == ISD::VSELECT &&
+ supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
+ SDValue Cond = N0.getOperand(0);
+ SDValue N00 = N0.getOperand(1);
+ SDValue N01 = N0.getOperand(2);
+ APInt SV;
+ if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == N1 &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETULT &&
+ ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) &&
+ ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
+ SV == VT.getScalarSizeInBits()) {
+ SDLoc DL(N);
+ return DAG.getNode(X86ISD::VSRLV, DL, N->getVTList(), N00, N1);
+ }
+ }
+
// Only do this on the last DAG combine as it can interfere with other
// combines.
if (!DCI.isAfterLegalizeDAG())
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 33649e6d87b91..eeeff2f8eb25b 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -606,3 +606,119 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
%3 = lshr <4 x i32> %x, %2
ret <4 x i32> %3
}
+
+define <4 x i32> @combine_vec_lshr_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_lshr_clamped1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrld %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrld %xmm3, %xmm5
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrld %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm1, %xmm0
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,3]
+; SSE2-NEXT: pandn %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_lshr_clamped1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrld %xmm3, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm6
+; SSE41-NEXT: psrld %xmm5, %xmm6
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrld %xmm1, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec_lshr_clamped1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %shr = lshr <4 x i32> %sh, %amt
+ %1 = select <4 x i1> %cmp.i, <4 x i32> %shr, <4 x i32> zeroinitializer
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @combine_vec_lshr_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_lshr_clamped2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrld %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrld %xmm0, %xmm3
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrld %xmm0, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm0, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_lshr_clamped2:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrld %xmm3, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrld %xmm3, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm2, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec_lshr_clamped2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %1 = select <4 x i1> %cmp.i, <4 x i32> %sh, <4 x i32> zeroinitializer
+ %shr = lshr <4 x i32> %1, %amt
+ ret <4 x i32> %shr
+}
>From e0be66b8c030d44a7843a5715a99688fe0e9f2c1 Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001 at gmail.com>
Date: Mon, 8 Apr 2024 21:49:03 +0530
Subject: [PATCH 2/6] Add and update code and test for shl
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 41 +++-
llvm/test/CodeGen/X86/combine-srl.ll | 299 ++++++++++++++++++++++++
2 files changed, 335 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 61080006f48ec..2888a367dd8b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45950,16 +45950,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
- if (N->getOpcode() == ISD::VSELECT && LHS.getOpcode() == ISD::SRL &&
- supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
+ // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
+ // with out-of-bounds clamping.
+
+ // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
+ // shift amounts exceeding the element bitwidth. VSHLV clamps the amount to
+ // bitwidth-1 for unsigned shifts, effectively performing a maximum left shift
+ // of bitwidth-1 positions. Similarly, VSRLV returns zero for unsigned shifts
+ // exceeding bitwidth-1, achieving a maximum right shift of bitwidth-1.
+ if (N->getOpcode() == ISD::VSELECT &&
+ (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
+ supportedVectorVarShift(VT, Subtarget, LHS.getOpcode())) {
APInt SV;
- if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == LHS.getOperand(1) &&
+ if (Cond.getOpcode() == ISD::SETCC &&
+ Cond.getOperand(0) == LHS.getOperand(1) &&
cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETULT &&
ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) &&
ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
SV == VT.getScalarSizeInBits()) {
- SDLoc DL(LHS);
- return DAG.getNode(X86ISD::VSRLV, DL, LHS->getVTList(), LHS.getOperand(0), LHS.getOperand(1));
+ return DAG.getNode(
+ LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV : X86ISD::VSHLV, DL,
+ LHS->getVTList(), LHS.getOperand(0), LHS.getOperand(1));
}
}
@@ -47720,6 +47731,24 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
EVT VT = N0.getValueType();
+ // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
+ // with out-of-bounds clamping.
+ if (N0.getOpcode() == ISD::VSELECT &&
+ supportedVectorVarShift(VT, DAG.getSubtarget<X86Subtarget>(), ISD::SHL)) {
+ SDValue Cond = N0.getOperand(0);
+ SDValue N00 = N0.getOperand(1);
+ SDValue N01 = N0.getOperand(2);
+ APInt SV;
+ if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == N1 &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETULT &&
+ ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) &&
+ ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
+ SV == VT.getScalarSizeInBits()) {
+ SDLoc DL(N);
+ return DAG.getNode(X86ISD::VSHLV, DL, N->getVTList(), N00, N1);
+ }
+ }
+
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
if (VT.isInteger() && !VT.isVector() &&
@@ -47838,6 +47867,8 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
return V;
+ // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
+ // with out-of-bounds clamping.
if (N0.getOpcode() == ISD::VSELECT &&
supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
SDValue Cond = N0.getOperand(0);
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index eeeff2f8eb25b..641a3a815d55e 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -722,3 +722,302 @@ define <4 x i32> @combine_vec_lshr_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
%shr = lshr <4 x i32> %1, %amt
ret <4 x i32> %shr
}
+
+define <4 x i32> @combine_vec_lshr_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_lshr_commuted_clamped:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrld %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrld %xmm0, %xmm3
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrld %xmm0, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm0, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_lshr_commuted_clamped:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrld %xmm3, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrld %xmm3, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm2, %xmm3
+; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec_lshr_commuted_clamped:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
+ %shr = lshr <4 x i32> %1, %amt
+ ret <4 x i32> %shr
+}
+
+define <4 x i32> @combine_vec_lshr_commuted_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_lshr_commuted_clamped1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrld %xmm2, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrld %xmm2, %xmm4
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrld %xmm3, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm2, %xmm0
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm5[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_lshr_commuted_clamped1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psrld %xmm2, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psrld %xmm4, %xmm5
+; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psrld %xmm3, %xmm4
+; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE41-NEXT: psrld %xmm2, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_lshr_commuted_clamped1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_lshr_commuted_clamped1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %shr = lshr <4 x i32> %sh, %amt
+ %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shr
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @combine_vec_shl_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_shl_clamped1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_shl_clamped1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec_shl_clamped1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %shr = shl <4 x i32> %sh, %amt
+ %1 = select <4 x i1> %cmp.i, <4 x i32> %shr, <4 x i32> zeroinitializer
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @combine_vec_shl_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_shl_clamped2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_shl_clamped2:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec_shl_clamped2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %1 = select <4 x i1> %cmp.i, <4 x i32> %sh, <4 x i32> zeroinitializer
+ %shr = shl <4 x i32> %1, %amt
+ ret <4 x i32> %shr
+}
+
+define <4 x i32> @combine_vec_shl_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_shl_commuted_clamped:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_shl_commuted_clamped:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec_shl_commuted_clamped:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
+ %shr = shl <4 x i32> %1, %amt
+ ret <4 x i32> %shr
+}
+
+define <4 x i32> @combine_vec_shl_commuted_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_shl_commuted_clamped1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_shl_commuted_clamped1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_shl_commuted_clamped1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_commuted_clamped1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %shr = shl <4 x i32> %sh, %amt
+ %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shr
+ ret <4 x i32> %1
+}
>From bb912db8f847712dc13bb788db87b507d45ef3c0 Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001 at gmail.com>
Date: Tue, 9 Apr 2024 14:29:02 +0530
Subject: [PATCH 3/6] Fix code add Subtarget
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 7 ++---
llvm/test/CodeGen/X86/combine-srl.ll | 34 +++++++++++++++++++------
2 files changed, 30 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2888a367dd8b3..0840fd907729f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47725,7 +47725,8 @@ static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ExtOpc, DL, VT, Mulh);
}
-static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
@@ -47734,7 +47735,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
// Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
// with out-of-bounds clamping.
if (N0.getOpcode() == ISD::VSELECT &&
- supportedVectorVarShift(VT, DAG.getSubtarget<X86Subtarget>(), ISD::SHL)) {
+ supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
SDValue Cond = N0.getOperand(0);
SDValue N00 = N0.getOperand(1);
SDValue N01 = N0.getOperand(2);
@@ -57372,7 +57373,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
- case ISD::SHL: return combineShiftLeft(N, DAG);
+ case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 641a3a815d55e..8cfc0a1a32582 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -771,10 +771,19 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped(<4 x i32> %sh, <4 x i32> %am
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT: retq
;
-; AVX-LABEL: combine_vec_lshr_commuted_clamped:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_lshr_commuted_clamped:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_lshr_commuted_clamped:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
%1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
%shr = lshr <4 x i32> %1, %amt
@@ -961,10 +970,19 @@ define <4 x i32> @combine_vec_shl_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: combine_vec_shl_commuted_clamped:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: combine_vec_shl_commuted_clamped:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_commuted_clamped:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
%cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
%1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
%shr = shl <4 x i32> %1, %amt
>From 726dd30054189f8b564b657a58604341043c018b Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001 at gmail.com>
Date: Sun, 5 May 2024 10:30:22 +0530
Subject: [PATCH 4/6] Modify code: N->getVTList() to VT
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0840fd907729f..afd4a5957da67 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45954,10 +45954,10 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// with out-of-bounds clamping.
// Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
- // shift amounts exceeding the element bitwidth. VSHLV clamps the amount to
+ // shift amounts exceeding the element bitwidth. VSHLV/VSRLV clamps the amount to
// bitwidth-1 for unsigned shifts, effectively performing a maximum left shift
- // of bitwidth-1 positions. Similarly, VSRLV returns zero for unsigned shifts
- // exceeding bitwidth-1, achieving a maximum right shift of bitwidth-1.
+ // of bitwidth-1 positions. and returns zero for unsigned right shifts
+ // exceeding bitwidth-1.
if (N->getOpcode() == ISD::VSELECT &&
(LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
supportedVectorVarShift(VT, Subtarget, LHS.getOpcode())) {
@@ -45970,7 +45970,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SV == VT.getScalarSizeInBits()) {
return DAG.getNode(
LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV : X86ISD::VSHLV, DL,
- LHS->getVTList(), LHS.getOperand(0), LHS.getOperand(1));
+ VT, LHS.getOperand(0), LHS.getOperand(1));
}
}
@@ -47746,7 +47746,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
SV == VT.getScalarSizeInBits()) {
SDLoc DL(N);
- return DAG.getNode(X86ISD::VSHLV, DL, N->getVTList(), N00, N1);
+ return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
}
}
@@ -47882,7 +47882,7 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
SV == VT.getScalarSizeInBits()) {
SDLoc DL(N);
- return DAG.getNode(X86ISD::VSRLV, DL, N->getVTList(), N00, N1);
+ return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
}
}
>From f1102b197dc86f17a9cd596ae83f5f2fbc0c5f9a Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001 at gmail.com>
Date: Mon, 6 May 2024 14:23:57 +0530
Subject: [PATCH 5/6] fixes code format
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index afd4a5957da67..a9403fffae54b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45954,9 +45954,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// with out-of-bounds clamping.
// Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
- // shift amounts exceeding the element bitwidth. VSHLV/VSRLV clamps the amount to
- // bitwidth-1 for unsigned shifts, effectively performing a maximum left shift
- // of bitwidth-1 positions. and returns zero for unsigned right shifts
+ // shift amounts exceeding the element bitwidth. VSHLV/VSRLV do not clamp
+ // or wrap the amount; instead each element of the result is set to zero
+ // whenever its (unsigned) shift amount is out of range, i.e. for amounts
// exceeding bitwidth-1.
if (N->getOpcode() == ISD::VSELECT &&
(LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
@@ -45968,9 +45968,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) &&
ISD::isConstantSplatVectorAllZeros(RHS.getNode()) &&
SV == VT.getScalarSizeInBits()) {
- return DAG.getNode(
- LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV : X86ISD::VSHLV, DL,
- VT, LHS.getOperand(0), LHS.getOperand(1));
+ return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
+ : X86ISD::VSHLV,
+ DL, VT, LHS.getOperand(0), LHS.getOperand(1));
}
}
>From 38858d60e221651088651a6b250179aa498d6c38 Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001 at gmail.com>
Date: Wed, 12 Jun 2024 08:34:28 +0530
Subject: [PATCH 6/6] Move tests to `combine-shl.ll`
---
llvm/test/CodeGen/X86/combine-shl.ll | 184 ++++++++++++++++++++++++++
llvm/test/CodeGen/X86/combine-srl.ll | 186 +--------------------------
2 files changed, 185 insertions(+), 185 deletions(-)
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 5472e1e6c0833..c5ce1e0046ad0 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -929,3 +929,187 @@ define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0) {
%3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
ret <4 x i32> %3
}
+
+define <4 x i32> @combine_vec_shl_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_shl_clamped1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_shl_clamped1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec_shl_clamped1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %shl = shl <4 x i32> %sh, %amt
+ %1 = select <4 x i1> %cmp.i, <4 x i32> %shl, <4 x i32> zeroinitializer
+ ret <4 x i32> %1
+}
+
+define <4 x i32> @combine_vec_shl_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_shl_clamped2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_shl_clamped2:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: combine_vec_shl_clamped2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %1 = select <4 x i1> %cmp.i, <4 x i32> %sh, <4 x i32> zeroinitializer
+ %shl = shl <4 x i32> %1, %amt
+ ret <4 x i32> %shl
+}
+
+define <4 x i32> @combine_vec_shl_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_shl_commuted_clamped:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_shl_commuted_clamped:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_shl_commuted_clamped:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_commuted_clamped:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
+ %shl = shl <4 x i32> %1, %amt
+ ret <4 x i32> %shl
+}
+
+define <4 x i32> @combine_vec_shl_commuted_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
+; SSE2-LABEL: combine_vec_shl_commuted_clamped1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_vec_shl_commuted_clamped1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
+; SSE41-NEXT: pminud %xmm1, %xmm2
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pslld $23, %xmm1
+; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE41-NEXT: pmulld %xmm1, %xmm0
+; SSE41-NEXT: pand %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_shl_commuted_clamped1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
+; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_vec_shl_commuted_clamped1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
+ %shl = shl <4 x i32> %sh, %amt
+ %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shl
+ ret <4 x i32> %1
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 8cfc0a1a32582..78dcf6e743400 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -854,188 +854,4 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped1(<4 x i32> %sh, <4 x i32> %a
%shr = lshr <4 x i32> %sh, %amt
%1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shr
ret <4 x i32> %1
-}
-
-define <4 x i32> @combine_vec_shl_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
-; SSE2-LABEL: combine_vec_shl_clamped1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: combine_vec_shl_clamped1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
-; SSE41-NEXT: pminud %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: combine_vec_shl_clamped1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
- %shr = shl <4 x i32> %sh, %amt
- %1 = select <4 x i1> %cmp.i, <4 x i32> %shr, <4 x i32> zeroinitializer
- ret <4 x i32> %1
-}
-
-define <4 x i32> @combine_vec_shl_clamped2(<4 x i32> %sh, <4 x i32> %amt) {
-; SSE2-LABEL: combine_vec_shl_clamped2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: combine_vec_shl_clamped2:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
-; SSE41-NEXT: pminud %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: combine_vec_shl_clamped2:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %cmp.i = icmp ult <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
- %1 = select <4 x i1> %cmp.i, <4 x i32> %sh, <4 x i32> zeroinitializer
- %shr = shl <4 x i32> %1, %amt
- ret <4 x i32> %shr
-}
-
-define <4 x i32> @combine_vec_shl_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt) {
-; SSE2-LABEL: combine_vec_shl_commuted_clamped:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: combine_vec_shl_commuted_clamped:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
-; SSE41-NEXT: pminud %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX2-LABEL: combine_vec_shl_commuted_clamped:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
-; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: combine_vec_shl_commuted_clamped:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
- %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
- %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
- %shr = shl <4 x i32> %1, %amt
- ret <4 x i32> %shr
-}
-
-define <4 x i32> @combine_vec_shl_commuted_clamped1(<4 x i32> %sh, <4 x i32> %amt) {
-; SSE2-LABEL: combine_vec_shl_commuted_clamped1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: combine_vec_shl_commuted_clamped1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd {{.*#+}} xmm2 = [31,31,31,31]
-; SSE41-NEXT: pminud %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX2-LABEL: combine_vec_shl_commuted_clamped1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpminud %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: combine_vec_shl_commuted_clamped1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
- %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
- %shr = shl <4 x i32> %sh, %amt
- %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shr
- ret <4 x i32> %1
-}
+}
\ No newline at end of file
More information about the llvm-commits
mailing list