[llvm] [X86] Improve variable 8-bit shifts on AVX512BW (PR #164136)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 4 09:45:06 PST 2025
https://github.com/Sp00ph updated https://github.com/llvm/llvm-project/pull/164136
>From 9f773b82465fdae7501adc11268c3f3a873fab00 Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling at gmail.com>
Date: Sun, 19 Oct 2025 00:55:36 +0200
Subject: [PATCH 1/4] Improve variable 8-bit shifts on AVX512BW
The existing implementation used three shifts by an immediate followed
by selects. This commit changes the implementation to use two variable
16-bit shifts instead.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 70 +++++++++++++++++
llvm/test/CodeGen/X86/gfni-shifts.ll | 75 +++++++------------
.../test/CodeGen/X86/vector-shift-ashr-512.ll | 39 +++-------
.../test/CodeGen/X86/vector-shift-lshr-512.ll | 25 +++----
llvm/test/CodeGen/X86/vector-shift-shl-512.ll | 19 ++---
5 files changed, 123 insertions(+), 105 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b05d7c7fd7da3..e829456a83c77 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30968,6 +30968,76 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}
+ if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) {
+ // On AVX512BW, we can use variable 16-bit shifts to implement variable
+ // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi.
+ // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane
+ // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors
+ // can efficiently be merged together using a masked move.
+ MVT ExtVT = MVT::v32i16;
+
+ // When used in a vectorshuffle, selects even-index lanes from the first
+ // vector and odd index lanes from the second vector.
+ SmallVector<int, 64> InterleaveIndices;
+ for (unsigned i = 0; i < 64; ++i) {
+ unsigned offset = (i % 2 == 0) ? 0 : 64;
+ InterleaveIndices.push_back(i + offset);
+ }
+
+ SDValue zero = DAG.getConstant(0, dl, VT);
+ SDValue eight = DAG.getTargetConstant(8, dl, MVT::i8);
+ SDValue RLo, RHi;
+
+ // Isolate lower and upper lanes of Amt by shuffling zeros into AmtLo and
+ // right shifting AmtHi.
+ SDValue AmtLo = DAG.getBitcast(
+ ExtVT, DAG.getVectorShuffle(VT, dl, Amt, zero, InterleaveIndices));
+ SDValue AmtHi = DAG.getNode(X86ISD::VSRLI, dl, ExtVT,
+ DAG.getBitcast(ExtVT, Amt), eight);
+ unsigned int ShiftOp;
+ switch (Opc) {
+ case ISD::SHL:
+ // Because we shift left, no bits from the high half can influence the low
+ // half, so we don't need to mask RLo. We do however need to mask RHi, to
+ // prevent high bits of an even lane overflowing into low bits of an odd
+ // lane.
+ RLo = DAG.getBitcast(ExtVT, R);
+ RHi = DAG.getBitcast(
+ ExtVT, DAG.getVectorShuffle(VT, dl, zero, R, InterleaveIndices));
+ ShiftOp = X86ISD::VSHLV;
+ break;
+ case ISD::SRL:
+ // Same idea as above, but this time we need to make sure no low bits of
+ // an odd lane can overflow into high bits of an even lane.
+ RLo = DAG.getBitcast(
+ ExtVT, DAG.getVectorShuffle(VT, dl, R, zero, InterleaveIndices));
+ RHi = DAG.getBitcast(ExtVT, R);
+ ShiftOp = X86ISD::VSRLV;
+ break;
+ case ISD::SRA:
+ // For arithmetic right shifts, we want to sign extend each even lane of R
+ // such that the upper half of the corresponding lane of RLo is 0 or -1
+ // depending on the sign bit of the original lane. We do this using 2
+ // immediate shifts.
+ RHi = DAG.getBitcast(ExtVT, R);
+ RLo = DAG.getNode(X86ISD::VSHLI, dl, ExtVT, RHi, eight);
+ RLo = DAG.getNode(X86ISD::VSRAI, dl, ExtVT, RLo, eight);
+ ShiftOp = X86ISD::VSRAV;
+ break;
+ default:
+ llvm_unreachable("Unexpected Shift Op");
+ return SDValue();
+ }
+
+ SDValue ShiftedLo =
+ DAG.getBitcast(VT, DAG.getNode(ShiftOp, dl, ExtVT, RLo, AmtLo));
+ SDValue ShiftedHi =
+ DAG.getBitcast(VT, DAG.getNode(ShiftOp, dl, ExtVT, RHi, AmtHi));
+
+ return DAG.getVectorShuffle(VT, dl, ShiftedLo, ShiftedHi,
+ InterleaveIndices);
+ }
+
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index feac3dcad243a..abcf7ce6aa098 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
+; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm3, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -1876,15 +1875,16 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_lshr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; GFNIAVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -2232,36 +2232,15 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_ashr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 0fb0420bb2609..d4bb8835c5d9e 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,15 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d5702fb93a..38ac0f8ea6f8a 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,21 +85,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd742956ed09..1fca3a2682b21 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
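
For reference, here is a small standalone C++ model of the lane-splitting trick the patch's comments describe for the SHL case: bitcast the v64i8 input to v32i16, shift the even bytes with one variable 16-bit shift and the odd bytes (low byte cleared) with another, then take even result bytes from the first shift and odd result bytes from the second. This is not LLVM code; the function names and the exhaustive two-byte test harness are purely illustrative, and it assumes x86's little-endian lane order (even byte in the low half of each 16-bit lane).

// Scalar model of the new v64i8 SHL lowering, reduced to one 16-bit lane
// holding two adjacent bytes. Not LLVM code; purely illustrative.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Emulates the patch's strategy for one 16-bit lane holding bytes
// (REven, ROdd), shifted by amounts (AEven, AOdd).
static void ShlPairViaI16(uint8_t REven, uint8_t ROdd, uint8_t AEven,
                          uint8_t AOdd, uint8_t &OutEven, uint8_t &OutOdd) {
  uint16_t R = uint16_t(REven | (ROdd << 8));   // bitcast v64i8 -> v32i16
  uint16_t Amt = uint16_t(AEven | (AOdd << 8));
  uint16_t RLo = R;                 // SHL: the even byte needs no masking
  uint16_t RHi = R & 0xff00;        // clear the even byte so its bits cannot
                                    // be shifted up into the odd byte
  uint16_t AmtLo = Amt & 0x00ff;    // even-byte shift amount
  uint16_t AmtHi = Amt >> 8;        // odd-byte shift amount
  uint16_t ShiftedLo = uint16_t(RLo << AmtLo);  // one VPSLLVW
  uint16_t ShiftedHi = uint16_t(RHi << AmtHi);  // the other VPSLLVW
  OutEven = uint8_t(ShiftedLo);      // even result bytes come from ShiftedLo
  OutOdd = uint8_t(ShiftedHi >> 8);  // odd result bytes come from ShiftedHi
}

int main() {
  for (unsigned r = 0; r < 256; ++r)
    for (unsigned a = 0; a < 8; ++a) {
      uint8_t E, O;
      ShlPairViaI16(uint8_t(r), uint8_t(255 - r), uint8_t(a), uint8_t(7 - a),
                    E, O);
      assert(E == uint8_t(r << a));                // per-byte reference
      assert(O == uint8_t((255 - r) << (7 - a)));  // per-byte reference
    }
  puts("two v32i16 shifts reproduce the per-byte v64i8 shl");
  return 0;
}

The SRL and SRA cases follow the same pattern, differing only in which half gets masked or sign-extended before the 16-bit shift.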
>From c113f2bbb48c79d5937c900fcdb4061bc1b322ce Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling at gmail.com>
Date: Sun, 19 Oct 2025 15:46:36 +0200
Subject: [PATCH 2/4] Changes according to review
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 40 ++++++++-----------
llvm/test/CodeGen/X86/gfni-shifts.ll | 22 +++++-----
.../test/CodeGen/X86/vector-shift-ashr-512.ll | 5 ++-
.../test/CodeGen/X86/vector-shift-lshr-512.ll | 7 ++--
llvm/test/CodeGen/X86/vector-shift-shl-512.ll | 10 ++---
5 files changed, 38 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e829456a83c77..03eee57701119 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30976,24 +30976,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// can efficiently be merged together using a masked move.
MVT ExtVT = MVT::v32i16;
- // When used in a vectorshuffle, selects even-index lanes from the first
- // vector and odd index lanes from the second vector.
- SmallVector<int, 64> InterleaveIndices;
- for (unsigned i = 0; i < 64; ++i) {
- unsigned offset = (i % 2 == 0) ? 0 : 64;
- InterleaveIndices.push_back(i + offset);
- }
-
- SDValue zero = DAG.getConstant(0, dl, VT);
- SDValue eight = DAG.getTargetConstant(8, dl, MVT::i8);
SDValue RLo, RHi;
-
- // Isolate lower and upper lanes of Amt by shuffling zeros into AmtLo and
+ // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and
// right shifting AmtHi.
- SDValue AmtLo = DAG.getBitcast(
- ExtVT, DAG.getVectorShuffle(VT, dl, Amt, zero, InterleaveIndices));
- SDValue AmtHi = DAG.getNode(X86ISD::VSRLI, dl, ExtVT,
- DAG.getBitcast(ExtVT, Amt), eight);
+ SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt),
+ DAG.getConstant(0x00ff, dl, ExtVT));
+ SDValue AmtHi = getTargetVShiftByConstNode(
+ X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG);
unsigned int ShiftOp;
switch (Opc) {
case ISD::SHL:
@@ -31002,16 +30991,16 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// prevent high bits of an even lane overflowing into low bits of an odd
// lane.
RLo = DAG.getBitcast(ExtVT, R);
- RHi = DAG.getBitcast(
- ExtVT, DAG.getVectorShuffle(VT, dl, zero, R, InterleaveIndices));
+ RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo,
+ DAG.getConstant(0xff00, dl, ExtVT));
ShiftOp = X86ISD::VSHLV;
break;
case ISD::SRL:
// Same idea as above, but this time we need to make sure no low bits of
// an odd lane can overflow into high bits of an even lane.
- RLo = DAG.getBitcast(
- ExtVT, DAG.getVectorShuffle(VT, dl, R, zero, InterleaveIndices));
RHi = DAG.getBitcast(ExtVT, R);
+ RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi,
+ DAG.getConstant(0x00ff, dl, ExtVT));
ShiftOp = X86ISD::VSRLV;
break;
case ISD::SRA:
@@ -31020,8 +31009,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// depending on the sign bit of the original lane. We do this using 2
// immediate shifts.
RHi = DAG.getBitcast(ExtVT, R);
- RLo = DAG.getNode(X86ISD::VSHLI, dl, ExtVT, RHi, eight);
- RLo = DAG.getNode(X86ISD::VSRAI, dl, ExtVT, RLo, eight);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG);
ShiftOp = X86ISD::VSRAV;
break;
default:
@@ -31034,8 +31023,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SDValue ShiftedHi =
DAG.getBitcast(VT, DAG.getNode(ShiftOp, dl, ExtVT, RHi, AmtHi));
- return DAG.getVectorShuffle(VT, dl, ShiftedLo, ShiftedHi,
- InterleaveIndices);
+ // To merge the shifted vectors back together, we select even lanes
+ // from ShiftedLo and odd lanes from ShiftedHi.
+ SDValue SelectMask = DAG.getBitcast(
+ MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64));
+ return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi);
}
if (VT == MVT::v16i8 ||
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index abcf7ce6aa098..30f1874c51fed 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -1684,12 +1684,12 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
-; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm3, %zmm2
-; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT: kmovq %rax, %k1
; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
@@ -1875,16 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_lshr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT: kmovq %rax, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -2238,9 +2237,10 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; GFNIAVX512BW-NEXT: kmovq %rax, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; GFNIAVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index d4bb8835c5d9e..aff2228c258b5 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -112,9 +112,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 38ac0f8ea6f8a..4450d07e01cca 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,16 +85,15 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index 1fca3a2682b21..41238acc4b74d 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,12 +82,12 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllvw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
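
The review round above replaces the vector-shuffle merge with a v64i1 select whose mask has every even bit set, which the backend materializes as a kmovq of 0x5555555555555555 followed by a masked vmovdqu8 (as seen in the updated CHECK lines). Below is a tiny standalone model of that merge step; it is illustrative only, and the array type and placeholder values are made up.

// Models the masked merge: bit i of the mask picks byte i from ShiftedLo,
// otherwise from ShiftedHi. Not LLVM code; purely illustrative.
#include <array>
#include <cstdint>
#include <cstdio>

using V64i8 = std::array<uint8_t, 64>;

static V64i8 SelectByMask(uint64_t Mask, const V64i8 &OnTrue,
                          const V64i8 &OnFalse) {
  V64i8 Out{};
  for (unsigned i = 0; i < 64; ++i)
    Out[i] = ((Mask >> i) & 1) ? OnTrue[i] : OnFalse[i];
  return Out;
}

int main() {
  V64i8 ShiftedLo{}, ShiftedHi{};
  for (unsigned i = 0; i < 64; ++i) {
    ShiftedLo[i] = uint8_t(i);         // stands in for the even-lane results
    ShiftedHi[i] = uint8_t(0x80 + i);  // stands in for the odd-lane results
  }
  // 0x5555555555555555 has every even bit set, so even bytes come from
  // ShiftedLo and odd bytes come from ShiftedHi.
  V64i8 Merged = SelectByMask(0x5555555555555555ULL, ShiftedLo, ShiftedHi);
  printf("Merged[0]=%u Merged[1]=%u\n", unsigned(Merged[0]),
         unsigned(Merged[1]));  // prints 0 and 129
  return 0;
}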
>From 50abee1a8fbb1a2c29f266509b269934364644f7 Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling at gmail.com>
Date: Mon, 3 Nov 2025 16:18:34 +0100
Subject: [PATCH 3/4] Remove unnecessary return
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 03eee57701119..524b244faaf11 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31015,7 +31015,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
break;
default:
llvm_unreachable("Unexpected Shift Op");
- return SDValue();
}
SDValue ShiftedLo =
>From f20a14663f2b2fa2208796f012f535f93287d0d4 Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling at gmail.com>
Date: Tue, 4 Nov 2025 18:44:40 +0100
Subject: [PATCH 4/4] Remove unnecessary ShiftOp
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 524b244faaf11..7bf5940a3dd71 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30983,7 +30983,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
DAG.getConstant(0x00ff, dl, ExtVT));
SDValue AmtHi = getTargetVShiftByConstNode(
X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG);
- unsigned int ShiftOp;
switch (Opc) {
case ISD::SHL:
// Because we shift left, no bits from the high half can influence the low
@@ -30993,7 +30992,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
RLo = DAG.getBitcast(ExtVT, R);
RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo,
DAG.getConstant(0xff00, dl, ExtVT));
- ShiftOp = X86ISD::VSHLV;
break;
case ISD::SRL:
// Same idea as above, but this time we need to make sure no low bits of
@@ -31001,7 +30999,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
RHi = DAG.getBitcast(ExtVT, R);
RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi,
DAG.getConstant(0x00ff, dl, ExtVT));
- ShiftOp = X86ISD::VSRLV;
break;
case ISD::SRA:
// For arithmetic right shifts, we want to sign extend each even lane of R
@@ -31011,16 +31008,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
RHi = DAG.getBitcast(ExtVT, R);
RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG);
RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG);
- ShiftOp = X86ISD::VSRAV;
break;
default:
llvm_unreachable("Unexpected Shift Op");
}
SDValue ShiftedLo =
- DAG.getBitcast(VT, DAG.getNode(ShiftOp, dl, ExtVT, RLo, AmtLo));
+ DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo));
SDValue ShiftedHi =
- DAG.getBitcast(VT, DAG.getNode(ShiftOp, dl, ExtVT, RHi, AmtHi));
+ DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi));
// To merge the shifted vectors back together, we select even lanes
// from ShiftedLo and odd lanes from ShiftedHi.