[llvm] [GISel][AArch64] Added more efficient lowering of Bitreverse (PR #139233)
via llvm-commits
llvm-commits at lists.llvm.org
Mon May 12 06:21:32 PDT 2025
https://github.com/jyli0116 updated https://github.com/llvm/llvm-project/pull/139233
>From 84944245ea1cac8c65cc26ba71cc793a27760931 Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Thu, 8 May 2025 12:31:48 +0000
Subject: [PATCH 1/4] [GISel][AArch64] Added more efficient lowering for
bitreverse of various vector types
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 66 +----
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 5 +-
.../GlobalISel/legalize-bitreverse.mir | 150 ++---------
llvm/test/CodeGen/AArch64/bitreverse.ll | 253 +++++-------------
4 files changed, 100 insertions(+), 374 deletions(-)
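
For context, the first patch leans on a simple identity: reversing all the bits of a multi-byte lane is the same as byte-swapping the lane and then reversing the bits inside each byte, which is why the lowering becomes G_BSWAP, a bitcast to an i8 vector, and a byte-wise G_BITREVERSE (rev32/rev16/rev64 plus rbit on AArch64). A minimal scalar C++ sketch of that identity, written only for illustration (RevByte and BitReverse32 are illustrative names, not part of the patch):

#include <cstdint>
#include <cstdio>

// Reverse the bits inside one byte (models a single s8 lane of the
// byte-wise G_BITREVERSE / the AArch64 RBIT instruction).
static uint8_t RevByte(uint8_t B) {
  uint8_t R = 0;
  for (int I = 0; I < 8; ++I)
    R = (R << 1) | ((B >> I) & 1);
  return R;
}

// bitreverse(x) == per-byte-bitreverse(bswap(x)) for one 32-bit lane.
static uint32_t BitReverse32(uint32_t X) {
  uint32_t Swapped = __builtin_bswap32(X);        // models G_BSWAP
  uint32_t Result = 0;
  for (int I = 0; I < 4; ++I) {                   // models the s8-vector view
    uint8_t Byte = (Swapped >> (I * 8)) & 0xFF;
    Result |= uint32_t(RevByte(Byte)) << (I * 8); // models byte-wise G_BITREVERSE
  }
  return Result;
}

int main() {
  // 0x00000001 reversed bit-by-bit is 0x80000000.
  std::printf("%08x\n", BitReverse32(0x00000001u));
  return 0;
}
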
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index da0bcdc123e2a..a0ec0068c6275 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8971,67 +8971,25 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
return Legalized;
}
-//{ (Src & Mask) >> N } | { (Src << N) & Mask }
-static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
- MachineInstrBuilder Src, const APInt &Mask) {
- const LLT Ty = Dst.getLLTTy(*B.getMRI());
- MachineInstrBuilder C_N = B.buildConstant(Ty, N);
- MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
- auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
- auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
- return B.buildOr(Dst, LHS, RHS);
-}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
auto [Dst, Src] = MI.getFirst2Regs();
- const LLT Ty = MRI.getType(Src);
- unsigned Size = Ty.getScalarSizeInBits();
-
- if (Size >= 8) {
- MachineInstrBuilder BSWAP =
- MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
-
- // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
- // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
- // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
- MachineInstrBuilder Swap4 =
- SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
-
- // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
- // [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
- // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
- MachineInstrBuilder Swap2 =
- SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
-
- // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
- // 6|7
- // [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
- // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
- SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
- } else {
- // Expand bitreverse for types smaller than 8 bits.
- MachineInstrBuilder Tmp;
- for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
- MachineInstrBuilder Tmp2;
- if (I < J) {
- auto ShAmt = MIRBuilder.buildConstant(Ty, J - I);
- Tmp2 = MIRBuilder.buildShl(Ty, Src, ShAmt);
- } else {
- auto ShAmt = MIRBuilder.buildConstant(Ty, I - J);
- Tmp2 = MIRBuilder.buildLShr(Ty, Src, ShAmt);
- }
+ const LLT SrcTy = MRI.getType(Src);
+ unsigned Size = SrcTy.getScalarSizeInBits();
+ unsigned VSize = SrcTy.getSizeInBits();
- auto Mask = MIRBuilder.buildConstant(Ty, 1ULL << J);
- Tmp2 = MIRBuilder.buildAnd(Ty, Tmp2, Mask);
- if (I == 0)
- Tmp = Tmp2;
- else
- Tmp = MIRBuilder.buildOr(Ty, Tmp, Tmp2);
- }
- MIRBuilder.buildCopy(Dst, Tmp);
+ LLT VTy = VSize == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
+
+ if (!LI.isLegal({TargetOpcode::G_BSWAP, {SrcTy, SrcTy}})) {
+ return UnableToLegalize;
}
+ auto BSWAP = MIRBuilder.buildBSwap(SrcTy, Src);
+ auto Cast = MIRBuilder.buildBitcast(VTy , BSWAP);
+ auto RBIT = MIRBuilder.buildBitReverse(VTy, Cast);
+ MIRBuilder.buildBitcast(Dst, RBIT);
+
MI.eraseFromParent();
return Legalized;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 28957f2664282..7ff2e55e802c5 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -361,12 +361,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF).lower();
- // TODO: Custom lowering for v2s32, v4s32, v2s64.
getActionDefinitionsBuilder(G_BITREVERSE)
.legalFor({s32, s64, v8s8, v16s8})
.widenScalarToNextPow2(0, /*Min = */ 32)
+ .widenScalarOrEltToNextPow2OrMinSize(0, 8)
.clampScalar(0, s32, s64)
.clampNumElements(0, v8s8, v16s8)
+ .clampNumElements(0, v4s16, v8s16)
+ .clampNumElements(0, v2s32, v4s32)
+ .clampNumElements(0, v2s64, v2s64)
.moreElementsToNextPow2(0)
.lower();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitreverse.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitreverse.mir
index 607f213f67145..dfd4dd8a43544 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitreverse.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-bitreverse.mir
@@ -152,33 +152,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %vec:_(<4 x s16>) = COPY $d0
; CHECK-NEXT: [[BSWAP:%[0-9]+]]:_(<4 x s16>) = G_BSWAP %vec
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3856
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s16>) = G_AND [[BSWAP]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s16>) = G_LSHR [[AND]], [[BUILD_VECTOR]](<4 x s16>)
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<4 x s16>) = G_SHL [[BSWAP]], [[BUILD_VECTOR]](<4 x s16>)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s16>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s16>) = G_OR [[LSHR]], [[AND1]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -13108
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16)
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<4 x s16>) = G_AND [[OR]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<4 x s16>) = G_LSHR [[AND2]], [[BUILD_VECTOR2]](<4 x s16>)
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<4 x s16>) = G_SHL [[OR]], [[BUILD_VECTOR2]](<4 x s16>)
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<4 x s16>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<4 x s16>) = G_OR [[LSHR1]], [[AND3]]
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C4]](s16), [[C4]](s16), [[C4]](s16), [[C4]](s16)
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -21846
- ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C5]](s16), [[C5]](s16), [[C5]](s16), [[C5]](s16)
- ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(<4 x s16>) = G_AND [[OR1]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(<4 x s16>) = G_LSHR [[AND4]], [[BUILD_VECTOR4]](<4 x s16>)
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(<4 x s16>) = G_SHL [[OR1]], [[BUILD_VECTOR4]](<4 x s16>)
- ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(<4 x s16>) = G_AND [[SHL2]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: %bitreverse:_(<4 x s16>) = G_OR [[LSHR2]], [[AND5]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[BSWAP]](<4 x s16>)
+ ; CHECK-NEXT: [[BITREVERSE:%[0-9]+]]:_(<8 x s8>) = G_BITREVERSE [[BITCAST]]
+ ; CHECK-NEXT: %bitreverse:_(<4 x s16>) = G_BITCAST [[BITREVERSE]](<8 x s8>)
; CHECK-NEXT: $d0 = COPY %bitreverse(<4 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%vec:_(<4 x s16>) = COPY $d0
@@ -197,33 +173,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %vec:_(<2 x s32>) = COPY $d0
; CHECK-NEXT: [[BSWAP:%[0-9]+]]:_(<2 x s32>) = G_BSWAP %vec
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -252645136
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32)
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s32>) = G_AND [[BSWAP]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s32>) = G_LSHR [[AND]], [[BUILD_VECTOR]](<2 x s32>)
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s32>) = G_SHL [[BSWAP]], [[BUILD_VECTOR]](<2 x s32>)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s32>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s32>) = G_OR [[LSHR]], [[AND1]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C2]](s32), [[C2]](s32)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -858993460
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C3]](s32), [[C3]](s32)
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<2 x s32>) = G_AND [[OR]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s32>) = G_LSHR [[AND2]], [[BUILD_VECTOR2]](<2 x s32>)
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s32>) = G_SHL [[OR]], [[BUILD_VECTOR2]](<2 x s32>)
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<2 x s32>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<2 x s32>) = G_OR [[LSHR1]], [[AND3]]
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C4]](s32), [[C4]](s32)
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1431655766
- ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C5]](s32), [[C5]](s32)
- ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(<2 x s32>) = G_AND [[OR1]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s32>) = G_LSHR [[AND4]], [[BUILD_VECTOR4]](<2 x s32>)
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(<2 x s32>) = G_SHL [[OR1]], [[BUILD_VECTOR4]](<2 x s32>)
- ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(<2 x s32>) = G_AND [[SHL2]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: %bitreverse:_(<2 x s32>) = G_OR [[LSHR2]], [[AND5]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[BSWAP]](<2 x s32>)
+ ; CHECK-NEXT: [[BITREVERSE:%[0-9]+]]:_(<8 x s8>) = G_BITREVERSE [[BITCAST]]
+ ; CHECK-NEXT: %bitreverse:_(<2 x s32>) = G_BITCAST [[BITREVERSE]](<8 x s8>)
; CHECK-NEXT: $d0 = COPY %bitreverse(<2 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $d0
%vec:_(<2 x s32>) = COPY $d0
@@ -242,33 +194,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %vec:_(<2 x s64>) = COPY $q0
; CHECK-NEXT: [[BSWAP:%[0-9]+]]:_(<2 x s64>) = G_BSWAP %vec
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1085102592571150096
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[C1]](s64)
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<2 x s64>) = G_AND [[BSWAP]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<2 x s64>) = G_LSHR [[AND]], [[BUILD_VECTOR]](<2 x s64>)
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<2 x s64>) = G_SHL [[BSWAP]], [[BUILD_VECTOR]](<2 x s64>)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<2 x s64>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<2 x s64>) = G_OR [[LSHR]], [[AND1]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C2]](s64), [[C2]](s64)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -3689348814741910324
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C3]](s64), [[C3]](s64)
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<2 x s64>) = G_AND [[OR]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<2 x s64>) = G_LSHR [[AND2]], [[BUILD_VECTOR2]](<2 x s64>)
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<2 x s64>) = G_SHL [[OR]], [[BUILD_VECTOR2]](<2 x s64>)
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<2 x s64>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<2 x s64>) = G_OR [[LSHR1]], [[AND3]]
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C4]](s64), [[C4]](s64)
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 -6148914691236517206
- ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C5]](s64), [[C5]](s64)
- ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(<2 x s64>) = G_AND [[OR1]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(<2 x s64>) = G_LSHR [[AND4]], [[BUILD_VECTOR4]](<2 x s64>)
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(<2 x s64>) = G_SHL [[OR1]], [[BUILD_VECTOR4]](<2 x s64>)
- ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(<2 x s64>) = G_AND [[SHL2]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: %bitreverse:_(<2 x s64>) = G_OR [[LSHR2]], [[AND5]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[BSWAP]](<2 x s64>)
+ ; CHECK-NEXT: [[BITREVERSE:%[0-9]+]]:_(<16 x s8>) = G_BITREVERSE [[BITCAST]]
+ ; CHECK-NEXT: %bitreverse:_(<2 x s64>) = G_BITCAST [[BITREVERSE]](<16 x s8>)
; CHECK-NEXT: $q0 = COPY %bitreverse(<2 x s64>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%vec:_(<2 x s64>) = COPY $q0
@@ -287,33 +215,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %vec:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[BSWAP:%[0-9]+]]:_(<4 x s32>) = G_BSWAP %vec
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -252645136
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[BSWAP]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[AND]], [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<4 x s32>) = G_SHL [[BSWAP]], [[BUILD_VECTOR]](<4 x s32>)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<4 x s32>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<4 x s32>) = G_OR [[LSHR]], [[AND1]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C2]](s32), [[C2]](s32), [[C2]](s32), [[C2]](s32)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 -858993460
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C3]](s32), [[C3]](s32), [[C3]](s32), [[C3]](s32)
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<4 x s32>) = G_AND [[OR]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[AND2]], [[BUILD_VECTOR2]](<4 x s32>)
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<4 x s32>) = G_SHL [[OR]], [[BUILD_VECTOR2]](<4 x s32>)
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<4 x s32>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<4 x s32>) = G_OR [[LSHR1]], [[AND3]]
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C4]](s32), [[C4]](s32), [[C4]](s32), [[C4]](s32)
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1431655766
- ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C5]](s32), [[C5]](s32), [[C5]](s32), [[C5]](s32)
- ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(<4 x s32>) = G_AND [[OR1]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(<4 x s32>) = G_LSHR [[AND4]], [[BUILD_VECTOR4]](<4 x s32>)
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(<4 x s32>) = G_SHL [[OR1]], [[BUILD_VECTOR4]](<4 x s32>)
- ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(<4 x s32>) = G_AND [[SHL2]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: %bitreverse:_(<4 x s32>) = G_OR [[LSHR2]], [[AND5]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[BSWAP]](<4 x s32>)
+ ; CHECK-NEXT: [[BITREVERSE:%[0-9]+]]:_(<16 x s8>) = G_BITREVERSE [[BITCAST]]
+ ; CHECK-NEXT: %bitreverse:_(<4 x s32>) = G_BITCAST [[BITREVERSE]](<16 x s8>)
; CHECK-NEXT: $q0 = COPY %bitreverse(<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%vec:_(<4 x s32>) = COPY $q0
@@ -332,33 +236,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %vec:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[BSWAP:%[0-9]+]]:_(<8 x s16>) = G_BSWAP %vec
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -3856
- ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16), [[C1]](s16)
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<8 x s16>) = G_AND [[BSWAP]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[AND]], [[BUILD_VECTOR]](<8 x s16>)
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(<8 x s16>) = G_SHL [[BSWAP]], [[BUILD_VECTOR]](<8 x s16>)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(<8 x s16>) = G_AND [[SHL]], [[BUILD_VECTOR1]]
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(<8 x s16>) = G_OR [[LSHR]], [[AND1]]
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
- ; CHECK-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16), [[C2]](s16)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -13108
- ; CHECK-NEXT: [[BUILD_VECTOR3:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16), [[C3]](s16)
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(<8 x s16>) = G_AND [[OR]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[AND2]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(<8 x s16>) = G_SHL [[OR]], [[BUILD_VECTOR2]](<8 x s16>)
- ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(<8 x s16>) = G_AND [[SHL1]], [[BUILD_VECTOR3]]
- ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(<8 x s16>) = G_OR [[LSHR1]], [[AND3]]
- ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 1
- ; CHECK-NEXT: [[BUILD_VECTOR4:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C4]](s16), [[C4]](s16), [[C4]](s16), [[C4]](s16), [[C4]](s16), [[C4]](s16), [[C4]](s16), [[C4]](s16)
- ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 -21846
- ; CHECK-NEXT: [[BUILD_VECTOR5:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C5]](s16), [[C5]](s16), [[C5]](s16), [[C5]](s16), [[C5]](s16), [[C5]](s16), [[C5]](s16), [[C5]](s16)
- ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(<8 x s16>) = G_AND [[OR1]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(<8 x s16>) = G_LSHR [[AND4]], [[BUILD_VECTOR4]](<8 x s16>)
- ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(<8 x s16>) = G_SHL [[OR1]], [[BUILD_VECTOR4]](<8 x s16>)
- ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(<8 x s16>) = G_AND [[SHL2]], [[BUILD_VECTOR5]]
- ; CHECK-NEXT: %bitreverse:_(<8 x s16>) = G_OR [[LSHR2]], [[AND5]]
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[BSWAP]](<8 x s16>)
+ ; CHECK-NEXT: [[BITREVERSE:%[0-9]+]]:_(<16 x s8>) = G_BITREVERSE [[BITCAST]]
+ ; CHECK-NEXT: %bitreverse:_(<8 x s16>) = G_BITCAST [[BITREVERSE]](<16 x s8>)
; CHECK-NEXT: $q0 = COPY %bitreverse(<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%vec:_(<8 x s16>) = COPY $q0
diff --git a/llvm/test/CodeGen/AArch64/bitreverse.ll b/llvm/test/CodeGen/AArch64/bitreverse.ll
index 25fa3c6248ee3..8d9f6996d0b70 100644
--- a/llvm/test/CodeGen/AArch64/bitreverse.ll
+++ b/llvm/test/CodeGen/AArch64/bitreverse.ll
@@ -17,52 +17,10 @@ define <2 x i16> @f(<2 x i16> %a) {
; GISEL-LABEL: f:
; GISEL: // %bb.0:
; GISEL-NEXT: uzp1 v0.4h, v0.4h, v0.4h
-; GISEL-NEXT: mov w8, #61680 // =0xf0f0
-; GISEL-NEXT: dup v1.2s, w8
-; GISEL-NEXT: mov w8, #4 // =0x4
-; GISEL-NEXT: fmov s3, w8
; GISEL-NEXT: rev16 v0.8b, v0.8b
-; GISEL-NEXT: mov v3.h[1], w8
-; GISEL-NEXT: mov w8, #52428 // =0xcccc
-; GISEL-NEXT: ushll v2.4s, v0.4h, #0
-; GISEL-NEXT: neg v4.4h, v3.4h
-; GISEL-NEXT: and v2.8b, v2.8b, v1.8b
-; GISEL-NEXT: uzp1 v2.4h, v2.4h, v0.4h
-; GISEL-NEXT: ushl v0.4h, v0.4h, v3.4h
-; GISEL-NEXT: ushll v0.4s, v0.4h, #0
-; GISEL-NEXT: ushl v2.4h, v2.4h, v4.4h
-; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: ushll v1.4s, v2.4h, #0
-; GISEL-NEXT: dup v2.2s, w8
-; GISEL-NEXT: mov w8, #2 // =0x2
-; GISEL-NEXT: orr v0.8b, v1.8b, v0.8b
-; GISEL-NEXT: fmov s1, w8
-; GISEL-NEXT: and v3.8b, v0.8b, v2.8b
-; GISEL-NEXT: uzp1 v0.4h, v0.4h, v0.4h
-; GISEL-NEXT: mov v1.h[1], w8
-; GISEL-NEXT: mov w8, #43690 // =0xaaaa
-; GISEL-NEXT: uzp1 v3.4h, v3.4h, v0.4h
-; GISEL-NEXT: neg v4.4h, v1.4h
-; GISEL-NEXT: ushl v0.4h, v0.4h, v1.4h
-; GISEL-NEXT: ushll v0.4s, v0.4h, #0
-; GISEL-NEXT: ushl v1.4h, v3.4h, v4.4h
-; GISEL-NEXT: and v0.8b, v0.8b, v2.8b
-; GISEL-NEXT: dup v2.2s, w8
-; GISEL-NEXT: mov w8, #1 // =0x1
-; GISEL-NEXT: ushll v1.4s, v1.4h, #0
-; GISEL-NEXT: orr v0.8b, v1.8b, v0.8b
-; GISEL-NEXT: fmov s1, w8
-; GISEL-NEXT: and v3.8b, v0.8b, v2.8b
-; GISEL-NEXT: uzp1 v0.4h, v0.4h, v0.4h
-; GISEL-NEXT: mov v1.h[1], w8
-; GISEL-NEXT: uzp1 v3.4h, v3.4h, v0.4h
-; GISEL-NEXT: neg v4.4h, v1.4h
-; GISEL-NEXT: ushl v0.4h, v0.4h, v1.4h
+; GISEL-NEXT: rbit v0.8b, v0.8b
; GISEL-NEXT: ushll v0.4s, v0.4h, #0
-; GISEL-NEXT: ushl v1.4h, v3.4h, v4.4h
-; GISEL-NEXT: and v0.8b, v0.8b, v2.8b
-; GISEL-NEXT: ushll v1.4s, v1.4h, #0
-; GISEL-NEXT: orr v0.8b, v1.8b, v0.8b
+; GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0
; GISEL-NEXT: ret
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
ret <2 x i16> %b
@@ -183,34 +141,11 @@ define <9 x i8> @g_vec_9x8(<9 x i8> %a) {
declare <4 x i16> @llvm.bitreverse.v4i16(<4 x i16>) readnone
define <4 x i16> @g_vec_4x16(<4 x i16> %a) {
-; SDAG-LABEL: g_vec_4x16:
-; SDAG: // %bb.0:
-; SDAG-NEXT: rev16 v0.8b, v0.8b
-; SDAG-NEXT: rbit v0.8b, v0.8b
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: g_vec_4x16:
-; GISEL: // %bb.0:
-; GISEL-NEXT: movi v1.8b, #240
-; GISEL-NEXT: rev16 v0.8b, v0.8b
-; GISEL-NEXT: and v2.8b, v0.8b, v1.8b
-; GISEL-NEXT: shl v0.4h, v0.4h, #4
-; GISEL-NEXT: ushr v2.4h, v2.4h, #4
-; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: movi v1.8b, #204
-; GISEL-NEXT: orr v0.8b, v2.8b, v0.8b
-; GISEL-NEXT: and v2.8b, v0.8b, v1.8b
-; GISEL-NEXT: shl v0.4h, v0.4h, #2
-; GISEL-NEXT: ushr v2.4h, v2.4h, #2
-; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: movi v1.8b, #170
-; GISEL-NEXT: orr v0.8b, v2.8b, v0.8b
-; GISEL-NEXT: and v2.8b, v0.8b, v1.8b
-; GISEL-NEXT: shl v0.4h, v0.4h, #1
-; GISEL-NEXT: ushr v2.4h, v2.4h, #1
-; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: orr v0.8b, v2.8b, v0.8b
-; GISEL-NEXT: ret
+; CHECK-LABEL: g_vec_4x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: ret
%b = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %a)
ret <4 x i16> %b
}
@@ -218,69 +153,37 @@ define <4 x i16> @g_vec_4x16(<4 x i16> %a) {
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
define <8 x i16> @g_vec_8x16(<8 x i16> %a) {
-; SDAG-LABEL: g_vec_8x16:
-; SDAG: // %bb.0:
-; SDAG-NEXT: rev16 v0.16b, v0.16b
-; SDAG-NEXT: rbit v0.16b, v0.16b
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: g_vec_8x16:
-; GISEL: // %bb.0:
-; GISEL-NEXT: movi v1.16b, #240
-; GISEL-NEXT: rev16 v0.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.8h, v0.8h, #4
-; GISEL-NEXT: ushr v2.8h, v2.8h, #4
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: movi v1.16b, #204
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.8h, v0.8h, #2
-; GISEL-NEXT: ushr v2.8h, v2.8h, #2
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: movi v1.16b, #170
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.8h, v0.8h, #1
-; GISEL-NEXT: ushr v2.8h, v2.8h, #1
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: ret
+; CHECK-LABEL: g_vec_8x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ret
%b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
ret <8 x i16> %b
}
+declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
+
+define <16 x i16> @g_vec_16x16(<16 x i16> %a) {
+; CHECK-LABEL: g_vec_16x16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: rev16 v1.16b, v1.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: rbit v1.16b, v1.16b
+; CHECK-NEXT: ret
+ %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+ ret <16 x i16> %b
+}
+
declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) readnone
define <2 x i32> @g_vec_2x32(<2 x i32> %a) {
-; SDAG-LABEL: g_vec_2x32:
-; SDAG: // %bb.0:
-; SDAG-NEXT: rev32 v0.8b, v0.8b
-; SDAG-NEXT: rbit v0.8b, v0.8b
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: g_vec_2x32:
-; GISEL: // %bb.0:
-; GISEL-NEXT: movi v1.8b, #240
-; GISEL-NEXT: rev32 v0.8b, v0.8b
-; GISEL-NEXT: and v2.8b, v0.8b, v1.8b
-; GISEL-NEXT: shl v0.2s, v0.2s, #4
-; GISEL-NEXT: ushr v2.2s, v2.2s, #4
-; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: movi v1.8b, #204
-; GISEL-NEXT: orr v0.8b, v2.8b, v0.8b
-; GISEL-NEXT: and v2.8b, v0.8b, v1.8b
-; GISEL-NEXT: shl v0.2s, v0.2s, #2
-; GISEL-NEXT: ushr v2.2s, v2.2s, #2
-; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: movi v1.8b, #170
-; GISEL-NEXT: orr v0.8b, v2.8b, v0.8b
-; GISEL-NEXT: and v2.8b, v0.8b, v1.8b
-; GISEL-NEXT: shl v0.2s, v0.2s, #1
-; GISEL-NEXT: ushr v2.2s, v2.2s, #1
-; GISEL-NEXT: and v0.8b, v0.8b, v1.8b
-; GISEL-NEXT: orr v0.8b, v2.8b, v0.8b
-; GISEL-NEXT: ret
+; CHECK-LABEL: g_vec_2x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.8b, v0.8b
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: ret
%b = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a)
ret <2 x i32> %b
}
@@ -288,38 +191,29 @@ define <2 x i32> @g_vec_2x32(<2 x i32> %a) {
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
define <4 x i32> @g_vec_4x32(<4 x i32> %a) {
-; SDAG-LABEL: g_vec_4x32:
-; SDAG: // %bb.0:
-; SDAG-NEXT: rev32 v0.16b, v0.16b
-; SDAG-NEXT: rbit v0.16b, v0.16b
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: g_vec_4x32:
-; GISEL: // %bb.0:
-; GISEL-NEXT: movi v1.16b, #240
-; GISEL-NEXT: rev32 v0.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.4s, v0.4s, #4
-; GISEL-NEXT: ushr v2.4s, v2.4s, #4
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: movi v1.16b, #204
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.4s, v0.4s, #2
-; GISEL-NEXT: ushr v2.4s, v2.4s, #2
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: movi v1.16b, #170
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.4s, v0.4s, #1
-; GISEL-NEXT: ushr v2.4s, v2.4s, #1
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: ret
+; CHECK-LABEL: g_vec_4x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.16b, v0.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ret
%b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
ret <4 x i32> %b
}
+declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
+
+define <8 x i32> @g_vec_8x32(<8 x i32> %a) {
+; CHECK-LABEL: g_vec_8x32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev32 v0.16b, v0.16b
+; CHECK-NEXT: rev32 v1.16b, v1.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: rbit v1.16b, v1.16b
+; CHECK-NEXT: ret
+ %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+ ret <8 x i32> %b
+}
+
declare <1 x i64> @llvm.bitreverse.v1i64(<1 x i64>) readnone
define <1 x i64> @g_vec_1x64(<1 x i64> %a) {
@@ -342,34 +236,25 @@ define <1 x i64> @g_vec_1x64(<1 x i64> %a) {
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
define <2 x i64> @g_vec_2x64(<2 x i64> %a) {
-; SDAG-LABEL: g_vec_2x64:
-; SDAG: // %bb.0:
-; SDAG-NEXT: rev64 v0.16b, v0.16b
-; SDAG-NEXT: rbit v0.16b, v0.16b
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: g_vec_2x64:
-; GISEL: // %bb.0:
-; GISEL-NEXT: movi v1.16b, #240
-; GISEL-NEXT: rev64 v0.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.2d, v0.2d, #4
-; GISEL-NEXT: ushr v2.2d, v2.2d, #4
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: movi v1.16b, #204
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.2d, v0.2d, #2
-; GISEL-NEXT: ushr v2.2d, v2.2d, #2
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: movi v1.16b, #170
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: and v2.16b, v0.16b, v1.16b
-; GISEL-NEXT: shl v0.2d, v0.2d, #1
-; GISEL-NEXT: ushr v2.2d, v2.2d, #1
-; GISEL-NEXT: and v0.16b, v0.16b, v1.16b
-; GISEL-NEXT: orr v0.16b, v2.16b, v0.16b
-; GISEL-NEXT: ret
+; CHECK-LABEL: g_vec_2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.16b, v0.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ret
%b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
ret <2 x i64> %b
}
+
+declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone
+
+define <4 x i64> @g_vec_4x64(<4 x i64> %a) {
+; CHECK-LABEL: g_vec_4x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rev64 v0.16b, v0.16b
+; CHECK-NEXT: rev64 v1.16b, v1.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: rbit v1.16b, v1.16b
+; CHECK-NEXT: ret
+ %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+ ret <4 x i64> %b
+}
>From c1ce7b7af992dd9329c23496db36888d4a0f447c Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Mon, 12 May 2025 08:59:36 +0000
Subject: [PATCH 2/4] [GISel][AArch64] Handles bitreverse as i8 vector type if
legal
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 71 ++++++++++++++++---
1 file changed, 63 insertions(+), 8 deletions(-)
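
This second patch keeps the byte-vector path behind a legality check and restores the generic shift-and-mask expansion as the fallback. That fallback performs the classic three-stage reversal after a byte swap; a scalar C++ sketch of what the sequence evaluates to (SwapN below mirrors the helper in the diff, but the function itself is illustrative, not the patch):

#include <cstdint>
#include <cstdio>

// { (Src & Mask) >> N } | { (Src << N) & Mask }, mirroring the SwapN helper
// that this patch moves back into lowerBitreverse.
static uint32_t SwapN(unsigned N, uint32_t Src, uint32_t Mask) {
  return ((Src & Mask) >> N) | ((Src << N) & Mask);
}

// Scalar model of the restored fallback expansion for a 32-bit value.
static uint32_t BitReverseFallback32(uint32_t Val) {
  uint32_t X = __builtin_bswap32(Val); // G_BSWAP
  X = SwapN(4, X, 0xF0F0F0F0u);        // swap high/low nibbles in each byte
  X = SwapN(2, X, 0xCCCCCCCCu);        // swap 2-bit pairs inside each nibble
  return SwapN(1, X, 0xAAAAAAAAu);     // swap adjacent bits
}

int main() {
  // 0x00000001 reversed bit-by-bit is 0x80000000.
  std::printf("%08x\n", BitReverseFallback32(0x00000001u));
  return 0;
}
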
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index a0ec0068c6275..180db09bb643c 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8971,6 +8971,16 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
return Legalized;
}
+//{ (Src & Mask) >> N } | { (Src << N) & Mask }
+static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
+ MachineInstrBuilder Src, const APInt &Mask) {
+ const LLT Ty = Dst.getLLTTy(*B.getMRI());
+ MachineInstrBuilder C_N = B.buildConstant(Ty, N);
+ MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
+ auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
+ auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
+ return B.buildOr(Dst, LHS, RHS);
+}
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
@@ -8979,17 +8989,62 @@ LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
unsigned Size = SrcTy.getScalarSizeInBits();
unsigned VSize = SrcTy.getSizeInBits();
- LLT VTy = VSize == 128 ? LLT::fixed_vector(16, 8) : LLT::fixed_vector(8, 8);
+ if (Size >= 8) {
+ LLT VTy = LLT::fixed_vector(VSize / 8, 8);
+
+ if (LI.isLegal({TargetOpcode::G_BITREVERSE, {VTy, VTy}})) {
+ // If bitreverse is legal for i8 vector of the same size, then handle
+ // with bswap and cast to i8 vector types.
+ // e.g. v4s32 -> v16s8
+ auto BSWAP = MIRBuilder.buildBSwap(SrcTy, Src);
+ auto Cast = MIRBuilder.buildBitcast(VTy, BSWAP);
+ auto RBIT = MIRBuilder.buildBitReverse(VTy, Cast);
+ MIRBuilder.buildBitcast(Dst, RBIT);
+ } else {
+ MachineInstrBuilder BSWAP =
+ MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {SrcTy}, {Src});
+
+ // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
+ // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
+ // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
+ MachineInstrBuilder Swap4 = SwapN(4, SrcTy, MIRBuilder, BSWAP,
+ APInt::getSplat(Size, APInt(8, 0xF0)));
+
+ // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
+ // [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
+ // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
+ MachineInstrBuilder Swap2 = SwapN(2, SrcTy, MIRBuilder, Swap4,
+ APInt::getSplat(Size, APInt(8, 0xCC)));
+
+ // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
+ // 6|7
+ // [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
+ // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
+ SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
+ }
+ } else {
+ // Expand bitreverse for types smaller than 8 bits.
+ MachineInstrBuilder Tmp;
+ for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
+ MachineInstrBuilder Tmp2;
+ if (I < J) {
+ auto ShAmt = MIRBuilder.buildConstant(SrcTy, J - I);
+ Tmp2 = MIRBuilder.buildShl(SrcTy, Src, ShAmt);
+ } else {
+ auto ShAmt = MIRBuilder.buildConstant(SrcTy, I - J);
+ Tmp2 = MIRBuilder.buildLShr(SrcTy, Src, ShAmt);
+ }
- if (!LI.isLegal({TargetOpcode::G_BSWAP, {SrcTy, SrcTy}})) {
- return UnableToLegalize;
+ auto Mask = MIRBuilder.buildConstant(SrcTy, 1ULL << J);
+ Tmp2 = MIRBuilder.buildAnd(SrcTy, Tmp2, Mask);
+ if (I == 0)
+ Tmp = Tmp2;
+ else
+ Tmp = MIRBuilder.buildOr(SrcTy, Tmp, Tmp2);
+ }
+ MIRBuilder.buildCopy(Dst, Tmp);
}
- auto BSWAP = MIRBuilder.buildBSwap(SrcTy, Src);
- auto Cast = MIRBuilder.buildBitcast(VTy , BSWAP);
- auto RBIT = MIRBuilder.buildBitReverse(VTy, Cast);
- MIRBuilder.buildBitcast(Dst, RBIT);
-
MI.eraseFromParent();
return Legalized;
}
>From c1035dccd00eff41a34ec616c60f22d0f4bc6ca5 Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Mon, 12 May 2025 09:33:26 +0000
Subject: [PATCH 3/4] [GISel][AArch64] Add unit tests for types smaller than i8
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 4 ++--
llvm/test/CodeGen/AArch64/bitreverse.ll | 24 +++++++++++++++++++
2 files changed, 26 insertions(+), 2 deletions(-)
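
The new tests cover vectors with sub-byte elements such as <16 x i3> and <16 x i4>, which are promoted to <16 x s8> before lowering: rbit reverses each full 8-bit lane and the ushr by 8 minus the element width moves the reversed bits back into the low positions. A hedged per-lane C++ model of that expectation (illustrative only; it assumes the i3 value sits in the low bits of the promoted lane):

#include <cstdint>
#include <cstdio>

// An i3 value promoted into the low bits of an s8 lane is bit-reversed as a
// full byte (RBIT), then shifted right by 8 - 3 = 5 (USHR #5) so the three
// reversed bits land back in bits 2:0.
static uint8_t BitReverseI3InByte(uint8_t Lane) {
  uint8_t Rev = 0;
  for (int I = 0; I < 8; ++I)      // models RBIT on the whole 8-bit lane
    Rev = (Rev << 1) | ((Lane >> I) & 1);
  return Rev >> 5;                 // models USHR #5
}

int main() {
  // i3 value 0b001 reversed is 0b100.
  std::printf("%u\n", BitReverseI3InByte(0b001));
  return 0;
}
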
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 180db09bb643c..9a16319b48b2e 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8993,8 +8993,8 @@ LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
LLT VTy = LLT::fixed_vector(VSize / 8, 8);
if (LI.isLegal({TargetOpcode::G_BITREVERSE, {VTy, VTy}})) {
- // If bitreverse is legal for i8 vector of the same size, then handle
- // with bswap and cast to i8 vector types.
+ // If bitreverse is legal for i8 vector of the same size, then cast
+ // to i8 vector type.
// e.g. v4s32 -> v16s8
auto BSWAP = MIRBuilder.buildBSwap(SrcTy, Src);
auto Cast = MIRBuilder.buildBitcast(VTy, BSWAP);
diff --git a/llvm/test/CodeGen/AArch64/bitreverse.ll b/llvm/test/CodeGen/AArch64/bitreverse.ll
index 8d9f6996d0b70..c1ca39d87105e 100644
--- a/llvm/test/CodeGen/AArch64/bitreverse.ll
+++ b/llvm/test/CodeGen/AArch64/bitreverse.ll
@@ -72,6 +72,30 @@ define i64 @g_64(i64 %a) {
ret i64 %b
}
+declare <16 x i3> @llvm.bitreverse.v16i3(<16 x i3>) readnone
+
+define <16 x i3> @g_vec_16x3(<16 x i3> %a) {
+; CHECK-LABEL: g_vec_16x3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ushr v0.16b, v0.16b, #5
+; CHECK-NEXT: ret
+ %b = call <16 x i3> @llvm.bitreverse.v16i3(<16 x i3> %a)
+ ret <16 x i3> %b
+}
+
+declare <16 x i4> @llvm.bitreverse.v16i4(<16 x i4>) readnone
+
+define <16 x i4> @g_vec_16x4(<16 x i4> %a) {
+; CHECK-LABEL: g_vec_16x4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: ushr v0.16b, v0.16b, #4
+; CHECK-NEXT: ret
+ %b = call <16 x i4> @llvm.bitreverse.v16i4(<16 x i4> %a)
+ ret <16 x i4> %b
+}
+
declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>) readnone
define <8 x i8> @g_vec(<8 x i8> %a) {
>From 07e1d8b4cbbdfb51c6d547bb607056e8ff8b141f Mon Sep 17 00:00:00 2001
From: Yu Li <yu.li at arm.com>
Date: Mon, 12 May 2025 13:02:29 +0000
Subject: [PATCH 4/4] [AArch64][GIsel] Added vector check on bitreverse
lowering
---
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
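
The final patch tightens the condition for taking the byte-vector path: the source must be a vector whose total width is a whole number of bytes, so that an equivalent <VSize/8 x s8> type exists, and the target must declare G_BITREVERSE legal on that type; everything else falls through to the shift-and-mask expansion. A small standalone C++ sketch of just the size/vector part of the guard (the legality query additionally needs the target's LegalizerInfo, which this toy model omits):

#include <cstdio>

// A same-width <VSize/8 x s8> type only exists when the source is a vector
// and its total width is a multiple of 8 bits.
static bool TakesByteVectorPath(bool IsVector, unsigned VSizeInBits) {
  return IsVector && (VSizeInBits % 8 == 0);
}

int main() {
  std::printf("s32       -> %d\n", TakesByteVectorPath(false, 32));  // 0: scalar
  std::printf("<4 x s32> -> %d\n", TakesByteVectorPath(true, 128));  // 1
  std::printf("<3 x s3>  -> %d\n", TakesByteVectorPath(true, 9));    // 0: 9 bits
  return 0;
}
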
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 9a16319b48b2e..72f2ba75c927e 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8990,12 +8990,14 @@ LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
unsigned VSize = SrcTy.getSizeInBits();
if (Size >= 8) {
- LLT VTy = LLT::fixed_vector(VSize / 8, 8);
-
- if (LI.isLegal({TargetOpcode::G_BITREVERSE, {VTy, VTy}})) {
+ if (SrcTy.isVector() && (VSize % 8 == 0) &&
+ (LI.isLegal({TargetOpcode::G_BITREVERSE,
+ {LLT::fixed_vector(VSize / 8, 8),
+ LLT::fixed_vector(VSize / 8, 8)}}))) {
// If bitreverse is legal for i8 vector of the same size, then cast
// to i8 vector type.
// e.g. v4s32 -> v16s8
+ LLT VTy = LLT::fixed_vector(VSize / 8, 8);
auto BSWAP = MIRBuilder.buildBSwap(SrcTy, Src);
auto Cast = MIRBuilder.buildBitcast(VTy, BSWAP);
auto RBIT = MIRBuilder.buildBitReverse(VTy, Cast);