[llvm] 0b38af8 - [AArch64] match splat of bitcasted extract subvector to DUPLANE
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 22 06:02:04 PST 2019
Author: Sanjay Patel
Date: 2019-12-22T08:37:03-05:00
New Revision: 0b38af89e2c0adc13a6efb1dd04485229ef0d1c6
URL: https://github.com/llvm/llvm-project/commit/0b38af89e2c0adc13a6efb1dd04485229ef0d1c6
DIFF: https://github.com/llvm/llvm-project/commit/0b38af89e2c0adc13a6efb1dd04485229ef0d1c6.diff
LOG: [AArch64] match splat of bitcasted extract subvector to DUPLANE
This is another potential regression exposed by D63815.
Here we peek through a bitcast to find an extract subvector and
scale the splat offset based on that:
splat (bitcast (extract X, C)), LaneC --> duplane (bitcast X), LaneC'
Differential Revision: https://reviews.llvm.org/D71672
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a3dd2e65a121..d01979d8be8c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7086,19 +7086,55 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// Otherwise, duplicate from the lane of the input vector.
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
- // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
- // to make a vector of the same size as this SHUFFLE. We can ignore the
- // extract entirely, and canonicalise the concat using WidenVector.
- if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+ // Try to eliminate a bitcasted extract subvector before a DUPLANE.
+ auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
+ // Match: dup (bitcast (extract_subv X, C)), LaneC
+ if (BitCast.getOpcode() != ISD::BITCAST ||
+ BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+
+ // The extract index must align in the destination type. That may not
+ // happen if the bitcast is from narrow to wide type.
+ SDValue Extract = BitCast.getOperand(0);
+ unsigned ExtIdx = Extract.getConstantOperandVal(1);
+ unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
+ unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
+ unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
+ if (ExtIdxInBits % CastedEltBitWidth != 0)
+ return false;
+
+ // Update the lane value by offsetting with the scaled extract index.
+ LaneC += ExtIdxInBits / CastedEltBitWidth;
+
+ // Determine the casted vector type of the wide vector input.
+ // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
+ // Examples:
+ // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
+ // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
+ unsigned SrcVecNumElts =
+ Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
+ CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
+ SrcVecNumElts);
+ return true;
+ };
+ MVT CastVT;
+ if (getScaledOffsetDup(V1, Lane, CastVT)) {
+ V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
+ } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ // The lane is incremented by the index of the extract.
+ // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
+ Lane += V1.getConstantOperandVal(1);
V1 = V1.getOperand(0);
} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+ // The lane is decremented if we are splatting from the 2nd operand.
+ // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
Lane -= Idx * VT.getVectorNumElements() / 2;
V1 = WidenVector(V1.getOperand(Idx), DAG);
- } else if (VT.getSizeInBits() == 64)
+ } else if (VT.getSizeInBits() == 64) {
+ // Widen the operand to 128-bit register with undef.
V1 = WidenVector(V1, DAG);
-
+ }
return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index 47d82a174853..26826789107e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1663,8 +1663,7 @@ entry:
define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmul_laneq3_f32_bitcast:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[1]
+; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[3]
; CHECK-NEXT: ret
%extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
%bc = bitcast <1 x double> %extract to <2 x float>
@@ -1676,8 +1675,7 @@ define <2 x float> @test_vmul_laneq3_f32_bitcast(<2 x float> %a, <2 x double> %v
define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmul_laneq2_f32_bitcast:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[0]
+; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[2]
; CHECK-NEXT: ret
%extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
%bc = bitcast <1 x double> %extract to <2 x float>
@@ -1689,8 +1687,7 @@ define <2 x float> @test_vmul_laneq2_f32_bitcast(<2 x float> %a, <2 x double> %v
define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
; CHECK-LABEL: test_vadd_laneq5_i16_bitcast:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: dup v1.4h, v1.h[1]
+; CHECK-NEXT: dup v1.4h, v1.h[5]
; CHECK-NEXT: add v0.4h, v1.4h, v0.4h
; CHECK-NEXT: ret
%extract = shufflevector <2 x double> %v, <2 x double> undef, <1 x i32> <i32 1>
@@ -1700,6 +1697,8 @@ define <4 x i16> @test_vadd_laneq5_i16_bitcast(<4 x i16> %a, <2 x double> %v) {
ret <4 x i16> %r
}
+; TODO: The pattern in LowerVECTOR_SHUFFLE does not match what we are looking for.
+
define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
; CHECK-LABEL: test_vadd_lane2_i16_bitcast_bigger_aligned:
; CHECK: // %bb.0:
@@ -1717,8 +1716,7 @@ define <4 x i16> @test_vadd_lane2_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x
define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x i8> %v) {
; CHECK-LABEL: test_vadd_lane5_i16_bitcast_bigger_aligned:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: dup v1.4h, v1.h[1]
+; CHECK-NEXT: dup v1.4h, v1.h[5]
; CHECK-NEXT: add v0.4h, v1.4h, v0.4h
; CHECK-NEXT: ret
%extract = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1728,6 +1726,8 @@ define <4 x i16> @test_vadd_lane5_i16_bitcast_bigger_aligned(<4 x i16> %a, <16 x
ret <4 x i16> %r
}
+; Negative test - can't dup bytes {3,4} of v8i16.
+
define <4 x i16> @test_vadd_lane_i16_bitcast_bigger_unaligned(<4 x i16> %a, <16 x i8> %v) {
; CHECK-LABEL: test_vadd_lane_i16_bitcast_bigger_unaligned:
; CHECK: // %bb.0:
More information about the llvm-commits
mailing list