[llvm] 635164b - [AArch64][SVE] Improve SVE codegen for fixed length BITCAST

Bradley Smith via llvm-commits llvm-commits at lists.llvm.org
Mon May 10 06:52:39 PDT 2021


Author: Bradley Smith
Date: 2021-05-10T14:43:53+01:00
New Revision: 635164b95a8e5d8c0c40804c7ffa3981a82b917a

URL: https://github.com/llvm/llvm-project/commit/635164b95a8e5d8c0c40804c7ffa3981a82b917a
DIFF: https://github.com/llvm/llvm-project/commit/635164b95a8e5d8c0c40804c7ffa3981a82b917a.diff

LOG: [AArch64][SVE] Improve SVE codegen for fixed length BITCAST

Expanding a fixed length operation involves wrapping the operation in an
insert/extract subvector pair. When this is done for a BITCAST we end up
with an extract_subvector of a bitcast, which DAGCombine tries to turn
back into a bitcast of an extract_subvector; this restores the initial
fixed length bitcast and causes an infinite legalization loop.

As part of this patch, we must make sure the above DAGCombine does not
trigger after legalization if the created bitcast would not be legal.
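
For illustration, this is the kind of fixed length bitcast the change
targets (copied from the new test added below); with
-aarch64-sve-vector-bits-min=256 it is now lowered via an SVE container
type, giving the ptrue/ld1h/st1h sequence checked in the test:

  define void @bitcast_v16i16(<16 x i16>* %a, <16 x half>* %b) #0 {
    %load = load volatile <16 x i16>, <16 x i16>* %a
    %cast = bitcast <16 x i16> %load to <16 x half>
    store volatile <16 x half> %cast, <16 x half>* %b
    ret void
  }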

Differential Revision: https://reviews.llvm.org/D101990

Added: 
    llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 83a417aa148c..26bbeb2bbb9c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20214,7 +20214,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
   // Try to move vector bitcast after extract_subv by scaling extraction index:
   // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
   if (V.getOpcode() == ISD::BITCAST &&
-      V.getOperand(0).getValueType().isVector()) {
+      V.getOperand(0).getValueType().isVector() &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT))) {
     SDValue SrcOp = V.getOperand(0);
     EVT SrcVT = SrcOp.getValueType();
     unsigned SrcNumElts = SrcVT.getVectorMinNumElements();

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 36d46dd3706a..0c7255f17d81 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1444,6 +1444,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   setOperationAction(ISD::ADD, VT, Custom);
   setOperationAction(ISD::AND, VT, Custom);
   setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+  setOperationAction(ISD::BITCAST, VT, Custom);
   setOperationAction(ISD::BITREVERSE, VT, Custom);
   setOperationAction(ISD::BSWAP, VT, Custom);
   setOperationAction(ISD::CTLZ, VT, Custom);
@@ -3400,8 +3401,13 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
   return CallResult.first;
 }
 
-static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
+SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
+                                            SelectionDAG &DAG) const {
   EVT OpVT = Op.getValueType();
+
+  if (useSVEForFixedLengthVectorVT(OpVT))
+    return LowerFixedLengthBitcastToSVE(Op, DAG);
+
   if (OpVT != MVT::f16 && OpVT != MVT::bf16)
     return SDValue();
 
@@ -17645,6 +17651,21 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
   return convertFromScalableVector(DAG, Op.getValueType(), Promote);
 }
 
+SDValue
+AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  auto SrcOp = Op.getOperand(0);
+  EVT VT = Op.getValueType();
+  EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
+  EVT ContainerSrcVT =
+      getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
+
+  SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
+  Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
+  return convertFromScalableVector(DAG, VT, Op);
+}
+
 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
                                                  SelectionDAG &DAG) const {
   SDLoc DL(Op);

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 90f77a7c276a..e6c60d817efc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -963,6 +963,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
@@ -991,6 +992,7 @@ class AArch64TargetLowering : public TargetLowering {
                                               SelectionDAG &DAG) const;
   SDValue LowerFixedLengthExtractVectorElt(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthInsertVectorElt(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFixedLengthBitcastToSVE(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
new file mode 100644
index 000000000000..eb4186cd6aef
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitcast.ll
@@ -0,0 +1,239 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: ptrue
+
+; Don't use SVE for 64-bit vectors.
+define void @bitcast_v4i16(<4 x i16> *%a, <4 x half>* %b) #0 {
+; CHECK-LABEL: bitcast_v4i16:
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <4 x i16>, <4 x i16>* %a
+  %cast = bitcast <4 x i16> %load to <4 x half>
+  store volatile <4 x half> %cast, <4 x half>* %b
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @bitcast_v8i16(<8 x i16> *%a, <8 x half>* %b) #0 {
+; CHECK-LABEL: bitcast_v8i16:
+; CHECK: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <8 x i16>, <8 x i16>* %a
+  %cast = bitcast <8 x i16> %load to <8 x half>
+  store volatile <8 x half> %cast, <8 x half>* %b
+  ret void
+}
+
+define void @bitcast_v16i16(<16 x i16> *%a, <16 x half>* %b) #0 {
+; CHECK-LABEL: bitcast_v16i16:
+; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
+; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
+; CHECK-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <16 x i16>, <16 x i16>* %a
+  %cast = bitcast <16 x i16> %load to <16 x half>
+  store volatile <16 x half> %cast, <16 x half>* %b
+  ret void
+}
+
+define void @bitcast_v32i16(<32 x i16> *%a, <32 x half>* %b) #0 {
+; CHECK-LABEL: bitcast_v32i16:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
+; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_GE_512-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
+; VBITS_GE_512-NEXT: ret
+  %load = load volatile <32 x i16>, <32 x i16>* %a
+  %cast = bitcast <32 x i16> %load to <32 x half>
+  store volatile <32 x half> %cast, <32 x half>* %b
+  ret void
+}
+
+define void @bitcast_v64i16(<64 x i16> *%a, <64 x half>* %b) #0 {
+; CHECK-LABEL: bitcast_v64i16:
+; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
+; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_GE_1024-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
+; VBITS_GE_1024-NEXT: ret
+  %load = load volatile <64 x i16>, <64 x i16>* %a
+  %cast = bitcast <64 x i16> %load to <64 x half>
+  store volatile <64 x half> %cast, <64 x half>* %b
+  ret void
+}
+
+define void @bitcast_v128i16(<128 x i16> *%a, <128 x half>* %b) #0 {
+; CHECK-LABEL: bitcast_v128i16:
+; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
+; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
+; VBITS_GE_2048-NEXT: st1h { [[OP]].h }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %load = load volatile <128 x i16>, <128 x i16>* %a
+  %cast = bitcast <128 x i16> %load to <128 x half>
+  store volatile <128 x half> %cast, <128 x half>* %b
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define void @bitcast_v2i32(<2 x i32> *%a, <2 x float>* %b) #0 {
+; CHECK-LABEL: bitcast_v2i32:
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <2 x i32>, <2 x i32>* %a
+  %cast = bitcast <2 x i32> %load to <2 x float>
+  store volatile <2 x float> %cast, <2 x float>* %b
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @bitcast_v4i32(<4 x i32> *%a, <4 x float>* %b) #0 {
+; CHECK-LABEL: bitcast_v4i32:
+; CHECK: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <4 x i32>, <4 x i32>* %a
+  %cast = bitcast <4 x i32> %load to <4 x float>
+  store volatile <4 x float> %cast, <4 x float>* %b
+  ret void
+}
+
+define void @bitcast_v8i32(<8 x i32> *%a, <8 x float>* %b) #0 {
+; CHECK-LABEL: bitcast_v8i32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
+; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
+; CHECK-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <8 x i32>, <8 x i32>* %a
+  %cast = bitcast <8 x i32> %load to <8 x float>
+  store volatile <8 x float> %cast, <8 x float>* %b
+  ret void
+}
+
+define void @bitcast_v16i32(<16 x i32> *%a, <16 x float>* %b) #0 {
+; CHECK-LABEL: bitcast_v16i32:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_GE_512-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
+; VBITS_GE_512-NEXT: ret
+  %load = load volatile <16 x i32>, <16 x i32>* %a
+  %cast = bitcast <16 x i32> %load to <16 x float>
+  store volatile <16 x float> %cast, <16 x float>* %b
+  ret void
+}
+
+define void @bitcast_v32i32(<32 x i32> *%a, <32 x float>* %b) #0 {
+; CHECK-LABEL: bitcast_v32i32:
+; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
+; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_GE_1024-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
+; VBITS_GE_1024-NEXT: ret
+  %load = load volatile <32 x i32>, <32 x i32>* %a
+  %cast = bitcast <32 x i32> %load to <32 x float>
+  store volatile <32 x float> %cast, <32 x float>* %b
+  ret void
+}
+
+define void @bitcast_v64i32(<64 x i32> *%a, <64 x float>* %b) #0 {
+; CHECK-LABEL: bitcast_v64i32:
+; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
+; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_GE_2048-NEXT: st1w { [[OP]].s }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %load = load volatile <64 x i32>, <64 x i32>* %a
+  %cast = bitcast <64 x i32> %load to <64 x float>
+  store volatile <64 x float> %cast, <64 x float>* %b
+  ret void
+}
+
+; Don't use SVE for 64-bit vectors.
+define void @bitcast_v1i64(<1 x i64> *%a, <1 x double>* %b) #0 {
+; CHECK-LABEL: bitcast_v1i64:
+; CHECK: ldr d0, [x0]
+; CHECK-NEXT: str d0, [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <1 x i64>, <1 x i64>* %a
+  %cast = bitcast <1 x i64> %load to <1 x double>
+  store volatile <1 x double> %cast, <1 x double>* %b
+  ret void
+}
+
+; Don't use SVE for 128-bit vectors.
+define void @bitcast_v2i64(<2 x i64> *%a, <2 x double>* %b) #0 {
+; CHECK-LABEL: bitcast_v2i64:
+; CHECK: ldr q0, [x0]
+; CHECK-NEXT: str q0, [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <2 x i64>, <2 x i64>* %a
+  %cast = bitcast <2 x i64> %load to <2 x double>
+  store volatile <2 x double> %cast, <2 x double>* %b
+  ret void
+}
+
+define void @bitcast_v4i64(<4 x i64> *%a, <4 x double>* %b) #0 {
+; CHECK-LABEL: bitcast_v4i64:
+; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
+; CHECK-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
+; CHECK-NEXT: ret
+  %load = load volatile <4 x i64>, <4 x i64>* %a
+  %cast = bitcast <4 x i64> %load to <4 x double>
+  store volatile <4 x double> %cast, <4 x double>* %b
+  ret void
+}
+
+define void @bitcast_v8i64(<8 x i64> *%a, <8 x double>* %b) #0 {
+; CHECK-LABEL: bitcast_v8i64:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_GE_512-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
+; VBITS_GE_512-NEXT: ret
+  %load = load volatile <8 x i64>, <8 x i64>* %a
+  %cast = bitcast <8 x i64> %load to <8 x double>
+  store volatile <8 x double> %cast, <8 x double>* %b
+  ret void
+}
+
+define void @bitcast_v16i64(<16 x i64> *%a, <16 x double>* %b) #0 {
+; CHECK-LABEL: bitcast_v16i64:
+; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_GE_1024-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
+; VBITS_GE_1024-NEXT: ret
+  %load = load volatile <16 x i64>, <16 x i64>* %a
+  %cast = bitcast <16 x i64> %load to <16 x double>
+  store volatile <16 x double> %cast, <16 x double>* %b
+  ret void
+}
+
+define void @bitcast_v32i64(<32 x i64> *%a, <32 x double>* %b) #0 {
+; CHECK-LABEL: bitcast_v32i64:
+; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
+; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_GE_2048-NEXT: st1d { [[OP]].d }, [[PG]], [x1]
+; VBITS_GE_2048-NEXT: ret
+  %load = load volatile <32 x i64>, <32 x i64>* %a
+  %cast = bitcast <32 x i64> %load to <32 x double>
+  store volatile <32 x double> %cast, <32 x double>* %b
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }

