[llvm] cd3d7bf - [AArch64][SVE] Add DAG-Combine to push bitcasts from floating point loads after DUPLANE128

Thu Jul 21 04:00:33 PDT 2022

Author: Matt Devereau
Date: 2022-07-21T11:00:10Z
New Revision: cd3d7bf15d3e64ca2572c1711df2b849cada8d3b

URL: https://github.com/llvm/llvm-project/commit/cd3d7bf15d3e64ca2572c1711df2b849cada8d3b
DIFF: https://github.com/llvm/llvm-project/commit/cd3d7bf15d3e64ca2572c1711df2b849cada8d3b.diff

LOG: [AArch64][SVE] Add DAG-Combine to push bitcasts from floating point loads after DUPLANE128

This patch lowers
  duplane128(insert_subvector(undef, bitcast(op(128bitsubvec)), 0), 0)
to
  bitcast(duplane128(insert_subvector(undef, op(128bitsubvec), 0), 0)).

This enables floating-point loads to match patterns added in
https://reviews.llvm.org/D130010

Differential Revision: https://reviews.llvm.org/D130013

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-ld1r.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cb20554e6a6b..52f026456f02 100644

--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19256,6 +19256,41 @@ static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
 }
 
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue Insert = N->getOperand(0);
+  if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+    return SDValue();
+
+  if (!Insert.getOperand(0).isUndef())
+    return SDValue();
+
+  uint64_t IdxInsert = Insert.getConstantOperandVal(2);
+  uint64_t IdxDupLane = N->getConstantOperandVal(1);
+  if (IdxInsert != IdxDupLane)
+    return SDValue();
+
+  SDValue Bitcast = Insert.getOperand(1);
+  if (Bitcast.getOpcode() != ISD::BITCAST)
+    return SDValue();
+
+  SDValue Subvec = Bitcast.getOperand(0);
+  EVT SubvecVT = Subvec.getValueType();
+  if (!SubvecVT.is128BitVector())
+    return SDValue();
+  EVT NewSubvecVT =
+      getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
+
+  SDLoc DL(N);
+  SDValue NewInsert =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
+                  DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
+  SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
+                                      NewInsert, N->getOperand(1));
+  return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -19342,6 +19377,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performCSELCombine(N, DCI, DAG);
   case AArch64ISD::DUP:
     return performDUPCombine(N, DCI);
+  case AArch64ISD::DUPLANE128:
+    return performDupLane128Combine(N, DAG);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case AArch64ISD::SPLICE:

diff  --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
index eafa10754b44..ce019e5d220c 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -726,8 +726,8 @@ define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_down(double* %valp)
 define <vscale x 2 x double> @dupq_ld1rqd_f64(<2 x double>* %a) {
 ; CHECK-LABEL: dupq_ld1rqd_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <2 x double>, <2 x double>* %a
   %2 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %1, i64 0)
@@ -738,8 +738,8 @@ define <vscale x 2 x double> @dupq_ld1rqd_f64(<2 x double>* %a) {
 define <vscale x 4 x float> @dupq_ld1rqw_f32(<4 x float>* %a) {
 ; CHECK-LABEL: dupq_ld1rqw_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <4 x float>, <4 x float>* %a
   %2 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %1, i64 0)
@@ -750,8 +750,8 @@ define <vscale x 4 x float> @dupq_ld1rqw_f32(<4 x float>* %a) {
 define <vscale x 8 x half> @dupq_ld1rqh_f16(<8 x half>* %a) {
 ; CHECK-LABEL: dupq_ld1rqh_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x half>, <8 x half>* %a
   %2 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %1, i64 0)
@@ -762,8 +762,8 @@ define <vscale x 8 x half> @dupq_ld1rqh_f16(<8 x half>* %a) {
 define <vscale x 8 x bfloat> @dupq_ld1rqh_bf16(<8 x bfloat>* %a) #0 {
 ; CHECK-LABEL: dupq_ld1rqh_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z0.q, q0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %1 = load <8 x bfloat>, <8 x bfloat>* %a
   %2 = tail call fast <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %1, i64 0)