[llvm] 9cc5b06 - [PowerPC] Update handling of splat loads for v4i32/v4f32/v2i64 to require non-extending loads.

Amy Kwan via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 28 06:23:10 PST 2022


Author: Amy Kwan
Date: 2022-01-28T08:23:01-06:00
New Revision: 9cc5b064f185678be494fc48c18e832e10583e07

URL: https://github.com/llvm/llvm-project/commit/9cc5b064f185678be494fc48c18e832e10583e07
DIFF: https://github.com/llvm/llvm-project/commit/9cc5b064f185678be494fc48c18e832e10583e07.diff

LOG: [PowerPC] Update handling of splat loads for v4i32/v4f32/v2i64 to require non-extending loads.

This patch updates how splat loads are handled and is an extension of D106555.

In particular, the v2i64/v4f32/v4i32 types are updated to handle only
non-extending loads, while the v8i16/v16i8 types are updated to handle extending
loads only if the memory VT matches the vector element VT.
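
As an illustration (not part of this commit; the function names and IR below are
only a sketch of the usual splatted-load pattern), splats fed by loads like the
following should remain eligible for PPCISD::LD_SPLAT under the updated check:
the f32 load is non-extending, and the i16 load's memory VT matches the v8i16
element VT.
```
; Illustrative only. The f32 load below is non-extending, so the v4f32 splat
; can still be matched; the i16 load is extending (i16 is not a legal scalar
; type), but its memory VT equals the vector element VT, so the v8i16 splat
; can also still be matched.
define <4 x float> @splat_v4f32(float* %p) {
entry:
  %0 = load float, float* %p, align 4
  %vecinit = insertelement <4 x float> undef, float %0, i64 0
  %splat = shufflevector <4 x float> %vecinit, <4 x float> poison, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}

define <8 x i16> @splat_v8i16(i16* %p) {
entry:
  %0 = load i16, i16* %p, align 2
  %vecinit = insertelement <8 x i16> undef, i16 %0, i64 0
  %splat = shufflevector <8 x i16> %vecinit, <8 x i16> poison, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}
```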

A test case has been added to illustrate a scenario where a PPCISD::LD_SPLAT
node should not be produced. The test depicts the following extending load
(t37, anyext from f32 to f64) feeding a v2f64 build vector, where the extending
load also has uses other than the build vector (namely t12 and t16).
```
Type-legalized selection DAG: %bb.0 'test:entry'
SelectionDAG has 20 nodes:
  t0: ch = EntryToken
  t4: i64,ch = CopyFromReg t0, Register:i64 %1
  t6: i64,ch = CopyFromReg t0, Register:i64 %2
  t11: f64,ch = load<(load (s64) from %ir.b, !tbaa !7)> t0, t4, undef:i64
        t16: f64 = fadd t31, t37
      t34: ch = store<(store (s64) into %ir.c, !tbaa !7)> t31:1, t16, t6, undef:i64
    t36: ch = TokenFactor t34, t37:1
    t27: v2f64 = BUILD_VECTOR t37, t37
  t22: ch,glue = CopyToReg t36, Register:v2f64 $v2, t27
      t12: f64 = fadd t11, t37
    t28: ch = store<(store (s64) into %ir.b, !tbaa !7)> t11:1, t12, t4, undef:i64
  t31: f64,ch = load<(load (s64) from %ir.c, !tbaa !7)> t28, t6, undef:i64
    t2: i64,ch = CopyFromReg t0, Register:i64 %0
  t37: f64,ch = load<(load (s32) from %ir.a, !tbaa !3), anyext from f32> t0, t2, undef:i64
  t23: ch = PPCISD::RET_FLAG t22, Register:v2f64 $v2, t22:1
```

Differential Revision: https://reviews.llvm.org/D117803

Added: 
    

Modified: 
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/test/CodeGen/PowerPC/load-and-splat.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 25cc34badda04..90479eea1ad64 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9093,22 +9093,30 @@ bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
 
 static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
                              unsigned &Opcode) {
-  const SDNode *InputNode = Op.getOperand(0).getNode();
-  if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode))
-    return false;
-
-  if (!Subtarget.hasVSX())
+  LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Op.getOperand(0));
+  if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(InputNode))
     return false;
 
   EVT Ty = Op->getValueType(0);
-  if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 ||
-      Ty == MVT::v8i16 || Ty == MVT::v16i8)
+  // For v2f64, v4f32 and v4i32 types, we require the load to be non-extending
+  // as we cannot handle extending loads for these types.
+  if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&
+      ISD::isNON_EXTLoad(InputNode))
+    return true;
+
+  EVT MemVT = InputNode->getMemoryVT();
+  // For v8i16 and v16i8 types, extending loads can be handled as long as the
+  // memory VT is the same vector element VT type.
+  // The loads feeding into the v8i16 and v16i8 types will be extending because
+  // scalar i8/i16 are not legal types.
+  if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(InputNode) &&
+      (MemVT == Ty.getVectorElementType()))
     return true;
 
   if (Ty == MVT::v2i64) {
     // Check the extend type, when the input type is i32, and the output vector
     // type is v2i64.
-    if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) {
+    if (MemVT == MVT::i32) {
       if (ISD::isZEXTLoad(InputNode))
         Opcode = PPCISD::ZEXT_LD_SPLAT;
       if (ISD::isSEXTLoad(InputNode))

diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index ce44cdc18aae1..a72472d53502b 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -1242,3 +1242,92 @@ entry:
   ret <16 x i8> %splat.splat
 }
 
+; The following test case should not produce a load and splat node,
+; as we cannot handle extending loads (from f32 to f64), and this test
+; shows that there are multiple uses of the extending load (other than
+; a build vector node). `lxvdsx` should not be produced in this case.
+define <2 x double> @test_v2f64_multiple_use(float* nocapture readonly %a, double* nocapture %b, double* nocapture %c) {
+; P9-LABEL: test_v2f64_multiple_use:
+; P9:       # %bb.0: # %entry
+; P9-NEXT:    lfs f0, 0(r3)
+; P9-NEXT:    xxspltd v2, vs0, 0
+; P9-NEXT:    lfd f1, 0(r4)
+; P9-NEXT:    xsadddp f1, f1, f0
+; P9-NEXT:    stfd f1, 0(r4)
+; P9-NEXT:    lfd f1, 0(r5)
+; P9-NEXT:    xsadddp f1, f1, f0
+; P9-NEXT:    stfd f1, 0(r5)
+; P9-NEXT:    blr
+;
+; P8-LABEL: test_v2f64_multiple_use:
+; P8:       # %bb.0: # %entry
+; P8-NEXT:    lfs f0, 0(r3)
+; P8-NEXT:    lfd f1, 0(r4)
+; P8-NEXT:    xsadddp f1, f1, f0
+; P8-NEXT:    xxspltd v2, vs0, 0
+; P8-NEXT:    stfd f1, 0(r4)
+; P8-NEXT:    lfd f1, 0(r5)
+; P8-NEXT:    xsadddp f1, f1, f0
+; P8-NEXT:    stfd f1, 0(r5)
+; P8-NEXT:    blr
+;
+; P7-LABEL: test_v2f64_multiple_use:
+; P7:       # %bb.0: # %entry
+; P7-NEXT:    lfs f0, 0(r3)
+; P7-NEXT:    lfd f1, 0(r4)
+; P7-NEXT:    xsadddp f1, f1, f0
+; P7-NEXT:    xxspltd v2, vs0, 0
+; P7-NEXT:    stfd f1, 0(r4)
+; P7-NEXT:    lfd f1, 0(r5)
+; P7-NEXT:    xsadddp f1, f1, f0
+; P7-NEXT:    stfd f1, 0(r5)
+; P7-NEXT:    blr
+;
+; P9-AIX32-LABEL: test_v2f64_multiple_use:
+; P9-AIX32:       # %bb.0: # %entry
+; P9-AIX32-NEXT:    lfs f0, 0(r3)
+; P9-AIX32-NEXT:    xxmrghd v2, vs0, vs0
+; P9-AIX32-NEXT:    lfd f1, 0(r4)
+; P9-AIX32-NEXT:    xsadddp f1, f1, f0
+; P9-AIX32-NEXT:    stfd f1, 0(r4)
+; P9-AIX32-NEXT:    lfd f1, 0(r5)
+; P9-AIX32-NEXT:    xsadddp f1, f1, f0
+; P9-AIX32-NEXT:    stfd f1, 0(r5)
+; P9-AIX32-NEXT:    blr
+;
+; P8-AIX32-LABEL: test_v2f64_multiple_use:
+; P8-AIX32:       # %bb.0: # %entry
+; P8-AIX32-NEXT:    lfs f0, 0(r3)
+; P8-AIX32-NEXT:    lfd f1, 0(r4)
+; P8-AIX32-NEXT:    xsadddp f1, f1, f0
+; P8-AIX32-NEXT:    xxmrghd v2, vs0, vs0
+; P8-AIX32-NEXT:    stfd f1, 0(r4)
+; P8-AIX32-NEXT:    lfd f1, 0(r5)
+; P8-AIX32-NEXT:    xsadddp f1, f1, f0
+; P8-AIX32-NEXT:    stfd f1, 0(r5)
+; P8-AIX32-NEXT:    blr
+;
+; P7-AIX32-LABEL: test_v2f64_multiple_use:
+; P7-AIX32:       # %bb.0: # %entry
+; P7-AIX32-NEXT:    lfs f0, 0(r3)
+; P7-AIX32-NEXT:    lfd f1, 0(r4)
+; P7-AIX32-NEXT:    xsadddp f1, f1, f0
+; P7-AIX32-NEXT:    xxmrghd v2, vs0, vs0
+; P7-AIX32-NEXT:    stfd f1, 0(r4)
+; P7-AIX32-NEXT:    lfd f1, 0(r5)
+; P7-AIX32-NEXT:    xsadddp f1, f1, f0
+; P7-AIX32-NEXT:    stfd f1, 0(r5)
+; P7-AIX32-NEXT:    blr
+entry:
+  %0 = load float, float* %a, align 4
+  %conv = fpext float %0 to double
+  %1 = load double, double* %b, align 8
+  %add = fadd double %1, %conv
+  store double %add, double* %b, align 8
+  %2 = load double, double* %c, align 8
+  %add2 = fadd double %2, %conv
+  store double %add2, double* %c, align 8
+  %vecinit = insertelement <2 x double> undef, double %conv, i64 0
+  %vecinit5 = shufflevector <2 x double> %vecinit, <2 x double> poison, <2 x i32> zeroinitializer
+  ret <2 x double> %vecinit5
+}




More information about the llvm-commits mailing list