[llvm] 9cc5b06 - [PowerPC] Update handling of splat loads for v4i32/v4f32/v2i64 to require non-extending loads.
Amy Kwan via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 28 06:23:10 PST 2022
Author: Amy Kwan
Date: 2022-01-28T08:23:01-06:00
New Revision: 9cc5b064f185678be494fc48c18e832e10583e07
URL: https://github.com/llvm/llvm-project/commit/9cc5b064f185678be494fc48c18e832e10583e07
DIFF: https://github.com/llvm/llvm-project/commit/9cc5b064f185678be494fc48c18e832e10583e07.diff
LOG: [PowerPC] Update handling of splat loads for v4i32/v4f32/v2i64 to require non-extending loads.
This patch updates how splat loads are handled and is an extension of D106555.
In particular, for v2i64/v4f32/v4i32 types, they are updated to handle only
non-extending loads. For v8i16/v16i8 types, they are updated to handle extending
loads only if the memory VT matches the vector element VT.
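For reference, a minimal IR sketch (not part of this patch; a hedged illustration
modeled on the existing tests in load-and-splat.ll) of cases that should still
qualify for a splat load under the updated check: a non-extending i32 load
splatted into v4i32, and an i16 load splatted into v8i16, which type
legalization turns into an extending load whose memory VT matches the element VT.
```
; Hypothetical examples, not taken from this commit.

; Non-extending i32 load feeding only the splat: still eligible for
; PPCISD::LD_SPLAT (e.g. lxvwsx on Power9).
define <4 x i32> @splat_v4i32(i32* %p) {
entry:
  %v = load i32, i32* %p, align 4
  %ins = insertelement <4 x i32> undef, i32 %v, i64 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

; i16 load splatted into v8i16: legalized to an extending load, but the
; memory VT (i16) matches the vector element VT, so it remains eligible.
define <8 x i16> @splat_v8i16(i16* %p) {
entry:
  %v = load i16, i16* %p, align 2
  %ins = insertelement <8 x i16> undef, i16 %v, i64 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> poison, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}
```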
A test case has been added to illustrate a scenario where a PPCISD::LD_SPLAT
node should not be produced. The test depicts the following f64 extending load
used in a v2f64 build vector, where the extending load is also used in places
other than the build vector (such as in t12 and t16).
```
Type-legalized selection DAG: %bb.0 'test:entry'
SelectionDAG has 20 nodes:
t0: ch = EntryToken
t4: i64,ch = CopyFromReg t0, Register:i64 %1
t6: i64,ch = CopyFromReg t0, Register:i64 %2
t11: f64,ch = load<(load (s64) from %ir.b, !tbaa !7)> t0, t4, undef:i64
t16: f64 = fadd t31, t37
t34: ch = store<(store (s64) into %ir.c, !tbaa !7)> t31:1, t16, t6, undef:i64
t36: ch = TokenFactor t34, t37:1
t27: v2f64 = BUILD_VECTOR t37, t37
t22: ch,glue = CopyToReg t36, Register:v2f64 $v2, t27
t12: f64 = fadd t11, t37
t28: ch = store<(store (s64) into %ir.b, !tbaa !7)> t11:1, t12, t4, undef:i64
t31: f64,ch = load<(load (s64) from %ir.c, !tbaa !7)> t28, t6, undef:i64
t2: i64,ch = CopyFromReg t0, Register:i64 %0
t37: f64,ch = load<(load (s32) from %ir.a, !tbaa !3), anyext from f32> t0, t2, undef:i64
t23: ch = PPCISD::RET_FLAG t22, Register:v2f64 $v2, t22:1
```
Differential Revision: https://reviews.llvm.org/D117803
Added:
Modified:
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/test/CodeGen/PowerPC/load-and-splat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 25cc34badda04..90479eea1ad64 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9093,22 +9093,30 @@ bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
unsigned &Opcode) {
- const SDNode *InputNode = Op.getOperand(0).getNode();
- if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode))
- return false;
-
- if (!Subtarget.hasVSX())
+ LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Op.getOperand(0));
+ if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(InputNode))
return false;
EVT Ty = Op->getValueType(0);
- if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 ||
- Ty == MVT::v8i16 || Ty == MVT::v16i8)
+ // For v2f64, v4f32 and v4i32 types, we require the load to be non-extending
+ // as we cannot handle extending loads for these types.
+ if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&
+ ISD::isNON_EXTLoad(InputNode))
+ return true;
+
+ EVT MemVT = InputNode->getMemoryVT();
+ // For v8i16 and v16i8 types, extending loads can be handled as long as the
+ // memory VT is the same vector element VT type.
+ // The loads feeding into the v8i16 and v16i8 types will be extending because
+ // scalar i8/i16 are not legal types.
+ if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(InputNode) &&
+ (MemVT == Ty.getVectorElementType()))
return true;
if (Ty == MVT::v2i64) {
// Check the extend type, when the input type is i32, and the output vector
// type is v2i64.
- if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) {
+ if (MemVT == MVT::i32) {
if (ISD::isZEXTLoad(InputNode))
Opcode = PPCISD::ZEXT_LD_SPLAT;
if (ISD::isSEXTLoad(InputNode))
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index ce44cdc18aae1..a72472d53502b 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -1242,3 +1242,92 @@ entry:
ret <16 x i8> %splat.splat
}
+; The following test case should not produce a load and splat node,
+; as we cannot handle extending loads (from f32 to f64), and this test
+; shows that there are multiple uses of the extending load (other than
+; a build vector node). `lxvdsx` should not be produced in this case.
+define <2 x double> @test_v2f64_multiple_use(float* nocapture readonly %a, double* nocapture %b, double* nocapture %c) {
+; P9-LABEL: test_v2f64_multiple_use:
+; P9: # %bb.0: # %entry
+; P9-NEXT: lfs f0, 0(r3)
+; P9-NEXT: xxspltd v2, vs0, 0
+; P9-NEXT: lfd f1, 0(r4)
+; P9-NEXT: xsadddp f1, f1, f0
+; P9-NEXT: stfd f1, 0(r4)
+; P9-NEXT: lfd f1, 0(r5)
+; P9-NEXT: xsadddp f1, f1, f0
+; P9-NEXT: stfd f1, 0(r5)
+; P9-NEXT: blr
+;
+; P8-LABEL: test_v2f64_multiple_use:
+; P8: # %bb.0: # %entry
+; P8-NEXT: lfs f0, 0(r3)
+; P8-NEXT: lfd f1, 0(r4)
+; P8-NEXT: xsadddp f1, f1, f0
+; P8-NEXT: xxspltd v2, vs0, 0
+; P8-NEXT: stfd f1, 0(r4)
+; P8-NEXT: lfd f1, 0(r5)
+; P8-NEXT: xsadddp f1, f1, f0
+; P8-NEXT: stfd f1, 0(r5)
+; P8-NEXT: blr
+;
+; P7-LABEL: test_v2f64_multiple_use:
+; P7: # %bb.0: # %entry
+; P7-NEXT: lfs f0, 0(r3)
+; P7-NEXT: lfd f1, 0(r4)
+; P7-NEXT: xsadddp f1, f1, f0
+; P7-NEXT: xxspltd v2, vs0, 0
+; P7-NEXT: stfd f1, 0(r4)
+; P7-NEXT: lfd f1, 0(r5)
+; P7-NEXT: xsadddp f1, f1, f0
+; P7-NEXT: stfd f1, 0(r5)
+; P7-NEXT: blr
+;
+; P9-AIX32-LABEL: test_v2f64_multiple_use:
+; P9-AIX32: # %bb.0: # %entry
+; P9-AIX32-NEXT: lfs f0, 0(r3)
+; P9-AIX32-NEXT: xxmrghd v2, vs0, vs0
+; P9-AIX32-NEXT: lfd f1, 0(r4)
+; P9-AIX32-NEXT: xsadddp f1, f1, f0
+; P9-AIX32-NEXT: stfd f1, 0(r4)
+; P9-AIX32-NEXT: lfd f1, 0(r5)
+; P9-AIX32-NEXT: xsadddp f1, f1, f0
+; P9-AIX32-NEXT: stfd f1, 0(r5)
+; P9-AIX32-NEXT: blr
+;
+; P8-AIX32-LABEL: test_v2f64_multiple_use:
+; P8-AIX32: # %bb.0: # %entry
+; P8-AIX32-NEXT: lfs f0, 0(r3)
+; P8-AIX32-NEXT: lfd f1, 0(r4)
+; P8-AIX32-NEXT: xsadddp f1, f1, f0
+; P8-AIX32-NEXT: xxmrghd v2, vs0, vs0
+; P8-AIX32-NEXT: stfd f1, 0(r4)
+; P8-AIX32-NEXT: lfd f1, 0(r5)
+; P8-AIX32-NEXT: xsadddp f1, f1, f0
+; P8-AIX32-NEXT: stfd f1, 0(r5)
+; P8-AIX32-NEXT: blr
+;
+; P7-AIX32-LABEL: test_v2f64_multiple_use:
+; P7-AIX32: # %bb.0: # %entry
+; P7-AIX32-NEXT: lfs f0, 0(r3)
+; P7-AIX32-NEXT: lfd f1, 0(r4)
+; P7-AIX32-NEXT: xsadddp f1, f1, f0
+; P7-AIX32-NEXT: xxmrghd v2, vs0, vs0
+; P7-AIX32-NEXT: stfd f1, 0(r4)
+; P7-AIX32-NEXT: lfd f1, 0(r5)
+; P7-AIX32-NEXT: xsadddp f1, f1, f0
+; P7-AIX32-NEXT: stfd f1, 0(r5)
+; P7-AIX32-NEXT: blr
+entry:
+ %0 = load float, float* %a, align 4
+ %conv = fpext float %0 to double
+ %1 = load double, double* %b, align 8
+ %add = fadd double %1, %conv
+ store double %add, double* %b, align 8
+ %2 = load double, double* %c, align 8
+ %add2 = fadd double %2, %conv
+ store double %add2, double* %c, align 8
+ %vecinit = insertelement <2 x double> undef, double %conv, i64 0
+ %vecinit5 = shufflevector <2 x double> %vecinit, <2 x double> poison, <2 x i32> zeroinitializer
+ ret <2 x double> %vecinit5
+}