[llvm] 947f78a - [SystemZ] Fix/optimize vec_load_len and related intrinsics
Ulrich Weigand via llvm-commits
llvm-commits at lists.llvm.org
Wed May 6 12:16:24 PDT 2020
Author: Ulrich Weigand
Date: 2020-05-06T21:15:58+02:00
New Revision: 947f78ac27f4ea52a443ba9d5983cfe3eaf51148
URL: https://github.com/llvm/llvm-project/commit/947f78ac27f4ea52a443ba9d5983cfe3eaf51148
DIFF: https://github.com/llvm/llvm-project/commit/947f78ac27f4ea52a443ba9d5983cfe3eaf51148.diff
LOG: [SystemZ] Fix/optimize vec_load_len and related intrinsics
When using vec_load/store_len_r with an immediate length operand
of 16 or larger, LLVM will currently emit a VLRL/VSTRL instruction
with that immediate. This creates a valid encoding (which should be
supported by the assembler), but it always traps at runtime. This
patch fixes the problem by no longer creating VLRL/VSTRL in those cases.
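For reference, here is a minimal LLVM IR sketch of the trapping case,
written in the style of the test files below (as in vec-intrinsics-02.ll,
this intrinsic needs the vector-packed-decimal facility, i.e. z14/arch12);
the function name is made up for illustration:

declare <16 x i8> @llvm.s390.vlrl(i32, i8 *)

; A constant length of 16 used to be selected as VLRL with that
; immediate, a valid encoding that unconditionally traps at run time.
define <16 x i8> @vlrl_const16(i8 *%ptr) {
  %res = call <16 x i8> @llvm.s390.vlrl(i32 16, i8 *%ptr)
  ret <16 x i8> %res
}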
On its own, that change would mean loading the length into a register
and calling VLRLR/VSTRLR instead. However, these operations with a
length of 15 or larger are simply equivalent to a full vector load or
store, and the same holds true for vec_load/store_len as well.
Therefore, add a DAGCombine rule that replaces those operations with
plain vector loads or stores whenever the length is known at compile
time and is equal to or larger than 15.
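As a quick illustration of the new rule (again in the style of the
test files, with a made-up function name), any length that is known at
compile time and is at least 15 now selects a plain vector store:

declare void @llvm.s390.vstl(<16 x i8>, i32, i8 *)

; The length 100 covers the whole 16-byte vector, so the DAGCombine
; rule turns this into a plain VST instead of materializing the length
; in a register for VSTL.
define void @vstl_const100(<16 x i8> %vec, i8 *%ptr) {
  call void @llvm.s390.vstl(<16 x i8> %vec, i32 100, i8 *%ptr)
  ret void
}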
Added:
Modified:
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
llvm/lib/Target/SystemZ/SystemZISelLowering.h
llvm/lib/Target/SystemZ/SystemZInstrVector.td
llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 8e05e872074d..c65f5d7ddcdf 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -649,6 +649,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::UDIV);
setTargetDAGCombine(ISD::SREM);
setTargetDAGCombine(ISD::UREM);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
// Handle intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -6413,6 +6415,34 @@ SDValue SystemZTargetLowering::combineIntDIVREM(
return SDValue();
}
+SDValue SystemZTargetLowering::combineINTRINSIC(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ unsigned Id = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (Id) {
+ // VECTOR LOAD (RIGHTMOST) WITH LENGTH with a length operand of 15
+ // or larger is simply a vector load.
+ case Intrinsic::s390_vll:
+ case Intrinsic::s390_vlrl:
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (C->getZExtValue() >= 15)
+ return DAG.getLoad(N->getValueType(0), SDLoc(N), N->getOperand(0),
+ N->getOperand(3), MachinePointerInfo());
+ break;
+ // Likewise for VECTOR STORE (RIGHTMOST) WITH LENGTH.
+ case Intrinsic::s390_vstl:
+ case Intrinsic::s390_vstrl:
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+ if (C->getZExtValue() >= 15)
+ return DAG.getStore(N->getOperand(0), SDLoc(N), N->getOperand(2),
+ N->getOperand(4), MachinePointerInfo());
+ break;
+ }
+
+ return SDValue();
+}
+
SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const {
if (N->getOpcode() == SystemZISD::PCREL_WRAPPER)
return N->getOperand(0);
@@ -6447,6 +6477,8 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: return combineIntDIVREM(N, DCI);
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_VOID: return combineINTRINSIC(N, DCI);
}
return SDValue();
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 3f46ba04d541..dd6098d7bb94 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -648,6 +648,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineINTRINSIC(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue unwrapAddress(SDValue N) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index de3a834acb86..171d0303a165 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -177,9 +177,13 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVectorPackedDecimal] in {
// Load rightmost with length. The number of loaded bytes is only known
- // at run time.
- def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>;
+ // at run time. Note that while the instruction will accept immediate
+ // lengths larger than 15, those will always result in a trap at run time,
+ // so we never emit them here.
+ def VLRL : BinaryVSI<"vlrl", 0xE635, null_frag, 0>;
def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>;
+ def : Pat<(int_s390_vlrl imm32zx4:$len, bdaddr12only:$addr),
+ (VLRL bdaddr12only:$addr, imm32zx4:$len)>;
}
// Use replicating loads if we're inserting a single element into an
@@ -243,9 +247,13 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVectorPackedDecimal] in {
// Store rightmost with length. The number of stored bytes is only known
- // at run time.
- def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>;
+ // at run time. Note that while the instruction will accept immediate
+ // lengths larger than 15, those will always result in a trap at run time,
+ // so we never emit them here.
+ def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, null_frag, 0>;
def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>;
+ def : Pat<(int_s390_vstrl VR128:$val, imm32zx4:$len, bdaddr12only:$addr),
+ (VSTRL VR128:$val, bdaddr12only:$addr, imm32zx4:$len)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
index a576dc24e44f..87fb3fa4afee 100644
--- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
@@ -322,6 +322,15 @@ define <16 x i8> @test_vll4(i8 *%base, i64 %index, i32 %length) {
ret <16 x i8> %res
}
+; VLL with length >= 15 should become VL.
+define <16 x i8> @test_vll5(i8 *%ptr) {
+; CHECK-LABEL: test_vll5:
+; CHECK: vl %v24, 0({{%r[1-5]}})
+; CHECK: br %r14
+ %res = call <16 x i8> @llvm.s390.vll(i32 15, i8 *%ptr)
+ ret <16 x i8> %res
+}
+
; VPDI taking element 0 from each half.
define <2 x i64> @test_vpdi1(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vpdi1:
@@ -663,6 +672,15 @@ define void @test_vstl4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) {
ret void
}
+; VSTL with length >= 15 should become VST.
+define void @test_vstl5(<16 x i8> %vec, i8 *%ptr) {
+; CHECK-LABEL: test_vstl5:
+; CHECK: vst %v24, 0({{%r[1-5]}})
+; CHECK: br %r14
+ call void @llvm.s390.vstl(<16 x i8> %vec, i32 15, i8 *%ptr)
+ ret void
+}
+
; VUPHB.
define <8 x i16> @test_vuphb(<16 x i8> %a) {
; CHECK-LABEL: test_vuphb:
diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll
index 397d10e02e24..7e095807d1b2 100644
--- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll
@@ -123,6 +123,15 @@ define <16 x i8> @test_vlrl4(i8 *%base, i64 %index) {
ret <16 x i8> %res
}
+; VLRL with length >= 15 should become VL.
+define <16 x i8> @test_vlrl5(i8 *%ptr) {
+; CHECK-LABEL: test_vlrl5:
+; CHECK: vl %v24, 0({{%r[1-5]}})
+; CHECK: br %r14
+ %res = call <16 x i8> @llvm.s390.vlrl(i32 15, i8 *%ptr)
+ ret <16 x i8> %res
+}
+
; VSTRLR with the lowest in-range displacement.
define void @test_vstrlr1(<16 x i8> %vec, i8 *%ptr, i32 %length) {
; CHECK-LABEL: test_vstrlr1:
@@ -201,6 +210,15 @@ define void @test_vstrl4(<16 x i8> %vec, i8 *%base, i64 %index) {
ret void
}
+; VSTRL with length >= 15 should become VST.
+define void @test_vstrl5(<16 x i8> %vec, i8 *%ptr) {
+; CHECK-LABEL: test_vstrl5:
+; CHECK: vst %v24, 0({{%r[1-5]}})
+; CHECK: br %r14
+ call void @llvm.s390.vstrl(<16 x i8> %vec, i32 15, i8 *%ptr)
+ ret void
+}
+
; VFCESBS with no processing of the result.
define i32 @test_vfcesbs(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfcesbs: