[llvm] 947f78a - [SystemZ] Fix/optimize vec_load_len and related intrinsics
Ulrich Weigand via llvm-commits
llvm-commits at lists.llvm.org
Wed May 6 12:16:24 PDT 2020
Author: Ulrich Weigand
Date: 2020-05-06T21:15:58+02:00
New Revision: 947f78ac27f4ea52a443ba9d5983cfe3eaf51148
URL: https://github.com/llvm/llvm-project/commit/947f78ac27f4ea52a443ba9d5983cfe3eaf51148
DIFF: https://github.com/llvm/llvm-project/commit/947f78ac27f4ea52a443ba9d5983cfe3eaf51148.diff
LOG: [SystemZ] Fix/optimize vec_load_len and related intrinsics
When using vec_load/store_len_r with an immediate length operand
of 16 or larger, LLVM will currently emit a VLRL/VSTRL instruction
with that immediate. This creates a valid encoding (which should be
supported by the assembler), but it always traps at runtime. This
patch fixes the problem by no longer creating VLRL/VSTRL in those cases.
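For reference, here is a minimal LLVM IR sketch of the trapping case,
written in the style of the test files below (as in vec-intrinsics-02.ll,
this intrinsic needs the vector-packed-decimal facility, i.e. z14/arch12);
the function name is made up for illustration:

declare <16 x i8> @llvm.s390.vlrl(i32, i8 *)

; A constant length of 16 used to be selected as VLRL with that
; immediate, a valid encoding that unconditionally traps at run time.
define <16 x i8> @vlrl_const16(i8 *%ptr) {
  %res = call <16 x i8> @llvm.s390.vlrl(i32 16, i8 *%ptr)
  ret <16 x i8> %res
}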
On its own, that change would mean loading the length into a register
and calling VLRLR/VSTRLR instead. However, these operations with a
length of 15 or larger are simply equivalent to a full vector load or
store, and the same holds true for vec_load/store_len as well.
Therefore, add a DAGCombine rule that replaces those operations with
plain vector loads or stores whenever the length is known at compile
time and is equal to or larger than 15.
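As a quick illustration of the new rule (again in the style of the
test files, with a made-up function name), any length that is known at
compile time and is at least 15 now selects a plain vector store:

declare void @llvm.s390.vstl(<16 x i8>, i32, i8 *)

; The length 100 covers the whole 16-byte vector, so the DAGCombine
; rule turns this into a plain VST instead of materializing the length
; in a register for VSTL.
define void @vstl_const100(<16 x i8> %vec, i8 *%ptr) {
  call void @llvm.s390.vstl(<16 x i8> %vec, i32 100, i8 *%ptr)
  ret void
}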
Added:
Modified:
llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
llvm/lib/Target/SystemZ/SystemZISelLowering.h
llvm/lib/Target/SystemZ/SystemZInstrVector.td
llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 8e05e872074d..c65f5d7ddcdf 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -649,6 +649,8 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::UDIV);
setTargetDAGCombine(ISD::SREM);
setTargetDAGCombine(ISD::UREM);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
// Handle intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -6413,6 +6415,34 @@ SDValue SystemZTargetLowering::combineIntDIVREM(
return SDValue();
}
+SDValue SystemZTargetLowering::combineINTRINSIC(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ unsigned Id = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (Id) {
+ // VECTOR LOAD (RIGHTMOST) WITH LENGTH with a length operand of 15
+ // or larger is simply a vector load.
+ case Intrinsic::s390_vll:
+ case Intrinsic::s390_vlrl:
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (C->getZExtValue() >= 15)
+ return DAG.getLoad(N->getValueType(0), SDLoc(N), N->getOperand(0),
+ N->getOperand(3), MachinePointerInfo());
+ break;
+ // Likewise for VECTOR STORE (RIGHTMOST) WITH LENGTH.
+ case Intrinsic::s390_vstl:
+ case Intrinsic::s390_vstrl:
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+ if (C->getZExtValue() >= 15)
+ return DAG.getStore(N->getOperand(0), SDLoc(N), N->getOperand(2),
+ N->getOperand(4), MachinePointerInfo());
+ break;
+ }
+
+ return SDValue();
+}
+
SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const {
if (N->getOpcode() == SystemZISD::PCREL_WRAPPER)
return N->getOperand(0);
@@ -6447,6 +6477,8 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: return combineIntDIVREM(N, DCI);
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_VOID: return combineINTRINSIC(N, DCI);
}
return SDValue();
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 3f46ba04d541..dd6098d7bb94 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -648,6 +648,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineINTRINSIC(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue unwrapAddress(SDValue N) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index de3a834acb86..171d0303a165 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -177,9 +177,13 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVectorPackedDecimal] in {
// Load rightmost with length. The number of loaded bytes is only known
- // at run time.
- def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>;
+ // at run time. Note that while the instruction will accept immediate
+ // lengths larger than 15, those will always result in a trap at run time,
+ // so we never emit them here.
+ def VLRL : BinaryVSI<"vlrl", 0xE635, null_frag, 0>;
def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>;
+ def : Pat<(int_s390_vlrl imm32zx4:$len, bdaddr12only:$addr),
+ (VLRL bdaddr12only:$addr, imm32zx4:$len)>;
}
// Use replicating loads if we're inserting a single element into an
@@ -243,9 +247,13 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVectorPackedDecimal] in {
// Store rightmost with length. The number of stored bytes is only known
- // at run time.
- def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>;
+ // at run time. Note that while the instruction will accept immediate
+ // lengths larger than 15, those will always result in a trap at run time,
+ // so we never emit them here.
+ def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, null_frag, 0>;
def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>;
+ def : Pat<(int_s390_vstrl VR128:$val, imm32zx4:$len, bdaddr12only:$addr),
+ (VSTRL VR128:$val, bdaddr12only:$addr, imm32zx4:$len)>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
index a576dc24e44f..87fb3fa4afee 100644
--- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-01.ll
@@ -322,6 +322,15 @@ define <16 x i8> @test_vll4(i8 *%base, i64 %index, i32 %length) {
ret <16 x i8> %res
}
+; VLL with length >= 15 should become VL.
+define <16 x i8> @test_vll5(i8 *%ptr) {
+; CHECK-LABEL: test_vll5:
+; CHECK: vl %v24, 0({{%r[1-5]}})
+; CHECK: br %r14
+ %res = call <16 x i8> @llvm.s390.vll(i32 15, i8 *%ptr)
+ ret <16 x i8> %res
+}
+
; VPDI taking element 0 from each half.
define <2 x i64> @test_vpdi1(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_vpdi1:
@@ -663,6 +672,15 @@ define void @test_vstl4(<16 x i8> %vec, i8 *%base, i64 %index, i32 %length) {
ret void
}
+; VSTL with length >= 15 should become VST.
+define void @test_vstl5(<16 x i8> %vec, i8 *%ptr) {
+; CHECK-LABEL: test_vstl5:
+; CHECK: vst %v24, 0({{%r[1-5]}})
+; CHECK: br %r14
+ call void @llvm.s390.vstl(<16 x i8> %vec, i32 15, i8 *%ptr)
+ ret void
+}
+
; VUPHB.
define <8 x i16> @test_vuphb(<16 x i8> %a) {
; CHECK-LABEL: test_vuphb:
diff --git a/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll b/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll
index 397d10e02e24..7e095807d1b2 100644
--- a/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-intrinsics-02.ll
@@ -123,6 +123,15 @@ define <16 x i8> @test_vlrl4(i8 *%base, i64 %index) {
ret <16 x i8> %res
}
+; VLRL with length >= 15 should become VL.
+define <16 x i8> @test_vlrl5(i8 *%ptr) {
+; CHECK-LABEL: test_vlrl5:
+; CHECK: vl %v24, 0({{%r[1-5]}})
+; CHECK: br %r14
+ %res = call <16 x i8> @llvm.s390.vlrl(i32 15, i8 *%ptr)
+ ret <16 x i8> %res
+}
+
; VSTRLR with the lowest in-range displacement.
define void @test_vstrlr1(<16 x i8> %vec, i8 *%ptr, i32 %length) {
; CHECK-LABEL: test_vstrlr1:
@@ -201,6 +210,15 @@ define void @test_vstrl4(<16 x i8> %vec, i8 *%base, i64 %index) {
ret void
}
+; VSTRL with length >= 15 should become VST.
+define void @test_vstrl5(<16 x i8> %vec, i8 *%ptr) {
+; CHECK-LABEL: test_vstrl5:
+; CHECK: vst %v24, 0({{%r[1-5]}})
+; CHECK: br %r14
+ call void @llvm.s390.vstrl(<16 x i8> %vec, i32 15, i8 *%ptr)
+ ret void
+}
+
; VFCESBS with no processing of the result.
define i32 @test_vfcesbs(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_vfcesbs: