[llvm] 7a211ed - [RISCV] Prevent store combining from infinitely looping

Fraser Cormack via llvm-commits llvm-commits at lists.llvm.org
Mon May 24 02:27:28 PDT 2021


Author: Fraser Cormack
Date: 2021-05-24T10:19:32+01:00
New Revision: 7a211ed110a72ad453305aba250da50c965c2f8e

URL: https://github.com/llvm/llvm-project/commit/7a211ed110a72ad453305aba250da50c965c2f8e
DIFF: https://github.com/llvm/llvm-project/commit/7a211ed110a72ad453305aba250da50c965c2f8e.diff

LOG: [RISCV] Prevent store combining from infinitely looping

RVV code generation does not successfully custom-lower BUILD_VECTOR in all
cases. When it falls back to default expansion, the node may be expanded into
scalar stores through the stack. Unfortunately, these stores may then be picked
up by the post-legalization DAGCombiner, which merges them back into a vector
store. The merged store uses a BUILD_VECTOR which is then expanded again, and
so on.

This patch addresses the issue by overriding the `mergeStoresAfterLegalization`
hook. Because the hook offers little granularity (it is passed only the scalar
type), we opt out in almost all cases when RVV fixed-length vector support is
enabled.
The only exceptions to this rule are mask vectors, which are always either
custom-lowered or expanded to a load from a constant pool.
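
For illustration only, the following is a small standalone C++ model of the
policy the override installs. The struct and function names below are made up
for this example and are not LLVM API; the real hook takes an EVT, as shown in
the diff further down.

  #include <cassert>

  // Toy stand-in for the value type queried by the hook.
  struct SimpleVT {
    bool IsFixedLengthVector;
    bool HasI1Elements;
  };

  // Mirrors the policy of the new RISC-V override: once RVV fixed-length
  // lowering is enabled, only mask (i1-element) fixed-length vectors may be
  // merged after legalization, because their BUILD_VECTORs never expand to
  // scalar stack stores.
  static bool allowMergeAfterLegalization(bool UseRVVForFixedLengthVectors,
                                          const SimpleVT &VT) {
    return !UseRVVForFixedLengthVectors ||
           (VT.IsFixedLengthVector && VT.HasI1Elements);
  }

  int main() {
    // e.g. v4f32: merging is rejected once fixed-length RVV support is on,
    assert(!allowMergeAfterLegalization(true, {true, false}));
    // e.g. v8i1: mask vectors may still be merged,
    assert(allowMergeAfterLegalization(true, {true, true}));
    // and everything remains permitted when fixed-length RVV is disabled.
    assert(allowMergeAfterLegalization(false, {true, false}));
    return 0;
  }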

Reviewed By: HsiangKai

Differential Revision: https://reviews.llvm.org/D102913

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0c219f763047d..e786d5e656f77 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1144,6 +1144,13 @@ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
   return {SubRegIdx, InsertExtractIdx};
 }
 
+// Permit combining of mask vectors as BUILD_VECTOR never expands to scalar
+// stores for those types.
+bool RISCVTargetLowering::mergeStoresAfterLegalization(EVT VT) const {
+  return !Subtarget.useRVVForFixedLengthVectors() ||
+         (VT.isFixedLengthVector() && VT.getVectorElementType() == MVT::i1);
+}
+
 static bool useRVVForFixedLengthVectorVT(MVT VT,
                                          const RISCVSubtarget &Subtarget) {
   assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!");
@@ -1372,8 +1379,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
       // If we have to use more than one INSERT_VECTOR_ELT then this
       // optimization is likely to increase code size; avoid performing it in
-      // such a case. We can go through the stack as long as we're at least
-      // byte-sized.
+      // such a case. We can use a load from a constant pool in this case.
       if (DAG.shouldOptForSize() && NumElts > NumViaIntegerBits)
         return SDValue();
       // Now we can create our integer vector type. Note that it may be larger

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 503a38df0840f..24220f6e32dd7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -557,6 +557,14 @@ class RISCVTargetLowering : public TargetLowering {
       MachineFunction &MF) const;
 
   bool useRVVForFixedLengthVectorVT(MVT VT) const;
+
+  /// RVV code generation for fixed length vectors does not lower all
+  /// BUILD_VECTORs. This makes BUILD_VECTOR legalisation a source of stores to
+  /// merge. However, merging them creates a BUILD_VECTOR that is just as
+  /// illegal as the original, thus leading to an infinite legalisation loop.
+  /// NOTE: Once BUILD_VECTOR can be custom lowered for all legal vector types,
+  /// this override can be removed.
+  bool mergeStoresAfterLegalization(EVT VT) const override;
 };
 
 namespace RISCV {

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index cf761b8fcbbd9..daff1debb5a64 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+experimental-v,+experimental-zfh,+f,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
 
 ; Tests that a floating-point build_vector doesn't try and generate a VID
 ; instruction
@@ -20,12 +20,81 @@ define void @buildvec_no_vid_v4f32(<4 x float>* %x) {
   ret void
 }
 
+; Not all BUILD_VECTORs are successfully lowered by the backend: some are
+; expanded into scalarized stack stores. However, this may result in an
+; infinite loop in the DAGCombiner which tries to recombine those stores into a
+; BUILD_VECTOR followed by a vector store. The BUILD_VECTOR is then expanded
+; and the loop begins.
+; Until all BUILD_VECTORs are lowered, we disable store-combining after
+; legalization for fixed-length vectors.
+; This test uses a trick with a shufflevector which can't be lowered to a
+; SHUFFLE_VECTOR node; the mask is shorter than the source vectors and the
+; shuffle indices aren't located within the same 4-element subvector, so it is
+; expanded to 4 EXTRACT_VECTOR_ELTs and a BUILD_VECTOR. This then triggers the
+; loop when expanded.
+define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x, <8 x float> %y) optsize {
+; LMULMAX1-LABEL: hang_when_merging_stores_after_legalization:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi sp, sp, -16
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 16
+; LMULMAX1-NEXT:    vsetvli zero, zero, e32,m2,ta,mu
+; LMULMAX1-NEXT:    vfmv.f.s ft0, v10
+; LMULMAX1-NEXT:    fsw ft0, 8(sp)
+; LMULMAX1-NEXT:    vfmv.f.s ft0, v8
+; LMULMAX1-NEXT:    fsw ft0, 0(sp)
+; LMULMAX1-NEXT:    vsetivli a0, 1, e32,m2,ta,mu
+; LMULMAX1-NEXT:    vslidedown.vi v26, v10, 7
+; LMULMAX1-NEXT:    vfmv.f.s ft0, v26
+; LMULMAX1-NEXT:    fsw ft0, 12(sp)
+; LMULMAX1-NEXT:    vslidedown.vi v26, v8, 7
+; LMULMAX1-NEXT:    vfmv.f.s ft0, v26
+; LMULMAX1-NEXT:    fsw ft0, 4(sp)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v8, (sp)
+; LMULMAX1-NEXT:    addi sp, sp, 16
+; LMULMAX1-NEXT:    ret
+;
+; LMULMAX2-LABEL: hang_when_merging_stores_after_legalization:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vmv.v.i v25, 0
+; LMULMAX2-NEXT:    vrgather.vv v26, v8, v25
+; LMULMAX2-NEXT:    addi a0, zero, 2
+; LMULMAX2-NEXT:    vsetivli a1, 1, e8,mf8,ta,mu
+; LMULMAX2-NEXT:    vmv.s.x v0, a0
+; LMULMAX2-NEXT:    lui a0, %hi(.LCPI1_0)
+; LMULMAX2-NEXT:    addi a0, a0, %lo(.LCPI1_0)
+; LMULMAX2-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vle32.v v27, (a0)
+; LMULMAX2-NEXT:    vsetivli a0, 4, e32,m1,tu,mu
+; LMULMAX2-NEXT:    vrgather.vv v26, v9, v27, v0.t
+; LMULMAX2-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vrgather.vv v27, v10, v25
+; LMULMAX2-NEXT:    addi a0, zero, 8
+; LMULMAX2-NEXT:    vsetivli a1, 1, e8,mf8,ta,mu
+; LMULMAX2-NEXT:    vmv.s.x v0, a0
+; LMULMAX2-NEXT:    lui a0, %hi(.LCPI1_1)
+; LMULMAX2-NEXT:    addi a0, a0, %lo(.LCPI1_1)
+; LMULMAX2-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vle32.v v25, (a0)
+; LMULMAX2-NEXT:    vsetivli a0, 4, e32,m1,tu,mu
+; LMULMAX2-NEXT:    vrgather.vv v27, v11, v25, v0.t
+; LMULMAX2-NEXT:    addi a0, zero, 3
+; LMULMAX2-NEXT:    vsetivli a1, 1, e8,mf8,ta,mu
+; LMULMAX2-NEXT:    vmv.s.x v0, a0
+; LMULMAX2-NEXT:    vsetivli a0, 4, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vmerge.vvm v8, v27, v26, v0
+; LMULMAX2-NEXT:    ret
+  %z = shufflevector <8 x float> %x, <8 x float> %y, <4 x i32> <i32 0, i32 7, i32 8, i32 15>
+  ret <4 x float> %z
+}
+
 define void @buildvec_dominant0_v4f32(<4 x float>* %x) {
 ; CHECK-LABEL: buildvec_dominant0_v4f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
-; CHECK-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI1_0)
+; CHECK-NEXT:    lui a1, %hi(.LCPI2_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI2_0)
 ; CHECK-NEXT:    vlse32.v v25, (a1), zero
 ; CHECK-NEXT:    fmv.w.x ft0, zero
 ; CHECK-NEXT:    vfmv.s.f v26, ft0
@@ -61,8 +130,8 @@ define void @buildvec_dominant1_v4f32(<4 x float>* %x, float %f) {
 define void @buildvec_dominant2_v4f32(<4 x float>* %x, float %f) {
 ; CHECK-LABEL: buildvec_dominant2_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a1, %hi(.LCPI3_0)
-; CHECK-NEXT:    flw ft0, %lo(.LCPI3_0)(a1)
+; CHECK-NEXT:    lui a1, %hi(.LCPI4_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI4_0)(a1)
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
 ; CHECK-NEXT:    vfmv.s.f v25, ft0
 ; CHECK-NEXT:    vfmv.v.f v26, fa0
@@ -84,8 +153,8 @@ define void @buildvec_merge0_v4f32(<4 x float>* %x, float %f) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, zero, 6
 ; CHECK-NEXT:    vsetivli a2, 1, e8,mf8,ta,mu
-; CHECK-NEXT:    lui a2, %hi(.LCPI4_0)
-; CHECK-NEXT:    flw ft0, %lo(.LCPI4_0)(a2)
+; CHECK-NEXT:    lui a2, %hi(.LCPI5_0)
+; CHECK-NEXT:    flw ft0, %lo(.LCPI5_0)(a2)
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
 ; CHECK-NEXT:    vfmv.v.f v25, fa0


More information about the llvm-commits mailing list