[llvm] 59c4d5a - [SVE] Fix InstCombinerImpl::PromoteCastOfAllocation for scalable vectors

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 23 00:43:20 PDT 2020


Author: David Sherwood
Date: 2020-09-23T08:43:05+01:00
New Revision: 59c4d5aad060927fa95b917c11aad4e310849a4b

URL: https://github.com/llvm/llvm-project/commit/59c4d5aad060927fa95b917c11aad4e310849a4b
DIFF: https://github.com/llvm/llvm-project/commit/59c4d5aad060927fa95b917c11aad4e310849a4b.diff

LOG: [SVE] Fix InstCombinerImpl::PromoteCastOfAllocation for scalable vectors

In this patch I've fixed some warnings that arose from the implicit
cast of TypeSize -> uint64_t (see the sketch after the list below).
I tried writing a variety of different cases to show how this
optimisation might work for scalable vectors and found:

1. The optimisation does not work for cases where the cast type
is scalable and the allocated type is not. This is because we need to
know how many times the cast type fits into the allocated type.
2. If we pass all the various checks for the case when the allocated
type is scalable and the cast type is not, then when creating the
new alloca we have to take vscale into account. This leads to
sub-optimal IR that is worse than the original.
3. For the remaining case, when both the alloca and cast types are
scalable, it is hard to find examples where the optimisation would
kick in, except for simple bitcasts, because we typically fail the
ABI alignment checks.
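
To illustrate the TypeSize handling behind the warnings mentioned
above, here is a minimal sketch of the safer size queries (assuming
the same DL, AllocElTy and CastElTy variables that appear in the diff
below; getKnownMinSize returns the vscale-independent part of a
TypeSize):

  TypeSize AllocSize = DL.getTypeAllocSize(AllocElTy);
  // Implicitly converting a scalable TypeSize to uint64_t is what
  // produced the warnings, so query the known minimum size and track
  // scalability separately instead.
  uint64_t MinBytes = AllocSize.getKnownMinSize();
  bool Scalable = AllocSize.isScalable();

For a fixed type such as [16 x i32] this gives MinBytes = 64 and
Scalable = false; for <vscale x 4 x i32> it gives MinBytes = 16 and
Scalable = true, i.e. the real allocation is 16 * vscale bytes.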

For now I've changed the code to bail out if only one of the alloca
and cast types is scalable. This means we continue to support the
existing cases where both types are fixed, and also the specific case
when both types are scalable with the same size and alignment, for
example a simple bitcast of an alloca to another type.

I've added tests that show we don't attempt to promote the alloca,
except for simple bitcasts:

  Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll

Differential revision: https://reviews.llvm.org/D87378

Added: 
    llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 35db21245754..5982d48e6bf6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -94,6 +94,18 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
   Type *CastElTy = PTy->getElementType();
   if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
 
+  // This optimisation does not work for cases where the cast type
+  // is scalable and the allocated type is not. This is because we need
+  // to know how many times the cast type fits into the allocated type.
+  // For the opposite case, where the allocated type is scalable and the
+  // cast type is not, this leads to poor code quality due to the
+  // introduction of 'vscale' into the calculations. It seems better to
+  // bail out for this case too until we've done a proper cost-benefit
+  // analysis.
+  bool AllocIsScalable = isa<ScalableVectorType>(AllocElTy);
+  bool CastIsScalable = isa<ScalableVectorType>(CastElTy);
+  if (AllocIsScalable != CastIsScalable) return nullptr;
+
   Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
   Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
   if (CastElTyAlign < AllocElTyAlign) return nullptr;
@@ -103,14 +115,15 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
   // same, we open the door to infinite loops of various kinds.
   if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
 
-  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy);
-  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy);
+  // The alloc and cast types should be either both fixed or both scalable.
+  uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getKnownMinSize();
+  uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getKnownMinSize();
   if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
 
   // If the allocation has multiple uses, only promote it if we're not
   // shrinking the amount of memory being allocated.
-  uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy);
-  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy);
+  uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy).getKnownMinSize();
+  uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy).getKnownMinSize();
   if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
 
   // See if we can satisfy the modulus by pulling a scale out of the array
@@ -125,6 +138,9 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
   if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
       (AllocElTySize*ArrayOffset   ) % CastElTySize != 0) return nullptr;
 
+  // We don't currently support arrays of scalable types.
+  assert(!AllocIsScalable || (ArrayOffset == 1 && ArraySizeScale == 0));
+
   unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
   Value *Amt = nullptr;
   if (Scale == 1) {

diff  --git a/llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll
new file mode 100644
index 000000000000..b593770c1ccb
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-cast-of-alloc.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define void @fixed_array16i32_to_scalable4i32(<vscale x 4 x i32>* %out) {
+; CHECK-LABEL: @fixed_array16i32_to_scalable4i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca [16 x i32], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast [16 x i32]* [[TMP]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[RELOAD]], <vscale x 4 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca [16 x i32], align 16
+  %cast = bitcast [16 x i32]* %tmp to <vscale x 4 x i32>*
+  store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %cast, align 16
+  store <vscale x 4 x i32> %reload, <vscale x 4 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable4i32_to_fixed16i32(<16 x i32>* %out) {
+; CHECK-LABEL: @scalable4i32_to_fixed16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 4 x i32>* [[TMP]] to <16 x i32>*
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <16 x i32>, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <16 x i32> [[RELOAD]], <16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 4 x i32>, align 16
+  %cast = bitcast <vscale x 4 x i32>* %tmp to <16 x i32>*
+  store <16 x i32> zeroinitializer, <16 x i32>* %cast, align 16
+  %reload = load volatile <16 x i32>, <16 x i32>* %cast, align 16
+  store <16 x i32> %reload, <16 x i32>* %out, align 16
+  ret void
+}
+
+define void @fixed16i32_to_scalable4i32(<vscale x 4 x i32>* %out) {
+; CHECK-LABEL: @fixed16i32_to_scalable4i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <16 x i32>, align 16
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <16 x i32>* [[TMP]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* [[CAST]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[RELOAD]], <vscale x 4 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <16 x i32>, align 16
+  %cast = bitcast <16 x i32>* %tmp to <vscale x 4 x i32>*
+  store volatile <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 4 x i32>, <vscale x 4 x i32>* %cast, align 16
+  store <vscale x 4 x i32> %reload, <vscale x 4 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable16i32_to_fixed16i32(<16 x i32>* %out) {
+; CHECK-LABEL: @scalable16i32_to_fixed16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 16 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 16 x i32>* [[TMP]] to <16 x i32>*
+; CHECK-NEXT:    store volatile <16 x i32> zeroinitializer, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <16 x i32>, <16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <16 x i32> [[RELOAD]], <16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 16 x i32>, align 16
+  %cast = bitcast <vscale x 16 x i32>* %tmp to <16 x i32>*
+  store volatile <16 x i32> zeroinitializer, <16 x i32>* %cast, align 16
+  %reload = load volatile <16 x i32>, <16 x i32>* %cast, align 16
+  store <16 x i32> %reload, <16 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable32i32_to_scalable16i32(<vscale x 16 x i32>* %out) {
+; CHECK-LABEL: @scalable32i32_to_scalable16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 32 x i32>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 32 x i32>* [[TMP]] to <vscale x 16 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <vscale x 16 x i32> [[RELOAD]], <vscale x 16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 32 x i32>, align 16
+  %cast = bitcast <vscale x 32 x i32>* %tmp to <vscale x 16 x i32>*
+  store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* %cast, align 16
+  store <vscale x 16 x i32> %reload, <vscale x 16 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable32i16_to_scalable16i32(<vscale x 16 x i32>* %out) {
+; CHECK-LABEL: @scalable32i16_to_scalable16i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 16 x i32>, align 64
+; CHECK-NEXT:    store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* [[TMP]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* [[TMP]], align 64
+; CHECK-NEXT:    store <vscale x 16 x i32> [[RELOAD]], <vscale x 16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 32 x i16>, align 16
+  %cast = bitcast <vscale x 32 x i16>* %tmp to <vscale x 16 x i32>*
+  store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* %cast, align 16
+  store <vscale x 16 x i32> %reload, <vscale x 16 x i32>* %out, align 16
+  ret void
+}
+
+define void @scalable32i16_to_scalable16i32_multiuse(<vscale x 16 x i32>* %out, <vscale x 32 x i16>* %out2) {
+; CHECK-LABEL: @scalable32i16_to_scalable16i32_multiuse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 32 x i16>, align 64
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast <vscale x 32 x i16>* [[TMP]] to <vscale x 16 x i32>*
+; CHECK-NEXT:    store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    [[RELOAD:%.*]] = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* [[CAST]], align 64
+; CHECK-NEXT:    store <vscale x 16 x i32> [[RELOAD]], <vscale x 16 x i32>* [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[RELOAD2:%.*]] = load volatile <vscale x 32 x i16>, <vscale x 32 x i16>* [[TMP]], align 64
+; CHECK-NEXT:    store <vscale x 32 x i16> [[RELOAD2]], <vscale x 32 x i16>* [[OUT2:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp = alloca <vscale x 32 x i16>, align 16
+  %cast = bitcast <vscale x 32 x i16>* %tmp to <vscale x 16 x i32>*
+  store volatile <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32>* %cast, align 16
+  %reload = load volatile <vscale x 16 x i32>, <vscale x 16 x i32>* %cast, align 16
+  store <vscale x 16 x i32> %reload, <vscale x 16 x i32>* %out, align 16
+  %reload2 = load volatile <vscale x 32 x i16>, <vscale x 32 x i16>* %tmp, align 16
+  store <vscale x 32 x i16> %reload2, <vscale x 32 x i16>* %out2, align 16
+  ret void
+}


        

