[llvm] 2572512 - [SROA] Try harder to find a vector promotion viable type when rewriting

Vang Thao via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 8 11:04:46 PDT 2022


Author: Vang Thao
Date: 2022-08-08T11:04:01-07:00
New Revision: 257251247a267c3fa30fdeef17ffa4987d8a52e5

URL: https://github.com/llvm/llvm-project/commit/257251247a267c3fa30fdeef17ffa4987d8a52e5
DIFF: https://github.com/llvm/llvm-project/commit/257251247a267c3fa30fdeef17ffa4987d8a52e5.diff

LOG: [SROA] Try harder to find a vector promotion viable type when rewriting

We are seeing significant performance loss when an alloca fails to be promoted
to a register. I have observed that this happens when the common type found
while rewriting the partition's users is not viable for vector promotion, even
though continuing the search would have found a subtype of the original
allocated type that does enable promotion. Thus, first check whether the
initial common type found is viable for promotion, and if it is not, keep
looking instead of stopping at that initial common type.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D128073
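
For illustration, here is a minimal sketch of the failing pattern, adapted and
simplified from the test_memset case in the added test file (names and exact
offsets are illustrative, not taken verbatim from the test):

  %"struct.b" = type { <8 x half> }

  define void @example() {
  entry:
    %buf = alloca %"struct.b", align 16
    call void @llvm.memset.p0.i64(ptr align 16 %buf, i8 0, i64 16, i1 false)
    ; The only typed use spanning the whole 16-byte partition is this
    ; <4 x float> store, so it ends up as the partition's common use type.
    %data = load <4 x float>, ptr undef, align 16
    store <4 x float> %data, ptr %buf, align 16
    ; half loads at byte offsets 0 and 2 are not element-aligned for
    ; 4-byte float elements, so <4 x float> is not promotion viable.
    %v0 = load half, ptr %buf, align 16
    %p2 = getelementptr inbounds i8, ptr %buf, i64 2
    %v1 = load half, ptr %p2, align 2
    ret void
  }

  declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1)

Before this patch, rewritePartition() stopped at the non-viable <4 x float>
common type and the alloca was not promoted. With this change, the failed
viability check triggers a second look via getTypePartition() on the original
allocated type, which yields <8 x half>; that type is viable and the alloca
can be promoted.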

Added: 
    llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll

Modified: 
    llvm/lib/Transforms/Scalar/SROA.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 8df86ce630202..cb2c43ecb73ad 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1847,6 +1847,34 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
   return true;
 }
 
+/// Test whether a vector type is viable for promotion.
+///
+/// This implements the necessary checking for \c isVectorPromotionViable over
+/// all slices of the alloca for the given VectorType.
+static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
+                                        const DataLayout &DL) {
+  uint64_t ElementSize =
+      DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
+
+  // While the definition of LLVM vectors is bitpacked, we don't support sizes
+  // that aren't byte sized.
+  if (ElementSize % 8)
+    return false;
+  assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
+         "vector size not a multiple of element size?");
+  ElementSize /= 8;
+
+  for (const Slice &S : P)
+    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+      return false;
+
+  for (const Slice *S : P.splitSliceTails())
+    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+      return false;
+
+  return true;
+}
+
 /// Test whether the given alloca partitioning and range of slices can be
 /// promoted to a vector.
 ///
@@ -1939,31 +1967,8 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
     CandidateTys.resize(1);
   }
 
-  // Try each vector type, and return the one which works.
-  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
-    uint64_t ElementSize =
-        DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
-
-    // While the definition of LLVM vectors is bitpacked, we don't support sizes
-    // that aren't byte sized.
-    if (ElementSize % 8)
-      return false;
-    assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
-           "vector size not a multiple of element size?");
-    ElementSize /= 8;
-
-    for (const Slice &S : P)
-      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
-        return false;
-
-    for (const Slice *S : P.splitSliceTails())
-      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
-        return false;
-
-    return true;
-  };
   for (VectorType *VTy : CandidateTys)
-    if (CheckVectorTypeForPromotion(VTy))
+    if (checkVectorTypeForPromotion(P, VTy, DL))
       return VTy;
 
   return nullptr;
@@ -4246,26 +4251,45 @@ AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   // won't always succeed, in which case we fall back to a legal integer type
   // or an i8 array of an appropriate size.
   Type *SliceTy = nullptr;
+  VectorType *SliceVecTy = nullptr;
   const DataLayout &DL = AI.getModule()->getDataLayout();
   std::pair<Type *, IntegerType *> CommonUseTy =
       findCommonType(P.begin(), P.end(), P.endOffset());
   // Do all uses operate on the same type?
   if (CommonUseTy.first)
-    if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
+    if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size()) {
       SliceTy = CommonUseTy.first;
+      SliceVecTy = dyn_cast<VectorType>(SliceTy);
+    }
   // If not, can we find an appropriate subtype in the original allocated type?
   if (!SliceTy)
     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                  P.beginOffset(), P.size()))
       SliceTy = TypePartitionTy;
+
   // If still not, can we use the largest bitwidth integer type used?
   if (!SliceTy && CommonUseTy.second)
-    if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
+    if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size()) {
       SliceTy = CommonUseTy.second;
+      SliceVecTy = dyn_cast<VectorType>(SliceTy);
+    }
   if ((!SliceTy || (SliceTy->isArrayTy() &&
                     SliceTy->getArrayElementType()->isIntegerTy())) &&
-      DL.isLegalInteger(P.size() * 8))
+      DL.isLegalInteger(P.size() * 8)) {
     SliceTy = Type::getIntNTy(*C, P.size() * 8);
+  }
+
+  // If the common use types are not viable for promotion then attempt to find
+  // another type that is viable.
+  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL))
+    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+                                                 P.beginOffset(), P.size())) {
+      VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
+      if (TypePartitionVecTy &&
+          checkVectorTypeForPromotion(P, TypePartitionVecTy, DL))
+        SliceTy = TypePartitionTy;
+    }
+
   if (!SliceTy)
     SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
   assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());

diff  --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
new file mode 100644
index 0000000000000..00cbe56929c02
--- /dev/null
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -0,0 +1,411 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=sroa -S < %s | FileCheck %s
+
+%"struct.a" = type { <8 x half> }
+%"struct.b" = type { %"struct.a" }
+%"struct.c" = type { %"struct.a", i32, i8 }
+%"struct.d" = type { [4 x i32], %"struct.a" }
+%"struct.e" = type { [2 x <8 x half>], i32, i32 }
+%"struct.f" = type { [2 x <8 x i16>], i32, i32 }
+%"array.a" = type [2 x <8 x half>]
+%"array.b" = type [2 x %"struct.a"]
+
+define amdgpu_kernel void @test_zeroinit() #0 {
+; CHECK-LABEL: @test_zeroinit(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.b", align 16
+  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_memset() #0 {
+; CHECK-LABEL: @test_memset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.b", align 16
+  call void @llvm.memset.p0.i64(ptr align 16 %b_blockwise_copy, i8 0, i64 16, i1 false)
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+; The initial SROA pass failed to promote this alloca, and because the same
+; alloca type was re-used, the alloca was not re-added to the worklist after
+; that pass. This caused promotion to fail here, unlike in the other tests.
+define amdgpu_kernel void @vector_type_alloca() #0 {
+; CHECK-LABEL: @vector_type_alloca(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY:%.*]] = alloca <8 x half>, align 16
+; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[B_BLOCKWISE_COPY]], align 16
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[B_BLOCKWISE_COPY]], align 16
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY]], align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca <8 x half>, align 16
+  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_struct_contain_multiple_types1() #0 {
+; CHECK-LABEL: @test_struct_contain_multiple_types1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.c", align 16
+  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_struct_contain_multiple_types2() #0 {
+; CHECK-LABEL: @test_struct_contain_multiple_types2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA1:%.*]] = load [4 x i32], ptr undef, align 4
+; CHECK-NEXT:    [[DATA1_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 0
+; CHECK-NEXT:    [[DATA1_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 1
+; CHECK-NEXT:    [[DATA1_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 2
+; CHECK-NEXT:    [[DATA1_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 3
+; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.d", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
+  %data1 = load [4 x i32], [4 x i32]* undef
+  store [4 x i32] %data1, ptr %b_blockwise_copy, align 16
+  %data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  store <8 x half> zeroinitializer, ptr %data2_gep, align 16
+  %data2 = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data2, ptr %data2_gep, align 16
+  br label %bb
+
+bb:
+  %ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  %load1 = load half, ptr %ptr1, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 18
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 20
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_struct_array_vector() #0 {
+; CHECK-LABEL: @test_struct_array_vector(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA0:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x half>
+; CHECK-NEXT:    [[DATA1:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_3_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.e", align 16
+  store <8 x half> zeroinitializer, ptr %b_blockwise_copy, align 16
+  %0 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  store <8 x half> zeroinitializer, ptr %0, align 16
+  %data0 = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data0, ptr %b_blockwise_copy, align 16
+  %data1 = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data1, ptr %0, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  %load2 = load half, ptr %ptr2, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_struct_array_vector_i16() #0 {
+; CHECK-LABEL: @test_struct_array_vector_i16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x i32>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[DATA]] to <8 x i16>
+; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x i32>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[DATA2]] to <8 x i16>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"struct.f", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
+  %data = load <4 x i32>, <4 x i32>* undef
+  store <4 x i32> %data, ptr %b_blockwise_copy, align 16
+  %data2 = load <4 x i32>, <4 x i32>* undef
+  %data2_gep = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  store <4 x i32> %data2, ptr %data2_gep, align 16
+  br label %bb
+
+bb:
+  %load1 = load i16, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load i16, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 16
+  %load3 = load i16, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_half_array() #0 {
+; CHECK-LABEL: @test_half_array(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float undef to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float undef to i32
+; CHECK-NEXT:    [[DATA:%.*]] = load [4 x float], ptr undef, align 4
+; CHECK-NEXT:    [[DATA_FCA_0_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 0
+; CHECK-NEXT:    store float [[DATA_FCA_0_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    [[DATA_FCA_1_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 1
+; CHECK-NEXT:    store float [[DATA_FCA_1_EXTRACT]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    [[DATA_FCA_2_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 2
+; CHECK-NEXT:    [[DATA_FCA_3_EXTRACT:%.*]] = extractvalue [4 x float] [[DATA]], 3
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca [8 x half], align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 16, i1 false)
+  %data = load [4 x float], [4 x float]* undef
+  store [4 x float] %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_array_vector() #0 {
+; CHECK-LABEL: @test_array_vector(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"array.a", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_array_vector2() #0 {
+; CHECK-LABEL: @test_array_vector2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5:%.*]] = alloca <8 x half>, align 16
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_5]], i8 0, i32 16, i1 false)
+; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"array.b", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
+  %data = load <4 x float>, <4 x float>* undef
+  store <4 x float> %data, ptr %b_blockwise_copy, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  ret void
+}
+
+define amdgpu_kernel void @test_array_vector_no_vector_common_type() #0 {
+; CHECK-LABEL: @test_array_vector_no_vector_common_type(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0:%.*]] = alloca float, align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4:%.*]] = alloca float, align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7:%.*]] = alloca float, align 8
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10:%.*]] = alloca float, align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_13:%.*]] = alloca <8 x half>, align 16
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_0]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_4]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 8 [[B_BLOCKWISE_COPY_SROA_7]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 4 [[B_BLOCKWISE_COPY_SROA_10]], i8 0, i32 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i32(ptr align 16 [[B_BLOCKWISE_COPY_SROA_13]], i8 0, i32 16, i1 false)
+; CHECK-NEXT:    [[DATA1:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    [[DATA2:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    [[DATA3:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    [[DATA4:%.*]] = load float, ptr undef, align 4
+; CHECK-NEXT:    store float [[DATA1]], ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    store float [[DATA2]], ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    store float [[DATA3]], ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
+; CHECK-NEXT:    store float [[DATA4]], ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_B_BLOCKWISE_COPY_SROA_0_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0]], align 16
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_0]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_B_BLOCKWISE_COPY_SROA_0_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_0_2_PTR2_SROA_IDX1]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_0_B_BLOCKWISE_COPY_SROA_4_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4]], align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_4]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_4_2_B_BLOCKWISE_COPY_SROA_4_6_LOAD4:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_4_2_PTR4_SROA_IDX]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_0_B_BLOCKWISE_COPY_SROA_7_8_LOAD5:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7]], align 8
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_7]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_7_2_B_BLOCKWISE_COPY_SROA_7_10_LOAD6:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_7_2_PTR6_SROA_IDX]], align 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_0_B_BLOCKWISE_COPY_SROA_10_12_LOAD7:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10]], align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY_SROA_10]], i64 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_10_2_B_BLOCKWISE_COPY_SROA_10_14_LOAD8:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_SROA_10_2_PTR8_SROA_IDX]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b_blockwise_copy = alloca %"array.a", align 16
+  call void @llvm.memset.p0.i32(ptr align 16 %b_blockwise_copy, i8 0, i32 32, i1 false)
+  %data1 = load float, float* undef
+  %data2 = load float, float* undef
+  %data3 = load float, float* undef
+  %data4 = load float, float* undef
+  store float %data1, ptr %b_blockwise_copy, align 16
+  %data_ptr1 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  store float %data2, ptr %data_ptr1, align 16
+  %data_ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
+  store float %data3, ptr %data_ptr2, align 16
+  %data_ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
+  store float %data4, ptr %data_ptr3, align 16
+  br label %bb
+
+bb:
+  %load1 = load half, ptr %b_blockwise_copy, align 16
+  %ptr2 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 2
+  %load2 = load half, ptr %ptr2, align 16
+  %ptr3 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 4
+  %load3 = load half, ptr %ptr3, align 16
+  %ptr4 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 6
+  %load4 = load half, ptr %ptr4, align 16
+  %ptr5 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 8
+  %load5 = load half, ptr %ptr5, align 16
+  %ptr6 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 10
+  %load6 = load half, ptr %ptr6, align 16
+  %ptr7 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 12
+  %load7 = load half, ptr %ptr7, align 16
+  %ptr8 = getelementptr inbounds i8, ptr %b_blockwise_copy, i64 14
+  %load8 = load half, ptr %ptr8, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) nounwind
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1) nounwind
+attributes #0 = { nounwind readonly }
+

More information about the llvm-commits mailing list