[llvm] d888496 - [SROA] Fix bug where RankVectorTypes is used in std::unique

Tue Mar 7 14:11:11 PST 2023

Author: Han Zhu
Date: 2023-03-07T14:10:58-08:00
New Revision: d888496e3c54f40e37db8ad273dff2456aad671d

URL: https://github.com/llvm/llvm-project/commit/d888496e3c54f40e37db8ad273dff2456aad671d
DIFF: https://github.com/llvm/llvm-project/commit/d888496e3c54f40e37db8ad273dff2456aad671d.diff

LOG: [SROA] Fix bug where RankVectorTypes is used in std::unique

`RankVectorTypes` is a not an equivalence relation so when it is used in
`std::unique`, the behavior is undefined. Create `RankVectorTypesEq` and use
that instead.

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/SROA.cpp
    llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index d315910dd993d..67032a9a13dc7 100644

--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2045,7 +2045,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
 
     // Rank the remaining candidate vector types. This is easy because we know
     // they're all integer vectors. We sort by ascending number of elements.
-    auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+    auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
       (void)DL;
       assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
                  DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
@@ -2057,10 +2057,22 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
       return cast<FixedVectorType>(RHSTy)->getNumElements() <
              cast<FixedVectorType>(LHSTy)->getNumElements();
     };
-    llvm::sort(CandidateTys, RankVectorTypes);
-    CandidateTys.erase(
-        std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
-        CandidateTys.end());
+    auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+      (void)DL;
+      assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
+                 DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
+             "Cannot have vector types of 
diff erent sizes!");
+      assert(RHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      assert(LHSTy->getElementType()->isIntegerTy() &&
+             "All non-integer types eliminated!");
+      return cast<FixedVectorType>(RHSTy)->getNumElements() ==
+             cast<FixedVectorType>(LHSTy)->getNumElements();
+    };
+    llvm::sort(CandidateTys, RankVectorTypesComp);
+    CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(),
+                                   RankVectorTypesEq),
+                       CandidateTys.end());
   } else {
 // The only way to have the same element type in every vector type is to
 // have the same vector type. Check that and remove all but one.

diff  --git a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
index 2734c1a520333..ffe5258fcb21d 100644
--- a/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
+++ b/llvm/test/Transforms/SROA/sroa-common-type-fail-promotion.ll
@@ -15,12 +15,15 @@ define amdgpu_kernel void @test_zeroinit() #0 {
 ; CHECK-LABEL: @test_zeroinit(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -67,24 +70,19 @@ bb:
   ret void
 }
 
-; Initial SROA pass failed to promote alloca and same alloca type was re-used
-; so alloca was not re-added to the worklist after initial SROA pass. This
-; caused it to fail to promote unlike the other tests.
 define amdgpu_kernel void @vector_type_alloca() #0 {
 ; CHECK-LABEL: @vector_type_alloca(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY:%.*]] = alloca <8 x half>, align 16
-; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[B_BLOCKWISE_COPY]], align 16
 ; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
-; CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[B_BLOCKWISE_COPY]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_0_LOAD1:%.*]] = load half, ptr [[B_BLOCKWISE_COPY]], align 16
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 2
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_2_LOAD2:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_2_PTR2_SROA_IDX]], align 2
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[B_BLOCKWISE_COPY]], i64 4
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_4_LOAD3:%.*]] = load half, ptr [[B_BLOCKWISE_COPY_4_PTR3_SROA_IDX]], align 4
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -107,12 +105,15 @@ define amdgpu_kernel void @test_struct_contain_multiple_types1() #0 {
 ; CHECK-LABEL: @test_struct_contain_multiple_types1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DATA:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x half>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA]] to <8 x i16>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_2_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_4_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -140,12 +141,15 @@ define amdgpu_kernel void @test_struct_contain_multiple_types2() #0 {
 ; CHECK-NEXT:    [[DATA1_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 2
 ; CHECK-NEXT:    [[DATA1_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[DATA1]], 3
 ; CHECK-NEXT:    [[DATA2:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x half>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA2]] to <8 x i16>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_2_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 1
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_4_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 2
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_16_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_18_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_18_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_5_20_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_5_20_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -173,13 +177,15 @@ define amdgpu_kernel void @test_struct_array_vector() #0 {
 ; CHECK-LABEL: @test_struct_array_vector(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DATA0:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x half>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[DATA0]] to <8 x i16>
 ; CHECK-NEXT:    [[DATA1:%.*]] = load <4 x float>, ptr undef, align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[DATA1]] to <8 x i16>
 ; CHECK-NEXT:    br label [[BB:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP0]], i32 0
-; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_3_0_VEC_EXTRACT:%.*]] = extractelement <8 x half> [[TMP1]], i32 0
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_0_0_VEC_EXTRACT]] to half
+; CHECK-NEXT:    [[B_BLOCKWISE_COPY_SROA_3_16_VEC_EXTRACT:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16 [[B_BLOCKWISE_COPY_SROA_3_16_VEC_EXTRACT]] to half
 ; CHECK-NEXT:    ret void
 ;
 entry: