[llvm] 6bb4b2d - [NFC] Test case intended to cover SLP cost for chain with masked gather loads.

Fri Dec 30 12:32:04 PST 2022

Author: Valery N Dmitriev
Date: 2022-12-30T12:27:34-08:00
New Revision: 6bb4b2d00221658b3fac421957e2905a13579c68

URL: https://github.com/llvm/llvm-project/commit/6bb4b2d00221658b3fac421957e2905a13579c68
DIFF: https://github.com/llvm/llvm-project/commit/6bb4b2d00221658b3fac421957e2905a13579c68.diff

LOG: [NFC] Test case intended to cover SLP cost for chain with masked gather loads.

SLP produces two gather loads (one feeds the other).
For the first set of scalar loads, the GEP indices are all constant.
The result of the second load is then fed into a reduction (as a seed).

Differential Revision: https://reviews.llvm.org/D140785

Added: 
    llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
new file mode 100644
index 000000000000..cd9265ecc47c

--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64 -mcpu=skylake-avx512 -passes=slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+define i32 @test(ptr noalias %p, ptr noalias %addr) { ; returns the sum of p[addr[k]] for k = 1, 3, ..., 15 (i32 elements)
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> <i32 15, i32 13, i32 11, i32 9, i32 7, i32 5, i32 3, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+; YAML:      --- !Passed
+  ; YAML-NEXT: Pass:            slp-vectorizer
+  ; YAML-NEXT: Name:            VectorizedHorizontalReduction
+  ; YAML-NEXT: Function:        test
+  ; YAML-NEXT: Args:
+  ; YAML-NEXT:   - String:          'Vectorized horizontal reduction with cost '
+  ; YAML-NEXT:   - Cost:            '-17'
+  ; YAML-NEXT:   - String:          ' and with tree size '
+  ; YAML-NEXT:   - TreeSize:        '7'
+entry: ; eight scalar lanes follow; each one loads an index from %addr, then loads %p at that index
+  %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 ; lane 0: &addr[1], constant GEP index
+  %idx0 = load i32, ptr %off0.1, align 8 ; idx0 = addr[1]; the first expected gather above carries this same alignment (8)
+  %gep0 = getelementptr inbounds i32, ptr %p, i32 %idx0 ; &p[idx0], index is a loaded (non-constant) value
+  %ld0 = load i32, ptr %gep0, align 4 ; ld0 = p[idx0]; the second expected gather above uses alignment 4
+
+  %off1.3 = getelementptr inbounds i32, ptr %addr, i32 3 ; lane 1: same pattern with addr[3]
+  %idx1 = load i32, ptr %off1.3, align 8
+  %gep1 = getelementptr inbounds i32, ptr %p, i32 %idx1
+  %ld1 = load i32, ptr %gep1, align 4
+
+  %off2.5 = getelementptr inbounds i32, ptr %addr, i32 5 ; lane 2: same pattern with addr[5]
+  %idx2 = load i32, ptr %off2.5, align 8
+  %gep2 = getelementptr inbounds i32, ptr %p, i32 %idx2
+  %ld2 = load i32, ptr %gep2, align 4
+
+  %off3.7 = getelementptr inbounds i32, ptr %addr, i32 7 ; lane 3: same pattern with addr[7]
+  %idx3 = load i32, ptr %off3.7, align 8
+  %gep3 = getelementptr inbounds i32, ptr %p, i32 %idx3
+  %ld3 = load i32, ptr %gep3, align 4
+
+  %off4.9 = getelementptr inbounds i32, ptr %addr, i32 9 ; lane 4: same pattern with addr[9]
+  %idx4 = load i32, ptr %off4.9, align 8
+  %gep4 = getelementptr inbounds i32, ptr %p, i32 %idx4
+  %ld4 = load i32, ptr %gep4, align 4
+
+  %off5.11 = getelementptr inbounds i32, ptr %addr, i32 11 ; lane 5: same pattern with addr[11]
+  %idx5 = load i32, ptr %off5.11, align 8
+  %gep5 = getelementptr inbounds i32, ptr %p, i32 %idx5
+  %ld5 = load i32, ptr %gep5, align 4
+
+  %off6.13 = getelementptr inbounds i32, ptr %addr, i32 13 ; lane 6: same pattern with addr[13]
+  %idx6 = load i32, ptr %off6.13, align 8
+  %gep6 = getelementptr inbounds i32, ptr %p, i32 %idx6
+  %ld6 = load i32, ptr %gep6, align 4
+
+  %off7.15 = getelementptr inbounds i32, ptr %addr, i32 15 ; lane 7: same pattern with addr[15]
+  %idx7 = load i32, ptr %off7.15, align 8
+  %gep7 = getelementptr inbounds i32, ptr %p, i32 %idx7
+  %ld7 = load i32, ptr %gep7, align 4
+
+  %add0 = add nsw i32 %ld1, %ld0 ; start of the linear add chain over ld0..ld7 (note the operand order: %ld1 first)
+  %add1 = add nsw i32 %add0, %ld2
+  %add2 = add nsw i32 %add1, %ld3
+  %add3 = add nsw i32 %add2, %ld4
+  %add4 = add nsw i32 %add3, %ld5
+  %add5 = add nsw i32 %add4, %ld6
+  %add6 = add nsw i32 %add5, %ld7 ; end of chain; the assertions above expect it to become one vector.reduce.add
+
+  ret i32 %add6 ; total of the eight loaded values
+}