[llvm] 89ab020 - [tests, SLP] Add coverage for missing dependencies for stacksave intrinsics

Sat Mar 19 18:05:20 PDT 2022

Author: Philip Reames
Date: 2022-03-19T18:05:09-07:00
New Revision: 89ab020d0237122992362dced21c51751c73e529

URL: https://github.com/llvm/llvm-project/commit/89ab020d0237122992362dced21c51751c73e529
DIFF: https://github.com/llvm/llvm-project/commit/89ab020d0237122992362dced21c51751c73e529.diff

LOG: [tests, SLP] Add coverage for missing dependencies for stacksave intrinsics

The existing scheduling doesn't account for the scheduling restrictions implied by inalloca allocas combined with stacksave/stackrestore.  This adds coverage including one currently miscompiling case.

Added: 
    llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll
new file mode 100644
index 0000000000000..7c89e165b8c60

--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-threshold=-999 -S -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s
+
+declare i64 @may_inf_loop_ro() nounwind readonly
+
+; Base case without allocas or stacksave
+define void @basecase(i8** %a, i8** %b, i8** %c) {
+; CHECK-LABEL: @basecase(
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr i8*, i8** [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8** [[A]] to <2 x i8*>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8*>, <2 x i8*>* [[TMP1]], align 8
+; CHECK-NEXT:    store i8* null, i8** [[A]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr i8*, i8** [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8** [[B]] to <2 x i8*>*
+; CHECK-NEXT:    store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  ; Loads from %a and %a+1 straddle a store to %a; the expected output has one <2 x i8*> load, GEP and store.
+  %v1 = load i8*, i8** %a
+  store i8* zeroinitializer, i8** %a
+  %a2 = getelementptr i8*, i8** %a, i32 1
+  %v2 = load i8*, i8** %a2
+
+  %add1 = getelementptr i8, i8* %v1, i32 1
+  %add2 = getelementptr i8, i8* %v2, i32 1
+
+  store i8* %add1, i8** %b
+  %b2 = getelementptr i8*, i8** %b, i32 1
+  store i8* %add2, i8** %b2
+  ret void
+}
+
+; Using two allocas and a buildvector
+define void @allocas(i8** %a, i8** %b, i8** %c) {
+; CHECK-LABEL: @allocas(
+; CHECK-NEXT:    [[V1:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[V2:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[V1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[V2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8*> [[TMP3]], i32 0
+; CHECK-NEXT:    store i8* [[TMP4]], i8** [[A:%.*]], align 8
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr i8*, i8** [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8** [[B]] to <2 x i8*>*
+; CHECK-NEXT:    store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP5]], align 8
+; CHECK-NEXT:    ret void
+;
+  ; %add1 feeds both a scalar store to %a and the vectorized store to %b, hence the extractelement of lane 0.
+  %v1 = alloca i8
+  %add1 = getelementptr i8, i8* %v1, i32 1
+  store i8* %add1, i8** %a
+  %v2 = alloca i8
+
+  %add2 = getelementptr i8, i8* %v2, i32 1
+
+  store i8* %add1, i8** %b
+  %b2 = getelementptr i8*, i8** %b, i32 1
+  store i8* %add2, i8** %b2
+  ret void
+}
+
+; Allocas cannot be speculated above a potentially non-returning call
+define void @allocas_speculation(i8** %a, i8** %b, i8** %c) {
+; CHECK-LABEL: @allocas_speculation(
+; CHECK-NEXT:    [[V1:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[ADD1:%.*]] = getelementptr i8, i8* [[V1]], i32 1
+; CHECK-NEXT:    store i8* [[ADD1]], i8** [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @may_inf_loop_ro()
+; CHECK-NEXT:    [[V2:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[ADD2:%.*]] = getelementptr i8, i8* [[V2]], i32 1
+; CHECK-NEXT:    store i8* [[ADD1]], i8** [[B:%.*]], align 8
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr i8*, i8** [[B]], i32 1
+; CHECK-NEXT:    store i8* [[ADD2]], i8** [[B2]], align 8
+; CHECK-NEXT:    ret void
+;
+  ; The call sits between the two allocas; the expected output stays scalar with %v2 still after the call.
+  %v1 = alloca i8
+  %add1 = getelementptr i8, i8* %v1, i32 1
+  store i8* %add1, i8** %a
+  call i64 @may_inf_loop_ro()
+  %v2 = alloca i8
+
+  %add2 = getelementptr i8, i8* %v2, i32 1
+
+  store i8* %add1, i8** %b
+  %b2 = getelementptr i8*, i8** %b, i32 1
+  store i8* %add2, i8** %b2
+  ret void
+}
+
+; FIXME: This is wrong because we lift the alloca out of the region
+define void @stacksave(i8** %a, i8** %b, i8** %c) {
+; CHECK-LABEL: @stacksave(
+; CHECK-NEXT:    [[V1:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[V2:%.*]] = alloca inalloca i8, align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[V1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[V2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8*> [[TMP3]], i32 0
+; CHECK-NEXT:    store i8* [[TMP4]], i8** [[A:%.*]], align 8
+; CHECK-NEXT:    [[STACK:%.*]] = call i8* @llvm.stacksave()
+; CHECK-NEXT:    call void @use(i8* inalloca(i8) [[V2]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[STACK]])
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr i8*, i8** [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8** [[B]] to <2 x i8*>*
+; CHECK-NEXT:    store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP5]], align 8
+; CHECK-NEXT:    ret void
+;
+  ; %v2 is allocated between stacksave and stackrestore, yet the expected output places it before the stacksave call.
+  %v1 = alloca i8
+  %add1 = getelementptr i8, i8* %v1, i32 1
+  store i8* %add1, i8** %a
+
+  %stack = call i8* @llvm.stacksave()
+  %v2 = alloca inalloca i8
+  call void @use(i8* inalloca(i8) %v2) readnone
+  call void @llvm.stackrestore(i8* %stack)
+
+  %add2 = getelementptr i8, i8* %v2, i32 1
+
+  store i8* %add1, i8** %b
+  %b2 = getelementptr i8*, i8** %b, i32 1
+  store i8* %add2, i8** %b2
+  ret void
+}
+
+define void @stacksave2(i8** %a, i8** %b, i8** %c) {
+; CHECK-LABEL: @stacksave2(
+; CHECK-NEXT:    [[V1:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[STACK:%.*]] = call i8* @llvm.stacksave()
+; CHECK-NEXT:    [[V2:%.*]] = alloca inalloca i8, align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[V1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[V2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8*> [[TMP3]], i32 0
+; CHECK-NEXT:    store i8* [[TMP4]], i8** [[A:%.*]], align 8
+; CHECK-NEXT:    call void @use(i8* inalloca(i8) [[V2]]) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[STACK]])
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr i8*, i8** [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8** [[B]] to <2 x i8*>*
+; CHECK-NEXT:    store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP5]], align 8
+; CHECK-NEXT:    ret void
+;
+  ; Like @stacksave, but the store to %a follows the stacksave and @use is readonly; expected output keeps %v2 after the stacksave.
+  %v1 = alloca i8
+  %add1 = getelementptr i8, i8* %v1, i32 1
+
+  %stack = call i8* @llvm.stacksave()
+  store i8* %add1, i8** %a
+  %v2 = alloca inalloca i8
+  call void @use(i8* inalloca(i8) %v2) readonly
+  call void @llvm.stackrestore(i8* %stack)
+
+  %add2 = getelementptr i8, i8* %v2, i32 1
+
+  store i8* %add1, i8** %b
+  %b2 = getelementptr i8*, i8** %b, i32 1
+  store i8* %add2, i8** %b2
+  ret void
+}
+
+define void @stacksave3(i8** %a, i8** %b, i8** %c) {
+; CHECK-LABEL: @stacksave3(
+; CHECK-NEXT:    [[STACK:%.*]] = call i8* @llvm.stacksave()
+; CHECK-NEXT:    [[V1:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[V2:%.*]] = alloca inalloca i8, align 1
+; CHECK-NEXT:    call void @use(i8* inalloca(i8) [[V2]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[STACK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i8*> poison, i8* [[V1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i8*> [[TMP1]], i8* [[V2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr i8*, i8** [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8** [[B]] to <2 x i8*>*
+; CHECK-NEXT:    store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  ; stacksave precedes both allocas; expected output keeps them ahead of @use and emits the vector GEP after stackrestore.
+  %stack = call i8* @llvm.stacksave()
+  %v1 = alloca i8
+
+  %v2 = alloca inalloca i8
+  call void @use(i8* inalloca(i8) %v2) readnone
+  call void @llvm.stackrestore(i8* %stack)
+
+  %add1 = getelementptr i8, i8* %v1, i32 1
+  %add2 = getelementptr i8, i8* %v2, i32 1
+
+  store i8* %add1, i8** %b
+  %b2 = getelementptr i8*, i8** %b, i32 1
+  store i8* %add2, i8** %b2
+  ret void
+}
+
+; Here we have an alloca which needs to stay under the stacksave, but is not
+; directly part of the vectorization tree.  Instead, the stacksave is
+; encountered during dependency scanning via the memory chain.
+define void @stacksave4(i8** %a, i8** %b, i8** %c) {
+; CHECK-LABEL: @stacksave4(
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr i8*, i8** [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8** [[A]] to <2 x i8*>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8*>, <2 x i8*>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[STACK:%.*]] = call i8* @llvm.stacksave()
+; CHECK-NEXT:    [[X:%.*]] = alloca inalloca i8, align 1
+; CHECK-NEXT:    call void @use(i8* inalloca(i8) [[X]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[STACK]])
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr i8*, i8** [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8** [[B]] to <2 x i8*>*
+; CHECK-NEXT:    store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  ; Expected output: vector load and GEP before stacksave, vector store after stackrestore, %x left in between.
+  %v1 = load i8*, i8** %a
+  %a2 = getelementptr i8*, i8** %a, i32 1
+  %v2 = load i8*, i8** %a2
+
+  %add1 = getelementptr i8, i8* %v1, i32 1
+  %add2 = getelementptr i8, i8* %v2, i32 1
+
+  %stack = call i8* @llvm.stacksave()
+  %x = alloca inalloca i8
+  call void @use(i8* inalloca(i8) %x) readnone
+  call void @llvm.stackrestore(i8* %stack)
+
+  store i8* %add1, i8** %b
+  %b2 = getelementptr i8*, i8** %b, i32 1
+  store i8* %add2, i8** %b2
+  ret void
+}
+
+define void @stacksave5(i8** %a, i8** %b, i8** %c) {
+; CHECK-LABEL: @stacksave5(
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr i8*, i8** [[A:%.*]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8** [[A]] to <2 x i8*>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8*>, <2 x i8*>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <2 x i8*> [[TMP2]], <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[STACK:%.*]] = call i8* @llvm.stacksave()
+; CHECK-NEXT:    [[X:%.*]] = alloca inalloca i8, align 1
+; CHECK-NEXT:    call void @use(i8* inalloca(i8) [[X]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @llvm.stackrestore(i8* [[STACK]])
+; CHECK-NEXT:    [[B2:%.*]] = getelementptr i8*, i8** [[B:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8** [[B]] to <2 x i8*>*
+; CHECK-NEXT:    store <2 x i8*> [[TMP3]], <2 x i8*>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+  ; NOTE(review): this body and its expected output match @stacksave4 line for line -- confirm whether a distinct variant was intended.
+  %v1 = load i8*, i8** %a
+  %a2 = getelementptr i8*, i8** %a, i32 1
+  %v2 = load i8*, i8** %a2
+
+  %add1 = getelementptr i8, i8* %v1, i32 1
+  %add2 = getelementptr i8, i8* %v2, i32 1
+
+  %stack = call i8* @llvm.stacksave()
+  %x = alloca inalloca i8
+  call void @use(i8* inalloca(i8) %x) readnone
+  call void @llvm.stackrestore(i8* %stack)
+
+  store i8* %add1, i8** %b
+  %b2 = getelementptr i8*, i8** %b, i32 1
+  store i8* %add2, i8** %b2
+  ret void
+}
+
+declare void @use(i8* inalloca(i8))
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)