[clang] 8ff85ed - As a follow-up to my initial mail to llvm-dev here's a first pass at the O1 described there.

Eric Christopher via cfe-commits cfe-commits at lists.llvm.org
Mon Nov 25 17:16:59 PST 2019


Author: Eric Christopher
Date: 2019-11-25T17:16:46-08:00
New Revision: 8ff85ed905a7306977d07a5cd67ab4d5a56fafb4

URL: https://github.com/llvm/llvm-project/commit/8ff85ed905a7306977d07a5cd67ab4d5a56fafb4
DIFF: https://github.com/llvm/llvm-project/commit/8ff85ed905a7306977d07a5cd67ab4d5a56fafb4.diff

LOG: As a follow-up to my initial mail to llvm-dev here's a first pass at the O1 described there.

This change doesn't include any change to move from selection dag to fast isel
and that will come with other numbers that should help inform that decision.
There also haven't been any real debuggability studies with this pipeline yet,
this is just the initial start done so that people could see it and we could start
tweaking after.

Test updates: Outside of the newpm tests most of the updates are coming from either
optimization passes not run anymore (and without a compelling argument at the moment)
that were largely used for canonicalization in clang.

Original post:

http://lists.llvm.org/pipermail/llvm-dev/2019-April/131494.html

Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65410

Added: 
    

Modified: 
    clang/test/CodeGen/2008-07-30-implicit-initialization.c
    clang/test/CodeGen/arm-fp16-arguments.c
    clang/test/CodeGen/arm-vfp16-arguments2.cpp
    clang/test/CodeGen/atomic-ops-libcall.c
    clang/test/CodeGenCXX/atomicinit.cpp
    clang/test/CodeGenCXX/auto-var-init.cpp
    clang/test/CodeGenCXX/discard-name-values.cpp
    clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
    clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
    clang/test/CodeGenCXX/nrvo.cpp
    clang/test/CodeGenCXX/stack-reuse.cpp
    clang/test/CodeGenCXX/wasm-args-returns.cpp
    clang/test/CodeGenObjCXX/arc-blocks.mm
    clang/test/CodeGenObjCXX/nrvo.mm
    clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c
    clang/test/PCH/no-escaping-block-tail-calls.cpp
    llvm/include/llvm/Passes/PassBuilder.h
    llvm/lib/Passes/PassBuilder.cpp
    llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
    llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
    llvm/test/Feature/optnone-opt.ll
    llvm/test/Other/new-pm-defaults.ll
    llvm/test/Other/new-pm-thinlto-defaults.ll
    llvm/test/Transforms/MemCpyOpt/lifetime.ll
    llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
    llvm/test/Transforms/PhaseOrdering/two-shifts-by-sext.ll

Removed: 
    


################################################################################
diff  --git a/clang/test/CodeGen/2008-07-30-implicit-initialization.c b/clang/test/CodeGen/2008-07-30-implicit-initialization.c
index e77c70a140f9..f2621f4560ec 100644
--- a/clang/test/CodeGen/2008-07-30-implicit-initialization.c
+++ b/clang/test/CodeGen/2008-07-30-implicit-initialization.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i386-unknown-unknown -O1 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-unknown -O2 -emit-llvm -o - %s | FileCheck %s
 // CHECK-LABEL: define i32 @f0()
 // CHECK:   ret i32 0
 // CHECK-LABEL: define i32 @f1()

diff  --git a/clang/test/CodeGen/arm-fp16-arguments.c b/clang/test/CodeGen/arm-fp16-arguments.c
index d739f4b9c66a..34dc1a1cbf6a 100644
--- a/clang/test/CodeGen/arm-fp16-arguments.c
+++ b/clang/test/CodeGen/arm-fp16-arguments.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fallow-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
-// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi hard -fallow-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
-// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=NATIVE
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fallow-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi hard -fallow-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=NATIVE
 
 __fp16 g;
 

diff  --git a/clang/test/CodeGen/arm-vfp16-arguments2.cpp b/clang/test/CodeGen/arm-vfp16-arguments2.cpp
index 4f75971d8327..e436a5ecd6ab 100644
--- a/clang/test/CodeGen/arm-vfp16-arguments2.cpp
+++ b/clang/test/CodeGen/arm-vfp16-arguments2.cpp
@@ -1,12 +1,12 @@
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
-// RUN:   -mfloat-abi soft -target-feature +neon -emit-llvm -o - -O1 %s \
+// RUN:   -mfloat-abi soft -target-feature +neon -emit-llvm -o - -O2 %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-SOFT
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
-// RUN:   -mfloat-abi hard -target-feature +neon -emit-llvm -o - -O1 %s \
+// RUN:   -mfloat-abi hard -target-feature +neon -emit-llvm -o - -O2 %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-HARD
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
 // RUN:   -mfloat-abi hard -target-feature +neon -target-feature +fullfp16 \
-// RUN:   -emit-llvm -o - -O1 %s \
+// RUN:   -emit-llvm -o - -O2 %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-FULL
 
 typedef float float32_t;

diff  --git a/clang/test/CodeGen/atomic-ops-libcall.c b/clang/test/CodeGen/atomic-ops-libcall.c
index c673b07f8ed8..ca79688c8a0c 100644
--- a/clang/test/CodeGen/atomic-ops-libcall.c
+++ b/clang/test/CodeGen/atomic-ops-libcall.c
@@ -10,109 +10,109 @@ enum memory_order {
 
 int *test_c11_atomic_fetch_add_int_ptr(_Atomic(int *) *p) {
   // CHECK: test_c11_atomic_fetch_add_int_ptr
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 12, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 12, i32 5)
   return __c11_atomic_fetch_add(p, 3, memory_order_seq_cst);
 }
 
 int *test_c11_atomic_fetch_sub_int_ptr(_Atomic(int *) *p) {
   // CHECK: test_c11_atomic_fetch_sub_int_ptr
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 20, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 20, i32 5)
   return __c11_atomic_fetch_sub(p, 5, memory_order_seq_cst);
 }
 
 int test_c11_atomic_fetch_add_int(_Atomic(int) *p) {
   // CHECK: test_c11_atomic_fetch_add_int
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 3, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 3, i32 5)
   return __c11_atomic_fetch_add(p, 3, memory_order_seq_cst);
 }
 
 int test_c11_atomic_fetch_sub_int(_Atomic(int) *p) {
   // CHECK: test_c11_atomic_fetch_sub_int
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 5, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 5, i32 5)
   return __c11_atomic_fetch_sub(p, 5, memory_order_seq_cst);
 }
 
 int *fp2a(int **p) {
   // CHECK: @fp2a
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 4, i32 0)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 4, i32 0)
   // Note, the GNU builtins do not multiply by sizeof(T)!
   return __atomic_fetch_sub(p, 4, memory_order_relaxed);
 }
 
 int test_atomic_fetch_add(int *p) {
   // CHECK: test_atomic_fetch_add
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   return __atomic_fetch_add(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_fetch_sub(int *p) {
   // CHECK: test_atomic_fetch_sub
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   return __atomic_fetch_sub(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_fetch_and(int *p) {
   // CHECK: test_atomic_fetch_and
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_and_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_and_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   return __atomic_fetch_and(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_fetch_or(int *p) {
   // CHECK: test_atomic_fetch_or
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_or_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_or_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   return __atomic_fetch_or(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_fetch_xor(int *p) {
   // CHECK: test_atomic_fetch_xor
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_xor_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_xor_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   return __atomic_fetch_xor(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_fetch_nand(int *p) {
   // CHECK: test_atomic_fetch_nand
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_nand_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_nand_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   return __atomic_fetch_nand(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_add_fetch(int *p) {
   // CHECK: test_atomic_add_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   // CHECK: {{%[^ ]*}} = add i32 [[CALL]], 55
   return __atomic_add_fetch(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_sub_fetch(int *p) {
   // CHECK: test_atomic_sub_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   // CHECK: {{%[^ ]*}} = add i32 [[CALL]], -55
   return __atomic_sub_fetch(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_and_fetch(int *p) {
   // CHECK: test_atomic_and_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_and_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_and_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   // CHECK: {{%[^ ]*}} = and i32 [[CALL]], 55
   return __atomic_and_fetch(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_or_fetch(int *p) {
   // CHECK: test_atomic_or_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_or_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_or_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   // CHECK: {{%[^ ]*}} = or i32 [[CALL]], 55
   return __atomic_or_fetch(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_xor_fetch(int *p) {
   // CHECK: test_atomic_xor_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_xor_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_xor_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   // CHECK: {{%[^ ]*}} = xor i32 [[CALL]], 55
   return __atomic_xor_fetch(p, 55, memory_order_seq_cst);
 }
 
 int test_atomic_nand_fetch(int *p) {
   // CHECK: test_atomic_nand_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_nand_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_nand_4(i8* {{%[0-9]+}}, i32 55, i32 5)
   // FIXME: We should not be checking optimized IR. It changes independently of clang.
   // FIXME-CHECK: [[AND:%[^ ]*]] = and i32 [[CALL]], 55
   // FIXME-CHECK: {{%[^ ]*}} = xor i32 [[AND]], -1

diff  --git a/clang/test/CodeGenCXX/atomicinit.cpp b/clang/test/CodeGenCXX/atomicinit.cpp
index 85ec74593fe0..657ade588fd5 100644
--- a/clang/test/CodeGenCXX/atomicinit.cpp
+++ b/clang/test/CodeGenCXX/atomicinit.cpp
@@ -31,7 +31,7 @@ _Atomic(B) b;
 // CHECK-LABEL: define void @_Z11atomic_initR1Ai
 void atomic_init(A& a, int i) {
   // CHECK-NOT: atomic
-  // CHECK: tail call void @_ZN1BC1Ei
+  // CHECK: call void @_ZN1BC1Ei
   __c11_atomic_init(&b, B(i));
   // CHECK-NEXT: ret void
 }

diff  --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp
index a2cb2c8352b6..9cd71bdfd1a7 100644
--- a/clang/test/CodeGenCXX/auto-var-init.cpp
+++ b/clang/test/CodeGenCXX/auto-var-init.cpp
@@ -645,7 +645,7 @@ TEST_UNINIT(smallpartinit, smallpartinit);
 // ZERO-LABEL: @test_smallpartinit_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0,
 // ZERO-O1-LEGACY: store i16 0, i16* %uninit, align 2
-// ZERO-O1-NEWPM: store i16 42, i16* %uninit, align 2
+// ZERO-O1-NEWPM: store i16 0, i16* %uninit, align 2
 
 TEST_BRACES(smallpartinit, smallpartinit);
 // CHECK-LABEL: @test_smallpartinit_braces()
@@ -718,7 +718,7 @@ TEST_UNINIT(paddednullinit, paddednullinit);
 // PATTERN-LABEL: @test_paddednullinit_uninit()
 // PATTERN-O0: call void @llvm.memcpy{{.*}} @__const.test_paddednullinit_uninit.uninit
 // PATTERN-O1-LEGACY: store i64 [[I64]], i64* %uninit, align 8
-// PATTERN-O1-NEWPM: store i64 2863311360, i64* %uninit, align 8
+// PATTERN-O1-NEWPM: store i64 [[I64]], i64* %uninit, align 8
 // ZERO-LABEL: @test_paddednullinit_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0,
 // ZERO-O1: store i64 0, i64* %uninit, align 8
@@ -1344,10 +1344,7 @@ TEST_UNINIT(virtualderived, virtualderived);
 // ZERO-LABEL: @test_virtualderived_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0,
 // ZERO-O1-LEGACY: call void @llvm.memset{{.*}}, i8 0,
-// ZERO-O1-NEWPM: [[FIELD1:%.*]] = getelementptr inbounds %struct.virtualderived, %struct.virtualderived* %uninit, i64 0, i32 1, i32 0, i32 0
-// ZERO-O1-NEWPM: [[FIELD0:%.*]] = getelementptr inbounds %struct.virtualderived, %struct.virtualderived* %uninit, i64 0, i32 0, i32 0
-// ZERO-O1-NEWPM: store i32 (...)** bitcast (i8** getelementptr inbounds ({ [7 x i8*], [5 x i8*] }, { [7 x i8*], [5 x i8*] }* @_ZTV14virtualderived, i64 0, inrange i32 0, i64 5) to i32 (...)**), i32 (...)*** [[FIELD0]], align 8
-// ZERO-O1-NEWPM: store i32 (...)** bitcast (i8** getelementptr inbounds ({ [7 x i8*], [5 x i8*] }, { [7 x i8*], [5 x i8*] }* @_ZTV14virtualderived, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** [[FIELD1]], align 8
+// ZERO-O1-NEWPM: call void @llvm.memset{{.*}}, i8 0,
 
 TEST_BRACES(virtualderived, virtualderived);
 // CHECK-LABEL: @test_virtualderived_braces()

diff  --git a/clang/test/CodeGenCXX/discard-name-values.cpp b/clang/test/CodeGenCXX/discard-name-values.cpp
index aa30dae7501b..91328a4ddade 100644
--- a/clang/test/CodeGenCXX/discard-name-values.cpp
+++ b/clang/test/CodeGenCXX/discard-name-values.cpp
@@ -11,11 +11,11 @@ bool test(bool pred) {
 
   if (pred) {
     // DISCARDVALUE: 2:
-    // DISCARDVALUE-NEXT: tail call void @branch()
+    // DISCARDVALUE-NEXT: call void @branch()
     // DISCARDVALUE-NEXT: br label %3
 
     // CHECK: if.then:
-    // CHECK-NEXT: tail call void @branch()
+    // CHECK-NEXT: call void @branch()
     // CHECK-NEXT: br label %if.end
     branch();
   }

diff  --git a/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp b/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
index c99df0e88b42..a07114dce7d0 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
@@ -13,7 +13,7 @@ T* test0() { return dynamic_cast<T*>((B*)0); }
 T* test1(V* x) { return &dynamic_cast<T&>(*x); }
 // CHECK-LABEL: define dso_local %struct.T* @"?test1@@YAPAUT@@PAUV@@@Z"(%struct.V* %x)
 // CHECK:        [[CAST:%.*]] = bitcast %struct.V* %x to i8*
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[CAST]], i32 0, i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[CAST]], i32 0, i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   ret %struct.T* [[RET]]
 
@@ -25,7 +25,7 @@ T* test2(A* x) { return &dynamic_cast<T&>(*x); }
 // CHECK-NEXT:   [[VBOFFP:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[CAST]], i32 [[VBOFFS]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[VBOFFS]], i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUA@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[VBOFFS]], i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUA@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   ret %struct.T* [[RET]]
 
@@ -39,14 +39,14 @@ T* test3(B* x) { return &dynamic_cast<T&>(*x); }
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[VOIDP]], i32 [[DELTA]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUB@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUB@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   ret %struct.T* [[RET]]
 
 T* test4(V* x) { return dynamic_cast<T*>(x); }
 // CHECK-LABEL: define dso_local %struct.T* @"?test4@@YAPAUT@@PAUV@@@Z"(%struct.V* %x)
 // CHECK:        [[CAST:%.*]] = bitcast %struct.V* %x to i8*
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[CAST]], i32 0, i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[CAST]], i32 0, i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   ret %struct.T* [[RET]]
 
@@ -60,7 +60,7 @@ T* test5(A* x) { return dynamic_cast<T*>(x); }
 // CHECK-NEXT:   [[VBOFFP:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[VOIDP]], i32 [[VBOFFS]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* nonnull [[ADJ]], i32 [[VBOFFS]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUA@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* nonnull [[ADJ]], i32 [[VBOFFS]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUA@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RES:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi %struct.T*
@@ -78,7 +78,7 @@ T* test6(B* x) { return dynamic_cast<T*>(x); }
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[CAST]], i32 [[DELTA]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUB@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUB@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RES:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi %struct.T*
@@ -87,7 +87,7 @@ T* test6(B* x) { return dynamic_cast<T*>(x); }
 void* test7(V* x) { return dynamic_cast<void*>(x); }
 // CHECK-LABEL: define dso_local i8* @"?test7@@YAPAXPAUV@@@Z"(%struct.V* %x)
 // CHECK:        [[CAST:%.*]] = bitcast %struct.V* %x to i8*
-// CHECK-NEXT:   [[RET:%.*]] = tail call i8* @__RTCastToVoid(i8* [[CAST]])
+// CHECK-NEXT:   [[RET:%.*]] = call i8* @__RTCastToVoid(i8* [[CAST]])
 // CHECK-NEXT:   ret i8* [[RET]]
 
 void* test8(A* x) { return dynamic_cast<void*>(x); }
@@ -100,7 +100,7 @@ void* test8(A* x) { return dynamic_cast<void*>(x); }
 // CHECK-NEXT:   [[VBOFFP:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[VOIDP]], i32 [[VBOFFS]]
-// CHECK-NEXT:   [[RES:%.*]] = tail call i8* @__RTCastToVoid(i8* nonnull [[ADJ]])
+// CHECK-NEXT:   [[RES:%.*]] = call i8* @__RTCastToVoid(i8* nonnull [[ADJ]])
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi i8*
 // CHECK-NEXT:   ret i8* [[RET]]
@@ -117,7 +117,7 @@ void* test9(B* x) { return dynamic_cast<void*>(x); }
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[CAST]], i32 [[DELTA]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTCastToVoid(i8* [[ADJ]])
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTCastToVoid(i8* [[ADJ]])
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi i8*
 // CHECK-NEXT:   ret i8* [[RET]]

diff  --git a/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp b/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
index 848e280cd9fe..f3bd7e6fd6c8 100644
--- a/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
@@ -25,10 +25,10 @@ const std::type_info* test2_typeid() { return &typeid(&a); }
 
 const std::type_info* test3_typeid() { return &typeid(*fn()); }
 // CHECK-LABEL: define dso_local %struct.type_info* @"?test3_typeid@@YAPBUtype_info@@XZ"()
-// CHECK:        [[CALL:%.*]] = tail call %struct.A* @"?fn@@YAPAUA@@XZ"()
+// CHECK:        [[CALL:%.*]] = call %struct.A* @"?fn@@YAPAUA@@XZ"()
 // CHECK-NEXT:   [[CMP:%.*]] = icmp eq %struct.A* [[CALL]], null
 // CHECK-NEXT:   br i1 [[CMP]]
-// CHECK:        tail call i8* @__RTtypeid(i8* null)
+// CHECK:        call i8* @__RTtypeid(i8* null)
 // CHECK-NEXT:   unreachable
 // CHECK:        [[THIS:%.*]] = bitcast %struct.A* [[CALL]] to i8*
 // CHECK-NEXT:   [[VBTBLP:%.*]] = getelementptr %struct.A, %struct.A* [[CALL]], i32 0, i32 0
@@ -36,7 +36,7 @@ const std::type_info* test3_typeid() { return &typeid(*fn()); }
 // CHECK-NEXT:   [[VBSLOT:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBASE_OFFS:%.*]] = load i32, i32* [[VBSLOT]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[THIS]], i32 [[VBASE_OFFS]]
-// CHECK-NEXT:   [[RT:%.*]] = tail call i8* @__RTtypeid(i8* nonnull [[ADJ]])
+// CHECK-NEXT:   [[RT:%.*]] = call i8* @__RTtypeid(i8* nonnull [[ADJ]])
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[RT]] to %struct.type_info*
 // CHECK-NEXT:   ret %struct.type_info* [[RET]]
 
@@ -46,7 +46,7 @@ const std::type_info* test4_typeid() { return &typeid(b); }
 
 const std::type_info* test5_typeid() { return &typeid(v); }
 // CHECK: define dso_local %struct.type_info* @"?test5_typeid@@YAPBUtype_info@@XZ"()
-// CHECK:        [[RT:%.*]] = tail call i8* @__RTtypeid(i8* bitcast (%struct.V* @"?v@@3UV@@A" to i8*))
+// CHECK:        [[RT:%.*]] = call i8* @__RTtypeid(i8* bitcast (%struct.V* @"?v@@3UV@@A" to i8*))
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[RT]] to %struct.type_info*
 // CHECK-NEXT:   ret %struct.type_info* [[RET]]
 

diff  --git a/clang/test/CodeGenCXX/nrvo.cpp b/clang/test/CodeGenCXX/nrvo.cpp
index aab26890ea98..74a5af765d13 100644
--- a/clang/test/CodeGenCXX/nrvo.cpp
+++ b/clang/test/CodeGenCXX/nrvo.cpp
@@ -33,13 +33,13 @@ X test0() {
 // CHECK-LABEL: define void @_Z5test1b(
 // CHECK-EH-LABEL: define void @_Z5test1b(
 X test1(bool B) {
-  // CHECK:      tail call {{.*}} @_ZN1XC1Ev
+  // CHECK:      call {{.*}} @_ZN1XC1Ev
   // CHECK-NEXT: ret void
   X x;
   if (B)
     return (x);
   return x;
-  // CHECK-EH:      tail call {{.*}} @_ZN1XC1Ev
+  // CHECK-EH:      call {{.*}} @_ZN1XC1Ev
   // CHECK-EH-NEXT: ret void
 }
 
@@ -130,7 +130,7 @@ X test2(bool B) {
 
 // CHECK-LABEL: define void @_Z5test3b
 X test3(bool B) {
-  // CHECK: tail call {{.*}} @_ZN1XC1Ev
+  // CHECK: call {{.*}} @_ZN1XC1Ev
   // CHECK-NOT: call {{.*}} @_ZN1XC1ERKS_
   // CHECK: call {{.*}} @_ZN1XC1Ev
   // CHECK: call {{.*}} @_ZN1XC1ERKS_
@@ -148,14 +148,14 @@ extern "C" void exit(int) throw();
 // CHECK-LABEL: define void @_Z5test4b
 X test4(bool B) {
   {
-    // CHECK: tail call {{.*}} @_ZN1XC1Ev
+    // CHECK: call {{.*}} @_ZN1XC1Ev
     X x;
     // CHECK: br i1
     if (B)
       return x;
   }
-  // CHECK: tail call {{.*}} @_ZN1XD1Ev
-  // CHECK: tail call void @exit(i32 1)
+  // CHECK: call {{.*}} @_ZN1XD1Ev
+  // CHECK: call void @exit(i32 1)
   exit(1);
 }
 
@@ -191,7 +191,7 @@ X test6() {
 
 // CHECK-LABEL: define void @_Z5test7b
 X test7(bool b) {
-  // CHECK: tail call {{.*}} @_ZN1XC1Ev
+  // CHECK: call {{.*}} @_ZN1XC1Ev
   // CHECK-NEXT: ret
   if (b) {
     X x;
@@ -202,7 +202,7 @@ X test7(bool b) {
 
 // CHECK-LABEL: define void @_Z5test8b
 X test8(bool b) {
-  // CHECK: tail call {{.*}} @_ZN1XC1Ev
+  // CHECK: call {{.*}} @_ZN1XC1Ev
   // CHECK-NEXT: ret
   if (b) {
     X x;
@@ -218,6 +218,6 @@ Y<int> test9() {
 }
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1YIiE1fEv
-// CHECK: tail call {{.*}} @_ZN1YIiEC1Ev
+// CHECK: call {{.*}} @_ZN1YIiEC1Ev
 
 // CHECK-EH-03: attributes [[NR_NUW]] = { noreturn nounwind }

diff  --git a/clang/test/CodeGenCXX/stack-reuse.cpp b/clang/test/CodeGenCXX/stack-reuse.cpp
index 8325604391ae..35dcb5b349c3 100644
--- a/clang/test/CodeGenCXX/stack-reuse.cpp
+++ b/clang/test/CodeGenCXX/stack-reuse.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple armv7-unknown-linux-gnueabihf %s -o - -emit-llvm -O1 | FileCheck %s
+// RUN: %clang_cc1 -triple armv7-unknown-linux-gnueabihf %s -o - -emit-llvm -O2 | FileCheck %s
 
 // Stack should be reused when possible, no need to allocate two separate slots
 // if they have disjoint lifetime.

diff  --git a/clang/test/CodeGenCXX/wasm-args-returns.cpp b/clang/test/CodeGenCXX/wasm-args-returns.cpp
index 5718223f9f74..c547eb85390d 100644
--- a/clang/test/CodeGenCXX/wasm-args-returns.cpp
+++ b/clang/test/CodeGenCXX/wasm-args-returns.cpp
@@ -19,8 +19,8 @@ test(one_field);
 // CHECK: define double @_Z7forward9one_field(double returned %{{.*}})
 //
 // CHECK: define void @_Z14test_one_fieldv()
-// CHECK: %[[call:.*]] = tail call double @_Z13def_one_fieldv()
-// CHECK: tail call void @_Z3use9one_field(double %[[call]])
+// CHECK: %[[call:.*]] = call double @_Z13def_one_fieldv()
+// CHECK: call void @_Z3use9one_field(double %[[call]])
 // CHECK: ret void
 //
 // CHECK: declare void @_Z3use9one_field(double)
@@ -82,8 +82,8 @@ test(empty);
 // CHECK: define void @_Z7forward5empty()
 //
 // CHECK: define void @_Z10test_emptyv()
-// CHECK: tail call void @_Z9def_emptyv()
-// CHECK: tail call void @_Z3use5empty()
+// CHECK: call void @_Z9def_emptyv()
+// CHECK: call void @_Z3use5empty()
 // CHECK: ret void
 //
 // CHECK: declare void @_Z3use5empty()
@@ -96,8 +96,8 @@ test(one_bitfield);
 // CHECK: define i32 @_Z7forward12one_bitfield(i32 returned %{{.*}})
 //
 // CHECK: define void @_Z17test_one_bitfieldv()
-// CHECK: %[[call:.*]] = tail call i32 @_Z16def_one_bitfieldv()
-// CHECK: tail call void @_Z3use12one_bitfield(i32 %[[call]])
+// CHECK: %[[call:.*]] = call i32 @_Z16def_one_bitfieldv()
+// CHECK: call void @_Z3use12one_bitfield(i32 %[[call]])
 // CHECK: ret void
 //
 // CHECK: declare void @_Z3use12one_bitfield(i32)

diff  --git a/clang/test/CodeGenObjCXX/arc-blocks.mm b/clang/test/CodeGenObjCXX/arc-blocks.mm
index 24697cf1bd37..d29491ed077e 100644
--- a/clang/test/CodeGenObjCXX/arc-blocks.mm
+++ b/clang/test/CodeGenObjCXX/arc-blocks.mm
@@ -122,7 +122,7 @@ void foo() {
 // CHECK: call void @__clang_call_terminate(
 
 // CHECK-O1-LABEL: define linkonce_odr hidden void @__copy_helper_block_ea8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(
-// CHECK-O1: tail call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
+// CHECK-O1: call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
 // CHECK-NOEXCP: define linkonce_odr hidden void @__copy_helper_block_8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(
 
 // CHECK: define linkonce_odr hidden void @__destroy_helper_block_ea8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(
@@ -170,8 +170,8 @@ void foo() {
 // CHECK: call void @__clang_call_terminate(
 
 // CHECK-O1-LABEL: define linkonce_odr hidden void @__destroy_helper_block_ea8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(
-// CHECK-O1: tail call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
-// CHECK-O1: tail call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
+// CHECK-O1: call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
+// CHECK-O1: call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
 // CHECK-NOEXCP: define linkonce_odr hidden void @__destroy_helper_block_8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(
 
 namespace {

diff  --git a/clang/test/CodeGenObjCXX/nrvo.mm b/clang/test/CodeGenObjCXX/nrvo.mm
index 1ad5f79ad12e..a02b38b820a3 100644
--- a/clang/test/CodeGenObjCXX/nrvo.mm
+++ b/clang/test/CodeGenObjCXX/nrvo.mm
@@ -14,7 +14,7 @@ @implementation NRVO
 // CHECK: define internal void @"\01-[NRVO getNRVO]"
 - (X)getNRVO { 
   X x;
-  // CHECK: tail call void @_ZN1XC1Ev
+  // CHECK: call void @_ZN1XC1Ev
   // CHECK-NEXT: ret void
   return x;
 }
@@ -24,7 +24,7 @@ X blocksNRVO() {
   return ^{
     // CHECK-LABEL: define internal void @___Z10blocksNRVOv_block_invoke
     X x;
-    // CHECK: tail call void @_ZN1XC1Ev
+    // CHECK: call void @_ZN1XC1Ev
     // CHECK-NEXT: ret void
     return x;
   }() ;

diff  --git a/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c b/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c
index c4a4cf3d9752..020912a4965d 100644
--- a/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c
+++ b/clang/test/Lexer/minimize_source_to_dependency_directives_invalid_error.c
@@ -1,16 +1,16 @@
-// Test CF+LF are properly handled along with quoted, multi-line #error
-// RUN: %clang_cc1 -DOTHER -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s
-
-#ifndef TEST
-#error "message \
-   more message \
-   even more"
-#endif
-
-#ifdef OTHER
-#include <string>
-#endif
-
-// CHECK:      #ifdef OTHER
-// CHECK-NEXT: #include <string>
-// CHECK-NEXT: #endif
+// Test CF+LF are properly handled along with quoted, multi-line #error
+// RUN: %clang_cc1 -DOTHER -print-dependency-directives-minimized-source %s 2>&1 | FileCheck %s
+
+#ifndef TEST
+#error "message \
+   more message \
+   even more"
+#endif
+
+#ifdef OTHER
+#include <string>
+#endif
+
+// CHECK:      #ifdef OTHER
+// CHECK-NEXT: #include <string>
+// CHECK-NEXT: #endif

diff  --git a/clang/test/PCH/no-escaping-block-tail-calls.cpp b/clang/test/PCH/no-escaping-block-tail-calls.cpp
index 5ae8108f387d..bf197267d67d 100644
--- a/clang/test/PCH/no-escaping-block-tail-calls.cpp
+++ b/clang/test/PCH/no-escaping-block-tail-calls.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -x c++-header -triple x86_64-apple-darwin11 -emit-pch -O1 -fblocks -fno-escaping-block-tail-calls -o %t %S/no-escaping-block-tail-calls.h
-// RUN: %clang_cc1 -triple x86_64-apple-darwin11 -include-pch %t -emit-llvm -O1 -fblocks -fno-escaping-block-tail-calls -o - %s | FileCheck %s
+// RUN: %clang_cc1 -x c++-header -triple x86_64-apple-darwin11 -emit-pch -O2 -fblocks -fno-escaping-block-tail-calls -o %t %S/no-escaping-block-tail-calls.h
+// RUN: %clang_cc1 -triple x86_64-apple-darwin11 -include-pch %t -emit-llvm -O2 -fblocks -fno-escaping-block-tail-calls -o - %s | FileCheck %s
 
 // Check that -fno-escaping-block-tail-calls doesn't disable tail-call
 // optimization if the block is non-escaping.

diff  --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index f73e4b42dd4b..7fe03f72305b 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -151,10 +151,6 @@ class PassBuilder {
 
     /// Optimize quickly without destroying debuggability.
     ///
-    /// FIXME: The current and historical behavior of this level does *not*
-    /// agree with this goal, but we would like to move toward this goal in the
-    /// future.
-    ///
     /// This level is tuned to produce a result from the optimizer as quickly
     /// as possible and to avoid destroying debuggability. This tends to result
     /// in a very good development mode where the compiled code will be
@@ -164,9 +160,9 @@ class PassBuilder {
     /// debugging of the resulting binary.
     ///
     /// As an example, complex loop transformations such as versioning,
-    /// vectorization, or fusion might not make sense here due to the degree to
-    /// which the executed code would 
diff er from the source code, and the
-    /// potential compile time cost.
+    /// vectorization, or fusion don't make sense here due to the degree to
+    /// which the executed code 
diff ers from the source code, and the compile time
+    /// cost.
     O1,
 
     /// Optimize for fast execution as much as possible without triggering

diff  --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 5896dbf5bb98..b22921b2b878 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -400,21 +400,25 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
 
   // Hoisting of scalars and load expressions.
-  if (EnableGVNHoist)
-    FPM.addPass(GVNHoistPass());
-
-  // Global value numbering based sinking.
-  if (EnableGVNSink) {
-    FPM.addPass(GVNSinkPass());
-    FPM.addPass(SimplifyCFGPass());
+  if (Level > O1) {
+    if (EnableGVNHoist)
+      FPM.addPass(GVNHoistPass());
+
+    // Global value numbering based sinking.
+    if (EnableGVNSink) {
+      FPM.addPass(GVNSinkPass());
+      FPM.addPass(SimplifyCFGPass());
+    }
   }
 
   // Speculative execution if the target has divergent branches; otherwise nop.
-  FPM.addPass(SpeculativeExecutionPass());
+  if (Level > O1) {
+    FPM.addPass(SpeculativeExecutionPass());
 
-  // Optimize based on known information about branches, and cleanup afterward.
-  FPM.addPass(JumpThreadingPass());
-  FPM.addPass(CorrelatedValuePropagationPass());
+    // Optimize based on known information about branches, and cleanup afterward.
+    FPM.addPass(JumpThreadingPass());
+    FPM.addPass(CorrelatedValuePropagationPass());
+  }
   FPM.addPass(SimplifyCFGPass());
   if (Level == O3)
     FPM.addPass(AggressiveInstCombinePass());
@@ -428,10 +432,12 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
   // using the size value profile. Don't perform this when optimizing for size.
   if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
-      !isOptimizingForSize(Level))
+      !isOptimizingForSize(Level) && Level > O1)
     FPM.addPass(PGOMemOPSizeOpt());
 
-  FPM.addPass(TailCallElimPass());
+  // TODO: Investigate the cost/benefit of tail call elimination on debugging.
+  if (Level > O1)
+    FPM.addPass(TailCallElimPass());
   FPM.addPass(SimplifyCFGPass());
 
   // Form canonically associated expression trees, and simplify the trees using
@@ -458,6 +464,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
 
   // Rotate Loop - disable header duplication at -Oz
   LPM1.addPass(LoopRotatePass(Level != Oz));
+  // TODO: Investigate promotion cap for O1.
   LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
   LPM1.addPass(SimpleLoopUnswitchPass());
   LPM2.addPass(IndVarSimplifyPass());
@@ -525,18 +532,21 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
 
   // Re-consider control flow based optimizations after redundancy elimination,
   // redo DCE, etc.
-  FPM.addPass(JumpThreadingPass());
-  FPM.addPass(CorrelatedValuePropagationPass());
-  FPM.addPass(DSEPass());
-  FPM.addPass(createFunctionToLoopPassAdaptor(
-      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
-      EnableMSSALoopDependency, DebugLogging));
+  if (Level > O1) {
+    FPM.addPass(JumpThreadingPass());
+    FPM.addPass(CorrelatedValuePropagationPass());
+    FPM.addPass(DSEPass());
+    FPM.addPass(createFunctionToLoopPassAdaptor(
+        LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
+        EnableMSSALoopDependency, DebugLogging));
+  }
 
   for (auto &C : ScalarOptimizerLateEPCallbacks)
     C(FPM, Level);
 
   // Finally, do an expensive DCE pass to catch all the dead code exposed by
   // the simplifications and basic cleanup after all the simplifications.
+  // TODO: Investigate if this is too expensive.
   FPM.addPass(ADCEPass());
   FPM.addPass(SimplifyCFGPass());
   FPM.addPass(InstCombinePass());

diff  --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 5314a8219b1e..81424229c3bf 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -320,19 +320,26 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
     legacy::PassManagerBase &MPM) {
   // Start of function pass.
   // Break up aggregate allocas, using SSAUpdater.
+  assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!");
   MPM.add(createSROAPass());
   MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies
-  if (EnableGVNHoist)
-    MPM.add(createGVNHoistPass());
-  if (EnableGVNSink) {
-    MPM.add(createGVNSinkPass());
-    MPM.add(createCFGSimplificationPass());
+
+  if (OptLevel > 1) {
+    if (EnableGVNHoist)
+      MPM.add(createGVNHoistPass());
+    if (EnableGVNSink) {
+      MPM.add(createGVNSinkPass());
+      MPM.add(createCFGSimplificationPass());
+    }
   }
 
-  // Speculative execution if the target has divergent branches; otherwise nop.
-  MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
-  MPM.add(createJumpThreadingPass());         // Thread jumps.
-  MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+  if (OptLevel > 1) {
+    // Speculative execution if the target has divergent branches; otherwise nop.
+    MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
+
+    MPM.add(createJumpThreadingPass());         // Thread jumps.
+    MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+  }
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
   // Combine silly seq's
   if (OptLevel > 2)
@@ -346,8 +353,10 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   if (SizeLevel == 0)
     MPM.add(createPGOMemOPSizeOptLegacyPass());
 
-  MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
-  MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+  // TODO: Investigate the cost/benefit of tail call elimination on debugging.
+  if (OptLevel > 1)
+    MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
+  MPM.add(createCFGSimplificationPass());      // Merge & remove BBs
   MPM.add(createReassociatePass());           // Reassociate expressions
 
   // Begin the loop pass pipeline.
@@ -360,6 +369,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   }
   // Rotate Loop - disable header duplication at -Oz
   MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+  // TODO: Investigate promotion cap for O1.
   MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
   if (EnableSimpleLoopUnswitch)
     MPM.add(createSimpleLoopUnswitchLegacyPass());
@@ -402,16 +412,19 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
   // opened up by them.
   addInstructionCombiningPass(MPM);
   addExtensionsToPM(EP_Peephole, MPM);
-  MPM.add(createJumpThreadingPass());         // Thread jumps
-  MPM.add(createCorrelatedValuePropagationPass());
-  MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+  if (OptLevel > 1) {
+    MPM.add(createJumpThreadingPass());         // Thread jumps
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+  }
 
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
 
   if (RerollLoops)
     MPM.add(createLoopRerollPass());
 
+  // TODO: Investigate if this is too expensive at O1.
   MPM.add(createAggressiveDCEPass());         // Delete dead instructions
   MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
   // Clean up after everything.
@@ -899,7 +912,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
 
   // LTO provides additional opportunities for tailcall elimination due to
   // link-time inlining, and visibility of nocapture attribute.
-  PM.add(createTailCallEliminationPass());
+  if (OptLevel > 1)
+    PM.add(createTailCallEliminationPass());
 
   // Infer attributes on declarations, call sites, arguments, etc.
   PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture.

diff  --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 859f848d228c..682c0679fa24 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -3,17 +3,17 @@
 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NATIVE %s
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
-; GCN-POSTLINK: tail call fast float @_Z3sinf(
-; GCN-POSTLINK: tail call fast float @_Z3cosf(
+; GCN-POSTLINK: call fast float @_Z3sinf(
+; GCN-POSTLINK: call fast float @_Z3cosf(
 ; GCN-PRELINK: call fast float @_Z6sincosfPf(
-; GCN-NATIVE: tail call fast float @_Z10native_sinf(
-; GCN-NATIVE: tail call fast float @_Z10native_cosf(
+; GCN-NATIVE: call fast float @_Z10native_sinf(
+; GCN-NATIVE: call fast float @_Z10native_cosf(
 define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3sinf(float %tmp)
+  %call = call fast float @_Z3sinf(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
-  %call2 = tail call fast float @_Z3cosf(float %tmp)
+  %call2 = call fast float @_Z3cosf(float %tmp)
   %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   store float %call2, float addrspace(1)* %arrayidx3, align 4
   ret void
@@ -24,17 +24,17 @@ declare float @_Z3sinf(float)
 declare float @_Z3cosf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
-; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
-; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
+; GCN-POSTLINK: call fast <2 x float> @_Z3sinDv2_f(
+; GCN-POSTLINK: call fast <2 x float> @_Z3cosDv2_f(
 ; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
-; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
-; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
+; GCN-NATIVE: call fast <2 x float> @_Z10native_sinDv2_f(
+; GCN-NATIVE: call fast <2 x float> @_Z10native_cosDv2_f(
 define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
 entry:
   %tmp = load <2 x float>, <2 x float> addrspace(1)* %a, align 8
-  %call = tail call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
+  %call = call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
   store <2 x float> %call, <2 x float> addrspace(1)* %a, align 8
-  %call2 = tail call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
+  %call2 = call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
   %arrayidx3 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 1
   store <2 x float> %call2, <2 x float> addrspace(1)* %arrayidx3, align 8
   ret void
@@ -45,20 +45,20 @@ declare <2 x float> @_Z3sinDv2_f(<2 x float>)
 declare <2 x float> @_Z3cosDv2_f(<2 x float>)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
-; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
-; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
+; GCN-POSTLINK: call fast <3 x float> @_Z3sinDv3_f(
+; GCN-POSTLINK: call fast <3 x float> @_Z3cosDv3_f(
 ; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
-; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
-; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
+; GCN-NATIVE: call fast <3 x float> @_Z10native_sinDv3_f(
+; GCN-NATIVE: call fast <3 x float> @_Z10native_cosDv3_f(
 define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
 entry:
   %castToVec4 = bitcast <3 x float> addrspace(1)* %a to <4 x float> addrspace(1)*
   %loadVec4 = load <4 x float>, <4 x float> addrspace(1)* %castToVec4, align 16
   %extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %call = tail call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
+  %call = call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
   %extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
   store <4 x float> %extractVec6, <4 x float> addrspace(1)* %castToVec4, align 16
-  %call11 = tail call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
+  %call11 = call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
   %arrayidx12 = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %a, i64 1
   %extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
   %storetmp14 = bitcast <3 x float> addrspace(1)* %arrayidx12 to <4 x float> addrspace(1)*
@@ -71,17 +71,17 @@ declare <3 x float> @_Z3sinDv3_f(<3 x float>)
 declare <3 x float> @_Z3cosDv3_f(<3 x float>)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
-; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
-; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
+; GCN-POSTLINK: call fast <4 x float> @_Z3sinDv4_f(
+; GCN-POSTLINK: call fast <4 x float> @_Z3cosDv4_f(
 ; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
-; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
-; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
+; GCN-NATIVE: call fast <4 x float> @_Z10native_sinDv4_f(
+; GCN-NATIVE: call fast <4 x float> @_Z10native_cosDv4_f(
 define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
 entry:
   %tmp = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
-  %call = tail call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
+  %call = call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
   store <4 x float> %call, <4 x float> addrspace(1)* %a, align 16
-  %call2 = tail call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
+  %call2 = call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
   %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i64 1
   store <4 x float> %call2, <4 x float> addrspace(1)* %arrayidx3, align 16
   ret void
@@ -92,17 +92,17 @@ declare <4 x float> @_Z3sinDv4_f(<4 x float>)
 declare <4 x float> @_Z3cosDv4_f(<4 x float>)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
-; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
-; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
+; GCN-POSTLINK: call fast <8 x float> @_Z3sinDv8_f(
+; GCN-POSTLINK: call fast <8 x float> @_Z3cosDv8_f(
 ; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
-; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
-; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
+; GCN-NATIVE: call fast <8 x float> @_Z10native_sinDv8_f(
+; GCN-NATIVE: call fast <8 x float> @_Z10native_cosDv8_f(
 define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
 entry:
   %tmp = load <8 x float>, <8 x float> addrspace(1)* %a, align 32
-  %call = tail call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
+  %call = call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
   store <8 x float> %call, <8 x float> addrspace(1)* %a, align 32
-  %call2 = tail call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
+  %call2 = call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
   %arrayidx3 = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %a, i64 1
   store <8 x float> %call2, <8 x float> addrspace(1)* %arrayidx3, align 32
   ret void
@@ -113,17 +113,17 @@ declare <8 x float> @_Z3sinDv8_f(<8 x float>)
 declare <8 x float> @_Z3cosDv8_f(<8 x float>)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
-; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
-; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
+; GCN-POSTLINK: call fast <16 x float> @_Z3sinDv16_f(
+; GCN-POSTLINK: call fast <16 x float> @_Z3cosDv16_f(
 ; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
-; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
-; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
+; GCN-NATIVE: call fast <16 x float> @_Z10native_sinDv16_f(
+; GCN-NATIVE: call fast <16 x float> @_Z10native_cosDv16_f(
 define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
 entry:
   %tmp = load <16 x float>, <16 x float> addrspace(1)* %a, align 64
-  %call = tail call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
+  %call = call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
   store <16 x float> %call, <16 x float> addrspace(1)* %a, align 64
-  %call2 = tail call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
+  %call2 = call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
   %arrayidx3 = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %a, i64 1
   store <16 x float> %call2, <16 x float> addrspace(1)* %arrayidx3, align 64
   ret void
@@ -137,7 +137,7 @@ declare <16 x float> @_Z3cosDv16_f(<16 x float>)
 ; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
 define amdgpu_kernel void @test_native_recip(float addrspace(1)* nocapture %a) {
 entry:
-  %call = tail call fast float @_Z12native_recipf(float 3.000000e+00)
+  %call = call fast float @_Z12native_recipf(float 3.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -148,7 +148,7 @@ declare float @_Z12native_recipf(float)
 ; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
 define amdgpu_kernel void @test_half_recip(float addrspace(1)* nocapture %a) {
 entry:
-  %call = tail call fast float @_Z10half_recipf(float 3.000000e+00)
+  %call = call fast float @_Z10half_recipf(float 3.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -160,7 +160,7 @@ declare float @_Z10half_recipf(float)
 define amdgpu_kernel void @test_native_divide(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
+  %call = call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -172,7 +172,7 @@ declare float @_Z13native_divideff(float, float)
 define amdgpu_kernel void @test_half_divide(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
+  %call = call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -184,7 +184,7 @@ declare float @_Z11half_divideff(float, float)
 define amdgpu_kernel void @test_pow_0f(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -196,7 +196,7 @@ declare float @_Z3powff(float, float)
 define amdgpu_kernel void @test_pow_0i(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -208,7 +208,7 @@ define amdgpu_kernel void @test_pow_1f(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -220,7 +220,7 @@ define amdgpu_kernel void @test_pow_1i(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -231,7 +231,7 @@ entry:
 define amdgpu_kernel void @test_pow_2f(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -242,7 +242,7 @@ entry:
 define amdgpu_kernel void @test_pow_2i(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -254,7 +254,7 @@ define amdgpu_kernel void @test_pow_m1f(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -266,31 +266,31 @@ define amdgpu_kernel void @test_pow_m1i(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
-; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
-; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 5.000000e-01)
+; GCN-PRELINK: %__pow2sqrt = call fast float @_Z4sqrtf(float %tmp)
 define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
+  %call = call fast float @_Z3powff(float %tmp, float 5.000000e-01)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
-; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
-; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float -5.000000e-01)
+; GCN-PRELINK: %__pow2rsqrt = call fast float @_Z5rsqrtf(float %tmp)
 define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
+  %call = call fast float @_Z3powff(float %tmp, float -5.000000e-01)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -305,7 +305,7 @@ define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 1.100000e+01)
+  %call = call fast float @_Z3powff(float %tmp, float 1.100000e+01)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -320,7 +320,7 @@ define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
+  %call = call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -337,7 +337,7 @@ define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z4pownfi(float %tmp, i32 11)
+  %call = call fast float @_Z4pownfi(float %tmp, i32 11)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -345,11 +345,11 @@ entry:
 declare float @_Z4pownfi(float, i32)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow
-; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
-; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
+; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 1.013000e+03)
+; GCN-PRELINK: %__fabs = call fast float @_Z4fabsf(float %tmp)
+; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %__fabs)
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648
 ; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
@@ -359,39 +359,39 @@ declare float @_Z4pownfi(float, i32)
 define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
+  %call = call fast float @_Z3powff(float %tmp, float 1.013000e+03)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr
-; GCN-POSTLINK: tail call fast float @_Z4powrff(float %tmp, float %tmp1)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
+; GCN-POSTLINK: call fast float @_Z4powrff(float %tmp, float %tmp1)
+; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %tmp)
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
-; GCN-NATIVE:  %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE:  %__log2 = call fast float @_Z11native_log2f(float %tmp)
 ; GCN-NATIVE:  %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-NATIVE:  %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
+; GCN-NATIVE:  %__exp2 = call fast float @_Z11native_exp2f(float %__ylogx)
 ; GCN-NATIVE:  store float %__exp2, float addrspace(1)* %a, align 4
 define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
-  %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
+  %call = call fast float @_Z4powrff(float %tmp, float %tmp1)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown
-; GCN-POSTLINK: tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
+; GCN-POSTLINK: call fast float @_Z4pownfi(float %tmp, i32 %conv)
 ; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
-; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
+; GCN-PRELINK: %__fabs = call fast float @_Z4fabsf(float %tmp)
+; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %__fabs)
 ; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: %__yeven = shl i32 %conv, 31
 ; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]]
@@ -405,7 +405,7 @@ entry:
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
   %conv = fptosi float %tmp1 to i32
-  %call = tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
+  %call = call fast float @_Z4pownfi(float %tmp, i32 %conv)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -417,7 +417,7 @@ define amdgpu_kernel void @test_rootn_1(float addrspace(1)* nocapture %a) {
 entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 1)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 1)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -425,23 +425,23 @@ entry:
 declare float @_Z5rootnfi(float, i32)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
-; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 2)
-; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2)
+; GCN-PRELINK: %__rootn2sqrt = call fast float @_Z4sqrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 2)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 2)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
-; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 3)
-; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 3)
+; GCN-PRELINK: %__rootn2cbrt = call fast float @_Z4cbrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 3)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 3)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -451,18 +451,18 @@ entry:
 define amdgpu_kernel void @test_rootn_m1(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -1)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 -1)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
-; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
-; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2)
+; GCN-PRELINK: %__rootn2rsqrt = call fast float @_Z5rsqrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 -2)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -472,7 +472,7 @@ entry:
 define amdgpu_kernel void @test_fma_0x(float addrspace(1)* nocapture %a, float %y) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
+  %call = call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -484,7 +484,7 @@ declare float @_Z3fmafff(float, float, float)
 define amdgpu_kernel void @test_fma_x0(float addrspace(1)* nocapture %a, float %y) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
+  %call = call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -494,7 +494,7 @@ entry:
 define amdgpu_kernel void @test_mad_0x(float addrspace(1)* nocapture %a, float %y) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
+  %call = call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -506,7 +506,7 @@ declare float @_Z3madfff(float, float, float)
 define amdgpu_kernel void @test_mad_x0(float addrspace(1)* nocapture %a, float %y) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
+  %call = call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -516,7 +516,7 @@ entry:
 define amdgpu_kernel void @test_fma_x1y(float addrspace(1)* nocapture %a, float %y) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
+  %call = call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -526,7 +526,7 @@ entry:
 define amdgpu_kernel void @test_fma_1xy(float addrspace(1)* nocapture %a, float %y) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
+  %call = call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -538,17 +538,17 @@ entry:
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp = load float, float addrspace(1)* %arrayidx, align 4
   %tmp1 = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
+  %call = call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp
-; GCN-NATIVE: tail call fast float @_Z10native_expf(float %tmp)
+; GCN-NATIVE: call fast float @_Z10native_expf(float %tmp)
 define amdgpu_kernel void @test_use_native_exp(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3expf(float %tmp)
+  %call = call fast float @_Z3expf(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -556,11 +556,11 @@ entry:
 declare float @_Z3expf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
-; GCN-NATIVE: tail call fast float @_Z11native_exp2f(float %tmp)
+; GCN-NATIVE: call fast float @_Z11native_exp2f(float %tmp)
 define amdgpu_kernel void @test_use_native_exp2(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z4exp2f(float %tmp)
+  %call = call fast float @_Z4exp2f(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -568,11 +568,11 @@ entry:
 declare float @_Z4exp2f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
-; GCN-NATIVE: tail call fast float @_Z12native_exp10f(float %tmp)
+; GCN-NATIVE: call fast float @_Z12native_exp10f(float %tmp)
 define amdgpu_kernel void @test_use_native_exp10(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5exp10f(float %tmp)
+  %call = call fast float @_Z5exp10f(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -580,11 +580,11 @@ entry:
 declare float @_Z5exp10f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log
-; GCN-NATIVE: tail call fast float @_Z10native_logf(float %tmp)
+; GCN-NATIVE: call fast float @_Z10native_logf(float %tmp)
 define amdgpu_kernel void @test_use_native_log(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3logf(float %tmp)
+  %call = call fast float @_Z3logf(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -592,11 +592,11 @@ entry:
 declare float @_Z3logf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
-; GCN-NATIVE: tail call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE: call fast float @_Z11native_log2f(float %tmp)
 define amdgpu_kernel void @test_use_native_log2(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z4log2f(float %tmp)
+  %call = call fast float @_Z4log2f(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -604,11 +604,11 @@ entry:
 declare float @_Z4log2f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
-; GCN-NATIVE: tail call fast float @_Z12native_log10f(float %tmp)
+; GCN-NATIVE: call fast float @_Z12native_log10f(float %tmp)
 define amdgpu_kernel void @test_use_native_log10(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5log10f(float %tmp)
+  %call = call fast float @_Z5log10f(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -617,36 +617,36 @@ declare float @_Z5log10f(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
 ; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
-; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE: %__log2 = call fast float @_Z11native_log2f(float %tmp)
 ; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
+; GCN-NATIVE: %__exp2 = call fast float @_Z11native_exp2f(float %__ylogx)
 ; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
 define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
-  %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
+  %call = call fast float @_Z4powrff(float %tmp, float %tmp1)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
-; GCN-NATIVE: tail call fast float @_Z11native_sqrtf(float %tmp)
+; GCN-NATIVE: call fast float @_Z11native_sqrtf(float %tmp)
 define amdgpu_kernel void @test_use_native_sqrt(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z4sqrtf(float %tmp)
+  %call = call fast float @_Z4sqrtf(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64
-; GCN: tail call fast double @_Z4sqrtd(double %tmp)
+; GCN: call fast double @_Z4sqrtd(double %tmp)
 define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64(double addrspace(1)* nocapture %a) {
 entry:
   %tmp = load double, double addrspace(1)* %a, align 8
-  %call = tail call fast double @_Z4sqrtd(double %tmp)
+  %call = call fast double @_Z4sqrtd(double %tmp)
   store double %call, double addrspace(1)* %a, align 8
   ret void
 }
@@ -655,11 +655,11 @@ declare float @_Z4sqrtf(float)
 declare double @_Z4sqrtd(double)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
-; GCN-NATIVE: tail call fast float @_Z12native_rsqrtf(float %tmp)
+; GCN-NATIVE: call fast float @_Z12native_rsqrtf(float %tmp)
 define amdgpu_kernel void @test_use_native_rsqrt(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rsqrtf(float %tmp)
+  %call = call fast float @_Z5rsqrtf(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -667,11 +667,11 @@ entry:
 declare float @_Z5rsqrtf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
-; GCN-NATIVE: tail call fast float @_Z10native_tanf(float %tmp)
+; GCN-NATIVE: call fast float @_Z10native_tanf(float %tmp)
 define amdgpu_kernel void @test_use_native_tan(float addrspace(1)* nocapture %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3tanf(float %tmp)
+  %call = call fast float @_Z3tanf(float %tmp)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -679,14 +679,14 @@ entry:
 declare float @_Z3tanf(float)
 
 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
-; GCN-NATIVE: tail call float @_Z10native_sinf(float %tmp)
-; GCN-NATIVE: tail call float @_Z10native_cosf(float %tmp)
+; GCN-NATIVE: call float @_Z10native_sinf(float %tmp)
+; GCN-NATIVE: call float @_Z10native_cosf(float %tmp)
 define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
 entry:
   %tmp = load float, float addrspace(1)* %a, align 4
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float*
-  %call = tail call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
+  %call = call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
   store float %call, float addrspace(1)* %a, align 4
   ret void
 }
@@ -703,10 +703,10 @@ define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 a
 entry:
   %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
   %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
-  %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
-  %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
-  %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
-  tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
+  %tmp2 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+  %tmp3 = call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
+  %tmp4 = call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+  call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
   ret void
 }
 
@@ -725,10 +725,10 @@ define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32
 entry:
   %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
   %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
-  %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
-  %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
-  %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
-  tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
+  %tmp2 = call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+  %tmp3 = call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
+  %tmp4 = call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+  call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
   ret void
 }
 
@@ -755,31 +755,31 @@ declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_
 define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
 entry:
   %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8*
-  %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
+  %tmp1 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
   %tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
   %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8*
-  %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
+  %tmp4 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
   %tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
   %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8*
-  %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
+  %tmp7 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
   %tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
   %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8*
-  %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
+  %tmp10 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
   %tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
   %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8*
-  %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
+  %tmp13 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
   %tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
   %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8*
-  %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
+  %tmp16 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
   %tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
   %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8*
-  %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
+  %tmp19 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
   %tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
   %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8*
-  %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
+  %tmp22 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
   %tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
   %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8*
-  %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
+  %tmp25 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
   ret void
 }
 

diff --git a/llvm/test/Feature/optnone-opt.ll b/llvm/test/Feature/optnone-opt.ll
index ae0e1a48acc5..f706ade7934f 100644
--- a/llvm/test/Feature/optnone-opt.ll
+++ b/llvm/test/Feature/optnone-opt.ll
@@ -39,16 +39,10 @@ attributes #0 = { optnone noinline }
 ; IR passes run at -O1 and higher.
 ; OPT-O1-DAG: Skipping pass 'Aggressive Dead Code Elimination'
 ; OPT-O1-DAG: Skipping pass 'Combine redundant instructions'
-; OPT-O1-DAG: Skipping pass 'Dead Store Elimination'
 ; OPT-O1-DAG: Skipping pass 'Early CSE'
-; OPT-O1-DAG: Skipping pass 'Jump Threading'
-; OPT-O1-DAG: Skipping pass 'MemCpy Optimization'
 ; OPT-O1-DAG: Skipping pass 'Reassociate expressions'
 ; OPT-O1-DAG: Skipping pass 'Simplify the CFG'
 ; OPT-O1-DAG: Skipping pass 'Sparse Conditional Constant Propagation'
-; OPT-O1-DAG: Skipping pass 'SROA'
-; OPT-O1-DAG: Skipping pass 'Tail Call Elimination'
-; OPT-O1-DAG: Skipping pass 'Value Propagation'
 
 ; Additional IR passes run at -O2 and higher.
 ; OPT-O2O3-DAG: Skipping pass 'Global Value Numbering'

diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 009f19e544c8..e79a359277f6 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -12,66 +12,70 @@
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O1
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='default<O2>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2 \
+; RUN:     --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN:     --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='default<Os>' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-Os
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-Os \
+; RUN:     --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='default<Oz>' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-Oz
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-Oz \
+; RUN:     --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='lto-pre-link<O2>' -S %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2 \
-; RUN:     --check-prefix=CHECK-O2-LTO
+; RUN:     --check-prefix=CHECK-O2-LTO --check-prefix=CHECK-O23SZ
 
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-peephole='no-op-function' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-PEEPHOLE
+; RUN:     --check-prefix=CHECK-EP-PEEPHOLE --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-late-loop-optimizations='no-op-loop' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-LOOP-LATE
+; RUN:     --check-prefix=CHECK-EP-LOOP-LATE --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-loop-optimizer-end='no-op-loop' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-LOOP-END
+; RUN:     --check-prefix=CHECK-EP-LOOP-END --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-scalar-optimizer-late='no-op-function' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-SCALAR-LATE
+; RUN:     --check-prefix=CHECK-EP-SCALAR-LATE --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-cgscc-optimizer-late='no-op-cgscc' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-CGSCC-LATE
+; RUN:     --check-prefix=CHECK-EP-CGSCC-LATE --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-vectorizer-start='no-op-function' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-VECTORIZER-START
+; RUN:     --check-prefix=CHECK-EP-VECTORIZER-START --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-pipeline-start='no-op-module' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-PIPELINE-START
+; RUN:     --check-prefix=CHECK-EP-PIPELINE-START --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-pipeline-start='no-op-module' \
 ; RUN:     -passes='lto-pre-link<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-PIPELINE-START
+; RUN:     --check-prefix=CHECK-EP-PIPELINE-START --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-optimizer-last='no-op-function' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-OPTIMIZER-LAST
+; RUN:     --check-prefix=CHECK-EP-OPTIMIZER-LAST --check-prefix=CHECK-O23SZ
 
 ; CHECK-O: Running analysis: PassInstrumentationAnalysis
 ; CHECK-O-NEXT: Starting llvm::Module pass manager run.
@@ -132,10 +136,10 @@
 ; CHECK-O-NEXT: Running pass: SROA
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running pass: SpeculativeExecutionPass
-; CHECK-O-NEXT: Running pass: JumpThreadingPass
-; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
-; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
+; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
+; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O3-NEXT: AggressiveInstCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
@@ -143,7 +147,7 @@
 ; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
-; CHECK-O-NEXT: Running pass: TailCallElimPass
+; CHECK-O23SZ-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: ReassociatePass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
@@ -180,22 +184,10 @@
 ; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
 ; CHECK-O-NEXT: Finished Loop pass manager run.
 ; CHECK-O-NEXT: Running pass: SROA on foo
-; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
-; CHECK-Os-NEXT: Running pass: GVN
-; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-Os-NEXT: Running analysis: PhiValuesAnalysis
-; CHECK-Oz-NEXT: Running pass: MergedLoadStoreMotionPass
-; CHECK-Oz-NEXT: Running pass: GVN
-; CHECK-Oz-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-Oz-NEXT: Running analysis: PhiValuesAnalysis
-; CHECK-O2-NEXT: Running pass: MergedLoadStoreMotionPass
-; CHECK-O2-NEXT: Running pass: GVN
-; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-O2-NEXT: Running analysis: PhiValuesAnalysis
-; CHECK-O3-NEXT: Running pass: MergedLoadStoreMotionPass
-; CHECK-O3-NEXT: Running pass: GVN
-; CHECK-O3-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-O3-NEXT: Running analysis: PhiValuesAnalysis
+; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-O23SZ-NEXT: Running pass: GVN
+; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PhiValuesAnalysis
 ; CHECK-O-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O1-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running analysis: PhiValuesAnalysis
@@ -204,14 +196,14 @@
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
-; CHECK-O-NEXT: Running pass: JumpThreadingPass
-; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
-; CHECK-O-NEXT: Running pass: DSEPass
-; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
-; CHECK-O-NEXT: Starting llvm::Function pass manager run.
-; CHECK-O-NEXT: Running pass: LoopSimplifyPass
-; CHECK-O-NEXT: Running pass: LCSSAPass
-; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
+; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O23SZ-NEXT: Running pass: DSEPass
+; CHECK-O23SZ-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
+; CHECK-O23SZ-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
+; CHECK-O23SZ-NEXT: Finished llvm::Function pass manager run.
 ; CHECK-EP-SCALAR-LATE-NEXT: Running pass: NoOpFunctionPass
 ; CHECK-O-NEXT: Running pass: ADCEPass
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis

diff --git a/llvm/test/Other/new-pm-thinlto-defaults.ll b/llvm/test/Other/new-pm-thinlto-defaults.ll
index a0b4df044450..c93b360009b2 100644
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@@ -13,19 +13,19 @@
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O1
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto-pre-link<O2>,name-anon-globals' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O2
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O2
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto-pre-link<O3>,name-anon-globals' -S -passes-ep-pipeline-start='no-op-module' %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O3,CHECK-EP-PIPELINE-START
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O3,CHECK-EP-PIPELINE-START
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto-pre-link<Os>,name-anon-globals' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-Os
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-Os
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto-pre-link<Oz>,name-anon-globals' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-Oz
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-Oz
 ; RUN: opt -disable-verify -debug-pass-manager -new-pm-debug-info-for-profiling \
 ; RUN:     -passes='thinlto-pre-link<O2>,name-anon-globals' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-DIS,CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O2
+; RUN:     | FileCheck %s --check-prefixes=CHECK-DIS,CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O2
 ;
 ; Postlink pipelines:
 ; RUN: opt -disable-verify -debug-pass-manager \
@@ -33,19 +33,19 @@
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-POSTLINK-O,CHECK-POSTLINK-O1
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto<O2>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
 ; RUN: opt -disable-verify -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \
 ; RUN:     -passes='thinlto<O3>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-POSTLINK-O,CHECK-POSTLINK-O3
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-O3
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto<Os>' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-POSTLINK-O,CHECK-POSTLINK-Os
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-Os
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto<Oz>' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-POSTLINK-O,CHECK-POSTLINK-Oz
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-Oz
 ; RUN: opt -disable-verify -debug-pass-manager -new-pm-debug-info-for-profiling \
 ; RUN:     -passes='thinlto<O2>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
 ;
 ; CHECK-O: Running analysis: PassInstrumentationAnalysis
 ; CHECK-O-NEXT: Starting llvm::Module pass manager run.
@@ -112,17 +112,17 @@
 ; CHECK-O-NEXT: Running pass: SROA
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running pass: SpeculativeExecutionPass
-; CHECK-O-NEXT: Running pass: JumpThreadingPass
-; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
-; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
+; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
+; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O3-NEXT: Running pass: AggressiveInstCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass
-; CHECK-O-NEXT: Running pass: TailCallElimPass
+; CHECK-O23SZ-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: ReassociatePass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
@@ -180,14 +180,14 @@
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O-NEXT: Running pass: JumpThreadingPass
-; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
-; CHECK-O-NEXT: Running pass: DSEPass
-; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
-; CHECK-O-NEXT: Starting llvm::Function pass manager run
-; CHECK-O-NEXT: Running pass: LoopSimplifyPass
-; CHECK-O-NEXT: Running pass: LCSSAPass
-; CHECK-O-NEXT: Finished llvm::Function pass manager run
+; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
+; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O23SZ-NEXT: Running pass: DSEPass
+; CHECK-O23SZ-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
+; CHECK-O23SZ-NEXT: Starting llvm::Function pass manager run
+; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
+; CHECK-O23SZ-NEXT: Finished llvm::Function pass manager run
 ; CHECK-O-NEXT: Running pass: ADCEPass
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass

diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
index 9ddf3f4f9c29..ad14bdd6df66 100644
--- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -O1 -S | FileCheck %s
+; RUN: opt < %s -O2 -S | FileCheck %s
 
 ; performCallSlotOptzn in MemCpy should not exchange the calls to
 ; @llvm.lifetime.start and @llvm.memcpy.

diff --git a/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll b/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
index 693462346357..6b3ba66c951e 100644
--- a/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
+++ b/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
@@ -7,7 +7,7 @@
 
 define i1 @PR33605(i32 %a, i32 %b, i32* %c) {
 ; ALL-LABEL: @PR33605(
-; ALL-NEXT:  for.body:
+; ALL-NEXT:  entry:
 ; ALL-NEXT:    [[OR:%.*]] = or i32 [[B:%.*]], [[A:%.*]]
 ; ALL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
 ; ALL-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
@@ -15,16 +15,16 @@ define i1 @PR33605(i32 %a, i32 %b, i32* %c) {
 ; ALL-NEXT:    br i1 [[CMP]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; ALL:       if.then:
 ; ALL-NEXT:    store i32 [[OR]], i32* [[ARRAYIDX]], align 4
-; ALL-NEXT:    tail call void @foo()
+; ALL-NEXT:    call void @foo()
 ; ALL-NEXT:    br label [[IF_END]]
 ; ALL:       if.end:
-; ALL-NEXT:    [[CHANGED_1_OFF0:%.*]] = phi i1 [ true, [[IF_THEN]] ], [ false, [[FOR_BODY:%.*]] ]
+; ALL-NEXT:    [[CHANGED_1_OFF0:%.*]] = phi i1 [ true, [[IF_THEN]] ], [ false, [[ENTRY:%.*]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C]], align 4
 ; ALL-NEXT:    [[CMP_1:%.*]] = icmp eq i32 [[OR]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[CMP_1]], label [[IF_END_1:%.*]], label [[IF_THEN_1:%.*]]
 ; ALL:       if.then.1:
 ; ALL-NEXT:    store i32 [[OR]], i32* [[C]], align 4
-; ALL-NEXT:    tail call void @foo()
+; ALL-NEXT:    call void @foo()
 ; ALL-NEXT:    br label [[IF_END_1]]
 ; ALL:       if.end.1:
 ; ALL-NEXT:    [[CHANGED_1_OFF0_1:%.*]] = phi i1 [ true, [[IF_THEN_1]] ], [ [[CHANGED_1_OFF0]], [[IF_END]] ]

diff --git a/llvm/test/Transforms/PhaseOrdering/two-shifts-by-sext.ll b/llvm/test/Transforms/PhaseOrdering/two-shifts-by-sext.ll
index 4d4a30e00eaf..82f5cfbc9d51 100644
--- a/llvm/test/Transforms/PhaseOrdering/two-shifts-by-sext.ll
+++ b/llvm/test/Transforms/PhaseOrdering/two-shifts-by-sext.ll
@@ -74,7 +74,7 @@ define i32 @two_shifts_by_same_sext(i32 %val, i8 signext %len) {
 define i32 @two_shifts_by_sext_with_extra_use(i32 %val, i8 signext %len) {
 ; CHECK-LABEL: @two_shifts_by_sext_with_extra_use(
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[LEN:%.*]] to i32
-; CHECK-NEXT:    tail call void @use_int32(i32 [[CONV]])
+; CHECK-NEXT:    call void @use_int32(i32 [[CONV]])
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[VAL:%.*]], [[CONV]]
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[SHL]], [[CONV]]
 ; CHECK-NEXT:    ret i32 [[SHR]]
@@ -101,7 +101,7 @@ declare void @use_int32(i32)
 define i32 @two_shifts_by_same_sext_with_extra_use(i32 %val, i8 signext %len) {
 ; CHECK-LABEL: @two_shifts_by_same_sext_with_extra_use(
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[LEN:%.*]] to i32
-; CHECK-NEXT:    tail call void @use_int32(i32 [[CONV]])
+; CHECK-NEXT:    call void @use_int32(i32 [[CONV]])
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[VAL:%.*]], [[CONV]]
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[SHL]], [[CONV]]
 ; CHECK-NEXT:    ret i32 [[SHR]]
