[llvm] ed50e60 - [PGO][PGSO] Enable size optimizations in code gen / target passes for cold code.

Hiroshi Yamauchi via llvm-commits llvm-commits@lists.llvm.org
Fri Dec 13 11:01:45 PST 2019


Author: Hiroshi Yamauchi
Date: 2019-12-13T11:01:19-08:00
New Revision: ed50e6060b1c51ec4a5dad6c01a64a5f1526cdb5

URL: https://github.com/llvm/llvm-project/commit/ed50e6060b1c51ec4a5dad6c01a64a5f1526cdb5
DIFF: https://github.com/llvm/llvm-project/commit/ed50e6060b1c51ec4a5dad6c01a64a5f1526cdb5.diff

LOG: [PGO][PGSO] Enable size optimizations in code gen / target passes for cold code.

Summary: Split off of D67120.

Reviewers: davidxl

Subscribers: hiraditya, asb, rbar, johnrusso, simoncook, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, lenary, s.egerton, pzheng, sameer.abuasal, apazos, luismarques, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71288
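
PGSO (profile-guided size optimization) treats functions and blocks that a
profile shows to be cold as if they were attributed optsize. Machine-level
passes query this through llvm::shouldOptimizeForSize() from
llvm/CodeGen/MachineSizeOpts.h; the sketch below is a minimal illustration of
such a call site (the helper name and surrounding code are invented for this
sketch, not taken from the patch):

    // Minimal sketch of a PGSO query from a code gen / target pass.
    // shouldOptimizeForSize() is the real interface; preferSmallCode() is
    // a hypothetical caller.
    #include "llvm/Analysis/ProfileSummaryInfo.h"
    #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineSizeOpts.h"

    static bool preferSmallCode(const llvm::MachineFunction &MF,
                                llvm::ProfileSummaryInfo *PSI,
                                const llvm::MachineBlockFrequencyInfo *MBFI) {
      // Static size attributes apply whether or not a profile is present.
      if (MF.getFunction().hasOptSize())
        return true;
      // PGSO: true when the profile says the function is cold. Machine-level
      // callers issue PGSOQueryType::Other queries, which this commit stops
      // filtering out by default.
      return llvm::shouldOptimizeForSize(&MF, PSI, MBFI);
    }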

Added: 
    llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
    llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
    llvm/test/CodeGen/X86/memcmp-pgso.ll
    llvm/test/CodeGen/X86/shrink-compare-pgso.ll

Modified: 
    llvm/lib/Transforms/Utils/SizeOpts.cpp
    llvm/test/CodeGen/AArch64/max-jump-table.ll
    llvm/test/CodeGen/ARM/constantpool-align.ll
    llvm/test/CodeGen/RISCV/tail-calls.ll
    llvm/test/CodeGen/X86/atom-pad-short-functions.ll
    llvm/test/CodeGen/X86/avx-cvt.ll
    llvm/test/CodeGen/X86/avx512-mask-op.ll
    llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
    llvm/test/CodeGen/X86/cmov-into-branch.ll
    llvm/test/CodeGen/X86/fixup-lea.ll
    llvm/test/CodeGen/X86/fold-load-unops.ll
    llvm/test/CodeGen/X86/fshl.ll
    llvm/test/CodeGen/X86/fshr.ll
    llvm/test/CodeGen/X86/haddsub.ll
    llvm/test/CodeGen/X86/immediate_merging.ll
    llvm/test/CodeGen/X86/immediate_merging64.ll
    llvm/test/CodeGen/X86/loop-blocks.ll
    llvm/test/CodeGen/X86/materialize.ll
    llvm/test/CodeGen/X86/memcpy.ll
    llvm/test/CodeGen/X86/powi.ll
    llvm/test/CodeGen/X86/rounding-ops.ll
    llvm/test/CodeGen/X86/slow-incdec.ll
    llvm/test/CodeGen/X86/splat-for-size.ll
    llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
    llvm/test/CodeGen/X86/switch-density.ll
    llvm/test/CodeGen/X86/tail-opts.ll
    llvm/test/CodeGen/X86/test-vs-bittest.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
    llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
    llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
    llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll
    llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp
index cab375225e89..d2a400027d4b 100644
--- a/llvm/lib/Transforms/Utils/SizeOpts.cpp
+++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp
@@ -29,7 +29,7 @@ cl::opt<bool> PGSOColdCodeOnly(
              "to cold code."));
 
 cl::opt<bool> PGSOIRPassOrTestOnly(
-    "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(true),
+    "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(false),
     cl::desc("Apply the profile guided size optimizations only"
              "to the IR passes or tests."));
 

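The flipped default above is the entire functional change: when
pgso-ir-pass-or-test-only is true, shouldOptimizeForSize() rejects every query
that does not come from an IR pass or from a test, so code gen / target passes
(which query with PGSOQueryType::Other) never saw PGSO. A minimal sketch of the
gate, assuming the surrounding SizeOpts.cpp logic (not the verbatim source):

    // Sketch of the early-out that PGSOIRPassOrTestOnly gates inside
    // shouldOptimizeForSize(); with the new default of false it no longer
    // fires for code gen / target passes.
    if (PGSOIRPassOrTestOnly && !(QueryType == PGSOQueryType::IRPass ||
                                  QueryType == PGSOQueryType::Test))
      return false;

With the default now false, those queries fall through to the profile coldness
check, which is why the new *-pgso.ll tests below (functions marked cold via
!prof metadata carrying a function_entry_count of 0) match their optsize
counterparts. The old behavior remains reachable through the hidden
-pgso-ir-pass-or-test-only flag.
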
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
new file mode 100644
index 000000000000..086592bf1321
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero-pgso.ll
@@ -0,0 +1,128 @@
+; RUN: llc %s -enable-machine-outliner=never -mtriple=arm64-linux-gnu -o - | \
+; RUN:   FileCheck --check-prefixes=CHECK,CHECK-LINUX %s
+; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
+
+; CHECK-LABEL: fct1:
+; For small size (<= 256), we do not change memset to bzero.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct1(i8* nocapture %ptr) !prof !14 {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+; CHECK-LABEL: fct2:
+; When the size is bigger than 256, change into bzero.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct2(i8* nocapture %ptr) !prof !14 {
+entry:
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: fct3:
+; For unknown size, change to bzero.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct3(i8* nocapture %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: fct4:
+; Size <= 256, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct4(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
+  ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
+
+declare i64 @llvm.objectsize.i64(i8*, i1)
+
+; CHECK-LABEL: fct5:
+; Size > 256, change.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct5(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct6:
+; Size = unknown, change.
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct6(i8* %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 %conv, i64 %tmp)
+  ret void
+}
+
+; Next functions check that memset is not turned into bzero
+; when the set constant is non-zero, whatever the given size.
+
+; CHECK-LABEL: fct7:
+; memset with something that is not a zero, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct7(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct8:
+; memset with something that is not a zero, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct8(i8* %ptr) !prof !14 {
+entry:
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp)
+  ret void
+}
+
+; CHECK-LABEL: fct9:
+; memset with something that is not a zero, no change.
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
+define void @fct9(i8* %ptr, i32 %unknown) !prof !14 {
+entry:
+  %conv = sext i32 %unknown to i64
+  %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 %conv, i64 %tmp)
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/AArch64/max-jump-table.ll b/llvm/test/CodeGen/AArch64/max-jump-table.ll
index 2a32ba96b920..2d0dbbf241fc 100644
--- a/llvm/test/CodeGen/AArch64/max-jump-table.ll
+++ b/llvm/test/CodeGen/AArch64/max-jump-table.ll
@@ -203,3 +203,136 @@ default: unreachable
 
 return: ret void
 }
+
+define i32 @jt1_optsize(i32 %a, i32 %b) optsize {
+entry:
+  switch i32 %a, label %return [
+    i32 1,  label %bb1
+    i32 2,  label %bb2
+    i32 3,  label %bb3
+    i32 4,  label %bb4
+    i32 5,  label %bb5
+    i32 6,  label %bb6
+    i32 7,  label %bb7
+    i32 8,  label %bb8
+    i32 9,  label %bb9
+    i32 10, label %bb10
+    i32 11, label %bb11
+    i32 12, label %bb12
+    i32 13, label %bb13
+    i32 14, label %bb14
+    i32 15, label %bb15
+    i32 16, label %bb16
+    i32 17, label %bb17
+  ]
+; CHECK-LABEL: function jt1_optsize:
+; CHECK-NEXT: Jump Tables:
+; CHECK0-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK0-NOT:   %jump-table.1:
+; CHECK4-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK4-NOT:   %jump-table.1:
+; CHECK8-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK8-NOT:   %jump-table.1:
+; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK16-NOT:  %jump-table.1:
+; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM1-NOT:  %jump-table.1:
+; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM3-NOT:  %jump-table.1:
+; CHECK-DAG: End machine code for function jt1_optsize.
+
+bb1:  tail call void @ext(i32 1, i32 0)  br label %return
+bb2:  tail call void @ext(i32 2, i32 2)  br label %return
+bb3:  tail call void @ext(i32 3, i32 4)  br label %return
+bb4:  tail call void @ext(i32 4, i32 6)  br label %return
+bb5:  tail call void @ext(i32 5, i32 8)  br label %return
+bb6:  tail call void @ext(i32 6, i32 10) br label %return
+bb7:  tail call void @ext(i32 7, i32 12) br label %return
+bb8:  tail call void @ext(i32 8, i32 14) br label %return
+bb9:  tail call void @ext(i32 9, i32 16) br label %return
+bb10: tail call void @ext(i32 1, i32 18) br label %return
+bb11: tail call void @ext(i32 2, i32 20) br label %return
+bb12: tail call void @ext(i32 3, i32 22) br label %return
+bb13: tail call void @ext(i32 4, i32 24) br label %return
+bb14: tail call void @ext(i32 5, i32 26) br label %return
+bb15: tail call void @ext(i32 6, i32 28) br label %return
+bb16: tail call void @ext(i32 7, i32 30) br label %return
+bb17: tail call void @ext(i32 8, i32 32) br label %return
+
+return: ret i32 %b
+}
+
+define i32 @jt1_pgso(i32 %a, i32 %b) !prof !14 {
+entry:
+  switch i32 %a, label %return [
+    i32 1,  label %bb1
+    i32 2,  label %bb2
+    i32 3,  label %bb3
+    i32 4,  label %bb4
+    i32 5,  label %bb5
+    i32 6,  label %bb6
+    i32 7,  label %bb7
+    i32 8,  label %bb8
+    i32 9,  label %bb9
+    i32 10, label %bb10
+    i32 11, label %bb11
+    i32 12, label %bb12
+    i32 13, label %bb13
+    i32 14, label %bb14
+    i32 15, label %bb15
+    i32 16, label %bb16
+    i32 17, label %bb17
+  ]
+; CHECK-LABEL: function jt1_pgso:
+; CHECK-NEXT: Jump Tables:
+; CHECK0-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK0-NOT:   %jump-table.1:
+; CHECK4-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK4-NOT:   %jump-table.1:
+; CHECK8-NEXT:  %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK8-NOT:   %jump-table.1:
+; CHECK16-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECK16-NOT:  %jump-table.1:
+; CHECKM1-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM1-NOT:  %jump-table.1:
+; CHECKM3-NEXT: %jump-table.0: %bb.1 %bb.2 %bb.3 %bb.4 %bb.5 %bb.6 %bb.7 %bb.8 %bb.9 %bb.10 %bb.11 %bb.12 %bb.13 %bb.14 %bb.15 %bb.16 %bb.17
+; CHECKM3-NOT:  %jump-table.1:
+; CHECK-DAG: End machine code for function jt1_pgso.
+
+bb1:  tail call void @ext(i32 1, i32 0)  br label %return
+bb2:  tail call void @ext(i32 2, i32 2)  br label %return
+bb3:  tail call void @ext(i32 3, i32 4)  br label %return
+bb4:  tail call void @ext(i32 4, i32 6)  br label %return
+bb5:  tail call void @ext(i32 5, i32 8)  br label %return
+bb6:  tail call void @ext(i32 6, i32 10) br label %return
+bb7:  tail call void @ext(i32 7, i32 12) br label %return
+bb8:  tail call void @ext(i32 8, i32 14) br label %return
+bb9:  tail call void @ext(i32 9, i32 16) br label %return
+bb10: tail call void @ext(i32 1, i32 18) br label %return
+bb11: tail call void @ext(i32 2, i32 20) br label %return
+bb12: tail call void @ext(i32 3, i32 22) br label %return
+bb13: tail call void @ext(i32 4, i32 24) br label %return
+bb14: tail call void @ext(i32 5, i32 26) br label %return
+bb15: tail call void @ext(i32 6, i32 28) br label %return
+bb16: tail call void @ext(i32 7, i32 30) br label %return
+bb17: tail call void @ext(i32 8, i32 32) br label %return
+
+return: ret i32 %b
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/ARM/constantpool-align.ll b/llvm/test/CodeGen/ARM/constantpool-align.ll
index 1815b87469b6..e8891ed31e20 100644
--- a/llvm/test/CodeGen/ARM/constantpool-align.ll
+++ b/llvm/test/CodeGen/ARM/constantpool-align.ll
@@ -17,3 +17,28 @@ define void @f_optsize(<4 x i32>* %p) optsize {
   store <4 x i32> <i32 -1, i32 0, i32 0, i32 -1>, <4 x i32>* %p, align 4
   ret void 
 }
+
+; CHECK-LABEL: f_pgso:
+; CHECK: vld1.64 {{.*}}, [r1]
+; CHECK: .p2align 3
+define void @f_pgso(<4 x i32>* %p) !prof !14 {
+  store <4 x i32> <i32 -1, i32 0, i32 0, i32 -1>, <4 x i32>* %p, align 4
+  ret void 
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index 970f9ee31b32..30e530b51168 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -23,6 +23,17 @@ entry:
   ret void
 }
 
+; Perform tail call optimization for external symbol.
+@dest_pgso = global [2 x i8] zeroinitializer
+define void @caller_extern_pgso(i8* %src) !prof !14 {
+entry:
+; CHECK: caller_extern_pgso
+; CHECK-NOT: call memcpy
+; CHECK: tail memcpy
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @dest_pgso, i32 0, i32 0), i8* %src, i32 7, i1 false)
+  ret void
+}
+
 ; Perform indirect tail call optimization (for function pointer call).
 declare void @callee_indirect1()
 declare void @callee_indirect2()
@@ -146,3 +157,20 @@ entry:
   tail call void @callee_nostruct()
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll
index b9a39e08cb51..debf14e14157 100644
--- a/llvm/test/CodeGen/X86/atom-pad-short-functions.ll
+++ b/llvm/test/CodeGen/X86/atom-pad-short-functions.ll
@@ -29,6 +29,13 @@ define i32 @test_minsize(i32 %a) nounwind minsize {
   ret i32 %a
 }
 
+define i32 @test_pgso(i32 %a) nounwind !prof !14 {
+; CHECK: test_pgso
+; CHECK: movl
+; CHECK-NEXT: ret
+  ret i32 %a
+}
+
 define i32 @test_add(i32 %a, i32 %b) nounwind {
 ; CHECK: test_add
 ; CHECK: addl
@@ -101,3 +108,19 @@ while.end:
   ret void
 }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll
index 2057abf41c7f..653a88edd26a 100644
--- a/llvm/test/CodeGen/X86/avx-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx-cvt.ll
@@ -190,6 +190,16 @@ define float @floor_f32_load(float* %aptr) optsize {
   ret float %res
 }
 
+define float @floor_f32_load_pgso(float* %aptr) !prof !14 {
+; CHECK-LABEL: floor_f32_load_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vroundss $9, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a = load float, float* %aptr
+  %res = call float @llvm.floor.f32(float %a)
+  ret float %res
+}
+
 define double @nearbyint_f64_load(double* %aptr) optsize {
 ; CHECK-LABEL: nearbyint_f64_load:
 ; CHECK:       # %bb.0:
@@ -200,3 +210,29 @@ define double @nearbyint_f64_load(double* %aptr) optsize {
   ret double %res
 }
 
+define double @nearbyint_f64_load_pgso(double* %aptr) !prof !14 {
+; CHECK-LABEL: nearbyint_f64_load_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vroundsd $12, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a = load double, double* %aptr
+  %res = call double @llvm.nearbyint.f64(double %a)
+  ret double %res
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index e3febe4b38e3..ff0f1d34076d 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1970,6 +1970,47 @@ define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize {
   ret <32 x i16> %ret
 }
 
+define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 {
+; KNL-LABEL: test_build_vec_v32i1_pgso:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; KNL-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_build_vec_v32i1_pgso:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_build_vec_v32i1_pgso:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_build_vec_v32i1_pgso:
+; AVX512DQ:       ## %bb.0:
+; AVX512DQ-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; X86-LABEL: test_build_vec_v32i1_pgso:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
+  %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %ret
+}
+
 define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
 ; KNL-LABEL: test_build_vec_v64i1:
 ; KNL:       ## %bb.0:
@@ -2013,12 +2054,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
 ; KNL-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
-; KNL-NEXT:    je LBB43_2
+; KNL-NEXT:    je LBB44_2
 ; KNL-NEXT:  ## %bb.1: ## %L1
 ; KNL-NEXT:    vmovapd %zmm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB43_2: ## %L2
+; KNL-NEXT:  LBB44_2: ## %L2
 ; KNL-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -2029,12 +2070,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
 ; SKX-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z}
 ; SKX-NEXT:    vcmpltpd %zmm1, %zmm0, %k0
 ; SKX-NEXT:    ktestb %k0, %k1
-; SKX-NEXT:    je LBB43_2
+; SKX-NEXT:    je LBB44_2
 ; SKX-NEXT:  ## %bb.1: ## %L1
 ; SKX-NEXT:    vmovapd %zmm0, (%rdi)
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB43_2: ## %L2
+; SKX-NEXT:  LBB44_2: ## %L2
 ; SKX-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -2046,12 +2087,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
 ; AVX512BW-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testb %al, %al
-; AVX512BW-NEXT:    je LBB43_2
+; AVX512BW-NEXT:    je LBB44_2
 ; AVX512BW-NEXT:  ## %bb.1: ## %L1
 ; AVX512BW-NEXT:    vmovapd %zmm0, (%rdi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB43_2: ## %L2
+; AVX512BW-NEXT:  LBB44_2: ## %L2
 ; AVX512BW-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2062,12 +2103,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
 ; AVX512DQ-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z}
 ; AVX512DQ-NEXT:    vcmpltpd %zmm1, %zmm0, %k0
 ; AVX512DQ-NEXT:    ktestb %k0, %k1
-; AVX512DQ-NEXT:    je LBB43_2
+; AVX512DQ-NEXT:    je LBB44_2
 ; AVX512DQ-NEXT:  ## %bb.1: ## %L1
 ; AVX512DQ-NEXT:    vmovapd %zmm0, (%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB43_2: ## %L2
+; AVX512DQ-NEXT:  LBB44_2: ## %L2
 ; AVX512DQ-NEXT:    vmovapd %zmm0, 8(%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -2079,12 +2120,12 @@ define void @ktest_1(<8 x double> %in, double * %base) {
 ; X86-NEXT:    vmovupd 8(%eax), %zmm1 {%k1} {z}
 ; X86-NEXT:    vcmpltpd %zmm1, %zmm0, %k0
 ; X86-NEXT:    ktestb %k0, %k1
-; X86-NEXT:    je LBB43_2
+; X86-NEXT:    je LBB44_2
 ; X86-NEXT:  ## %bb.1: ## %L1
 ; X86-NEXT:    vmovapd %zmm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB43_2: ## %L2
+; X86-NEXT:  LBB44_2: ## %L2
 ; X86-NEXT:    vmovapd %zmm0, 8(%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -2131,13 +2172,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
-; KNL-NEXT:    je LBB44_2
+; KNL-NEXT:    je LBB45_2
 ; KNL-NEXT:  ## %bb.1: ## %L1
 ; KNL-NEXT:    vmovaps %zmm0, (%rdi)
 ; KNL-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB44_2: ## %L2
+; KNL-NEXT:  LBB45_2: ## %L2
 ; KNL-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; KNL-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -2154,13 +2195,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; SKX-NEXT:    vcmpltps %zmm2, %zmm1, %k2
 ; SKX-NEXT:    kunpckwd %k1, %k2, %k1
 ; SKX-NEXT:    kortestd %k1, %k0
-; SKX-NEXT:    je LBB44_2
+; SKX-NEXT:    je LBB45_2
 ; SKX-NEXT:  ## %bb.1: ## %L1
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi)
 ; SKX-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB44_2: ## %L2
+; SKX-NEXT:  LBB45_2: ## %L2
 ; SKX-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; SKX-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; SKX-NEXT:    vzeroupper
@@ -2177,13 +2218,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; AVX512BW-NEXT:    vcmpltps %zmm2, %zmm1, %k2
 ; AVX512BW-NEXT:    kunpckwd %k1, %k2, %k1
 ; AVX512BW-NEXT:    kortestd %k1, %k0
-; AVX512BW-NEXT:    je LBB44_2
+; AVX512BW-NEXT:    je LBB45_2
 ; AVX512BW-NEXT:  ## %bb.1: ## %L1
 ; AVX512BW-NEXT:    vmovaps %zmm0, (%rdi)
 ; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB44_2: ## %L2
+; AVX512BW-NEXT:  LBB45_2: ## %L2
 ; AVX512BW-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; AVX512BW-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; AVX512BW-NEXT:    vzeroupper
@@ -2203,13 +2244,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; AVX512DQ-NEXT:    kmovw %k0, %ecx
 ; AVX512DQ-NEXT:    shll $16, %ecx
 ; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    je LBB44_2
+; AVX512DQ-NEXT:    je LBB45_2
 ; AVX512DQ-NEXT:  ## %bb.1: ## %L1
 ; AVX512DQ-NEXT:    vmovaps %zmm0, (%rdi)
 ; AVX512DQ-NEXT:    vmovaps %zmm1, 64(%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB44_2: ## %L2
+; AVX512DQ-NEXT:  LBB45_2: ## %L2
 ; AVX512DQ-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; AVX512DQ-NEXT:    vmovaps %zmm1, 68(%rdi)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2227,13 +2268,13 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; X86-NEXT:    vcmpltps %zmm2, %zmm1, %k2
 ; X86-NEXT:    kunpckwd %k1, %k2, %k1
 ; X86-NEXT:    kortestd %k1, %k0
-; X86-NEXT:    je LBB44_2
+; X86-NEXT:    je LBB45_2
 ; X86-NEXT:  ## %bb.1: ## %L1
 ; X86-NEXT:    vmovaps %zmm0, (%eax)
 ; X86-NEXT:    vmovaps %zmm1, 64(%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB44_2: ## %L2
+; X86-NEXT:  LBB45_2: ## %L2
 ; X86-NEXT:    vmovaps %zmm0, 4(%eax)
 ; X86-NEXT:    vmovaps %zmm1, 68(%eax)
 ; X86-NEXT:    vzeroupper
@@ -4188,12 +4229,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testw %ax, %ax
-; KNL-NEXT:    jle LBB65_1
+; KNL-NEXT:    jle LBB66_1
 ; KNL-NEXT:  ## %bb.2: ## %bb.2
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB65_1: ## %bb.1
+; KNL-NEXT:  LBB66_1: ## %bb.1
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4207,12 +4248,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testw %ax, %ax
-; SKX-NEXT:    jle LBB65_1
+; SKX-NEXT:    jle LBB66_1
 ; SKX-NEXT:  ## %bb.2: ## %bb.2
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB65_1: ## %bb.1
+; SKX-NEXT:  LBB66_1: ## %bb.1
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4226,12 +4267,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testw %ax, %ax
-; AVX512BW-NEXT:    jle LBB65_1
+; AVX512BW-NEXT:    jle LBB66_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %bb.2
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB65_1: ## %bb.1
+; AVX512BW-NEXT:  LBB66_1: ## %bb.1
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -4245,12 +4286,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; AVX512DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, %eax
 ; AVX512DQ-NEXT:    testw %ax, %ax
-; AVX512DQ-NEXT:    jle LBB65_1
+; AVX512DQ-NEXT:    jle LBB66_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %bb.2
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB65_1: ## %bb.1
+; AVX512DQ-NEXT:  LBB66_1: ## %bb.1
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -4264,12 +4305,12 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; X86-NEXT:    kmovd %k0, %eax
 ; X86-NEXT:    testw %ax, %ax
-; X86-NEXT:    jle LBB65_1
+; X86-NEXT:    jle LBB66_1
 ; X86-NEXT:  ## %bb.2: ## %bb.2
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB65_1: ## %bb.1
+; X86-NEXT:  LBB66_1: ## %bb.1
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -4297,11 +4338,11 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; CHECK-NEXT:    kortestw %k0, %k0
-; CHECK-NEXT:    jb LBB66_2
+; CHECK-NEXT:    jb LBB67_2
 ; CHECK-NEXT:  ## %bb.1: ## %bb.1
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq _foo
-; CHECK-NEXT:  LBB66_2: ## %bb.2
+; CHECK-NEXT:  LBB67_2: ## %bb.2
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -4313,11 +4354,11 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
 ; X86-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; X86-NEXT:    kortestw %k0, %k0
-; X86-NEXT:    jb LBB66_2
+; X86-NEXT:    jb LBB67_2
 ; X86-NEXT:  ## %bb.1: ## %bb.1
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
-; X86-NEXT:  LBB66_2: ## %bb.2
+; X86-NEXT:  LBB67_2: ## %bb.2
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -4505,12 +4546,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
-; KNL-NEXT:    je LBB72_1
+; KNL-NEXT:    je LBB73_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB72_1: ## %bar
+; KNL-NEXT:  LBB73_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4527,12 +4568,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k2
 ; SKX-NEXT:    korb %k2, %k1, %k1
 ; SKX-NEXT:    ktestb %k1, %k0
-; SKX-NEXT:    je LBB72_1
+; SKX-NEXT:    je LBB73_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB72_1: ## %bar
+; SKX-NEXT:  LBB73_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4555,12 +4596,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ; AVX512BW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testb %al, %al
-; AVX512BW-NEXT:    je LBB72_1
+; AVX512BW-NEXT:    je LBB73_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB72_1: ## %bar
+; AVX512BW-NEXT:  LBB73_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -4581,12 +4622,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ; AVX512DQ-NEXT:    korb %k1, %k0, %k0
 ; AVX512DQ-NEXT:    korb %k3, %k2, %k1
 ; AVX512DQ-NEXT:    ktestb %k1, %k0
-; AVX512DQ-NEXT:    je LBB72_1
+; AVX512DQ-NEXT:    je LBB73_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB72_1: ## %bar
+; AVX512DQ-NEXT:  LBB73_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -4603,12 +4644,12 @@ define void @ktest_3(<8 x i32> %w, <8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ; X86-NEXT:    vptestnmd %ymm3, %ymm3, %k2
 ; X86-NEXT:    korb %k2, %k1, %k1
 ; X86-NEXT:    ktestb %k1, %k0
-; X86-NEXT:    je LBB72_1
+; X86-NEXT:    je LBB73_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB72_1: ## %bar
+; X86-NEXT:  LBB73_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -4646,12 +4687,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
-; KNL-NEXT:    je LBB73_1
+; KNL-NEXT:    je LBB74_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB73_1: ## %bar
+; KNL-NEXT:  LBB74_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4668,12 +4709,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k2
 ; SKX-NEXT:    korb %k2, %k1, %k1
 ; SKX-NEXT:    ktestb %k1, %k0
-; SKX-NEXT:    je LBB73_1
+; SKX-NEXT:    je LBB74_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB73_1: ## %bar
+; SKX-NEXT:  LBB74_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4692,12 +4733,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ; AVX512BW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testb %al, %al
-; AVX512BW-NEXT:    je LBB73_1
+; AVX512BW-NEXT:    je LBB74_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB73_1: ## %bar
+; AVX512BW-NEXT:  LBB74_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -4714,12 +4755,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ; AVX512DQ-NEXT:    vptestnmq %zmm3, %zmm3, %k2
 ; AVX512DQ-NEXT:    korb %k2, %k1, %k1
 ; AVX512DQ-NEXT:    ktestb %k1, %k0
-; AVX512DQ-NEXT:    je LBB73_1
+; AVX512DQ-NEXT:    je LBB74_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB73_1: ## %bar
+; AVX512DQ-NEXT:  LBB74_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -4736,12 +4777,12 @@ define void @ktest_4(<8 x i64> %w, <8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
 ; X86-NEXT:    vptestnmq %zmm3, %zmm3, %k2
 ; X86-NEXT:    korb %k2, %k1, %k1
 ; X86-NEXT:    ktestb %k1, %k0
-; X86-NEXT:    je LBB73_1
+; X86-NEXT:    je LBB74_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB73_1: ## %bar
+; X86-NEXT:  LBB74_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -4778,12 +4819,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 ; KNL-NEXT:    korw %k2, %k1, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kortestw %k0, %k0
-; KNL-NEXT:    je LBB74_1
+; KNL-NEXT:    je LBB75_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB74_1: ## %bar
+; KNL-NEXT:  LBB75_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4800,12 +4841,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 ; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k2
 ; SKX-NEXT:    korw %k2, %k1, %k1
 ; SKX-NEXT:    ktestw %k1, %k0
-; SKX-NEXT:    je LBB74_1
+; SKX-NEXT:    je LBB75_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB74_1: ## %bar
+; SKX-NEXT:  LBB75_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4823,12 +4864,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 ; AVX512BW-NEXT:    korw %k2, %k1, %k1
 ; AVX512BW-NEXT:    kandw %k1, %k0, %k0
 ; AVX512BW-NEXT:    kortestw %k0, %k0
-; AVX512BW-NEXT:    je LBB74_1
+; AVX512BW-NEXT:    je LBB75_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB74_1: ## %bar
+; AVX512BW-NEXT:  LBB75_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -4845,12 +4886,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 ; AVX512DQ-NEXT:    vptestnmd %zmm3, %zmm3, %k2
 ; AVX512DQ-NEXT:    korw %k2, %k1, %k1
 ; AVX512DQ-NEXT:    ktestw %k1, %k0
-; AVX512DQ-NEXT:    je LBB74_1
+; AVX512DQ-NEXT:    je LBB75_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB74_1: ## %bar
+; AVX512DQ-NEXT:  LBB75_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -4867,12 +4908,12 @@ define void @ktest_5(<16 x i32> %w, <16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 ; X86-NEXT:    vptestnmd %zmm3, %zmm3, %k2
 ; X86-NEXT:    korw %k2, %k1, %k1
 ; X86-NEXT:    ktestw %k1, %k0
-; X86-NEXT:    je LBB74_1
+; X86-NEXT:    je LBB75_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB74_1: ## %bar
+; X86-NEXT:  LBB75_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -4928,12 +4969,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    shll $16, %ecx
 ; KNL-NEXT:    orl %eax, %ecx
-; KNL-NEXT:    je LBB75_1
+; KNL-NEXT:    je LBB76_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB75_1: ## %bar
+; KNL-NEXT:  LBB76_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -4950,12 +4991,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
 ; SKX-NEXT:    vptestnmw %zmm3, %zmm3, %k2
 ; SKX-NEXT:    kord %k2, %k1, %k1
 ; SKX-NEXT:    ktestd %k1, %k0
-; SKX-NEXT:    je LBB75_1
+; SKX-NEXT:    je LBB76_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB75_1: ## %bar
+; SKX-NEXT:  LBB76_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -4972,12 +5013,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
 ; AVX512BW-NEXT:    vptestnmw %zmm3, %zmm3, %k2
 ; AVX512BW-NEXT:    kord %k2, %k1, %k1
 ; AVX512BW-NEXT:    ktestd %k1, %k0
-; AVX512BW-NEXT:    je LBB75_1
+; AVX512BW-NEXT:    je LBB76_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB75_1: ## %bar
+; AVX512BW-NEXT:  LBB76_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -5014,12 +5055,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
 ; AVX512DQ-NEXT:    kmovw %k0, %ecx
 ; AVX512DQ-NEXT:    shll $16, %ecx
 ; AVX512DQ-NEXT:    orl %eax, %ecx
-; AVX512DQ-NEXT:    je LBB75_1
+; AVX512DQ-NEXT:    je LBB76_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB75_1: ## %bar
+; AVX512DQ-NEXT:  LBB76_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -5036,12 +5077,12 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
 ; X86-NEXT:    vptestnmw %zmm3, %zmm3, %k2
 ; X86-NEXT:    kord %k2, %k1, %k1
 ; X86-NEXT:    ktestd %k1, %k0
-; X86-NEXT:    je LBB75_1
+; X86-NEXT:    je LBB76_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB75_1: ## %bar
+; X86-NEXT:  LBB76_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -5121,12 +5162,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
 ; KNL-NEXT:    orl %eax, %edx
 ; KNL-NEXT:    shlq $32, %rdx
 ; KNL-NEXT:    orq %rcx, %rdx
-; KNL-NEXT:    je LBB76_1
+; KNL-NEXT:    je LBB77_1
 ; KNL-NEXT:  ## %bb.2: ## %exit
 ; KNL-NEXT:    popq %rax
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:  LBB76_1: ## %bar
+; KNL-NEXT:  LBB77_1: ## %bar
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    callq _foo
 ; KNL-NEXT:    popq %rax
@@ -5143,12 +5184,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
 ; SKX-NEXT:    vptestnmb %zmm3, %zmm3, %k2
 ; SKX-NEXT:    korq %k2, %k1, %k1
 ; SKX-NEXT:    ktestq %k1, %k0
-; SKX-NEXT:    je LBB76_1
+; SKX-NEXT:    je LBB77_1
 ; SKX-NEXT:  ## %bb.2: ## %exit
 ; SKX-NEXT:    popq %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:  LBB76_1: ## %bar
+; SKX-NEXT:  LBB77_1: ## %bar
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _foo
 ; SKX-NEXT:    popq %rax
@@ -5165,12 +5206,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
 ; AVX512BW-NEXT:    vptestnmb %zmm3, %zmm3, %k2
 ; AVX512BW-NEXT:    korq %k2, %k1, %k1
 ; AVX512BW-NEXT:    ktestq %k1, %k0
-; AVX512BW-NEXT:    je LBB76_1
+; AVX512BW-NEXT:    je LBB77_1
 ; AVX512BW-NEXT:  ## %bb.2: ## %exit
 ; AVX512BW-NEXT:    popq %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:  LBB76_1: ## %bar
+; AVX512BW-NEXT:  LBB77_1: ## %bar
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    callq _foo
 ; AVX512BW-NEXT:    popq %rax
@@ -5231,12 +5272,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
 ; AVX512DQ-NEXT:    orl %eax, %edx
 ; AVX512DQ-NEXT:    shlq $32, %rdx
 ; AVX512DQ-NEXT:    orq %rcx, %rdx
-; AVX512DQ-NEXT:    je LBB76_1
+; AVX512DQ-NEXT:    je LBB77_1
 ; AVX512DQ-NEXT:  ## %bb.2: ## %exit
 ; AVX512DQ-NEXT:    popq %rax
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:  LBB76_1: ## %bar
+; AVX512DQ-NEXT:  LBB77_1: ## %bar
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    callq _foo
 ; AVX512DQ-NEXT:    popq %rax
@@ -5255,12 +5296,12 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
 ; X86-NEXT:    kandq %k1, %k0, %k0
 ; X86-NEXT:    kshiftrq $32, %k0, %k1
 ; X86-NEXT:    kortestd %k1, %k0
-; X86-NEXT:    je LBB76_1
+; X86-NEXT:    je LBB77_1
 ; X86-NEXT:  ## %bb.2: ## %exit
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
-; X86-NEXT:  LBB76_1: ## %bar
+; X86-NEXT:  LBB77_1: ## %bar
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $12, %esp
@@ -5360,3 +5401,20 @@ define <64 x i1> @mask64_insert(i32 %a) {
   %maskv = insertelement <64 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
   ret <64 x i1> %maskv
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
index 7bcb01227063..84355448d1cd 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -130,6 +130,24 @@ define i64 @div64_optsize(i64 %a, i64 %b) optsize {
   ret i64 %div
 }
 
+define i64 @div64_pgso(i64 %a, i64 %b) !prof !15 {
+; CHECK-LABEL: div64_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rsi
+; CHECK-NEXT:    retq
+;
+; HUGEWS-LABEL: div64_pgso:
+; HUGEWS:       # %bb.0:
+; HUGEWS-NEXT:    movq %rdi, %rax
+; HUGEWS-NEXT:    cqto
+; HUGEWS-NEXT:    idivq %rsi
+; HUGEWS-NEXT:    retq
+  %div = sdiv i64 %a, %b
+  ret i64 %div
+}
+
 define i64 @div64_hugews(i64 %a, i64 %b) {
 ; ATOM-LABEL: div64_hugews:
 ; ATOM:       # %bb.0:
@@ -137,12 +155,12 @@ define i64 @div64_hugews(i64 %a, i64 %b) {
 ; ATOM-NEXT:    movq %rdi, %rax
 ; ATOM-NEXT:    orq %rsi, %rcx
 ; ATOM-NEXT:    shrq $32, %rcx
-; ATOM-NEXT:    je .LBB3_1
+; ATOM-NEXT:    je .LBB4_1
 ; ATOM-NEXT:  # %bb.2:
 ; ATOM-NEXT:    cqto
 ; ATOM-NEXT:    idivq %rsi
 ; ATOM-NEXT:    retq
-; ATOM-NEXT:  .LBB3_1:
+; ATOM-NEXT:  .LBB4_1:
 ; ATOM-NEXT:    # kill: def $eax killed $eax killed $rax
 ; ATOM-NEXT:    xorl %edx, %edx
 ; ATOM-NEXT:    divl %esi
@@ -155,12 +173,12 @@ define i64 @div64_hugews(i64 %a, i64 %b) {
 ; SLM-NEXT:    movq %rdi, %rax
 ; SLM-NEXT:    orq %rsi, %rcx
 ; SLM-NEXT:    shrq $32, %rcx
-; SLM-NEXT:    je .LBB3_1
+; SLM-NEXT:    je .LBB4_1
 ; SLM-NEXT:  # %bb.2:
 ; SLM-NEXT:    cqto
 ; SLM-NEXT:    idivq %rsi
 ; SLM-NEXT:    retq
-; SLM-NEXT:  .LBB3_1:
+; SLM-NEXT:  .LBB4_1:
 ; SLM-NEXT:    xorl %edx, %edx
 ; SLM-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SLM-NEXT:    divl %esi
@@ -173,12 +191,12 @@ define i64 @div64_hugews(i64 %a, i64 %b) {
 ; SKL-NEXT:    movq %rdi, %rcx
 ; SKL-NEXT:    orq %rsi, %rcx
 ; SKL-NEXT:    shrq $32, %rcx
-; SKL-NEXT:    je .LBB3_1
+; SKL-NEXT:    je .LBB4_1
 ; SKL-NEXT:  # %bb.2:
 ; SKL-NEXT:    cqto
 ; SKL-NEXT:    idivq %rsi
 ; SKL-NEXT:    retq
-; SKL-NEXT:  .LBB3_1:
+; SKL-NEXT:  .LBB4_1:
 ; SKL-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SKL-NEXT:    xorl %edx, %edx
 ; SKL-NEXT:    divl %esi
@@ -213,6 +231,24 @@ define i32 @div32_optsize(i32 %a, i32 %b) optsize {
   ret i32 %div
 }
 
+define i32 @div32_pgso(i32 %a, i32 %b) !prof !15 {
+; CHECK-LABEL: div32_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    cltd
+; CHECK-NEXT:    idivl %esi
+; CHECK-NEXT:    retq
+;
+; HUGEWS-LABEL: div32_pgso:
+; HUGEWS:       # %bb.0:
+; HUGEWS-NEXT:    movl %edi, %eax
+; HUGEWS-NEXT:    cltd
+; HUGEWS-NEXT:    idivl %esi
+; HUGEWS-NEXT:    retq
+  %div = sdiv i32 %a, %b
+  ret i32 %div
+}
+
 define i32 @div32_minsize(i32 %a, i32 %b) minsize {
 ; CHECK-LABEL: div32_minsize:
 ; CHECK:       # %bb.0:
@@ -246,3 +282,4 @@ define i32 @div32_minsize(i32 %a, i32 %b) minsize {
 !12 = !{i32 10000, i64 1000, i32 1}
 !13 = !{i32 999000, i64 1000, i32 3}
 !14 = !{i32 999999, i64 5, i32 3}
+!15 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/cmov-into-branch.ll b/llvm/test/CodeGen/X86/cmov-into-branch.ll
index 9ce99c7897a7..6cc81ac0833b 100644
--- a/llvm/test/CodeGen/X86/cmov-into-branch.ll
+++ b/llvm/test/CodeGen/X86/cmov-into-branch.ll
@@ -88,7 +88,7 @@ define i32 @weighted_select1(i32 %a, i32 %b) {
 ; CHECK-NEXT:    cmovnel %edi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !15
   ret i32 %sel
 }
 
@@ -104,7 +104,7 @@ define i32 @weighted_select2(i32 %a, i32 %b) {
 ; CHECK-NEXT:  .LBB6_2: # %select.end
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !1
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
   ret i32 %sel
 }
 
@@ -124,7 +124,7 @@ define i32 @weighted_select3(i32 %a, i32 %b) {
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !2
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !17
   ret i32 %sel
 }
 
@@ -137,12 +137,51 @@ define i32 @unweighted_select(i32 %a, i32 %b) {
 ; CHECK-NEXT:    cmovnel %edi, %eax
 ; CHECK-NEXT:    retq
   %cmp = icmp ne i32 %a, 0
-  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !3
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !18
   ret i32 %sel
 }
 
-!0 = !{!"branch_weights", i32 1, i32 99}
-!1 = !{!"branch_weights", i32 1, i32 100}
-!2 = !{!"branch_weights", i32 100, i32 1}
-!3 = !{!"branch_weights", i32 0, i32 0}
+define i32 @weighted_select_optsize(i32 %a, i32 %b) optsize {
+; CHECK-LABEL: weighted_select_optsize:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp ne i32 %a, 0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
+  ret i32 %sel
+}
+
+define i32 @weighted_select_pgso(i32 %a, i32 %b) !prof !14 {
+; CHECK-LABEL: weighted_select_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp ne i32 %a, 0
+  %sel = select i1 %cmp, i32 %a, i32 %b, !prof !16
+  ret i32 %sel
+}
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
+!15 = !{!"branch_weights", i32 1, i32 99}
+!16 = !{!"branch_weights", i32 1, i32 100}
+!17 = !{!"branch_weights", i32 100, i32 1}
+!18 = !{!"branch_weights", i32 0, i32 0}

diff --git a/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
new file mode 100644
index 000000000000..65bd1dad21a8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/conditional-tailcall-pgso.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux   -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32
+; RUN: llc < %s -mtriple=x86_64-linux -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
+; RUN: llc < %s -mtriple=x86_64-win32 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=WIN64
+
+declare void @foo()
+declare void @bar()
+
+define void @f(i32 %x, i32 %y) !prof !14 {
+; CHECK32-LABEL: f:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; CHECK32-NEXT:    cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x08]
+; CHECK32-NEXT:    jne bar # TAILCALL
+; CHECK32-NEXT:    # encoding: [0x75,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.1: # %bb1
+; CHECK32-NEXT:    jmp foo # TAILCALL
+; CHECK32-NEXT:    # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: f:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    cmpl %esi, %edi # encoding: [0x39,0xf7]
+; CHECK64-NEXT:    jne bar # TAILCALL
+; CHECK64-NEXT:    # encoding: [0x75,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.1: # %bb1
+; CHECK64-NEXT:    jmp foo # TAILCALL
+; CHECK64-NEXT:    # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: f:
+; WIN64:       # %bb.0: # %entry
+; WIN64-NEXT:    cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; WIN64-NEXT:    jne bar # TAILCALL
+; WIN64-NEXT:    # encoding: [0x75,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.1: # %bb1
+; WIN64-NEXT:    jmp foo # TAILCALL
+; WIN64-NEXT:    # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+entry:
+	%p = icmp eq i32 %x, %y
+  br i1 %p, label %bb1, label %bb2
+bb1:
+  tail call void @foo()
+  ret void
+bb2:
+  tail call void @bar()
+  ret void
+
+; Check that the asm doesn't just look good, but uses the correct encoding.
+}
+
+define void @f_non_leaf(i32 %x, i32 %y) !prof !14 {
+; CHECK32-LABEL: f_non_leaf:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    pushl %ebx # encoding: [0x53]
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    .cfi_offset %ebx, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; CHECK32-NEXT:    #APP
+; CHECK32-NEXT:    #NO_APP
+; CHECK32-NEXT:    cmpl {{[0-9]+}}(%esp), %eax # encoding: [0x3b,0x44,0x24,0x0c]
+; CHECK32-NEXT:    jne .LBB1_2 # encoding: [0x75,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.1: # %bb1
+; CHECK32-NEXT:    popl %ebx # encoding: [0x5b]
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    jmp foo # TAILCALL
+; CHECK32-NEXT:    # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  .LBB1_2: # %bb2
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebx # encoding: [0x5b]
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    jmp bar # TAILCALL
+; CHECK32-NEXT:    # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: f_non_leaf:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    pushq %rbx # encoding: [0x53]
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    .cfi_offset %rbx, -16
+; CHECK64-NEXT:    #APP
+; CHECK64-NEXT:    #NO_APP
+; CHECK64-NEXT:    cmpl %esi, %edi # encoding: [0x39,0xf7]
+; CHECK64-NEXT:    jne .LBB1_2 # encoding: [0x75,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.1: # %bb1
+; CHECK64-NEXT:    popq %rbx # encoding: [0x5b]
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    jmp foo # TAILCALL
+; CHECK64-NEXT:    # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  .LBB1_2: # %bb2
+; CHECK64-NEXT:    .cfi_def_cfa_offset 16
+; CHECK64-NEXT:    popq %rbx # encoding: [0x5b]
+; CHECK64-NEXT:    .cfi_def_cfa_offset 8
+; CHECK64-NEXT:    jmp bar # TAILCALL
+; CHECK64-NEXT:    # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: f_non_leaf:
+; WIN64:       # %bb.0: # %entry
+; WIN64-NEXT:    pushq %rbx # encoding: [0x53]
+; WIN64-NEXT:    .seh_pushreg %rbx
+; WIN64-NEXT:    .seh_endprologue
+; WIN64-NEXT:    #APP
+; WIN64-NEXT:    #NO_APP
+; WIN64-NEXT:    cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; WIN64-NEXT:    jne .LBB1_2 # encoding: [0x75,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.1: # %bb1
+; WIN64-NEXT:    popq %rbx # encoding: [0x5b]
+; WIN64-NEXT:    jmp foo # TAILCALL
+; WIN64-NEXT:    # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+; WIN64-NEXT:  .LBB1_2: # %bb2
+; WIN64-NEXT:    nop # encoding: [0x90]
+; WIN64-NEXT:    popq %rbx # encoding: [0x5b]
+; WIN64-NEXT:    jmp bar # TAILCALL
+; WIN64-NEXT:    # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: bar-1, kind: FK_PCRel_1
+; WIN64-NEXT:    .seh_handlerdata
+; WIN64-NEXT:    .text
+; WIN64-NEXT:    .seh_endproc
+entry:
+  ; Force %ebx to be spilled on the stack, so that this is no longer
+  ; a "leaf" function for Win64.
+  tail call void asm sideeffect "", "~{ebx}"()
+
+  %p = icmp eq i32 %x, %y
+  br i1 %p, label %bb1, label %bb2
+bb1:
+  tail call void @foo()
+  ret void
+bb2:
+  tail call void @bar()
+  ret void
+
+}
+
+declare x86_thiscallcc zeroext i1 @baz(i8*, i32)
+define x86_thiscallcc zeroext i1 @BlockPlacementTest(i8* %this, i32 %x) !prof !14 {
+; CHECK32-LABEL: BlockPlacementTest:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
+; CHECK32-NEXT:    testb $42, %dl # encoding: [0xf6,0xc2,0x2a]
+; CHECK32-NEXT:    je .LBB2_3 # encoding: [0x74,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.1: # %land.rhs
+; CHECK32-NEXT:    movb $1, %al # encoding: [0xb0,0x01]
+; CHECK32-NEXT:    testb $44, %dl # encoding: [0xf6,0xc2,0x2c]
+; CHECK32-NEXT:    je baz # TAILCALL
+; CHECK32-NEXT:    # encoding: [0x74,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  .LBB2_2: # %land.end
+; CHECK32-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK32-NEXT:    retl $4 # encoding: [0xc2,0x04,0x00]
+; CHECK32-NEXT:  .LBB2_3:
+; CHECK32-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK32-NEXT:    jmp .LBB2_2 # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
+;
+; CHECK64-LABEL: BlockPlacementTest:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    testb $42, %sil # encoding: [0x40,0xf6,0xc6,0x2a]
+; CHECK64-NEXT:    je .LBB2_3 # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.1: # %land.rhs
+; CHECK64-NEXT:    movb $1, %al # encoding: [0xb0,0x01]
+; CHECK64-NEXT:    testb $44, %sil # encoding: [0x40,0xf6,0xc6,0x2c]
+; CHECK64-NEXT:    je baz # TAILCALL
+; CHECK64-NEXT:    # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  .LBB2_2: # %land.end
+; CHECK64-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK64-NEXT:    retq # encoding: [0xc3]
+; CHECK64-NEXT:  .LBB2_3:
+; CHECK64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK64-NEXT:    jmp .LBB2_2 # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
+;
+; WIN64-LABEL: BlockPlacementTest:
+; WIN64:       # %bb.0: # %entry
+; WIN64-NEXT:    testb $42, %dl # encoding: [0xf6,0xc2,0x2a]
+; WIN64-NEXT:    je .LBB2_3 # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB2_3-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.1: # %land.rhs
+; WIN64-NEXT:    movb $1, %al # encoding: [0xb0,0x01]
+; WIN64-NEXT:    testb $44, %dl # encoding: [0xf6,0xc2,0x2c]
+; WIN64-NEXT:    je baz # TAILCALL
+; WIN64-NEXT:    # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: baz-1, kind: FK_PCRel_1
+; WIN64-NEXT:  .LBB2_2: # %land.end
+; WIN64-NEXT:    # kill: def $al killed $al killed $eax
+; WIN64-NEXT:    retq # encoding: [0xc3]
+; WIN64-NEXT:  .LBB2_3:
+; WIN64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
+; WIN64-NEXT:    jmp .LBB2_2 # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
+entry:
+  %and = and i32 %x, 42
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %land.end, label %land.rhs
+
+land.rhs:
+  %and6 = and i32 %x, 44
+  %tobool7 = icmp eq i32 %and6, 0
+  br i1 %tobool7, label %lor.rhs, label %land.end
+
+lor.rhs:
+  %call = tail call x86_thiscallcc zeroext i1 @baz(i8* %this, i32 %x) #2
+  br label %land.end
+
+land.end:
+  %0 = phi i1 [ false, %entry ], [ true, %land.rhs ], [ %call, %lor.rhs ]
+  ret i1 %0
+
+; Make sure machine block placement isn't confused by the conditional tail call,
+; but sees that it can fall through to the next block.
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
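+
+; How these tests trigger PGSO: !14 attaches a function entry count of 0 to
+; each test function. Measured against the hot/cold thresholds derived from
+; the DetailedSummary in !9, a zero entry count makes ProfileSummaryInfo
+; treat the function as cold, so shouldOptimizeForSize() returns true and,
+; roughly speaking, the function is compiled as if it were marked optsize.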

diff --git a/llvm/test/CodeGen/X86/fixup-lea.ll b/llvm/test/CodeGen/X86/fixup-lea.ll
index 8d8a1cd19f05..3f661a8e991e 100644
--- a/llvm/test/CodeGen/X86/fixup-lea.ll
+++ b/llvm/test/CodeGen/X86/fixup-lea.ll
@@ -108,17 +108,96 @@ for.end:
   ret void
 }
 
+define void @foo_pgso(i32 inreg %dns) !prof !14 {
+; SLOW-LABEL: foo_pgso:
+; SLOW:       # %bb.0: # %entry
+; SLOW-NEXT:    xorl %ecx, %ecx
+; SLOW-NEXT:    decl %ecx
+; SLOW-NEXT:  .LBB4_1: # %for.body
+; SLOW-NEXT:    # =>This Inner Loop Header: Depth=1
+; SLOW-NEXT:    movzwl %cx, %edx
+; SLOW-NEXT:    decl %ecx
+; SLOW-NEXT:    cmpl %eax, %edx
+; SLOW-NEXT:    jl .LBB4_1
+; SLOW-NEXT:  # %bb.2: # %for.end
+; SLOW-NEXT:    retl
+;
+; FAST-LABEL: foo_pgso:
+; FAST:       # %bb.0: # %entry
+; FAST-NEXT:    xorl %ecx, %ecx
+; FAST-NEXT:    decl %ecx
+; FAST-NEXT:  .LBB4_1: # %for.body
+; FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; FAST-NEXT:    movzwl %cx, %edx
+; FAST-NEXT:    addl $-1, %ecx
+; FAST-NEXT:    cmpl %eax, %edx
+; FAST-NEXT:    jl .LBB4_1
+; FAST-NEXT:  # %bb.2: # %for.end
+; FAST-NEXT:    retl
+entry:
+  br label %for.body
+
+for.body:
+  %i.05 = phi i16 [ %dec, %for.body ], [ 0, %entry ]
+  %dec = add i16 %i.05, -1
+  %conv = zext i16 %dec to i32
+  %cmp = icmp slt i32 %conv, %dns
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+define void @bar_pgso(i32 inreg %dns) !prof !14 {
+; SLOW-LABEL: bar_pgso:
+; SLOW:       # %bb.0: # %entry
+; SLOW-NEXT:    xorl %ecx, %ecx
+; SLOW-NEXT:    incl %ecx
+; SLOW-NEXT:  .LBB5_1: # %for.body
+; SLOW-NEXT:    # =>This Inner Loop Header: Depth=1
+; SLOW-NEXT:    movzwl %cx, %edx
+; SLOW-NEXT:    incl %ecx
+; SLOW-NEXT:    cmpl %eax, %edx
+; SLOW-NEXT:    jl .LBB5_1
+; SLOW-NEXT:  # %bb.2: # %for.end
+; SLOW-NEXT:    retl
+;
+; FAST-LABEL: bar_pgso:
+; FAST:       # %bb.0: # %entry
+; FAST-NEXT:    xorl %ecx, %ecx
+; FAST-NEXT:    incl %ecx
+; FAST-NEXT:  .LBB5_1: # %for.body
+; FAST-NEXT:    # =>This Inner Loop Header: Depth=1
+; FAST-NEXT:    movzwl %cx, %edx
+; FAST-NEXT:    addl $1, %ecx
+; FAST-NEXT:    cmpl %eax, %edx
+; FAST-NEXT:    jl .LBB5_1
+; FAST-NEXT:  # %bb.2: # %for.end
+; FAST-NEXT:    retl
+entry:
+  br label %for.body
+
+for.body:
+  %i.05 = phi i16 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i16 %i.05, 1
+  %conv = zext i16 %inc to i32
+  %cmp = icmp slt i32 %conv, %dns
+  br i1 %cmp, label %for.body, label %for.end
+for.end:
+  ret void
+}
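+
+; Note that, unlike foo_nosize/bar_nosize below, the PGSO variants above get
+; no ".p2align 4, 0x90" loop-header alignment: the padding bytes that
+; alignment would add are part of what the size optimization avoids.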
+
 define void @foo_nosize(i32 inreg %dns) {
 ; SLOW-LABEL: foo_nosize:
 ; SLOW:       # %bb.0: # %entry
 ; SLOW-NEXT:    movw $-1, %cx
 ; SLOW-NEXT:    .p2align 4, 0x90
-; SLOW-NEXT:  .LBB4_1: # %for.body
+; SLOW-NEXT:  .LBB6_1: # %for.body
 ; SLOW-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SLOW-NEXT:    movzwl %cx, %edx
 ; SLOW-NEXT:    decl %ecx
 ; SLOW-NEXT:    cmpl %eax, %edx
-; SLOW-NEXT:    jl .LBB4_1
+; SLOW-NEXT:    jl .LBB6_1
 ; SLOW-NEXT:  # %bb.2: # %for.end
 ; SLOW-NEXT:    retl
 ;
@@ -126,12 +205,12 @@ define void @foo_nosize(i32 inreg %dns) {
 ; FAST:       # %bb.0: # %entry
 ; FAST-NEXT:    movw $-1, %cx
 ; FAST-NEXT:    .p2align 4, 0x90
-; FAST-NEXT:  .LBB4_1: # %for.body
+; FAST-NEXT:  .LBB6_1: # %for.body
 ; FAST-NEXT:    # =>This Inner Loop Header: Depth=1
 ; FAST-NEXT:    movzwl %cx, %edx
 ; FAST-NEXT:    addl $-1, %ecx
 ; FAST-NEXT:    cmpl %eax, %edx
-; FAST-NEXT:    jl .LBB4_1
+; FAST-NEXT:    jl .LBB6_1
 ; FAST-NEXT:  # %bb.2: # %for.end
 ; FAST-NEXT:    retl
 entry:
@@ -153,12 +232,12 @@ define void @bar_nosize(i32 inreg %dns) {
 ; SLOW:       # %bb.0: # %entry
 ; SLOW-NEXT:    movw $1, %cx
 ; SLOW-NEXT:    .p2align 4, 0x90
-; SLOW-NEXT:  .LBB5_1: # %for.body
+; SLOW-NEXT:  .LBB7_1: # %for.body
 ; SLOW-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SLOW-NEXT:    movzwl %cx, %edx
 ; SLOW-NEXT:    incl %ecx
 ; SLOW-NEXT:    cmpl %eax, %edx
-; SLOW-NEXT:    jl .LBB5_1
+; SLOW-NEXT:    jl .LBB7_1
 ; SLOW-NEXT:  # %bb.2: # %for.end
 ; SLOW-NEXT:    retl
 ;
@@ -166,12 +245,12 @@ define void @bar_nosize(i32 inreg %dns) {
 ; FAST:       # %bb.0: # %entry
 ; FAST-NEXT:    movw $1, %cx
 ; FAST-NEXT:    .p2align 4, 0x90
-; FAST-NEXT:  .LBB5_1: # %for.body
+; FAST-NEXT:  .LBB7_1: # %for.body
 ; FAST-NEXT:    # =>This Inner Loop Header: Depth=1
 ; FAST-NEXT:    movzwl %cx, %edx
 ; FAST-NEXT:    addl $1, %ecx
 ; FAST-NEXT:    cmpl %eax, %edx
-; FAST-NEXT:    jl .LBB5_1
+; FAST-NEXT:    jl .LBB7_1
 ; FAST-NEXT:  # %bb.2: # %for.end
 ; FAST-NEXT:    retl
 entry:
@@ -186,3 +265,20 @@ for.body:
 for.end:
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll
index aa6bc720fa97..b42b94a63dcb 100644
--- a/llvm/test/CodeGen/X86/fold-load-unops.ll
+++ b/llvm/test/CodeGen/X86/fold-load-unops.ll
@@ -113,6 +113,38 @@ define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
     ret <4 x float> %res
 }
 
+define float @rcpss_pgso(float* %a) !prof !14 {
+; SSE-LABEL: rcpss_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rcpss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: rcpss_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load float, float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define <4 x float> @rcpss_full_pgso(<4 x float>* %a) !prof !14 {
+; SSE-LABEL: rcpss_full_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rcpss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: rcpss_full_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load <4 x float>, <4 x float>* %a
+    %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
+    ret <4 x float> %res
+}
+
 define float @rsqrtss_size(float* %a) optsize {
 ; SSE-LABEL: rsqrtss_size:
 ; SSE:       # %bb.0:
@@ -145,6 +177,38 @@ define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
     ret <4 x float> %res
 }
 
+define float @rsqrtss_pgso(float* %a) !prof !14 {
+; SSE-LABEL: rsqrtss_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rsqrtss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: rsqrtss_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load float, float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define <4 x float> @rsqrtss_full_pgso(<4 x float>* %a) !prof !14 {
+; SSE-LABEL: rsqrtss_full_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    rsqrtss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: rsqrtss_full_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load <4 x float>, <4 x float>* %a
+    %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
+    ret <4 x float> %res
+}
+
 define float @sqrtss_size(float* %a) optsize{
 ; SSE-LABEL: sqrtss_size:
 ; SSE:       # %bb.0:
@@ -196,6 +260,57 @@ define <4 x float> @sqrtss_full_size_volatile(<4 x float>* %a) optsize{
     ret <4 x float> %res
 }
 
+define float @sqrtss_pgso(float* %a) !prof !14 {
+; SSE-LABEL: sqrtss_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    sqrtss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtss_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load float, float* %a
+    %ins = insertelement <4 x float> undef, float %ld, i32 0
+    %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
+    %ext = extractelement <4 x float> %res, i32 0
+    ret float %ext
+}
+
+define <4 x float> @sqrtss_full_pgso(<4 x float>* %a) !prof !14 {
+; SSE-LABEL: sqrtss_full_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    sqrtss %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtss_full_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load <4 x float>, <4 x float>* %a
+    %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
+    ret <4 x float> %res
+}
+
+define <4 x float> @sqrtss_full_pgso_volatile(<4 x float>* %a) !prof !14 {
+; SSE-LABEL: sqrtss_full_pgso_volatile:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    sqrtss %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtss_full_pgso_volatile:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load volatile <4 x float>, <4 x float>* %a
+    %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
+    ret <4 x float> %res
+}
+
 define double @sqrtsd_size(double* %a) optsize {
 ; SSE-LABEL: sqrtsd_size:
 ; SSE:       # %bb.0:
@@ -247,7 +362,75 @@ define <2 x double> @sqrtsd_full_size_volatile(<2 x double>* %a) optsize {
     ret <2 x double> %res
 }
 
+define double @sqrtsd_pgso(double* %a) !prof !14 {
+; SSE-LABEL: sqrtsd_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    sqrtsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtsd_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load double, double* %a
+    %ins = insertelement <2 x double> undef, double %ld, i32 0
+    %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
+    %ext = extractelement <2 x double> %res, i32 0
+    ret double %ext
+}
+
+define <2 x double> @sqrtsd_full_pgso(<2 x double>* %a) !prof !14 {
+; SSE-LABEL: sqrtsd_full_pgso:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd (%rdi), %xmm0
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtsd_full_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovapd (%rdi), %xmm0
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load <2 x double>, <2 x double>* %a
+    %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
+    ret <2 x double> %res
+}
+
+define <2 x double> @sqrtsd_full_pgso_volatile(<2 x double>* %a) !prof !14 {
+; SSE-LABEL: sqrtsd_full_pgso_volatile:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd (%rdi), %xmm0
+; SSE-NEXT:    sqrtsd %xmm0, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: sqrtsd_full_pgso_volatile:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovapd (%rdi), %xmm0
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %ld = load volatile <2 x double>, <2 x double>* %a
+    %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
+    ret <2 x double> %res
+}
+
 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
 declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index c7ca98ddf241..522a03afa59d 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -196,6 +196,26 @@ define i32 @var_shift_i32_optsize(i32 %x, i32 %y, i32 %z) nounwind optsize {
   ret i32 %tmp
 }
 
+define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 {
+; X86-LABEL: var_shift_i32_pgso:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shldl %cl, %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: var_shift_i32_pgso:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shldl %cl, %esi, %eax
+; X64-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
+  ret i32 %tmp
+}
+
 define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i64:
 ; X86-FAST:       # %bb.0:
@@ -216,36 +236,36 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-NEXT:    shll %cl, %edi
 ; X86-FAST-NEXT:    shldl %cl, %eax, %ebp
 ; X86-FAST-NEXT:    testb $32, %bl
-; X86-FAST-NEXT:    je .LBB4_2
+; X86-FAST-NEXT:    je .LBB5_2
 ; X86-FAST-NEXT:  # %bb.1:
 ; X86-FAST-NEXT:    movl %edi, %ebp
 ; X86-FAST-NEXT:    xorl %edi, %edi
-; X86-FAST-NEXT:  .LBB4_2:
+; X86-FAST-NEXT:  .LBB5_2:
 ; X86-FAST-NEXT:    movb $64, %cl
 ; X86-FAST-NEXT:    subb %bl, %cl
 ; X86-FAST-NEXT:    movl %edx, %esi
 ; X86-FAST-NEXT:    shrl %cl, %esi
 ; X86-FAST-NEXT:    shrdl %cl, %edx, (%esp) # 4-byte Folded Spill
 ; X86-FAST-NEXT:    testb $32, %cl
-; X86-FAST-NEXT:    jne .LBB4_3
+; X86-FAST-NEXT:    jne .LBB5_3
 ; X86-FAST-NEXT:  # %bb.4:
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-FAST-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-FAST-NEXT:    testl %ebx, %ebx
-; X86-FAST-NEXT:    jne .LBB4_6
-; X86-FAST-NEXT:    jmp .LBB4_7
-; X86-FAST-NEXT:  .LBB4_3:
+; X86-FAST-NEXT:    jne .LBB5_6
+; X86-FAST-NEXT:    jmp .LBB5_7
+; X86-FAST-NEXT:  .LBB5_3:
 ; X86-FAST-NEXT:    movl %esi, %ecx
 ; X86-FAST-NEXT:    xorl %esi, %esi
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-FAST-NEXT:    testl %ebx, %ebx
-; X86-FAST-NEXT:    je .LBB4_7
-; X86-FAST-NEXT:  .LBB4_6:
+; X86-FAST-NEXT:    je .LBB5_7
+; X86-FAST-NEXT:  .LBB5_6:
 ; X86-FAST-NEXT:    orl %esi, %ebp
 ; X86-FAST-NEXT:    orl %ecx, %edi
 ; X86-FAST-NEXT:    movl %edi, %eax
 ; X86-FAST-NEXT:    movl %ebp, %edx
-; X86-FAST-NEXT:  .LBB4_7:
+; X86-FAST-NEXT:  .LBB5_7:
 ; X86-FAST-NEXT:    addl $4, %esp
 ; X86-FAST-NEXT:    popl %esi
 ; X86-FAST-NEXT:    popl %edi
@@ -279,11 +299,11 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT:    testb %dl, %dl
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    je .LBB4_2
+; X86-SLOW-NEXT:    je .LBB5_2
 ; X86-SLOW-NEXT:  # %bb.1:
 ; X86-SLOW-NEXT:    orl %eax, %ebp
 ; X86-SLOW-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:  .LBB4_2:
+; X86-SLOW-NEXT:  .LBB5_2:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-SLOW-NEXT:    movl %ebp, %eax
 ; X86-SLOW-NEXT:    movl %ebx, %ecx
@@ -294,41 +314,41 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT:    negb %cl
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    testb %ch, %ch
-; X86-SLOW-NEXT:    je .LBB4_4
+; X86-SLOW-NEXT:    je .LBB5_4
 ; X86-SLOW-NEXT:  # %bb.3:
 ; X86-SLOW-NEXT:    orl %edi, %eax
 ; X86-SLOW-NEXT:    movl %eax, %ebp
-; X86-SLOW-NEXT:  .LBB4_4:
+; X86-SLOW-NEXT:  .LBB5_4:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movl %eax, %edi
 ; X86-SLOW-NEXT:    movl %ebx, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %edi
 ; X86-SLOW-NEXT:    testb $32, %bl
-; X86-SLOW-NEXT:    je .LBB4_6
+; X86-SLOW-NEXT:    je .LBB5_6
 ; X86-SLOW-NEXT:  # %bb.5:
 ; X86-SLOW-NEXT:    movl %edi, %ebp
 ; X86-SLOW-NEXT:    xorl %edi, %edi
-; X86-SLOW-NEXT:  .LBB4_6:
+; X86-SLOW-NEXT:  .LBB5_6:
 ; X86-SLOW-NEXT:    movb %dh, %cl
 ; X86-SLOW-NEXT:    shrl %cl, %esi
 ; X86-SLOW-NEXT:    testb $32, %dh
-; X86-SLOW-NEXT:    jne .LBB4_7
+; X86-SLOW-NEXT:    jne .LBB5_7
 ; X86-SLOW-NEXT:  # %bb.8:
 ; X86-SLOW-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-SLOW-NEXT:    testl %ebx, %ebx
-; X86-SLOW-NEXT:    jne .LBB4_10
-; X86-SLOW-NEXT:    jmp .LBB4_11
-; X86-SLOW-NEXT:  .LBB4_7:
+; X86-SLOW-NEXT:    jne .LBB5_10
+; X86-SLOW-NEXT:    jmp .LBB5_11
+; X86-SLOW-NEXT:  .LBB5_7:
 ; X86-SLOW-NEXT:    movl %esi, %ecx
 ; X86-SLOW-NEXT:    xorl %esi, %esi
 ; X86-SLOW-NEXT:    testl %ebx, %ebx
-; X86-SLOW-NEXT:    je .LBB4_11
-; X86-SLOW-NEXT:  .LBB4_10:
+; X86-SLOW-NEXT:    je .LBB5_11
+; X86-SLOW-NEXT:  .LBB5_10:
 ; X86-SLOW-NEXT:    orl %esi, %ebp
 ; X86-SLOW-NEXT:    orl %ecx, %edi
 ; X86-SLOW-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SLOW-NEXT:    movl %edi, %eax
-; X86-SLOW-NEXT:  .LBB4_11:
+; X86-SLOW-NEXT:  .LBB5_11:
 ; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-SLOW-NEXT:    addl $8, %esp
 ; X86-SLOW-NEXT:    popl %esi
@@ -503,3 +523,20 @@ define i64 @const_shift_i64(i64 %x, i64 %y) nounwind {
   %tmp = tail call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 7)
   ret i64 %tmp
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 009f2420575f..26e284f1527f 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -195,6 +195,26 @@ define i32 @var_shift_i32_optsize(i32 %x, i32 %y, i32 %z) nounwind optsize {
   ret i32 %tmp
 }
 
+define i32 @var_shift_i32_pgso(i32 %x, i32 %y, i32 %z) nounwind !prof !14 {
+; X86-LABEL: var_shift_i32_pgso:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shrdl %cl, %edx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: var_shift_i32_pgso:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edx, %ecx
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NEXT:    shrdl %cl, %edi, %eax
+; X64-NEXT:    retq
+  %tmp = tail call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
+  ret i32 %tmp
+}
+
 define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i64:
 ; X86-FAST:       # %bb.0:
@@ -216,30 +236,30 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-FAST-NEXT:    shll %cl, %edi
 ; X86-FAST-NEXT:    shldl %cl, %eax, %esi
 ; X86-FAST-NEXT:    testb $32, %cl
-; X86-FAST-NEXT:    je .LBB4_2
+; X86-FAST-NEXT:    je .LBB5_2
 ; X86-FAST-NEXT:  # %bb.1:
 ; X86-FAST-NEXT:    movl %edi, %esi
 ; X86-FAST-NEXT:    xorl %edi, %edi
-; X86-FAST-NEXT:  .LBB4_2:
+; X86-FAST-NEXT:  .LBB5_2:
 ; X86-FAST-NEXT:    movl %edx, %ebp
 ; X86-FAST-NEXT:    movl %ebx, %ecx
 ; X86-FAST-NEXT:    shrl %cl, %ebp
 ; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-FAST-NEXT:    shrdl %cl, %edx, %eax
 ; X86-FAST-NEXT:    testb $32, %bl
-; X86-FAST-NEXT:    je .LBB4_4
+; X86-FAST-NEXT:    je .LBB5_4
 ; X86-FAST-NEXT:  # %bb.3:
 ; X86-FAST-NEXT:    movl %ebp, %eax
 ; X86-FAST-NEXT:    xorl %ebp, %ebp
-; X86-FAST-NEXT:  .LBB4_4:
+; X86-FAST-NEXT:  .LBB5_4:
 ; X86-FAST-NEXT:    testl %ebx, %ebx
-; X86-FAST-NEXT:    je .LBB4_6
+; X86-FAST-NEXT:    je .LBB5_6
 ; X86-FAST-NEXT:  # %bb.5:
 ; X86-FAST-NEXT:    orl %ebp, %esi
 ; X86-FAST-NEXT:    orl %eax, %edi
 ; X86-FAST-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-FAST-NEXT:    movl %esi, %edx
-; X86-FAST-NEXT:  .LBB4_6:
+; X86-FAST-NEXT:  .LBB5_6:
 ; X86-FAST-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-FAST-NEXT:    addl $4, %esp
 ; X86-FAST-NEXT:    popl %esi
@@ -274,11 +294,11 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    testb %ch, %ch
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    je .LBB4_2
+; X86-SLOW-NEXT:    je .LBB5_2
 ; X86-SLOW-NEXT:  # %bb.1:
 ; X86-SLOW-NEXT:    orl %edi, %edx
 ; X86-SLOW-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:  .LBB4_2:
+; X86-SLOW-NEXT:  .LBB5_2:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT:    movl %ebx, %ecx
 ; X86-SLOW-NEXT:    shrl %cl, %edx
@@ -290,41 +310,41 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT:    shll %cl, %edi
 ; X86-SLOW-NEXT:    testb %ah, %ah
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    je .LBB4_4
+; X86-SLOW-NEXT:    je .LBB5_4
 ; X86-SLOW-NEXT:  # %bb.3:
 ; X86-SLOW-NEXT:    orl %edx, %edi
 ; X86-SLOW-NEXT:    movl %edi, %ebp
-; X86-SLOW-NEXT:  .LBB4_4:
+; X86-SLOW-NEXT:  .LBB5_4:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SLOW-NEXT:    movl %ebx, %ecx
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    testb $32, %bl
-; X86-SLOW-NEXT:    je .LBB4_6
+; X86-SLOW-NEXT:    je .LBB5_6
 ; X86-SLOW-NEXT:  # %bb.5:
 ; X86-SLOW-NEXT:    movl %edi, %ebp
 ; X86-SLOW-NEXT:    xorl %edi, %edi
-; X86-SLOW-NEXT:  .LBB4_6:
+; X86-SLOW-NEXT:  .LBB5_6:
 ; X86-SLOW-NEXT:    movl %eax, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %esi
 ; X86-SLOW-NEXT:    testb $32, %al
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    jne .LBB4_7
+; X86-SLOW-NEXT:    jne .LBB5_7
 ; X86-SLOW-NEXT:  # %bb.8:
 ; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    testl %ebx, %ebx
-; X86-SLOW-NEXT:    jne .LBB4_10
-; X86-SLOW-NEXT:    jmp .LBB4_11
-; X86-SLOW-NEXT:  .LBB4_7:
+; X86-SLOW-NEXT:    jne .LBB5_10
+; X86-SLOW-NEXT:    jmp .LBB5_11
+; X86-SLOW-NEXT:  .LBB5_7:
 ; X86-SLOW-NEXT:    movl %esi, %eax
 ; X86-SLOW-NEXT:    xorl %esi, %esi
 ; X86-SLOW-NEXT:    testl %ebx, %ebx
-; X86-SLOW-NEXT:    je .LBB4_11
-; X86-SLOW-NEXT:  .LBB4_10:
+; X86-SLOW-NEXT:    je .LBB5_11
+; X86-SLOW-NEXT:  .LBB5_10:
 ; X86-SLOW-NEXT:    orl %ebp, %esi
 ; X86-SLOW-NEXT:    orl %edi, %eax
 ; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-SLOW-NEXT:    movl %eax, %edx
-; X86-SLOW-NEXT:  .LBB4_11:
+; X86-SLOW-NEXT:  .LBB5_11:
 ; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    addl $8, %esp
 ; X86-SLOW-NEXT:    popl %esi
@@ -498,3 +518,20 @@ define i64 @const_shift_i64(i64 %x, i64 %y) nounwind {
   %tmp = tail call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 7)
   ret i64 %tmp
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
index 597066108bfc..872cc4bf9bce 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1983,6 +1983,80 @@ define float @hadd32_16_optsize(<16 x float> %x225) optsize {
   ret float %x230
 }
 
+define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 {
+; SSE3-LABEL: hadd32_4_pgso:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm1
+; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-NEXT:    addps %xmm0, %xmm1
+; SSE3-NEXT:    haddps %xmm1, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; AVX-LABEL: hadd32_4_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %x227 = fadd <4 x float> %x225, %x226
+  %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %x229 = fadd <4 x float> %x227, %x228
+  %x230 = extractelement <4 x float> %x229, i32 0
+  ret float %x230
+}
+
+define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 {
+; SSE3-LABEL: hadd32_8_pgso:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm1
+; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-NEXT:    addps %xmm0, %xmm1
+; SSE3-NEXT:    haddps %xmm1, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; AVX-LABEL: hadd32_8_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x227 = fadd <8 x float> %x225, %x226
+  %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x229 = fadd <8 x float> %x227, %x228
+  %x230 = extractelement <8 x float> %x229, i32 0
+  ret float %x230
+}
+
+define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 {
+; SSE3-LABEL: hadd32_16_pgso:
+; SSE3:       # %bb.0:
+; SSE3-NEXT:    movaps %xmm0, %xmm1
+; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-NEXT:    addps %xmm0, %xmm1
+; SSE3-NEXT:    haddps %xmm1, %xmm1
+; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    retq
+;
+; AVX-LABEL: hadd32_16_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x227 = fadd <16 x float> %x225, %x226
+  %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %x229 = fadd <16 x float> %x227, %x228
+  %x230 = extractelement <16 x float> %x229, i32 0
+  ret float %x230
+}
+
 define float @partial_reduction_fadd_v8f32(<8 x float> %x) {
 ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
 ; SSE3-SLOW:       # %bb.0:
@@ -2115,3 +2189,20 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) {
   %r = extractelement <16 x float> %x0123, i32 0
   ret float %r
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/immediate_merging.ll b/llvm/test/CodeGen/X86/immediate_merging.ll
index a6e36c73467e..1bed1014f94e 100644
--- a/llvm/test/CodeGen/X86/immediate_merging.ll
+++ b/llvm/test/CodeGen/X86/immediate_merging.ll
@@ -73,6 +73,68 @@ if.end:                                           ; preds = %if.then, %entry
   ret i32 0
 }
 
+; Test PGSO to make sure immediates with multiple users don't get pulled into
+; instructions.
+define i32 @foo_pgso() !prof !14 {
+; X86-LABEL: foo_pgso:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $1234, %eax # imm = 0x4D2
+; X86-NEXT:    movl %eax, a
+; X86-NEXT:    movl %eax, b
+; X86-NEXT:    movl $12, %eax
+; X86-NEXT:    movl %eax, c
+; X86-NEXT:    cmpl %eax, e
+; X86-NEXT:    jne .LBB1_2
+; X86-NEXT:  # %bb.1: # %if.then
+; X86-NEXT:    movl $1, x
+; X86-NEXT:  .LBB1_2: # %if.end
+; X86-NEXT:    movl $1234, f # imm = 0x4D2
+; X86-NEXT:    movl $555, %eax # imm = 0x22B
+; X86-NEXT:    movl %eax, h
+; X86-NEXT:    addl %eax, i
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: foo_pgso:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl $1234, %eax # imm = 0x4D2
+; X64-NEXT:    movl %eax, {{.*}}(%rip)
+; X64-NEXT:    movl %eax, {{.*}}(%rip)
+; X64-NEXT:    movl $12, %eax
+; X64-NEXT:    movl %eax, {{.*}}(%rip)
+; X64-NEXT:    cmpl %eax, {{.*}}(%rip)
+; X64-NEXT:    jne .LBB1_2
+; X64-NEXT:  # %bb.1: # %if.then
+; X64-NEXT:    movl $1, {{.*}}(%rip)
+; X64-NEXT:  .LBB1_2: # %if.end
+; X64-NEXT:    movl $1234, {{.*}}(%rip) # imm = 0x4D2
+; X64-NEXT:    movl $555, %eax # imm = 0x22B
+; X64-NEXT:    movl %eax, {{.*}}(%rip)
+; X64-NEXT:    addl %eax, {{.*}}(%rip)
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+entry:
+  store i32 1234, i32* @a
+  store i32 1234, i32* @b
+  store i32 12, i32* @c
+  %0 = load i32, i32* @e
+  %cmp = icmp eq i32 %0, 12
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store i32 1, i32* @x
+  br label %if.end
+
+; New block. Make sure the 1234 from above isn't kept live across basic blocks.
+if.end:                                           ; preds = %if.then, %entry
+  store i32 1234, i32* @f
+  store i32 555, i32* @h
+  %1 = load i32, i32* @i
+  %add1 = add nsw i32 %1, 555
+  store i32 %add1, i32* @i
+  ret i32 0
+}
+
 ; Test -O2 to make sure that all immediates get pulled into their users.
 define i32 @foo2() {
 ; X86-LABEL: foo2:
@@ -124,3 +186,47 @@ entry:
   call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false)
   ret void
 }
+
+; memset gets lowered in the DAG. Constant merging should hoist the shared
+; immediate used by the stores to the individual memory locations; make sure
+; we don't store the immediate directly at each location.
+define void @foomemset_pgso() !prof !14 {
+; X86-LABEL: foomemset_pgso:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $555819297, %eax # imm = 0x21212121
+; X86-NEXT:    movl %eax, AA+20
+; X86-NEXT:    movl %eax, AA+16
+; X86-NEXT:    movl %eax, AA+12
+; X86-NEXT:    movl %eax, AA+8
+; X86-NEXT:    movl %eax, AA+4
+; X86-NEXT:    movl %eax, AA
+; X86-NEXT:    retl
+;
+; X64-LABEL: foomemset_pgso:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movabsq $2387225703656530209, %rax # imm = 0x2121212121212121
+; X64-NEXT:    movq %rax, AA+{{.*}}(%rip)
+; X64-NEXT:    movq %rax, AA+{{.*}}(%rip)
+; X64-NEXT:    movq %rax, {{.*}}(%rip)
+; X64-NEXT:    retq
+entry:
+  call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i1 false)
+  ret void
+}
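+
+; The checks above verify the hoisting: the 0x21212121 pattern is materialized
+; once into a register, and that register is stored six times on x86 (or, on
+; x86-64, a single movabsq feeds the three 8-byte stores).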
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/immediate_merging64.ll b/llvm/test/CodeGen/X86/immediate_merging64.ll
index 12be8bdff83a..a807a119e893 100644
--- a/llvm/test/CodeGen/X86/immediate_merging64.ll
+++ b/llvm/test/CodeGen/X86/immediate_merging64.ll
@@ -19,6 +19,19 @@ define i1 @imm_multiple_users(i64 %a, i64* %b) optsize {
   ret i1 %cmp
 }
 
+define i1 @imm_multiple_users_pgso(i64 %a, i64* %b) !prof !14 {
+; CHECK-LABEL: imm_multiple_users_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq $-1, %rax
+; CHECK-NEXT:    movq %rax, (%rsi)
+; CHECK-NEXT:    cmpq %rax, %rdi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  store i64 -1, i64* %b, align 8
+  %cmp = icmp eq i64 %a, -1
+  ret i1 %cmp
+}
+
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
 
 ; Inlined memsets requiring multiple same-sized stores should be lowered using
@@ -34,3 +47,31 @@ define void @memset_zero(i8* noalias nocapture %D) optsize {
   tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false)
   ret void
 }
+
+define void @memset_zero_pgso(i8* noalias nocapture %D) !prof !14 {
+; CHECK-LABEL: memset_zero_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movq %rax, 7(%rdi)
+; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memset.p0i8.i64(i8* %D, i8 0, i64 15, i1 false)
+  ret void
+}
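+
+; The 15-byte memset becomes two overlapping 8-byte stores: the store at
+; offset 7 and the store at offset 0 together cover bytes 0..14, avoiding a
+; separate 4-/2-/1-byte store tail.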
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/loop-blocks.ll b/llvm/test/CodeGen/X86/loop-blocks.ll
index a5e806d936b7..e970061c0cf7 100644
--- a/llvm/test/CodeGen/X86/loop-blocks.ll
+++ b/llvm/test/CodeGen/X86/loop-blocks.ll
@@ -269,6 +269,35 @@ exit:
 
 attributes #0 = { minsize norecurse nounwind optsize readnone uwtable }
 
+; CHECK-LABEL: slightly_more_involved_2_pgso:
+; CHECK-NOT:      jmp .LBB6_1
+; CHECK:          .LBB6_1:
+; CHECK-NEXT:     callq body
+
+define void @slightly_more_involved_2_pgso() norecurse nounwind readnone uwtable !prof !14 {
+entry:
+  br label %loop
+
+loop:
+  call void @body()
+  %t0 = call i32 @get()
+  %t1 = icmp slt i32 %t0, 2
+  br i1 %t1, label %block_a, label %bb
+
+bb:
+  %t2 = call i32 @get()
+  %t3 = icmp slt i32 %t2, 99
+  br i1 %t3, label %exit, label %loop
+
+block_a:
+  call void @bar99()
+  br label %loop
+
+exit:
+  call void @exit()
+  ret void
+}
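+
+; With a zero entry count the function is laid out for size: as the CHECK-NOT
+; above verifies, the entry falls straight through into the .LBB6_1 loop
+; header rather than reaching it via an extra unconditional jmp.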
+
 declare void @bar99() nounwind
 declare void @bar100() nounwind
 declare void @bar101() nounwind
@@ -281,3 +310,20 @@ declare i32 @get() nounwind
 declare void @block_a_true_func() nounwind
 declare void @block_a_false_func() nounwind
 declare void @block_a_merge_func() nounwind
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/materialize.ll b/llvm/test/CodeGen/X86/materialize.ll
index 6e1264b4fd43..429d578feccc 100644
--- a/llvm/test/CodeGen/X86/materialize.ll
+++ b/llvm/test/CodeGen/X86/materialize.ll
@@ -30,6 +30,21 @@ entry:
 ; CHECK64-NEXT:  retq
 }
 
+define i32 @one32_pgso() !prof !14 {
+entry:
+  ret i32 1
+
+; CHECK32-LABEL: one32_pgso:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  incl %eax
+; CHECK32-NEXT:  retl
+
+; FIXME: Figure out the best approach in 64-bit mode.
+; CHECK64-LABEL: one32_pgso:
+; CHECK64:       movl $1, %eax
+; CHECK64-NEXT:  retq
+}
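+
+; In 32-bit mode, "xorl %eax, %eax" (2 bytes) plus "incl %eax" (1 byte) is
+; two bytes smaller than the 5-byte "movl $1, %eax", which is why PGSO picks
+; it here.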
+
 define i32 @one32_minsize() minsize {
 entry:
   ret i32 1
@@ -107,6 +122,16 @@ entry:
 ; CHECK32-NEXT:  retl
 }
 
+define i32 @minus_one32_pgso() !prof !14 {
+entry:
+  ret i32 -1
+
+; CHECK32-LABEL: minus_one32_pgso:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NEXT:  retl
+}
+
 define i32 @minus_one32_minsize() minsize {
 entry:
   ret i32 -1
@@ -140,6 +165,28 @@ entry:
 ; CHECK32-NEXT:  retl
 }
 
+define i16 @one16_pgso() !prof !14 {
+entry:
+  ret i16 1
+
+; CHECK32-LABEL: one16_pgso:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  incl %eax
+; CHECK32-NEXT:  # kill
+; CHECK32-NEXT:  retl
+}
+
+define i16 @minus_one16_pgso() !prof !14 {
+entry:
+  ret i16 -1
+
+; CHECK32-LABEL: minus_one16_pgso:
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NEXT:  # kill
+; CHECK32-NEXT:  retl
+}
+
 define i32 @minus_five32() minsize {
 entry:
   ret i32 -5
@@ -213,4 +260,72 @@ entry:
 ; CHECK32:       retl
 }
 
+define i32 @rematerialize_minus_one_pgso() !prof !14 {
+entry:
+  ; Materialize -1 (thiscall forces it into %ecx).
+  tail call x86_thiscallcc void @f(i32 -1)
+
+  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+  ; spilling it to the stack.
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+  ; -1 should be re-materialized here instead of getting spilled above.
+  ret i32 -1
+
+; CHECK32-LABEL: rematerialize_minus_one_pgso
+; CHECK32:       xorl %ecx, %ecx
+; CHECK32-NEXT:  decl %ecx
+; CHECK32:       calll
+; CHECK32:       xorl %eax, %eax
+; CHECK32-NEXT:  decl %eax
+; CHECK32-NOT:   %eax
+; CHECK32:       retl
+}
+
+define i32 @rematerialize_minus_one_eflags_pgso(i32 %x) !prof !14 {
+entry:
+  ; Materialize -1 (thiscall forces it into %ecx).
+  tail call x86_thiscallcc void @f(i32 -1)
+
+  ; Clobber all registers except %esp, leaving nowhere to store the -1 besides
+  ; spilling it to the stack.
+  tail call void asm sideeffect "", "~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{ebp},~{dirflag},~{fpsr},~{flags}"()
+
+  ; Define eflags.
+  %a = icmp ne i32 %x, 123
+  %b = zext i1 %a to i32
+  ; Cause -1 to be rematerialized right in front of the cmov, which needs eflags.
+  ; It must therefore not use the xor-dec lowering.
+  %c = select i1 %a, i32 %b, i32 -1
+  ret i32 %c
+
+; CHECK32-LABEL: rematerialize_minus_one_eflags_pgso
+; CHECK32:       xorl %ecx, %ecx
+; CHECK32-NEXT:  decl %ecx
+; CHECK32:       calll
+; CHECK32:       cmpl
+; CHECK32:       setne
+; CHECK32-NOT:   xorl
+; CHECK32:       movl $-1
+; CHECK32:       cmov
+; CHECK32:       retl
+}
+
 declare x86_thiscallcc void @f(i32)
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
new file mode 100644
index 000000000000..c3c3a1db8171
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -0,0 +1,1064 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+
+; This tests codegen-time inlining/optimization of memcmp.
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare i32 @memcmp(i8*, i8*, i64)
+declare i32 @bcmp(i8*, i8*, i64)
+
+define i32 @length2(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    rolw $8, %cx
+; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    movzwl %cx, %eax
+; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: length2:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    movzwl (%rsi), %ecx
+; X64-NEXT:    rolw $8, %ax
+; X64-NEXT:    rolw $8, %cx
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length2_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    cmpw (%eax), %cx
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length2_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    cmpw (%rsi), %ax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(i8* %X) nounwind !prof !14 {
+; X86-LABEL: length2_eq_const:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %eax
+; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length2_eq_const:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length2_eq_nobuiltin_attr:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $2
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length2_eq_nobuiltin_attr:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
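+
+; The nobuiltin attribute on the call above suppresses the inline memcmp
+; expansion, so a real call to memcmp is emitted even for a constant length
+; of 2.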
+
+define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length3:
+; X86:       # %bb.0: # %loadbb
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    movzwl (%ecx), %esi
+; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    rolw $8, %si
+; X86-NEXT:    cmpw %si, %dx
+; X86-NEXT:    jne .LBB4_1
+; X86-NEXT:  # %bb.2: # %loadbb1
+; X86-NEXT:    movzbl 2(%eax), %eax
+; X86-NEXT:    movzbl 2(%ecx), %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    jmp .LBB4_3
+; X86-NEXT:  .LBB4_1: # %res_block
+; X86-NEXT:    setae %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB4_3: # %endblock
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: length3:
+; X64:       # %bb.0: # %loadbb
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    movzwl (%rsi), %ecx
+; X64-NEXT:    rolw $8, %ax
+; X64-NEXT:    rolw $8, %cx
+; X64-NEXT:    cmpw %cx, %ax
+; X64-NEXT:    jne .LBB4_1
+; X64-NEXT:  # %bb.2: # %loadbb1
+; X64-NEXT:    movzbl 2(%rdi), %eax
+; X64-NEXT:    movzbl 2(%rsi), %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB4_1: # %res_block
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length3_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %edx
+; X86-NEXT:    xorw (%eax), %dx
+; X86-NEXT:    movb 2(%ecx), %cl
+; X86-NEXT:    xorb 2(%eax), %cl
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    orw %dx, %ax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length3_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    xorw (%rsi), %ax
+; X64-NEXT:    movb 2(%rdi), %cl
+; X64-NEXT:    xorb 2(%rsi), %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    orw %ax, %cx
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length4:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %ecx
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbl $0, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: length4:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %ecx
+; X64-NEXT:    movl (%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %edx, %ecx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length4_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %ecx
+; X86-NEXT:    cmpl (%eax), %ecx
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length4_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    cmpl (%rsi), %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(i8* %X) nounwind !prof !14 {
+; X86-LABEL: length4_eq_const:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length4_eq_const:
+; X64:       # %bb.0:
+; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length5:
+; X86:       # %bb.0: # %loadbb
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    movl (%ecx), %esi
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    jne .LBB9_1
+; X86-NEXT:  # %bb.2: # %loadbb1
+; X86-NEXT:    movzbl 4(%eax), %eax
+; X86-NEXT:    movzbl 4(%ecx), %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    jmp .LBB9_3
+; X86-NEXT:  .LBB9_1: # %res_block
+; X86-NEXT:    setae %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB9_3: # %endblock
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: length5:
+; X64:       # %bb.0: # %loadbb
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    bswapl %eax
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    cmpl %ecx, %eax
+; X64-NEXT:    jne .LBB9_1
+; X64-NEXT:  # %bb.2: # %loadbb1
+; X64-NEXT:    movzbl 4(%rdi), %eax
+; X64-NEXT:    movzbl 4(%rsi), %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB9_1: # %res_block
+; X64-NEXT:    setae %al
+; X64-NEXT:    movzbl %al, %eax
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length5_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    xorl (%eax), %edx
+; X86-NEXT:    movb 4(%ecx), %cl
+; X86-NEXT:    xorb 4(%eax), %cl
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length5_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    xorl (%rsi), %eax
+; X64-NEXT:    movb 4(%rdi), %cl
+; X64-NEXT:    xorb 4(%rsi), %cl
+; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length8:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl (%esi), %ecx
+; X86-NEXT:    movl (%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    jne .LBB11_2
+; X86-NEXT:  # %bb.1: # %loadbb1
+; X86-NEXT:    movl 4(%esi), %ecx
+; X86-NEXT:    movl 4(%eax), %edx
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    je .LBB11_3
+; X86-NEXT:  .LBB11_2: # %res_block
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    cmpl %edx, %ecx
+; X86-NEXT:    setae %al
+; X86-NEXT:    leal -1(%eax,%eax), %eax
+; X86-NEXT:  .LBB11_3: # %endblock
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: length8:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    seta %al
+; X64-NEXT:    sbbl $0, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length8_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    movl 4(%ecx), %ecx
+; X86-NEXT:    xorl (%eax), %edx
+; X86-NEXT:    xorl 4(%eax), %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length8_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    cmpq (%rsi), %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(i8* %X) nounwind !prof !14 {
+; X86-LABEL: length8_eq_const:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
+; X86-NEXT:    xorl (%eax), %ecx
+; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
+; X86-NEXT:    xorl 4(%eax), %edx
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length8_eq_const:
+; X64:       # %bb.0:
+; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
+; X64-NEXT:    cmpq %rax, (%rdi)
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length12_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $12
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: length12_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    xorq (%rsi), %rax
+; X64-NEXT:    movl 8(%rdi), %ecx
+; X64-NEXT:    xorl 8(%rsi), %ecx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length12:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $12
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length12:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB15_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movl 8(%rdi), %ecx
+; X64-NEXT:    movl 8(%rsi), %edx
+; X64-NEXT:    bswapl %ecx
+; X64-NEXT:    bswapl %edx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    je .LBB15_3
+; X64-NEXT:  .LBB15_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB15_3: # %endblock
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length16:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $16
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length16:
+; X64:       # %bb.0:
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    jne .LBB16_2
+; X64-NEXT:  # %bb.1: # %loadbb1
+; X64-NEXT:    movq 8(%rdi), %rcx
+; X64-NEXT:    movq 8(%rsi), %rdx
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    je .LBB16_3
+; X64-NEXT:  .LBB16_2: # %res_block
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    setae %al
+; X64-NEXT:    leal -1(%rax,%rax), %eax
+; X64-NEXT:  .LBB16_3: # %endblock
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(i8* %x, i8* %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length16_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length16_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    setne %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(i8* %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length16_eq_const:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    sete %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length24:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $24
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length24:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $24, %edx
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(i8* %x, i8* %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length24_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    sete %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length24_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(i8* %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length24_eq_const:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    setne %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length24_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vpxor {{.*}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length32:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $32
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $32, %edx
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(i8* %x, i8* %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length32_eq:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
+; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    sete %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length32_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX1-LABEL: length32_eq:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
+; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX1-NEXT:    sete %al
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: length32_eq:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX2-NEXT:    sete %al
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(i8* %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $0
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $16, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE2-LABEL: length32_eq_const:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
+; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm1
+; X86-SSE2-NEXT:    pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT:    pand %xmm1, %xmm0
+; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X86-SSE2-NEXT:    setne %al
+; X86-SSE2-NEXT:    retl
+;
+; X64-SSE2-LABEL: length32_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX1-LABEL: length32_eq_const:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX1-NEXT:    setne %al
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: length32_eq_const:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX2-NEXT:    setne %al
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: length64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $64
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: length64:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $64, %edx
+; X64-NEXT:    jmp memcmp # TAILCALL
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(i8* %x, i8* %y) nounwind !prof !14 {
+; X86-LABEL: length64_eq:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $64
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movl $64, %edx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX1-LABEL: length64_eq:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
+; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
+; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX1-NEXT:    setne %al
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: length64_eq:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
+; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX2-NEXT:    setne %al
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(i8* %X) nounwind !prof !14 {
+; X86-LABEL: length64_eq_const:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl $64
+; X86-NEXT:    pushl $.L.str
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    calll memcmp
+; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movl $.L.str, %esi
+; X64-SSE2-NEXT:    movl $64, %edx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX1-LABEL: length64_eq_const:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
+; X64-AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX1-NEXT:    sete %al
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: length64_eq_const:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
+; X64-AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX2-NEXT:    sete %al
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(i8* %X, i8* %Y) nounwind !prof !14 {
+; X86-LABEL: bcmp_length2:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    movzwl (%eax), %edx
+; X86-NEXT:    rolw $8, %cx
+; X86-NEXT:    rolw $8, %dx
+; X86-NEXT:    movzwl %cx, %eax
+; X86-NEXT:    movzwl %dx, %ecx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: bcmp_length2:
+; X64:       # %bb.0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    movzwl (%rsi), %ecx
+; X64-NEXT:    rolw $8, %ax
+; X64-NEXT:    rolw $8, %cx
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movzwl %cx, %ecx
+; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    retq
+  %m = tail call i32 @bcmp(i8* %X, i8* %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
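(Aside for anyone skimming the new tests: the ProfileSummary / !prof !14 boilerplate
above is what makes PGSO fire. The DetailedSummary triple !{i32 999999, i64 1, i32 2}
puts the count cutoff at the default cold percentile at 1, so a function_entry_count
of 0 classifies the function as cold and the PGSO query, llvm::shouldOptimizeForSize
if I recall the entry point correctly, answers true. A distilled sketch follows;
it is illustrative only and not part of the patch, and the function name is made up.
The summary metadata is the same as in the tests above.

; pgso-cold-sketch.ll -- minimal illustrative module, not part of this patch.
define i32 @cold_fn(i32 %x) !prof !14 {
  ; entry count 0 is below the cold count cutoff, so code generation is
  ; expected to treat this function as if it carried optsize.
  %r = add i32 %x, 1
  ret i32 %r
}

!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0})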

diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll
index 531117dbcda4..4c0a937a7031 100644
--- a/llvm/test/CodeGen/X86/memcpy.ll
+++ b/llvm/test/CodeGen/X86/memcpy.ll
@@ -139,6 +139,36 @@ entry:
   ret void
 }
 
+define void @test3_pgso(i8* nocapture %A, i8* nocapture %B) nounwind noredzone !prof !14 {
+; LINUX-LABEL: test3_pgso:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    movl $64, %edx
+; LINUX-NEXT:    jmp memcpy # TAILCALL
+;
+; DARWIN-LABEL: test3_pgso:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    movq 56(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 56(%rdi)
+; DARWIN-NEXT:    movq 48(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 48(%rdi)
+; DARWIN-NEXT:    movq 40(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 40(%rdi)
+; DARWIN-NEXT:    movq 32(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 32(%rdi)
+; DARWIN-NEXT:    movq 24(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 24(%rdi)
+; DARWIN-NEXT:    movq 16(%rsi), %rax
+; DARWIN-NEXT:    movq %rax, 16(%rdi)
+; DARWIN-NEXT:    movq (%rsi), %rax
+; DARWIN-NEXT:    movq 8(%rsi), %rcx
+; DARWIN-NEXT:    movq %rcx, 8(%rdi)
+; DARWIN-NEXT:    movq %rax, (%rdi)
+; DARWIN-NEXT:    retq
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i1 false)
+  ret void
+}
+
 define void @test3_minsize(i8* nocapture %A, i8* nocapture %B) nounwind minsize noredzone {
 ; DARWIN-LABEL: test3_minsize:
 ; DARWIN:       ## %bb.0:
@@ -506,3 +536,20 @@ define void @addrspace256(i8 addrspace(256)* %a, i8 addrspace(256)* %b) nounwind
   tail call void @llvm.memcpy.p256i8.p256i8.i64(i8 addrspace(256)* align 8 %a, i8 addrspace(256)* align 8 %b, i64 16, i1 false)
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/powi.ll b/llvm/test/CodeGen/X86/powi.ll
index fe2cfb0bdcc7..7a5282337040 100644
--- a/llvm/test/CodeGen/X86/powi.ll
+++ b/llvm/test/CodeGen/X86/powi.ll
@@ -86,6 +86,39 @@ define double @pow_wrapper_optsize(double %a) optsize {
   ret double %ret
 }
 
+define double @pow_wrapper_pgso(double %a) !prof !14 {
+; X86-X87-LABEL: pow_wrapper_pgso:
+; X86-X87:       # %bb.0:
+; X86-X87-NEXT:    subl $12, %esp
+; X86-X87-NEXT:    .cfi_def_cfa_offset 16
+; X86-X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-X87-NEXT:    fstpl (%esp)
+; X86-X87-NEXT:    movl $15, {{[0-9]+}}(%esp)
+; X86-X87-NEXT:    calll __powidf2
+; X86-X87-NEXT:    addl $12, %esp
+; X86-X87-NEXT:    .cfi_def_cfa_offset 4
+; X86-X87-NEXT:    retl
+;
+; X86-SSE-LABEL: pow_wrapper_pgso:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 16
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE-NEXT:    movl $15, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    calll __powidf2
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE-NEXT:    retl
+;
+; X64-LABEL: pow_wrapper_pgso:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $15, %edi
+; X64-NEXT:    jmp __powidf2 # TAILCALL
+  %ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
+  ret double %ret
+}
+
 define double @pow_wrapper_minsize(double %a) minsize {
 ; X86-X87-LABEL: pow_wrapper_minsize:
 ; X86-X87:       # %bb.0:
@@ -124,3 +157,19 @@ define double @pow_wrapper_minsize(double %a) minsize {
 
 declare double @llvm.powi.f64(double, i32) nounwind readonly
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll
index ded613b7f748..00a5f2d0baab 100644
--- a/llvm/test/CodeGen/X86/rounding-ops.ll
+++ b/llvm/test/CodeGen/X86/rounding-ops.ll
@@ -252,3 +252,60 @@ define double @test12(double* %xptr) nounwind optsize {
   %call = tail call double @trunc(double %x) nounwind readnone
   ret double %call
 }
+
+define float @test11_pgso(float* %xptr) nounwind !prof !14 {
+; CHECK-SSE-LABEL: test11_pgso:
+; CHECK-SSE:       ## %bb.0:
+; CHECK-SSE-NEXT:    roundss $11, (%rdi), %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX-LABEL: test11_pgso:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: test11_pgso:
+; CHECK-AVX512:       ## %bb.0:
+; CHECK-AVX512-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    retq
+  %x = load float, float* %xptr
+  %call = tail call float @truncf(float %x) nounwind readnone
+  ret float %call
+}
+
+define double @test12_pgso(double* %xptr) nounwind !prof !14 {
+; CHECK-SSE-LABEL: test12_pgso:
+; CHECK-SSE:       ## %bb.0:
+; CHECK-SSE-NEXT:    roundsd $11, (%rdi), %xmm0
+; CHECK-SSE-NEXT:    retq
+;
+; CHECK-AVX-LABEL: test12_pgso:
+; CHECK-AVX:       ## %bb.0:
+; CHECK-AVX-NEXT:    vroundsd $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX-NEXT:    retq
+;
+; CHECK-AVX512-LABEL: test12_pgso:
+; CHECK-AVX512:       ## %bb.0:
+; CHECK-AVX512-NEXT:    vroundsd $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    retq
+  %x = load double, double* %xptr
+  %call = tail call double @trunc(double %x) nounwind readnone
+  ret double %call
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/shrink-compare-pgso.ll b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll
new file mode 100644
index 000000000000..337f7d11e954
--- /dev/null
+++ b/llvm/test/CodeGen/X86/shrink-compare-pgso.ll
@@ -0,0 +1,321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+
+declare void @bar()
+
+define void @test1(i32* nocapture %X) nounwind !prof !14 {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $47, (%rdi)
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %tmp1 = load i32, i32* %X, align 4
+  %and = and i32 %tmp1, 255
+  %cmp = icmp eq i32 %and, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test2(i32 %X) nounwind !prof !14 {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $47, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test3(i32 %X) nounwind !prof !14 {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-1, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 255
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; PR16083
+define i1 @test4(i64 %a, i32 %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    testl %esi, %esi
+; CHECK-NEXT:    je .LBB3_1
+; CHECK-NEXT:  # %bb.2: # %lor.end
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB3_1: # %lor.rhs
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+entry:
+  %tobool = icmp ne i32 %b, 0
+  br i1 %tobool, label %lor.end, label %lor.rhs
+
+lor.rhs:                                          ; preds = %entry
+  %and = and i64 0, %a
+  %tobool1 = icmp ne i64 %and, 0
+  br label %lor.end
+
+lor.end:                                          ; preds = %lor.rhs, %entry
+  %p = phi i1 [ true, %entry ], [ %tobool1, %lor.rhs ]
+  ret i1 %p
+}
+
+@x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4
+
+; PR16551
+define void @test5(i32 %X) nounwind !prof !14 {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl x+{{.*}}(%rip), %eax
+; CHECK-NEXT:    shll $16, %eax
+; CHECK-NEXT:    movzwl x+{{.*}}(%rip), %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    cmpl $1, %ecx
+; CHECK-NEXT:    jne bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %bf.load = load i56, i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4
+  %bf.lshr = lshr i56 %bf.load, 32
+  %bf.cast = trunc i56 %bf.lshr to i32
+  %cmp = icmp ne i32 %bf.cast, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test2_1(i32 %X) nounwind !prof !14 {
+; CHECK-LABEL: test2_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    cmpl $256, %eax # imm = 0x100
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 256
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_1(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $1, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_47(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_47:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $47, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_127(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_127:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $127, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 127
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_neg1(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_neg1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-1, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_neg2(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_neg2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-2, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -2
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_neg127(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_neg127:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-127, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -127
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_neg128(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_neg128:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    cmpb $-128, %dil
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, -128
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @test_sext_i8_icmp_255(i8 %x) nounwind !prof !14 {
+; CHECK-LABEL: test_sext_i8_icmp_255:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je bar # TAILCALL
+; CHECK-NEXT:  # %bb.1: # %if.end
+; CHECK-NEXT:    retq
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 255
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
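(For contrast, a sketch of the opposite case -- again illustrative and not part of
the patch: bump the entry count into the hot range and the function should no longer
be classified cold, so the size-driven transforms exercised above would not apply
on that basis. The function name and the count value are made up; this assumes the
same !llvm.module.flags ProfileSummary as in the file above, under whose thresholds
an entry count of 1000 is comfortably hot.

; Same module flags / summary as the file above; only the entry count differs.
define void @hot_fn(i32 %X) nounwind !prof !15 {
entry:
  %and = and i32 %X, 255
  %cmp = icmp eq i32 %and, 47
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @bar() nounwind
  br label %if.end

if.end:
  ret void
}

!15 = !{!"function_entry_count", i64 1000})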

diff --git a/llvm/test/CodeGen/X86/slow-incdec.ll b/llvm/test/CodeGen/X86/slow-incdec.ll
index 83bf250d44c3..6f4ad98a59a9 100644
--- a/llvm/test/CodeGen/X86/slow-incdec.ll
+++ b/llvm/test/CodeGen/X86/slow-incdec.ll
@@ -54,6 +54,26 @@ define i32 @dec_size(i32 %x) optsize {
   ret i32 %r
 }
 
+define i32 @inc_pgso(i32 %x) !prof !14 {
+; CHECK-LABEL: inc_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    incl %eax
+; CHECK-NEXT:    retl
+  %r = add i32 %x, 1
+  ret i32 %r
+}
+
+define i32 @dec_pgso(i32 %x) !prof !14 {
+; CHECK-LABEL: dec_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    decl %eax
+; CHECK-NEXT:    retl
+  %r = add i32 %x, -1
+  ret i32 %r
+}
+
 declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
 declare void @other(i32* ) nounwind;
 
@@ -62,20 +82,20 @@ define void @cond_ae_to_cond_ne(i32* %p) nounwind {
 ; INCDEC:       # %bb.0: # %entry
 ; INCDEC-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; INCDEC-NEXT:    incl (%eax)
-; INCDEC-NEXT:    jne .LBB4_1
+; INCDEC-NEXT:    jne .LBB6_1
 ; INCDEC-NEXT:  # %bb.2: # %if.end4
 ; INCDEC-NEXT:    jmp other # TAILCALL
-; INCDEC-NEXT:  .LBB4_1: # %return
+; INCDEC-NEXT:  .LBB6_1: # %return
 ; INCDEC-NEXT:    retl
 ;
 ; ADD-LABEL: cond_ae_to_cond_ne:
 ; ADD:       # %bb.0: # %entry
 ; ADD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; ADD-NEXT:    addl $1, (%eax)
-; ADD-NEXT:    jne .LBB4_1
+; ADD-NEXT:    jne .LBB6_1
 ; ADD-NEXT:  # %bb.2: # %if.end4
 ; ADD-NEXT:    jmp other # TAILCALL
-; ADD-NEXT:  .LBB4_1: # %return
+; ADD-NEXT:  .LBB6_1: # %return
 ; ADD-NEXT:    retl
 entry:
   %t0 = load i32, i32* %p, align 8
@@ -109,10 +129,10 @@ define void @test_tail_call(i32* %ptr) nounwind {
 ; INCDEC-NEXT:    incb a
 ; INCDEC-NEXT:    sete d
 ; INCDEC-NEXT:    testb %al, %al
-; INCDEC-NEXT:    jne .LBB5_2
+; INCDEC-NEXT:    jne .LBB7_2
 ; INCDEC-NEXT:  # %bb.1: # %then
 ; INCDEC-NEXT:    jmp external_a # TAILCALL
-; INCDEC-NEXT:  .LBB5_2: # %else
+; INCDEC-NEXT:  .LBB7_2: # %else
 ; INCDEC-NEXT:    jmp external_b # TAILCALL
 ;
 ; ADD-LABEL: test_tail_call:
@@ -123,10 +143,10 @@ define void @test_tail_call(i32* %ptr) nounwind {
 ; ADD-NEXT:    addb $1, a
 ; ADD-NEXT:    sete d
 ; ADD-NEXT:    testb %al, %al
-; ADD-NEXT:    jne .LBB5_2
+; ADD-NEXT:    jne .LBB7_2
 ; ADD-NEXT:  # %bb.1: # %then
 ; ADD-NEXT:    jmp external_a # TAILCALL
-; ADD-NEXT:  .LBB5_2: # %else
+; ADD-NEXT:  .LBB7_2: # %else
 ; ADD-NEXT:    jmp external_b # TAILCALL
 entry:
   %val = load i32, i32* %ptr
@@ -152,3 +172,19 @@ else:
   ret void
 }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll
index 7aae59080fd0..da68f069b078 100644
--- a/llvm/test/CodeGen/X86/splat-for-size.ll
+++ b/llvm/test/CodeGen/X86/splat-for-size.ll
@@ -17,6 +17,17 @@ define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
   ret <2 x double> %add
 }
 
+define <2 x double> @splat_v2f64_pgso(<2 x double> %x) !prof !14 {
+; CHECK-LABEL: splat_v2f64_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; CHECK-NEXT:    # xmm1 = mem[0,0]
+; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %add = fadd <2 x double> %x, <double 1.0, double 1.0>
+  ret <2 x double> %add
+}
+
 define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
 ; CHECK-LABEL: splat_v4f64:
 ; CHECK:       # %bb.0:
@@ -27,6 +38,16 @@ define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
   ret <4 x double> %add
 }
 
+define <4 x double> @splat_v4f64_pgso(<4 x double> %x) !prof !14 {
+; CHECK-LABEL: splat_v4f64_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
+  ret <4 x double> %add
+}
+
 define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: splat_v4f32:
 ; CHECK:       # %bb.0:
@@ -37,6 +58,16 @@ define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
   ret <4 x float> %add
 }
 
+define <4 x float> @splat_v4f32_pgso(<4 x float> %x) !prof !14 {
+; CHECK-LABEL: splat_v4f32_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <4 x float> %add
+}
+
 define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
 ; CHECK-LABEL: splat_v8f32:
 ; CHECK:       # %bb.0:
@@ -47,6 +78,16 @@ define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
   ret <8 x float> %add
 }
 
+define <8 x float> @splat_v8f32_pgso(<8 x float> %x) !prof !14 {
+; CHECK-LABEL: splat_v8f32_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
+  ret <8 x float> %add
+}
+
 ; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value.
 ; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq.
 define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
@@ -66,6 +107,23 @@ define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
   ret <2 x i64> %add
 }
 
+define <2 x i64> @splat_v2i64_pgso(<2 x i64> %x) !prof !14 {
+; AVX-LABEL: splat_v2i64_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [2,2]
+; AVX-NEXT:    # xmm1 = mem[0,0]
+; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v2i64_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %add = add <2 x i64> %x, <i64 2, i64 2>
+  ret <2 x i64> %add
+}
+
 ; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors,
 ; and then we fake it: use vmovddup to splat 64-bit value.
 define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
@@ -88,6 +146,26 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
   ret <4 x i64> %add
 }
 
+define <4 x i64> @splat_v4i64_pgso(<4 x i64> %x) !prof !14 {
+; AVX-LABEL: splat_v4i64_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [2,2]
+; AVX-NEXT:    # xmm2 = mem[0,0]
+; AVX-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v4i64_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2]
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %add = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2>
+  ret <4 x i64> %add
+}
+
 ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
 define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 {
 ; AVX-LABEL: splat_v4i32:
@@ -105,6 +183,22 @@ define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 {
   ret <4 x i32> %add
 }
 
+define <4 x i32> @splat_v4i32_pgso(<4 x i32> %x) !prof !14 {
+; AVX-LABEL: splat_v4i32_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2,2,2,2]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v4i32_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %add = add <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i32> %add
+}
+
 ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
 define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
 ; AVX-LABEL: splat_v8i32:
@@ -125,6 +219,25 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
   ret <8 x i32> %add
 }
 
+define <8 x i32> @splat_v8i32_pgso(<8 x i32> %x) !prof !14 {
+; AVX-LABEL: splat_v8i32_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2,2,2,2]
+; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v8i32_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %add = add <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i32> %add
+}
+
 ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
 define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 {
 ; AVX-LABEL: splat_v8i16:
@@ -141,6 +254,21 @@ define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 {
   ret <8 x i16> %add
 }
 
+define <8 x i16> @splat_v8i16_pgso(<8 x i16> %x) !prof !14 {
+; AVX-LABEL: splat_v8i16_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v8i16_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %add = add <8 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <8 x i16> %add
+}
+
 ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
 define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
 ; AVX-LABEL: splat_v16i16:
@@ -161,6 +289,25 @@ define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
   ret <16 x i16> %add
 }
 
+define <16 x i16> @splat_v16i16_pgso(<16 x i16> %x) !prof !14 {
+; AVX-LABEL: splat_v16i16_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v16i16_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %add = add <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <16 x i16> %add
+}
+
 ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
 define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 {
 ; AVX-LABEL: splat_v16i8:
@@ -177,6 +324,21 @@ define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 {
   ret <16 x i8> %add
 }
 
+define <16 x i8> @splat_v16i8_pgso(<16 x i8> %x) !prof !14 {
+; AVX-LABEL: splat_v16i8_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v16i8_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %add = add <16 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <16 x i8> %add
+}
+
 ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
 define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
 ; AVX-LABEL: splat_v32i8:
@@ -197,6 +359,25 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
   ret <32 x i8> %add
 }
 
+define <32 x i8> @splat_v32i8_pgso(<32 x i8> %x) !prof !14 {
+; AVX-LABEL: splat_v32i8_pgso:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: splat_v32i8_pgso:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %add = add <32 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <32 x i8> %add
+}
+
 ; PR23259: Verify that ISel doesn't crash with a 'fatal error in backend'
 ; due to a missing AVX pattern to select a v2i64 X86ISD::BROADCAST of a
 ; loadi64 with multiple uses.
@@ -238,3 +419,20 @@ entry:
 
 attributes #0 = { optsize }
 attributes #1 = { minsize }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
index 97aa87e9b1db..12e276e6b334 100644
--- a/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
+++ b/llvm/test/CodeGen/X86/store-zero-and-minus-one.ll
@@ -19,6 +19,23 @@ entry:
 
 }
 
+define void @zero_pgso(i32* %p) !prof !14 {
+; CHECK32-LABEL: zero_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl $0, (%eax)
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: zero_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $0, (%rdi)
+; CHECK64-NEXT:    retq
+entry:
+  store i32 0, i32* %p
+  ret void
+
+}
+
 define void @minus_one_optsize(i32* %p) optsize {
 ; CHECK32-LABEL: minus_one_optsize:
 ; CHECK32:       # %bb.0: # %entry
@@ -36,6 +53,22 @@ entry:
 
 }
 
+define void @minus_one_pgso(i32* %p) !prof !14 {
+; CHECK32-LABEL: minus_one_pgso:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl $-1, (%eax)
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: minus_one_pgso:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    movl $-1, (%rdi)
+; CHECK64-NEXT:    retq
+entry:
+  store i32 -1, i32* %p
+  ret void
+
+}
 
 define void @zero_64(i64* %p) minsize {
 ; CHECK32-LABEL: zero_64:
@@ -244,3 +277,20 @@ entry:
   store volatile i16 -1, i16* %p
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/switch-density.ll b/llvm/test/CodeGen/X86/switch-density.ll
index 52216fb4d7c2..8055a2b87643 100644
--- a/llvm/test/CodeGen/X86/switch-density.ll
+++ b/llvm/test/CodeGen/X86/switch-density.ll
@@ -79,3 +79,72 @@ return: ret void
 ; CHECK: ja
 ; CHECK: jmpq *.LJTI
 }
+
+define void @dense_optsize(i32 %x) optsize {
+entry:
+  switch i32 %x, label %return [
+    i32 12, label %bb0
+    i32 4,  label %bb1
+    i32 16, label %bb1
+    i32 20, label %bb2
+    i32 8,  label %bb3
+  ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+bb3: tail call void @g(i32 2) br label %return
+return: ret void
+
+; Lowered as branches.
+; CHECK-LABEL: dense_optsize
+; CHECK: cmpl $11
+; CHECK: cmpl $20
+; CHECK: cmpl $16
+; CHECK: cmpl $12
+; CHECK: cmpl $4
+; CHECK: cmpl $8
+; CHECK: retq
+}
+
+define void @dense_pgso(i32 %x) !prof !14 {
+entry:
+  switch i32 %x, label %return [
+    i32 12, label %bb0
+    i32 4,  label %bb1
+    i32 16, label %bb1
+    i32 20, label %bb2
+    i32 8,  label %bb3
+  ]
+bb0: tail call void @g(i32 0) br label %return
+bb1: tail call void @g(i32 1) br label %return
+bb2: tail call void @g(i32 1) br label %return
+bb3: tail call void @g(i32 2) br label %return
+return: ret void
+
+; Lowered as branches.
+; CHECK-LABEL: dense_pgso
+; CHECK: cmpl $11
+; CHECK: cmpl $20
+; CHECK: cmpl $16
+; CHECK: cmpl $12
+; CHECK: cmpl $4
+; CHECK: cmpl $8
+; CHECK: retq
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
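(dense_pgso deliberately mirrors dense_optsize: at this case density a
size-optimized lowering prefers a compare/branch tree over a jump table. For
reference, the existing dense test earlier in this file checks the jump-table
lowering (ja + jmpq *.LJTI) instead, which is what an unannotated copy of the
same switch would be expected to keep. A hypothetical unannotated copy, for
illustration only and not part of the patch:

define void @dense_nosize(i32 %x) {
entry:
  switch i32 %x, label %return [
    i32 12, label %bb0
    i32 4,  label %bb1
    i32 16, label %bb1
    i32 20, label %bb2
    i32 8,  label %bb3
  ]
bb0: tail call void @g(i32 0) br label %return
bb1: tail call void @g(i32 1) br label %return
bb2: tail call void @g(i32 1) br label %return
bb3: tail call void @g(i32 2) br label %return
return: ret void

; Expected (per the existing dense test): lowered via a jump table.
; CHECK: ja
; CHECK: jmpq *.LJTI
})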

diff --git a/llvm/test/CodeGen/X86/tail-opts.ll b/llvm/test/CodeGen/X86/tail-opts.ll
index 27d50dfb0430..32ad04a65fae 100644
--- a/llvm/test/CodeGen/X86/tail-opts.ll
+++ b/llvm/test/CodeGen/X86/tail-opts.ll
@@ -473,6 +473,47 @@ return:
   ret void
 }
 
+define void @one_pgso(i32 %v) nounwind !prof !14 {
+; CHECK-LABEL: one_pgso:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    je .LBB6_3
+; CHECK-NEXT:  # %bb.1: # %bby
+; CHECK-NEXT:    cmpl $16, %edi
+; CHECK-NEXT:    je .LBB6_4
+; CHECK-NEXT:  # %bb.2: # %bb7
+; CHECK-NEXT:    jmp tail_call_me # TAILCALL
+; CHECK-NEXT:  .LBB6_3: # %bbx
+; CHECK-NEXT:    cmpl $128, %edi
+; CHECK-NEXT:    jne tail_call_me # TAILCALL
+; CHECK-NEXT:  .LBB6_4: # %return
+; CHECK-NEXT:    retq
+entry:
+  %0 = icmp eq i32 %v, 0
+  br i1 %0, label %bbx, label %bby
+
+bby:
+  switch i32 %v, label %bb7 [
+    i32 16, label %return
+  ]
+
+bb7:
+  tail call void @tail_call_me()
+  ret void
+
+bbx:
+  switch i32 %v, label %bb12 [
+    i32 128, label %return
+  ]
+
+bb12:
+  tail call void @tail_call_me()
+  ret void
+
+return:
+  ret void
+}
+
 ; two - Same as one, but with two instructions in the common
 ; tail instead of one. This is too much to be merged, given
 ; the optsize attribute.
@@ -484,10 +525,51 @@ define void @two() nounwind optsize {
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    je .LBB6_1
+; CHECK-NEXT:    je .LBB7_1
 ; CHECK-NEXT:  # %bb.2: # %return
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB6_1: # %bb7
+; CHECK-NEXT:  .LBB7_1: # %bb7
+; CHECK-NEXT:    movl $0, {{.*}}(%rip)
+; CHECK-NEXT:    movl $1, {{.*}}(%rip)
+entry:
+  %0 = icmp eq i32 undef, 0
+  br i1 %0, label %bbx, label %bby
+
+bby:
+  switch i32 undef, label %bb7 [
+    i32 16, label %return
+  ]
+
+bb7:
+  store volatile i32 0, i32* @XYZ
+  store volatile i32 1, i32* @XYZ
+  unreachable
+
+bbx:
+  switch i32 undef, label %bb12 [
+    i32 128, label %return
+  ]
+
+bb12:
+  store volatile i32 0, i32* @XYZ
+  store volatile i32 1, i32* @XYZ
+  unreachable
+
+return:
+  ret void
+}
+
+define void @two_pgso() nounwind !prof !14 {
+; CHECK-LABEL: two_pgso:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB8_1
+; CHECK-NEXT:  # %bb.2: # %return
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB8_1: # %bb7
 ; CHECK-NEXT:    movl $0, {{.*}}(%rip)
 ; CHECK-NEXT:    movl $1, {{.*}}(%rip)
 entry:
@@ -527,10 +609,10 @@ define void @two_minsize() nounwind minsize {
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    je .LBB7_1
+; CHECK-NEXT:    je .LBB9_1
 ; CHECK-NEXT:  # %bb.2: # %return
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB7_1: # %bb7
+; CHECK-NEXT:  .LBB9_1: # %bb7
 ; CHECK-NEXT:    movl $0, {{.*}}(%rip)
 ; CHECK-NEXT:    movl $1, {{.*}}(%rip)
 entry:
@@ -568,20 +650,20 @@ define void @two_nosize(i32 %x, i32 %y, i32 %z) nounwind {
 ; CHECK-LABEL: two_nosize:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    je .LBB8_3
+; CHECK-NEXT:    je .LBB10_3
 ; CHECK-NEXT:  # %bb.1: # %bby
 ; CHECK-NEXT:    testl %esi, %esi
-; CHECK-NEXT:    je .LBB8_4
+; CHECK-NEXT:    je .LBB10_4
 ; CHECK-NEXT:  # %bb.2: # %bb7
 ; CHECK-NEXT:    movl $0, {{.*}}(%rip)
 ; CHECK-NEXT:    jmp tail_call_me # TAILCALL
-; CHECK-NEXT:  .LBB8_3: # %bbx
+; CHECK-NEXT:  .LBB10_3: # %bbx
 ; CHECK-NEXT:    cmpl $-1, %edx
-; CHECK-NEXT:    je .LBB8_4
+; CHECK-NEXT:    je .LBB10_4
 ; CHECK-NEXT:  # %bb.5: # %bb12
 ; CHECK-NEXT:    movl $0, {{.*}}(%rip)
 ; CHECK-NEXT:    jmp tail_call_me # TAILCALL
-; CHECK-NEXT:  .LBB8_4: # %return
+; CHECK-NEXT:  .LBB10_4: # %return
 ; CHECK-NEXT:    retq
 entry:
   %0 = icmp eq i32 %x, 0
@@ -621,11 +703,11 @@ define i64 @TESTE(i64 %parami, i64 %paraml) nounwind readnone {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    cmovgq %rdi, %rax
 ; CHECK-NEXT:    testq %rsi, %rsi
-; CHECK-NEXT:    jle .LBB9_2
+; CHECK-NEXT:    jle .LBB11_2
 ; CHECK-NEXT:  # %bb.1: # %bb.nph
 ; CHECK-NEXT:    imulq %rdi, %rsi
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:  .LBB9_2: # %for.end
+; CHECK-NEXT:  .LBB11_2: # %for.end
 ; CHECK-NEXT:    retq
 entry:
   %cmp = icmp slt i64 %parami, 1                  ; <i1> [#uses=1]
@@ -654,24 +736,24 @@ define void @merge_aborts() {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB10_5
+; CHECK-NEXT:    je .LBB12_5
 ; CHECK-NEXT:  # %bb.1: # %cont1
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB10_5
+; CHECK-NEXT:    je .LBB12_5
 ; CHECK-NEXT:  # %bb.2: # %cont2
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB10_5
+; CHECK-NEXT:    je .LBB12_5
 ; CHECK-NEXT:  # %bb.3: # %cont3
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB10_5
+; CHECK-NEXT:    je .LBB12_5
 ; CHECK-NEXT:  # %bb.4: # %cont4
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB10_5: # %abort1
+; CHECK-NEXT:  .LBB12_5: # %abort1
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq abort
 entry:
@@ -714,27 +796,27 @@ define void @merge_alternating_aborts() {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB11_5
+; CHECK-NEXT:    je .LBB13_5
 ; CHECK-NEXT:  # %bb.1: # %cont1
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB11_6
+; CHECK-NEXT:    je .LBB13_6
 ; CHECK-NEXT:  # %bb.2: # %cont2
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB11_5
+; CHECK-NEXT:    je .LBB13_5
 ; CHECK-NEXT:  # %bb.3: # %cont3
 ; CHECK-NEXT:    callq qux
 ; CHECK-NEXT:    testb $1, %al
-; CHECK-NEXT:    je .LBB11_6
+; CHECK-NEXT:    je .LBB13_6
 ; CHECK-NEXT:  # %bb.4: # %cont4
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:  .LBB11_5: # %abort1
+; CHECK-NEXT:  .LBB13_5: # %abort1
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq abort
-; CHECK-NEXT:  .LBB11_6: # %abort2
+; CHECK-NEXT:  .LBB13_6: # %abort2
 ; CHECK-NEXT:    callq alt_abort
 entry:
   %c1 = call i1 @qux()
@@ -763,3 +845,20 @@ abort4:
 cont4:
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/test-vs-bittest.ll b/llvm/test/CodeGen/X86/test-vs-bittest.ll
index 2f93ad58ad32..38ff7a40aa6d 100644
--- a/llvm/test/CodeGen/X86/test-vs-bittest.ll
+++ b/llvm/test/CodeGen/X86/test-vs-bittest.ll
@@ -49,6 +49,30 @@ no:
   ret void
 }
 
+define void @test64_pgso(i64 inreg %x) !prof !14 {
+; CHECK-LABEL: test64_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jb .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB2_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i64 %x, 2048
+  %s = icmp eq i64 %t, 0
+  br i1 %s, label %yes, label %no
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
 ; This test is identical to test64 above with only the destination of the br
 ; reversed. This somehow causes the two functions to get slightly different
 ; initial IR. One has an extra invert of the setcc. This previously caused one
@@ -60,10 +84,10 @@ define void @test64_2(i64 inreg %x) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    je .LBB2_2
+; CHECK-NEXT:    je .LBB3_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB2_2: # %no
+; CHECK-NEXT:  .LBB3_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -84,10 +108,34 @@ define void @test64_optsize_2(i64 inreg %x) optsize {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jae .LBB3_2
+; CHECK-NEXT:    jae .LBB4_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB3_2: # %no
+; CHECK-NEXT:  .LBB4_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i64 %x, 2048
+  %s = icmp eq i64 %t, 0
+  br i1 %s, label %no, label %yes
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test64_pgso_2(i64 inreg %x) !prof !14 {
+; CHECK-LABEL: test64_pgso_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jae .LBB5_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB5_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -108,10 +156,10 @@ define void @test64_3(i64 inreg %x) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btq $32, %rdi
-; CHECK-NEXT:    jb .LBB4_2
+; CHECK-NEXT:    jb .LBB6_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB4_2: # %no
+; CHECK-NEXT:  .LBB6_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -132,10 +180,34 @@ define void @test64_optsize_3(i64 inreg %x) optsize {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btq $32, %rdi
-; CHECK-NEXT:    jb .LBB5_2
+; CHECK-NEXT:    jb .LBB7_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB5_2: # %no
+; CHECK-NEXT:  .LBB7_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i64 %x, 4294967296
+  %s = icmp eq i64 %t, 0
+  br i1 %s, label %yes, label %no
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test64_pgso_3(i64 inreg %x) !prof !14 {
+; CHECK-LABEL: test64_pgso_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btq $32, %rdi
+; CHECK-NEXT:    jb .LBB8_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB8_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -156,10 +228,10 @@ define void @test64_4(i64 inreg %x) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btq $32, %rdi
-; CHECK-NEXT:    jae .LBB6_2
+; CHECK-NEXT:    jae .LBB9_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB6_2: # %no
+; CHECK-NEXT:  .LBB9_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -180,10 +252,34 @@ define void @test64_optsize_4(i64 inreg %x) optsize {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btq $32, %rdi
-; CHECK-NEXT:    jae .LBB7_2
+; CHECK-NEXT:    jae .LBB10_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB7_2: # %no
+; CHECK-NEXT:  .LBB10_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i64 %x, 4294967296
+  %s = icmp eq i64 %t, 0
+  br i1 %s, label %no, label %yes
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test64_pgso_4(i64 inreg %x) !prof !14 {
+; CHECK-LABEL: test64_pgso_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btq $32, %rdi
+; CHECK-NEXT:    jae .LBB11_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB11_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -204,10 +300,10 @@ define void @test32(i32 inreg %x) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    jne .LBB8_2
+; CHECK-NEXT:    jne .LBB12_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB8_2: # %no
+; CHECK-NEXT:  .LBB12_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -228,10 +324,10 @@ define void @test32_optsize(i32 inreg %x) optsize {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jb .LBB9_2
+; CHECK-NEXT:    jb .LBB13_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB9_2: # %no
+; CHECK-NEXT:  .LBB13_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -252,10 +348,10 @@ define void @test32_2(i32 inreg %x) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    je .LBB10_2
+; CHECK-NEXT:    je .LBB14_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB10_2: # %no
+; CHECK-NEXT:  .LBB14_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -276,10 +372,34 @@ define void @test32_optsize_2(i32 inreg %x) optsize {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jae .LBB11_2
+; CHECK-NEXT:    jae .LBB15_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB11_2: # %no
+; CHECK-NEXT:  .LBB15_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i32 %x, 2048
+  %s = icmp eq i32 %t, 0
+  br i1 %s, label %no, label %yes
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test32_pgso_2(i32 inreg %x) !prof !14 {
+; CHECK-LABEL: test32_pgso_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jae .LBB16_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB16_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -300,10 +420,10 @@ define void @test16(i16 inreg %x) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    jne .LBB12_2
+; CHECK-NEXT:    jne .LBB17_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB12_2: # %no
+; CHECK-NEXT:  .LBB17_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -324,10 +444,34 @@ define void @test16_optsize(i16 inreg %x) optsize {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jb .LBB13_2
+; CHECK-NEXT:    jb .LBB18_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB13_2: # %no
+; CHECK-NEXT:  .LBB18_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i16 %x, 2048
+  %s = icmp eq i16 %t, 0
+  br i1 %s, label %yes, label %no
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test16_pgso(i16 inreg %x) !prof !14 {
+; CHECK-LABEL: test16_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jb .LBB19_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB19_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -348,10 +492,10 @@ define void @test16_2(i16 inreg %x) {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    testl $2048, %edi # imm = 0x800
-; CHECK-NEXT:    je .LBB14_2
+; CHECK-NEXT:    je .LBB20_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB14_2: # %no
+; CHECK-NEXT:  .LBB20_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -372,10 +516,34 @@ define void @test16_optsize_2(i16 inreg %x) optsize {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    btl $11, %edi
-; CHECK-NEXT:    jae .LBB15_2
+; CHECK-NEXT:    jae .LBB21_2
 ; CHECK-NEXT:  # %bb.1: # %yes
 ; CHECK-NEXT:    callq bar
-; CHECK-NEXT:  .LBB15_2: # %no
+; CHECK-NEXT:  .LBB21_2: # %no
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %t = and i16 %x, 2048
+  %s = icmp eq i16 %t, 0
+  br i1 %s, label %no, label %yes
+
+yes:
+  call void @bar()
+  ret void
+no:
+  ret void
+}
+
+define void @test16_pgso_2(i16 inreg %x) !prof !14 {
+; CHECK-LABEL: test16_pgso_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    btl $11, %edi
+; CHECK-NEXT:    jae .LBB22_2
+; CHECK-NEXT:  # %bb.1: # %yes
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:  .LBB22_2: # %no
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -512,3 +680,20 @@ define i32 @setcc_is_bit_set(i32 %x) {
 }
 
 declare void @bar()
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 7db60b922644..274a7f1267c1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2002,6 +2002,56 @@ define <8 x i32> @shuffle_v8i32_0zzzzzzz_optsize(<8 x i32> %a) optsize {
   ret <8 x i32> %b
 }
 
+define <4 x double> @shuffle_v4f64_0zzz_pgso(<4 x double> %a) !prof !14 {
+; ALL-LABEL: shuffle_v4f64_0zzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; ALL-NEXT:    retq
+  %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x double> %b
+}
+
+define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 {
+; ALL-LABEL: shuffle_v4i64_0zzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; ALL-NEXT:    retq
+  %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i64> %b
+}
+
+define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 {
+; AVX1OR2-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512VL-NEXT:    retq
+  %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x float> %b
+}
+
+define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 {
+; AVX1OR2-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512VL-NEXT:    retq
+  %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i32> %b
+}
+
 define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
 ; ALL-LABEL: unpckh_v4i64:
 ; ALL:       # %bb.0:
@@ -2022,3 +2072,19 @@ define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
   ret <4 x double> %unpckh
 }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
index 253a5d124995..ceb08ceadd09 100644
--- a/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
+++ b/llvm/test/CodeGen/X86/x86-64-bittest-logic.ll
@@ -240,3 +240,140 @@ define i64 @xor4_optsize(i64 %x) optsize {
   %a = xor i64 %x, 9223372036854775808 ; toggle bit 63
   ret i64 %a
 }
+
+define i64 @and1_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: and1_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $31, %rax
+; CHECK-NEXT:    retq
+  %a = and i64 %x, 18446744071562067967 ; clear bit 31
+  ret i64 %a
+}
+
+define i64 @and2_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: and2_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $32, %rax
+; CHECK-NEXT:    retq
+  %a = and i64 %x, 18446744069414584319 ; clear bit 32
+  ret i64 %a
+}
+
+define i64 @and3_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: and3_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $62, %rax
+; CHECK-NEXT:    retq
+  %a = and i64 %x, 13835058055282163711 ; clear bit 62
+  ret i64 %a
+}
+
+define i64 @and4_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: and4_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btrq $63, %rax
+; CHECK-NEXT:    retq
+  %a = and i64 %x, 9223372036854775807 ; clear bit 63
+  ret i64 %a
+}
+
+define i64 @or1_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: or1_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $31, %rax
+; CHECK-NEXT:    retq
+  %a = or i64 %x, 2147483648 ; set bit 31
+  ret i64 %a
+}
+
+define i64 @or2_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: or2_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $32, %rax
+; CHECK-NEXT:    retq
+  %a = or i64 %x, 4294967296 ; set bit 32
+  ret i64 %a
+}
+
+define i64 @or3_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: or3_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $62, %rax
+; CHECK-NEXT:    retq
+  %a = or i64 %x, 4611686018427387904 ; set bit 62
+  ret i64 %a
+}
+
+define i64 @or4_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: or4_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btsq $63, %rax
+; CHECK-NEXT:    retq
+  %a = or i64 %x, 9223372036854775808 ; set bit 63
+  ret i64 %a
+}
+
+define i64 @xor1_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: xor1_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $31, %rax
+; CHECK-NEXT:    retq
+  %a = xor i64 %x, 2147483648 ; toggle bit 31
+  ret i64 %a
+}
+
+define i64 @xor2_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: xor2_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $32, %rax
+; CHECK-NEXT:    retq
+  %a = xor i64 %x, 4294967296 ; toggle bit 32
+  ret i64 %a
+}
+
+define i64 @xor3_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: xor3_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $62, %rax
+; CHECK-NEXT:    retq
+  %a = xor i64 %x, 4611686018427387904 ; toggle bit 62
+  ret i64 %a
+}
+
+define i64 @xor4_pgso(i64 %x) !prof !14 {
+; CHECK-LABEL: xor4_pgso:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    btcq $63, %rax
+; CHECK-NEXT:    retq
+  %a = xor i64 %x, 9223372036854775808 ; toggle bit 63
+  ret i64 %a
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
index 5a180104bb42..fa7394463fbb 100644
--- a/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -50,6 +50,19 @@ entry:
   ret i64 %or
 }
 
+define i64 @_Z8lshift11mm_pgso(i64 %a, i64 %b) !prof !14 {
+; CHECK-LABEL: _Z8lshift11mm_pgso:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    shldq $11, %rsi, %rax
+; CHECK-NEXT:    retq
+entry:
+  %shl = shl i64 %a, 11
+  %shr = lshr i64 %b, 53
+  %or = or i64 %shr, %shl
+  ret i64 %or
+}
+
 attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 ; clang -O2 -c test2.cpp -emit-llvm -S
@@ -78,3 +91,19 @@ entry:
 
 attributes #2= { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll
index 636f1092c264..91bce1610c8b 100644
--- a/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll
+++ b/llvm/test/CodeGen/X86/x86-repmov-copy-eflags.ll
@@ -25,6 +25,26 @@ while.end:                                        ; preds = %while.body
   ret void
 }
 
+define void @f_pgso(i8* %p, i8* %q, i32* inalloca nocapture %unused) !prof !14 {
+entry:
+  %g = alloca %struct.T, align 8
+  %r = alloca i32, align 8
+  store i32 0, i32* %r, align 4
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %p, i8* align 8 %q, i32 24, i1 false)
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %entry
+  %load = load i32, i32* %r, align 4
+  %dec = add nsw i32 %load, -1
+  store i32 %dec, i32* %r, align 4
+  call void @g(%struct.T* %g)
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body
+  ret void
+}
+
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #1
 
@@ -46,5 +66,38 @@ declare void @g(%struct.T*)
 ; CHECK:     testb    %[[NE_REG]], %[[NE_REG]]
 ; CHECK:     jne
 
+; CHECK-LABEL: _f_pgso:
+; CHECK:     pushl %ebp
+; CHECK:     movl %esp, %ebp
+; CHECK:     andl $-8, %esp
+; CHECK-NOT: movl %esp, %esi
+; CHECK:     rep;movsl
+; CHECK:     leal 8(%esp), %esi
+
+; CHECK:     decl     (%esp)
+; CHECK:     setne    %[[NE_REG:.*]]
+; CHECK:     pushl     %esi
+; CHECK:     calll     _g
+; CHECK:     addl     $4, %esp
+; CHECK:     testb    %[[NE_REG]], %[[NE_REG]]
+; CHECK:     jne
+
 attributes #0 = { nounwind optsize }
 attributes #1 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}

diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
index 4d28e06f2527..7a5a2ebd698a 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
@@ -152,6 +152,30 @@ rare.1:
   br label %fallthrough
 }
 
+; Negative test - opt for size
+define void @test6_pgso(i1 %cond, i64* %base) !prof !14 {
+; CHECK-LABEL: @test6
+entry:
+; CHECK: %addr = getelementptr
+  %addr = getelementptr inbounds i64, i64* %base, i64 5
+  %casted = bitcast i64* %addr to i32*
+  br i1 %cond, label %if.then, label %fallthrough
+
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK-NOT: getelementptr inbounds i8, {{.+}} 40
+  %v1 = load i32, i32* %casted, align 4
+  call void @foo(i32 %v1)
+  %cmp = icmp eq i32 %v1, 0
+  br i1 %cmp, label %rare.1, label %fallthrough
+
+fallthrough:
+  ret void
+
+rare.1:
+  call void @slowpath(i32 %v1, i32* %casted) cold
+  br label %fallthrough
+}
 
 ; Make sure sinking two copies of addressing mode into different blocks works
 ; when there are cold paths for each.
@@ -278,3 +302,20 @@ BB:
   store i1 false, i1* %G23
   ret void
 }
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
